26 changed files with 3988 additions and 161 deletions
--- a/.gitignore
+++ b/.gitignore
@ -6,6 +6,12 @@ __pycache__/
 dist/
 build/
 reports/
+reports_*
+H100*.md
+test_all*.md
+docs/h100_test_all_metrics_guide_cn.md
+docs/multinode_nccl_concepts.md
+docs/multinode_nccl_deep_diagnose_runbook.md
 *.egg
 .eggs/
 *.log
@ -14,4 +20,6 @@ reports/
 .venv/
 venv/
 .qoder/*
+.playwright-mcp/
 .claude/settings.local.json
+.omx/
--- a/configs/default.yaml
+++ b/configs/default.yaml
@ -48,6 +48,55 @@ nccl:
  test_allgather: false
  test_sendrecv: false

+multinode_nccl:
+  enabled: false
+  mode: sweep
+  hosts:
+    - name: nccl-gpu-1
+      addr: 172.72.8.12
+      slots: 8
+    - name: nccl-gpu-2
+      addr: 172.72.8.16
+      slots: 8
+  ssh_user: root
+  ssh_preflight: true
+  mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun
+  mpi_ld_preload: null
+  extra_ld_library_path:
+    - /usr/mpi/gcc/openmpi-4.1.9a1/lib
+    - /root/gpu-test-venv/lib/python3.10/site-packages/nvidia/nccl/lib
+    - /usr/local/cuda-12.4/targets/x86_64-linux/lib
+  nccl_tests_dir: null  # null = tools.install_dir/nccl-tests/build
+  tests:
+    - all_reduce_perf
+    - alltoall_perf
+  topologies:
+    - nodes: 2
+      gpus_per_node: 8
+  begin_size: 1k
+  end_size: 16g
+  step_factor: 2
+  warmup_iters: 10
+  gpus_per_rank: 1
+  timeout_sec: 1800
+  socket_ifname: bond0
+  oob_tcp_ifname: bond0
+  plm_rsh_args: "-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o ServerAliveInterval=30"
+  ib_gid_index: 3
+  ib_sl: 5
+  ib_tc: 136
+  ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7
+  ib_timeout: 22
+  qps_per_connection: null
+  min_nchannels: null
+  net_plugin: none
+  nvls_enable: 1
+  split_data_on_qps: null
+  extra_env: {}
+  min_peak_busbw_gbps:
+    allreduce: 480
+    alltoall: 75
+
 stress:
  duration_sec: 600           # 10 min — reaches thermal steady state, validates throttle/jitter beyond warmup
  use_doubles: false
--- a/configs/multinode_nccl_diagnostic.yaml
+++ b/configs/multinode_nccl_diagnostic.yaml
@ -0,0 +1,62 @@
+tools:
+  install_dir: /opt/gpu-test-tools
+
+report:
+  output_dir: ./reports
+  format: md
+
+multinode_nccl:
+  enabled: true
+  mode: diagnostic
+  hosts:
+    - name: nccl-gpu-1
+      addr: 172.72.8.12
+      slots: 8
+    - name: nccl-gpu-2
+      addr: 172.72.8.16
+      slots: 8
+  ssh_user: root
+  ssh_preflight: true
+  mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun
+  mpi_ld_preload: null
+  extra_ld_library_path:
+    - /usr/mpi/gcc/openmpi-4.1.9a1/lib
+    - /root/gpu-test-venv/lib/python3.10/site-packages/nvidia/nccl/lib
+    - /usr/local/cuda-12.4/targets/x86_64-linux/lib
+  nccl_tests_dir: null
+  tests:
+    - all_reduce_perf
+    - alltoall_perf
+  topologies:
+    - nodes: 2
+      gpus_per_node: 8
+      label: 2 nodes x 8 GPUs diagnostic
+  begin_size: 256M
+  end_size: 256M
+  step_factor: 2
+  warmup_iters: 1
+  iters: 3
+  gpus_per_rank: 1
+  timeout_sec: 600
+  debug: INFO
+  socket_ifname: bond0
+  oob_tcp_ifname: bond0
+  plm_rsh_args: "-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o ServerAliveInterval=30"
+  ib_gid_index: 3
+  ib_sl: 5
+  ib_tc: 136
+  ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7
+  ib_timeout: 22
+  qps_per_connection: null
+  min_nchannels: null
+  net_plugin: none
+  nvls_enable: 1
+  split_data_on_qps: null
+  extra_env:  # env var values are strings; numeric-looking ones are quoted so YAML does not load them as ints
+    NCCL_DEBUG_SUBSYS: INIT,NET
+    NCCL_NET_GDR_LEVEL: "5"
+    NCCL_NET_GDR_READ: "1"
+    NCCL_DMABUF_ENABLE: "0"
+  min_peak_busbw_gbps:
+    allreduce: 480
+    alltoall: 75
--- a/configs/multinode_nccl_nccl227_16g.yaml
+++ b/configs/multinode_nccl_nccl227_16g.yaml
@ -0,0 +1,62 @@
+tools:
+  install_dir: /opt/gpu-test-tools
+
+report:
+  output_dir: ./reports
+  format: md
+
+multinode_nccl:
+  enabled: true
+  mode: large-message-nccl-2.27.7
+  hosts:
+    - name: nccl-gpu-1
+      addr: 172.72.8.12
+      slots: 8
+    - name: nccl-gpu-2
+      addr: 172.72.8.16
+      slots: 8
+  ssh_user: root
+  ssh_preflight: true
+  mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun
+  mpi_ld_preload: null
+  extra_ld_library_path:
+    - /usr/mpi/gcc/openmpi-4.1.9a1/lib
+    - /tmp/nccl-2.27.7-cuda12.4/usr/lib/x86_64-linux-gnu
+    - /usr/local/cuda-12.4/targets/x86_64-linux/lib
+  nccl_tests_dir: /data/nccl-tests-latest/build
+  tests:
+    - all_reduce_perf
+    - alltoall_perf
+  topologies:
+    - nodes: 2
+      gpus_per_node: 8
+      label: 2 nodes x 8 GPUs NCCL 2.27.7 16G
+  begin_size: 16G
+  end_size: 16G
+  step_factor: 2
+  warmup_iters: 1
+  iters: 3
+  gpus_per_rank: 1
+  timeout_sec: 1200
+  debug: INFO
+  socket_ifname: bond0
+  oob_tcp_ifname: bond0
+  plm_rsh_args: "-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o ServerAliveInterval=30"
+  ib_gid_index: 3
+  ib_sl: 5
+  ib_tc: 136
+  ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7
+  ib_timeout: 22
+  qps_per_connection: null
+  min_nchannels: null
+  net_plugin: none
+  nvls_enable: 1
+  split_data_on_qps: null
+  extra_env:  # env var values are strings; numeric-looking ones are quoted so YAML does not load them as ints
+    NCCL_DEBUG_SUBSYS: INIT,NET
+    NCCL_NET_GDR_LEVEL: "5"
+    NCCL_NET_GDR_READ: "1"
+    NCCL_DMABUF_ENABLE: "0"
+  min_peak_busbw_gbps:
+    allreduce: 480
+    alltoall: 75
--- a/configs/multinode_nccl_nccl227_all_collectives_2x8.yaml
+++ b/configs/multinode_nccl_nccl227_all_collectives_2x8.yaml
@ -0,0 +1,72 @@
+tools:
+  install_dir: /opt/gpu-test-tools
+
+report:
+  output_dir: ./reports
+  format: md
+
+multinode_nccl:
+  enabled: true
+  mode: cross-leaf-all-collectives-nccl-2.27.7
+  hosts:
+    - name: nccl-gpu-1
+      addr: 172.72.8.12
+      slots: 8
+    - name: nccl-gpu-2
+      addr: 172.72.8.16
+      slots: 8
+  ssh_user: root
+  ssh_preflight: true
+  mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun
+  mpi_ld_preload: null
+  extra_ld_library_path:
+    - /usr/mpi/gcc/openmpi-4.1.9a1/lib
+    - /tmp/nccl-2.27.7-cuda12.4/usr/lib/x86_64-linux-gnu
+    - /usr/local/cuda-12.4/targets/x86_64-linux/lib
+  nccl_tests_dir: /data/nccl-tests-latest/build
+  tests:
+    - all_reduce_perf
+    - alltoall_perf
+    - broadcast_perf
+    - reduce_scatter_perf
+    - all_gather_perf
+    - sendrecv_perf
+  topologies:
+    - nodes: 2
+      gpus_per_node: 8
+      label: 2 nodes x 8 GPUs (all collectives evidence run)
+      op_env:  # env var values are strings; quoted so YAML does not load them as ints
+        alltoall:
+          NCCL_PXN_DISABLE: "1"
+  begin_size: 16G
+  end_size: 16G
+  step_factor: 2
+  warmup_iters: 10
+  gpus_per_rank: 1
+  timeout_sec: 1800
+  debug: INFO
+  socket_ifname: bond0
+  oob_tcp_ifname: bond0
+  plm_rsh_args: "-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o ServerAliveInterval=30"
+  ib_gid_index: 3
+  ib_sl: 5
+  ib_tc: 136
+  ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7
+  ib_timeout: 22
+  qps_per_connection: null
+  min_nchannels: null
+  net_plugin: none
+  nvls_enable: 1
+  split_data_on_qps: null
+  extra_env:  # env var values are strings; numeric-looking ones are quoted so YAML does not load them as ints
+    NCCL_DEBUG_SUBSYS: INIT,NET
+    NCCL_NET_GDR_LEVEL: "5"
+    NCCL_NET_GDR_READ: "1"
+    NCCL_DMABUF_ENABLE: "0"
+  min_peak_busbw_gbps:
+    allreduce: 491.84
+    alltoall: 76.54
+    broadcast: 0
+    reducescatter: 0
+    allgather: 0
+    sendrecv: 0
--- a/configs/multinode_nccl_nccl227_auto_16g.yaml
+++ b/configs/multinode_nccl_nccl227_auto_16g.yaml
@ -0,0 +1,62 @@
+tools:
+  install_dir: /opt/gpu-test-tools
+
+report:
+  output_dir: ./reports
+  format: md
+
+multinode_nccl:
+  enabled: true
+  mode: large-message-nccl-2.27.7-auto
+  hosts:
+    - name: nccl-gpu-1
+      addr: 172.72.8.12
+      slots: 8
+    - name: nccl-gpu-2
+      addr: 172.72.8.16
+      slots: 8
+  ssh_user: root
+  ssh_preflight: true
+  mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun
+  mpi_ld_preload: null
+  extra_ld_library_path:
+    - /usr/mpi/gcc/openmpi-4.1.9a1/lib
+    - /tmp/nccl-2.27.7-cuda12.4/usr/lib/x86_64-linux-gnu
+    - /usr/local/cuda-12.4/targets/x86_64-linux/lib
+  nccl_tests_dir: /data/nccl-tests-latest/build
+  tests:
+    - all_reduce_perf
+    - alltoall_perf
+  topologies:
+    - nodes: 2
+      gpus_per_node: 8
+      label: 2 nodes x 8 GPUs NCCL 2.27.7 auto 16G
+  begin_size: 16G
+  end_size: 16G
+  step_factor: 2
+  warmup_iters: 1
+  iters: 3
+  gpus_per_rank: 1
+  timeout_sec: 1200
+  debug: INFO
+  socket_ifname: bond0
+  oob_tcp_ifname: bond0
+  plm_rsh_args: "-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o ServerAliveInterval=30"
+  ib_gid_index: 3
+  ib_sl: 5
+  ib_tc: 136
+  ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7
+  ib_timeout: 22
+  qps_per_connection: null
+  min_nchannels: null
+  net_plugin: none
+  nvls_enable: 1
+  split_data_on_qps: null
+  extra_env:  # env var values are strings; numeric-looking ones are quoted so YAML does not load them as ints
+    NCCL_DEBUG_SUBSYS: INIT,NET
+    NCCL_NET_GDR_LEVEL: "5"
+    NCCL_NET_GDR_READ: "1"
+    NCCL_DMABUF_ENABLE: "0"
+  min_peak_busbw_gbps:
+    allreduce: 480
+    alltoall: 75
--- a/configs/multinode_nccl_nccl227_diagnostic.yaml
+++ b/configs/multinode_nccl_nccl227_diagnostic.yaml
@ -0,0 +1,62 @@
+tools:
+  install_dir: /opt/gpu-test-tools
+
+report:
+  output_dir: ./reports
+  format: md
+
+multinode_nccl:
+  enabled: true
+  mode: diagnostic-nccl-2.27.7
+  hosts:
+    - name: nccl-gpu-1
+      addr: 172.72.8.12
+      slots: 8
+    - name: nccl-gpu-2
+      addr: 172.72.8.16
+      slots: 8
+  ssh_user: root
+  ssh_preflight: true
+  mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun
+  mpi_ld_preload: null
+  extra_ld_library_path:
+    - /usr/mpi/gcc/openmpi-4.1.9a1/lib
+    - /tmp/nccl-2.27.7-cuda12.4/usr/lib/x86_64-linux-gnu
+    - /usr/local/cuda-12.4/targets/x86_64-linux/lib
+  nccl_tests_dir: /data/nccl-tests-latest/build
+  tests:
+    - all_reduce_perf
+    - alltoall_perf
+  topologies:
+    - nodes: 2
+      gpus_per_node: 8
+      label: 2 nodes x 8 GPUs NCCL 2.27.7
+  begin_size: 256M
+  end_size: 256M
+  step_factor: 2
+  warmup_iters: 1
+  iters: 3
+  gpus_per_rank: 1
+  timeout_sec: 600
+  debug: INFO
+  socket_ifname: bond0
+  oob_tcp_ifname: bond0
+  plm_rsh_args: "-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o ServerAliveInterval=30"
+  ib_gid_index: 3
+  ib_sl: 5
+  ib_tc: 136
+  ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7
+  ib_timeout: 22
+  qps_per_connection: null
+  min_nchannels: null
+  net_plugin: none
+  nvls_enable: 1
+  split_data_on_qps: null
+  extra_env:  # env var values are strings; numeric-looking ones are quoted so YAML does not load them as ints
+    NCCL_DEBUG_SUBSYS: INIT,NET
+    NCCL_NET_GDR_LEVEL: "5"
+    NCCL_NET_GDR_READ: "1"
+    NCCL_DMABUF_ENABLE: "0"
+  min_peak_busbw_gbps:
+    allreduce: 480
+    alltoall: 75
--- a/configs/multinode_nccl_nccl227_pdf_matrix.yaml
+++ b/configs/multinode_nccl_nccl227_pdf_matrix.yaml
@ -0,0 +1,91 @@
+tools:
+  install_dir: /opt/gpu-test-tools
+
+report:
+  output_dir: ./reports
+  format: md
+
+multinode_nccl:
+  enabled: true
+  mode: cross-leaf-pdf-matrix-nccl-2.27.7
+  hosts:
+    - name: nccl-gpu-1
+      addr: 172.72.8.12
+      slots: 8
+    - name: nccl-gpu-2
+      addr: 172.72.8.16
+      slots: 8
+  ssh_user: root
+  ssh_preflight: true
+  mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun
+  mpi_ld_preload: null
+  extra_ld_library_path:
+    - /usr/mpi/gcc/openmpi-4.1.9a1/lib
+    - /tmp/nccl-2.27.7-cuda12.4/usr/lib/x86_64-linux-gnu
+    - /usr/local/cuda-12.4/targets/x86_64-linux/lib
+  nccl_tests_dir: /data/nccl-tests-latest/build
+  tests:
+    - all_reduce_perf
+    - alltoall_perf
+  topologies:
+    - nodes: 2
+      gpus_per_node: 1
+      label: 2 nodes x 1 GPU (PDF 2 machines 2 GPUs)
+      min_peak_busbw_gbps:
+        allreduce: 48.90
+        alltoall: 27.25
+    - nodes: 2
+      gpus_per_node: 2
+      label: 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs)
+      min_peak_busbw_gbps:
+        allreduce: 136.93
+        alltoall: 54.41
+    - nodes: 2
+      gpus_per_node: 4
+      label: 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs)
+      cuda_visible_devices: "0,1,4,5"  # quoted: a single-device value such as 3 would otherwise load as an int
+      op_env:  # env var values are strings; quoted so YAML does not load them as ints
+        alltoall:
+          NCCL_IB_QPS_PER_CONNECTION: "4"
+          NCCL_MIN_NCHANNELS: "4"
+          NCCL_IB_SPLIT_DATA_ON_QPS: "1"
+      min_peak_busbw_gbps:
+        allreduce: 335.48
+        alltoall: 73.73
+    - nodes: 2
+      gpus_per_node: 8
+      label: 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs)
+      op_env:  # env var values are strings; quoted so YAML does not load them as ints
+        alltoall:
+          NCCL_PXN_DISABLE: "1"
+      min_peak_busbw_gbps:
+        allreduce: 491.84
+        alltoall: 76.54
+  begin_size: 16G
+  end_size: 16G
+  step_factor: 2
+  warmup_iters: 10
+  gpus_per_rank: 1
+  timeout_sec: 1800
+  debug: INFO
+  socket_ifname: bond0
+  oob_tcp_ifname: bond0
+  plm_rsh_args: "-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o ServerAliveInterval=30"
+  ib_gid_index: 3
+  ib_sl: 5
+  ib_tc: 136
+  ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7
+  ib_timeout: 22
+  qps_per_connection: null
+  min_nchannels: null
+  net_plugin: none
+  nvls_enable: 1
+  split_data_on_qps: null
+  extra_env:  # env var values are strings; numeric-looking ones are quoted so YAML does not load them as ints
+    NCCL_DEBUG_SUBSYS: INIT,NET
+    NCCL_NET_GDR_LEVEL: "5"
+    NCCL_NET_GDR_READ: "1"
+    NCCL_DMABUF_ENABLE: "0"
+  min_peak_busbw_gbps:
+    allreduce: 0
+    alltoall: 0
--- a/configs/multinode_nccl_nccl227_sweep.yaml
+++ b/configs/multinode_nccl_nccl227_sweep.yaml
@ -0,0 +1,62 @@
+tools:
+  install_dir: /opt/gpu-test-tools
+
+report:
+  output_dir: ./reports
+  format: md
+
+multinode_nccl:
+  enabled: true
+  mode: sweep-nccl-2.27.7
+  hosts:
+    - name: nccl-gpu-1
+      addr: 172.72.8.12
+      slots: 8
+    - name: nccl-gpu-2
+      addr: 172.72.8.16
+      slots: 8
+  ssh_user: root
+  ssh_preflight: true
+  mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun
+  mpi_ld_preload: null
+  extra_ld_library_path:
+    - /usr/mpi/gcc/openmpi-4.1.9a1/lib
+    - /tmp/nccl-2.27.7-cuda12.4/usr/lib/x86_64-linux-gnu
+    - /usr/local/cuda-12.4/targets/x86_64-linux/lib
+  nccl_tests_dir: /data/nccl-tests-latest/build
+  tests:
+    - all_reduce_perf
+    - alltoall_perf
+  topologies:
+    - nodes: 2
+      gpus_per_node: 8
+      label: 2 nodes x 8 GPUs NCCL 2.27.7 sweep
+  begin_size: 1M
+  end_size: 4G
+  step_factor: 4
+  warmup_iters: 2
+  iters: 5
+  gpus_per_rank: 1
+  timeout_sec: 1200
+  debug: INFO
+  socket_ifname: bond0
+  oob_tcp_ifname: bond0
+  plm_rsh_args: "-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o ServerAliveInterval=30"
+  ib_gid_index: 3
+  ib_sl: 5
+  ib_tc: 136
+  ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7
+  ib_timeout: 22
+  qps_per_connection: null
+  min_nchannels: null
+  net_plugin: none
+  nvls_enable: 1
+  split_data_on_qps: null
+  extra_env:  # env var values are strings; numeric-looking ones are quoted so YAML does not load them as ints
+    NCCL_DEBUG_SUBSYS: INIT,NET
+    NCCL_NET_GDR_LEVEL: "5"
+    NCCL_NET_GDR_READ: "1"
+    NCCL_DMABUF_ENABLE: "0"
+  min_peak_busbw_gbps:
+    allreduce: 480
+    alltoall: 75
--- a/gpu_tester.py
+++ b/gpu_tester.py
@ -5,6 +5,7 @@ import argparse
 import json
 import os
 import signal
+import socket
 import sys
 import time
 from datetime import datetime
@ -25,6 +26,9 @@ from modules.nccl_test import NCCLTest
 from modules.training_sim import TrainingSim
 from modules.stress_test import StressTest
 from modules.rdma_test import RDMATest
+from modules.nvlink_test import NVLinkTest
+from modules.dcgm_test import DCGMTest
+from modules.multinode_nccl_test import MultiNodeNCCLTest
 from modules.report import ReportGenerator
 from modules.gpu_specs import detect_gpu_type, get_gpu_specs, get_gpu_label, get_supported_gpus, validate_driver_compatibility

@ -32,43 +36,125 @@ DEFAULT_CONFIG = {
    "benchmark": {
        "memory": {"size_mb": 4096, "iterations": 10, "nvbandwidth_buffer_mb": 512, "nvbandwidth_samples": 3},
        "compute": {
-            "dtypes": ["fp32", "tf32", "fp16", "bf16", "fp8"],
-            "matrix_size": 4096,
-            "warmup": 10,
-            "iterations": 100,
+            "dtypes": ["fp32", "tf32", "fp16", "bf16", "fp8", "fp64", "int8"],
+            "matrix_size": 8192,
+            "warmup": 50,
+            "iterations": 500,
+            "use_compile": True,
        },
    },
-    "health": {"temp_warning": 80, "temp_critical": 90, "power_limit": None},
+    "health": {"temp_warning": 75, "temp_critical": 85, "power_limit": None},
    "nccl": {
        "min_bandwidth_gbps": None,
        "test_allreduce": True,
        "test_alltoall": True,
        "test_broadcast": True,
-        "test_reduce_scatter": False,
-        "test_allgather": False,
-        "test_sendrecv": False,
+        "test_reduce_scatter": True,
+        "test_allgather": True,
+        "test_sendrecv": True,
+        "message_sizes": ["1M", "256M", "2G"],
+        "repeats": 3,
+        "max_stddev_pct": 3,
+    },
+    "multinode_nccl": {
+        "enabled": False,
+        "mode": "sweep",
+        "hosts": [
+            {"name": "nccl-gpu-1", "addr": "172.72.8.12", "slots": 8},
+            {"name": "nccl-gpu-2", "addr": "172.72.8.16", "slots": 8},
+        ],
+        "ssh_user": "root",
+        "ssh_preflight": True,
+        "mpirun_path": "/usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun",
+        "mpi_ld_preload": None,
+        "extra_ld_library_path": [
+            "/usr/mpi/gcc/openmpi-4.1.9a1/lib",
+            "/root/gpu-test-venv/lib/python3.10/site-packages/nvidia/nccl/lib",
+            "/usr/local/cuda-12.4/targets/x86_64-linux/lib",
+        ],
+        "nccl_tests_dir": None,
+        "tests": ["all_reduce_perf", "alltoall_perf"],
+        "topologies": [{"nodes": 2, "gpus_per_node": 8}],
+        "begin_size": "1k",
+        "end_size": "16g",
+        "step_factor": 2,
+        "warmup_iters": 10,
+        "gpus_per_rank": 1,
+        "timeout_sec": 1800,
+        "socket_ifname": "bond0",
+        "ib_gid_index": 3,
+        "ib_sl": 5,
+        "ib_tc": 136,
+        "ib_hca": "mlx5_0,mlx5_1,mlx5_6,mlx5_7",
+        "ib_timeout": 22,
+        "qps_per_connection": 4,
+        "min_nchannels": 4,
+        "net_plugin": "none",
+        "nvls_enable": 1,
+        "split_data_on_qps": 1,
+        "min_peak_busbw_gbps": {"allreduce": 480, "alltoall": 75},
    },
    "stress": {
-        "duration_sec": 60,
+        "duration_sec": 1800,
+        "production_duration_sec": 1800,
+        "use_gpu_burn": False,
        "use_doubles": False,
        "use_tensor_cores": True,
        "memory_pct": 90,
        "gpus": "all",
+        "dtype": "bf16",
+        "matrix_size": 24576,
+        "telemetry_interval_sec": 1,
+        "warmup_sec": 60,
+        "min_steady_samples": 10,
+        "max_temp_c": 80,
+        "max_temp_delta_c": 5,
+        "min_power_watts": 630,
+        "max_tflops_jitter_pct": 5,
+        "require_tflops_jitter": True,
    },
    "rdma": {
-        "min_bandwidth_gbps": 50,
-        "max_latency_us": 10,
+        "min_bandwidth_gbps": 47,
+        "min_port_rate_gbps": 400,
+        "max_latency_us": 3.5,
+        "max_write_latency_us": 2.0,
+        "max_read_latency_us": 3.5,
        "ib_iterations": 1000,
-        "msg_size": 65536,
+        "msg_size": 4194304,
+        "latency_msg_size": 8,
        "ib_device": None,
        "ib_port": 1,
+        "server_addr": None,
+        "ibping_target": None,
+        "ibping_count": 5,
+        "role": "auto",
+        "pfc_ecn_counters": True,
+    },
+    "nvlink": {
+        "expected_links_per_gpu": 18,
+        "expected_link_speed_gbps": 25,
+        "require_zero_errors": True,
+    },
+    "dcgm": {
+        "diag_level": 3,
+        "timeout_sec": 1200,
+        "expected_num_gpus": 8,
+        "json_output": True,
+        "require_subtests": True,
    },
    "training": {
-        "model": "gpt2",
+        "model": "synthetic_1.5b",
        "batch_size": 8,
        "seq_length": 2048,
        "num_steps": 50,
+        "warmup_steps": 5,
        "dtype": "bf16",
+        "mode": "ddp",
+        "synthetic_params_b": 1.5,
+        "min_tokens_per_sec": 45000,
+        "max_step_jitter_pct": 3,
+        "max_peak_memory_gb": 70,
+        "require_distributed": True,
    },
    "report": {"output_dir": "./reports", "format": "json"},
    "tools": {"install_dir": "/opt/gpu-test-tools"},
@ -131,7 +217,7 @@ def interactive_menu(config: dict):
    if not check_prerequisites(console):
        return

-    results_store: dict = {"timestamp": datetime.now().isoformat(), "tests": {}}
+    results_store: dict = {"timestamp": datetime.now().isoformat(), "hostname": socket.gethostname(), "tests": {}}

    menu_items = [
        ("1", "GPU Information", "gpu_info"),
@ -139,10 +225,13 @@ def interactive_menu(config: dict):
        ("3", "Memory Benchmark (nvbandwidth)", "memory_bench"),
        ("4", "Compute Benchmark", "compute_bench"),
        ("5", "NCCL Multi-GPU Test", "nccl"),
-        ("6", "GPU Stress Test (gpu-burn)", "stress"),
+        ("6", "GPU Stress Test (PyTorch/gpu-burn)", "stress"),
        ("7", "RDMA/IB Test", "rdma"),
-        ("8", "Training Simulation", "training"),
-        ("9", "Full Test Suite (All Tests)", "all"),
+        ("8", "NVLink/NVSwitch Test", "nvlink"),
+        ("9", "DCGM Diagnostic", "dcgm"),
+        ("10", "Training Simulation", "training"),
+        ("11", "Multi-node NCCL Test", "multinode_nccl"),
+        ("12", "Full Test Suite (All Tests)", "all"),
        ("0", "Generate Report", "report"),
    ]

@ -164,9 +253,12 @@ def interactive_menu(config: dict):
            "memory_bench": "HBM bandwidth via nvbandwidth",
            "compute_bench": "GEMM TFLOPS across FP32/TF32/FP16/BF16/FP8",
            "nccl": "AllReduce, AllToAll, Broadcast via nccl-tests",
-            "stress": "Long-running GPU stress via gpu-burn",
+            "stress": "Long-running high-power GEMM stress with telemetry",
            "rdma": "InfiniBand bandwidth & latency (ib_write_bw)",
+            "nvlink": "NVLink links, speed, and error counters",
+            "dcgm": "DCGM diag -r 3 production diagnostic",
            "training": "Simulate LLM training with PyTorch",
+            "multinode_nccl": "Cross-node NCCL via mpirun/nccl-tests",
            "all": "Run all tests sequentially",
            "report": "Export results to JSON/HTML",
        }
@ -257,12 +349,30 @@ def _run_test(test_name: str, config: dict, console: Console) -> dict:
            m.print_results(result)
            return result

+        elif test_name == "nvlink":
+            m = NVLinkTest(config)
+            result = m.run()
+            m.print_results(result)
+            return result
+
+        elif test_name == "dcgm":
+            m = DCGMTest(config)
+            result = m.run()
+            m.print_results(result)
+            return result
+
        elif test_name == "training":
            m = TrainingSim(config)
            result = m.run()
            m.print_results(result)
            return result

+        elif test_name == "multinode_nccl":
+            m = MultiNodeNCCLTest(config)
+            result = m.run()
+            m.print_results(result)
+            return result
+
        elif test_name == "all":
            return _run_full_suite(config, console)

@ -280,17 +390,21 @@ def _run_test(test_name: str, config: dict, console: Console) -> dict:
 def _run_full_suite(config: dict, console: Console) -> dict:
    """Run all tests sequentially."""
    console.print(Panel("[bold cyan]Running Full Test Suite[/bold cyan]", box=box.DOUBLE))
-    all_results: dict = {"timestamp": datetime.now().isoformat()}
+    all_results: dict = {"timestamp": datetime.now().isoformat(), "hostname": socket.gethostname()}
    tests = [
        ("gpu_info", "GPU Information", GPUInfo),
        ("health", "Health Check", HealthCheck),
        ("memory_bench", "Memory Benchmark", lambda c: Benchmark(c)),
        ("compute_bench", "Compute Benchmark", lambda c: Benchmark(c)),
+        ("nvlink", "NVLink/NVSwitch Test", NVLinkTest),
        ("nccl", "NCCL Test", NCCLTest),
        ("stress", "GPU Stress Test", StressTest),
        ("rdma", "RDMA/IB Test", RDMATest),
+        ("dcgm", "DCGM Diagnostic", DCGMTest),
        ("training", "Training Simulation", TrainingSim),
    ]
+    if (config.get("multinode_nccl", {}) or {}).get("enabled"):
+        tests.append(("multinode_nccl", "Multi-node NCCL Test", MultiNodeNCCLTest))

    for i, (key, name, mod_cls) in enumerate(tests, 1):
        console.print(f"\n[bold cyan][{i}/{len(tests)}] {name}[/bold cyan]")
@ -313,14 +427,79 @@ def _run_full_suite(config: dict, console: Console) -> dict:
    # Summary
    console.print("\n" + "=" * 60)
    # Only count test results, exclude metadata like timestamp
-    test_results = {k: v for k, v in all_results.items() if k != "timestamp"}
-    passed = sum(1 for v in test_results.values() if not isinstance(v, dict) or "error" not in v)
+    test_results = {k: v for k, v in all_results.items() if k not in ("timestamp", "hostname")}
+    passed = sum(1 for v in test_results.values() if _test_result_passed(v))
    total = len(test_results)
    color = "green" if passed == total else ("yellow" if passed > 0 else "red")
    console.print(f"[bold {color}]Suite complete: {passed}/{total} tests passed[/bold {color}]")
    return all_results


+def _test_result_passed(result) -> bool:
+    """Return the strict pass/fail verdict for a single test result.
+
+    Drives both the full-suite summary count and the process exit status.
+
+    Args:
+        result: Value returned by a test module's ``run()``; normally a dict.
+
+    Returns:
+        ``False`` if the dict reports an ``error``, is marked skipped
+        (``skipped`` flag or ``status == "SKIP"``), came from the
+        ``torchrun_fallback`` source, carries a falsy ``passed`` flag, or
+        holds a failing ``memory``/``compute`` benchmark section. ``True``
+        otherwise; non-dict results carry no verdict and count as passed.
+    """
+    if not isinstance(result, dict):
+        return True
+    if result.get("error"):
+        return False
+    if result.get("skipped") or result.get("status") == "SKIP":
+        return False
+    if result.get("source") == "torchrun_fallback":
+        return False
+    if "passed" in result:
+        return bool(result.get("passed"))
+    # A benchmark result may hold a memory section, a compute section, or
+    # both; every section that is present has to pass on its own.
+    if "memory" in result and not _memory_section_passed(result["memory"]):
+        return False
+    if "compute" in result and not _compute_section_passed(result["compute"]):
+        return False
+    return True
+
+
+def _memory_section_passed(mem) -> bool:
+    """Judge the ``memory`` section of a benchmark result."""
+    if not isinstance(mem, dict):
+        # A malformed section (None, error string, ...) cannot be a pass.
+        return False
+    if "passed" in mem:
+        return bool(mem.get("passed"))
+    if mem.get("error") or mem.get("source") == "pytorch":
+        return False
+    eff = mem.get("d2d_efficiency_pct") or mem.get("efficiency_pct") or 0
+    # Without an explicit flag, require a numeric efficiency value of at least 80.
+    return isinstance(eff, (int, float)) and eff >= 80
+
+
+def _compute_section_passed(comp) -> bool:
+    """Judge the ``compute`` section of a benchmark result."""
+    if not isinstance(comp, dict):
+        return False
+    if "passed" in comp:
+        return bool(comp.get("passed"))
+    thresholds = comp.get("pass_thresholds_tflops") or {}
+    per_dtype = comp.get("per_dtype_tflops") or {}
+    for dt, threshold in thresholds.items():
+        val = per_dtype.get(dt)
+        # A missing or non-numeric measurement fails its threshold.
+        if not isinstance(val, (int, float)) or val < threshold:
+            return False
+    consistency = comp.get("consistency") or {}
+    return all(isinstance(c, dict) and c.get("passed", False) for c in consistency.values())
+
+
 def main():
    gpu_list_str = " / ".join(g.upper() for g in get_supported_gpus())
    parser = argparse.ArgumentParser(
@ -335,15 +484,18 @@ Examples:
   python gpu_tester.py --test benchmark --type memory
   python gpu_tester.py --test benchmark --type compute --dtype fp16
   python gpu_tester.py --test nccl            # NCCL test
+   python gpu_tester.py --test multinode-nccl  # Cross-node NCCL test
+   python gpu_tester.py --test nvlink          # NVLink/NVSwitch test
+   python gpu_tester.py --test dcgm            # DCGM diagnostic
   python gpu_tester.py --test training        # Training sim
   python gpu_tester.py --test all             # Full suite
   python gpu_tester.py --report --format json --output report.json
        """,
    )
-    parser.add_argument("--test", choices=["gpu-info", "health", "benchmark", "nccl", "stress", "rdma", "training", "all"],
+    parser.add_argument("--test", choices=["gpu-info", "health", "benchmark", "nccl", "multinode-nccl", "stress", "rdma", "nvlink", "dcgm", "training", "all"],
                        help="Run a specific test")
    parser.add_argument("--type", choices=["memory", "compute"], help="Benchmark type (with --test benchmark)")
-    parser.add_argument("--dtype", choices=["fp32", "tf32", "fp16", "bf16", "fp8"],
+    parser.add_argument("--dtype", choices=["fp32", "tf32", "fp16", "bf16", "fp8", "fp64", "int8"],
                        help="Compute benchmark dtype (with --test benchmark --type compute)")
    parser.add_argument("--interactive", action="store_true", help="Force interactive mode")
    parser.add_argument("--report", action="store_true", help="Generate report from last results")
@ -397,8 +549,11 @@ Examples:
        "health": "health",
        "benchmark": None,
        "nccl": "nccl",
+        "multinode-nccl": "multinode_nccl",
        "stress": "stress",
        "rdma": "rdma",
+        "nvlink": "nvlink",
+        "dcgm": "dcgm",
        "training": "training",
        "all": "all",
    }
@ -415,19 +570,30 @@ Examples:
            result = bench.run()
            Benchmark.print_results(result)
        if args.report:
-            ReportGenerator(config).generate({"benchmark": result, "timestamp": datetime.now().isoformat()},
+            ReportGenerator(config).generate({
+                "benchmark": result,
+                "timestamp": datetime.now().isoformat(),
+                "hostname": socket.gethostname(),
+            },
                                             fmt=args.format, output=args.output)
+        sys.exit(0 if _test_result_passed(result) else 1)
    elif args.test == "all":
        results = _run_full_suite(config, console)
        if args.report:
            ReportGenerator(config).generate(results, fmt=args.format, output=args.output)
-        has_errors = any("error" in v for v in results.values() if isinstance(v, dict))
-        sys.exit(1 if has_errors else 0)
+        failed = any(not _test_result_passed(v) for k, v in results.items() if k not in ("timestamp", "hostname"))
+        sys.exit(1 if failed else 0)
    else:
        result = _run_test(test_map[args.test], config, console)
        if args.report and result:
-            ReportGenerator(config).generate({args.test: result, "timestamp": datetime.now().isoformat()},
+            report_key = test_map[args.test] or args.test
+            ReportGenerator(config).generate({
+                report_key: result,
+                "timestamp": datetime.now().isoformat(),
+                "hostname": socket.gethostname(),
+            },
                                             fmt=args.format, output=args.output)
+        sys.exit(0 if _test_result_passed(result) else 1)


 if __name__ == "__main__":
--- a/modules/dcgm_test.py
+++ b/modules/dcgm_test.py
@ -0,0 +1,231 @@
+"""DCGM diagnostic acceptance wrapper."""
+
+import json
+import os
+import re
+import shutil
+import signal
+import subprocess
+from datetime import datetime
+from typing import Optional
+
+from rich.console import Console
+from rich.table import Table
+
+
+class DCGMTest:
+    """Run ``dcgmi diag`` and turn its output into a pass/fail acceptance result.
+
+    Settings are read from the ``dcgm`` section of the config mapping:
+    ``diag_level``, ``timeout_sec``, ``expected_num_gpus``, ``json_output``
+    and ``require_subtests``.
+    """
+
+    def __init__(self, config: dict):
+        self.config = config
+        self.console = Console()
+        # A ``dcgm: null`` entry loads as None; fall back to an empty mapping
+        # so the ``.get`` lookups in run() keep working.
+        self.cfg = config.get("dcgm") or {}
+
+    def run(self) -> dict:
+        """Execute the diagnostic and return a result dict.
+
+        Returns:
+            A dict that always has ``passed`` and ``timestamp``. Early exits
+            (missing ``dcgmi``, timeout) carry ``error``; a completed run
+            carries ``returncode``, ``level``, ``command``,
+            ``expected_num_gpus``, ``subtests`` and ``raw_output_tail``.
+        """
+        dcgmi = shutil.which("dcgmi")
+        if not dcgmi:
+            return {
+                "passed": False,
+                "error": "dcgmi not found",
+                "timestamp": datetime.now().isoformat(),
+            }
+
+        level = str(self.cfg.get("diag_level", 3))
+        timeout = int(self.cfg.get("timeout_sec", 1200))
+        cmd = [dcgmi, "diag", "-r", level]
+        expected_gpus = self.cfg.get("expected_num_gpus")
+        if expected_gpus:
+            cmd.extend(["-n", f"gpu:{int(expected_gpus)}"])
+        if self.cfg.get("json_output", True):
+            cmd.append("-j")
+
+        try:
+            r = self._run_with_process_group_timeout(cmd, timeout)
+        except subprocess.TimeoutExpired as e:
+            output = ((e.output or "") + "\n" + (e.stderr or "")).strip()
+            return {
+                "passed": False,
+                "error": f"dcgmi diag -r {level} timeout after {timeout}s",
+                "command": cmd,
+                "raw_output_tail": output[-8000:],
+                "timestamp": datetime.now().isoformat(),
+            }
+
+        output = r.stdout + "\n" + r.stderr
+        # Prefer structured JSON; fall back to scraping the text table.
+        subtests = self._parse_json_output(output) or self._parse_output(output)
+        # Strict acceptance: WARN / SKIP / ERROR all count as not passed.
+        strict_statuses = {"PASS"}
+        failed = [s for s in subtests if s["status"] not in strict_statuses]
+        require_subtests = bool(self.cfg.get("require_subtests", True))
+        passed = r.returncode == 0 and not failed and (bool(subtests) or not require_subtests)
+        return {
+            "passed": passed,
+            "returncode": r.returncode,
+            # Keep the integer form for numeric levels; a non-numeric value
+            # (e.g. a named test list) is reported verbatim instead of raising.
+            "level": int(level) if level.isdigit() else level,
+            "command": cmd,
+            "expected_num_gpus": int(expected_gpus) if expected_gpus else None,
+            "subtests": subtests,
+            "raw_output_tail": output[-8000:],
+            "timestamp": datetime.now().isoformat(),
+        }
+
+    @staticmethod
+    def _run_with_process_group_timeout(cmd: list[str], timeout: int) -> subprocess.CompletedProcess:
+        """Run ``cmd`` in its own session and kill the whole group on timeout.
+
+        Raises:
+            subprocess.TimeoutExpired: when ``cmd`` does not finish within
+                ``timeout`` seconds. ``output`` / ``stderr`` on the exception
+                are always text (possibly empty).
+        """
+        proc = subprocess.Popen(
+            cmd,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            text=True,
+            start_new_session=True,
+        )
+        try:
+            stdout, stderr = proc.communicate(timeout=timeout)
+        except subprocess.TimeoutExpired as e:
+            stdout, stderr = "", ""
+            # Escalate SIGTERM -> SIGKILL, giving each signal 10s to take effect.
+            for sig in (signal.SIGTERM, signal.SIGKILL):
+                try:
+                    # start_new_session=True makes proc.pid the group id.
+                    os.killpg(proc.pid, sig)
+                except ProcessLookupError:
+                    # Group already gone; still try to collect its output.
+                    pass
+                try:
+                    stdout, stderr = proc.communicate(timeout=10)
+                    break
+                except subprocess.TimeoutExpired:
+                    continue
+            raise subprocess.TimeoutExpired(cmd, timeout, output=stdout, stderr=stderr) from e
+        return subprocess.CompletedProcess(cmd, proc.returncode, stdout, stderr)
+
+    @classmethod
+    def _parse_json_output(cls, output: str) -> list[dict]:
+        """Extract subtest entries from JSON output.
+
+        Returns an empty list when no JSON document can be decoded or when it
+        contains no recognizable status values.
+        """
+        text = output.strip()
+        if not text:
+            return []
+        try:
+            payload = json.loads(text)
+        except json.JSONDecodeError:
+            # Output may carry extra text around the document; retry on the
+            # span from the first "{" to the last "}".
+            m = re.search(r"(\{.*\})", text, re.S)
+            if not m:
+                return []
+            try:
+                payload = json.loads(m.group(1))
+            except json.JSONDecodeError:
+                return []
+
+        dcgm_payload = payload.get("DCGM Diagnostic") if isinstance(payload, dict) else None
+        if isinstance(dcgm_payload, dict):
+            parsed = cls._parse_dcgm_diagnostic_json(dcgm_payload)
+            if parsed:
+                return parsed
+
+        # Generic fallback: walk the whole document and collect every mapping
+        # that carries a recognizable status string.
+        subtests = []
+
+        def walk(node, path: list[str]):
+            if isinstance(node, dict):
+                node_name = (
+                    node.get("name")
+                    or node.get("testName")
+                    or node.get("test_name")
+                    or node.get("category")
+                    or node.get("category_name")
+                )
+                child_path = [*path, str(node_name)] if node_name else path
+                status = node.get("status") or node.get("result") or node.get("Result")
+                if isinstance(status, str):
+                    name = (
+                        node_name
+                        or " / ".join(path[-3:])
+                    )
+                    normalized = cls._normalize_status(status)
+                    if normalized:
+                        subtests.append({
+                            "name": str(name)[:160],
+                            "status": normalized,
+                            "raw": json.dumps(node, default=str)[:1000],
+                        })
+                for key, value in node.items():
+                    walk(value, [*child_path, str(key)])
+            elif isinstance(node, list):
+                for idx, item in enumerate(node):
+                    walk(item, [*path, str(idx)])
+
+        walk(payload, [])
+        return subtests
+
+    @classmethod
+    def _parse_dcgm_diagnostic_json(cls, payload: dict) -> list[dict]:
+        """Flatten the ``test_categories`` / ``tests`` / ``results`` layout.
+
+        Emits one entry per per-entity result plus one ``.../summary`` entry
+        per test whose ``test_summary`` has a recognizable status.
+        """
+        subtests = []
+        for category in payload.get("test_categories", []) or []:
+            category_name = str(category.get("category") or "DCGM")
+            for test in category.get("tests", []) or []:
+                test_name = str(test.get("name") or "unnamed")
+                for result in test.get("results", []) or []:
+                    status = cls._normalize_status(str(result.get("status", "")))
+                    if not status:
+                        continue
+                    entity_group = result.get("entity_group") or "entity"
+                    entity_id = result.get("entity_id", "unknown")
+                    name = f"{category_name}/{test_name}/{entity_group}{entity_id}"
+                    subtests.append({
+                        "name": name[:160],
+                        "status": status,
+                        "raw": json.dumps(result, default=str)[:1000],
+                    })
+                summary = test.get("test_summary") or {}
+                status = cls._normalize_status(str(summary.get("status", "")))
+                if status:
+                    subtests.append({
+                        "name": f"{category_name}/{test_name}/summary"[:160],
+                        "status": status,
+                        "raw": json.dumps(summary, default=str)[:1000],
+                    })
+        return subtests
+
+    @staticmethod
+    def _normalize_status(status: str) -> str:
+        """Map a raw status string to PASS/FAIL/ERROR/WARN/SKIP, or "" if unknown."""
+        aliases = {
+            "PASS": "PASS",
+            "PASSED": "PASS",
+            "OK": "PASS",
+            "FAIL": "FAIL",
+            "FAILED": "FAIL",
+            "ERROR": "ERROR",
+            "WARN": "WARN",
+            "WARNING": "WARN",
+            "SKIP": "SKIP",
+            "SKIPPED": "SKIP",
+            "NOT_RUN": "SKIP",
+            "NOT RUN": "SKIP",
+        }
+        # Every canonical status is already a key above, so a plain lookup
+        # with an empty default covers all cases.
+        return aliases.get(status.strip().upper(), "")
+
+    @staticmethod
+    def _parse_output(output: str) -> list[dict]:
+        """Scrape ``name : STATUS`` or ``STATUS - name`` lines from text output."""
+        subtests = []
+        for line in output.splitlines():
+            stripped = line.strip()
+            if not stripped:
+                continue
+            # Name-first layout is tried before the status-first layout.
+            m = re.search(r"(.+?)\s*[:|]\s*(PASS|FAIL|WARN|ERROR|SKIP)\b", stripped, re.I)
+            if m:
+                name = m.group(1).strip(" .|-")
+                status = DCGMTest._normalize_status(m.group(2))
+            else:
+                m = re.search(r"\b(PASS|FAIL|WARN|ERROR|SKIP)\b\s*[-:|]\s*(.+)", stripped, re.I)
+                if not m:
+                    continue
+                status = DCGMTest._normalize_status(m.group(1))
+                name = m.group(2).strip()
+            if name and len(name) < 160:
+                subtests.append({"name": name, "status": status, "raw": stripped})
+        return subtests
+
+    @staticmethod
+    def print_results(results: dict, console: Optional[Console] = None):
+        """Print the verdict and a subtest table; print only the error if one is set."""
+        c = console or Console()
+        if results.get("error"):
+            c.print(f"[bold red]DCGM error: {results['error']}[/bold red]")
+            return
+        passed = results.get("passed", False)
+        c.print("[bold green]✓ DCGM diag PASSED[/bold green]" if passed else "[bold red]✗ DCGM diag FAILED[/bold red]")
+        subtests = results.get("subtests", [])
+        if subtests:
+            table = Table(box=None, padding=(0, 1))
+            table.add_column("Subtest")
+            table.add_column("Status", style="bold")
+            for s in subtests:
+                table.add_row(s.get("name", ""), s.get("status", ""))
+            c.print(table)
--- a/modules/health_check.py
+++ b/modules/health_check.py
@ -171,6 +171,10 @@ class HealthCheck:
            gpu_health.append({"index": i, "status": worst, "checks": checks})

        system_health = self._check_system()
+        for key in ("fabricmanager", "retired_pages", "kernel_errors"):
+            item = system_health.get(key, {})
+            if isinstance(item, dict) and item.get("status") == "FAIL":
+                overall_pass = False

        return {
            "passed": overall_pass,
@ -228,6 +232,9 @@ class HealthCheck:
            rdma_devs = os.listdir("/sys/class/infiniband_verbs")

        nccl_env = {k: v for k, v in os.environ.items() if k.startswith("NCCL_")}
+        fabric = self._check_fabricmanager()
+        retired = self._check_retired_pages()
+        kernel_errors = self._check_kernel_errors()

        return {
            "nvidia_persistenced": {"installed": persistd, "running": persistd_running},
@ -238,6 +245,41 @@ class HealthCheck:
            "infiniband_devices": ib_devs,
            "rdma_devices": rdma_devs,
            "nccl_env_vars": nccl_env,
+            "fabricmanager": fabric,
+            "retired_pages": retired,
+            "kernel_errors": kernel_errors,
+        }
+
+    def _check_fabricmanager(self) -> dict:
+        """Report whether nvidia-fabricmanager is active and its recent logs look clean.
+
+        PASS requires the unit to report ``active`` and the last 200 journal
+        lines to contain neither "ERROR" nor "FAILED" (case-insensitive).
+        """
+        unit_state = self._run_cmd(["systemctl", "is-active", "nvidia-fabricmanager"], timeout=5)
+        journal = self._run_cmd(["journalctl", "-u", "nvidia-fabricmanager", "-n", "200", "--no-pager"], timeout=10) or ""
+        journal_upper = journal.upper()
+        is_active = unit_state == "active"
+        error_seen = any(marker in journal_upper for marker in ("ERROR", "FAILED"))
+        verdict = "FAIL" if (error_seen or not is_active) else "PASS"
+        return {
+            "active": is_active,
+            "has_error_logs": error_seen,
+            "status": verdict,
+        }
+
+    def _check_retired_pages(self) -> dict:
+        """Sum retired-page counters and detect a pending page blacklist.
+
+        Parses ``nvidia-smi -q -d PAGE_RETIREMENT``. PASS requires a total of
+        zero retired pages and no pending blacklist on any GPU. Non-numeric
+        counter values (e.g. "N/A") are ignored.
+        """
+        import re
+
+        raw = self._run_cmd(["nvidia-smi", "-q", "-d", "PAGE_RETIREMENT"], timeout=30) or ""
+        # Counts may sit on the "Retired Pages ... : N" line itself or on the
+        # per-type lines listed beneath a bare "Retired Pages" heading.
+        counter = re.compile(
+            r"(?:Retired Pages[^\n:]*|Single Bit ECC|Double Bit ECC)[ \t]*:[ \t]*(\d+)",
+            flags=re.I,
+        )
+        total = sum(int(n) for n in counter.findall(raw))
+        # Tie "Yes" to the blacklist field itself so an unrelated "Yes"
+        # elsewhere in the output cannot flip the verdict.
+        pending = re.search(r"Pending Page Blacklist[ \t]*:[ \t]*Yes\b", raw, flags=re.I) is not None
+        return {
+            "retired_pages": total,
+            "pending_blacklist": pending,
+            "status": "PASS" if total == 0 and not pending else "FAIL",
+        }
+
+    def _check_kernel_errors(self) -> dict:
+        """Scan error-level kernel messages for GPU / PCIe related entries.
+
+        Reads ``dmesg`` restricted to err/crit/alert/emerg and keeps lines
+        containing XID, AER, PCIE or NVRM (case-insensitive). Returns the hit
+        count, the last 20 hits, and PASS only when there are no hits. When
+        the command yields no output the result is PASS with zero hits.
+        """
+        raw = self._run_cmd(["dmesg", "--ctime", "--level=err,crit,alert,emerg"], timeout=10) or ""
+        keywords = ("XID", "AER", "PCIE", "NVRM")
+        hits = []
+        for line in raw.splitlines():
+            # Upper-case each line once rather than once per keyword.
+            upper_line = line.upper()
+            if any(k in upper_line for k in keywords):
+                hits.append(line)
+        return {
+            "count": len(hits),
+            "tail": hits[-20:],
+            "status": "PASS" if not hits else "FAIL",
        }

    @staticmethod
--- a/modules/nccl_test.py
+++ b/modules/nccl_test.py
@ -5,6 +5,8 @@ import os
 import re
 import shutil
 import subprocess
+import statistics
+import sys
 from datetime import datetime
 from typing import Optional

@ -70,6 +72,38 @@ class NCCLTest:
                return p
        return None

+    def _message_sizes(self) -> list[str]:
+        """Return the configured message sizes as a list of strings.
+
+        Falls back to 1M / 256M / 2G when ``message_sizes`` is unset or empty.
+        A single scalar is treated as a one-element list instead of being
+        split into characters, and numeric entries are stringified because
+        the values are passed as command-line arguments.
+        """
+        sizes = self.nccl_cfg.get("message_sizes") or ["1M", "256M", "2G"]
+        if isinstance(sizes, (str, int)):
+            sizes = [sizes]
+        return [str(size) for size in sizes]
+
+    def _repeats(self) -> int:
+        """Return how many times each message size is run (default 3, minimum 1).
+
+        An explicit ``null`` falls back to the default, and values below 1
+        are raised to 1 so every size gets at least one measurement.
+        """
+        value = self.nccl_cfg.get("repeats")
+        if value is None:
+            return 3
+        return max(1, int(value))
+
+    def _max_stddev_pct(self) -> float:
+        """Return the allowed run-to-run standard deviation in percent (default 3).
+
+        An explicit ``null`` falls back to the default; 0 is kept as a valid,
+        maximally strict threshold.
+        """
+        value = self.nccl_cfg.get("max_stddev_pct")
+        return 3.0 if value is None else float(value)
+
+    def _runtime_env(self) -> dict:
+        """Build the subprocess environment used for NCCL runs.
+
+        Copies the current environment, forces ``NCCL_DEBUG=WARN`` and
+        prepends every existing NCCL library directory to
+        ``LD_LIBRARY_PATH``: ``<nccl_home>/lib`` (env ``NCCL_HOME`` wins over
+        the ``nccl_home`` config key), ``nvidia/nccl/lib`` under each
+        ``sys.path`` entry, and the same layout under the interpreter's
+        ``lib/python*/site-packages``.
+        """
+        env = dict(os.environ)
+        env["NCCL_DEBUG"] = "WARN"
+
+        home = env.get("NCCL_HOME") or self.nccl_cfg.get("nccl_home")
+        candidates = [os.path.join(str(home), "lib")] if home else []
+        candidates += [os.path.join(entry, "nvidia", "nccl", "lib") for entry in sys.path]
+
+        interpreter_root = os.path.dirname(os.path.dirname(sys.executable))
+        pattern = os.path.join(interpreter_root, "lib", "python*", "site-packages", "nvidia", "nccl", "lib")
+        candidates += glob.glob(pattern)
+
+        # Keep first-seen order, dropping duplicates and non-directories.
+        usable = []
+        for candidate in candidates:
+            if not candidate or candidate in usable or not os.path.isdir(candidate):
+                continue
+            usable.append(candidate)
+
+        if usable:
+            previous = env.get("LD_LIBRARY_PATH", "")
+            env["LD_LIBRARY_PATH"] = ":".join([*usable, previous] if previous else usable)
+        return env
+
    def run(self) -> dict:
        gpu_count = 0
        if TORCH_AVAILABLE:
@ -89,7 +123,7 @@ class NCCLTest:
        if self.nccl_cfg.get("test_reduce_scatter", False):
            tests.append(("reduce_scatter_perf", "ReduceScatter"))
        if self.nccl_cfg.get("test_allgather", False):
-            tests.append(("allgather_perf", "AllGather"))
+            tests.append(("all_gather_perf", "AllGather"))
        if self.nccl_cfg.get("test_sendrecv", False):
            tests.append(("sendrecv_perf", "SendRecv"))

@ -170,39 +204,7 @@ class NCCLTest:
        if not binary:
            return {"status": "SKIP", "error": f"{binary_name} not found"}

-        cmd = [
-            binary,
-            "-b", "8M",
-            "-e", "8G",
-            "-f", "2",
-            "-g", str(gpu_count),
-            "-w", "5",
-            "-n", "20",
-        ]
-
-        try:
-            env = os.environ.copy()
-            env["NCCL_DEBUG"] = "WARN"
-            r = subprocess.run(cmd, capture_output=True, text=True, timeout=180, env=env)
-
-            combined = r.stdout + r.stderr
-            # Check for NCCL/CUDA compatibility errors
-            if "CUDA driver version is insufficient" in combined or \
-               "Test NCCL failure" in combined:
-                error_msg = "NCCL/CUDA driver version mismatch" \
-                    if "CUDA driver version" in combined \
-                    else "NCCL test failure (library incompatibility)"
-                return {"status": "FAIL", "error": error_msg}
-
-            if r.returncode != 0:
-                return {"status": "FAIL", "error": r.stderr[:300]}
-
-            return self._parse_nccl_output(r.stdout, min_bw)
-
-        except subprocess.TimeoutExpired:
-            return {"status": "FAIL", "error": "timeout"}
-        except Exception as e:
-            return {"status": "FAIL", "error": str(e)}
+        return self._run_nccl_matrix([binary, "-g", str(gpu_count)], min_bw)

    def _run_one_nccl_test_mpirun(self, binary_name: str, label: str,
                                   gpu_count: int, mpirun: str, min_bw: float) -> dict:
@ -218,37 +220,64 @@ class NCCLTest:
            "-x", "NCCL_DEBUG=WARN",
            "-x", "CUDA_VISIBLE_DEVICES=" + ",".join(str(i) for i in range(gpu_count)),
            binary,
-            "-b", "8",
-            "-e", "256M",
-            "-f", "2",
            "-g", "1",
-            "-w", "5",
-            "-n", "20",
        ]

+        return self._run_nccl_matrix(cmd, min_bw)
+
+    def _run_nccl_matrix(self, base_cmd: list[str], min_bw: float) -> dict:
+        """Run ``base_cmd`` once per configured message size and repeat, then aggregate.
+
+        Each invocation pins begin and end size to the same value (``-b size
+        -e size``), so one run measures a single message size. A size passes
+        when its slowest run reaches ``min_bw`` and the population standard
+        deviation across runs (as a percentage of the mean) stays within
+        ``_max_stddev_pct()``. The overall status is PASS only when every
+        size passes and no individual run was recorded as a failure.
+
+        Returns:
+            A dict with ``status``, ``best_busbw_gbps`` (highest per-size
+            mean), ``worst_busbw_gbps`` (lowest per-size worst run),
+            ``min_required_gbps``, ``max_stddev_pct``, ``by_size`` and
+            ``failures``. On a timeout or unexpected exception only
+            ``status`` and ``error`` are returned; results collected so far
+            are dropped.
+        """
+        size_results = []
+        failures = []
+        env = self._runtime_env()
+
        try:
-            env = os.environ.copy()
-            env["NCCL_DEBUG"] = "WARN"
-            r = subprocess.run(cmd, capture_output=True, text=True, timeout=180, env=env)
-
-            combined = r.stdout + r.stderr
-            if "CUDA driver version is insufficient" in combined or \
-               "Test NCCL failure" in combined:
-                error_msg = "NCCL/CUDA driver version mismatch" \
-                    if "CUDA driver version" in combined \
-                    else "NCCL test failure (library incompatibility)"
-                return {"status": "FAIL", "error": error_msg}
-
-            if r.returncode != 0:
-                return {"status": "FAIL", "error": r.stderr[:300]}
-
-            return self._parse_nccl_output(r.stdout, min_bw)
+            for size in self._message_sizes():
+                runs = []
+                for _ in range(self._repeats()):
+                    cmd = [*base_cmd, "-b", size, "-e", size, "-f", "2", "-w", "5", "-n", "20"]
+                    r = subprocess.run(cmd, capture_output=True, text=True, timeout=300, env=env)
+                    combined = r.stdout + r.stderr
+                    # A failed run is recorded and skipped; the remaining
+                    # repeats for this size still execute.
+                    if "CUDA driver version is insufficient" in combined or "Test NCCL failure" in combined:
+                        failures.append({"size": size, "error": "NCCL/CUDA/library failure"})
+                        continue
+                    if r.returncode != 0:
+                        failures.append({"size": size, "error": r.stderr[:300]})
+                        continue
+                    parsed = self._parse_nccl_output(r.stdout, min_bw)
+                    runs.append(parsed.get("best_busbw_gbps", 0))
+                if runs:
+                    worst = min(runs)
+                    mean = sum(runs) / len(runs)
+                    # Spread is only meaningful with 2+ runs and a non-zero mean.
+                    std_pct = (statistics.pstdev(runs) / mean * 100) if len(runs) > 1 and mean else 0
+                    size_results.append({
+                        "size": size,
+                        "runs_busbw_gbps": [round(v, 1) for v in runs],
+                        "worst_busbw_gbps": round(worst, 1),
+                        "mean_busbw_gbps": round(mean, 1),
+                        "stddev_pct": round(std_pct, 2),
+                        "status": "PASS" if worst >= min_bw and std_pct <= self._max_stddev_pct() else "FAIL",
+                    })
+                else:
+                    # Every repeat for this size failed.
+                    size_results.append({"size": size, "status": "FAIL", "runs_busbw_gbps": []})

        except subprocess.TimeoutExpired:
            return {"status": "FAIL", "error": "timeout"}
        except Exception as e:
            return {"status": "FAIL", "error": str(e)}

+        best_bus = max((r.get("mean_busbw_gbps", 0) for r in size_results), default=0)
+        # Sizes with no successful run are excluded from the worst-case figure.
+        worst_bus = min((r.get("worst_busbw_gbps", 0) for r in size_results if r.get("runs_busbw_gbps")), default=0)
+        passed = bool(size_results) and all(r.get("status") == "PASS" for r in size_results) and not failures
+        return {
+            "status": "PASS" if passed else "FAIL",
+            "best_busbw_gbps": round(best_bus, 1),
+            "worst_busbw_gbps": round(worst_bus, 1),
+            "min_required_gbps": min_bw,
+            "max_stddev_pct": self._max_stddev_pct(),
+            "by_size": size_results,
+            "failures": failures,
+        }
+
    @staticmethod
    def _parse_nccl_output(stdout: str, min_bw: float) -> dict:
        """Parse nccl-tests tabular output and extract bandwidth results."""
@ -363,7 +392,7 @@ dist.destroy_process_group()
            r = subprocess.run(
                [torchrun_cmd, f"--nproc_per_node={gpu_count}", tmp.name],
                capture_output=True, text=True, timeout=120,
-                env={**os.environ, "NCCL_DEBUG": "WARN"},
+                env=self._runtime_env(),
            )
            os.unlink(tmp.name)
            
@ -390,10 +419,15 @@ dist.destroy_process_group()
                }
            
            return {
-                "passed": all_passed,
+                # torchrun fallback is a functional smoke only. It never proves
+                # production bus bandwidth, so it must not satisfy acceptance.
+                "passed": False,
+                "functional_passed": all_passed,
                "source": "torchrun_fallback",
                "tests": tests,
                "gpu_count": gpu_count,
+                "error": None if all_passed else "torchrun functional NCCL smoke failed",
+                "acceptance_gap": "nccl-tests bus bandwidth was not measured",
            }
        except Exception as e:
            return {"passed": False, "source": "torchrun_fallback", "error": str(e)}
@ -410,7 +444,8 @@ dist.destroy_process_group()
        
        if source == "torchrun_fallback":
            # Connectivity check mode
-            verdict = "[bold green]✓ NCCL Connectivity OK[/bold green]" if passed else "[bold red]✗ NCCL Connectivity FAILED[/bold red]"
+            functional = results.get("functional_passed", passed)
+            verdict = "[bold yellow]⚠ NCCL bus BW NOT VERIFIED[/bold yellow]" if functional else "[bold red]✗ NCCL Connectivity FAILED[/bold red]"
            c.print(f"{verdict} [dim](basic check via torchrun)[/dim]")
            
            tests = results.get("tests", {})
@ -427,7 +462,7 @@ dist.destroy_process_group()
                    else:
                        c.print(f"  [{s_color}]{op_name}[/{s_color}]")
            
-            c.print("\n[yellow]Note: functional connectivity test only (no performance data)[/yellow]")
+            c.print("\n[yellow]Note: functional connectivity test only (no bus bandwidth data; acceptance FAIL)[/yellow]")
        else:
            # nccl-tests mode
            verdict = "[bold green]✓ NCCL tests PASSED[/bold green]" if passed else "[bold yellow]⚠ NCCL tests WARNING[/bold yellow]"
@ -448,12 +483,16 @@ dist.destroy_process_group()
                if by_size:
                    t = Table(box=None, padding=(0, 1))
                    t.add_column("Size", style="bold", justify="right")
-                    t.add_column("Time (us)", justify="right")
-                    t.add_column("Alg BW (GB/s)", justify="right")
-                    t.add_column("Bus BW (GB/s)", justify="right")
+                    t.add_column("Worst Bus BW", justify="right")
+                    t.add_column("Mean Bus BW", justify="right")
+                    t.add_column("StdDev", justify="right")
+                    t.add_column("Status", justify="right")
                    for r in by_size:
-                        sz = r.get("size", 0)
-                        sz_str = f"{sz/1024:.0f}K" if sz < 1048576 else f"{sz/1048576:.0f}M"
-                        t.add_row(sz_str, f"{r.get('time_us',0):.1f}",
-                                  f"{r.get('algbw_gbps',0):.1f}", f"{r.get('busbw_gbps',0):.1f}")
+                        t.add_row(
+                            str(r.get("size", "")),
+                            f"{r.get('worst_busbw_gbps', 0):.1f}",
+                            f"{r.get('mean_busbw_gbps', 0):.1f}",
+                            f"{r.get('stddev_pct', 0):.2f}%",
+                            r.get("status", "?"),
+                        )
                    c.print(t)
--- a/modules/nvlink_test.py
+++ b/modules/nvlink_test.py
@ -0,0 +1,188 @@
+"""NVLink / NVSwitch production acceptance checks."""
+
+import re
+import shutil
+import subprocess
+from datetime import datetime
+from typing import Optional
+
+from rich.console import Console
+from rich.table import Table
+
+
+class NVLinkTest:
+    """Check NVLink link state, speed and error counters via ``nvidia-smi nvlink``.
+
+    Settings are read from the ``nvlink`` section of the config mapping:
+    ``expected_links_per_gpu``, ``expected_link_speed_gbps`` and
+    ``require_zero_errors``.
+    """
+
+    def __init__(self, config: dict):
+        self.config = config
+        self.console = Console()
+        # A ``nvlink: null`` entry loads as None; fall back to an empty
+        # mapping so the ``.get`` lookups in run() keep working.
+        self.cfg = config.get("nvlink") or {}
+
+    def _run(self, args: list[str], timeout: int = 60) -> tuple[int, str, str]:
+        """Run ``nvidia-smi <args>`` and return ``(returncode, stdout, stderr)``.
+
+        Returns code 127 when ``nvidia-smi`` is not on PATH and code 124 when
+        the command exceeds ``timeout`` seconds, so callers see a failed
+        command instead of an exception.
+        """
+        if not shutil.which("nvidia-smi"):
+            return 127, "", "nvidia-smi not found"
+        try:
+            r = subprocess.run(["nvidia-smi", *args], capture_output=True, text=True, timeout=timeout)
+        except subprocess.TimeoutExpired:
+            return 124, "", f"nvidia-smi {' '.join(args)} timed out after {timeout}s"
+        return r.returncode, r.stdout, r.stderr
+
+    def run(self) -> dict:
+        """Collect link status, speeds and error counters and judge each GPU.
+
+        A GPU passes when its active link count equals the expected count, no
+        link is inactive, no known link speed is below the expected speed and
+        (when ``require_zero_errors`` is set) no parsed error counter is
+        non-zero.
+
+        Returns:
+            A dict with ``passed`` and ``timestamp``; on early failure it
+            carries ``error``, otherwise the thresholds, per-GPU results
+            under ``gpus`` and tails of the raw command output.
+        """
+        expected_links = int(self.cfg.get("expected_links_per_gpu", 18))
+        expected_speed = float(self.cfg.get("expected_link_speed_gbps", 25))
+        require_zero_errors = bool(self.cfg.get("require_zero_errors", True))
+
+        rc_s, out_s, err_s = self._run(["nvlink", "-s"])
+        rc_c, out_c, err_c = self._run(["nvlink", "-c"])
+        rc_e, out_e, err_e = self._run(["nvlink", "-e"])
+
+        if rc_s != 0:
+            return {
+                "passed": False,
+                "error": (err_s or out_s or "nvidia-smi nvlink -s failed")[:1000],
+                "timestamp": datetime.now().isoformat(),
+            }
+
+        links = self._parse_status(out_s)
+        if not links:
+            return {
+                "passed": False,
+                "error": "no NVLink status entries parsed from nvidia-smi nvlink -s",
+                "raw_status": out_s[-4000:],
+                "timestamp": datetime.now().isoformat(),
+            }
+        # NOTE: a failing "-c" or "-e" call yields an empty mapping, so the
+        # corresponding speed / error checks are skipped rather than failed.
+        speeds = self._parse_speeds(out_c) if rc_c == 0 else {}
+        # Speeds printed in the status output only fill gaps left by "-c".
+        status_speeds = self._parse_speeds(out_s)
+        for gpu, gpu_speeds in status_speeds.items():
+            speeds.setdefault(gpu, {}).update({k: v for k, v in gpu_speeds.items() if k not in speeds.get(gpu, {})})
+        errors = self._parse_errors(out_e) if rc_e == 0 else {}
+
+        gpu_results = []
+        overall = True
+        for gpu, gpu_links in sorted(links.items(), key=lambda x: int(x[0])):
+            active = sum(1 for l in gpu_links.values() if l.get("active"))
+            inactive = [lid for lid, l in gpu_links.items() if not l.get("active")]
+            speed_bad = []
+            for lid in gpu_links:
+                speed = speeds.get(gpu, {}).get(lid)
+                # Links with no reported speed are not counted as slow.
+                if speed is not None and speed < expected_speed:
+                    speed_bad.append({"link": lid, "speed_gbps": speed})
+            err_bad = []
+            if require_zero_errors:
+                for lid, counters in errors.get(gpu, {}).items():
+                    total = sum(v for v in counters.values() if isinstance(v, int))
+                    if total:
+                        err_bad.append({"link": lid, "counters": counters})
+
+            passed = active == expected_links and not inactive and not speed_bad and not err_bad
+            if not passed:
+                overall = False
+            gpu_results.append({
+                "gpu": int(gpu),
+                "active_links": active,
+                "expected_links": expected_links,
+                "inactive_links": inactive,
+                "speed_issues": speed_bad,
+                "error_issues": err_bad,
+                "passed": passed,
+            })
+
+        return {
+            "passed": overall,
+            "expected_links_per_gpu": expected_links,
+            "expected_link_speed_gbps": expected_speed,
+            "require_zero_errors": require_zero_errors,
+            "gpus": gpu_results,
+            "raw_status": out_s[-4000:],
+            "raw_speed": out_c[-4000:] if out_c else "",
+            "raw_errors": out_e[-4000:] if out_e else "",
+            "timestamp": datetime.now().isoformat(),
+        }
+
+    @staticmethod
+    def _parse_status(text: str) -> dict[str, dict[str, dict]]:
+        """Parse per-GPU link state into ``{gpu: {link: {state, active, raw}}}``.
+
+        A link line carrying an explicit state word uses that state; a link
+        line that only reports a ``GB/s`` figure is treated as active.
+        """
+        result: dict[str, dict[str, dict]] = {}
+        gpu = None
+        for line in text.splitlines():
+            m_gpu = re.search(r"GPU\s+(\d+)", line, re.I)
+            if m_gpu:
+                gpu = m_gpu.group(1)
+                result.setdefault(gpu, {})
+                continue
+            if gpu is None:
+                continue
+            m_link = re.search(r"Link\s+(\d+).*?(Active|Inactive|Disabled|Off|Down)", line, re.I)
+            if m_link:
+                state = m_link.group(2)
+                result[gpu][m_link.group(1)] = {
+                    "state": state,
+                    "active": state.lower() == "active",
+                    "raw": line.strip(),
+                }
+                continue
+            m_speed = re.search(r"Link\s+(\d+).*?([0-9.]+)\s*GB/s", line, re.I)
+            if m_speed:
+                result[gpu][m_speed.group(1)] = {
+                    "state": "Active",
+                    "active": True,
+                    "raw": line.strip(),
+                }
+        return result
+
+    @staticmethod
+    def _parse_speeds(text: str) -> dict[str, dict[str, float]]:
+        """Parse per-link ``GB/s`` figures into ``{gpu: {link: speed}}``."""
+        result: dict[str, dict[str, float]] = {}
+        gpu = None
+        for line in text.splitlines():
+            m_gpu = re.search(r"GPU\s+(\d+)", line, re.I)
+            if m_gpu:
+                gpu = m_gpu.group(1)
+                result.setdefault(gpu, {})
+                continue
+            if gpu is None:
+                continue
+            # Require a well-formed number so float() cannot be handed a
+            # stray "." or a value with several dots.
+            m_link = re.search(r"Link\s+(\d+).*?(\d+(?:\.\d+)?)\s*GB/s", line, re.I)
+            if m_link:
+                result[gpu][m_link.group(1)] = float(m_link.group(2))
+        return result
+
+    @staticmethod
+    def _parse_errors(text: str) -> dict[str, dict[str, dict[str, int]]]:
+        """Parse CRC / Replay / Recovery counters into ``{gpu: {link: {name: count}}}``.
+
+        Counter names are stored lower-cased. Counter lines are attributed to
+        the most recently seen link of the most recently seen GPU.
+        """
+        result: dict[str, dict[str, dict[str, int]]] = {}
+        gpu = None
+        link = None
+        for line in text.splitlines():
+            m_gpu = re.search(r"GPU\s+(\d+)", line, re.I)
+            if m_gpu:
+                gpu = m_gpu.group(1)
+                result.setdefault(gpu, {})
+                continue
+            m_link = re.search(r"Link\s+(\d+)", line, re.I)
+            if m_link and gpu is not None:
+                link = m_link.group(1)
+                result[gpu].setdefault(link, {})
+            if gpu is None or link is None:
+                continue
+            for name in ("CRC", "Replay", "Recovery"):
+                m = re.search(rf"{name}[^0-9]*(\d+)", line, re.I)
+                if m:
+                    result[gpu][link][name.lower()] = int(m.group(1))
+        return result
+
+    @staticmethod
+    def print_results(results: dict, console: Optional[Console] = None):
+        """Print the verdict and a per-GPU table; print only the error if one is set."""
+        c = console or Console()
+        if results.get("error"):
+            c.print(f"[bold red]NVLink error: {results['error']}[/bold red]")
+            return
+        passed = results.get("passed", False)
+        c.print("[bold green]✓ NVLink PASSED[/bold green]" if passed else "[bold red]✗ NVLink FAILED[/bold red]")
+        table = Table(box=None, padding=(0, 1))
+        table.add_column("GPU", style="bold")
+        table.add_column("Active Links", justify="right")
+        table.add_column("Issues")
+        for g in results.get("gpus", []):
+            issues = []
+            if g.get("inactive_links"):
+                issues.append("inactive=" + ",".join(g["inactive_links"]))
+            if g.get("speed_issues"):
+                issues.append(f"speed={len(g['speed_issues'])}")
+            if g.get("error_issues"):
+                issues.append(f"errors={len(g['error_issues'])}")
+            table.add_row(str(g["gpu"]), f"{g['active_links']}/{g['expected_links']}", "; ".join(issues) or "OK")
+        c.print(table)
--- a/modules/report.py
+++ b/modules/report.py
@ -93,8 +93,8 @@ class ReportGenerator:

    def _generate_html(self, results: dict, output: str) -> str:
        import socket
-        hostname = socket.gethostname()
-        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+        hostname = results.get("hostname") or socket.gethostname()
+        timestamp = results.get("timestamp") or datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        sections = []

@ -178,8 +178,8 @@ class ReportGenerator:

    def _generate_markdown(self, results: dict, output: str) -> str:
        import socket
-        hostname = socket.gethostname()
-        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+        hostname = results.get("hostname") or socket.gethostname()
+        timestamp = results.get("timestamp") or datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        lines: list[str] = []

@ -201,6 +201,21 @@ class ReportGenerator:
        # --- Summary table ---
        summary_items = self._build_summary(results)
        if summary_items:
+            verdict, failures, missing = self._overall_acceptance_verdict(summary_items)
+            lines.append("## Overall Acceptance Verdict\n")
+            lines.append(f"**Result: {verdict}**")
+            lines.append("")
+            if failures:
+                lines.append("Failed or unverified items:")
+                for name, status in failures:
+                    lines.append(f"- {name}: {status}")
+                lines.append("")
+            if missing:
+                lines.append("Missing required evidence:")
+                for name in missing:
+                    lines.append(f"- {name}")
+                lines.append("")
+
            lines.append("## Summary\n")
            lines.append("| Test | Result |")
            lines.append("|------|--------|")
@ -319,8 +334,6 @@ class ReportGenerator:
                    if use_abs and thr:
                        if val >= thr:
                            status = "PASS"
-                        elif val >= thr * 0.9:
-                            status = "WARN"
                        else:
                            status = "FAIL"
                        lines.append(f"| {dt.upper()} | {val:.1f} | {pk:.0f} | >= {thr} | {status} |")
@ -331,33 +344,194 @@ class ReportGenerator:
                        overall_status = status
            lines.append("")
            if use_abs:
+                if any(not row.get("passed", False) for row in (comp_data.get("consistency", {}) or {}).values()):
+                    overall_status = "FAIL"
                lines.append(f"**Verdict: {overall_status}** (absolute TFLOPS thresholds; worst efficiency {worst_eff:.1f}%)\n")
            else:
                overall_status = "PASS" if worst_eff >= 80 else ("WARN" if worst_eff >= 50 else "FAIL")
                lines.append(f"**Verdict: {overall_status}** (worst efficiency {worst_eff:.1f}%)\n")

+            consistency = comp_data.get("consistency", {}) or {}
+            if consistency:
+                lines.append("### Compute Consistency\n")
+                lines.append("| DType | Min | Mean | Max | Spread | Limit | Status |")
+                lines.append("|-------|-----|------|-----|--------|-------|--------|")
+                for dt, row in consistency.items():
+                    status = "PASS" if row.get("passed") else "FAIL"
+                    lines.append(
+                        f"| {dt.upper()} | {row.get('min_tflops', 0):.1f} | "
+                        f"{row.get('mean_tflops', 0):.1f} | {row.get('max_tflops', 0):.1f} | "
+                        f"{row.get('spread_pct', 0):.2f}% | <= {row.get('max_allowed_pct', 3)}% | {status} |"
+                    )
+                lines.append("")
+
+            per_gpu = comp_data.get("per_gpu", []) or []
+            dtype_order = [dt for dt in per_dtype.keys() if not isinstance(per_dtype.get(dt), str)]
+            if per_gpu and dtype_order:
+                lines.append("### Compute Per-GPU TFLOPS\n")
+                headers = ["GPU", *[dt.upper() for dt in dtype_order]]
+                lines.append("| " + " | ".join(headers) + " |")
+                lines.append("|" + "|".join(["---"] * len(headers)) + "|")
+                for row in per_gpu:
+                    cells = [str(row.get("index", ""))]
+                    for dt in dtype_order:
+                        val = row.get(dt, "")
+                        cells.append(f"{val:.1f}" if isinstance(val, (int, float)) else str(val))
+                    lines.append("| " + " | ".join(cells) + " |")
+                lines.append("")
+
+        # --- NVLink/NVSwitch ---
+        nvlink = results.get("nvlink")
+        if nvlink and not nvlink.get("error"):
+            lines.append("## NVLink/NVSwitch\n")
+            lines.append(f"**Overall: {'PASS' if nvlink.get('passed') else 'FAIL'}**\n")
+            lines.append("| GPU | Active Links | Issues |")
+            lines.append("|-----|--------------|--------|")
+            for g in nvlink.get("gpus", []):
+                issues = []
+                if g.get("inactive_links"):
+                    issues.append("inactive=" + ",".join(g["inactive_links"]))
+                if g.get("speed_issues"):
+                    issues.append(f"speed issues={len(g['speed_issues'])}")
+                if g.get("error_issues"):
+                    issues.append(f"errors={len(g['error_issues'])}")
+                lines.append(f"| {g.get('gpu')} | {g.get('active_links')}/{g.get('expected_links')} | {', '.join(issues) or 'OK'} |")
+            lines.append("")
+        elif nvlink and nvlink.get("error"):
+            lines.append("## NVLink/NVSwitch\n")
+            lines.append(f"**Overall: FAIL** ({nvlink.get('error')})\n")
+
+        dcgm = results.get("dcgm")
+        if dcgm and not dcgm.get("error"):
+            lines.append("## DCGM Diagnostic\n")
+            lines.append(f"**Overall: {'PASS' if dcgm.get('passed') else 'FAIL'}**\n")
+            if dcgm.get("subtests"):
+                lines.append("| Subtest | Status |")
+                lines.append("|---------|--------|")
+                for s in dcgm.get("subtests", []):
+                    lines.append(f"| {s.get('name', '')} | {s.get('status', '')} |")
+                lines.append("")
+        elif dcgm and dcgm.get("error"):
+            lines.append("## DCGM Diagnostic\n")
+            lines.append(f"**Overall: FAIL** ({dcgm.get('error')})\n")
+
        # --- NCCL ---
        nccl = results.get("nccl")
        if nccl and not nccl.get("error"):
            lines.append("## NCCL Multi-GPU\n")
            lines.append(f"Source: {nccl.get('source', 'unknown')} | "
                         f"GPUs: {nccl.get('gpu_count', '?')}\n")
+            if nccl.get("source") == "torchrun_fallback":
+                lines.append("> Functional NCCL smoke only: nccl-tests bus bandwidth was not measured, so this does not satisfy production acceptance.\n")
            tests = nccl.get("tests", {})
            if tests:
-                lines.append("| Operation | Bus BW (GB/s) | Threshold | Status |")
-                lines.append("|-----------|---------------|-----------|--------|")
+                lines.append("> Summary reports the best Bus BW observed for each operation. PASS/FAIL is evaluated across every tested message size and repeat run shown in the detail table below.\n")
+                lines.append("| Operation | Best Bus BW (GB/s) | Failed Sizes | Threshold | Status |")
+                lines.append("|-----------|--------------------|--------------|-----------|--------|")
                for op, data in tests.items():
                    if isinstance(data, dict) and not data.get("error"):
                        bw = data.get("best_busbw_gbps", 0)
                        req = data.get("min_required_gbps", 0)
                        status = data.get("status", "?")
-                        lines.append(f"| {op} | {bw:.1f} | >= {req:.0f} | {status} |")
+                        failed_sizes = [
+                            str(row.get("size", "?"))
+                            for row in data.get("by_size", [])
+                            if row.get("status") != "PASS"
+                        ]
+                        failed_sizes_text = ", ".join(failed_sizes) if failed_sizes else "-"
+                        lines.append(f"| {op} | {bw:.1f} | {failed_sizes_text} | >= {_format_gbps(req)} | {status} |")
                    elif isinstance(data, dict) and data.get("error"):
-                        lines.append(f"| {op} | - | - | ERROR: {data['error']} |")
+                        lines.append(f"| {op} | - | - | - | ERROR: {data['error']} |")
                lines.append("")
+                for op, data in tests.items():
+                    by_size = data.get("by_size", []) if isinstance(data, dict) else []
+                    if not by_size:
+                        continue
+                    lines.append(f"### NCCL {op} by size\n")
+                    lines.append("| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status |")
+                    lines.append("|------|---------------------|-------|------|--------|-----------|--------|")
+                    for row in by_size:
+                        runs = ", ".join(str(v) for v in row.get("runs_busbw_gbps", []))
+                        lines.append(
+                            f"| {row.get('size', '')} | {runs} | "
+                            f"{row.get('worst_busbw_gbps', 0):.1f} | "
+                            f"{row.get('mean_busbw_gbps', 0):.1f} | "
+                            f"{row.get('stddev_pct', 0):.2f}% | "
+                            f">= {_format_gbps(data.get('min_required_gbps', 0))} | "
+                            f"{row.get('status', '?')} |"
+                        )
+                    lines.append("")
            passed = nccl.get("passed", False)
            lines.append(f"**Overall: {'PASS' if passed else 'FAIL'}**\n")

+        multinode = results.get("multinode_nccl")
+        if multinode and not multinode.get("error"):
+            lines.append("## Multi-node NCCL / Cross Leaf\n")
+            lines.append(f"Source: {multinode.get('source', 'unknown')} | Mode: {multinode.get('mode', 'unknown')}\n")
+            if multinode.get("artifact_dir"):
+                lines.append(f"- **Artifacts:** `{multinode.get('artifact_dir')}`")
+            hosts = multinode.get("hosts", [])
+            if hosts:
+                host_text = ", ".join(f"{h.get('name') or h.get('addr')}({h.get('addr')})" for h in hosts)
+                lines.append(f"- **Hosts:** {host_text}")
+            preflight = multinode.get("preflight", {})
+            if preflight.get("checks"):
+                failed_checks = [c for c in preflight["checks"] if c.get("status") == "FAIL"]
+                warn_checks = [c for c in preflight["checks"] if c.get("status") == "WARN"]
+                lines.append(f"- **Preflight:** {'PASS' if not failed_checks else 'FAIL'}"
+                             f"{f' ({len(warn_checks)} warnings)' if warn_checks else ''}")
+            lines.append("")
+            for op, data in (multinode.get("tests") or {}).items():
+                lines.append(f"### Multi-node NCCL {op}\n")
+                lines.append("| Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |")
+                lines.append("|----------|----------------------|-------------|-----------|------------|-----------|--------|")
+                for topo in data.get("topologies", []):
+                    threshold = topo.get("min_required_gbps", 0) or 0
+                    threshold_text = f">= {_format_gbps(threshold)} GB/s" if threshold else "-"
+                    cuda_visible = topo.get("cuda_visible_devices") or "-"
+                    lines.append(
+                        f"| {topo.get('label', '')} | {cuda_visible} | {topo.get('peak_busbw_gbps', 0):.2f} GB/s | "
+                        f"{topo.get('peak_size', '')} | {topo.get('avg_busbw_gbps', 0):.2f} GB/s | "
+                        f"{threshold_text} | {topo.get('status', '?')} |"
+                    )
+                lines.append("")
+                diag_rows = []
+                for topo in data.get("topologies", []):
+                    net = topo.get("network") or {}
+                    if net:
+                        diag_rows.append((topo, net))
+                if diag_rows:
+                    lines.append("| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |")
+                    lines.append("|----------|--------------|-----------------|------------------|-------------------|")
+                    for topo, net in diag_rows:
+                        networks = ", ".join(net.get("networks") or []) or "unknown"
+                        gdr = net.get("gpu_direct_rdma", "UNKNOWN")
+                        enabled = ", ".join(net.get("gdr_enabled_hcas") or []) or "-"
+                        disabled = ", ".join(net.get("gdr_disabled_hcas") or []) or "-"
+                        lines.append(f"| {topo.get('label', '')} | {networks} | {gdr} | {enabled} | {disabled} |")
+                    lines.append("")
+                failed_topos = [topo for topo in data.get("topologies", []) if topo.get("status") == "FAIL"]
+                if failed_topos:
+                    lines.append("| Topology | Return Code | Error / Output Tail |")
+                    lines.append("|----------|-------------|---------------------|")
+                    for topo in failed_topos:
+                        tail = topo.get("error") or topo.get("stderr_tail") or topo.get("stdout_tail") or ""
+                        tail = str(tail).replace("\n", " ").replace("|", "\\|")[-240:]
+                        lines.append(f"| {topo.get('label', '')} | {topo.get('returncode', '')} | {tail} |")
+                    lines.append("")
+            lines.append(f"**Overall: {'PASS' if multinode.get('passed') else 'FAIL'}**\n")
+        elif multinode and multinode.get("error"):
+            lines.append("## Multi-node NCCL / Cross Leaf\n")
+            lines.append(f"**Overall: FAIL** ({multinode.get('error')})\n")
+            preflight = multinode.get("preflight", {})
+            if preflight.get("checks"):
+                lines.append("| Check | Status | Detail |")
+                lines.append("|-------|--------|--------|")
+                for check in preflight["checks"]:
+                    detail = str(check.get("detail", "")).replace("\n", " ")
+                    lines.append(f"| {check.get('name', '')} | {check.get('status', '')} | {detail} |")
+                lines.append("")
+
        # --- Stress Test ---
        stress = results.get("stress")
        if stress and not stress.get("error"):
@ -368,6 +542,21 @@ class ReportGenerator:
            source = stress.get("source", "unknown")
            lines.append(f"- **Source:** {source}")
            lines.append(f"- **Duration:** {elapsed:.0f}s (requested {duration}s)")
+            telemetry = stress.get("telemetry") or {}
+            if telemetry:
+                lines.append(f"- **Telemetry samples:** {telemetry.get('samples', 0)}")
+                lines.append(f"- **Max temp:** {telemetry.get('max_temp_c', {})}")
+                lines.append(f"- **Avg power:** {telemetry.get('avg_power_w', {})}")
+                lines.append(f"- **Temp delta:** {telemetry.get('temp_delta_c', 'N/A')} C")
+                lines.append(f"- **TFLOPS jitter:** {telemetry.get('tflops_jitter_pct', 'N/A')}%")
+                lines.append(f"- **Steady TFLOPS samples:** {telemetry.get('steady_tflops_samples', 0)}")
+                lines.append(f"- **Throttle events:** {telemetry.get('throttle_event_count', len(telemetry.get('throttle_events', [])))}")
+                lines.append(f"- **XID events:** {len(telemetry.get('xid_events', []))}")
+                failures = telemetry.get("failures") or []
+                if failures:
+                    lines.append("- **Failure reasons:**")
+                    for reason in failures:
+                        lines.append(f"  - {reason}")
            lines.append(f"- **Result: {'PASS' if passed else 'FAIL'}**")
            lines.append("")

@ -378,26 +567,70 @@ class ReportGenerator:
            lines.append(f"**Overall: SKIP** [{rdma.get('reason', 'no IB hardware detected')}]\n")
        elif rdma and not rdma.get("error"):
            lines.append("## RDMA/InfiniBand\n")
+            rdma_legacy_note = self._rdma_legacy_note(rdma)
+            if rdma_legacy_note:
+                lines.append(f"> {rdma_legacy_note}\n")
+            port_checks = rdma.get("port_checks", [])
+            if port_checks:
+                lines.append("### RDMA Port Checks\n")
+                lines.append("| Device | Port | State | Rate | Required | Status |")
+                lines.append("|--------|------|-------|------|----------|--------|")
+                for p in port_checks:
+                    lines.append(
+                        f"| {p.get('device', '')} | {p.get('port', '')} | "
+                        f"{p.get('state', '')} | {p.get('rate', '')} | "
+                        f">= {p.get('min_rate_gbps', 400):.0f}Gbps ACTIVE | {p.get('status', '?')} |"
+                    )
+                lines.append("")
            bw_tests = rdma.get("bandwidth_tests", [])
            lat_tests = rdma.get("latency_tests", [])
-            if bw_tests or lat_tests:
+            ibping_tests = rdma.get("ibping_tests", [])
+            if bw_tests or lat_tests or ibping_tests:
                lines.append("| Test | Value | Threshold | Status |")
                lines.append("|------|-------|-----------|--------|")
                for bt in bw_tests:
-                    if not bt.get("error"):
+                    if bt.get("error"):
+                        lines.append(f"| {bt.get('test', 'ib_bw')} | {bt.get('error')} | required runnable test | {bt.get('status', 'FAIL')} |")
+                    else:
+                        threshold, status = self._rdma_bandwidth_verdict(bt)
                        lines.append(f"| {bt['test']} | {bt.get('bandwidth_gbps', 0):.1f} GB/s | "
-                                     f">= {bt.get('min_required_gbps', 0)} GB/s | {bt.get('status', '?')} |")
+                                     f">= {threshold:g} GB/s | {status} |")
                for lt in lat_tests:
-                    if not lt.get("error"):
+                    if lt.get("error"):
+                        lines.append(f"| {lt.get('test', 'ib_lat')} | {lt.get('error')} | required runnable test | {lt.get('status', 'FAIL')} |")
+                    else:
+                        threshold, status = self._rdma_latency_verdict(lt)
                        lines.append(f"| {lt['test']} | {lt.get('latency_us', 0):.2f} us | "
-                                     f"<= {lt.get('max_allowed_us', 0)} us | {lt.get('status', '?')} |")
+                                     f"<= {threshold:g} us | {status} |")
+                for it in ibping_tests:
+                    direction = it.get("direction") or it.get("role", "N/A")
+                    if it.get("error"):
+                        lines.append(f"| {it.get('test', 'ibping')} | {it.get('error')} | bidirectional peer evidence | {it.get('status', 'FAIL')} |")
+                    else:
+                        lines.append(f"| {it['test']} | {direction} target={it.get('target', 'N/A')} count={it.get('count', 'N/A')} | "
+                                     f"0% packet loss | {it.get('status', '?')} |")
                lines.append("")
+            fabric = rdma.get("fabric_counters") or {}
+            if fabric:
+                counters = fabric.get("counters", {})
+                lines.append(f"- **PFC/ECN/CNP/congestion counters checked:** {len(counters)}")
+                lines.append(f"- **PFC/ECN/CNP/congestion non-zero:** {'yes' if fabric.get('failed') else 'no'}")
+                if not counters:
+                    lines.append("- **PFC/ECN/CNP/congestion evidence:** missing")
+            failures = rdma.get("failures") or []
+            if not failures:
+                failures = self._rdma_failure_reasons(rdma)
+            if failures:
+                lines.append("- **Failure reasons:**")
+                for reason in failures:
+                    lines.append(f"  - {reason}")
            passed = rdma.get("passed", False)
            lines.append(f"**Overall: {'PASS' if passed else 'FAIL'}**\n")

        # --- Training ---
        training = results.get("training")
        if training and not training.get("error"):
+            training_status, training_detail, training_missing = self._training_verdict(training)
            lines.append("## Training Simulation\n")
            lines.append("| Metric | Value |")
            lines.append("|--------|-------|")
@ -405,8 +638,14 @@ class ReportGenerator:
            lines.append(f"| Params | {training.get('total_params_m', 0):.1f}M |")
            lines.append(f"| Throughput | {training.get('throughput_tokens_per_sec', 0):.0f} tokens/sec |")
            lines.append(f"| Avg Step Time | {training.get('avg_step_time_ms', 0):.1f} ms |")
+            lines.append(f"| Warmup Steps | {training.get('warmup_steps', 'N/A')} |")
            lines.append(f"| Peak Memory | {training.get('peak_memory_gb', 0):.1f} GB |")
            lines.append(f"| Final Loss | {training.get('final_loss', 'N/A')} |")
+            lines.append(f"| Step Jitter | {training.get('step_jitter_pct', 'N/A')}% |")
+            lines.append(f"| Distributed Mode | {training.get('distributed_mode', 'N/A')} |")
+            if training_missing:
+                lines.append(f"| Acceptance Gaps | missing {', '.join(training_missing)} |")
+            lines.append(f"| Verdict | {training_status} ({training_detail}) |")
            lines.append("")

        # --- Footer ---
@ -441,6 +680,114 @@ class ReportGenerator:
                return bench["compute"]
        return {}

+    @staticmethod
+    def _training_verdict(training: dict) -> tuple[str, str, list[str]]:
+        """Classify a training result as PASS, FAIL, or UNVERIFIED.
+
+        Results that carry a ``passed`` key are judged by its truthiness.
+        Results without one (the legacy schema) are reported as UNVERIFIED
+        together with the acceptance fields they lack.
+
+        Returns:
+            ``(status, detail, missing)``. ``detail`` always starts with the
+            throughput in tokens/sec; ``missing`` is empty whenever an explicit
+            ``passed`` verdict exists.
+        """
+        throughput = float(training.get("throughput_tokens_per_sec", 0) or 0)
+        detail = f"{throughput:.0f} tokens/sec"
+        if "passed" not in training:
+            absent = [
+                field
+                for field in ("passed", "step_jitter_pct", "distributed_mode", "loss_finite")
+                if field not in training
+            ]
+            return "UNVERIFIED", f"{detail}; legacy result lacks explicit acceptance verdict", absent
+        return ("PASS" if training.get("passed") else "FAIL"), detail, []
+
+    def _rdma_cfg_value(self, key: str, default: float) -> float:
+        """Read ``key`` from the ``rdma`` section of ``self.config`` as a float.
+
+        Falls back to ``default`` when the key is absent or when the stored
+        value cannot be converted (``TypeError``/``ValueError``).
+        """
+        try:
+            rdma_section = self.config.get("rdma", {}) or {}
+            raw_value = rdma_section.get(key, default)
+            return float(raw_value)
+        except (TypeError, ValueError):
+            return default
+
+    def _rdma_bandwidth_verdict(self, row: dict) -> tuple[float, str]:
+        """Judge one bandwidth row against the configured minimum.
+
+        The limit comes from ``rdma.min_bandwidth_gbps`` (default 47.0). A
+        missing or falsy ``bandwidth_gbps`` is treated as 0.
+
+        Returns:
+            ``(threshold, status)`` where status is "PASS" when the measured
+            value is at least the threshold and "FAIL" otherwise.
+        """
+        floor = self._rdma_cfg_value("min_bandwidth_gbps", 47.0)
+        measured = float(row.get("bandwidth_gbps", 0) or 0)
+        if measured >= floor:
+            return floor, "PASS"
+        return floor, "FAIL"
+
+    def _rdma_latency_verdict(self, row: dict) -> tuple[float, str]:
+        """Judge one latency row against the limit configured for its test.
+
+        ``ib_write_lat`` uses ``rdma.max_write_latency_us`` (default 2.0),
+        ``ib_read_lat`` uses ``rdma.max_read_latency_us`` (default 3.5), and any
+        other test name uses ``rdma.max_latency_us`` (default 3.5).
+
+        Returns:
+            ``(threshold, status)``. Status is "PASS" only for a strictly
+            positive latency that does not exceed the threshold, so a missing
+            or zero measurement is a "FAIL".
+        """
+        test_name = row.get("test", "")
+        cfg_key, fallback = "max_latency_us", 3.5
+        if test_name == "ib_write_lat":
+            cfg_key, fallback = "max_write_latency_us", 2.0
+        elif test_name == "ib_read_lat":
+            cfg_key, fallback = "max_read_latency_us", 3.5
+        ceiling = self._rdma_cfg_value(cfg_key, fallback)
+        measured = float(row.get("latency_us", 0) or 0)
+        within_limit = measured > 0 and measured <= ceiling
+        return ceiling, ("PASS" if within_limit else "FAIL")
+
+    def _rdma_legacy_note(self, rdma: dict) -> str:
+        """Return a notice when stored RDMA thresholds differ from current ones.
+
+        Bandwidth rows are checked first (``min_required_gbps`` against the
+        configured minimum bandwidth), then latency rows (``max_allowed_us``
+        against the per-test latency limit). The first mismatch yields the
+        notice; an empty string means every embedded threshold matches.
+        """
+        notice = (
+            "Legacy RDMA result re-evaluated with current PDF acceptance thresholds; "
+            "old WARN statuses and old 50GB/s/10us limits are not used for verdict."
+        )
+        bandwidth_rows = rdma.get("bandwidth_tests", []) or []
+        if any(
+            entry.get("min_required_gbps") != self._rdma_cfg_value("min_bandwidth_gbps", 47.0)
+            for entry in bandwidth_rows
+        ):
+            return notice
+        latency_rows = rdma.get("latency_tests", []) or []
+        if any(
+            entry.get("max_allowed_us") != self._rdma_latency_verdict(entry)[0]
+            for entry in latency_rows
+        ):
+            return notice
+        return ""
+
+    def _rdma_failure_reasons(self, rdma: dict) -> list[str]:
+        """Derive human-readable failure reasons from RDMA result rows.
+
+        Used when the result dict carries no ``failures`` list of its own.
+        Bandwidth and latency rows are re-judged with the current thresholds;
+        ibping rows are judged by their stored ``status``.
+
+        Rows that carry an ``error`` are reported with that error instead of a
+        fabricated "0 < threshold" comparison, and a latency that was never
+        measured (missing, zero or negative) is reported as such rather than as
+        exceeding the limit.
+
+        Args:
+            rdma: RDMA result dict; ``bandwidth_tests``, ``latency_tests`` and
+                ``ibping_tests`` may each be absent or ``None``.
+
+        Returns:
+            One message per failing row, in bandwidth, latency, ibping order.
+        """
+        failures = []
+        for row in rdma.get("bandwidth_tests", []) or []:
+            if row.get("error"):
+                # The test never produced a number; surface why.
+                failures.append(f"{row.get('test')} error: {row.get('error')}")
+                continue
+            threshold, status = self._rdma_bandwidth_verdict(row)
+            if status != "PASS":
+                failures.append(
+                    f"{row.get('test')} bandwidth {row.get('bandwidth_gbps', 0)}GB/s < {threshold:g}GB/s"
+                )
+        for row in rdma.get("latency_tests", []) or []:
+            if row.get("error"):
+                failures.append(f"{row.get('test')} error: {row.get('error')}")
+                continue
+            threshold, status = self._rdma_latency_verdict(row)
+            if status != "PASS":
+                reported = row.get("latency_us", 0)
+                if float(reported or 0) <= 0:
+                    # The verdict rejects non-positive latencies, so saying
+                    # "0us > limit" would be a false statement.
+                    failures.append(
+                        f"{row.get('test')} latency not measured (reported {reported}us)"
+                    )
+                else:
+                    failures.append(
+                        f"{row.get('test')} latency {reported}us > {threshold:g}us"
+                    )
+        for row in rdma.get("ibping_tests", []) or []:
+            if row.get("status") != "PASS":
+                if row.get("error"):
+                    failures.append(f"{row.get('test')} failed: {row.get('error')}")
+                else:
+                    failures.append(f"{row.get('test')} failed")
+        return failures
+
+    @staticmethod
+    def _overall_acceptance_verdict(summary_items: list[tuple[str, str]]) -> tuple[str, list[tuple[str, str]], list[str]]:
+        """Compute the PDF-style overall verdict for the scope of this report.
+
+        When at least three single-node acceptance items appear in the summary
+        the report is treated as a full-suite run: every single-node item is
+        required, plus "Multi-node NCCL" when it is present. Otherwise the
+        report is treated as standalone (for example `--test multinode-nccl`)
+        and only the items actually present are judged, so unrelated evidence
+        is not reported as missing.
+
+        Returns:
+            ``(verdict, failures, missing)``. ``failures`` holds the required
+            ``(name, status)`` pairs whose status does not start with "PASS";
+            ``missing`` holds required names absent from the summary; verdict
+            is "PASS" only when both lists are empty.
+        """
+        single_node_items = (
+            "GPU Info",
+            "Health Check",
+            "Memory Bandwidth",
+            "Compute Throughput",
+            "NVLink/NVSwitch",
+            "NCCL",
+            "Stress Test",
+            "RDMA",
+            "DCGM",
+            "Training",
+        )
+        statuses = dict(summary_items)
+        present_count = sum(1 for item in single_node_items if item in statuses)
+        if present_count < 3:
+            # Standalone report: judge only what was actually run.
+            required = list(statuses)
+        else:
+            required = [*single_node_items]
+            if "Multi-node NCCL" in statuses:
+                required.append("Multi-node NCCL")
+
+        missing = [item for item in required if item not in statuses]
+        failures = []
+        for item, status in summary_items:
+            if item not in required:
+                continue
+            if str(status).startswith("PASS"):
+                continue
+            failures.append((item, status))
+
+        if missing or failures:
+            return "FAIL", failures, missing
+        return "PASS", failures, missing
+
    def _build_summary(self, results: dict) -> list[tuple[str, str]]:
        """Build summary verdict list from results."""
        items = []
@ -473,7 +820,7 @@ class ReportGenerator:
                d2d = mem.get("d2d_bandwidth_gbps") or 0
                items.append(("Memory Bandwidth", f"WARN ({d2d:.0f} GB/s via PyTorch fallback)"))
            else:
-                eff = mem.get("efficiency_pct") or 0
+                eff = mem.get("d2d_efficiency_pct") or mem.get("efficiency_pct") or 0
                verdict = "PASS" if eff >= 80 else ("WARN" if eff >= 60 else "FAIL")
                items.append(("Memory Bandwidth", f"{verdict} ({eff:.1f}%)"))

@ -491,25 +838,43 @@ class ReportGenerator:
                    rank = {"PASS": 0, "WARN": 1, "FAIL": 2}
                    worst_status = "PASS"
                    worst_dt = None
+                    lowest_margin = None
                    for dt, thr in pass_thresholds.items():
                        val = per_dtype.get(dt)
                        if not isinstance(val, (int, float)):
                            continue
                        if val >= thr:
                            st = "PASS"
-                        elif val >= thr * 0.9:
-                            st = "WARN"
                        else:
                            st = "FAIL"
+                        margin = val / thr if thr else 0
+                        if lowest_margin is None or margin < lowest_margin:
+                            lowest_margin = margin
+                            worst_dt = dt
                        if rank[st] > rank[worst_status]:
                            worst_status = st
-                            worst_dt = dt
                    if worst_dt:
-                        items.append((
-                            "Compute Throughput",
-                            f"{worst_status} (worst {worst_dt.upper()} "
-                            f"{per_dtype[worst_dt]:.0f} vs >= {pass_thresholds[worst_dt]})"
-                        ))
+                        consistency = comp.get("consistency", {}) or {}
+                        failed_consistency = [
+                            (dt, row)
+                            for dt, row in consistency.items()
+                            if not row.get("passed", False)
+                        ]
+                        if failed_consistency:
+                            worst_status = "FAIL"
+                            fail_dt, fail_row = failed_consistency[0]
+                            items.append((
+                                "Compute Throughput",
+                                f"FAIL ({fail_dt.upper()} spread "
+                                f"{fail_row.get('spread_pct', 0):.2f}% > "
+                                f"{fail_row.get('max_allowed_pct', 3)}%)"
+                            ))
+                        else:
+                            items.append((
+                                "Compute Throughput",
+                                f"{worst_status} (worst {worst_dt.upper()} "
+                                f"{per_dtype[worst_dt]:.0f} vs >= {pass_thresholds[worst_dt]})"
+                            ))
                    else:
                        items.append(("Compute Throughput", f"{worst_status}"))
                else:
@ -521,16 +886,46 @@ class ReportGenerator:
                    else:
                        items.append(("Compute Throughput", "N/A"))

+        # NVLink/NVSwitch
+        if "nvlink" in results:
+            nvl = results["nvlink"]
+            if nvl.get("error"):
+                items.append(("NVLink/NVSwitch", f"ERROR: {nvl['error']}"))
+            elif nvl.get("passed"):
+                items.append(("NVLink/NVSwitch", "PASS"))
+            else:
+                items.append(("NVLink/NVSwitch", "FAIL"))
+
+        if "dcgm" in results:
+            d = results["dcgm"]
+            if d.get("error"):
+                items.append(("DCGM", f"ERROR: {d['error']}"))
+            elif d.get("passed"):
+                items.append(("DCGM", "PASS"))
+            else:
+                items.append(("DCGM", "FAIL"))
+
        # NCCL
        if "nccl" in results:
            n = results["nccl"]
            if n.get("error"):
                items.append(("NCCL", f"ERROR: {n['error']}"))
+            elif n.get("source") == "torchrun_fallback":
+                items.append(("NCCL", "FAIL (no nccl-tests bus BW)"))
            elif n.get("passed"):
                items.append(("NCCL", "PASS"))
            else:
                items.append(("NCCL", "FAIL"))

+        if "multinode_nccl" in results:
+            mn = results["multinode_nccl"]
+            if mn.get("error"):
+                items.append(("Multi-node NCCL", f"ERROR: {mn['error']}"))
+            elif mn.get("passed"):
+                items.append(("Multi-node NCCL", "PASS"))
+            else:
+                items.append(("Multi-node NCCL", "FAIL"))
+
        # Stress
        if "stress" in results:
            s = results["stress"]
@ -559,7 +954,17 @@ class ReportGenerator:
            if t.get("error"):
                items.append(("Training", f"ERROR: {t['error']}"))
            else:
-                tps = t.get("throughput_tokens_per_sec", 0)
-                items.append(("Training", f"PASS ({tps:.0f} tokens/sec)"))
+                status, detail, _missing = self._training_verdict(t)
+                items.append(("Training", f"{status} ({detail})"))

        return items
+
+
+def _format_gbps(value) -> str:
+    """Render a bandwidth figure for display.
+
+    Values that convert to a whole number are printed without decimals, other
+    numeric values with two decimals. Anything ``float()`` rejects is returned
+    as ``str(value)`` unchanged.
+    """
+    try:
+        as_float = float(value)
+    except (TypeError, ValueError):
+        return str(value)
+    # Whole numbers get zero decimal places, everything else gets two.
+    decimals = 0 if as_float.is_integer() else 2
+    return f"{as_float:.{decimals}f}"
--- a/modules/stress_test.py
+++ b/modules/stress_test.py
@ -1,9 +1,10 @@
-"""GPU stress test module — wraps gpu-burn for long-running stability tests."""
+"""GPU stress test module — gpu-burn or PyTorch GEMM with telemetry."""

 import glob
 import os
 import shutil
 import subprocess
+import threading
 import time
 from datetime import datetime

@ -46,7 +47,7 @@ class StressTest:
        memory_pct = cfg.get("memory_pct", 90)
        target_gpus = cfg.get("gpus", "all")

-        gpu_burn = self._find_gpu_burn()
+        gpu_burn = self._find_gpu_burn() if cfg.get("use_gpu_burn", False) else ""

        if gpu_burn:
            # Try gpu-burn first
@ -60,7 +61,7 @@ class StressTest:
            
            return result

-        self.console.print("[yellow]gpu_burn not found, using PyTorch stress test[/yellow]")
+        self.console.print("[yellow]Using PyTorch stress test[/yellow]")
        return self._run_pytorch_stress(duration_sec, memory_pct)

    def _run_gpu_burn(self, gpu_burn: str, duration: int,
@ -77,12 +78,26 @@ class StressTest:
        cmd.append(str(duration))

        t0 = time.time()
+        xid_before = self._collect_xid_events()
+        interval = int(self.stress_cfg.get("telemetry_interval_sec", 1))
+        telemetry = []
+        stop_sampling = threading.Event()
+        sampler = threading.Thread(
+            target=self._sample_telemetry,
+            args=(telemetry, stop_sampling, interval),
+            daemon=True,
+        )
+        sampler.start()
        try:
            r = subprocess.run(cmd, capture_output=True, text=True, timeout=duration + 120)
            elapsed = round(time.time() - t0, 1)
+            stop_sampling.set()
+            sampler.join(timeout=interval + 1)

            output = r.stdout + r.stderr
-            passed = r.returncode == 0
+            xid_events = self._new_xid_events(xid_before, self._collect_xid_events())
+            telemetry_summary = self._evaluate_telemetry(telemetry, [], xid_events)
+            passed = r.returncode == 0 and telemetry_summary.get("passed", False)

            gpu_results = []
            for line in output.split("\n"):
@ -96,25 +111,36 @@ class StressTest:
                "duration_sec": duration,
                "elapsed_sec": elapsed,
                "gpu_results": gpu_results,
+                "telemetry": telemetry_summary,
                "raw_output_tail": output[-500:] if output else "",
                "timestamp": datetime.now().isoformat(),
            }

        except subprocess.TimeoutExpired:
+            stop_sampling.set()
            return {
                "source": "gpu-burn",
                "passed": False,
                "duration_sec": duration,
                "error": "timeout",
+                "telemetry": self._evaluate_telemetry(
+                    telemetry, [], self._new_xid_events(xid_before, self._collect_xid_events())
+                ),
                "timestamp": datetime.now().isoformat(),
            }
        except Exception as e:
+            stop_sampling.set()
            return {
                "source": "gpu-burn",
                "passed": False,
                "error": str(e),
+                "telemetry": self._evaluate_telemetry(
+                    telemetry, [], self._new_xid_events(xid_before, self._collect_xid_events())
+                ),
                "timestamp": datetime.now().isoformat(),
            }
+        finally:
+            stop_sampling.set()

    def _run_pytorch_stress(self, duration: int, memory_pct: int = 90) -> dict:
        try:
@ -127,58 +153,79 @@ class StressTest:
        gpu_count = torch.cuda.device_count()
        self.console.print(f"[cyan]PyTorch Stress Test ({duration}s, {gpu_count} GPUs, target {memory_pct}% memory)[/cyan]")

+        dtype_name = self.stress_cfg.get("dtype", "bf16")
+        matrix_size = int(self.stress_cfg.get("matrix_size", 8192))
+        interval = int(self.stress_cfg.get("telemetry_interval_sec", 1))
+        dtype_map = {"fp16": torch.float16, "bf16": torch.bfloat16, "fp32": torch.float32}
+        dtype = dtype_map.get(dtype_name, torch.bfloat16)
+
        gpu_status = {}
+        telemetry = []
+        stop_sampling = threading.Event()
        t0 = time.time()
+        xid_before = self._collect_xid_events()

        try:
+            sampler = threading.Thread(
+                target=self._sample_telemetry,
+                args=(telemetry, stop_sampling, interval),
+                daemon=True,
+            )
+            sampler.start()
            tensors = {}
+            ballast = {}
+            pass_tflops = []
            for i in range(gpu_count):
                with torch.cuda.device(i):
-                    # Get actual free memory (accounting for other processes)
                    free_mem, total_mem = torch.cuda.mem_get_info(i)
-                    
-                    # Calculate allocation from configured memory_pct
-                    target_mem = int(total_mem * memory_pct / 100)
-                    
-                    # Cap at actual free memory with 5% safety margin
-                    alloc_bytes = min(target_mem, int(free_mem * 0.95))
-                    
-                    # matmul(A, A.T) needs 2x input memory (input + output)
-                    mem_side = int((alloc_bytes / 4 / 2) ** 0.5)
-                    # Cap compute matrix so a single matmul completes in ~2s on H100/H200
-                    # (FP32 ≈ 67 TFLOPS → 2*4096³/67e12 ≈ 2s). Without this cap, a 141GB
-                    # HBM yields side ≈ 131K → single matmul ~68s × 8 GPUs serial → loop
-                    # overshoots a 60s duration request by 10×+.
-                    MAX_COMPUTE_SIDE = 4096
-                    side = min(mem_side, MAX_COMPUTE_SIDE)
-
-                    actual_mem_mb = side * side * 4 / 1024 / 1024
+                    side = matrix_size
+                    elem = torch.tensor([], dtype=dtype).element_size()
+                    compute_bytes = side * side * elem * 3
+                    target_mem = min(int(total_mem * memory_pct / 100), int(free_mem * 0.90))
+                    ballast_bytes = max(0, target_mem - compute_bytes)
+                    if ballast_bytes:
+                        ballast_elems = ballast_bytes // 2
+                        ballast[i] = torch.empty(ballast_elems, device=f"cuda:{i}", dtype=torch.float16)
+                    actual_mem_mb = (compute_bytes + ballast_bytes) / 1024 / 1024
                    total_mem_mb = total_mem / 1024 / 1024
                    free_mem_mb = free_mem / 1024 / 1024
-                    
+
                    self.console.print(
                        f"  [dim]GPU {i}: total {total_mem_mb:.0f}MB, free {free_mem_mb:.0f}MB, "
                        f"alloc {actual_mem_mb:.0f}MB ({actual_mem_mb/total_mem_mb*100:.0f}%) - "
-                        f"matrix {side}x{side}[/dim]"
+                        f"{dtype_name} matrix {side}x{side}[/dim]"
+                    )
+                    tensors[i] = (
+                        torch.randn(side, side, device=f"cuda:{i}", dtype=dtype),
+                        torch.randn(side, side, device=f"cuda:{i}", dtype=dtype),
+                        torch.empty(side, side, device=f"cuda:{i}", dtype=dtype),
                    )
-                    tensors[i] = torch.randn(side, side, device=f"cuda:{i}", dtype=torch.float32)

            self.console.print(f"\n[cyan]Starting stress test for {duration} seconds...[/cyan]")
            
            elapsed_check = 0
            while time.time() - t0 < duration:
+                loop_start = time.perf_counter()
                # Dispatch matmul on all GPUs in parallel — do NOT synchronize between
                # GPUs, otherwise the 8 GPUs run serially and overshoot the duration.
                for i in range(gpu_count):
                    with torch.cuda.device(i):
-                        tensors[i] = torch.matmul(tensors[i], tensors[i].T)
+                        a, b, out = tensors[i]
+                        torch.matmul(a, b, out=out)
                # Single sync per pass — waits for all 8 streams concurrently
                for i in range(gpu_count):
                    with torch.cuda.device(i):
                        torch.cuda.synchronize()
+                loop_elapsed = time.perf_counter() - loop_start
+                current_elapsed = time.time() - t0
+                if loop_elapsed > 0:
+                    flops = gpu_count * 2 * (matrix_size ** 3)
+                    pass_tflops.append({
+                        "elapsed_sec": current_elapsed,
+                        "tflops": flops / loop_elapsed / 1e12,
+                    })

                # Show progress every 10 seconds
-                current_elapsed = time.time() - t0
                if int(current_elapsed) != int(elapsed_check) and int(current_elapsed) % 10 == 0:
                    self.console.print(f"  [dim]Running {int(current_elapsed)}s / {duration}s[/dim]")
                    elapsed_check = current_elapsed
@ -198,21 +245,196 @@ class StressTest:
                "duration_sec": duration,
                "error": error_msg,
                "gpu_status": gpu_status,
+                "telemetry": self._evaluate_telemetry(
+                    telemetry, pass_tflops if "pass_tflops" in locals() else [],
+                    self._new_xid_events(xid_before, self._collect_xid_events()),
+                ),
            }
        finally:
+            stop_sampling.set()
            tensors.clear()
+            ballast.clear()
            torch.cuda.empty_cache()

        elapsed = round(time.time() - t0, 1)
+        xid_events = self._new_xid_events(xid_before, self._collect_xid_events())
+        telemetry_summary = self._evaluate_telemetry(telemetry, pass_tflops, xid_events)
+        passed = all(v == "PASS" for v in gpu_status.values()) and telemetry_summary.get("passed", False)
        return {
            "source": "pytorch",
-            "passed": True,
+            "passed": passed,
            "duration_sec": duration,
            "elapsed_sec": elapsed,
            "gpu_status": gpu_status,
+            "telemetry": telemetry_summary,
            "timestamp": datetime.now().isoformat(),
        }

+    def _sample_telemetry(self, telemetry: list, stop_event: threading.Event, interval: int):
+        """Poll ``nvidia-smi`` until ``stop_event`` is set, appending one sample per poll.
+
+        Each appended sample is ``{"time": <epoch seconds>, "gpus": [...]}`` where every
+        GPU entry carries ``index``, ``temp_c``, ``power_w`` and the raw ``throttle``
+        field. Sampling is best-effort: any exception during a poll (including a
+        field that cannot be parsed) discards that poll and the loop carries on.
+
+        Args:
+            telemetry: List that samples are appended to in place.
+            stop_event: Event that ends the loop once set.
+            interval: Seconds to wait between polls.
+        """
+        fields = "index,temperature.gpu,power.draw,clocks_throttle_reasons.active"
+        command = ["nvidia-smi", f"--query-gpu={fields}", "--format=csv,noheader,nounits"]
+        while True:
+            if stop_event.is_set():
+                break
+            try:
+                proc = subprocess.run(command, capture_output=True, text=True, timeout=10)
+                if proc.returncode == 0:
+                    snapshot = {"time": time.time(), "gpus": []}
+                    for row in proc.stdout.splitlines():
+                        cols = [col.strip() for col in row.split(",")]
+                        # Rows with fewer than the four queried columns are ignored.
+                        if len(cols) < 4:
+                            continue
+                        snapshot["gpus"].append({
+                            "index": int(cols[0]),
+                            "temp_c": float(cols[1]),
+                            "power_w": float(cols[2]),
+                            "throttle": cols[3],
+                        })
+                    telemetry.append(snapshot)
+            except Exception:
+                # Best-effort sampler: a failed poll must not stop the stress run.
+                pass
+            # wait() returns early when the event is set, so shutdown is prompt.
+            stop_event.wait(interval)
+
+    def _collect_xid_events(self) -> list[str]:
+        """Return the stripped ``dmesg`` lines that mention an XID.
+
+        Matching is a case-insensitive substring test against the marker tokens.
+        An empty list is returned when ``dmesg`` exits non-zero or anything raises
+        (for example the binary is missing or the call times out).
+        """
+        markers = ("XID", "NVRM: XID")
+        try:
+            proc = subprocess.run(
+                ["dmesg", "--color=never"],
+                capture_output=True, text=True, timeout=10,
+            )
+            if proc.returncode != 0:
+                return []
+            matches = []
+            for raw in proc.stdout.splitlines():
+                upper = raw.upper()
+                for marker in markers:
+                    if marker in upper:
+                        matches.append(raw.strip())
+                        break
+            return matches
+        except Exception:
+            return []
+
+    @staticmethod
+    def _new_xid_events(before: list[str], after: list[str]) -> list[str]:
+        """Return the entries of ``after`` that do not occur in ``before``.
+
+        Order and duplicates from ``after`` are preserved.
+        """
+        known = frozenset(before)
+        fresh = []
+        for entry in after:
+            if entry not in known:
+                fresh.append(entry)
+        return fresh
+
+    def _evaluate_telemetry(self, telemetry: list, pass_tflops: list, xid_events: list[str] | None = None) -> dict:
+        """Score sampled telemetry and per-pass TFLOPS against the stress thresholds.
+
+        Args:
+            telemetry: Samples shaped like ``{"time": ..., "gpus": [{"index", "temp_c",
+                "power_w", "throttle"}, ...]}``.
+            pass_tflops: Per-pass throughput, either ``{"elapsed_sec", "tflops"}`` dicts
+                or bare numbers.
+            xid_events: New XID log lines seen during the run; ``None`` means none.
+
+        Returns:
+            Summary dict holding ``passed``, the measured aggregates, the readable
+            ``failures`` list and the ``thresholds`` that were applied.
+        """
+        settings = self.stress_cfg
+        temp_limit = float(settings.get("max_temp_c", 80))
+        delta_limit = float(settings.get("max_temp_delta_c", 5))
+        power_floor = float(settings.get("min_power_watts", 630))
+        jitter_limit = float(settings.get("max_tflops_jitter_pct", 5))
+        jitter_required = bool(settings.get("require_tflops_jitter", True))
+        run_length = float(settings.get("duration_sec", 60))
+        warmup_requested = float(settings.get("warmup_sec", 60))
+        # The effective warmup never exceeds 20% of the configured duration.
+        warmup_sec = min(warmup_requested, max(0.0, run_length * 0.2))
+        steady_needed = int(settings.get("min_steady_samples", 10))
+        xid_events = xid_events or []
+
+        # Samples taken at least warmup_sec after the first sample.
+        past_warmup = []
+        if telemetry:
+            first_time = telemetry[0].get("time", 0)
+            past_warmup = [
+                snap for snap in telemetry
+                if snap.get("time", 0) - first_time >= warmup_sec
+            ]
+        # Fall back to every sample when too few remain after the warmup cut.
+        if len(past_warmup) >= steady_needed:
+            evaluation_samples = past_warmup
+        else:
+            evaluation_samples = telemetry
+
+        temps_by_gpu = {}
+        power_by_gpu = {}
+        throttle_bad = []
+        for snap in evaluation_samples:
+            for gpu in snap.get("gpus", []):
+                gpu_id = gpu["index"]
+                temps_by_gpu.setdefault(gpu_id, []).append(gpu["temp_c"])
+                power_by_gpu.setdefault(gpu_id, []).append(gpu["power_w"])
+                try:
+                    reasons = int(str(gpu["throttle"]), 16)
+                except ValueError:
+                    reasons = 0
+                # Bit 0 is masked out; whatever remains is reported as "non-idle".
+                active = reasons & ~0x1
+                if not active:
+                    continue
+                throttle_bad.append({
+                    "gpu": gpu_id,
+                    "throttle": gpu["throttle"],
+                    "real_throttle": f"0x{active:x}",
+                })
+
+        max_temps = {gpu_id: max(vals) for gpu_id, vals in temps_by_gpu.items() if vals}
+        avg_powers = {gpu_id: sum(vals) / len(vals) for gpu_id, vals in power_by_gpu.items() if vals}
+        if len(max_temps) >= 2:
+            temp_delta = max(max_temps.values()) - min(max_temps.values())
+        else:
+            temp_delta = 0
+
+        def _tflops_value(entry):
+            # Accepts both the dict form and a bare number.
+            return float(entry.get("tflops", 0)) if isinstance(entry, dict) else float(entry)
+
+        # Dict entries are kept only once past warmup; bare numbers are always kept.
+        steady_tflops = [
+            _tflops_value(entry)
+            for entry in pass_tflops
+            if not isinstance(entry, dict) or float(entry.get("elapsed_sec", 0)) >= warmup_sec
+        ]
+        if len(steady_tflops) < 2 and pass_tflops:
+            # Too few steady passes: evaluate jitter over every recorded pass instead.
+            steady_tflops = [_tflops_value(entry) for entry in pass_tflops]
+        jitter = 0
+        if steady_tflops:
+            mean = sum(steady_tflops) / len(steady_tflops)
+            if mean:
+                jitter = max(abs(v - mean) / mean * 100 for v in steady_tflops)
+
+        too_hot = {gpu_id: v for gpu_id, v in max_temps.items() if v > temp_limit}
+        under_powered = {gpu_id: v for gpu_id, v in avg_powers.items() if v < power_floor}
+        failures = []
+        if not evaluation_samples:
+            failures.append("no telemetry samples available for evaluation")
+        if too_hot:
+            hot_text = ", ".join(f"GPU {idx} {val:.1f}C" for idx, val in sorted(too_hot.items()))
+            failures.append("max temperature above threshold: " + hot_text)
+        if temp_delta > delta_limit:
+            failures.append(f"GPU temperature delta {temp_delta:.1f}C exceeds {delta_limit:.1f}C")
+        if under_powered:
+            power_text = ", ".join(f"GPU {idx} {val:.1f}W" for idx, val in sorted(under_powered.items()))
+            failures.append("average steady-state power below threshold: " + power_text)
+        if throttle_bad:
+            first_bad = throttle_bad[0]
+            failures.append(
+                f"non-idle throttle reasons observed in {len(throttle_bad)} samples "
+                f"(first: GPU {first_bad['gpu']} {first_bad['real_throttle']})"
+            )
+        if xid_events:
+            failures.append(f"{len(xid_events)} new XID/NVRM XID events observed")
+        if jitter_required and len(steady_tflops) < 2:
+            failures.append(
+                f"insufficient steady TFLOPS samples for jitter evaluation: {len(steady_tflops)} < 2"
+            )
+        if jitter > jitter_limit:
+            failures.append(f"TFLOPS jitter {jitter:.2f}% exceeds {jitter_limit:.2f}%")
+
+        passed = (
+            bool(evaluation_samples)
+            and all(v <= temp_limit for v in max_temps.values())
+            and temp_delta <= delta_limit
+            and all(v >= power_floor for v in avg_powers.values())
+            and not throttle_bad
+            and not xid_events
+            and (not jitter_required or len(steady_tflops) >= 2)
+            and jitter <= jitter_limit
+        )
+        thresholds = {
+            "max_temp_c": temp_limit,
+            "max_temp_delta_c": delta_limit,
+            "min_power_w": power_floor,
+            "max_tflops_jitter_pct": jitter_limit,
+            "require_tflops_jitter": jitter_required,
+            "warmup_sec": warmup_requested,
+            "min_steady_samples": steady_needed,
+        }
+        return {
+            "passed": passed,
+            "samples": len(telemetry),
+            "steady_samples": len(evaluation_samples),
+            "warmup_sec": round(warmup_sec, 1),
+            "max_temp_c": {k: round(v, 1) for k, v in max_temps.items()},
+            "avg_power_w": {k: round(v, 1) for k, v in avg_powers.items()},
+            "temp_delta_c": round(temp_delta, 1),
+            # Only the first 20 throttle records and the last 20 XID lines are kept.
+            "throttle_events": throttle_bad[:20],
+            "throttle_event_count": len(throttle_bad),
+            "xid_events": xid_events[-20:],
+            "tflops_jitter_pct": round(jitter, 2),
+            "steady_tflops_samples": len(steady_tflops),
+            "failures": failures,
+            "thresholds": thresholds,
+        }
+
    @staticmethod
    def print_results(results: dict, console: Console = None):
        c = console or Console()
@ -245,5 +467,21 @@ class StressTest:
                color = "green" if status == "PASS" else "red"
                c.print(f"    GPU {gid}: [{color}]{status}[/{color}]")

+        telemetry = results.get("telemetry") or {}
+        if telemetry:
+            c.print("\n  Telemetry:")
+            c.print(f"    Samples: {telemetry.get('samples', 0)} total, {telemetry.get('steady_samples', 0)} evaluated after {telemetry.get('warmup_sec', 0)}s warmup")
+            c.print(f"    Avg steady power: {telemetry.get('avg_power_w', {})}")
+            c.print(f"    Max steady temp: {telemetry.get('max_temp_c', {})}")
+            c.print(f"    Temp delta: {telemetry.get('temp_delta_c', 'N/A')} C")
+            c.print(f"    TFLOPS jitter: {telemetry.get('tflops_jitter_pct', 'N/A')}%")
+            c.print(f"    Throttle events: {telemetry.get('throttle_event_count', len(telemetry.get('throttle_events', [])))}")
+            c.print(f"    XID events: {len(telemetry.get('xid_events', []))}")
+            failures = telemetry.get("failures", [])
+            if failures:
+                c.print("  [red]Failure reasons:[/red]")
+                for reason in failures:
+                    c.print(f"    [red]- {reason}[/red]")
+
        if results.get("error"):
            c.print(f"  [red]Error: {results['error']}[/red]")
--- a/modules/training_sim.py
+++ b/modules/training_sim.py
@ -1,8 +1,13 @@
 """Training simulation module - LLM training workload with PyTorch."""

+import json
+import os
+import sys
+import tempfile
 import time
 import subprocess
 import shutil
+import math
 from datetime import datetime
 from typing import Optional

@ -36,6 +41,7 @@ class TrainingSim:
        batch_size = self.train_cfg.get("batch_size", 8)
        seq_length = self.train_cfg.get("seq_length", 2048)
        num_steps = self.train_cfg.get("num_steps", 50)
+        warmup_steps = int(self.train_cfg.get("warmup_steps", 5))
        dtype_str = self.train_cfg.get("dtype", "bf16")

        dtype_map = {
@ -47,7 +53,13 @@ class TrainingSim:

        self.console.print(f"[cyan]Training Simulation[/cyan]")
        self.console.print(f"  Model: {model_name} | Batch: {batch_size} | Seq: {seq_length} | "
-                           f"DType: {dtype_str} | Steps: {num_steps} | GPUs: {gpu_count}")
+                           f"DType: {dtype_str} | Steps: {num_steps} | Warmup: {warmup_steps} | GPUs: {gpu_count}")
+
+        if self.train_cfg.get("mode", "ddp") == "ddp" and gpu_count > 1:
+            ddp_result = self._run_synthetic_ddp(gpu_count, batch_size, seq_length, num_steps, dtype_str)
+            if ddp_result.get("passed") or not self.train_cfg.get("allow_fallback", False):
+                return ddp_result
+            self.console.print("[yellow]DDP synthetic training failed, falling back to single-process synthetic path[/yellow]")

        try:
            from transformers import AutoModelForCausalLM, AutoTokenizer
@ -87,9 +99,10 @@ class TrainingSim:
                BarColumn(), TextColumn("{task.completed}/{task.total}"),
                TimeElapsedColumn(), console=self.console,
            ) as progress:
-                task = progress.add_task("Training steps...", total=num_steps)
+                total_steps = num_steps + warmup_steps
+                task = progress.add_task("Training steps...", total=total_steps)

-                for step in range(num_steps):
+                for step in range(total_steps):
                    torch.cuda.synchronize()
                    t0 = time.perf_counter()

@ -119,8 +132,15 @@ class TrainingSim:

                    progress.advance(task)

-            avg_step_time = sum(step_times) / len(step_times)
+            measured_steps = step_times[warmup_steps:] if len(step_times) > warmup_steps else step_times
+            avg_step_time = sum(measured_steps) / len(measured_steps)
            throughput = batch_size * seq_length / avg_step_time
+            jitter = self._jitter_pct(measured_steps)
+            peak_mem = round(max(mem_usage) if mem_usage else 0, 2)
+            final_loss = float(loss.item()) if hasattr(loss, "item") else float("nan")
+            passed = self._acceptance_pass(throughput, jitter, peak_mem, final_loss)
+            if self.train_cfg.get("require_distributed", True):
+                passed = False

            return {
                "model": model_name,
@ -130,11 +150,18 @@ class TrainingSim:
                "batch_size": batch_size,
                "seq_length": seq_length,
                "num_steps": num_steps,
+                "warmup_steps": warmup_steps,
+                "total_steps": total_steps,
                "avg_step_time_ms": round(avg_step_time * 1000, 1),
                "throughput_tokens_per_sec": round(throughput, 0),
                "throughput_samples_per_sec": round(batch_size / avg_step_time, 2),
-                "peak_memory_gb": round(max(mem_usage) if mem_usage else 0, 2),
-                "final_loss": round(loss.item(), 4) if hasattr(loss, 'item') else None,
+                "peak_memory_gb": peak_mem,
+                "final_loss": round(final_loss, 4),
+                "step_jitter_pct": round(jitter, 2),
+                "distributed_mode": "device_map",
+                "loss_finite": math.isfinite(final_loss),
+                "passed": passed,
+                "acceptance_gap": "8-GPU DDP was not used" if self.train_cfg.get("require_distributed", True) else "",
                "timestamp": datetime.now().isoformat(),
            }

@ -142,6 +169,196 @@ class TrainingSim:
            self.console.print(f"[yellow]Model loading failed: {e}[/yellow]")
            return self._run_synthetic(gpu_count, batch_size, seq_length, num_steps, dtype)

+    def _run_synthetic_ddp(self, gpu_count: int, batch_size: int, seq_length: int,
+                           num_steps: int, dtype_str: str) -> dict:
+        """Run the 1.5B synthetic Transformer with one process per GPU.
+
+        A self-contained training script is written to a temporary file and launched
+        through ``torchrun`` with ``gpu_count`` processes. Rank 0 prints one
+        ``TRAINING_DDP_JSON=`` line that is parsed back into the result dict.
+
+        Args:
+            gpu_count: Number of local processes (one per GPU) to launch.
+            batch_size: Requested global batch size; each rank runs
+                ``max(1, batch_size // world_size)`` samples.
+            seq_length: Token sequence length of the synthetic input.
+            num_steps: Measured steps (``warmup_steps`` from the config are added).
+            dtype_str: ``fp16``, ``bf16`` or ``fp32``; unknown names fall back to bf16
+                inside the worker script.
+
+        Returns:
+            The metrics dict reported by rank 0 plus ``passed`` and ``timestamp``, or
+            a failure dict with ``passed: False`` and an ``error`` string when
+            torchrun is missing, times out, exits non-zero or emits unusable output.
+        """
+
+        def _failure(error: str) -> dict:
+            # Shared shape for every early-exit failure result.
+            return {
+                "model": "synthetic_transformer_1.5b",
+                "gpu_count": gpu_count,
+                "distributed_mode": "ddp",
+                "passed": False,
+                "error": error,
+                "timestamp": datetime.now().isoformat(),
+            }
+
+        # Prefer the torchrun that sits next to the running interpreter, then PATH.
+        torchrun = os.path.join(os.path.dirname(sys.executable), "torchrun")
+        if not os.path.isfile(torchrun):
+            torchrun = shutil.which("torchrun") or ""
+        if not torchrun:
+            return _failure("torchrun not found")
+
+        script = r'''
+import json
+import math
+import os
+import time
+import torch
+import torch.distributed as dist
+from torch.nn.parallel import DistributedDataParallel as DDP
+
+def main():
+    local_rank = int(os.environ["LOCAL_RANK"])
+    world_size = int(os.environ["WORLD_SIZE"])
+    torch.cuda.set_device(local_rank)
+    dist.init_process_group("nccl")
+
+    global_batch = int(os.environ["TRAIN_BATCH_SIZE"])
+    local_batch = max(1, global_batch // world_size)
+    # Samples actually processed per step across all ranks. This differs from the
+    # requested global batch whenever it is not a multiple of world_size.
+    effective_batch = local_batch * world_size
+    seq_length = int(os.environ["TRAIN_SEQ_LENGTH"])
+    num_steps = int(os.environ["TRAIN_NUM_STEPS"])
+    warmup_steps = int(os.environ.get("TRAIN_WARMUP_STEPS", "5"))
+    total_steps = num_steps + warmup_steps
+    dtype_name = os.environ.get("TRAIN_DTYPE", "bf16")
+    dtype = {"fp16": torch.float16, "bf16": torch.bfloat16, "fp32": torch.float32}.get(dtype_name, torch.bfloat16)
+
+    hidden_size = 4096
+    num_layers = 6
+    num_heads = 32
+    vocab_size = 32000
+
+    class SyntheticTransformer(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.embed = torch.nn.Embedding(vocab_size, hidden_size)
+            self.layers = torch.nn.ModuleList([
+                torch.nn.TransformerEncoderLayer(
+                    d_model=hidden_size,
+                    nhead=num_heads,
+                    dim_feedforward=hidden_size * 4,
+                    batch_first=True,
+                    dtype=dtype,
+                ) for _ in range(num_layers)
+            ])
+            self.head = torch.nn.Linear(hidden_size, vocab_size, dtype=dtype)
+
+        def forward(self, x):
+            h = self.embed(x).to(dtype)
+            for layer in self.layers:
+                h = layer(h)
+            return self.head(h)
+
+    model = SyntheticTransformer().cuda()
+    total_params = sum(p.numel() for p in model.parameters())
+    model = DDP(model, device_ids=[local_rank], output_device=local_rank)
+    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
+    input_ids = torch.randint(0, vocab_size, (local_batch, seq_length), device="cuda")
+    step_times = []
+    last_loss = torch.tensor(float("nan"), device="cuda")
+    torch.cuda.reset_peak_memory_stats(local_rank)
+
+    for _ in range(total_steps):
+        torch.cuda.synchronize()
+        t0 = time.perf_counter()
+        with torch.amp.autocast("cuda", dtype=dtype, enabled=dtype in (torch.float16, torch.bfloat16)):
+            logits = model(input_ids)
+            loss = torch.nn.functional.cross_entropy(logits.reshape(-1, vocab_size), input_ids.reshape(-1))
+        loss.backward()
+        optimizer.step()
+        optimizer.zero_grad(set_to_none=True)
+        torch.cuda.synchronize()
+        step_times.append(time.perf_counter() - t0)
+        last_loss = loss.detach()
+
+    peak_mem = torch.tensor(torch.cuda.max_memory_allocated(local_rank) / 1024**3, device="cuda")
+    dist.all_reduce(peak_mem, op=dist.ReduceOp.MAX)
+    finite = torch.tensor(1 if math.isfinite(float(last_loss.item())) else 0, device="cuda")
+    dist.all_reduce(finite, op=dist.ReduceOp.MIN)
+
+    if dist.get_rank() == 0:
+        measured_steps = step_times[warmup_steps:] if len(step_times) > warmup_steps else step_times
+        avg_step = sum(measured_steps) / len(measured_steps)
+        mean = avg_step
+        jitter = max(abs(v - mean) / mean * 100 for v in measured_steps) if mean else 0.0
+        throughput = effective_batch * seq_length / avg_step if avg_step else 0.0
+        print("TRAINING_DDP_JSON=" + json.dumps({
+            "model": "synthetic_transformer_1.5b",
+            "total_params_m": round(total_params / 1e6, 1),
+            "num_layers": num_layers,
+            "hidden_size": hidden_size,
+            "gpu_count": world_size,
+            "dtype": dtype_name,
+            "batch_size": global_batch,
+            "local_batch_size": local_batch,
+            "effective_batch_size": effective_batch,
+            "seq_length": seq_length,
+            "num_steps": num_steps,
+            "warmup_steps": warmup_steps,
+            "total_steps": total_steps,
+            "avg_step_time_ms": round(avg_step * 1000, 1),
+            "throughput_tokens_per_sec": round(throughput, 0),
+            "throughput_samples_per_sec": round(effective_batch / avg_step, 2) if avg_step else 0,
+            "peak_memory_gb": round(float(peak_mem.item()), 2),
+            "final_loss": round(float(last_loss.item()), 4),
+            "step_jitter_pct": round(jitter, 2),
+            "distributed_mode": "ddp",
+            "loss_finite": bool(int(finite.item())),
+        }), flush=True)
+    dist.destroy_process_group()
+
+if __name__ == "__main__":
+    main()
+'''
+        # delete=False because torchrun must be able to reopen the file by name;
+        # the finally clause below removes it on every exit path.
+        tmp = tempfile.NamedTemporaryFile("w", suffix="_training_ddp.py", delete=False)
+        try:
+            with tmp:
+                tmp.write(script)
+
+            env = {
+                **os.environ,
+                "TRAIN_BATCH_SIZE": str(batch_size),
+                "TRAIN_SEQ_LENGTH": str(seq_length),
+                "TRAIN_NUM_STEPS": str(num_steps),
+                "TRAIN_WARMUP_STEPS": str(int(self.train_cfg.get("warmup_steps", 5))),
+                "TRAIN_DTYPE": dtype_str,
+                # Keep a caller-provided NCCL_DEBUG, otherwise default to WARN.
+                "NCCL_DEBUG": os.environ.get("NCCL_DEBUG", "WARN"),
+            }
+            cmd = [torchrun, f"--nproc_per_node={gpu_count}", tmp.name]
+            self.console.print(f"  Running synthetic 1.5B DDP via torchrun ({gpu_count} processes)...")
+            timeout = int(self.train_cfg.get("timeout_sec", max(600, num_steps * 180)))
+            r = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout, env=env)
+        except subprocess.TimeoutExpired:
+            return _failure("training_ddp_timeout")
+        finally:
+            try:
+                os.unlink(tmp.name)
+            except OSError:
+                pass
+
+        # The last marker line wins if the marker appears more than once.
+        marker = "TRAINING_DDP_JSON="
+        payload = None
+        for line in (r.stdout + "\n" + r.stderr).splitlines():
+            if marker in line:
+                payload = line.split(marker, 1)[1].strip()
+        if r.returncode != 0 or not payload:
+            return _failure((r.stderr or r.stdout or "training_ddp_failed")[-1000:])
+
+        # Worker output can be interleaved with other log lines; treat an
+        # unparsable or non-object payload as a failed run rather than crashing.
+        try:
+            result = json.loads(payload)
+        except ValueError as exc:
+            return _failure(f"training_ddp_invalid_json: {exc}")
+        if not isinstance(result, dict):
+            return _failure("training_ddp_invalid_json: payload is not an object")
+
+        loss_value = float(result.get("final_loss", "nan"))
+        passed = self._acceptance_pass(
+            float(result.get("throughput_tokens_per_sec", 0)),
+            float(result.get("step_jitter_pct", 999)),
+            float(result.get("peak_memory_gb", 999)),
+            loss_value,
+        ) and bool(result.get("loss_finite", False)) and result.get("gpu_count") == gpu_count
+        result.update({
+            "passed": passed,
+            "timestamp": datetime.now().isoformat(),
+        })
+        return result
+
    def _run_synthetic(self, gpu_count, batch_size, seq_length, num_steps, dtype) -> dict:
        self.console.print("  Running synthetic training benchmark...")

@ -170,11 +387,17 @@ class TrainingSim:
                    h = layer(h)
                return self.head(h)

-        model = SyntheticTransformer().cuda()
+        model = SyntheticTransformer()
        total_params = sum(p.numel() for p in model.parameters())

        self.console.print(f"  Synthetic params: {total_params / 1e6:.1f}M")

+        distributed_mode = "single_gpu"
+        if gpu_count > 1:
+            model = torch.nn.DataParallel(model).cuda()
+            distributed_mode = "data_parallel"
+        else:
+            model = model.cuda()
        model.train()
        optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

@ -183,14 +406,17 @@ class TrainingSim:
        step_times = []
        mem_usage = []

+        warmup_steps = int(self.train_cfg.get("warmup_steps", 5))
+        total_steps = num_steps + warmup_steps
+
        with Progress(
            SpinnerColumn(), TextColumn("[progress.description]{task.description}"),
            BarColumn(), TextColumn("{task.completed}/{task.total}"),
            TimeElapsedColumn(), console=self.console,
        ) as progress:
-            task = progress.add_task("Synthetic training...", total=num_steps)
+            task = progress.add_task("Synthetic training...", total=total_steps)

-            for step in range(num_steps):
+            for step in range(total_steps):
                torch.cuda.synchronize()
                t0 = time.perf_counter()

@ -206,14 +432,22 @@ class TrainingSim:
                elapsed = time.perf_counter() - t0
                step_times.append(elapsed)

-                mem_used = torch.cuda.max_memory_allocated() / 1024**3
+                mem_used = max(torch.cuda.max_memory_allocated(i) for i in range(gpu_count)) / 1024**3
                mem_usage.append(mem_used)
-                torch.cuda.reset_peak_memory_stats()
+                for i in range(gpu_count):
+                    torch.cuda.reset_peak_memory_stats(i)

                progress.advance(task)

-        avg_step_time = sum(step_times) / len(step_times)
+        measured_steps = step_times[warmup_steps:] if len(step_times) > warmup_steps else step_times
+        avg_step_time = sum(measured_steps) / len(measured_steps)
        throughput = batch_size * seq_length / avg_step_time
+        jitter = self._jitter_pct(measured_steps)
+        peak_mem = round(max(mem_usage) if mem_usage else 0, 2)
+        final_loss = float(loss.item())
+        passed = self._acceptance_pass(throughput, jitter, peak_mem, final_loss)
+        if self.train_cfg.get("require_distributed", True):
+            passed = False

        return {
            "model": "synthetic_transformer",
@ -225,14 +459,36 @@ class TrainingSim:
            "batch_size": batch_size,
            "seq_length": seq_length,
            "num_steps": num_steps,
+            "warmup_steps": warmup_steps,
+            "total_steps": total_steps,
            "avg_step_time_ms": round(avg_step_time * 1000, 1),
            "throughput_tokens_per_sec": round(throughput, 0),
            "throughput_samples_per_sec": round(batch_size / avg_step_time, 2),
-            "peak_memory_gb": round(max(mem_usage) if mem_usage else 0, 2),
-            "final_loss": round(loss.item(), 4),
+            "peak_memory_gb": peak_mem,
+            "final_loss": round(final_loss, 4),
+            "step_jitter_pct": round(jitter, 2),
+            "distributed_mode": distributed_mode,
+            "loss_finite": math.isfinite(final_loss),
+            "passed": passed,
+            "acceptance_gap": "8-GPU DDP was not used" if self.train_cfg.get("require_distributed", True) else "",
            "timestamp": datetime.now().isoformat(),
        }

+    @staticmethod
+    def _jitter_pct(step_times: list[float]) -> float:
+        if not step_times:
+            return 0.0
+        mean = sum(step_times) / len(step_times)
+        return max(abs(v - mean) / mean * 100 for v in step_times) if mean else 0.0
+
+    def _acceptance_pass(self, throughput: float, jitter: float, peak_mem: float, loss_value: float) -> bool:
+        return (
+            throughput >= float(self.train_cfg.get("min_tokens_per_sec", 45000))
+            and jitter <= float(self.train_cfg.get("max_step_jitter_pct", 3))
+            and peak_mem <= float(self.train_cfg.get("max_peak_memory_gb", 70))
+            and math.isfinite(loss_value)
+        )
+
    @staticmethod
    def print_results(results: dict, console: Console = None):
        c = console or Console()
@ -254,11 +510,15 @@ class TrainingSim:
            ("Batch Size", str(results.get("batch_size", "N/A"))),
            ("Seq Length", str(results.get("seq_length", "N/A"))),
            ("Steps", str(results.get("num_steps", "N/A"))),
+            ("Warmup Steps", str(results.get("warmup_steps", "N/A"))),
            ("Avg Step Time", f"{results.get('avg_step_time_ms', 'N/A')} ms"),
            ("Throughput", f"{results.get('throughput_tokens_per_sec', 'N/A')} tokens/s"),
            ("Samples/sec", f"{results.get('throughput_samples_per_sec', 'N/A')}"),
            ("Peak Memory", f"{results.get('peak_memory_gb', 'N/A')} GB"),
            ("Final Loss", str(results.get("final_loss", "N/A"))),
+            ("Step Jitter", f"{results.get('step_jitter_pct', 'N/A')}%"),
+            ("Distributed Mode", results.get("distributed_mode", "N/A")),
+            ("Verdict", "PASS" if results.get("passed") else "FAIL"),
        ]
        for label, val in metrics:
            table.add_row(label, str(val))
--- a/scripts/cublaslt_fp8_gemm_bench.cu
+++ b/scripts/cublaslt_fp8_gemm_bench.cu
@ -0,0 +1,291 @@
+#include <cublasLt.h>
+#include <cuda_bf16.h>
+#include <cuda_fp8.h>
+#include <cuda_runtime.h>
+
+#include <algorithm>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <numeric>
+#include <string>
+#include <vector>
+
+// Evaluate a CUDA runtime call once; on any status other than cudaSuccess,
+// print the file, line and cudaGetErrorString() text to stderr and exit(1).
+// The do/while(0) wrapper lets the macro be used as a single statement.
+#define CHECK_CUDA(call)                                                       \
+  do {                                                                         \
+    cudaError_t status = (call);                                               \
+    if (status != cudaSuccess) {                                               \
+      std::fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,       \
+                   cudaGetErrorString(status));                                \
+      std::exit(1);                                                            \
+    }                                                                          \
+  } while (0)
+
+// Evaluate a cuBLASLt call once; on any status other than
+// CUBLAS_STATUS_SUCCESS, print the file, line and numeric status code to
+// stderr and exit(1).
+#define CHECK_CUBLAS(call)                                                     \
+  do {                                                                         \
+    cublasStatus_t status = (call);                                            \
+    if (status != CUBLAS_STATUS_SUCCESS) {                                     \
+      std::fprintf(stderr, "cuBLASLt error %s:%d: status=%d\n", __FILE__,      \
+                   __LINE__, static_cast<int>(status));                        \
+      std::exit(1);                                                            \
+    }                                                                          \
+  } while (0)
+
+// Fill `count` FP8 (E4M3) elements starting at `ptr` with `value`.
+// Each thread starts at its global index and advances by the total number of
+// launched threads, so any launch configuration covers the whole buffer.
+__global__ void fill_fp8(__nv_fp8_e4m3 *ptr, size_t count, float value) {
+  // Widen before multiplying: blockIdx.x, blockDim.x and gridDim.x are 32-bit
+  // unsigned, so their product would wrap for launches beyond 2^32 threads.
+  const size_t tid =
+      static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+  const size_t stride = static_cast<size_t>(blockDim.x) * gridDim.x;
+  for (size_t i = tid; i < count; i += stride) {
+    ptr[i] = __nv_fp8_e4m3(value);
+  }
+}
+
+// Command-line options for the benchmark, with their defaults.
+struct Args {
+  int matrix_size = 8192;     // used for all three GEMM dimensions (m = n = k)
+  int warmup = 20;            // untimed matmul calls issued before measuring
+  int iterations = 200;       // timed matmul calls
+  int first_gpu = 0;          // index of the first device to benchmark
+  int gpu_count = -1;         // devices to benchmark; negative = all from first_gpu
+  size_t workspace_mb = 256;  // cuBLASLt workspace size in MiB
+  int fast_accum = 1;         // non-zero sets CUBLASLT_MATMUL_DESC_FAST_ACCUM
+};
+
+// Parse command-line flags into an Args value.
+//
+// Every flag takes one integer value. Values are parsed strictly: text that is
+// not a complete base-10 integer, or that falls outside the range valid for
+// the flag, is reported on stderr and the process exits with status 2 (the
+// same status used for a missing value or an unknown flag). --help / -h prints
+// usage and exits with status 0.
+static Args parse_args(int argc, char **argv) {
+  constexpr long long kIntMax = 2147483647LL;
+  // Upper bound keeps workspace_mb * 1 MiB far below size_t overflow.
+  constexpr long long kMaxWorkspaceMb = 1LL << 24;
+
+  Args args;
+  for (int i = 1; i < argc; ++i) {
+    // Return the value following the current flag, advancing i past it.
+    auto need = [&](const char *name) {
+      if (i + 1 >= argc) {
+        std::fprintf(stderr, "Missing value for %s\n", name);
+        std::exit(2);
+      }
+      return argv[++i];
+    };
+    // Like need(), but also parse and range-check the value. strtoll clamps
+    // overflowing input to LLONG_MIN/LLONG_MAX, which the range check rejects.
+    auto need_int = [&](const char *name, long long lo, long long hi) {
+      const char *text = need(name);
+      char *end = nullptr;
+      const long long value = std::strtoll(text, &end, 10);
+      if (end == text || *end != '\0' || value < lo || value > hi) {
+        std::fprintf(stderr,
+                     "Invalid value for %s: '%s' (expected an integer in "
+                     "[%lld, %lld])\n",
+                     name, text, lo, hi);
+        std::exit(2);
+      }
+      return value;
+    };
+    if (!std::strcmp(argv[i], "--matrix-size")) {
+      args.matrix_size = static_cast<int>(need_int(argv[i], 1, kIntMax));
+    } else if (!std::strcmp(argv[i], "--warmup")) {
+      args.warmup = static_cast<int>(need_int(argv[i], 0, kIntMax));
+    } else if (!std::strcmp(argv[i], "--iterations")) {
+      // At least one timed iteration is needed to compute a throughput.
+      args.iterations = static_cast<int>(need_int(argv[i], 1, kIntMax));
+    } else if (!std::strcmp(argv[i], "--first-gpu")) {
+      args.first_gpu = static_cast<int>(need_int(argv[i], 0, kIntMax));
+    } else if (!std::strcmp(argv[i], "--gpu-count")) {
+      // Negative values keep their meaning of "all devices from first_gpu".
+      args.gpu_count = static_cast<int>(need_int(argv[i], -kIntMax, kIntMax));
+    } else if (!std::strcmp(argv[i], "--workspace-mb")) {
+      args.workspace_mb =
+          static_cast<size_t>(need_int(argv[i], 0, kMaxWorkspaceMb));
+    } else if (!std::strcmp(argv[i], "--fast-accum")) {
+      // Any non-zero integer enables fast accumulation, as before.
+      args.fast_accum = static_cast<int>(need_int(argv[i], -kIntMax, kIntMax));
+    } else if (!std::strcmp(argv[i], "--help") || !std::strcmp(argv[i], "-h")) {
+      std::puts("Usage: cublaslt_fp8_gemm_bench [--matrix-size N] [--warmup N] "
+                "[--iterations N] [--first-gpu N] [--gpu-count N] "
+                "[--workspace-mb N] [--fast-accum 0|1]");
+      std::exit(0);
+    } else {
+      std::fprintf(stderr, "Unknown argument: %s\n", argv[i]);
+      std::exit(2);
+    }
+  }
+  return args;
+}
+
+// Benchmark a square FP8 (E4M3 inputs, BF16 output) cuBLASLt GEMM on one GPU.
+//
+// Allocates and fills the operands on device `gpu`, asks cuBLASLt for a single
+// heuristic algorithm, runs args.warmup untimed and args.iterations timed
+// matmuls, prints one JSON object describing the result to stdout, releases
+// every resource, and returns the measured throughput in TFLOPS. Any CUDA or
+// cuBLASLt failure terminates the process through the CHECK_* macros.
+static double run_one_gpu(int gpu, const Args &args) {
+  CHECK_CUDA(cudaSetDevice(gpu));
+
+  const int64_t m = args.matrix_size;
+  const int64_t n = args.matrix_size;
+  const int64_t k = args.matrix_size;
+  const size_t a_elems = static_cast<size_t>(m) * k;
+  const size_t b_elems = static_cast<size_t>(k) * n;
+  const size_t d_elems = static_cast<size_t>(m) * n;
+
+  __nv_fp8_e4m3 *d_a = nullptr;
+  __nv_fp8_e4m3 *d_b = nullptr;
+  __nv_bfloat16 *d_d = nullptr;
+  void *workspace = nullptr;
+  float *d_scale_a = nullptr;
+  float *d_scale_b = nullptr;
+  const float scale = 1.0f;
+  const size_t workspace_bytes = args.workspace_mb * 1024ULL * 1024ULL;
+
+  CHECK_CUDA(cudaMalloc(&d_a, a_elems * sizeof(__nv_fp8_e4m3)));
+  CHECK_CUDA(cudaMalloc(&d_b, b_elems * sizeof(__nv_fp8_e4m3)));
+  CHECK_CUDA(cudaMalloc(&d_d, d_elems * sizeof(__nv_bfloat16)));
+  CHECK_CUDA(cudaMalloc(&workspace, workspace_bytes));
+  CHECK_CUDA(cudaMalloc(&d_scale_a, sizeof(float)));
+  CHECK_CUDA(cudaMalloc(&d_scale_b, sizeof(float)));
+  CHECK_CUDA(cudaMemcpy(d_scale_a, &scale, sizeof(scale), cudaMemcpyHostToDevice));
+  CHECK_CUDA(cudaMemcpy(d_scale_b, &scale, sizeof(scale), cudaMemcpyHostToDevice));
+
+  // Fill both inputs with a constant and zero the output.
+  const int threads = 256;
+  const int blocks = 4096;
+  fill_fp8<<<blocks, threads>>>(d_a, a_elems, 0.01f);
+  fill_fp8<<<blocks, threads>>>(d_b, b_elems, 0.01f);
+  CHECK_CUDA(cudaMemset(d_d, 0, d_elems * sizeof(__nv_bfloat16)));
+  // Kernel launches return no status, so query the last error explicitly.
+  CHECK_CUDA(cudaGetLastError());
+  CHECK_CUDA(cudaDeviceSynchronize());
+
+  cublasLtHandle_t lt;
+  cublasLtMatmulDesc_t op_desc;
+  cublasLtMatrixLayout_t a_desc, b_desc, d_desc;
+  cublasLtMatmulPreference_t preference;
+  CHECK_CUBLAS(cublasLtCreate(&lt));
+  CHECK_CUBLAS(cublasLtMatmulDescCreate(&op_desc, CUBLAS_COMPUTE_32F, CUDA_R_32F));
+
+  // cuBLASLt FP8 kernels require TN format: A is transposed, B is non-transposed.
+  // With square GEMMs this keeps the benchmark FLOP count identical to the PDF
+  // acceptance shape while satisfying the library's FP8 kernel constraints.
+  cublasOperation_t transa = CUBLAS_OP_T;
+  cublasOperation_t transb = CUBLAS_OP_N;
+  CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(
+      op_desc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(transa)));
+  CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(
+      op_desc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transb)));
+  // The scale attributes take the device pointer itself as the attribute
+  // value, hence the address and size of the pointer variable.
+  CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(
+      op_desc, CUBLASLT_MATMUL_DESC_A_SCALE_POINTER, &d_scale_a,
+      sizeof(d_scale_a)));
+  CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(
+      op_desc, CUBLASLT_MATMUL_DESC_B_SCALE_POINTER, &d_scale_b,
+      sizeof(d_scale_b)));
+  int8_t fast_accum = args.fast_accum ? 1 : 0;
+  CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(
+      op_desc, CUBLASLT_MATMUL_DESC_FAST_ACCUM, &fast_accum,
+      sizeof(fast_accum)));
+
+  // Layout arguments are (rows, cols, leading dimension). A is described as
+  // k x m because it is consumed transposed.
+  CHECK_CUBLAS(cublasLtMatrixLayoutCreate(&a_desc, CUDA_R_8F_E4M3, k, m, k));
+  CHECK_CUBLAS(cublasLtMatrixLayoutCreate(&b_desc, CUDA_R_8F_E4M3, k, n, k));
+  CHECK_CUBLAS(cublasLtMatrixLayoutCreate(&d_desc, CUDA_R_16BF, m, n, m));
+
+  CHECK_CUBLAS(cublasLtMatmulPreferenceCreate(&preference));
+  CHECK_CUBLAS(cublasLtMatmulPreferenceSetAttribute(
+      preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspace_bytes,
+      sizeof(workspace_bytes)));
+
+  // Request exactly one candidate algorithm; d_desc serves as both the C and
+  // the D layout.
+  cublasLtMatmulHeuristicResult_t heuristic;
+  int returned = 0;
+  CHECK_CUBLAS(cublasLtMatmulAlgoGetHeuristic(
+      lt, op_desc, a_desc, b_desc, d_desc, d_desc, preference, 1, &heuristic,
+      &returned));
+  if (returned == 0) {
+    std::fprintf(stderr, "No cuBLASLt heuristic returned for GPU %d\n", gpu);
+    std::exit(1);
+  }
+
+  // Helpers that read one configuration attribute of the selected algorithm,
+  // one per storage width, each returning the value as int for printing.
+  auto get_algo_attr_i32 = [&](cublasLtMatmulAlgoConfigAttributes_t attr) {
+    int32_t value = -1;
+    size_t written = 0;
+    CHECK_CUBLAS(cublasLtMatmulAlgoConfigGetAttribute(
+        &heuristic.algo, attr, &value, sizeof(value), &written));
+    return static_cast<int>(value);
+  };
+  auto get_algo_attr_u32 = [&](cublasLtMatmulAlgoConfigAttributes_t attr) {
+    uint32_t value = 0;
+    size_t written = 0;
+    CHECK_CUBLAS(cublasLtMatmulAlgoConfigGetAttribute(
+        &heuristic.algo, attr, &value, sizeof(value), &written));
+    return static_cast<int>(value);
+  };
+  auto get_algo_attr_u16 = [&](cublasLtMatmulAlgoConfigAttributes_t attr) {
+    uint16_t value = 0;
+    size_t written = 0;
+    CHECK_CUBLAS(cublasLtMatmulAlgoConfigGetAttribute(
+        &heuristic.algo, attr, &value, sizeof(value), &written));
+    return static_cast<int>(value);
+  };
+  const int algo_id = get_algo_attr_i32(CUBLASLT_ALGO_CONFIG_ID);
+  const int tile_id = get_algo_attr_u32(CUBLASLT_ALGO_CONFIG_TILE_ID);
+  const int splitk = get_algo_attr_i32(CUBLASLT_ALGO_CONFIG_SPLITK_NUM);
+  const int stages = get_algo_attr_u32(CUBLASLT_ALGO_CONFIG_STAGES_ID);
+  const int inner_shape = get_algo_attr_u16(CUBLASLT_ALGO_CONFIG_INNER_SHAPE_ID);
+  const int cluster_shape = get_algo_attr_u16(CUBLASLT_ALGO_CONFIG_CLUSTER_SHAPE_ID);
+
+  const float alpha = 1.0f;
+  const float beta = 0.0f;
+  // One GEMM on stream 0; d_d is passed as both the C input and the D output.
+  auto matmul = [&]() {
+    CHECK_CUBLAS(cublasLtMatmul(lt, op_desc, &alpha, d_a, a_desc, d_b, b_desc,
+                                &beta, d_d, d_desc, d_d, d_desc,
+                                &heuristic.algo, workspace, workspace_bytes, 0));
+  };
+
+  for (int i = 0; i < args.warmup; ++i) {
+    matmul();
+  }
+  CHECK_CUDA(cudaDeviceSynchronize());
+
+  // Time the whole batch of iterations with a pair of CUDA events.
+  cudaEvent_t start, stop;
+  CHECK_CUDA(cudaEventCreate(&start));
+  CHECK_CUDA(cudaEventCreate(&stop));
+  CHECK_CUDA(cudaEventRecord(start));
+  for (int i = 0; i < args.iterations; ++i) {
+    matmul();
+  }
+  CHECK_CUDA(cudaEventRecord(stop));
+  CHECK_CUDA(cudaEventSynchronize(stop));
+  float elapsed_ms = 0.0f;
+  CHECK_CUDA(cudaEventElapsedTime(&elapsed_ms, start, stop));
+  // 2*m*n*k floating-point operations per GEMM, summed over all iterations.
+  const double flops =
+      2.0 * static_cast<double>(m) * static_cast<double>(n) *
+      static_cast<double>(k) * static_cast<double>(args.iterations);
+  const double tflops = flops / (static_cast<double>(elapsed_ms) / 1000.0) / 1e12;
+  // Emit one array element; the trailing comma is omitted only for the last
+  // GPU of the range so the caller's surrounding JSON array stays valid.
+  std::printf(
+      "    {\"index\": %d, \"fp8_tflops\": %.1f, \"algo_id\": %d, "
+      "\"tile_id\": %d, \"splitk\": %d, \"stages_id\": %d, "
+      "\"inner_shape_id\": %d, \"cluster_shape_id\": %d}%s\n",
+      gpu, tflops, algo_id, tile_id, splitk, stages, inner_shape, cluster_shape,
+      (gpu + 1 == args.first_gpu + args.gpu_count) ? "" : ",");
+  std::fflush(stdout);
+
+  CHECK_CUDA(cudaEventDestroy(start));
+  CHECK_CUDA(cudaEventDestroy(stop));
+  CHECK_CUBLAS(cublasLtMatmulPreferenceDestroy(preference));
+  CHECK_CUBLAS(cublasLtMatrixLayoutDestroy(a_desc));
+  CHECK_CUBLAS(cublasLtMatrixLayoutDestroy(b_desc));
+  CHECK_CUBLAS(cublasLtMatrixLayoutDestroy(d_desc));
+  CHECK_CUBLAS(cublasLtMatmulDescDestroy(op_desc));
+  CHECK_CUBLAS(cublasLtDestroy(lt));
+  CHECK_CUDA(cudaFree(d_a));
+  CHECK_CUDA(cudaFree(d_b));
+  CHECK_CUDA(cudaFree(d_d));
+  CHECK_CUDA(cudaFree(workspace));
+  CHECK_CUDA(cudaFree(d_scale_a));
+  CHECK_CUDA(cudaFree(d_scale_b));
+  CHECK_CUDA(cudaDeviceSynchronize());
+
+  return tflops;
+}
+
+// Entry point: benchmark every GPU in the requested range and print a single
+// JSON document with per-GPU results plus mean/min/max/spread statistics.
+//
+// Exit status: 0 when the mean throughput reaches the acceptance floor, 1 when
+// it does not, 2 when the requested GPU range is empty or outside the devices
+// reported by the CUDA runtime.
+int main(int argc, char **argv) {
+  // Mean-throughput floor (TFLOPS) that decides the process exit status.
+  constexpr double kMinMeanTflops = 1400.0;
+
+  Args args = parse_args(argc, argv);
+  int device_count = 0;
+  CHECK_CUDA(cudaGetDeviceCount(&device_count));
+  if (args.gpu_count < 0) {
+    args.gpu_count = device_count - args.first_gpu;
+  }
+  // An empty range must be rejected too: with no samples the statistics below
+  // would divide by zero and dereference the end iterator of an empty vector.
+  // The bound is written as a subtraction so a huge gpu_count cannot overflow.
+  if (args.first_gpu < 0 || args.gpu_count <= 0 ||
+      args.gpu_count > device_count - args.first_gpu) {
+    std::fprintf(stderr, "Invalid GPU range first=%d count=%d device_count=%d\n",
+                 args.first_gpu, args.gpu_count, device_count);
+    return 2;
+  }
+
+  std::vector<double> values;
+  values.reserve(static_cast<size_t>(args.gpu_count));
+  std::printf("{\n");
+  std::printf("  \"source\": \"cuBLASLt\",\n");
+  std::printf("  \"dtype\": \"fp8_e4m3_inputs_bf16_output_fp32_accum\",\n");
+  std::printf("  \"matrix_size\": %d,\n", args.matrix_size);
+  std::printf("  \"warmup\": %d,\n", args.warmup);
+  std::printf("  \"iterations\": %d,\n", args.iterations);
+  std::printf("  \"fast_accum\": %d,\n", args.fast_accum ? 1 : 0);
+  std::printf("  \"per_gpu\": [\n");
+  // run_one_gpu prints each array element itself, including the separator.
+  for (int i = 0; i < args.gpu_count; ++i) {
+    int gpu = args.first_gpu + i;
+    double tflops = run_one_gpu(gpu, args);
+    values.push_back(tflops);
+  }
+  double mean = std::accumulate(values.begin(), values.end(), 0.0) / values.size();
+  auto minmax = std::minmax_element(values.begin(), values.end());
+  // Spread is relative to the mean; report 0 rather than NaN for a zero mean.
+  double spread =
+      mean > 0.0 ? ((*minmax.second - *minmax.first) / mean) * 100.0 : 0.0;
+  std::printf("  ],\n");
+  std::printf("  \"mean_tflops\": %.1f,\n", mean);
+  std::printf("  \"min_tflops\": %.1f,\n", *minmax.first);
+  std::printf("  \"max_tflops\": %.1f,\n", *minmax.second);
+  std::printf("  \"spread_pct\": %.2f\n", spread);
+  std::printf("}\n");
+  return mean >= kMinMeanTflops ? 0 : 1;
+}
--- a/scripts/multinode_nccl_deep_diagnose.sh
+++ b/scripts/multinode_nccl_deep_diagnose.sh
@ -0,0 +1,425 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Deep-diagnose multi-node NCCL behavior from the coordinator node.
+# Default values match the current 2-node H100 cross-leaf investigation.
+
+MODE="${1:-all}"
+
+MPI_BIN="${MPI_BIN:-/usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun}"
+NCCL_TESTS_DIR="${NCCL_TESTS_DIR:-/data/nccl-tests-latest/build}"
+HOSTS="${HOSTS:-172.72.8.12:8,172.72.8.16:8}"
+PEER_HOST="${PEER_HOST:-172.72.8.16}"
+SSH_USER="${SSH_USER:-root}"
+HCAS="${HCAS:-mlx5_0 mlx5_1 mlx5_6 mlx5_7}"
+HCA_CSV="${HCA_CSV:-mlx5_0,mlx5_1,mlx5_6,mlx5_7}"
+OUT_DIR="${OUT_DIR:-/tmp/nccl_deep_diagnose_$(date +%Y%m%d_%H%M%S)}"
+
+BEGIN_SIZE="${BEGIN_SIZE:-16G}"
+END_SIZE="${END_SIZE:-16G}"
+WARMUP_ITERS="${WARMUP_ITERS:-10}"
+ITERS="${ITERS:-10}"
+GRAPH_WARMUP_ITERS="${GRAPH_WARMUP_ITERS:-1}"
+GRAPH_ITERS="${GRAPH_ITERS:-1}"
+SWEEP_WARMUP_ITERS="${SWEEP_WARMUP_ITERS:-3}"
+SWEEP_ITERS="${SWEEP_ITERS:-5}"
+
+NCCL_LD_LIBRARY_PATH="${NCCL_LD_LIBRARY_PATH:-/usr/mpi/gcc/openmpi-4.1.9a1/lib:/tmp/nccl-2.27.7-cuda12.4/usr/lib/x86_64-linux-gnu:/usr/local/cuda-12.4/targets/x86_64-linux/lib}"
+DEFAULT_NCCL_DEBUG="${NCCL_DEBUG:-WARN}"
+
+COUNTERS="${COUNTERS:-port_xmit_data port_rcv_data port_xmit_packets port_rcv_packets port_xmit_wait port_xmit_discards port_rcv_errors port_rcv_remote_physical_errors port_rcv_switch_relay_errors port_xmit_constraint_errors port_rcv_constraint_errors symbol_error link_error_recovery link_downed local_link_integrity_errors excessive_buffer_overrun_errors VL15_dropped}"
+HW_COUNTERS="${HW_COUNTERS:-roce_adp_retrans roce_adp_retrans_to roce_slow_restart roce_slow_restart_cnps roce_slow_restart_trans packet_seq_err out_of_sequence out_of_buffer duplicate_request implied_nak_seq_err local_ack_timeout_err req_transport_retries_exceeded rnr_nak_retry_err rx_write_requests rx_read_requests}"
+
+mkdir -p "$OUT_DIR"
+
+mpi_base=(
+  "$MPI_BIN"
+  --allow-run-as-root
+  --mca btl_openib_warn_no_device_params_found 0
+  --mca btl_tcp_if_include bond0
+  --mca oob_tcp_if_include bond0
+  --mca plm_rsh_args "-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o BatchMode=yes -o ConnectTimeout=10"
+  -H "$HOSTS"
+  --map-by ppr:8:node
+  -np 16
+)
+
+# Names of the environment variables that may be forwarded to the MPI ranks.
+# mpi_xargs turns each name that is currently set into an "-x NAME" argument.
+base_exports=(
+  LD_LIBRARY_PATH
+  NCCL_IB_GID_INDEX NCCL_IB_SL NCCL_IB_TC NCCL_SOCKET_IFNAME
+  NCCL_DEBUG NCCL_DEBUG_SUBSYS NCCL_IB_TIMEOUT NCCL_IB_HCA
+  NCCL_NET_PLUGIN NCCL_NVLS_ENABLE NCCL_NET_GDR_LEVEL NCCL_NET_GDR_READ
+  NCCL_DMABUF_ENABLE NCCL_PXN_DISABLE NCCL_IB_QPS_PER_CONNECTION
+  NCCL_IB_SPLIT_DATA_ON_QPS NCCL_MIN_NCHANNELS NCCL_MAX_NCHANNELS
+  NCCL_BUFFSIZE NCCL_P2P_NET_CHUNKSIZE NCCL_NCHANNELS_PER_NET_PEER
+  NCCL_IB_AR_THRESHOLD
+)
+
+set_common_env() {
+  unset NCCL_DEBUG_SUBSYS NCCL_PXN_DISABLE NCCL_IB_QPS_PER_CONNECTION
+  unset NCCL_IB_SPLIT_DATA_ON_QPS NCCL_MIN_NCHANNELS NCCL_MAX_NCHANNELS
+  unset NCCL_BUFFSIZE NCCL_P2P_NET_CHUNKSIZE NCCL_NCHANNELS_PER_NET_PEER
+  unset NCCL_IB_AR_THRESHOLD
+
+  export LD_LIBRARY_PATH="$NCCL_LD_LIBRARY_PATH"
+  export NCCL_IB_GID_INDEX="${NCCL_IB_GID_INDEX:-3}"
+  export NCCL_IB_SL="${NCCL_IB_SL:-5}"
+  export NCCL_IB_TC="${NCCL_IB_TC:-136}"
+  export NCCL_SOCKET_IFNAME="${NCCL_SOCKET_IFNAME:-bond0}"
+  export NCCL_DEBUG="$DEFAULT_NCCL_DEBUG"
+  export NCCL_IB_TIMEOUT="${NCCL_IB_TIMEOUT:-22}"
+  export NCCL_IB_HCA="$HCA_CSV"
+  export NCCL_NET_PLUGIN="${NCCL_NET_PLUGIN:-none}"
+  export NCCL_NVLS_ENABLE="${NCCL_NVLS_ENABLE:-1}"
+  export NCCL_NET_GDR_LEVEL="${NCCL_NET_GDR_LEVEL:-5}"
+  export NCCL_NET_GDR_READ="${NCCL_NET_GDR_READ:-1}"
+  export NCCL_DMABUF_ENABLE="${NCCL_DMABUF_ENABLE:-0}"
+}
+
+mpi_xargs() {
+  for name in "${base_exports[@]}"; do
+    if [[ -n "${!name+x}" ]]; then
+      printf -- '-x\n%s\n' "$name"
+    fi
+  done
+}
+
+run_nccl() {
+  local op="$1"
+  local bin="$2"
+  local log="$3"
+  local warmup="$4"
+  local iters="$5"
+  mapfile -t xargs < <(mpi_xargs)
+  "${mpi_base[@]}" "${xargs[@]}" \
+    "$bin" -b "$BEGIN_SIZE" -e "$END_SIZE" -g 1 -f 2 -w "$warmup" -n "$iters" \
+    >"$log" 2>&1
+  awk -v op="$op" '/Avg bus bandwidth/ {print op, $0}' "$log"
+}
+
+read_one_snapshot() {
+  local host_label="$1"
+  local out="$2"
+  : >"$out"
+  for hca in $HCAS; do
+    for c in $COUNTERS; do
+      local f="/sys/class/infiniband/$hca/ports/1/counters/$c"
+      if [[ -r "$f" ]]; then
+        printf '%s %s counters %s %s\n' "$host_label" "$hca" "$c" "$(cat "$f" 2>/dev/null || echo 0)" >>"$out"
+      fi
+    done
+    for c in $HW_COUNTERS; do
+      local f="/sys/class/infiniband/$hca/ports/1/hw_counters/$c"
+      if [[ -r "$f" ]]; then
+        printf '%s %s hw_counters %s %s\n' "$host_label" "$hca" "$c" "$(cat "$f" 2>/dev/null || echo 0)" >>"$out"
+      fi
+    done
+  done
+}
+
+# Collect the same counter snapshot as read_one_snapshot, but on PEER_HOST.
+#   $1 local output file receiving the remote script's stdout
+# The heredoc delimiter is quoted, so nothing in the script body is expanded
+# locally; HCAS, COUNTERS and HW_COUNTERS reach the remote shell through the
+# environment assignments placed in front of "bash -s". The remote side labels
+# each line with its own $HOSTNAME.
+read_remote_snapshot() {
+  local out="$1"
+  ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
+      -o BatchMode=yes -o ConnectTimeout=5 "${SSH_USER}@${PEER_HOST}" \
+      "HCAS='$HCAS' COUNTERS='$COUNTERS' HW_COUNTERS='$HW_COUNTERS' bash -s" <<'EOS' >"$out"
+for hca in $HCAS; do
+  for c in $COUNTERS; do
+    f="/sys/class/infiniband/$hca/ports/1/counters/$c"
+    if [ -r "$f" ]; then
+      printf '%s %s counters %s %s\n' "$HOSTNAME" "$hca" "$c" "$(cat "$f" 2>/dev/null || echo 0)"
+    fi
+  done
+  for c in $HW_COUNTERS; do
+    f="/sys/class/infiniband/$hca/ports/1/hw_counters/$c"
+    if [ -r "$f" ]; then
+      printf '%s %s hw_counters %s %s\n' "$HOSTNAME" "$hca" "$c" "$(cat "$f" 2>/dev/null || echo 0)"
+    fi
+  done
+done
+EOS
+}
+
+# Compare counter snapshots taken before and after a run and write a report.
+#   $1 before (local)  $2 before (remote)  $3 after (local)  $4 after (remote)
+#   $5 report file
+# The embedded Python merges the two "before" files and the two "after" files,
+# keyed by (host, hca, kind, counter), and prints two sections:
+#   NONZERO_DELTAS              every counter whose value changed; the two
+#                               *_data counters are also converted to GiB
+#                               using 4 bytes per counted unit
+#   ERROR_OR_CONGESTION_DELTAS  the changed counters that are in `interesting`,
+#                               or "none"
+# Lines without exactly five fields or with a non-integer value are ignored,
+# and a key missing from one side is treated as 0.
+summarize_counter_delta() {
+  local before_a="$1"
+  local before_b="$2"
+  local after_a="$3"
+  local after_b="$4"
+  local out="$5"
+  python3 - "$before_a" "$before_b" "$after_a" "$after_b" >"$out" <<'PY'
+import pathlib
+import sys
+
+interesting = {
+    "port_xmit_wait", "port_xmit_discards", "port_rcv_errors",
+    "port_rcv_remote_physical_errors", "port_rcv_switch_relay_errors",
+    "port_xmit_constraint_errors", "port_rcv_constraint_errors",
+    "symbol_error", "link_error_recovery", "link_downed",
+    "local_link_integrity_errors", "excessive_buffer_overrun_errors",
+    "VL15_dropped", "roce_adp_retrans", "roce_adp_retrans_to",
+    "roce_slow_restart", "roce_slow_restart_cnps", "roce_slow_restart_trans",
+    "packet_seq_err", "out_of_sequence", "out_of_buffer",
+    "duplicate_request", "implied_nak_seq_err", "local_ack_timeout_err",
+    "req_transport_retries_exceeded", "rnr_nak_retry_err",
+}
+
+def load(path):
+    data = {}
+    for line in pathlib.Path(path).read_text().splitlines():
+        parts = line.split()
+        if len(parts) != 5:
+            continue
+        host, hca, kind, counter, value = parts
+        try:
+            data[(host, hca, kind, counter)] = int(value)
+        except ValueError:
+            pass
+    return data
+
+before = {}
+after = {}
+before.update(load(sys.argv[1]))
+before.update(load(sys.argv[2]))
+after.update(load(sys.argv[3]))
+after.update(load(sys.argv[4]))
+
+print("NONZERO_DELTAS")
+for key in sorted(set(before) | set(after)):
+    delta = after.get(key, 0) - before.get(key, 0)
+    if not delta:
+        continue
+    host, hca, kind, counter = key
+    if counter in {"port_xmit_data", "port_rcv_data"}:
+        gib = delta * 4 / (1024 ** 3)
+        print(f"{host} {hca} {kind} {counter} {delta} words4B {gib:.2f} GiB")
+    else:
+        print(f"{host} {hca} {kind} {counter} {delta}")
+
+print("ERROR_OR_CONGESTION_DELTAS")
+seen = False
+for key in sorted(set(before) | set(after)):
+    delta = after.get(key, 0) - before.get(key, 0)
+    if delta and key[3] in interesting:
+        seen = True
+        print(*key, delta)
+if not seen:
+    print("none")
+PY
+}
+
+run_counter_case() {
+  local op="$1"
+  local bin="$2"
+  local extra="${3:-}"
+  set_common_env
+  if [[ -n "$extra" ]]; then
+    eval "export $extra"
+  fi
+  local dir="$OUT_DIR/${op}_counter"
+  mkdir -p "$dir"
+  read_one_snapshot "$(hostname)" "$dir/before.local"
+  read_remote_snapshot "$dir/before.remote"
+  run_nccl "$op" "$bin" "$dir/${op}.log" "$WARMUP_ITERS" "$ITERS"
+  read_one_snapshot "$(hostname)" "$dir/after.local"
+  read_remote_snapshot "$dir/after.remote"
+  summarize_counter_delta "$dir/before.local" "$dir/before.remote" "$dir/after.local" "$dir/after.remote" "$dir/counter_delta.txt"
+  echo "$dir"
+}
+
+# Condense an NCCL debug log into a short text summary.
+#   $1 log file to read   $2 summary file to write
+# The embedded Python only counts or extracts regular-expression matches from
+# the log text: the last reported average bus bandwidth, NCCL versions, the
+# "Using [...]" HCA lines, graph patterns, channel summaries, P2P chunk sizes,
+# transport tokens, and the number of per-channel edge lines.
+summarize_graph_log() {
+  local log="$1"
+  local out="$2"
+  python3 - "$log" >"$out" <<'PY'
+from pathlib import Path
+import collections
+import re
+import sys
+
+text = Path(sys.argv[1]).read_text(errors="ignore")
+print("avg_busbw", (re.findall(r"Avg bus bandwidth\s*:\s*([0-9.]+)", text) or ["NA"])[-1])
+print("nccl_version", sorted(set(re.findall(r"NCCL version ([^\s]+)", text))))
+print("plugin_missing", len(re.findall(r"Could not find: none libnccl-net-none\.so", text)))
+print("gdr_enabled_lines", len(re.findall(r"GPU Direct RDMA Enabled", text)))
+print("using_hca")
+for value, count in collections.Counter(re.findall(r"NET/IB : Using \[(.*?)\]; OOB", text)).most_common(4):
+    print(f"  {count} {value}")
+print("pattern_counts")
+patterns = re.findall(
+    r"Pattern (\d+), crossNic (\d+), nChannels (\d+), bw ([0-9.]+)/([0-9.]+), type ([^,]+), sameChannels (\d+)",
+    text,
+)
+for key, count in collections.Counter(patterns).most_common():
+    print(f"  {count} {key}")
+print("channel_summary")
+for value, count in collections.Counter(
+    re.findall(r"(\d+ coll channels, \d+ collnet channels, \d+ nvls channels, \d+ p2p channels, \d+ p2p channels per peer)", text)
+).most_common():
+    print(f"  {count} {value}")
+print("p2p_chunks", collections.Counter(re.findall(r"P2P Chunksize set to (\d+)", text)))
+print("check_p2p", collections.Counter(re.findall(r"Check P2P Type ([^\n]+)", text)))
+for token in ["NET/IB/0/GDRDMA", "NET/IB/1/GDRDMA", "NET/IB/2/GDRDMA", "NET/IB/3/GDRDMA", "P2P/CUMEM", "P2P/IPC", "SHM"]:
+    print(token, text.count(token))
+print("channel_edge_lines", len([line for line in text.splitlines() if "Channel " in line and ("via NET/IB" in line or "via P2P" in line)]))
+PY
+}
+
+run_graph_case() {
+  local op="$1"
+  local bin="$2"
+  local extra="${3:-}"
+  set_common_env
+  export NCCL_DEBUG=INFO
+  export NCCL_DEBUG_SUBSYS=INIT,NET,GRAPH,TUNING,COLL
+  if [[ -n "$extra" ]]; then
+    eval "export $extra"
+  fi
+  local dir="$OUT_DIR/graph"
+  mkdir -p "$dir"
+  local log="$dir/${op}.log"
+  run_nccl "$op" "$bin" "$log" "$GRAPH_WARMUP_ITERS" "$GRAPH_ITERS"
+  summarize_graph_log "$log" "$dir/${op}_summary.txt"
+  echo "$dir/${op}_summary.txt"
+}
+
+# Run alltoall_perf with NCCL_PXN_DISABLE=1 under a list of tuning variants.
+# Each entry of `cases` is "<name>|<VAR=value ...>"; the part after the bar is
+# exported on top of the baseline for that case only. Per-case logs are written
+# to $OUT_DIR/pxn_sweep/<name>.log and the bandwidth lines are appended to
+# summary.txt, whose path is printed at the end.
+run_pxn_sweep() {
+  local sweep_dir="$OUT_DIR/pxn_sweep"
+  mkdir -p "$sweep_dir"
+  local summary="$sweep_dir/summary.txt"
+  local cases=(
+    "baseline|"
+    "nvls_off|NCCL_NVLS_ENABLE=0"
+    "qps4_split1|NCCL_IB_QPS_PER_CONNECTION=4 NCCL_IB_SPLIT_DATA_ON_QPS=1"
+    "qps8_split1|NCCL_IB_QPS_PER_CONNECTION=8 NCCL_IB_SPLIT_DATA_ON_QPS=1"
+    "qps4_split0|NCCL_IB_QPS_PER_CONNECTION=4 NCCL_IB_SPLIT_DATA_ON_QPS=0"
+    "channels16|NCCL_MIN_NCHANNELS=16 NCCL_MAX_NCHANNELS=16"
+    "buff8m|NCCL_BUFFSIZE=8388608"
+    "p2pchunk4m|NCCL_P2P_NET_CHUNKSIZE=4194304"
+    "netpeer8|NCCL_NCHANNELS_PER_NET_PEER=8"
+    "ar0|NCCL_IB_AR_THRESHOLD=0"
+  )
+  : >"$summary"
+  local entry case_name case_env case_log
+  for entry in "${cases[@]}"; do
+    # Split at the first "|": the name before it, the assignments after it.
+    IFS='|' read -r case_name case_env <<<"$entry"
+    set_common_env
+    export NCCL_PXN_DISABLE=1
+    [[ -z "$case_env" ]] || eval "export $case_env"
+    case_log="$sweep_dir/${case_name}.log"
+    {
+      echo "===== CASE $case_name ====="
+      echo "extra: ${case_env:-none}"
+      run_nccl "alltoall" "$NCCL_TESTS_DIR/alltoall_perf" "$case_log" "$SWEEP_WARMUP_ITERS" "$SWEEP_ITERS"
+      awk '/Avg bus bandwidth/ {print}' "$case_log" | tail -1
+    } | tee -a "$summary"
+  done
+  echo "$summary"
+}
+
+# Check that both nodes have what the NCCL runs need, without running them.
+# For the local node and for PEER_HOST (over ssh) it reports: the hostname,
+# whether MPI_BIN is executable and its first version line, whether the
+# all_reduce_perf and alltoall_perf binaries are executable, and the port 1
+# state and rate of each HCA in HCAS. Findings are reported as OK/MISSING lines
+# rather than as a failing exit status. The combined report is shown on stdout
+# and saved to $OUT_DIR/preflight.txt, whose path is printed last.
+# The remote script's heredoc is quoted, so MPI_BIN, NCCL_TESTS_DIR and HCAS
+# are handed over through the assignments in front of "bash -s".
+run_preflight() {
+  set_common_env
+  local out="$OUT_DIR/preflight.txt"
+  {
+    echo "===== LOCAL ====="
+    echo "hostname: $(hostname)"
+    echo "mpirun: $MPI_BIN"
+    if [[ -x "$MPI_BIN" ]]; then
+      "$MPI_BIN" --version 2>&1 | sed -n '1p'
+    else
+      echo "MISSING executable: $MPI_BIN"
+    fi
+    for bin in "$NCCL_TESTS_DIR/all_reduce_perf" "$NCCL_TESTS_DIR/alltoall_perf"; do
+      if [[ -x "$bin" ]]; then
+        echo "OK executable: $bin"
+      else
+        echo "MISSING executable: $bin"
+      fi
+    done
+    for hca in $HCAS; do
+      local state="/sys/class/infiniband/$hca/ports/1/state"
+      local rate="/sys/class/infiniband/$hca/ports/1/rate"
+      if [[ -r "$state" ]]; then
+        echo "OK HCA: $hca state=$(cat "$state") rate=$(cat "$rate" 2>/dev/null || echo unknown)"
+      else
+        echo "MISSING HCA path: $hca"
+      fi
+    done
+
+    echo "===== REMOTE ====="
+    ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
+        -o BatchMode=yes -o ConnectTimeout=5 "${SSH_USER}@${PEER_HOST}" \
+        "MPI_BIN='$MPI_BIN' NCCL_TESTS_DIR='$NCCL_TESTS_DIR' HCAS='$HCAS' bash -s" <<'EOS'
+echo "hostname: $(hostname)"
+echo "mpirun: $MPI_BIN"
+if [ -x "$MPI_BIN" ]; then
+  "$MPI_BIN" --version 2>&1 | sed -n '1p'
+else
+  echo "MISSING executable: $MPI_BIN"
+fi
+for bin in "$NCCL_TESTS_DIR/all_reduce_perf" "$NCCL_TESTS_DIR/alltoall_perf"; do
+  if [ -x "$bin" ]; then
+    echo "OK executable: $bin"
+  else
+    echo "MISSING executable: $bin"
+  fi
+done
+for hca in $HCAS; do
+  state="/sys/class/infiniband/$hca/ports/1/state"
+  rate="/sys/class/infiniband/$hca/ports/1/rate"
+  if [ -r "$state" ]; then
+    echo "OK HCA: $hca state=$(cat "$state") rate=$(cat "$rate" 2>/dev/null || echo unknown)"
+  else
+    echo "MISSING HCA path: $hca"
+  fi
+done
+EOS
+  } | tee "$out"
+  echo "$out"
+}
+
+# Print the command synopsis to stdout. The heredoc delimiter is unquoted, so
+# $0 and $OUT_DIR are expanded to the script path and the output directory.
+usage() {
+  cat <<EOF
+Usage: $0 [preflight|all|allreduce-counter|alltoall-counter|graph|pxn-sweep]
+
+Outputs are written to: $OUT_DIR
+
+Common overrides:
+  HOSTS, PEER_HOST, HCAS, HCA_CSV, MPI_BIN, NCCL_TESTS_DIR,
+  NCCL_LD_LIBRARY_PATH, BEGIN_SIZE, END_SIZE, WARMUP_ITERS, ITERS
+EOF
+}
+
+# Dispatch on the requested mode. "all" runs the preflight, both counter cases,
+# both graph cases and the sweep, in that order. The alltoall cases always run
+# with NCCL_PXN_DISABLE=1. An unrecognized mode prints the usage text and exits
+# with status 2.
+case "$MODE" in
+  preflight)
+    run_preflight
+    ;;
+  all)
+    run_preflight
+    run_counter_case allreduce "$NCCL_TESTS_DIR/all_reduce_perf" ""
+    run_counter_case alltoall_pxn "$NCCL_TESTS_DIR/alltoall_perf" "NCCL_PXN_DISABLE=1"
+    run_graph_case allreduce "$NCCL_TESTS_DIR/all_reduce_perf" ""
+    run_graph_case alltoall_pxn "$NCCL_TESTS_DIR/alltoall_perf" "NCCL_PXN_DISABLE=1"
+    run_pxn_sweep
+    ;;
+  allreduce-counter)
+    run_counter_case allreduce "$NCCL_TESTS_DIR/all_reduce_perf" ""
+    ;;
+  alltoall-counter)
+    run_counter_case alltoall_pxn "$NCCL_TESTS_DIR/alltoall_perf" "NCCL_PXN_DISABLE=1"
+    ;;
+  graph)
+    run_graph_case allreduce "$NCCL_TESTS_DIR/all_reduce_perf" ""
+    run_graph_case alltoall_pxn "$NCCL_TESTS_DIR/alltoall_perf" "NCCL_PXN_DISABLE=1"
+    ;;
+  pxn-sweep)
+    run_pxn_sweep
+    ;;
+  -h|--help|help)
+    usage
+    ;;
+  *)
+    usage
+    exit 2
+    ;;
+esac
+
+# Printed for every mode that reaches this point, including the help modes.
+echo "OUT_DIR=$OUT_DIR"
--- a/scripts/nccl_environment_snapshot.sh
+++ b/scripts/nccl_environment_snapshot.sh
@ -0,0 +1,169 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Collect a lightweight NCCL/RDMA environment snapshot on one node.
+# This script does not run NCCL workloads and is safe to use before deeper tests.
+
+HOST="$(hostname 2>/dev/null || echo unknown)"
+TS="$(date +%Y%m%d_%H%M%S)"
+OUT_FILE="${1:-${OUT_FILE:-/tmp/nccl_environment_snapshot_${HOST}_${TS}.md}}"
+PDF_ALLREDUCE_BUSBW="${PDF_ALLREDUCE_BUSBW:-491.84}"
+PDF_ALLTOALL_BUSBW="${PDF_ALLTOALL_BUSBW:-76.54}"
+PLUGIN_SEARCH_ROOTS="${PLUGIN_SEARCH_ROOTS:-/usr /opt /tmp /root}"
+
+mkdir -p "$(dirname "$OUT_FILE")"
+shopt -s nullglob
+
+# Succeed when $1 resolves to something the shell can run; print nothing.
+have_cmd() {
+  local tool="$1"
+  command -v "$tool" &>/dev/null
+}
+
+# Append a "### <title>" section to $OUT_FILE holding the combined
+# stdout/stderr of a command inside a fenced text block.
+#   $1     section title
+#   $2...  command and its arguments
+# A failing command is tolerated so the snapshot keeps going.
+emit_cmd() {
+  local heading="$1"
+  shift
+  {
+    printf '\n### %s\n\n' "$heading"
+    printf '%s\n' '```text'
+    "$@" 2>&1 || :
+    printf '%s\n' '```'
+  } >>"$OUT_FILE"
+}
+
+# Filled while walking sysfs; both arrays are read again by the rail summary.
+active_400g_hcas=()
+non_400g_rows=()
+
+# Start the report (truncating any previous file) with host facts and the
+# header of the per-port HCA table.
+cat >"$OUT_FILE" <<EOF
+# NCCL/RDMA 环境快照
+
+- Host: \`$HOST\`
+- Time: \`$(date -Is 2>/dev/null || date)\`
+- Kernel: \`$(uname -r 2>/dev/null || echo unknown)\`
+
+## HCA / Port 状态
+
+| HCA | Port | State | Phys State | Rate | Link Layer | 400G IB Rail |
+|---|---:|---|---|---:|---|---|
+EOF
+
+# Print the content of one sysfs attribute file, or NA when it cannot be read.
+port_attr() {
+  cat "$1" 2>/dev/null || echo NA
+}
+
+hca_paths=(/sys/class/infiniband/mlx5_*)
+if ((${#hca_paths[@]} == 0)); then
+  printf '| N/A | N/A | `%s` | N/A | N/A | N/A | NO |\n' "/sys/class/infiniband/mlx5_* not found" >>"$OUT_FILE"
+else
+  for hca_dir in "${hca_paths[@]}"; do
+    hca="${hca_dir##*/}"
+    for port_dir in "$hca_dir"/ports/*; do
+      if [[ ! -d "$port_dir" ]]; then
+        continue
+      fi
+      port="${port_dir##*/}"
+      state="$(port_attr "$port_dir/state")"
+      phys_state="$(port_attr "$port_dir/phys_state")"
+      rate="$(port_attr "$port_dir/rate")"
+      layer="$(port_attr "$port_dir/link_layer")"
+      # A port counts as a 400G IB rail only when it is ACTIVE, reports a
+      # "400 Gb/sec" rate, and its link layer is InfiniBand.
+      if [[ "$state" != *ACTIVE* || "$rate" != "400 Gb/sec"* || "$layer" != InfiniBand ]]; then
+        is_400g="NO"
+        non_400g_rows+=("$hca port=$port state=$state rate=$rate layer=$layer")
+      else
+        is_400g="YES"
+        active_400g_hcas+=("$hca")
+      fi
+      printf '| `%s` | `%s` | `%s` | `%s` | `%s` | `%s` | `%s` |\n' \
+        "$hca" "$port" "$state" "$phys_state" "$rate" "$layer" "$is_400g" >>"$OUT_FILE"
+    done
+  done
+fi
+
+# Append the rail summary, the list of ports that did not qualify as 400G IB
+# rails, and the configured PDF busbw targets.
+rail_count=${#active_400g_hcas[@]}
+{
+  printf '\n## Rail 摘要\n\n'
+  if ((rail_count > 0)); then
+    # Join the HCA names with commas.
+    printf -v hca_csv '%s,' "${active_400g_hcas[@]}"
+    hca_csv="${hca_csv%,}"
+    printf '%s\n' \
+      "- Active 400G IB rail count: \`$rail_count\`" \
+      "- Candidate \`NCCL_IB_HCA\`: \`$hca_csv\`" \
+      "- Theoretical one-way raw bandwidth: \`$rail_count * 400Gb/s / 8 = $((rail_count * 50)) GB/s\`"
+  else
+    printf '%s\n' \
+      "- Active 400G IB rail count: \`0\`" \
+      "- Candidate \`NCCL_IB_HCA\`: \`N/A\`"
+  fi
+  printf '\nNon-400G / non-IB / down ports:\n\n'
+  if ((${#non_400g_rows[@]} > 0)); then
+    # printf reuses the format once per array element.
+    printf -- '- `%s`\n' "${non_400g_rows[@]}"
+  else
+    printf '%s\n' "- none"
+  fi
+  printf '%s\n' \
+    "" \
+    "## PDF 目标换算" \
+    "" \
+    "- PDF allreduce busbw target: \`${PDF_ALLREDUCE_BUSBW} GB/s\`" \
+    "- PDF alltoall busbw target: \`${PDF_ALLTOALL_BUSBW} GB/s\`"
+} >>"$OUT_FILE"
+
+python3 - "$PDF_ALLREDUCE_BUSBW" "${#active_400g_hcas[@]}" >>"$OUT_FILE" <<'PY' || true
+import sys
+
+busbw = float(sys.argv[1])
+rail_count = int(sys.argv[2])
+algbw = busbw / 1.875
+raw = rail_count * 50.0
+print(f"- 16-rank allreduce implied algbw: `{algbw:.2f} GB/s`")
+if rail_count:
+    pct = algbw / raw * 100
+    print(f"- Implied algbw / current raw 400G rail bandwidth: `{pct:.1f}%`")
+    if algbw > raw:
+        print("- Interpretation: PDF allreduce target is above current 400G rail one-way raw bandwidth.")
+    else:
+        print("- Interpretation: PDF allreduce target is within current 400G rail one-way raw bandwidth.")
+else:
+    print("- Interpretation: no active 400G IB rail was detected.")
+PY
+
+{
+  echo
+  echo "## NCCL Net Plugin / SHARP 文件"
+  echo
+  echo '```text'
+} >>"$OUT_FILE"
+
+read -r -a plugin_roots <<<"$PLUGIN_SEARCH_ROOTS"
+find "${plugin_roots[@]}" \( -name 'libnccl-net*.so*' -o -name 'libsharp*.so*' \) \
+  2>/dev/null | sort >>"$OUT_FILE" || true
+
+if ! grep -q 'libnccl-net\|libsharp' "$OUT_FILE"; then
+  echo "none found under $PLUGIN_SEARCH_ROOTS" >>"$OUT_FILE"
+fi
+
+echo '```' >>"$OUT_FILE"
+
+# Optional tool sections. Each goes through emit_cmd, which captures stderr as
+# well as stdout into the report.
+if have_cmd dpkg; then
+  # grep -E replaces egrep: recent GNU grep prints an "egrep is obsolescent"
+  # warning on stderr, which emit_cmd would copy into the report.
+  emit_cmd "Relevant Debian packages" bash -lc "dpkg -l | grep -Ei 'nccl|sharp|hcoll|ucx|ofed|mlnx' | sed -n '1,160p'"
+else
+  emit_cmd "Relevant packages" bash -lc "echo 'dpkg not found'"
+fi
+
+if have_cmd nvidia-smi; then
+  emit_cmd "nvidia-smi topo -m" nvidia-smi topo -m
+else
+  emit_cmd "nvidia-smi topo -m" bash -lc "echo 'nvidia-smi not found'"
+fi
+
+# ibstat is optional; no section is written when it is not installed.
+if have_cmd ibstat; then
+  emit_cmd "ibstat" ibstat
+fi
+
+{
+  echo
+  echo "## 建议判断"
+  echo
+  echo "1. 如果 Active 400G IB rail 少于 PDF 参考环境，不能直接按 PDF 阈值判断等价。"
+  echo "2. 如果没有 \`libnccl-net*.so*\` / \`libsharp*.so*\`，NCCL 可能只能走 internal IB plugin。"
+  echo "3. 若要追 PDF 2x8 目标，请先确认 rail 数量、SHARP/NCCL net plugin、跨 Leaf 交换策略是否与 PDF 环境一致。"
+  echo
+  echo "Snapshot written to: \`$OUT_FILE\`"
+} >>"$OUT_FILE"
+
+echo "$OUT_FILE"
--- a/scripts/pytorch_fp8_path_bench.py
+++ b/scripts/pytorch_fp8_path_bench.py
@ -0,0 +1,277 @@
+#!/usr/bin/env python3
+"""Compare FP8 GEMM paths used for H100/H200 acceptance debugging.
+
+Paths:
+  A. torch._scaled_mm eager, default accumulation
+  B. torch._scaled_mm eager, use_fast_accum=True
+  C. CUDA Graph replay of torch._scaled_mm(out=..., use_fast_accum=True)
+  D. Transformer Engine Linear under fp8_autocast, when installed
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import statistics
+import sys
+import time
+from typing import Any, Callable
+
+import torch
+
+
+def tflops_from_ms(matrix_size: int, iterations: int, elapsed_ms: float) -> float:
+    flops = 2.0 * matrix_size * matrix_size * matrix_size * iterations
+    return flops / (elapsed_ms / 1000.0) / 1e12
+
+
+def cuda_event_bench(
+    name: str,
+    matrix_size: int,
+    iterations: int,
+    warmup: int,
+    func: Callable[[int], Any],
+) -> dict[str, Any]:
+    for i in range(warmup):
+        func(i)
+    torch.cuda.synchronize()
+
+    start = torch.cuda.Event(enable_timing=True)
+    end = torch.cuda.Event(enable_timing=True)
+    wall_start = time.perf_counter()
+    start.record()
+    for i in range(iterations):
+        func(i)
+    end.record()
+    torch.cuda.synchronize()
+    wall_elapsed = time.perf_counter() - wall_start
+    elapsed_ms = start.elapsed_time(end)
+    return {
+        "name": name,
+        "status": "ok",
+        "matrix_size": matrix_size,
+        "iterations": iterations,
+        "warmup": warmup,
+        "event_ms_total": round(elapsed_ms, 3),
+        "event_us_per_iter": round(elapsed_ms * 1000.0 / iterations, 3),
+        "wall_ms_total": round(wall_elapsed * 1000.0, 3),
+        "tflops": round(tflops_from_ms(matrix_size, iterations, elapsed_ms), 1),
+    }
+
+
+def make_fp8_inputs(matrix_size: int, pools: int, device: str) -> tuple[list[torch.Tensor], list[torch.Tensor]]:
+    a = [
+        torch.randn(matrix_size, matrix_size, device=device, dtype=torch.float32).to(torch.float8_e4m3fn)
+        for _ in range(pools)
+    ]
+    b = [
+        torch.randn(matrix_size, matrix_size, device=device, dtype=torch.float32).to(torch.float8_e4m3fn)
+        for _ in range(pools)
+    ]
+    torch.cuda.synchronize()
+    return a, b
+
+
+def bench_scaled_mm(args: argparse.Namespace) -> list[dict[str, Any]]:
+    device = f"cuda:{args.gpu_index}"
+    torch.cuda.set_device(args.gpu_index)
+    scale_a = torch.tensor(1.0, device=device)
+    scale_b = torch.tensor(1.0, device=device)
+    pools_a, pools_b = make_fp8_inputs(args.matrix_size, args.pools, device)
+    results: list[dict[str, Any]] = []
+
+    def eager_default(i: int) -> torch.Tensor:
+        idx = i % args.pools
+        return torch._scaled_mm(
+            pools_a[idx],
+            pools_b[idx].T,
+            scale_a=scale_a,
+            scale_b=scale_b,
+            out_dtype=torch.bfloat16,
+        )
+
+    def eager_fast(i: int) -> torch.Tensor:
+        idx = i % args.pools
+        return torch._scaled_mm(
+            pools_a[idx],
+            pools_b[idx].T,
+            scale_a=scale_a,
+            scale_b=scale_b,
+            out_dtype=torch.bfloat16,
+            use_fast_accum=True,
+        )
+
+    results.append(
+        cuda_event_bench(
+            "A_eager_scaled_mm_default",
+            args.matrix_size,
+            args.iterations,
+            args.warmup,
+            eager_default,
+        )
+    )
+    results.append(
+        cuda_event_bench(
+            "B_eager_scaled_mm_fast_accum",
+            args.matrix_size,
+            args.iterations,
+            args.warmup,
+            eager_fast,
+        )
+    )
+
+    graph_out = torch.empty(
+        (args.matrix_size, args.matrix_size),
+        device=device,
+        dtype=torch.bfloat16,
+    )
+    static_a = pools_a[0]
+    static_b_t = pools_b[0].T
+
+    try:
+        side_stream = torch.cuda.Stream()
+        side_stream.wait_stream(torch.cuda.current_stream())
+        with torch.cuda.stream(side_stream):
+            for _ in range(max(3, args.warmup // 2)):
+                torch._scaled_mm(
+                    static_a,
+                    static_b_t,
+                    scale_a=scale_a,
+                    scale_b=scale_b,
+                    out_dtype=torch.bfloat16,
+                    use_fast_accum=True,
+                    out=graph_out,
+                )
+        torch.cuda.current_stream().wait_stream(side_stream)
+        torch.cuda.synchronize()
+
+        graph = torch.cuda.CUDAGraph()
+        with torch.cuda.graph(graph):
+            torch._scaled_mm(
+                static_a,
+                static_b_t,
+                scale_a=scale_a,
+                scale_b=scale_b,
+                out_dtype=torch.bfloat16,
+                use_fast_accum=True,
+                out=graph_out,
+            )
+
+        def graph_replay(_: int) -> None:
+            graph.replay()
+
+        results.append(
+            cuda_event_bench(
+                "C_cuda_graph_scaled_mm_fast_accum",
+                args.matrix_size,
+                args.iterations,
+                3,
+                graph_replay,
+            )
+        )
+    except Exception as exc:  # noqa: BLE001
+        results.append(
+            {
+                "name": "C_cuda_graph_scaled_mm_fast_accum",
+                "status": "unavailable",
+                "reason": f"{type(exc).__name__}: {exc}",
+            }
+        )
+
+    return results
+
+
+def bench_transformer_engine(args: argparse.Namespace) -> dict[str, Any]:
+    try:
+        import transformer_engine.pytorch as te  # type: ignore[import-not-found]
+        from transformer_engine.common.recipe import DelayedScaling, Format  # type: ignore[import-not-found]
+    except Exception as exc:  # noqa: BLE001
+        return {
+            "name": "D_transformer_engine_fp8_linear",
+            "status": "unavailable",
+            "reason": f"{type(exc).__name__}: {exc}",
+        }
+
+    device = f"cuda:{args.gpu_index}"
+    x = torch.randn(args.matrix_size, args.matrix_size, device=device, dtype=torch.bfloat16)
+    layer = te.Linear(
+        args.matrix_size,
+        args.matrix_size,
+        bias=False,
+        params_dtype=torch.bfloat16,
+        device=device,
+    )
+    recipe = DelayedScaling(fp8_format=Format.HYBRID)
+
+    def run(_: int) -> torch.Tensor:
+        with te.fp8_autocast(enabled=True, fp8_recipe=recipe):
+            return layer(x)
+
+    try:
+        result = cuda_event_bench(
+            "D_transformer_engine_fp8_linear",
+            args.matrix_size,
+            args.iterations,
+            args.warmup,
+            run,
+        )
+    except Exception as exc:  # noqa: BLE001
+        return {
+            "name": "D_transformer_engine_fp8_linear",
+            "status": "error",
+            "reason": f"{type(exc).__name__}: {exc}",
+        }
+    result["note"] = "Transformer Engine Linear forward under fp8_autocast; includes TE module/cast overhead."
+    return result
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--matrix-size", type=int, default=8192)
+    parser.add_argument("--warmup", type=int, default=20)
+    parser.add_argument("--iterations", type=int, default=100)
+    parser.add_argument("--gpu-index", type=int, default=0)
+    parser.add_argument("--pools", type=int, default=4)
+    args = parser.parse_args()
+
+    if not torch.cuda.is_available():
+        print(json.dumps({"error": "cuda unavailable"}, indent=2))
+        return 1
+    if not hasattr(torch, "_scaled_mm") or not hasattr(torch, "float8_e4m3fn"):
+        print(json.dumps({"error": "torch FP8 _scaled_mm unavailable"}, indent=2))
+        return 1
+
+    torch.cuda.set_device(args.gpu_index)
+    props = torch.cuda.get_device_properties(args.gpu_index)
+    payload = {
+        "source": "pytorch_fp8_path_bench",
+        "torch": torch.__version__,
+        "cuda": torch.version.cuda,
+        "gpu_index": args.gpu_index,
+        "gpu_name": props.name,
+        "matrix_size": args.matrix_size,
+        "warmup": args.warmup,
+        "iterations": args.iterations,
+        "results": [],
+    }
+    try:
+        payload["results"].extend(bench_scaled_mm(args))
+        payload["results"].append(bench_transformer_engine(args))
+    except torch.cuda.OutOfMemoryError as exc:
+        payload["error"] = f"CUDA OOM: {exc}"
+        print(json.dumps(payload, indent=2))
+        return 1
+
+    ok_values = [r["tflops"] for r in payload["results"] if r.get("status") == "ok"]
+    if ok_values:
+        payload["summary"] = {
+            "max_tflops": round(max(ok_values), 1),
+            "min_tflops": round(min(ok_values), 1),
+            "mean_tflops": round(statistics.mean(ok_values), 1),
+        }
+    print(json.dumps(payload, indent=2))
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
--- a/scripts/run_cublaslt_fp8_gemm.sh
+++ b/scripts/run_cublaslt_fp8_gemm.sh
@ -0,0 +1,45 @@
+#!/usr/bin/env bash
+set -uo pipefail
+
+SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
+PROJECT_DIR="$(cd -- "$SCRIPT_DIR/.." >/dev/null 2>&1 && pwd)"
+
+CUDA_HOME="${CUDA_HOME:-/usr/local/cuda}"
+NVCC="${NVCC:-$CUDA_HOME/bin/nvcc}"
+OUT_DIR="${OUT_DIR:-$PROJECT_DIR/reports}"
+MATRIX_SIZE="${MATRIX_SIZE:-8192}"
+WARMUP="${WARMUP:-20}"
+ITERATIONS="${ITERATIONS:-200}"
+GPU_COUNT="${GPU_COUNT:-8}"
+FIRST_GPU="${FIRST_GPU:-0}"
+WORKSPACE_MB="${WORKSPACE_MB:-256}"
+
+if [[ ! -x "$NVCC" ]]; then
+  echo "nvcc not found: $NVCC" >&2
+  exit 1
+fi
+
+mkdir -p "$OUT_DIR" "$PROJECT_DIR/build"
+HOST="$(hostname 2>/dev/null || echo unknown)"
+TS="$(date +%Y%m%d_%H%M%S)"
+BIN="$PROJECT_DIR/build/cublaslt_fp8_gemm_bench"
+REPORT="$OUT_DIR/cublaslt_fp8_gemm_${HOST}_${TS}.json"
+
+"$NVCC" -O3 -std=c++17 -arch=sm_90 \
+  "$PROJECT_DIR/scripts/cublaslt_fp8_gemm_bench.cu" \
+  -lcublasLt -lcublas -o "$BIN"
+
+set +e
+"$BIN" \
+  --matrix-size "$MATRIX_SIZE" \
+  --warmup "$WARMUP" \
+  --iterations "$ITERATIONS" \
+  --first-gpu "$FIRST_GPU" \
+  --gpu-count "$GPU_COUNT" \
+  --workspace-mb "$WORKSPACE_MB" \
+  | tee "$REPORT"
+status=${PIPESTATUS[0]}
+set -e
+
+echo "Report written to: $REPORT"
+exit "$status"
--- a/scripts/run_fp8_path_comparison.sh
+++ b/scripts/run_fp8_path_comparison.sh
@ -0,0 +1,93 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Compare FP8 GEMM paths on one GPU: run the PyTorch path benchmark and the
+# direct cuBLASLt benchmark, then merge both JSON reports into a combined one.
+#
+# Environment overrides: PYTHON, CUDA_HOME, NVCC, OUT_DIR, MATRIX_SIZE, WARMUP,
+# ITERATIONS, GPU_INDEX, WORKSPACE_MB.
+
+SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
+PROJECT_DIR="$(cd -- "$SCRIPT_DIR/.." >/dev/null 2>&1 && pwd)"
+
+PYTHON="${PYTHON:-/root/gpu-test-venv/bin/python}"
+CUDA_HOME="${CUDA_HOME:-/usr/local/cuda-12.4}"
+NVCC="${NVCC:-$CUDA_HOME/bin/nvcc}"
+OUT_DIR="${OUT_DIR:-$PROJECT_DIR/reports}"
+MATRIX_SIZE="${MATRIX_SIZE:-8192}"
+WARMUP="${WARMUP:-20}"
+ITERATIONS="${ITERATIONS:-100}"
+GPU_INDEX="${GPU_INDEX:-0}"
+WORKSPACE_MB="${WORKSPACE_MB:-256}"
+# First site-packages directory of the selected interpreter; the cuDNN and
+# NCCL library directories below are resolved relative to it.
+VENV_SITE_PACKAGES="$("$PYTHON" - <<'PY'
+import site
+print(site.getsitepackages()[0])
+PY
+)"
+# Append the inherited LD_LIBRARY_PATH only when it is non-empty. A trailing
+# ":" leaves an empty entry, which the dynamic loader treats as the current
+# working directory.
+export LD_LIBRARY_PATH="$VENV_SITE_PACKAGES/nvidia/cudnn/lib:$VENV_SITE_PACKAGES/nvidia/nccl/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
+
+mkdir -p "$PROJECT_DIR/build" "$OUT_DIR"
+
+HOST="$(hostname 2>/dev/null || echo unknown)"
+TS="$(date +%Y%m%d_%H%M%S)"
+PY_REPORT="$OUT_DIR/fp8_paths_pytorch_${HOST}_${TS}.json"
+CUBLAS_REPORT="$OUT_DIR/fp8_paths_cublaslt_${HOST}_${TS}.json"
+COMBINED_REPORT="$OUT_DIR/fp8_paths_combined_${HOST}_${TS}.json"
+
+"$PYTHON" "$PROJECT_DIR/scripts/pytorch_fp8_path_bench.py" \
+  --matrix-size "$MATRIX_SIZE" \
+  --warmup "$WARMUP" \
+  --iterations "$ITERATIONS" \
+  --gpu-index "$GPU_INDEX" | tee "$PY_REPORT"
+
+"$NVCC" -O3 -std=c++17 -arch=sm_90 \
+  "$PROJECT_DIR/scripts/cublaslt_fp8_gemm_bench.cu" \
+  -lcublasLt -lcublas -o "$PROJECT_DIR/build/cublaslt_fp8_gemm_bench"
+
+"$PROJECT_DIR/build/cublaslt_fp8_gemm_bench" \
+  --matrix-size "$MATRIX_SIZE" \
+  --warmup "$WARMUP" \
+  --iterations "$ITERATIONS" \
+  --first-gpu "$GPU_INDEX" \
+  --gpu-count 1 \
+  --workspace-mb "$WORKSPACE_MB" \
+  --fast-accum 1 | tee "$CUBLAS_REPORT"
+
+"$PYTHON" - "$PY_REPORT" "$CUBLAS_REPORT" "$COMBINED_REPORT" <<'PY'
+import json
+import pathlib
+import sys
+
+py_report = pathlib.Path(sys.argv[1])
+cublas_report = pathlib.Path(sys.argv[2])
+combined_report = pathlib.Path(sys.argv[3])
+
+with py_report.open() as f:
+    py_payload = json.load(f)
+with cublas_report.open() as f:
+    cublas_payload = json.load(f)
+
+combined = {
+    "source": "fp8_path_comparison",
+    "host": cublas_payload.get("host"),
+    "matrix_size": py_payload.get("matrix_size"),
+    "gpu_index": py_payload.get("gpu_index"),
+    "pytorch": py_payload,
+    "cublaslt": cublas_payload,
+    "results": [],
+}
+combined["results"].extend(py_payload.get("results", []))
+per_gpu = cublas_payload.get("per_gpu", [])
+if per_gpu:
+    row = dict(per_gpu[0])
+    row.update({
+        "name": "E_direct_cublaslt_fast_accum",
+        "status": "ok",
+        "tflops": row.pop("fp8_tflops"),
+        "matrix_size": cublas_payload.get("matrix_size"),
+        "iterations": cublas_payload.get("iterations"),
+        "warmup": cublas_payload.get("warmup"),
+        "fast_accum": cublas_payload.get("fast_accum"),
+        "note": "Direct cuBLASLt FP8 GEMM, bypasses PyTorch eager.",
+    })
+    combined["results"].append(row)
+
+combined_report.write_text(json.dumps(combined, indent=2), encoding="utf-8")
+print(f"Combined report written to: {combined_report}")
+PY
+
+echo "$COMBINED_REPORT"
--- a/scripts/run_h100_single_node_all.sh
+++ b/scripts/run_h100_single_node_all.sh
@ -0,0 +1,134 @@
+#!/usr/bin/env bash
+set -uo pipefail
+
+# Wrapper for the single-node H100 acceptance suite. The tests themselves are
+# implemented by gpu_tester.py; this script only fixes how the snapshot and
+# report files are named so repeated runs on a machine stay comparable.
+
+SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
+PROJECT_DIR="$(cd -- "$SCRIPT_DIR/.." &>/dev/null && pwd)"
+
+# Defaults; each may be preset in the environment or replaced by an option.
+: "${PYTHON_BIN:=/root/gpu-test-venv/bin/python}"
+: "${CONFIG_FILE:=$PROJECT_DIR/configs/default.yaml}"
+: "${OUT_DIR:=$PROJECT_DIR/reports}"
+: "${FORMAT:=md}"
+SNAPSHOT=1
+DRY_RUN=0
+
+# Print the option summary for this wrapper.
+usage() {
+  printf '%s\n' \
+    'Usage: run_h100_single_node_all.sh [options]' \
+    '' \
+    'Options:' \
+    '  --python PATH       Python executable (default: /root/gpu-test-venv/bin/python)' \
+    '  --config PATH       gpu_tester config file (default: configs/default.yaml)' \
+    '  --out-dir PATH      Report output directory (default: reports)' \
+    '  --format FORMAT     Report format: md, json, or html (default: md)' \
+    '  --no-snapshot       Do not run nccl_environment_snapshot.sh first' \
+    '  --dry-run           Print commands without running them' \
+    '  -h, --help          Show this help'
+}
+
+# Exit with a usage error when a value-taking option is the last word on the
+# command line. Without this check, reading "$2" under `set -u` aborts the
+# script with an "unbound variable" message.
+#   $@  the remaining command-line words, option first
+require_value() {
+  if (($# < 2)); then
+    echo "Missing value for $1" >&2
+    usage >&2
+    exit 2
+  fi
+}
+
+# Consume the command line; options override the environment defaults.
+while (($#)); do
+  case "$1" in
+    --python)
+      require_value "$@"
+      PYTHON_BIN="$2"
+      shift 2
+      ;;
+    --config)
+      require_value "$@"
+      CONFIG_FILE="$2"
+      shift 2
+      ;;
+    --out-dir)
+      require_value "$@"
+      OUT_DIR="$2"
+      shift 2
+      ;;
+    --format)
+      require_value "$@"
+      FORMAT="$2"
+      shift 2
+      ;;
+    --no-snapshot)
+      SNAPSHOT=0
+      shift
+      ;;
+    --dry-run)
+      DRY_RUN=1
+      shift
+      ;;
+    -h|--help)
+      usage
+      exit 0
+      ;;
+    *)
+      echo "Unknown argument: $1" >&2
+      usage >&2
+      exit 2
+      ;;
+  esac
+done
+
+if [[ "$FORMAT" != "md" && "$FORMAT" != "json" && "$FORMAT" != "html" ]]; then
+  echo "Unsupported format: $FORMAT" >&2
+  exit 2
+fi
+
+if [[ ! -x "$PYTHON_BIN" ]]; then
+  PYTHON_BIN="$(command -v python3 || true)"
+fi
+
+if [[ -z "$PYTHON_BIN" || ! -x "$PYTHON_BIN" ]]; then
+  echo "Python executable not found. Set --python or PYTHON_BIN." >&2
+  exit 1
+fi
+
+HOST="$(hostname 2>/dev/null || echo unknown)"
+TS="$(date +%Y%m%d_%H%M%S)"
+mkdir -p "$OUT_DIR"
+
+SNAPSHOT_FILE="$OUT_DIR/nccl_environment_snapshot_${HOST}_${TS}.md"
+REPORT_FILE="$OUT_DIR/h100_single_node_all_${HOST}_${TS}.${FORMAT}"
+
+snapshot_cmd=(bash "$PROJECT_DIR/scripts/nccl_environment_snapshot.sh" "$SNAPSHOT_FILE")
+test_cmd=(
+  "$PYTHON_BIN" "$PROJECT_DIR/gpu_tester.py"
+  --config "$CONFIG_FILE"
+  --test all
+  --report
+  --format "$FORMAT"
+  --output "$REPORT_FILE"
+)
+
+echo "Project: $PROJECT_DIR"
+echo "Host: $HOST"
+echo "Config: $CONFIG_FILE"
+echo "Report: $REPORT_FILE"
+if ((SNAPSHOT)); then
+  echo "Snapshot: $SNAPSHOT_FILE"
+fi
+
+if ((DRY_RUN)); then
+  if ((SNAPSHOT)); then
+    printf 'DRY RUN snapshot:'
+    printf ' %q' "${snapshot_cmd[@]}"
+    printf '\n'
+  fi
+  printf 'DRY RUN test:'
+  printf ' %q' "${test_cmd[@]}"
+  printf '\n'
+  exit 0
+fi
+
+if ((SNAPSHOT)); then
+  "${snapshot_cmd[@]}"
+fi
+
+"${test_cmd[@]}"
+status=$?
+
+echo "Report written to: $REPORT_FILE"
+if ((SNAPSHOT)); then
+  echo "Snapshot written to: $SNAPSHOT_FILE"
+fi
+
+exit "$status"
--- a/scripts/run_multinode_nccl_all_collectives.sh
+++ b/scripts/run_multinode_nccl_all_collectives.sh
@ -0,0 +1,147 @@
+#!/usr/bin/env bash
+set -uo pipefail
+
+# Two-node, eight-GPU-per-node NCCL evidence pass covering the six collectives
+# that the single-node H100 acceptance flow uses.
+
+SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
+PROJECT_DIR="$(cd -- "$SCRIPT_DIR/.." &>/dev/null && pwd)"
+
+# Defaults; each may be preset in the environment or replaced by an option.
+: "${PYTHON_BIN:=/root/gpu-test-venv/bin/python}"
+: "${CONFIG_FILE:=$PROJECT_DIR/configs/multinode_nccl_nccl227_all_collectives_2x8.yaml}"
+: "${OUT_DIR:=$PROJECT_DIR/reports}"
+: "${FORMAT:=md}"
+RUN_PREFLIGHT=1
+PREFLIGHT_ONLY=0
+DRY_RUN=0
+
+# Print the option summary for this wrapper.
+usage() {
+  printf '%s\n' \
+    'Usage: run_multinode_nccl_all_collectives.sh [options]' \
+    '' \
+    'Options:' \
+    '  --python PATH       Python executable (default: /root/gpu-test-venv/bin/python)' \
+    '  --config PATH       Config file (default: configs/multinode_nccl_nccl227_all_collectives_2x8.yaml)' \
+    '  --out-dir PATH      Report output directory (default: reports)' \
+    '  --format FORMAT     Report format: md, json, or html (default: md)' \
+    '  --no-preflight      Skip scripts/multinode_nccl_deep_diagnose.sh preflight' \
+    '  --preflight-only    Run only the preflight check, not the workload' \
+    '  --dry-run           Print commands without running them' \
+    '  -h, --help          Show this help'
+}
+
+# Exit with a usage error when a value-taking option is the last word on the
+# command line. Without this check, reading "$2" under `set -u` aborts the
+# script with an "unbound variable" message.
+#   $@  the remaining command-line words, option first
+require_value() {
+  if (($# < 2)); then
+    echo "Missing value for $1" >&2
+    usage >&2
+    exit 2
+  fi
+}
+
+# Consume the command line; options override the environment defaults.
+while (($#)); do
+  case "$1" in
+    --python)
+      require_value "$@"
+      PYTHON_BIN="$2"
+      shift 2
+      ;;
+    --config)
+      require_value "$@"
+      CONFIG_FILE="$2"
+      shift 2
+      ;;
+    --out-dir)
+      require_value "$@"
+      OUT_DIR="$2"
+      shift 2
+      ;;
+    --format)
+      require_value "$@"
+      FORMAT="$2"
+      shift 2
+      ;;
+    --no-preflight)
+      RUN_PREFLIGHT=0
+      shift
+      ;;
+    --preflight-only)
+      PREFLIGHT_ONLY=1
+      shift
+      ;;
+    --dry-run)
+      DRY_RUN=1
+      shift
+      ;;
+    -h|--help)
+      usage
+      exit 0
+      ;;
+    *)
+      echo "Unknown argument: $1" >&2
+      usage >&2
+      exit 2
+      ;;
+  esac
+done
+
+if [[ "$FORMAT" != "md" && "$FORMAT" != "json" && "$FORMAT" != "html" ]]; then
+  echo "Unsupported format: $FORMAT" >&2
+  exit 2
+fi
+
+if [[ ! -x "$PYTHON_BIN" ]]; then
+  PYTHON_BIN="$(command -v python3 || true)"
+fi
+
+if [[ -z "$PYTHON_BIN" || ! -x "$PYTHON_BIN" ]]; then
+  echo "Python executable not found. Set --python or PYTHON_BIN." >&2
+  exit 1
+fi
+
+TS="$(date +%Y%m%d_%H%M%S)"
+mkdir -p "$OUT_DIR"
+
+REPORT_FILE="$OUT_DIR/multinode_nccl_all_collectives_${TS}.${FORMAT}"
+ARTIFACT_DIR="$OUT_DIR/multinode_nccl_all_collectives_${TS}_artifacts"
+PREFLIGHT_CMD=(bash "$PROJECT_DIR/scripts/multinode_nccl_deep_diagnose.sh" preflight)
+RUN_CMD=(
+  "$PYTHON_BIN" "$PROJECT_DIR/gpu_tester.py"
+  --config "$CONFIG_FILE"
+  --test multinode-nccl
+  --report
+  --format "$FORMAT"
+  --output "$REPORT_FILE"
+)
+
+echo "Project: $PROJECT_DIR"
+echo "Config: $CONFIG_FILE"
+echo "Report: $REPORT_FILE"
+echo "Artifacts: $ARTIFACT_DIR"
+echo "Collectives: allreduce, alltoall, broadcast, reducescatter, allgather, sendrecv"
+echo "Topology: 2 nodes x 8 GPUs per node; 16G"
+
+if ((DRY_RUN)); then
+  if ((RUN_PREFLIGHT)); then
+    printf 'DRY RUN preflight:'
+    printf ' %q' "${PREFLIGHT_CMD[@]}"
+    printf '\n'
+  fi
+  if ((PREFLIGHT_ONLY)); then
+    exit 0
+  fi
+  printf 'DRY RUN workload:'
+  printf ' MULTINODE_NCCL_ARTIFACT_DIR=%q' "$ARTIFACT_DIR"
+  printf ' %q' "${RUN_CMD[@]}"
+  printf '\n'
+  exit 0
+fi
+
+if ((RUN_PREFLIGHT)); then
+  "${PREFLIGHT_CMD[@]}"
+  preflight_status=$?
+  if ((preflight_status != 0)); then
+    echo "Preflight failed with exit code $preflight_status" >&2
+    exit "$preflight_status"
+  fi
+fi
+
+if ((PREFLIGHT_ONLY)); then
+  exit 0
+fi
+
+mkdir -p "$ARTIFACT_DIR"
+MULTINODE_NCCL_ARTIFACT_DIR="$ARTIFACT_DIR" "${RUN_CMD[@]}"
+status=$?
+
+echo "Report written to: $REPORT_FILE"
+echo "Artifacts written to: $ARTIFACT_DIR"
+exit "$status"
--- a/scripts/run_multinode_nccl_pdf_matrix.sh
+++ b/scripts/run_multinode_nccl_pdf_matrix.sh
@ -0,0 +1,147 @@
+#!/usr/bin/env bash
+set -uo pipefail
+
+# Formal cross-node NCCL PDF matrix for the current two-node H100 pair. The
+# benchmark itself is implemented in gpu_tester.py / MultiNodeNCCLTest; this
+# wrapper only standardizes the command line, the output naming, and the
+# preflight hook.
+
+SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
+PROJECT_DIR="$(cd -- "$SCRIPT_DIR/.." &>/dev/null && pwd)"
+
+# Defaults; each may be preset in the environment or replaced by an option.
+: "${PYTHON_BIN:=/root/gpu-test-venv/bin/python}"
+: "${CONFIG_FILE:=$PROJECT_DIR/configs/multinode_nccl_nccl227_pdf_matrix.yaml}"
+: "${OUT_DIR:=$PROJECT_DIR/reports}"
+: "${FORMAT:=md}"
+RUN_PREFLIGHT=1
+PREFLIGHT_ONLY=0
+DRY_RUN=0
+
+# Print the option summary for this wrapper.
+usage() {
+  printf '%s\n' \
+    'Usage: run_multinode_nccl_pdf_matrix.sh [options]' \
+    '' \
+    'Options:' \
+    '  --python PATH       Python executable (default: /root/gpu-test-venv/bin/python)' \
+    '  --config PATH       Matrix config file (default: configs/multinode_nccl_nccl227_pdf_matrix.yaml)' \
+    '  --out-dir PATH      Report output directory (default: reports)' \
+    '  --format FORMAT     Report format: md, json, or html (default: md)' \
+    '  --no-preflight      Skip scripts/multinode_nccl_deep_diagnose.sh preflight' \
+    '  --preflight-only    Run only the preflight check, not the matrix workload' \
+    '  --dry-run           Print commands without running them' \
+    '  -h, --help          Show this help'
+}
+
+# Exit with a usage error when a value-taking option is the last word on the
+# command line. Without this check, reading "$2" under `set -u` aborts the
+# script with an "unbound variable" message.
+#   $@  the remaining command-line words, option first
+require_value() {
+  if (($# < 2)); then
+    echo "Missing value for $1" >&2
+    usage >&2
+    exit 2
+  fi
+}
+
+# Consume the command line; options override the environment defaults.
+while (($#)); do
+  case "$1" in
+    --python)
+      require_value "$@"
+      PYTHON_BIN="$2"
+      shift 2
+      ;;
+    --config)
+      require_value "$@"
+      CONFIG_FILE="$2"
+      shift 2
+      ;;
+    --out-dir)
+      require_value "$@"
+      OUT_DIR="$2"
+      shift 2
+      ;;
+    --format)
+      require_value "$@"
+      FORMAT="$2"
+      shift 2
+      ;;
+    --no-preflight)
+      RUN_PREFLIGHT=0
+      shift
+      ;;
+    --preflight-only)
+      PREFLIGHT_ONLY=1
+      shift
+      ;;
+    --dry-run)
+      DRY_RUN=1
+      shift
+      ;;
+    -h|--help)
+      usage
+      exit 0
+      ;;
+    *)
+      echo "Unknown argument: $1" >&2
+      usage >&2
+      exit 2
+      ;;
+  esac
+done
+
+if [[ "$FORMAT" != "md" && "$FORMAT" != "json" && "$FORMAT" != "html" ]]; then
+  echo "Unsupported format: $FORMAT" >&2
+  exit 2
+fi
+
+if [[ ! -x "$PYTHON_BIN" ]]; then
+  PYTHON_BIN="$(command -v python3 || true)"
+fi
+
+if [[ -z "$PYTHON_BIN" || ! -x "$PYTHON_BIN" ]]; then
+  echo "Python executable not found. Set --python or PYTHON_BIN." >&2
+  exit 1
+fi
+
+TS="$(date +%Y%m%d_%H%M%S)"
+mkdir -p "$OUT_DIR"
+
+REPORT_FILE="$OUT_DIR/multinode_nccl_pdf_matrix_${TS}.${FORMAT}"
+ARTIFACT_DIR="$OUT_DIR/multinode_nccl_pdf_matrix_${TS}_artifacts"
+PREFLIGHT_CMD=(bash "$PROJECT_DIR/scripts/multinode_nccl_deep_diagnose.sh" preflight)
+MATRIX_CMD=(
+  "$PYTHON_BIN" "$PROJECT_DIR/gpu_tester.py"
+  --config "$CONFIG_FILE"
+  --test multinode-nccl
+  --report
+  --format "$FORMAT"
+  --output "$REPORT_FILE"
+)
+
+echo "Project: $PROJECT_DIR"
+echo "Config: $CONFIG_FILE"
+echo "Report: $REPORT_FILE"
+echo "Artifacts: $ARTIFACT_DIR"
+echo "Matrix: 2 nodes x {1,2,4,8} GPUs per node; all_reduce_perf + alltoall_perf; 16G"
+
+if ((DRY_RUN)); then
+  if ((RUN_PREFLIGHT)); then
+    printf 'DRY RUN preflight:'
+    printf ' %q' "${PREFLIGHT_CMD[@]}"
+    printf '\n'
+  fi
+  if ((PREFLIGHT_ONLY)); then
+    exit 0
+  fi
+  printf 'DRY RUN matrix:'
+  printf ' MULTINODE_NCCL_ARTIFACT_DIR=%q' "$ARTIFACT_DIR"
+  printf ' %q' "${MATRIX_CMD[@]}"
+  printf '\n'
+  exit 0
+fi
+
+if ((RUN_PREFLIGHT)); then
+  "${PREFLIGHT_CMD[@]}"
+  preflight_status=$?
+  if ((preflight_status != 0)); then
+    echo "Preflight failed with exit code $preflight_status" >&2
+    exit "$preflight_status"
+  fi
+fi
+
+if ((PREFLIGHT_ONLY)); then
+  exit 0
+fi
+
+mkdir -p "$ARTIFACT_DIR"
+MULTINODE_NCCL_ARTIFACT_DIR="$ARTIFACT_DIR" "${MATRIX_CMD[@]}"
+status=$?
+
+echo "Report written to: $REPORT_FILE"
+echo "Artifacts written to: $ARTIFACT_DIR"
+exit "$status"