From 86f15544d7092d57b069a013b5ed9a2475272595 Mon Sep 17 00:00:00 2001
From: cs <shi.chen@robotics.cc>
Date: Sat, 23 May 2026 10:41:09 +0800
Subject: [PATCH 01/41] Add H100 acceptance test coverage and reports

---
 .gitignore                                    |   1 +
 H100_test_all_vs_PDF_覆盖对比.md              |  85 ++
 H100验收_vs_test_all_差距分析.md              | 100 ++
 README.md                                     |  98 +-
 docs/h100_test_all_metrics_guide_cn.md        | 255 +++++
 docs/multinode_nccl_concepts.md               | 362 +++++++
 gpu_tester.py                                 | 169 +++-
 modules/dcgm_test.py                          | 231 +++++
 modules/health_check.py                       |  42 +
 modules/nccl_test.py                          | 171 ++--
 modules/nvlink_test.py                        | 188 ++++
 modules/report.py                             | 357 ++++++-
 modules/stress_test.py                        | 294 +++++-
 modules/training_sim.py                       | 288 +++++-
 reports_all_aikubeworker0016.json             | 921 ++++++++++++++++++
 reports_all_aikubeworker0016.md               | 157 +++
 ...cgm_r3_aikubeworker0012_20260522_200338.md |  65 ++
 ...cgm_r3_aikubeworker0016_20260522_200538.md |  65 ++
 reports_nvbandwidth_aikubeworker0012.json     |  70 ++
 reports_nvbandwidth_aikubeworker0012.md       |  38 +
 reports_nvbandwidth_aikubeworker0016.json     |  70 ++
 reports_nvbandwidth_aikubeworker0016.md       |  38 +
 reports_rdma_aikubeworker0012.json            | 157 +++
 reports_rdma_aikubeworker0016.json            | 157 +++
 ...ounter_aikubeworker0012_20260522_194808.md |  62 ++
 ...ounter_aikubeworker0016_20260522_194828.md |  62 ++
 reports_rdma_cross_node_mlx5_0_20260523.md    |  50 +
 reports_rdma_single_node_summary.md           |  73 ++
 reports_single_gpu_aikubeworker0012.json      | 292 ++++++
 reports_single_gpu_aikubeworker0012.md        |  54 +
 reports_single_gpu_aikubeworker0016.json      | 292 ++++++
 reports_single_gpu_aikubeworker0016.md        |  54 +
 ...stress_smoke_reasons_aikubeworker0012.json | 165 ++++
 ...s_stress_smoke_reasons_aikubeworker0012.md |  29 +
 ...stress_smoke_reasons_aikubeworker0016.json | 165 ++++
 ...s_stress_smoke_reasons_aikubeworker0016.md |  29 +
 ...latest_aikubeworker0012_20260522_203246.md | 322 ++++++
 ...latest_aikubeworker0016_20260522_203447.md | 322 ++++++
 ...rts_test_all_latest_summary_cn_20260523.md | 101 ++
 ...ll_pdf_aikubeworker0012_20260522_182656.md | 259 +++++
 ...ll_pdf_aikubeworker0016_20260522_182856.md | 259 +++++
 ...warmup_aikubeworker0012_20260522_194528.md |  43 +
 ...warmup_aikubeworker0016_20260522_194609.md |  43 +
 ...all_aikubeworker0016_中文结果与验收差距.md |  73 ++
 44 files changed, 6938 insertions(+), 190 deletions(-)
 create mode 100644 H100_test_all_vs_PDF_覆盖对比.md
 create mode 100644 H100验收_vs_test_all_差距分析.md
 create mode 100644 docs/h100_test_all_metrics_guide_cn.md
 create mode 100644 docs/multinode_nccl_concepts.md
 create mode 100644 modules/dcgm_test.py
 create mode 100644 modules/nvlink_test.py
 create mode 100644 reports_all_aikubeworker0016.json
 create mode 100644 reports_all_aikubeworker0016.md
 create mode 100644 reports_dcgm_r3_aikubeworker0012_20260522_200338.md
 create mode 100644 reports_dcgm_r3_aikubeworker0016_20260522_200538.md
 create mode 100644 reports_nvbandwidth_aikubeworker0012.json
 create mode 100644 reports_nvbandwidth_aikubeworker0012.md
 create mode 100644 reports_nvbandwidth_aikubeworker0016.json
 create mode 100644 reports_nvbandwidth_aikubeworker0016.md
 create mode 100644 reports_rdma_aikubeworker0012.json
 create mode 100644 reports_rdma_aikubeworker0016.json
 create mode 100644 reports_rdma_counter_aikubeworker0012_20260522_194808.md
 create mode 100644 reports_rdma_counter_aikubeworker0016_20260522_194828.md
 create mode 100644 reports_rdma_cross_node_mlx5_0_20260523.md
 create mode 100644 reports_rdma_single_node_summary.md
 create mode 100644 reports_single_gpu_aikubeworker0012.json
 create mode 100644 reports_single_gpu_aikubeworker0012.md
 create mode 100644 reports_single_gpu_aikubeworker0016.json
 create mode 100644 reports_single_gpu_aikubeworker0016.md
 create mode 100644 reports_stress_smoke_reasons_aikubeworker0012.json
 create mode 100644 reports_stress_smoke_reasons_aikubeworker0012.md
 create mode 100644 reports_stress_smoke_reasons_aikubeworker0016.json
 create mode 100644 reports_stress_smoke_reasons_aikubeworker0016.md
 create mode 100644 reports_test_all_latest_aikubeworker0012_20260522_203246.md
 create mode 100644 reports_test_all_latest_aikubeworker0016_20260522_203447.md
 create mode 100644 reports_test_all_latest_summary_cn_20260523.md
 create mode 100644 reports_test_all_pdf_aikubeworker0012_20260522_182656.md
 create mode 100644 reports_test_all_pdf_aikubeworker0016_20260522_182856.md
 create mode 100644 reports_training_warmup_aikubeworker0012_20260522_194528.md
 create mode 100644 reports_training_warmup_aikubeworker0016_20260522_194609.md
 create mode 100644 test_all_aikubeworker0016_中文结果与验收差距.md

diff --git a/.gitignore b/.gitignore
index 934bb96..99f18a6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,3 +15,4 @@ reports/
 venv/
 .qoder/*
 .claude/settings.local.json
+.omx/
diff --git a/H100_test_all_vs_PDF_覆盖对比.md b/H100_test_all_vs_PDF_覆盖对比.md
new file mode 100644
index 0000000..f6d112a
--- /dev/null
+++ b/H100_test_all_vs_PDF_覆盖对比.md
@@ -0,0 +1,85 @@
+# H100 PDF 验收项 vs 当前 `test all` 覆盖对比
+
+对比对象：
+
+- PDF：`/Users/d-robotics/Downloads/H100_production_acceptance.pdf`
+- 当前脚本：`python gpu_tester.py --config configs/default.yaml --test all --report --format md`
+- 范围：单节点 8 卡 H100。跨节点 NCCL/RDMA 暂不纳入本轮。
+
+## 结论
+
+当前 `test all` 已经从“功能巡检”扩成了“接近生产验收”的单节点套件：GPU 健康、NVLink/NVSwitch、HBM/PCIe/NVLink 带宽、计算、NCCL、压力、RDMA 本机端口、DCGM、训练模拟都会进入同一个 all。
+
+最新 stress smoke 已确认 PyTorch BF16 GEMM 压力能把两台机器压到 PDF 要求的功耗区间：
+
+- `aikubeworker0012`：45 秒 smoke，稳态平均功耗约 `697-698W/卡`，TFLOPS jitter `4.07%`，XID `0`，但温差 `12C`、`clocks_throttle_reasons.active=0x4`，按 PDF 严格 FAIL。
+- `aikubeworker0016`：45 秒 smoke，稳态平均功耗约 `697-699W/卡`，TFLOPS jitter `3.77%`，XID `0`，但温差 `8C`、`clocks_throttle_reasons.active=0x4`，按 PDF 严格 FAIL。
+
+也就是说，当前卡点已经不是“脚本压不满 H100”，而是机器在满功耗压力下没有满足 PDF 的 `温差 <=5C` 和 `Throttle Reasons 全程 0x0` 两个严格门槛。
+
+但如果严格按 PDF 做最终验收，现在还差这些：
+
+1. 24 小时类指标未覆盖：PDF 要求 SBE 24h 增长率、长稳态观察；当前 `all` 是单次快照 + 30 分钟压力，不等于 24 小时老化。
+2. 跨节点项目本轮故意不测：PDF 的 IB/RDMA 生产验收通常要双端 `ib_write_bw/read_bw/lat`、`ibping`；本轮按既定范围先做单节点，跨节点未纳入。
+3. PFC/ECN/AER 的覆盖依赖机器暴露的系统计数器：脚本会读能找到的 sysfs 计数器和 dmesg，但如果交换机侧 PFC/ECN 不在主机暴露，仍需要网络侧补证据。
+4. NCCL 1MB 档会被严格阈值打失败：实测 1MB AllReduce bus BW 约 23 GB/s，而 256MB AllReduce 已通过 `nccl-tests` 验证，约 421 GB/s；如果 PDF 要求 1MB 也达到 405 GB/s，这项不是“没测”，而是会被判 FAIL。
+5. Stress 已能达到功耗和 jitter 要求，但短测已经暴露温差和 throttle strict FAIL；完整 1800 秒只会给出更正式的证据，不会自动改变这个判据。
+
+## 覆盖表
+
+| PDF 验收项 | 当前 `test all` 状态 | 还少什么 |
+|---|---:|---|
+| GPU 基本信息、Driver/CUDA | 已覆盖 | 无；会记录 driver、CUDA、GPU 型号 |
+| 温度阈值：稳态 ≤75C、峰值 ≤85C | 已覆盖健康快照；压力项覆盖 ≤80C | 24h 稳态曲线不在一次 all 内 |
+| idle power ≤100W/card | 部分覆盖 | 当前 health 会采功耗，但 idle 判据还不是独立验收项 |
+| stress power ≥630W/card | 已覆盖；短测两台约 697-699W/卡 | 完整 1800 秒仍待跑 |
+| throttle reasons active=0x0 | 已覆盖；短测两台出现 0x4 | 按 PDF 严格判 FAIL；不是脚本跳过项 |
+| DBE/SBE/retired pages | 部分覆盖 | retired pages 和内核错误已查；SBE 24h 增长率未覆盖 |
+| PCIe Gen5 x16 | 部分覆盖 | GPU 信息/拓扑可见；Replay/AER 依赖 dmesg/sysfs，可能还需额外主板侧证据 |
+| Fabric Manager active 且无 ERROR | 已覆盖 | 无；health 会查 systemd 和 journal |
+| NVLink：18 links/GPU、25GB/s/link、错误为 0 | 已覆盖 | 无；新增 `nvlink` 项 |
+| D2D/H2D/D2H 带宽 | 已覆盖 | 依赖 `nvbandwidth`，两台已具备 |
+| 8x8 P2P matrix off-diagonal mean/min/deviation | 已覆盖 | 无；由 nvbandwidth JSON 解析 |
+| Compute FP32/TF32/FP16/BF16/FP8/FP64/INT8 | 已覆盖 | INT8 为 PyTorch `_int_mm` 路径，若要供应商标准 INT8 kernel 需再换实现 |
+| NCCL AllReduce/AllGather/ReduceScatter/Broadcast/SendRecv/AllToAll | 已覆盖 | 无；`nccl-tests` 已在两台编好 |
+| NCCL 1MB/256MB/2GB，repeat 3，stddev ≤3% | 已覆盖 | 严格按 PDF 阈值时 1MB 档大概率 FAIL；256MB AllReduce 两台 `nccl-tests` 实测约 421GB/s |
+| Stress ≥30min，BF16/FP16 GEMM 8192，1s telemetry | 已覆盖；默认 BF16 GEMM `24576`，1s telemetry，warmup 后稳态判定 | 完整 1800 秒待执行；短测已暴露温差/throttle FAIL |
+| DCGM `dcgmi diag -r 3` | 已覆盖；DCGM 4.5.3 已安装，服务已启用 | 两台完整 `-r 3` 已 PASS；日志见 `/root/test_gpu_scripts/reports/dcgm_r3_*_20260522_17010*.log` |
+| RDMA 端口 ACTIVE、400Gbps | 部分覆盖 | 单节点可查端口；严格双端吞吐/时延本轮不跑 |
+| RDMA write/read bw ≥47GB/s、latency ≤2/3.5us | 部分覆盖 | 单机 localhost/perftest 不等价跨节点线速验收 |
+| PFC/ECN errors=0、ibping 双向 OK | 部分覆盖 | 主机能读到的计数器会查；交换机侧/跨节点 ibping 未覆盖 |
+| 1.5B synthetic Transformer BF16，8 卡，≥45k tokens/s | 已覆盖 DDP 路径 | 8 进程 DDP smoke 已通过；完整 50 step 长跑待执行 |
+| 任一子项 FAIL 则总体验收 FAIL | 已覆盖 | `all` 现在会按 strict verdict 退出非 0 |
+
+## 如果现在直接跑 `all`
+
+推荐命令：
+
+```bash
+cd /root/test_gpu_scripts
+/root/gpu-test-venv/bin/python gpu_tester.py --config configs/default.yaml --test all --report --format json --output reports/h100_all_$(hostname)_$(date +%Y%m%d_%H%M%S).json
+```
+
+如果要直接生成中文 Markdown 报告，用这个：
+
+```bash
+cd /root/test_gpu_scripts
+/root/gpu-test-venv/bin/python gpu_tester.py --config configs/default.yaml --test all --report --format md --output reports/h100_all_$(hostname)_$(date +%Y%m%d_%H%M%S).md
+```
+
+预计行为：
+
+- 会跑完整单节点项目，压力默认 1800 秒，默认使用 PyTorch BF16 GEMM 压力并采 1 秒 telemetry/XID。
+- stress 默认矩阵为 `24576`，用于把 H100 压到 ≥630W/卡；PDF 只要求 `matrix_size >=8192`，这里是为了满足功耗门槛。
+- NCCL 会跑 6 个 op × 3 个 message size × 3 次 repeat。
+- DCGM 会跑 `dcgmi diag -r 3 -n gpu:8 -j`；DCGM 工具链已安装并启动，`diag -r 1` 与两台独立 `r3` 长跑均已 PASS。
+- NCCL 1MB 档按 405GB/s 阈值也会失败；256MB AllReduce 已验证走 `nccl-tests`，两台约 421GB/s。
+- stress 按 PDF 严格口径预计会 FAIL：当前短测证据显示温差超过 5C，且 throttle active 出现 `0x4`。
+- 跨节点 RDMA/NCCL 不在这次单节点 all 里。
+
+## 当前最小补齐清单
+
+1. 如果要严格 RDMA 生产验收，下一轮用两台机器做 server/client 双端测试。
+2. 执行完整 1.5B DDP 50 step 训练验收并归档 tokens/s、jitter、显存和 loss。
+3. 执行完整 1800 秒 stress 并归档 1 秒 telemetry、XID、throttle、功耗和温度；当前预期会因温差/throttle FAIL。
+4. 如果要 24 小时验收，增加一个 24h monitor 模式，记录 SBE 增长率、XID、温度、功耗、降频曲线。
diff --git a/H100验收_vs_test_all_差距分析.md b/H100验收_vs_test_all_差距分析.md
new file mode 100644
index 0000000..5599d0c
--- /dev/null
+++ b/H100验收_vs_test_all_差距分析.md
@@ -0,0 +1,100 @@
+# H100 生产验收标准 vs 当前 `gpu_tester.py --test all` 覆盖差距
+
+对比文件：`/Users/d-robotics/Downloads/H100_production_acceptance.pdf`
+
+对比对象：当前仓库执行 `python gpu_tester.py --test all --report --format md/json`
+
+## 结论
+
+当前仓库的 `test all` 能覆盖验收文档里的大类框架，但还不是完整的 H100 生产验收。
+
+它会跑 8 个模块：
+
+1. GPU Information
+2. Health Check
+3. Memory Benchmark
+4. Compute Benchmark
+5. NCCL Test
+6. GPU Stress Test
+7. RDMA/IB Test
+8. Training Simulation
+
+但是按照 PDF 的生产验收标准，仍缺少这些关键项：
+
+- NVLink 每卡 18 条链路的 active/速率/错误计数逐项验收
+- DCGM `dcgmi diag -r 3`
+- 30-60 分钟 burn-in 和 1 秒级温度/功耗/throttle/XID 采样
+- NCCL 官方 `nccl-tests` 的性能验收，包括 1MB/256MB/2GB 三个消息大小、重复 3 次取最差值、标准差
+- RDMA 生产口径：4MB 带宽、8B 延迟、PFC/ECN 错误、ibping 双向
+- 8 卡逐卡 compute 一致性，要求同 dtype 极差/均值 <= 3%
+- FP64、INT8 计算项
+- 训练项应为 8 卡 1.5B synthetic Transformer，并按 45k tokens/s、step 抖动、显存、loss 健康度验收
+
+## 覆盖矩阵
+
+| PDF 验收项 | `test all` 是否覆盖 | 当前覆盖程度 | 主要缺口 |
+| --- | --- | --- | --- |
+| 1. 健康检查 | 部分覆盖 | 温度、功耗、ECC、PCIe、时钟、throttle、persistence、IB 设备 | idle 功耗 <=100W 未单独判定；stress 功耗 >=630W 未判定；retired pages 未查；24h SBE 增长率未查；AER/Replay errors 未查；fabricmanager 服务和 ERROR 日志未查 |
+| 2. NVLink 拓扑与链路 | 部分覆盖 | GPU info 会保存 `nvidia-smi topo -m` | 未跑 `nvidia-smi nvlink -s/-c/-e`；未验证每卡 18 条 NVLink；未验证每条 25GB/s；未验证 CRC/Replay/Recovery error = 0 |
+| 3. Memory Bandwidth | 部分覆盖 | 会用 nvbandwidth 测 H2D、D2H、D2D write/read/bidir | 未输出完整 8x8 P2P 矩阵；未验非对角均值 >=360GB/s、最小值 >=320GB/s、相对均值偏差 <=±5%；D2D 口径和 PDF 的单卡/P2P 验收口径还没完全对齐 |
+| 4. Compute Throughput | 大部分覆盖 | 默认配置已是 matrix_size=8192、warmup=50、iterations=500、use_compile=true；H100 绝对 TFLOPS 阈值在 `gpu_specs.py` 里有 | 目前测试结果是整体/单进程口径，未真正逐 GPU 分别测出 8 卡极差/均值；未测 FP64、INT8 |
+| 5. NCCL Multi-GPU | 部分覆盖，依赖工具 | 代码支持 nccl-tests；若缺 binary 会 fallback torchrun 功能连通性 | 当前远端没装好 nccl-tests，实际会退化成功能测试且失败/无性能数据；默认只启 allreduce/alltoall/broadcast，未启 allgather/reducescatter/sendrecv；消息大小不是 1MB/256MB/2GB 三点；未重复 3 次取 worst；未统计标准差 |
+| 6. Stress/Burn-in | 部分覆盖 | 会跑 stress，默认 60 秒；无 gpu-burn 时用 PyTorch fallback | PDF 要 >=30min，推荐 60min；要 FP16/BF16 大 GEMM matrix >=8192；要每分钟 TFLOPS 抖动、温度 <=80、卡间温差 <=5、功耗 >=630W、throttle=0、XID=0；当前 PyTorch fallback 只分配约 64MB/卡，压力不够 |
+| 7. DCGM 诊断 | 未覆盖 | 无 | 没有执行 `dcgmi diag -r 3`，也没有解析 Software/Deployment/Hardware/Integration/Stress/Power 子项 |
+| 8. RDMA/IB | 部分覆盖 | 会发现 IB 设备，跑 ib_write_bw/read_bw/write_lat/read_lat | 当前脚本用 `localhost`，不是跨节点；msg_size 是 64KB，不是 4MB；latency 没指定 8B；阈值是 50GB/s 和 10us，不是 PDF 的 write/read >=47GB/s、write_lat <=2us、read_lat <=3.5us；未查 PFC/ECN、ibping 双向 |
+| 9. Training Simulation | 部分覆盖 | 会跑 GPT-2 或 synthetic transformer，输出 tokens/s、step time、显存、loss | 当前 synthetic 是约 1.47B 参数但实际单进程 `.cuda()`，不是 8 卡分布式训练；未按 45k tokens/s、step 抖动 <=±3%、peak <=70GB/卡、NaN/Inf 做硬判定 |
+| 10. 总体 Verdict | 部分覆盖 | report 有 summary | 当前 `all` 的 pass/fail 逻辑偏“模块是否报错”，不是 PDF 的任一子项 FAIL 即整机禁上生产 |
+
+## 如果现在直接执行 `test all`，能得到什么
+
+会得到一份“单节点综合体检/基准测试报告”，包含：
+
+- 8 张 H100 的基础信息、驱动/CUDA、PCIe、显存、温度、功耗
+- 健康检查结果
+- nvbandwidth 的 H2D/D2H/D2D 汇总带宽
+- FP32/TF32/FP16/BF16/FP8 计算吞吐
+- NCCL 测试结果，如果 nccl-tests 缺失会退化到 torchrun fallback
+- 60 秒 stress 结果
+- 本机 localhost RDMA/IB 结果
+- 训练模拟结果
+
+这份报告能作为“快速冒烟 + 单机初筛”，不能直接作为 PDF 标准下的“生产验收合格报告”。
+
+## 当前两台机器执行前置状态
+
+已经确认：
+
+- `nvbandwidth` 已装好并能被项目脚本调用
+- PyTorch CUDA 环境已装好
+- RDMA perftest 工具已存在
+- `nccl-tests` 和 `gpu-burn` 目前没有按 PDF 生产验收口径准备好
+
+另外，此前误触发的一次 `test all` 的状态如下：
+
+- `aikubeworker0016` 已经在跑单节点 `test all`，当前到 Training Simulation
+- `aikubeworker0012` 没有成功启动
+
+## 要补齐到 PDF 验收口径，需要加的最小清单
+
+1. 安装/修复 `nccl-tests`，确保真正输出 bus BW，而不是 torchrun fallback。
+2. 安装/修复 `gpu-burn`，或把 PyTorch stress 改成真正高占用 FP16/BF16 GEMM，并支持 30/60 分钟。
+3. 增加 NVLink 专项：`nvidia-smi nvlink -s/-c/-e`，按 18 条/卡、25GB/s、error=0 判定。
+4. 增加 DCGM 专项：`dcgmi diag -r 3`，解析子项 PASS/FAIL。
+5. 增加 telemetry 采样：stress 期间每 1 秒采温度、功耗、throttle、XID；计算稳态功耗、温差、抖动。
+6. 修改 RDMA：支持指定 server/client、4MB 带宽、8B 延迟、双向 ibping、PFC/ECN 计数。
+7. 修改 NCCL 配置：全 op 开启，按 1MB/256MB/2GB 三个 size，重复 3 次取最差值和标准差。
+8. 修改 Compute：逐 GPU 分别跑，计算同 dtype 极差/均值；增加 FP64、INT8。
+9. 修改 Training Simulation：明确 8 卡 1.5B synthetic 分布式训练，加入 tokens/s、step 抖动、显存、loss NaN/Inf 的 PASS/FAIL。
+10. 修改最终 verdict：按 PDF 规则，任一子项 FAIL 就整机不通过。
+
+## 建议执行策略
+
+现在直接跑：
+
+```bash
+/root/gpu-test-venv/bin/python gpu_tester.py --test all --report --format md --output reports_all/test_all.md
+```
+
+得到的是“当前仓库 all 覆盖范围报告”。
+
+要拿来做生产验收，需要先补齐上面的缺口，尤其是 `nccl-tests`、`gpu-burn`、NVLink、DCGM、长时间 burn-in、跨节点 RDMA。
diff --git a/README.md b/README.md
index ebe1ae6..1af08c4 100644
--- a/README.md
+++ b/README.md
@@ -159,7 +159,7 @@ python3 gpu_tester.py
  [3]  Memory Benchmark (nvbandwidth)
  [4]  Compute Benchmark
  [5]  NCCL Multi-GPU Test
- [6]  GPU Stress Test (gpu-burn)
+ [6]  GPU Stress Test (PyTorch/gpu-burn)
  [7]  RDMA/IB Test
  [8]  Training Simulation
  [9]  Full Test Suite (All Tests)
@@ -279,33 +279,35 @@ python3 gpu_tester.py --config /path/to/config.yaml --test all
 | FP16 | 312 TFLOPS | 990 TFLOPS | 2,250 TFLOPS | 3,500 TFLOPS |
 | BF16 | 312 TFLOPS | 990 TFLOPS | 2,250 TFLOPS | 3,500 TFLOPS |
 | FP8 | N/A | 1,979 TFLOPS | 4,500 TFLOPS | 7,000 TFLOPS |
+| FP64 | 9.7 TFLOPS（非 Tensor Core） | 67 TFLOPS（Tensor Core） | TBD | TBD |
+| INT8 | 624 TOPS | 1,979 TOPS | TBD | TBD |
 
-默认配置：4096×4096 矩阵，10 次 warmup，100 次迭代。
+默认配置：8192×8192 矩阵，50 次 warmup，500 次迭代；逐 GPU 跑 FP32/TF32/FP16/BF16/FP8/FP64/INT8，并按同 dtype 的极差/均值判断一致性。
 
 ### 5. NCCL Multi-GPU Test（多卡通信）
 
-优先使用官方 nccl-tests（通过 mpirun 调用），不可用时 torchrun fallback。
+优先使用官方 nccl-tests（通过 mpirun 调用）并解析真实 bus BW；如果只能走 torchrun fallback，验收结果会标记 FAIL。
 
 | 操作 | 说明 |
 |---|---|
 | AllReduce | 最常用的集合通信 |
 | AllToAll | 模型并行关键操作 |
 | Broadcast | 参数同步 |
-| ReduceScatter | 可选 |
-| AllGather | 可选 |
-| SendRecv | 可选 |
+| ReduceScatter | 必测 |
+| AllGather | 必测 |
+| SendRecv | 必测 |
 
-默认测试数据量范围 8B ~ 256MB，5 次 warmup，20 次迭代。
+默认按 PDF 口径测试 1MB、256MB、2GB 三个 size，每个 op 重复 3 次，取 worst bus BW 和标准差；标准差超过 3% 判 FAIL。
 
 **NVLink 参考带宽：** A100/A800 ≥ 240 GB/s | H100/H200 ≥ 360 GB/s | B200/B300 ≥ 720 GB/s（40% NVLink 峰值）
 
 ### 6. GPU Stress Test（压力测试）
 
-使用 gpu-burn 进行长时满载测试，验证热稳定性和内存正确性。
+默认使用 PyTorch BF16/FP16 GEMM 进行长时高功耗满载测试；也可在配置中启用 gpu-burn。测试期间采集温度、功耗、throttle、XID，并计算稳态功耗、温差和 TFLOPS 抖动。
 
 | 参数 | 默认值 | 说明 |
 |---|---|---|
-| duration_sec | 60 | 测试时长（秒） |
+| duration_sec | 1800 | 测试时长（秒） |
 | use_tensor_cores | true | 使用 Tensor Core |
 | memory_pct | 90 | 内存占用比例 |
 
@@ -320,18 +322,18 @@ python3 gpu_tester.py --config /path/to/config.yaml --test all
 | 写延迟 | ib_write_lat |
 | 读延迟 | ib_read_lat |
 
-**参考阈值：** 带宽 ≥ 50 GB/s, 延迟 ≤ 10 μs
+**参考阈值：** 端口 ACTIVE 且 ≥400Gbps；4MB 写/读带宽 ≥47GB/s；8B 写延迟 ≤2μs、读延迟 ≤3.5μs；PFC/ECN/CNP/congestion 计数为 0。
 
 ### 8. Training Simulation（训练模拟）
 
-使用真实或合成模型模拟训练负载。
+默认跑 8 卡 DDP synthetic 1.5B Transformer 训练模拟。
 
 | 模式 | 说明 |
 |---|---|
-| 真实模型 | 加载 HuggingFace GPT-2（需安装 transformers） |
-| 合成模型 | 6 层 Transformer（无需额外依赖） |
+| DDP 合成模型 | 约 1.5B 参数，8 卡 torchrun |
+| 单进程 fallback | 仅用于调试；生产验收按 FAIL |
 
-输出：tokens/sec、步时、峰值显存、最终 loss。
+输出：tokens/sec、步时、warmup 后 step 抖动、峰值显存、最终 loss，并检查 loss 是否 NaN/Inf。
 
 ---
 
@@ -351,14 +353,14 @@ benchmark:
     nvbandwidth_buffer_mb: 512          # nvbandwidth 缓冲区大小
     nvbandwidth_samples: 3              # nvbandwidth 采样次数
   compute:
-    dtypes: [fp32, tf32, fp16, bf16, fp8]
-    matrix_size: 4096                   # GEMM 矩阵维度
-    warmup: 10
-    iterations: 100
+    dtypes: [fp32, tf32, fp16, bf16, fp8, fp64, int8]
+    matrix_size: 8192                   # GEMM 矩阵维度
+    warmup: 50
+    iterations: 500
 
 health:
-  temp_warning: 80                      # 温度警告阈值 °C
-  temp_critical: 90                     # 温度严重阈值 °C
+  temp_warning: 75                      # 温度警告阈值 °C
+  temp_critical: 85                     # 温度严重阈值 °C
   power_limit: null                     # null = 自动匹配 GPU TDP
 
 nccl:
@@ -366,26 +368,62 @@ nccl:
   test_allreduce: true
   test_alltoall: true
   test_broadcast: true
+  test_reduce_scatter: true
+  test_allgather: true
+  test_sendrecv: true
+  message_sizes: [1M, 256M, 2G]
+  repeats: 3
+  max_stddev_pct: 3
 
 stress:
-  duration_sec: 60                     # 压力测试时长
+  duration_sec: 1800                   # 压力测试时长
+  use_gpu_burn: false                  # 默认走 PyTorch GEMM stress
+  dtype: bf16
+  matrix_size: 24576
+  telemetry_interval_sec: 1
+  min_power_watts: 630
+  max_tflops_jitter_pct: 5
+  require_tflops_jitter: true
   use_tensor_cores: true
 
 rdma:
-  min_bandwidth_gbps: 50              # RDMA 最低可接受带宽
-  max_latency_us: 10                  # RDMA 最大可接受延迟
-  msg_size: 65536                     # 测试消息大小
+  min_bandwidth_gbps: 47              # RDMA 最低可接受带宽（键名为 gbps，实际单位 GB/s）
+  min_port_rate_gbps: 400             # IB 端口最低速率
+  max_write_latency_us: 2.0
+  max_read_latency_us: 3.5
+  msg_size: 4194304                   # 4MB 带宽测试消息
+  latency_msg_size: 8                 # 8B 延迟测试消息
+  server_addr: null                   # client 模式 perftest 对端 IP
+  ibping_target: null                 # ibping 对端 LID/GID，不是 IP
+  role: auto                          # auto / server / client
+  pfc_ecn_counters: true
+
+nvlink:
+  expected_links_per_gpu: 18
+  expected_link_speed_gbps: 25         # 每条链路速率（键名为 gbps，实际单位 GB/s）
+  require_zero_errors: true
+
+dcgm:
+  diag_level: 3
+  timeout_sec: 3600
+  expected_num_gpus: 8
+  json_output: true
+  require_subtests: true
 
 training:
-  model: gpt2                          # HuggingFace 模型名
+  model: synthetic_1.5b                # 8 卡 synthetic Transformer
   batch_size: 8
   seq_length: 2048
   num_steps: 50
+  warmup_steps: 5
   dtype: bf16
+  mode: ddp
+  min_tokens_per_sec: 45000
+  max_step_jitter_pct: 3
 
 report:
   output_dir: ./reports
-  format: json                         # json 或 html
+  format: json                         # json / html / md
 ```
 
 ---
@@ -493,9 +531,11 @@ report:
 步骤 2: RDMA 网络测试
 ├── python3 gpu_tester.py --test rdma
 ├── 确认: IB 设备被识别
-├── 确认: 端口状态 Active
-├── 确认: 写带宽 ≥ 50 GB/s
-├── 确认: 延迟 ≤ 10 μs
+├── 确认: 端口状态 ACTIVE 且 ≥400Gbps
+├── 确认: 4MB 写/读带宽 ≥47 GB/s
+├── 确认: 8B 写延迟 ≤2 μs、读延迟 ≤3.5 μs
+├── 确认: ibping 双向连通
+├── 确认: PFC/ECN/CNP/congestion 计数为 0
 └── 异常: 检查 IB 线缆、交换机配置、子网管理器
 
 步骤 3: 多节点 NCCL 测试
diff --git a/docs/h100_test_all_metrics_guide_cn.md b/docs/h100_test_all_metrics_guide_cn.md
new file mode 100644
index 0000000..37abd28
--- /dev/null
+++ b/docs/h100_test_all_metrics_guide_cn.md
@@ -0,0 +1,255 @@
+# H100 `test all` 指标说明
+
+本文解释 `gpu_tester.py --test all` 报告里每一项指标的意义、它在验收中代表什么，以及异常时通常应该优先排查什么。
+
+适用报告：
+
+- `reports_test_all_latest_aikubeworker0012_20260522_203246.md`
+- `reports_test_all_latest_aikubeworker0016_20260522_203447.md`
+- `reports_test_all_latest_summary_cn_20260523.md`
+
+## 总体判定
+
+| 指标 | 意义 | 怎么看 |
+|---|---|---|
+| `Overall Acceptance Verdict` | 整机验收结论 | 按 PDF 生产验收规则，任一必测子项 FAIL，则整机 FAIL |
+| `Suite complete: x/10 tests passed` | 10 个测试模块里通过了几个 | 用来快速看整体健康度，但最终以 `Overall Acceptance Verdict` 为准 |
+| `PASS` | 达到当前配置阈值 | 表示该指标在当前测试口径下通过 |
+| `FAIL` | 未达到当前配置阈值，或证据不足 | 表示该项不能作为生产验收通过证据 |
+| `WARN` | 旧报告或非强制警告口径 | 当前 PDF 生产验收里，关键性能未达标应按 FAIL 处理 |
+
+## GPU Info
+
+GPU Info 是基础盘点项，用来确认机器硬件、驱动和 CUDA 环境是否符合预期。
+
+| 指标 | 意义 | 异常影响 |
+|---|---|---|
+| GPU count | 当前系统识别到的 GPU 数量 | H100 8 卡机器如果不是 8 张，后续所有多卡测试都不可信 |
+| GPU model | GPU 型号，例如 H100 | 型号不对会导致阈值、峰值、验收口径都不对 |
+| Driver version | NVIDIA 驱动版本 | 版本过旧可能影响 CUDA、NCCL、DCGM、NVLink 工具 |
+| CUDA version | CUDA 运行时或驱动支持版本 | CUDA 不匹配会导致 PyTorch、nccl-tests 或编译工具异常 |
+| GPU UUID / PCI bus id | GPU 唯一标识和 PCIe 拓扑位置 | 用于定位具体故障卡、对应槽位和链路 |
+
+这项通常不直接代表性能好坏，它是确认“测的是不是目标机器、目标 GPU、目标软件栈”。
+
+## Health Check
+
+Health Check 是空闲或轻负载状态下的基础健康检查。
+
+| 指标 | 意义 | 怎么看 |
+|---|---|---|
+| Temperature | 当前 GPU 温度 | 空闲温度过高可能说明散热、风道、环境温度异常 |
+| Power | 当前功耗 | 空闲功耗异常高可能说明有残留进程或功耗状态异常 |
+| ECC errors | 显存纠错错误 | 单比特错误过多或双比特错误通常需要重点关注硬件稳定性 |
+| PCIe | PCIe 代际和宽度，例如 Gen5 x16 | 降速或降宽会影响 CPU-GPU、RDMA、部分数据搬运性能 |
+| Throttle | 当前是否触发限速 | 空闲状态下非 idle throttle 不正常，可能影响后续性能 |
+| XID / NVRM events | 驱动或 GPU 错误事件 | 出现新 XID 通常说明硬件、驱动、供电或内核态异常 |
+
+Health PASS 只能说明基础状态正常，不代表满载性能一定达标。
+
+## Memory Bandwidth
+
+Memory Bandwidth 衡量数据搬运能力，包括 CPU 到 GPU、GPU 到 CPU、GPU 到 GPU。
+
+| 指标 | 意义 | 代表什么 |
+|---|---|---|
+| H2D | Host to Device，CPU 内存到 GPU 显存带宽 | 受 PCIe、NUMA、CPU 内存、驱动影响 |
+| D2H | Device to Host，GPU 显存到 CPU 内存带宽 | 受 PCIe、NUMA、CPU 内存、驱动影响 |
+| D2D | Device to Device，GPU 到 GPU 带宽 | 单节点多卡通常主要受 NVLink/NVSwitch 影响 |
+| Efficiency | 实测值相对理论或配置阈值的比例 | 用于快速判断是否达到预期带宽 |
+
+H2D/D2H 主要看 PCIe 和 CPU 侧链路是否正常。D2D 更接近多卡训练、NCCL 和 P2P 通信的基础能力。
+
+## Compute Throughput
+
+Compute Throughput 衡量 GPU 在不同数值格式下的矩阵计算吞吐，单位通常是 TFLOPS。
+
+| 指标 | 意义 | 常见用途 |
+|---|---|---|
+| FP32 | 32 位浮点性能 | 传统科学计算、部分模型训练和验证 |
+| TF32 | TensorFloat-32 Tensor Core 性能 | NVIDIA Ampere/Hopper 上常见的 FP32 加速路径 |
+| FP16 | 16 位浮点 Tensor Core 性能 | 深度学习训练和推理常用 |
+| BF16 | bfloat16 Tensor Core 性能 | 大模型训练常用，数值范围比 FP16 更稳 |
+| FP8 | 8 位浮点 Tensor Core 性能 | 新一代低精度训练/推理加速 |
+| FP64 | 64 位双精度性能 | HPC、科学计算、仿真 |
+| INT8 | 8 位整数性能 | 推理、量化模型 |
+| Achieved | 实测吞吐 | 越接近峰值越好 |
+| Peak | 理论峰值或规格峰值 | 用来计算效率 |
+| Threshold | 当前验收阈值 | 低于阈值则 FAIL |
+| Efficiency | `Achieved / Peak` | 衡量实测利用率 |
+
+### Compute Consistency
+
+Consistency 是看同一种 dtype 下，不同 GPU 之间性能是否均衡。
+
+| 指标 | 意义 | 异常含义 |
+|---|---|---|
+| Min | 8 张 GPU 里最慢卡的实测值 | 用于发现拖后腿的卡 |
+| Mean | 8 张 GPU 平均值 | 用于看整体水平 |
+| Max | 8 张 GPU 里最快卡的实测值 | 和 Min 一起计算离散度 |
+| Spread | `(Max - Min) / Mean` | 反映卡间性能差异 |
+
+Spread 超过阈值通常说明某些卡受温度、功耗、PCIe、后台负载、时钟策略或硬件状态影响。即使平均性能还可以，卡间差异过大也会拖慢分布式训练。
+
+## NVLink / NVSwitch
+
+NVLink/NVSwitch 测试确认 GPU 间高速互联是否完整、速率是否正确、错误计数是否干净。
+
+| 指标 | 意义 | 怎么看 |
+|---|---|---|
+| Active Links | 每张 GPU 当前活跃 NVLink 数 | H100 8 卡 SXM 常见期望是每卡 18 条 |
+| Expected Links | 配置期望链路数 | 少一条都可能影响拓扑和 NCCL 性能 |
+| Link speed | 单条链路速率 | 速率不对说明链路降级或识别异常 |
+| Error counters | NVLink 错误计数，例如 CRC/replay/recovery | 非零可能说明链路质量或硬件问题 |
+
+NVLink PASS 表示链路状态看起来正常，但 NCCL 仍可能因算法、拓扑、消息大小、NCCL 参数或系统噪声而不达标。
+
+## DCGM Diagnostic
+
+DCGM 是 NVIDIA 官方诊断工具。`dcgmi diag -r 3` 是比较完整的生产诊断级别。
+
+| 子项 | 意义 |
+|---|---|
+| Deployment/software | 驱动、库、系统软件依赖检查 |
+| Hardware/memory | GPU 显存健康检查 |
+| Hardware/diagnostic | GPU 硬件基础诊断 |
+| Hardware/nvbandwidth | GPU/NVLink/NVSwitch 带宽诊断 |
+| Integration/pcie | PCIe 集成和链路相关检查 |
+| Stress/targeted_stress | DCGM 自带目标压力测试 |
+| Stress/targeted_power | DCGM 自带目标功耗压力测试 |
+| summary | 该分类汇总结果 |
+
+DCGM PASS 是强证据，说明官方诊断没有发现明显硬件故障。但它不替代项目里的 NCCL、RDMA、长时间 telemetry 和训练模拟验收。
+
+## NCCL Multi-GPU
+
+NCCL 测试衡量单节点多 GPU 集合通信能力。它直接关系到多卡训练效率。
+
+| 指标 | 意义 | 为什么重要 |
+|---|---|---|
+| source | 测试来源 | 必须是 `nccl-tests` 才有真实 bus BW；`torchrun_fallback` 只能说明功能连通，不是性能验收 |
+| bus BW | NCCL 报告的总线等效带宽 | 用来衡量通信是否吃满 NVLink/NVSwitch |
+| message size | 消息大小，例如 1M、256M、2G | 小消息看延迟和调度，中大消息看带宽 |
+| repeats | 重复次数 | 减少偶然波动，当前按 3 次取样 |
+| worst bus BW | 多次结果里的最差值 | 生产验收更关注最差情况 |
+| mean bus BW | 多次平均值 | 反映稳定水平 |
+| stddev | 标准差或波动 | 波动大说明通信稳定性不足 |
+
+### NCCL op 含义
+
+| Op | 意义 | 常见场景 |
+|---|---|---|
+| allreduce | 每张卡都有一份数据，做规约后每张卡都拿到结果 | 数据并行梯度同步最常见 |
+| allgather | 每张卡收集所有卡的数据分片 | 模型并行、张量并行、参数/激活收集 |
+| reducescatter | 先规约再把结果切分给各卡 | ZeRO、优化器状态切分、分布式训练常用 |
+| broadcast | 一张卡把数据广播给其他卡 | 参数同步、初始化权重分发 |
+| sendrecv | 点对点发送和接收 | pipeline、定制通信、拓扑验证 |
+| alltoall | 每张卡向每张卡交换不同数据 | MoE、专家并行、shuffle 类通信 |
+
+NCCL 小消息失败常见于延迟、调度或阈值口径较严；大消息失败更偏向链路带宽、拓扑、NCCL 参数或 NVSwitch/PCIe/NUMA 配置问题。
+
+## Stress Test
+
+Stress Test 是长时间高负载稳定性测试。它不是只看“能不能跑完”，还要看满载期间的温度、功耗、限速和错误事件。
+
+| 指标 | 意义 | 怎么看 |
+|---|---|---|
+| duration | 实际压力测试时长 | 生产验收通常需要 30/60 分钟 |
+| source | 压力来源，例如 `pytorch` 或 `gpu-burn` | 说明用什么负载压 GPU |
+| dtype | 压力计算的数据类型，例如 BF16 | 影响 Tensor Core、功耗和温度 |
+| matrix_size | GEMM 矩阵边长 | 越大越容易形成持续高占用 |
+| memory_pct | 目标显存占用比例 | 避免只测很小负载 |
+| Avg steady power | 稳态平均功耗 | 判断是否真的把卡压起来 |
+| Max steady temp | 稳态最高温度 | 判断散热上限 |
+| Temp delta | 8 卡之间最高温和最低温的差 | 差异过大说明风道、散热或卡位不均衡 |
+| TFLOPS jitter | 稳态吞吐波动 | 波动大说明性能不稳定 |
+| Throttle events | 限速事件数量 | 非 idle throttle 会影响性能稳定性 |
+| XID events | 压测期间新增 XID 错误 | 出现 XID 通常是严重风险 |
+
+### Throttle 常见含义
+
+| 代码 | 常见含义 | 解释 |
+|---|---|---|
+| `0x1` | idle throttle | 空闲状态限速，通常不算真实问题 |
+| `0x4` | `sw_power_cap` | 达到软件功耗上限，性能可能被功耗墙限制 |
+| `0x8` | hardware slowdown | 硬件触发降速 |
+| `0x10` | sync boost | GPU 处于 sync boost 组，时钟被同步到组内其他 GPU 的水平 |
+| `0x20` | software thermal slowdown | 软件温度策略触发降速 |
+| `0x40` / `0x80` | hardware thermal slowdown / hardware power brake | 硬件温度保护触发降速 / 外部供电或硬件功率保护触发降速 |
+
+当前报告里的 `sw_power_cap` 表示负载确实压到了功耗墙附近，但验收口径把非 idle throttle 作为失败原因之一，因为它会影响长时间稳定输出。
+
+## RDMA / InfiniBand
+
+RDMA 测试衡量 IB 网卡和网络链路性能。单节点 loopback 和跨节点 server/client 是两种不同证据，不能混用。
+
+| 指标 | 意义 | 怎么看 |
+|---|---|---|
+| Device | IB 设备名，例如 `mlx5_0` | 对应具体 HCA/端口 |
+| Port | 端口号 | 通常是 port 1 |
+| State | 端口状态，例如 ACTIVE/DOWN | ACTIVE 才能作为可用链路 |
+| Rate | 端口速率，例如 400 Gb/sec | 低于期望说明链路降级或接错网络 |
+| GID/LID | IB 寻址信息 | `ibping` 和跨节点定位会用到 |
+| ib_write_bw | RDMA write 带宽 | 客户端向远端写数据的吞吐 |
+| ib_read_bw | RDMA read 带宽 | 客户端从远端读数据的吞吐 |
+| ib_write_lat | RDMA write 延迟 | 小消息写延迟 |
+| ib_read_lat | RDMA read 延迟 | 小消息读延迟 |
+| ibping | IB 层连通性测试 | 看 LID/GID 层是否可达 |
+| PFC/ECN/CNP counters | 拥塞和流控相关计数 | 非零或增长可能说明网络拥塞/丢包/流控问题 |
+
+### 单节点与跨节点的区别
+
+| 口径 | 意义 | 能证明什么 | 不能证明什么 |
+|---|---|---|---|
+| `local_loopback` | 在同一台机器本地启动 perftest server/client | 工具、设备、单机端口基本可用 | 不能证明两台机器之间 RDMA 网络达标 |
+| server/client 跨节点 | 一台做 server，另一台做 client | 能证明实际跨节点 RDMA 带宽/延迟 | 需要明确 server_addr、ib_device、ib_port、ibping_target |
+
+RDMA read 带宽低于 write 带宽很常见，但生产验收会给 read/write 各自设置阈值。read 不过线时，需要排查 HCA 固件、BIOS、PCIe、NUMA、RoCE/IB 配置、交换机、PFC/ECN、线缆和端口速率。
+
+## Training Simulation
+
+Training Simulation 用一个合成 1.5B Transformer 训练负载验证 8 卡分布式训练是否能稳定运行。
+
+| 指标 | 意义 | 怎么看 |
+|---|---|---|
+| Model | 模型类型 | 当前是 synthetic 1.5B，不依赖真实数据集 |
+| Parameters | 参数量 | 用来确认负载规模是否达到预期 |
+| GPU Count | 参与训练的 GPU 数 | 生产口径要求 8 卡 DDP |
+| DType | 训练数值格式，例如 BF16 | 大模型训练常用 BF16 |
+| Batch Size | 每步 batch 大小 | 影响吞吐和显存 |
+| Seq Length | 序列长度 | 影响计算量和显存 |
+| Steps | 计入统计的训练步数 | 步数太少会导致统计不稳 |
+| Warmup Steps | 预热步数 | 避免把 CUDA 初始化、编译、缓存冷启动计入性能 |
+| Avg Step Time | 平均每步耗时 | 越低越好 |
+| Throughput | tokens/sec | 训练吞吐核心指标 |
+| Samples/sec | 每秒样本数 | 辅助衡量数据处理速度 |
+| Peak Memory | 峰值显存 | 看是否接近 OOM 或显存利用不足 |
+| Final Loss | 最后 loss | 用于确认数值是有限值，没有 NaN/Inf |
+| Step Jitter | step 时间抖动 | 抖动大说明训练不稳定 |
+| Distributed Mode | 分布式模式 | 必须是 `ddp` 才满足 8 卡分布式口径 |
+
+Training PASS 说明 8 卡 DDP 训练路径、NCCL 功能连通、PyTorch CUDA 和基本数值稳定性都没问题。但它不能替代 NCCL 性能测试，因为训练负载可能没有覆盖所有通信模式和消息大小。
+
+## 常见误读
+
+1. `DCGM PASS` 不等于整机验收 PASS。DCGM 是官方诊断的一部分，不覆盖全部业务性能门槛。
+2. `Training PASS` 不等于 NCCL 性能 PASS。训练能跑，只说明功能链路通；NCCL bus BW 仍可能不达标。
+3. `NVLink PASS` 不等于 NCCL PASS。链路数量和错误计数正常，不代表所有 NCCL op/size 都达到阈值。
+4. `ibping PASS` 不等于 RDMA 带宽 PASS。`ibping` 只证明连通性，不证明吞吐和延迟达标。
+5. `local_loopback` 不能当作跨节点 RDMA 证据。跨节点验收必须有 server/client 两端证据。
+6. Stress 跑满 30 分钟不等于 PASS。温差、功耗、throttle、XID、jitter 都要一起看。
+7. 小消息 NCCL 低不一定是链路断了，可能是延迟、算法、启动开销或阈值口径导致；但生产验收仍按阈值判定。
+
+## 排查优先级建议
+
+| 失败项 | 优先看什么 |
+|---|---|
+| Compute FAIL | GPU 时钟、功耗策略、MIG/MPS、后台进程、PyTorch/CUDA 版本、benchmark 算法是否用到目标 Tensor Core 路径 |
+| NCCL FAIL | `NCCL_DEBUG=INFO`、拓扑、NVSwitch/NVLink、NCCL 算法、消息大小、PCIe/NUMA、进程绑核 |
+| Stress FAIL | 机箱风道、风扇、环境温度、功耗上限、`nvidia-smi -q -d POWER,CLOCK,TEMPERATURE` |
+| RDMA FAIL | 端口速率、HCA 固件、线缆、交换机、PFC/ECN、NUMA、BIOS、跨节点 server/client 配置 |
+| Training FAIL | torchrun、NCCL 环境变量、CUDA OOM、loss NaN/Inf、DDP 初始化、网络/共享内存 |
+
+## 一句话版
+
+这套报告不是只看 GPU 能不能亮、训练能不能跑，而是同时验证：硬件识别、基础健康、显存和互联带宽、计算吞吐、多卡通信、长时间满载稳定性、IB/RDMA 网络、官方 DCGM 诊断和 8 卡训练业务路径。任何一个关键项 FAIL，按生产验收都应判整机不通过。
diff --git a/docs/multinode_nccl_concepts.md b/docs/multinode_nccl_concepts.md
new file mode 100644
index 0000000..1c6039d
--- /dev/null
+++ b/docs/multinode_nccl_concepts.md
@@ -0,0 +1,362 @@
+# 多机多卡 NCCL 测试概念说明
+
+本文先讲概念，不涉及脚本改造。目标是理解两台 8 卡 H100 服务器做多机多卡通信测试时，应该从哪些层次逐步验证，以及每一层到底在证明什么。
+
+当前示例机器：
+
+| 别名 | 主机名 | 内网 IP | GPU |
+|---|---|---|---|
+| nccl-gpu-1 | aikubeworker0012 | 172.72.8.12 | 8 x H100 |
+| nccl-gpu-2 | aikubeworker0016 | 172.72.8.16 | 8 x H100 |
+
+两台机器合起来就是 16 张 GPU。多机 NCCL 测试的核心问题是：这 16 张 GPU 是否能通过正确的 GPU、NVLink、PCIe、IB/RDMA 网络路径，高效且正确地完成集体通信。
+
+## 1. 总体思路
+
+多机多卡通信测试是一个自底向上的过程。越底层越接近硬件和链路，越上层越接近真实训练业务。
+
+```mermaid
+flowchart TD
+    L0["0. 物理与基础连通<br/>电源 / GPU / 网卡 / 线缆 / 交换机 / SSH"] --> L1["1. 系统识别层<br/>nvidia-smi / lspci / ibstat / ibdev2netdev"]
+    L1 --> L2["2. 单机 GPU 健康层<br/>温度 / 功耗 / ECC / PCIe / Throttling / NVLink Topo"]
+    L2 --> L3["3. 单机 GPU 性能层<br/>HBM 带宽 / H2D-D2H / FP32-TF32-FP16-BF16-FP8 算力"]
+    L3 --> L4["4. 单机多卡通信层<br/>单节点 8 卡 NCCL over NVLink/NVSwitch"]
+    L4 --> L5["5. 跨机网络与 RDMA 层<br/>IP 连通 / IB Active / RDMA 带宽 / RDMA 延迟"]
+    L5 --> L6["6. 跨机 NCCL 层<br/>两机 16 卡 AllReduce / AllGather / ReduceScatter / Broadcast / AllToAll"]
+    L6 --> L7["7. 训练负载层<br/>torchrun / Megatron / DeepSpeed / 业务训练压测"]
+```
+
+最重要的原则：
+
+**上层失败，不一定是上层问题。**
+
+比如两机 `all_reduce_perf` 失败，原因可能在 NCCL，也可能在 SSH、MPI、IB、GID、网卡选择、驱动版本、CUDA 版本、NCCL 版本或 GPU Direct RDMA。
+
+所以排查顺序应该是：
+
+```text
+基础连通 -> 单机健康 -> 单机性能 -> 单机 NCCL -> 跨机 RDMA -> 跨机 NCCL -> 训练业务
+```
+
+## 2. 两机 16 卡通信路径
+
+单机内部主要走 NVLink/NVSwitch；跨机器时，数据必须经过 GPU、PCIe/NVLink、网卡、交换机和对端网卡。
+
+```mermaid
+flowchart LR
+    subgraph A["aikubeworker0012 / 172.72.8.12"]
+        A0["GPU0"] --- ASW["NVSwitch / NVLink"]
+        A1["GPU1"] --- ASW
+        A2["..."] --- ASW
+        A7["GPU7"] --- ASW
+        ASW --> ANIC["IB/RDMA NIC(s)"]
+    end
+
+    subgraph NET["InfiniBand / RoCE Fabric"]
+        SW["IB Switch"]
+    end
+
+    subgraph B["aikubeworker0016 / 172.72.8.16"]
+        BNIC["IB/RDMA NIC(s)"] --> BSW["NVSwitch / NVLink"]
+        B0["GPU0"] --- BSW
+        B1["GPU1"] --- BSW
+        B2["..."] --- BSW
+        B7["GPU7"] --- BSW
+    end
+
+    ANIC <--> SW
+    SW <--> BNIC
+```
+
+这里有两个不同的通信域：
+
+| 通信域 | 典型路径 | 主要测试 |
+|---|---|---|
+| 单机内 8 卡 | GPU -> NVLink/NVSwitch -> GPU | 单机 NCCL、NVLink topo、D2D |
+| 跨机器 16 卡 | GPU -> NIC -> IB/RDMA 网络 -> NIC -> GPU | RDMA、跨机 NCCL |
+
+这两个域的性能阈值不能混用。单机 NVSwitch 很快，跨机 RDMA 一般慢一些，跨机 NCCL 的瓶颈通常在 IB/RDMA 网络。
+
+## 3. 每一层要测什么
+
+### 3.1 基础连通层
+
+这一层只证明机器能访问、身份和地址正确。
+
+要确认：
+
+| 检查项 | 目的 |
+|---|---|
+| SSH 互通 | MPI/NCCL 多机启动依赖远端拉起进程 |
+| hostname 正确 | 避免登录错机器 |
+| IP 正确 | 确认使用的是训练网络或 IB/RDMA 对应网络 |
+| 时间同步 | 长时间训练日志和超时排查更可靠 |
+
+这一层不证明 GPU 或 RDMA 性能，只证明“机器能互相找到”。
+
+### 3.2 系统识别层
+
+这一层证明系统能看见 GPU 和网卡。
+
+常见信息：
+
+| 工具 | 看什么 |
+|---|---|
+| `nvidia-smi` | GPU 数量、型号、驱动、CUDA、温度、功耗 |
+| `nvidia-smi topo -m` | GPU、NIC、CPU NUMA、NVLink/NVSwitch 拓扑 |
+| `ibstat` | IB 设备、端口状态、链路速率 |
+| `ibdev2netdev` | mlx5 设备和网络接口的映射 |
+| `/sys/class/infiniband` | 端口状态、link layer、rate、GID |
+
+这一层很关键，因为 NCCL 经常因为选错网卡而跑到 TCP 或错误的接口上。
+
+### 3.3 单机 GPU 健康层
+
+这一层证明每台机器自己是健康的。
+
+```mermaid
+flowchart LR
+    H["单机健康检查"] --> T["温度"]
+    H --> P["功耗"]
+    H --> E["ECC 错误"]
+    H --> PCIE["PCIe Gen/Width"]
+    H --> C["SM/Mem Clock"]
+    H --> TH["Throttling"]
+    H --> PM["Persistence Mode"]
+```
+
+如果某张卡温度过高、ECC double-bit、PCIe 降级或 throttling，后面的 NCCL 测试即使能跑，结果也不可信。
+
+### 3.4 单机 GPU 性能层
+
+这一层证明每台机器的 GPU 本身性能正常。
+
+| 测试 | 证明什么 |
+|---|---|
+| HBM/D2D 带宽 | GPU 显存和设备间拷贝能力 |
+| H2D/D2H 带宽 | CPU/Host 到 GPU 的 PCIe 路径 |
+| FP32/TF32 | 基础矩阵计算能力 |
+| FP16/BF16/FP8 | 训练常用 Tensor Core 能力 |
+
+这一步是单机验收。它不能证明两台机器之间通信正常，但可以排除“某台机器本身 GPU 算力或带宽异常”。
+
+### 3.5 单机多卡 NCCL 层
+
+这一层验证单台机器 8 卡之间的集体通信。
+
+```mermaid
+flowchart TD
+    S["单机 8 卡 NCCL"] --> AR["AllReduce"]
+    S --> AG["AllGather"]
+    S --> RS["ReduceScatter"]
+    S --> BC["Broadcast"]
+    S --> AT["AllToAll"]
+```
+
+单机 NCCL 主要看 NVLink/NVSwitch 通信路径是否正常。常见指标：
+
+| 指标 | 含义 |
+|---|---|
+| `algbw` | 算法视角的有效带宽 |
+| `busbw` | 总线视角的带宽，更适合比较通信链路利用率 |
+| `#wrong` | 结果错误数量，必须是 0 |
+
+单机测试通过后，只能说明单台服务器内部 8 卡通信正常。
+
+### 3.6 跨机 RDMA 层
+
+这一层验证两台机器之间的网络和 RDMA 能力，不涉及 NCCL。
+
+```mermaid
+sequenceDiagram
+    participant N1 as aikubeworker0012
+    participant FAB as IB/RDMA Fabric
+    participant N2 as aikubeworker0016
+
+    N1->>N2: ping / ssh
+    N1->>FAB: ib_write_bw client
+    FAB->>N2: ib_write_bw server
+    N1->>FAB: ib_read_bw client
+    FAB->>N2: ib_read_bw server
+    N1->>N2: ib_write_lat / ib_read_lat
+```
+
+这一层要回答：
+
+| 问题 | 说明 |
+|---|---|
+| IB 端口是否 Active | 没 Active 就不用跑 NCCL |
+| RDMA 带宽是否达标 | 证明网络数据面能跑起来 |
+| RDMA 延迟是否正常 | 高延迟会影响小消息和训练同步 |
+| 链路层是 InfiniBand 还是 RoCE | 两者环境变量和排障点不同 |
+
+如果 RDMA 层失败，跨机 NCCL 大概率也会失败或退化到 TCP。
+
+### 3.7 跨机 NCCL 层
+
+这一层才是真正的多机多卡 NCCL 测试。
+
+两台 8 卡机器通常是：
+
+```text
+2 nodes x 8 GPUs = 16 ranks
+每个 rank 绑定 1 张 GPU
+```
+
+概念上是：
+
+```mermaid
+flowchart LR
+    subgraph N1["Node 1: 172.72.8.12"]
+        R0["rank 0 / GPU0"]
+        R1["rank 1 / GPU1"]
+        R2["..."]
+        R7["rank 7 / GPU7"]
+    end
+
+    subgraph N2["Node 2: 172.72.8.16"]
+        R8["rank 8 / GPU0"]
+        R9["rank 9 / GPU1"]
+        R10["..."]
+        R15["rank 15 / GPU7"]
+    end
+
+    R0 <--> R8
+    R1 <--> R9
+    R7 <--> R15
+    N1 <--> N2
+```
+
+典型测试项：
+
+| NCCL 测试 | 训练里对应什么 |
+|---|---|
+| AllReduce | 数据并行梯度同步 |
+| ReduceScatter | ZeRO/FSDP 梯度切分 |
+| AllGather | ZeRO/FSDP 参数聚合 |
+| Broadcast | 参数广播、初始化 |
+| AllToAll | MoE、专家并行、部分并行策略 |
+| SendRecv | 点对点通信、pipeline parallel |
+
+跨机 NCCL 要看：
+
+| 指标 | 判定 |
+|---|---|
+| 是否成功启动 16 rank | MPI/SSH/路径/环境是否正常 |
+| `#wrong == 0` | 正确性必须过 |
+| `busbw` | 跨节点通信链路利用率 |
+| 是否走 IB/RDMA | 需要从 `NCCL_DEBUG=INFO` 确认 |
+| 是否退化 TCP | 如果退化，性能会明显偏低 |
+
+## 4. NCCL 为什么要分单机和跨机
+
+单机 8 卡通信和跨机 16 卡通信的瓶颈不同。
+
+```mermaid
+flowchart TD
+    A["NCCL 性能结果"] --> B{"测试范围"}
+    B --> C["单机 8 卡"]
+    B --> D["跨机 16 卡"]
+
+    C --> C1["主要瓶颈：NVLink / NVSwitch"]
+    C --> C2["阈值可参考 GPU NVLink 能力"]
+
+    D --> D1["主要瓶颈：IB/RDMA 网络"]
+    D --> D2["阈值应参考网卡数量、速率、拓扑和 rail 数"]
+```
+
+所以不能用单机 NVLink 的阈值直接判断跨机 NCCL。跨机要根据真实网络能力设阈值，例如：
+
+| 网络配置 | 理论上限理解 |
+|---|---|
+| 单张 400G 网卡 | 约 50 GB/s 单向原始带宽 |
+| 8 张 400G 网卡 | 约 400 GB/s 原始聚合带宽 |
+| 实测 NCCL busbw | 会受拓扑、GDR、rail、NUMA、交换机、NCCL 算法影响 |
+
+实际验收时，应该先知道每台机器有几张 IB/RDMA 网卡、每张速率多少、GPU 到 NIC 的拓扑关系，再定跨机 NCCL 阈值。
+
+## 5. 常见失败位置
+
+```mermaid
+flowchart TD
+    F["跨机 NCCL 失败"] --> A["启动失败"]
+    F --> B["能启动但很慢"]
+    F --> C["运行中 timeout"]
+    F --> D["结果 #wrong 非 0"]
+
+    A --> A1["SSH 不通"]
+    A --> A2["远端路径不存在"]
+    A --> A3["MPI 环境不一致"]
+    A --> A4["root 运行未允许"]
+
+    B --> B1["NCCL_SOCKET_IFNAME 选错"]
+    B --> B2["没走 IB/RDMA，退化 TCP"]
+    B --> B3["NCCL_IB_HCA 没选对"]
+    B --> B4["GPU Direct RDMA 没生效"]
+
+    C --> C1["IB 端口不稳定"]
+    C --> C2["交换机/PFC/ECN 问题"]
+    C --> C3["NCCL timeout 配置"]
+    C --> C4["驱动/CUDA/NCCL 版本不兼容"]
+
+    D --> D1["通信正确性失败"]
+    D --> D2["必须 FAIL，不能只看带宽"]
+```
+
+## 6. 推荐验收顺序
+
+下面是面向两台 8 卡机器的推荐顺序：
+
+```mermaid
+flowchart TD
+    A["Step 1: 两台机器基础信息"] --> B["Step 2: 两台机器单机 GPU 健康"]
+    B --> C["Step 3: 两台机器单机 benchmark"]
+    C --> D["Step 4: 两台机器分别跑单机 8 卡 NCCL"]
+    D --> E["Step 5: 两台机器互测 RDMA bandwidth/latency"]
+    E --> F["Step 6: 两机 16 卡 NCCL correctness"]
+    F --> G["Step 7: 两机 16 卡 NCCL performance"]
+    G --> H["Step 8: 两机训练 demo 或业务压测"]
+```
+
+每一步的意义：
+
+| 步骤 | 目的 |
+|---|---|
+| Step 1 | 确认没有登录错机器，基础网络和环境存在 |
+| Step 2 | 排除 GPU 健康问题 |
+| Step 3 | 排除 GPU 单卡/单机性能问题 |
+| Step 4 | 排除单机 NVLink/NVSwitch/NCCL 问题 |
+| Step 5 | 排除跨机 RDMA 问题 |
+| Step 6 | 先证明 NCCL 正确性 |
+| Step 7 | 再证明 NCCL 性能 |
+| Step 8 | 最后用真实训练形态验证稳定性 |
+
+## 7. 对当前脚本的映射
+
+当前脚本已有模块和上面层次的关系：
+
+| 当前模块 | 覆盖层次 | 备注 |
+|---|---|---|
+| `gpu_info` | 系统识别层 | 单机 |
+| `health` | 单机 GPU 健康层 | 单机 |
+| `benchmark` | 单机 GPU 性能层 | 单机 |
+| `nccl` | 单机多卡通信层 | 当前主要是单机 |
+| `rdma` | RDMA 检查 | 当前偏本机检查，不是两机互测 |
+| `stress` | 稳定性 | 单机 |
+| `training` | 训练负载层 | 当前偏单机 |
+| 建议新增 `multi_node_nccl` | 跨机 NCCL 层 | 专门处理 hostfile、mpirun、多节点环境、结果解析 |
+
+如果未来要扩展脚本，比较自然的方向是新增一个多机模块，而不是把所有逻辑塞进现有 `nccl` 模块。
+
+## 8. 最小概念模型
+
+记住下面三句话即可：
+
+```text
+单机 NCCL 验证 GPU 之间的 NVLink/NVSwitch。
+跨机 RDMA 验证机器之间的网络。
+跨机 NCCL 验证 NCCL 是否能把 GPU 和网络组合起来，为真实训练提供高效通信。
+```
+
+因此，多机多卡测试不是一个命令，而是一条验证链路。
+
diff --git a/gpu_tester.py b/gpu_tester.py
index 4cfa47c..15bc694 100644
--- a/gpu_tester.py
+++ b/gpu_tester.py
@@ -5,6 +5,7 @@ import argparse
 import json
 import os
 import signal
+import socket
 import sys
 import time
 from datetime import datetime
@@ -25,6 +26,8 @@ from modules.nccl_test import NCCLTest
 from modules.training_sim import TrainingSim
 from modules.stress_test import StressTest
 from modules.rdma_test import RDMATest
+from modules.nvlink_test import NVLinkTest
+from modules.dcgm_test import DCGMTest
 from modules.report import ReportGenerator
 from modules.gpu_specs import detect_gpu_type, get_gpu_specs, get_gpu_label, get_supported_gpus, validate_driver_compatibility
 
@@ -32,43 +35,87 @@ DEFAULT_CONFIG = {
     "benchmark": {
         "memory": {"size_mb": 4096, "iterations": 10, "nvbandwidth_buffer_mb": 512, "nvbandwidth_samples": 3},
         "compute": {
-            "dtypes": ["fp32", "tf32", "fp16", "bf16", "fp8"],
-            "matrix_size": 4096,
-            "warmup": 10,
-            "iterations": 100,
+            "dtypes": ["fp32", "tf32", "fp16", "bf16", "fp8", "fp64", "int8"],
+            "matrix_size": 8192,
+            "warmup": 50,
+            "iterations": 500,
+            "use_compile": True,
         },
     },
-    "health": {"temp_warning": 80, "temp_critical": 90, "power_limit": None},
+    "health": {"temp_warning": 75, "temp_critical": 85, "power_limit": None},
     "nccl": {
         "min_bandwidth_gbps": None,
         "test_allreduce": True,
         "test_alltoall": True,
         "test_broadcast": True,
-        "test_reduce_scatter": False,
-        "test_allgather": False,
-        "test_sendrecv": False,
+        "test_reduce_scatter": True,
+        "test_allgather": True,
+        "test_sendrecv": True,
+        "message_sizes": ["1M", "256M", "2G"],
+        "repeats": 3,
+        "max_stddev_pct": 3,
     },
     "stress": {
-        "duration_sec": 60,
+        "duration_sec": 1800,
+        "production_duration_sec": 1800,
+        "use_gpu_burn": False,
         "use_doubles": False,
         "use_tensor_cores": True,
         "memory_pct": 90,
         "gpus": "all",
+        "dtype": "bf16",
+        "matrix_size": 24576,
+        "telemetry_interval_sec": 1,
+        "warmup_sec": 60,
+        "min_steady_samples": 10,
+        "max_temp_c": 80,
+        "max_temp_delta_c": 5,
+        "min_power_watts": 630,
+        "max_tflops_jitter_pct": 5,
+        "require_tflops_jitter": True,
     },
     "rdma": {
-        "min_bandwidth_gbps": 50,
-        "max_latency_us": 10,
+        "min_bandwidth_gbps": 47,
+        "min_port_rate_gbps": 400,
+        "max_latency_us": 3.5,
+        "max_write_latency_us": 2.0,
+        "max_read_latency_us": 3.5,
         "ib_iterations": 1000,
-        "msg_size": 65536,
+        "msg_size": 4194304,
+        "latency_msg_size": 8,
         "ib_device": None,
         "ib_port": 1,
+        "server_addr": None,
+        "ibping_target": None,
+        "ibping_count": 5,
+        "role": "auto",
+        "pfc_ecn_counters": True,
+    },
+    "nvlink": {
+        "expected_links_per_gpu": 18,
+        "expected_link_speed_gbps": 25,
+        "require_zero_errors": True,
+    },
+    "dcgm": {
+        "diag_level": 3,
+        "timeout_sec": 1200,
+        "expected_num_gpus": 8,
+        "json_output": True,
+        "require_subtests": True,
     },
     "training": {
-        "model": "gpt2",
+        "model": "synthetic_1.5b",
         "batch_size": 8,
         "seq_length": 2048,
         "num_steps": 50,
+        "warmup_steps": 5,
         "dtype": "bf16",
+        "mode": "ddp",
+        "synthetic_params_b": 1.5,
+        "min_tokens_per_sec": 45000,
+        "max_step_jitter_pct": 3,
+        "max_peak_memory_gb": 70,
+        "require_distributed": True,
     },
     "report": {"output_dir": "./reports", "format": "json"},
     "tools": {"install_dir": "/opt/gpu-test-tools"},
@@ -131,7 +178,7 @@ def interactive_menu(config: dict):
     if not check_prerequisites(console):
         return
 
-    results_store: dict = {"timestamp": datetime.now().isoformat(), "tests": {}}
+    results_store: dict = {"timestamp": datetime.now().isoformat(), "hostname": socket.gethostname(), "tests": {}}
 
     menu_items = [
         ("1", "GPU Information", "gpu_info"),
@@ -139,10 +186,12 @@ def interactive_menu(config: dict):
         ("3", "Memory Benchmark (nvbandwidth)", "memory_bench"),
         ("4", "Compute Benchmark", "compute_bench"),
         ("5", "NCCL Multi-GPU Test", "nccl"),
-        ("6", "GPU Stress Test (gpu-burn)", "stress"),
+        ("6", "GPU Stress Test (PyTorch/gpu-burn)", "stress"),
         ("7", "RDMA/IB Test", "rdma"),
-        ("8", "Training Simulation", "training"),
-        ("9", "Full Test Suite (All Tests)", "all"),
+        ("8", "NVLink/NVSwitch Test", "nvlink"),
+        ("9", "DCGM Diagnostic", "dcgm"),
+        ("10", "Training Simulation", "training"),
+        ("11", "Full Test Suite (All Tests)", "all"),
         ("0", "Generate Report", "report"),
     ]
 
@@ -164,8 +213,10 @@ def interactive_menu(config: dict):
             "memory_bench": "HBM bandwidth via nvbandwidth",
             "compute_bench": "GEMM TFLOPS across FP32/TF32/FP16/BF16/FP8",
             "nccl": "AllReduce, AllToAll, Broadcast via nccl-tests",
-            "stress": "Long-running GPU stress via gpu-burn",
+            "stress": "Long-running high-power GEMM stress with telemetry",
             "rdma": "InfiniBand bandwidth & latency (ib_write_bw)",
+            "nvlink": "NVLink links, speed, and error counters",
+            "dcgm": "DCGM diag -r 3 production diagnostic",
             "training": "Simulate LLM training with PyTorch",
             "all": "Run all tests sequentially",
             "report": "Export results to JSON/HTML",
@@ -257,6 +308,18 @@ def _run_test(test_name: str, config: dict, console: Console) -> dict:
             m.print_results(result)
             return result
 
+        elif test_name == "nvlink":
+            m = NVLinkTest(config)
+            result = m.run()
+            m.print_results(result)
+            return result
+
+        elif test_name == "dcgm":
+            m = DCGMTest(config)
+            result = m.run()
+            m.print_results(result)
+            return result
+
         elif test_name == "training":
             m = TrainingSim(config)
             result = m.run()
@@ -280,15 +343,17 @@ def _run_test(test_name: str, config: dict, console: Console) -> dict:
 def _run_full_suite(config: dict, console: Console) -> dict:
     """Run all tests sequentially."""
     console.print(Panel("[bold cyan]Running Full Test Suite[/bold cyan]", box=box.DOUBLE))
-    all_results: dict = {"timestamp": datetime.now().isoformat()}
+    all_results: dict = {"timestamp": datetime.now().isoformat(), "hostname": socket.gethostname()}
     tests = [
         ("gpu_info", "GPU Information", GPUInfo),
         ("health", "Health Check", HealthCheck),
         ("memory_bench", "Memory Benchmark", lambda c: Benchmark(c)),
         ("compute_bench", "Compute Benchmark", lambda c: Benchmark(c)),
+        ("nvlink", "NVLink/NVSwitch Test", NVLinkTest),
         ("nccl", "NCCL Test", NCCLTest),
         ("stress", "GPU Stress Test", StressTest),
         ("rdma", "RDMA/IB Test", RDMATest),
+        ("dcgm", "DCGM Diagnostic", DCGMTest),
         ("training", "Training Simulation", TrainingSim),
     ]
 
@@ -313,14 +378,49 @@ def _run_full_suite(config: dict, console: Console) -> dict:
     # Summary
     console.print("\n" + "=" * 60)
     # Only count test results, exclude metadata like timestamp
-    test_results = {k: v for k, v in all_results.items() if k != "timestamp"}
-    passed = sum(1 for v in test_results.values() if not isinstance(v, dict) or "error" not in v)
+    test_results = {k: v for k, v in all_results.items() if k not in ("timestamp", "hostname")}
+    passed = sum(1 for v in test_results.values() if _test_result_passed(v))
     total = len(test_results)
     color = "green" if passed == total else ("yellow" if passed > 0 else "red")
     console.print(f"[bold {color}]Suite complete: {passed}/{total} tests passed[/bold {color}]")
     return all_results
 
 
+def _test_result_passed(result) -> bool:
+    """Strict verdict for exit status: False on error, skip, fallback source, or any failing section."""
+    if not isinstance(result, dict):
+        return True
+    if result.get("error") or result.get("skipped") or result.get("status") == "SKIP":
+        return False
+    if result.get("source") == "torchrun_fallback":
+        return False
+    if "passed" in result:
+        return bool(result.get("passed"))
+    # "memory" and "compute" may share one result: each present section must pass; a non-dict section fails.
+    if "memory" in result:
+        mem = result["memory"]
+        if not isinstance(mem, dict):
+            return False
+        if "passed" in mem:
+            mem_ok = bool(mem["passed"])
+        else:
+            eff = mem.get("d2d_efficiency_pct") or mem.get("efficiency_pct") or 0
+            mem_ok = not (mem.get("error") or mem.get("source") == "pytorch") and eff >= 80
+        if not mem_ok:
+            return False
+    if "compute" in result:
+        comp = result["compute"]
+        if not isinstance(comp, dict):
+            return False
+        if "passed" in comp:
+            return bool(comp["passed"])
+        thresholds = comp.get("pass_thresholds_tflops") or {}
+        per_dtype = comp.get("per_dtype_tflops") or {}
+        dtypes_ok = not any(not isinstance(per_dtype.get(dt), (int, float)) or per_dtype[dt] < limit for dt, limit in thresholds.items())
+        return dtypes_ok and all(isinstance(c, dict) and c.get("passed", False) for c in (comp.get("consistency") or {}).values())
+    return True
+
+
 def main():
     gpu_list_str = " / ".join(g.upper() for g in get_supported_gpus())
     parser = argparse.ArgumentParser(
@@ -335,15 +435,17 @@ Examples:
    python gpu_tester.py --test benchmark --type memory
    python gpu_tester.py --test benchmark --type compute --dtype fp16
    python gpu_tester.py --test nccl            # NCCL test
+   python gpu_tester.py --test nvlink          # NVLink/NVSwitch test
+   python gpu_tester.py --test dcgm            # DCGM diagnostic
    python gpu_tester.py --test training        # Training sim
    python gpu_tester.py --test all             # Full suite
    python gpu_tester.py --report --format json --output report.json
         """,
     )
-    parser.add_argument("--test", choices=["gpu-info", "health", "benchmark", "nccl", "stress", "rdma", "training", "all"],
+    parser.add_argument("--test", choices=["gpu-info", "health", "benchmark", "nccl", "stress", "rdma", "nvlink", "dcgm", "training", "all"],
                         help="Run a specific test")
     parser.add_argument("--type", choices=["memory", "compute"], help="Benchmark type (with --test benchmark)")
-    parser.add_argument("--dtype", choices=["fp32", "tf32", "fp16", "bf16", "fp8"],
+    parser.add_argument("--dtype", choices=["fp32", "tf32", "fp16", "bf16", "fp8", "fp64", "int8"],
                         help="Compute benchmark dtype (with --test benchmark --type compute)")
     parser.add_argument("--interactive", action="store_true", help="Force interactive mode")
     parser.add_argument("--report", action="store_true", help="Generate report from last results")
@@ -399,6 +501,8 @@ Examples:
         "nccl": "nccl",
         "stress": "stress",
         "rdma": "rdma",
+        "nvlink": "nvlink",
+        "dcgm": "dcgm",
         "training": "training",
         "all": "all",
     }
@@ -415,19 +519,30 @@ Examples:
             result = bench.run()
             Benchmark.print_results(result)
         if args.report:
-            ReportGenerator(config).generate({"benchmark": result, "timestamp": datetime.now().isoformat()},
+            ReportGenerator(config).generate({
+                "benchmark": result,
+                "timestamp": datetime.now().isoformat(),
+                "hostname": socket.gethostname(),
+            },
                                              fmt=args.format, output=args.output)
+        sys.exit(0 if _test_result_passed(result) else 1)
     elif args.test == "all":
         results = _run_full_suite(config, console)
         if args.report:
             ReportGenerator(config).generate(results, fmt=args.format, output=args.output)
-        has_errors = any("error" in v for v in results.values() if isinstance(v, dict))
-        sys.exit(1 if has_errors else 0)
+        failed = any(not _test_result_passed(v) for k, v in results.items() if k not in ("timestamp", "hostname"))
+        sys.exit(1 if failed else 0)
     else:
         result = _run_test(test_map[args.test], config, console)
         if args.report and result:
-            ReportGenerator(config).generate({args.test: result, "timestamp": datetime.now().isoformat()},
+            report_key = test_map[args.test] or args.test
+            ReportGenerator(config).generate({
+                report_key: result,
+                "timestamp": datetime.now().isoformat(),
+                "hostname": socket.gethostname(),
+            },
                                              fmt=args.format, output=args.output)
+        sys.exit(0 if _test_result_passed(result) else 1)
 
 
 if __name__ == "__main__":
diff --git a/modules/dcgm_test.py b/modules/dcgm_test.py
new file mode 100644
index 0000000..e7b4f49
--- /dev/null
+++ b/modules/dcgm_test.py
@@ -0,0 +1,231 @@
+"""DCGM diagnostic acceptance wrapper."""
+
+import json
+import os
+import re
+import shutil
+import signal
+import subprocess
+from datetime import datetime
+from typing import Optional
+
+from rich.console import Console
+from rich.table import Table
+
+
+class DCGMTest:
+    def __init__(self, config: dict):  # config: the full tester config; only its "dcgm" section is read here
+        self.config = config
+        self.console = Console()
+        self.cfg = config.get("dcgm", {})  # a missing "dcgm" section becomes {} so run() falls back to its own defaults
+
+    def run(self) -> dict:  # Run `dcgmi diag` once and return a result dict whose "passed" key is the verdict.
+        dcgmi = shutil.which("dcgmi")
+        if not dcgmi:  # tool not on PATH: return a failed result instead of raising
+            return {
+                "passed": False,
+                "error": "dcgmi not found",
+                "timestamp": datetime.now().isoformat(),
+            }
+
+        level = str(self.cfg.get("diag_level", 3))
+        timeout = int(self.cfg.get("timeout_sec", 1200))
+        cmd = [dcgmi, "diag", "-r", level]
+        expected_gpus = self.cfg.get("expected_num_gpus")
+        if expected_gpus:  # a falsy value (None, 0) leaves the -n argument off
+            cmd.extend(["-n", f"gpu:{int(expected_gpus)}"])
+        if self.cfg.get("json_output", True):
+            cmd.append("-j")
+
+        try:
+            r = self._run_with_process_group_timeout(cmd, timeout)
+        except subprocess.TimeoutExpired as e:
+            output = ((e.output or "") + "\n" + (e.stderr or "")).strip()  # keep whatever was captured before the kill
+            return {
+                "passed": False,
+                "error": f"dcgmi diag -r {level} timeout after {timeout}s",
+                "command": cmd,
+                "raw_output_tail": output[-8000:],
+                "timestamp": datetime.now().isoformat(),
+            }
+
+        output = r.stdout + "\n" + r.stderr
+        subtests = self._parse_json_output(output) or self._parse_output(output)  # text parser is used only when JSON parsing yields nothing
+        strict_statuses = {"PASS"}
+        failed = [s for s in subtests if s["status"] not in strict_statuses]  # any status other than PASS (WARN, SKIP, ERROR, FAIL) counts as failed
+        require_subtests = bool(self.cfg.get("require_subtests", True))
+        passed = r.returncode == 0 and not failed and (bool(subtests) or not require_subtests)  # zero parsed subtests fails unless require_subtests is off
+        return {
+            "passed": passed,
+            "returncode": r.returncode,
+            "level": int(level),
+            "command": cmd,
+            "expected_num_gpus": int(expected_gpus) if expected_gpus else None,
+            "subtests": subtests,
+            "raw_output_tail": output[-8000:],  # only the last 8000 characters are kept
+            "timestamp": datetime.now().isoformat(),
+        }
+
+    @staticmethod
+    def _run_with_process_group_timeout(cmd: list[str], timeout: int) -> subprocess.CompletedProcess:  # run cmd; on timeout signal its process group and re-raise TimeoutExpired
+        proc = subprocess.Popen(
+            cmd,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            text=True,
+            start_new_session=True,  # child leads its own session, so proc.pid is passed to killpg below as the group id
+        )
+        try:
+            stdout, stderr = proc.communicate(timeout=timeout)
+        except subprocess.TimeoutExpired as e:
+            try:
+                os.killpg(proc.pid, signal.SIGTERM)  # first ask the whole group to exit
+                stdout, stderr = proc.communicate(timeout=10)
+            except subprocess.TimeoutExpired:
+                os.killpg(proc.pid, signal.SIGKILL)  # still running 10 s after SIGTERM: force-kill the group
+                stdout, stderr = proc.communicate(timeout=10)
+            raise subprocess.TimeoutExpired(cmd, timeout, output=stdout, stderr=stderr) from e  # hands the collected output to the caller
+        return subprocess.CompletedProcess(cmd, proc.returncode, stdout, stderr)
+
+    @classmethod
+    def _parse_json_output(cls, output: str) -> list[dict]:  # Extract {"name", "status", "raw"} subtests from JSON output; [] when no JSON parses.
+        text = output.strip()
+        if not text:
+            return []
+        try:
+            payload = json.loads(text)
+        except json.JSONDecodeError:
+            m = re.search(r"(\{.*\})", text, re.S)  # greedy: spans from the first "{" to the last "}" to skip surrounding non-JSON text
+            if not m:
+                return []
+            try:
+                payload = json.loads(m.group(1))
+            except json.JSONDecodeError:
+                return []
+
+        dcgm_payload = payload.get("DCGM Diagnostic") if isinstance(payload, dict) else None
+        if isinstance(dcgm_payload, dict):
+            parsed = cls._parse_dcgm_diagnostic_json(dcgm_payload)  # structured layout is tried first
+            if parsed:
+                return parsed
+
+        subtests = []  # filled by walk() below: generic fallback for any other JSON shape
+
+        def walk(node, path: list[str]):  # depth-first visit; path holds the names/keys/indices leading to node
+            if isinstance(node, dict):
+                node_name = (
+                    node.get("name")
+                    or node.get("testName")
+                    or node.get("test_name")
+                    or node.get("category")
+                    or node.get("category_name")
+                )
+                child_path = [*path, str(node_name)] if node_name else path
+                status = node.get("status") or node.get("result") or node.get("Result")
+                if isinstance(status, str):
+                    name = (
+                        node_name
+                        or " / ".join(path[-3:])  # unnamed node: label it with the last three path elements
+                    )
+                    normalized = cls._normalize_status(status)
+                    if normalized:  # unrecognised status strings normalise to "" and are skipped
+                        subtests.append({
+                            "name": str(name)[:160],
+                            "status": normalized,
+                            "raw": json.dumps(node, default=str)[:1000],
+                        })
+                for key, value in node.items():
+                    walk(value, [*child_path, str(key)])
+            elif isinstance(node, list):
+                for idx, item in enumerate(node):
+                    walk(item, [*path, str(idx)])
+
+        walk(payload, [])
+        return subtests
+
+    @classmethod
+    def _parse_dcgm_diagnostic_json(cls, payload: dict) -> list[dict]:  # Flatten test_categories -> tests -> results into subtest dicts.
+        subtests = []
+        for category in payload.get("test_categories", []) or []:  # "or []" also covers an explicit null value
+            category_name = str(category.get("category") or "DCGM")
+            for test in category.get("tests", []) or []:
+                test_name = str(test.get("name") or "unnamed")
+                for result in test.get("results", []) or []:
+                    status = cls._normalize_status(str(result.get("status", "")))
+                    if not status:  # missing or unrecognised status: entry is dropped
+                        continue
+                    entity_group = result.get("entity_group") or "entity"
+                    entity_id = result.get("entity_id", "unknown")
+                    name = f"{category_name}/{test_name}/{entity_group}{entity_id}"
+                    subtests.append({
+                        "name": name[:160],
+                        "status": status,
+                        "raw": json.dumps(result, default=str)[:1000],
+                    })
+                summary = test.get("test_summary") or {}
+                status = cls._normalize_status(str(summary.get("status", "")))
+                if status:  # the per-test summary is recorded as one extra "<category>/<test>/summary" entry
+                    subtests.append({
+                        "name": f"{category_name}/{test_name}/summary"[:160],
+                        "status": status,
+                        "raw": json.dumps(summary, default=str)[:1000],
+                    })
+        return subtests
+
+    @staticmethod
+    def _normalize_status(status: str) -> str:
+        """Map a raw status word onto PASS/FAIL/ERROR/WARN/SKIP; unknown words become ""."""
+        # Each canonical verdict lists every spelling that collapses onto it.
+        canonical = {
+            "PASS": ("PASS", "PASSED", "OK"),
+            "FAIL": ("FAIL", "FAILED"),
+            "ERROR": ("ERROR",),
+            "WARN": ("WARN", "WARNING"),
+            "SKIP": ("SKIP", "SKIPPED", "NOT_RUN", "NOT RUN"),
+        }
+        # Matching ignores surrounding whitespace and letter case.
+        token = status.strip().upper()
+        for verdict, spellings in canonical.items():
+            if token in spellings:
+                return verdict
+        # Anything else, including an empty string, is not a recognised status.
+        return ""
+
+    @staticmethod
+    def _parse_output(output: str) -> list[dict]:
+        """Scrape "<name> : STATUS" or "STATUS - <name>" pairs from plain-text output, line by line."""
+        found = []
+        for text in (raw_line.strip() for raw_line in output.splitlines()):
+            if not text:
+                continue
+            # Preferred shape: name first, then a ":" or "|" separator, then the status word.
+            name_first = re.search(r"(.+?)\s*[:|]\s*(PASS|FAIL|WARN|ERROR|SKIP)\b", text, re.I)
+            # Fallback shape, tried only when the first one misses: status word first, then the name.
+            status_first = None if name_first else re.search(r"\b(PASS|FAIL|WARN|ERROR|SKIP)\b\s*[-:|]\s*(.+)", text, re.I)
+            if name_first:
+                label, word = name_first.group(1).strip(" .|-"), name_first.group(2)
+            elif status_first:
+                word, label = status_first.group(1), status_first.group(2).strip()
+            else:
+                continue
+            # Entries whose name is empty or 160+ characters long are dropped.
+            if label and len(label) < 160:
+                found.append({"name": label, "status": DCGMTest._normalize_status(word), "raw": text})
+        return found
+
+    @staticmethod
+    def print_results(results: dict, console: Optional[Console] = None):
+        """Print the error if one is set, otherwise the overall verdict and a per-subtest table."""
+        out = console or Console()
+        if results.get("error"):
+            out.print(f"[bold red]DCGM error: {results['error']}[/bold red]")
+            return
+        out.print("[bold green]✓ DCGM diag PASSED[/bold green]" if results.get("passed", False) else "[bold red]✗ DCGM diag FAILED[/bold red]")
+        rows = results.get("subtests", [])
+        if rows:
+            grid = Table(box=None, padding=(0, 1))
+            grid.add_column("Subtest")
+            grid.add_column("Status", style="bold")
+            for entry in rows:
+                grid.add_row(entry.get("name", ""), entry.get("status", ""))
+            out.print(grid)
diff --git a/modules/health_check.py b/modules/health_check.py
index dd64071..1e446f6 100644
--- a/modules/health_check.py
+++ b/modules/health_check.py
@@ -171,6 +171,10 @@ class HealthCheck:
             gpu_health.append({"index": i, "status": worst, "checks": checks})
 
         system_health = self._check_system()
+        for key in ("fabricmanager", "retired_pages", "kernel_errors"):
+            item = system_health.get(key, {})
+            if isinstance(item, dict) and item.get("status") == "FAIL":
+                overall_pass = False
 
         return {
             "passed": overall_pass,
@@ -228,6 +232,9 @@ class HealthCheck:
             rdma_devs = os.listdir("/sys/class/infiniband_verbs")
 
         nccl_env = {k: v for k, v in os.environ.items() if k.startswith("NCCL_")}
+        fabric = self._check_fabricmanager()
+        retired = self._check_retired_pages()
+        kernel_errors = self._check_kernel_errors()
 
         return {
             "nvidia_persistenced": {"installed": persistd, "running": persistd_running},
@@ -238,6 +245,41 @@ class HealthCheck:
             "infiniband_devices": ib_devs,
             "rdma_devices": rdma_devs,
             "nccl_env_vars": nccl_env,
+            "fabricmanager": fabric,
+            "retired_pages": retired,
+            "kernel_errors": kernel_errors,
+        }
+
+    def _check_fabricmanager(self) -> dict:
+        """Report the nvidia-fabricmanager unit state and whether its last 200 journal lines mention errors."""
+        unit_state = self._run_cmd(["systemctl", "is-active", "nvidia-fabricmanager"], timeout=5)
+        journal = (self._run_cmd(["journalctl", "-u", "nvidia-fabricmanager", "-n", "200", "--no-pager"], timeout=10) or "").upper()
+        report = {
+            "active": unit_state == "active",
+            "has_error_logs": any(marker in journal for marker in ("ERROR", "FAILED")),
+        }
+        report["status"] = "FAIL" if report["has_error_logs"] or not report["active"] else "PASS"
+        return report
+
+    def _check_retired_pages(self) -> dict:
+        """Sum retired-page counts from nvidia-smi and flag a pending blacklist; either one is a FAIL."""
+        import re  # function-scope import instead of repeating __import__("re") inline
+        raw = self._run_cmd(["nvidia-smi", "-q", "-d", "PAGE_RETIREMENT"], timeout=30) or ""
+        # nvidia-smi prints the counts on "Single/Double Bit ECC" rows beneath a bare "Retired Pages"
+        # heading, so those rows are matched too; non-numeric values such as "N/A" never match.
+        total = sum(int(n) for n in re.findall(r"(?:Retired Pages|(?:Single|Double) Bit ECC)[^:\n]*:\s*(\d+)", raw, flags=re.I))
+        pending = bool(re.search(r"Pending Page Blacklist\s*:\s*Yes", raw, flags=re.I))
+        status = "PASS" if total == 0 and not pending else "FAIL"
+        return {"retired_pages": total, "pending_blacklist": pending, "status": status}
+
+    def _check_kernel_errors(self) -> dict:
+        """Scan error-level dmesg lines for XID/AER/PCIE/NVRM keywords; any hit makes the check FAIL."""
+        raw = self._run_cmd(["dmesg", "--ctime", "--level=err,crit,alert,emerg"], timeout=10) or ""
+        hits = [line for line in raw.splitlines() if any(k in line.upper() for k in ("XID", "AER", "PCIE", "NVRM"))]
+        return {
+            "count": len(hits),
+            "tail": hits[-20:],
+            "status": "PASS" if not hits else "FAIL",
         }
 
     @staticmethod
diff --git a/modules/nccl_test.py b/modules/nccl_test.py
index fd9ab6a..9bc47d1 100644
--- a/modules/nccl_test.py
+++ b/modules/nccl_test.py
@@ -5,6 +5,8 @@ import os
 import re
 import shutil
 import subprocess
+import statistics
+import sys
 from datetime import datetime
 from typing import Optional
 
@@ -70,6 +72,38 @@ class NCCLTest:
                 return p
         return None
 
+    def _message_sizes(self) -> list[str]:
+        return [*(self.nccl_cfg.get("message_sizes") or ("1M", "256M", "2G"))]  # configured sizes, else the default sweep
+
+    def _repeats(self) -> int:
+        return int(self.nccl_cfg["repeats"]) if "repeats" in self.nccl_cfg else 3  # runs per message size
+
+    def _max_stddev_pct(self) -> float:
+        return float(self.nccl_cfg["max_stddev_pct"]) if "max_stddev_pct" in self.nccl_cfg else 3.0  # allowed run-to-run stddev, percent
+
+    def _runtime_env(self) -> dict:
+        """Return a copy of os.environ with NCCL_DEBUG=WARN and existing NCCL lib dirs prepended to LD_LIBRARY_PATH."""
+        child_env = dict(os.environ)
+        child_env["NCCL_DEBUG"] = "WARN"
+
+        # Candidate order: NCCL_HOME (environment, then config), every sys.path entry, the interpreter's site-packages.
+        home = child_env.get("NCCL_HOME") or self.nccl_cfg.get("nccl_home")
+        candidates = [os.path.join(str(home), "lib")] if home else []
+        candidates += [os.path.join(entry, "nvidia", "nccl", "lib") for entry in sys.path]
+        prefix = os.path.dirname(os.path.dirname(sys.executable))
+        candidates += glob.glob(os.path.join(prefix, "lib", "python*", "site-packages", "nvidia", "nccl", "lib"))
+
+        # Keep only directories that exist; the first occurrence of a path wins.
+        found = []
+        for candidate in candidates:
+            if not candidate or not os.path.isdir(candidate) or candidate in found:
+                continue
+            found.append(candidate)
+        previous = child_env.get("LD_LIBRARY_PATH", "")
+        if found:
+            child_env["LD_LIBRARY_PATH"] = ":".join((found + [previous]) if previous else found)
+        return child_env
+
     def run(self) -> dict:
         gpu_count = 0
         if TORCH_AVAILABLE:
@@ -89,7 +123,7 @@ class NCCLTest:
         if self.nccl_cfg.get("test_reduce_scatter", False):
             tests.append(("reduce_scatter_perf", "ReduceScatter"))
         if self.nccl_cfg.get("test_allgather", False):
-            tests.append(("allgather_perf", "AllGather"))
+            tests.append(("all_gather_perf", "AllGather"))
         if self.nccl_cfg.get("test_sendrecv", False):
             tests.append(("sendrecv_perf", "SendRecv"))
 
@@ -170,39 +204,7 @@ class NCCLTest:
         if not binary:
             return {"status": "SKIP", "error": f"{binary_name} not found"}
 
-        cmd = [
-            binary,
-            "-b", "8M",
-            "-e", "8G",
-            "-f", "2",
-            "-g", str(gpu_count),
-            "-w", "5",
-            "-n", "20",
-        ]
-
-        try:
-            env = os.environ.copy()
-            env["NCCL_DEBUG"] = "WARN"
-            r = subprocess.run(cmd, capture_output=True, text=True, timeout=180, env=env)
-
-            combined = r.stdout + r.stderr
-            # Check for NCCL/CUDA compatibility errors
-            if "CUDA driver version is insufficient" in combined or \
-               "Test NCCL failure" in combined:
-                error_msg = "NCCL/CUDA driver version mismatch" \
-                    if "CUDA driver version" in combined \
-                    else "NCCL test failure (library incompatibility)"
-                return {"status": "FAIL", "error": error_msg}
-
-            if r.returncode != 0:
-                return {"status": "FAIL", "error": r.stderr[:300]}
-
-            return self._parse_nccl_output(r.stdout, min_bw)
-
-        except subprocess.TimeoutExpired:
-            return {"status": "FAIL", "error": "timeout"}
-        except Exception as e:
-            return {"status": "FAIL", "error": str(e)}
+        return self._run_nccl_matrix([binary, "-g", str(gpu_count)], min_bw)
 
     def _run_one_nccl_test_mpirun(self, binary_name: str, label: str,
                                    gpu_count: int, mpirun: str, min_bw: float) -> dict:
@@ -218,37 +220,64 @@ class NCCLTest:
             "-x", "NCCL_DEBUG=WARN",
             "-x", "CUDA_VISIBLE_DEVICES=" + ",".join(str(i) for i in range(gpu_count)),
             binary,
-            "-b", "8",
-            "-e", "256M",
-            "-f", "2",
             "-g", "1",
-            "-w", "5",
-            "-n", "20",
         ]
 
+        return self._run_nccl_matrix(cmd, min_bw)
+
+    def _run_nccl_matrix(self, base_cmd: list[str], min_bw: float) -> dict:  # run each message size several times and grade bus bandwidth
+        size_results = []  # one summary dict per message size
+        failures = []  # one entry per individual run that errored
+        env = self._runtime_env()
+
         try:
-            env = os.environ.copy()
-            env["NCCL_DEBUG"] = "WARN"
-            r = subprocess.run(cmd, capture_output=True, text=True, timeout=180, env=env)
-
-            combined = r.stdout + r.stderr
-            if "CUDA driver version is insufficient" in combined or \
-               "Test NCCL failure" in combined:
-                error_msg = "NCCL/CUDA driver version mismatch" \
-                    if "CUDA driver version" in combined \
-                    else "NCCL test failure (library incompatibility)"
-                return {"status": "FAIL", "error": error_msg}
-
-            if r.returncode != 0:
-                return {"status": "FAIL", "error": r.stderr[:300]}
-
-            return self._parse_nccl_output(r.stdout, min_bw)
+            for size in self._message_sizes():
+                runs = []  # bus bandwidth of each successful repeat at this size
+                for _ in range(self._repeats()):
+                    cmd = [*base_cmd, "-b", size, "-e", size, "-f", "2", "-w", "5", "-n", "20"]  # -b and -e get the same value (assumes nccl-tests min/max-bytes flags)
+                    r = subprocess.run(cmd, capture_output=True, text=True, timeout=300, env=env)
+                    combined = r.stdout + r.stderr
+                    if "CUDA driver version is insufficient" in combined or "Test NCCL failure" in combined:
+                        failures.append({"size": size, "error": "NCCL/CUDA/library failure"})
+                        continue  # failed run: recorded in failures, left out of the statistics
+                    if r.returncode != 0:
+                        failures.append({"size": size, "error": r.stderr[:300]})
+                        continue  # same handling for a non-zero exit code
+                    parsed = self._parse_nccl_output(r.stdout, min_bw)
+                    runs.append(parsed.get("best_busbw_gbps", 0))  # a result without this key counts as 0
+                if runs:
+                    worst = min(runs)
+                    mean = sum(runs) / len(runs)
+                    std_pct = (statistics.pstdev(runs) / mean * 100) if len(runs) > 1 and mean else 0  # population stddev as % of mean; 0 for a single run or zero mean
+                    size_results.append({
+                        "size": size,
+                        "runs_busbw_gbps": [round(v, 1) for v in runs],
+                        "worst_busbw_gbps": round(worst, 1),
+                        "mean_busbw_gbps": round(mean, 1),
+                        "stddev_pct": round(std_pct, 2),
+                        "status": "PASS" if worst >= min_bw and std_pct <= self._max_stddev_pct() else "FAIL",  # slowest run must meet min_bw and jitter must stay within the limit
+                    })
+                else:
+                    size_results.append({"size": size, "status": "FAIL", "runs_busbw_gbps": []})  # no successful run at this size
 
         except subprocess.TimeoutExpired:
             return {"status": "FAIL", "error": "timeout"}
         except Exception as e:
             return {"status": "FAIL", "error": str(e)}
 
+        best_bus = max((r.get("mean_busbw_gbps", 0) for r in size_results), default=0)  # highest per-size mean, not the single fastest run
+        worst_bus = min((r.get("worst_busbw_gbps", 0) for r in size_results if r.get("runs_busbw_gbps")), default=0)  # sizes without successful runs are ignored here
+        passed = bool(size_results) and all(r.get("status") == "PASS" for r in size_results) and not failures  # any failed run or failing size fails the whole test
+        return {
+            "status": "PASS" if passed else "FAIL",
+            "best_busbw_gbps": round(best_bus, 1),
+            "worst_busbw_gbps": round(worst_bus, 1),
+            "min_required_gbps": min_bw,
+            "max_stddev_pct": self._max_stddev_pct(),
+            "by_size": size_results,
+            "failures": failures,
+        }
+
     @staticmethod
     def _parse_nccl_output(stdout: str, min_bw: float) -> dict:
         """Parse nccl-tests tabular output and extract bandwidth results."""
@@ -363,7 +392,7 @@ dist.destroy_process_group()
             r = subprocess.run(
                 [torchrun_cmd, f"--nproc_per_node={gpu_count}", tmp.name],
                 capture_output=True, text=True, timeout=120,
-                env={**os.environ, "NCCL_DEBUG": "WARN"},
+                env=self._runtime_env(),
             )
             os.unlink(tmp.name)
             
@@ -390,10 +419,15 @@ dist.destroy_process_group()
                 }
             
             return {
-                "passed": all_passed,
+                # torchrun fallback is a functional smoke only. It never proves
+                # production bus bandwidth, so it must not satisfy acceptance.
+                "passed": False,
+                "functional_passed": all_passed,
                 "source": "torchrun_fallback",
                 "tests": tests,
                 "gpu_count": gpu_count,
+                "error": None if all_passed else "torchrun functional NCCL smoke failed",
+                "acceptance_gap": "nccl-tests bus bandwidth was not measured",
             }
         except Exception as e:
             return {"passed": False, "source": "torchrun_fallback", "error": str(e)}
@@ -410,7 +444,8 @@ dist.destroy_process_group()
         
         if source == "torchrun_fallback":
             # Connectivity check mode
-            verdict = "[bold green]✓ NCCL Connectivity OK[/bold green]" if passed else "[bold red]✗ NCCL Connectivity FAILED[/bold red]"
+            functional = results.get("functional_passed", passed)
+            verdict = "[bold yellow]⚠ NCCL bus BW NOT VERIFIED[/bold yellow]" if functional else "[bold red]✗ NCCL Connectivity FAILED[/bold red]"
             c.print(f"{verdict} [dim](basic check via torchrun)[/dim]")
             
             tests = results.get("tests", {})
@@ -427,7 +462,7 @@ dist.destroy_process_group()
                     else:
                         c.print(f"  [{s_color}]{op_name}[/{s_color}]")
             
-            c.print("\n[yellow]Note: functional connectivity test only (no performance data)[/yellow]")
+            c.print("\n[yellow]Note: functional connectivity test only (no bus bandwidth data; acceptance FAIL)[/yellow]")
         else:
             # nccl-tests mode
             verdict = "[bold green]✓ NCCL tests PASSED[/bold green]" if passed else "[bold yellow]⚠ NCCL tests WARNING[/bold yellow]"
@@ -448,12 +483,16 @@ dist.destroy_process_group()
                 if by_size:
                     t = Table(box=None, padding=(0, 1))
                     t.add_column("Size", style="bold", justify="right")
-                    t.add_column("Time (us)", justify="right")
-                    t.add_column("Alg BW (GB/s)", justify="right")
-                    t.add_column("Bus BW (GB/s)", justify="right")
+                    t.add_column("Worst Bus BW", justify="right")
+                    t.add_column("Mean Bus BW", justify="right")
+                    t.add_column("StdDev", justify="right")
+                    t.add_column("Status", justify="right")
                     for r in by_size:
-                        sz = r.get("size", 0)
-                        sz_str = f"{sz/1024:.0f}K" if sz < 1048576 else f"{sz/1048576:.0f}M"
-                        t.add_row(sz_str, f"{r.get('time_us',0):.1f}",
-                                  f"{r.get('algbw_gbps',0):.1f}", f"{r.get('busbw_gbps',0):.1f}")
+                        t.add_row(
+                            str(r.get("size", "")),
+                            f"{r.get('worst_busbw_gbps', 0):.1f}",
+                            f"{r.get('mean_busbw_gbps', 0):.1f}",
+                            f"{r.get('stddev_pct', 0):.2f}%",
+                            r.get("status", "?"),
+                        )
                     c.print(t)
diff --git a/modules/nvlink_test.py b/modules/nvlink_test.py
new file mode 100644
index 0000000..ecf257b
--- /dev/null
+++ b/modules/nvlink_test.py
@@ -0,0 +1,228 @@
+"""NVLink / NVSwitch production acceptance checks."""
+
+import re
+import shutil
+import subprocess
+from datetime import datetime
+from typing import Optional
+
+from rich.console import Console
+from rich.table import Table
+
+
+class NVLinkTest:
+    """Grade per-GPU NVLink state, speed and error counters from ``nvidia-smi nvlink``."""
+
+    def __init__(self, config: dict):
+        self.config = config
+        self.console = Console()
+        # Thresholds are read from the optional "nvlink" section of the config.
+        self.cfg = config.get("nvlink", {})
+
+    def _run(self, args: list[str], timeout: int = 60) -> tuple[int, str, str]:
+        """Run ``nvidia-smi <args>`` and return ``(returncode, stdout, stderr)``.
+
+        Launch problems are reported through the return code instead of an
+        exception: 127 when the binary is missing, 126 when it cannot be
+        executed and 124 when it exceeds ``timeout`` seconds.
+        """
+        if not shutil.which("nvidia-smi"):
+            return 127, "", "nvidia-smi not found"
+        try:
+            r = subprocess.run(["nvidia-smi", *args], capture_output=True, text=True, timeout=timeout)
+        except subprocess.TimeoutExpired:
+            return 124, "", f"nvidia-smi {' '.join(args)} timed out after {timeout}s"
+        except OSError as exc:
+            return 126, "", f"nvidia-smi could not be executed: {exc}"
+        return r.returncode, r.stdout, r.stderr
+
+    def run(self) -> dict:
+        """Query NVLink status, speed and error counters and grade every GPU.
+
+        Returns a dict with an overall ``passed`` flag and a ``timestamp``.
+        When the status query fails or yields no links the dict carries an
+        ``error`` message; otherwise it carries the thresholds used, one entry
+        per GPU under ``gpus`` and the tail of each raw command output.
+        """
+        expected_links = int(self.cfg.get("expected_links_per_gpu", 18))
+        expected_speed = float(self.cfg.get("expected_link_speed_gbps", 25))
+        require_zero_errors = bool(self.cfg.get("require_zero_errors", True))
+
+        rc_s, out_s, err_s = self._run(["nvlink", "-s"])
+        if rc_s != 0:
+            # Without link status nothing can be graded, so skip the other queries.
+            return {
+                "passed": False,
+                "error": (err_s or out_s or "nvidia-smi nvlink -s failed")[:1000],
+                "timestamp": datetime.now().isoformat(),
+            }
+        rc_c, out_c, _ = self._run(["nvlink", "-c"])
+        rc_e, out_e, _ = self._run(["nvlink", "-e"])
+
+        links = self._parse_status(out_s)
+        if not links:
+            return {
+                "passed": False,
+                "error": "no NVLink status entries parsed from nvidia-smi nvlink -s",
+                "raw_status": out_s[-4000:],
+                "timestamp": datetime.now().isoformat(),
+            }
+        speeds = self._parse_speeds(out_c) if rc_c == 0 else {}
+        # Speeds printed by the status query only fill gaps left by "nvlink -c".
+        for gpu, gpu_speeds in self._parse_speeds(out_s).items():
+            known = speeds.setdefault(gpu, {})
+            for lid, speed in gpu_speeds.items():
+                known.setdefault(lid, speed)
+        errors = self._parse_errors(out_e) if rc_e == 0 else {}
+
+        gpu_results = []
+        overall = True
+        for gpu, gpu_links in sorted(links.items(), key=lambda x: int(x[0])):
+            active = sum(1 for info in gpu_links.values() if info.get("active"))
+            inactive = [lid for lid, info in gpu_links.items() if not info.get("active")]
+            speed_bad = []
+            for lid in gpu_links:
+                speed = speeds.get(gpu, {}).get(lid)
+                # A link without a reported speed is not counted as slow.
+                if speed is not None and speed < expected_speed:
+                    speed_bad.append({"link": lid, "speed_gbps": speed})
+            err_bad = []
+            if require_zero_errors:
+                for lid, counters in errors.get(gpu, {}).items():
+                    total = sum(v for v in counters.values() if isinstance(v, int))
+                    if total:
+                        err_bad.append({"link": lid, "counters": counters})
+
+            passed = active == expected_links and not inactive and not speed_bad and not err_bad
+            if not passed:
+                overall = False
+            gpu_results.append({
+                "gpu": int(gpu),
+                "active_links": active,
+                "expected_links": expected_links,
+                "inactive_links": inactive,
+                "speed_issues": speed_bad,
+                "error_issues": err_bad,
+                "passed": passed,
+            })
+
+        return {
+            "passed": overall,
+            "expected_links_per_gpu": expected_links,
+            "expected_link_speed_gbps": expected_speed,
+            "require_zero_errors": require_zero_errors,
+            "gpus": gpu_results,
+            "raw_status": out_s[-4000:],
+            "raw_speed": out_c[-4000:] if out_c else "",
+            "raw_errors": out_e[-4000:] if out_e else "",
+            "timestamp": datetime.now().isoformat(),
+        }
+
+    @staticmethod
+    def _parse_status(text: str) -> dict[str, dict[str, dict]]:
+        """Parse ``nvlink -s`` output into ``{gpu: {link: {"state", "active", "raw"}}}``.
+
+        A link line naming a state keyword is recorded with that state; a link
+        line that only reports a ``GB/s`` figure is recorded as active.
+        """
+        result: dict[str, dict[str, dict]] = {}
+        gpu = None
+        for line in text.splitlines():
+            m_gpu = re.search(r"GPU\s+(\d+)", line, re.I)
+            if m_gpu:
+                gpu = m_gpu.group(1)
+                result.setdefault(gpu, {})
+                continue
+            if gpu is None:
+                continue
+            m_link = re.search(r"Link\s+(\d+).*?(Active|Inactive|Disabled|Off|Down)", line, re.I)
+            if m_link:
+                state = m_link.group(2)
+                result[gpu][m_link.group(1)] = {
+                    "state": state,
+                    "active": state.lower() == "active",
+                    "raw": line.strip(),
+                }
+                continue
+            m_speed = re.search(r"Link\s+(\d+).*?([0-9.]+)\s*GB/s", line, re.I)
+            if m_speed:
+                result[gpu][m_speed.group(1)] = {
+                    "state": "Active",
+                    "active": True,
+                    "raw": line.strip(),
+                }
+        return result
+
+    @staticmethod
+    def _parse_speeds(text: str) -> dict[str, dict[str, float]]:
+        """Parse per-link ``GB/s`` figures into ``{gpu: {link: speed}}``."""
+        result: dict[str, dict[str, float]] = {}
+        gpu = None
+        for line in text.splitlines():
+            m_gpu = re.search(r"GPU\s+(\d+)", line, re.I)
+            if m_gpu:
+                gpu = m_gpu.group(1)
+                result.setdefault(gpu, {})
+                continue
+            if gpu is None:
+                continue
+            m_link = re.search(r"Link\s+(\d+).*?([0-9.]+)\s*GB/s", line, re.I)
+            if m_link:
+                result[gpu][m_link.group(1)] = float(m_link.group(2))
+        return result
+
+    @staticmethod
+    def _parse_errors(text: str) -> dict[str, dict[str, dict[str, int]]]:
+        """Parse ``nvlink -e`` output into ``{gpu: {link: {counter: count}}}``.
+
+        Counters are keyed ``crc``, ``replay`` and ``recovery``. Lines that fall
+        into the same category for one link are summed, so a non-zero count is
+        never hidden by a later zero line.
+        """
+        result: dict[str, dict[str, dict[str, int]]] = {}
+        gpu = None
+        link = None
+        for line in text.splitlines():
+            m_gpu = re.search(r"GPU\s+(\d+)", line, re.I)
+            if m_gpu:
+                gpu = m_gpu.group(1)
+                result.setdefault(gpu, {})
+                link = None  # a link id from the previous GPU must not leak into this one
+                continue
+            m_link = re.search(r"Link\s+(\d+)", line, re.I)
+            if m_link and gpu is not None:
+                link = m_link.group(1)
+                result[gpu].setdefault(link, {})
+            if gpu is None or link is None:
+                continue
+            counters = result[gpu][link]
+            for name in ("CRC", "Replay", "Recovery"):
+                m = re.search(rf"{name}[^0-9]*(\d+)", line, re.I)
+                if m:
+                    key = name.lower()
+                    counters[key] = counters.get(key, 0) + int(m.group(1))
+        return result
+
+    @staticmethod
+    def print_results(results: dict, console: Optional[Console] = None):
+        """Print the NVLink verdict (or error) and a per-GPU link summary table."""
+        c = console or Console()
+        if results.get("error"):
+            c.print(f"[bold red]NVLink error: {results['error']}[/bold red]")
+            return
+        passed = results.get("passed", False)
+        c.print("[bold green]✓ NVLink PASSED[/bold green]" if passed else "[bold red]✗ NVLink FAILED[/bold red]")
+        table = Table(box=None, padding=(0, 1))
+        table.add_column("GPU", style="bold")
+        table.add_column("Active Links", justify="right")
+        table.add_column("Issues")
+        for g in results.get("gpus", []):
+            issues = []
+            if g.get("inactive_links"):
+                issues.append("inactive=" + ",".join(g["inactive_links"]))
+            if g.get("speed_issues"):
+                issues.append(f"speed={len(g['speed_issues'])}")
+            if g.get("error_issues"):
+                issues.append(f"errors={len(g['error_issues'])}")
+            table.add_row(str(g["gpu"]), f"{g['active_links']}/{g['expected_links']}", "; ".join(issues) or "OK")
+        c.print(table)
diff --git a/modules/report.py b/modules/report.py
index d9e1eba..2f6f1ec 100644
--- a/modules/report.py
+++ b/modules/report.py
@@ -93,8 +93,8 @@ class ReportGenerator:
 
     def _generate_html(self, results: dict, output: str) -> str:
         import socket
-        hostname = socket.gethostname()
-        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+        hostname = results.get("hostname") or socket.gethostname()
+        timestamp = results.get("timestamp") or datetime.now().strftime("%Y-%m-%d %H:%M:%S")
 
         sections = []
 
@@ -178,8 +178,8 @@ class ReportGenerator:
 
     def _generate_markdown(self, results: dict, output: str) -> str:
         import socket
-        hostname = socket.gethostname()
-        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+        hostname = results.get("hostname") or socket.gethostname()
+        timestamp = results.get("timestamp") or datetime.now().strftime("%Y-%m-%d %H:%M:%S")
 
         lines: list[str] = []
 
@@ -201,6 +201,21 @@ class ReportGenerator:
         # --- Summary table ---
         summary_items = self._build_summary(results)
         if summary_items:
+            verdict, failures, missing = self._overall_acceptance_verdict(summary_items)
+            lines.append("## Overall Acceptance Verdict\n")
+            lines.append(f"**Result: {verdict}**")
+            lines.append("")
+            if failures:
+                lines.append("Failed or unverified items:")
+                for name, status in failures:
+                    lines.append(f"- {name}: {status}")
+                lines.append("")
+            if missing:
+                lines.append("Missing required evidence:")
+                for name in missing:
+                    lines.append(f"- {name}")
+                lines.append("")
+
             lines.append("## Summary\n")
             lines.append("| Test | Result |")
             lines.append("|------|--------|")
@@ -319,8 +334,6 @@ class ReportGenerator:
                     if use_abs and thr:
                         if val >= thr:
                             status = "PASS"
-                        elif val >= thr * 0.9:
-                            status = "WARN"
                         else:
                             status = "FAIL"
                         lines.append(f"| {dt.upper()} | {val:.1f} | {pk:.0f} | >= {thr} | {status} |")
@@ -331,30 +344,123 @@ class ReportGenerator:
                         overall_status = status
             lines.append("")
             if use_abs:
+                if any(not row.get("passed", False) for row in (comp_data.get("consistency", {}) or {}).values()):
+                    overall_status = "FAIL"
                 lines.append(f"**Verdict: {overall_status}** (absolute TFLOPS thresholds; worst efficiency {worst_eff:.1f}%)\n")
             else:
                 overall_status = "PASS" if worst_eff >= 80 else ("WARN" if worst_eff >= 50 else "FAIL")
                 lines.append(f"**Verdict: {overall_status}** (worst efficiency {worst_eff:.1f}%)\n")
 
+            consistency = comp_data.get("consistency", {}) or {}
+            if consistency:
+                lines.append("### Compute Consistency\n")
+                lines.append("| DType | Min | Mean | Max | Spread | Limit | Status |")
+                lines.append("|-------|-----|------|-----|--------|-------|--------|")
+                for dt, row in consistency.items():
+                    status = "PASS" if row.get("passed") else "FAIL"
+                    lines.append(
+                        f"| {dt.upper()} | {row.get('min_tflops', 0):.1f} | "
+                        f"{row.get('mean_tflops', 0):.1f} | {row.get('max_tflops', 0):.1f} | "
+                        f"{row.get('spread_pct', 0):.2f}% | <= {row.get('max_allowed_pct', 3)}% | {status} |"
+                    )
+                lines.append("")
+
+            per_gpu = comp_data.get("per_gpu", []) or []
+            dtype_order = [dt for dt in per_dtype.keys() if not isinstance(per_dtype.get(dt), str)]
+            if per_gpu and dtype_order:
+                lines.append("### Compute Per-GPU TFLOPS\n")
+                headers = ["GPU", *[dt.upper() for dt in dtype_order]]
+                lines.append("| " + " | ".join(headers) + " |")
+                lines.append("|" + "|".join(["---"] * len(headers)) + "|")
+                for row in per_gpu:
+                    cells = [str(row.get("index", ""))]
+                    for dt in dtype_order:
+                        val = row.get(dt, "")
+                        cells.append(f"{val:.1f}" if isinstance(val, (int, float)) else str(val))
+                    lines.append("| " + " | ".join(cells) + " |")
+                lines.append("")
+
+        # --- NVLink / NVSwitch ---
+        nvlink = results.get("nvlink")
+        if nvlink and not nvlink.get("error"):
+            lines.append("## NVLink/NVSwitch\n")
+            lines.append(f"**Overall: {'PASS' if nvlink.get('passed') else 'FAIL'}**\n")
+            lines.append("| GPU | Active Links | Issues |")
+            lines.append("|-----|--------------|--------|")
+            for g in nvlink.get("gpus", []):
+                issues = []
+                if g.get("inactive_links"):
+                    issues.append("inactive=" + ",".join(g["inactive_links"]))
+                if g.get("speed_issues"):
+                    issues.append(f"speed issues={len(g['speed_issues'])}")
+                if g.get("error_issues"):
+                    issues.append(f"errors={len(g['error_issues'])}")
+                lines.append(f"| {g.get('gpu')} | {g.get('active_links')}/{g.get('expected_links')} | {', '.join(issues) or 'OK'} |")
+            lines.append("")
+        elif nvlink and nvlink.get("error"):
+            lines.append("## NVLink/NVSwitch\n")
+            lines.append(f"**Overall: FAIL** ({nvlink.get('error')})\n")
+
+        dcgm = results.get("dcgm")
+        if dcgm and not dcgm.get("error"):
+            lines.append("## DCGM Diagnostic\n")
+            lines.append(f"**Overall: {'PASS' if dcgm.get('passed') else 'FAIL'}**\n")
+            if dcgm.get("subtests"):
+                lines.append("| Subtest | Status |")
+                lines.append("|---------|--------|")
+                for s in dcgm.get("subtests", []):
+                    lines.append(f"| {s.get('name', '')} | {s.get('status', '')} |")
+                lines.append("")
+        elif dcgm and dcgm.get("error"):
+            lines.append("## DCGM Diagnostic\n")
+            lines.append(f"**Overall: FAIL** ({dcgm.get('error')})\n")
+
         # --- NCCL ---
         nccl = results.get("nccl")
         if nccl and not nccl.get("error"):
             lines.append("## NCCL Multi-GPU\n")
             lines.append(f"Source: {nccl.get('source', 'unknown')} | "
                          f"GPUs: {nccl.get('gpu_count', '?')}\n")
+            if nccl.get("source") == "torchrun_fallback":
+                lines.append("> Functional NCCL smoke only: nccl-tests bus bandwidth was not measured, so this does not satisfy production acceptance.\n")
             tests = nccl.get("tests", {})
             if tests:
-                lines.append("| Operation | Bus BW (GB/s) | Threshold | Status |")
-                lines.append("|-----------|---------------|-----------|--------|")
+                lines.append("> Summary reports the best Bus BW observed for each operation. PASS/FAIL is evaluated across every tested message size and repeat run shown in the detail table below.\n")
+                lines.append("| Operation | Best Bus BW (GB/s) | Failed Sizes | Threshold | Status |")
+                lines.append("|-----------|--------------------|--------------|-----------|--------|")
                 for op, data in tests.items():
                     if isinstance(data, dict) and not data.get("error"):
                         bw = data.get("best_busbw_gbps", 0)
                         req = data.get("min_required_gbps", 0)
                         status = data.get("status", "?")
-                        lines.append(f"| {op} | {bw:.1f} | >= {req:.0f} | {status} |")
+                        failed_sizes = [
+                            str(row.get("size", "?"))
+                            for row in data.get("by_size", [])
+                            if row.get("status") != "PASS"
+                        ]
+                        failed_sizes_text = ", ".join(failed_sizes) if failed_sizes else "-"
+                        lines.append(f"| {op} | {bw:.1f} | {failed_sizes_text} | >= {req:.0f} | {status} |")
                     elif isinstance(data, dict) and data.get("error"):
-                        lines.append(f"| {op} | - | - | ERROR: {data['error']} |")
+                        lines.append(f"| {op} | - | - | - | ERROR: {data['error']} |")
                 lines.append("")
+                for op, data in tests.items():
+                    by_size = data.get("by_size", []) if isinstance(data, dict) else []
+                    if not by_size:
+                        continue
+                    lines.append(f"### NCCL {op} by size\n")
+                    lines.append("| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status |")
+                    lines.append("|------|---------------------|-------|------|--------|-----------|--------|")
+                    for row in by_size:
+                        runs = ", ".join(str(v) for v in row.get("runs_busbw_gbps", []))
+                        lines.append(
+                            f"| {row.get('size', '')} | {runs} | "
+                            f"{row.get('worst_busbw_gbps', 0):.1f} | "
+                            f"{row.get('mean_busbw_gbps', 0):.1f} | "
+                            f"{row.get('stddev_pct', 0):.2f}% | "
+                            f">= {data.get('min_required_gbps', 0):.0f} | "
+                            f"{row.get('status', '?')} |"
+                        )
+                    lines.append("")
             passed = nccl.get("passed", False)
             lines.append(f"**Overall: {'PASS' if passed else 'FAIL'}**\n")
 
@@ -368,6 +474,21 @@ class ReportGenerator:
             source = stress.get("source", "unknown")
             lines.append(f"- **Source:** {source}")
             lines.append(f"- **Duration:** {elapsed:.0f}s (requested {duration}s)")
+            telemetry = stress.get("telemetry") or {}
+            if telemetry:
+                lines.append(f"- **Telemetry samples:** {telemetry.get('samples', 0)}")
+                lines.append(f"- **Max temp:** {telemetry.get('max_temp_c', {})}")
+                lines.append(f"- **Avg power:** {telemetry.get('avg_power_w', {})}")
+                lines.append(f"- **Temp delta:** {telemetry.get('temp_delta_c', 'N/A')} C")
+                lines.append(f"- **TFLOPS jitter:** {telemetry.get('tflops_jitter_pct', 'N/A')}%")
+                lines.append(f"- **Steady TFLOPS samples:** {telemetry.get('steady_tflops_samples', 0)}")
+                lines.append(f"- **Throttle events:** {telemetry.get('throttle_event_count', len(telemetry.get('throttle_events', [])))}")
+                lines.append(f"- **XID events:** {len(telemetry.get('xid_events', []))}")
+                failures = telemetry.get("failures") or []
+                if failures:
+                    lines.append("- **Failure reasons:**")
+                    for reason in failures:
+                        lines.append(f"  - {reason}")
             lines.append(f"- **Result: {'PASS' if passed else 'FAIL'}**")
             lines.append("")
 
@@ -378,26 +499,70 @@ class ReportGenerator:
             lines.append(f"**Overall: SKIP** [{rdma.get('reason', 'no IB hardware detected')}]\n")
         elif rdma and not rdma.get("error"):
             lines.append("## RDMA/InfiniBand\n")
+            rdma_legacy_note = self._rdma_legacy_note(rdma)
+            if rdma_legacy_note:
+                lines.append(f"> {rdma_legacy_note}\n")
+            port_checks = rdma.get("port_checks", [])
+            if port_checks:
+                lines.append("### RDMA Port Checks\n")
+                lines.append("| Device | Port | State | Rate | Required | Status |")
+                lines.append("|--------|------|-------|------|----------|--------|")
+                for p in port_checks:
+                    lines.append(
+                        f"| {p.get('device', '')} | {p.get('port', '')} | "
+                        f"{p.get('state', '')} | {p.get('rate', '')} | "
+                        f">= {p.get('min_rate_gbps', 400):.0f}Gbps ACTIVE | {p.get('status', '?')} |"
+                    )
+                lines.append("")
             bw_tests = rdma.get("bandwidth_tests", [])
             lat_tests = rdma.get("latency_tests", [])
-            if bw_tests or lat_tests:
+            ibping_tests = rdma.get("ibping_tests", [])
+            if bw_tests or lat_tests or ibping_tests:
                 lines.append("| Test | Value | Threshold | Status |")
                 lines.append("|------|-------|-----------|--------|")
                 for bt in bw_tests:
-                    if not bt.get("error"):
+                    if bt.get("error"):
+                        lines.append(f"| {bt.get('test', 'ib_bw')} | {bt.get('error')} | required runnable test | {bt.get('status', 'FAIL')} |")
+                    else:
+                        threshold, status = self._rdma_bandwidth_verdict(bt)
                         lines.append(f"| {bt['test']} | {bt.get('bandwidth_gbps', 0):.1f} GB/s | "
-                                     f">= {bt.get('min_required_gbps', 0)} GB/s | {bt.get('status', '?')} |")
+                                     f">= {threshold:g} GB/s | {status} |")
                 for lt in lat_tests:
-                    if not lt.get("error"):
+                    if lt.get("error"):
+                        lines.append(f"| {lt.get('test', 'ib_lat')} | {lt.get('error')} | required runnable test | {lt.get('status', 'FAIL')} |")
+                    else:
+                        threshold, status = self._rdma_latency_verdict(lt)
                         lines.append(f"| {lt['test']} | {lt.get('latency_us', 0):.2f} us | "
-                                     f"<= {lt.get('max_allowed_us', 0)} us | {lt.get('status', '?')} |")
+                                     f"<= {threshold:g} us | {status} |")
+                for it in ibping_tests:
+                    direction = it.get("direction") or it.get("role", "N/A")
+                    if it.get("error"):
+                        lines.append(f"| {it.get('test', 'ibping')} | {it.get('error')} | bidirectional peer evidence | {it.get('status', 'FAIL')} |")
+                    else:
+                        lines.append(f"| {it['test']} | {direction} target={it.get('target', 'N/A')} count={it.get('count', 'N/A')} | "
+                                     f"0% packet loss | {it.get('status', '?')} |")
                 lines.append("")
+            fabric = rdma.get("fabric_counters") or {}
+            if fabric:
+                counters = fabric.get("counters", {})
+                lines.append(f"- **PFC/ECN/CNP/congestion counters checked:** {len(counters)}")
+                lines.append(f"- **PFC/ECN/CNP/congestion non-zero:** {'yes' if fabric.get('failed') else 'no'}")
+                if not counters:
+                    lines.append("- **PFC/ECN/CNP/congestion evidence:** missing")
+            failures = rdma.get("failures") or []
+            if not failures:
+                failures = self._rdma_failure_reasons(rdma)
+            if failures:
+                lines.append("- **Failure reasons:**")
+                for reason in failures:
+                    lines.append(f"  - {reason}")
             passed = rdma.get("passed", False)
             lines.append(f"**Overall: {'PASS' if passed else 'FAIL'}**\n")
 
         # --- Training ---
         training = results.get("training")
         if training and not training.get("error"):
+            training_status, training_detail, training_missing = self._training_verdict(training)
             lines.append("## Training Simulation\n")
             lines.append("| Metric | Value |")
             lines.append("|--------|-------|")
@@ -405,8 +570,14 @@ class ReportGenerator:
             lines.append(f"| Params | {training.get('total_params_m', 0):.1f}M |")
             lines.append(f"| Throughput | {training.get('throughput_tokens_per_sec', 0):.0f} tokens/sec |")
             lines.append(f"| Avg Step Time | {training.get('avg_step_time_ms', 0):.1f} ms |")
+            lines.append(f"| Warmup Steps | {training.get('warmup_steps', 'N/A')} |")
             lines.append(f"| Peak Memory | {training.get('peak_memory_gb', 0):.1f} GB |")
             lines.append(f"| Final Loss | {training.get('final_loss', 'N/A')} |")
+            lines.append(f"| Step Jitter | {training.get('step_jitter_pct', 'N/A')}% |")
+            lines.append(f"| Distributed Mode | {training.get('distributed_mode', 'N/A')} |")
+            if training_missing:
+                lines.append(f"| Acceptance Gaps | missing {', '.join(training_missing)} |")
+            lines.append(f"| Verdict | {training_status} ({training_detail}) |")
             lines.append("")
 
         # --- Footer ---
@@ -441,6 +612,101 @@ class ReportGenerator:
                 return bench["compute"]
         return {}
 
+    @staticmethod
+    def _training_verdict(training: dict) -> tuple[str, str, list[str]]:
+        """Classify a training result as PASS/FAIL, or UNVERIFIED when it has no "passed" key."""
+        throughput = f"{float(training.get('throughput_tokens_per_sec', 0) or 0):.0f} tokens/sec"
+        if "passed" not in training:
+            # Legacy schema: list which acceptance fields are absent so the report can show the gap.
+            expected = ("passed", "step_jitter_pct", "distributed_mode", "loss_finite")
+            absent = [field for field in expected if field not in training]
+            return "UNVERIFIED", f"{throughput}; legacy result lacks explicit acceptance verdict", absent
+
+        return ("PASS" if training.get("passed") else "FAIL"), throughput, []
+
+    def _rdma_cfg_value(self, key: str, default: float) -> float:
+        try:  # float() raises on None or non-numeric text, so unusable settings fall back to `default`
+            return float((self.config.get("rdma", {}) or {}).get(key, default))
+        except (TypeError, ValueError):
+            return default
+
+    def _rdma_bandwidth_verdict(self, row: dict) -> tuple[float, str]:
+        """Return (configured minimum bandwidth, "PASS"/"FAIL") for one bandwidth row."""
+        floor = self._rdma_cfg_value("min_bandwidth_gbps", 47.0)
+        return floor, ("PASS" if float(row.get("bandwidth_gbps", 0) or 0) >= floor else "FAIL")
+
+    def _rdma_latency_verdict(self, row: dict) -> tuple[float, str]:
+        """Return (limit_us, "PASS"/"FAIL") for one latency row; the limit depends on row["test"]."""
+        per_test_limits = {
+            "ib_write_lat": ("max_write_latency_us", 2.0),
+            "ib_read_lat": ("max_read_latency_us", 3.5),
+        }
+        cfg_key, fallback = per_test_limits.get(row.get("test", ""), ("max_latency_us", 3.5))
+        limit_us = self._rdma_cfg_value(cfg_key, fallback)
+        measured_us = float(row.get("latency_us", 0) or 0)
+        return limit_us, ("PASS" if 0 < measured_us <= limit_us else "FAIL")
+
+    def _rdma_legacy_note(self, rdma: dict) -> str:
+        """Return a notice when a row's embedded threshold differs from the current one.
+
+        Bandwidth rows are compared on "min_required_gbps", latency rows on
+        "max_allowed_us"; an empty string means no row differs.
+        """
+        notice = (
+            "Legacy RDMA result re-evaluated with current PDF acceptance thresholds; "
+            "old WARN statuses and old 50GB/s/10us limits are not used for verdict."
+        )
+        bw_rows = rdma.get("bandwidth_tests", []) or []
+        if any(r.get("min_required_gbps") != self._rdma_cfg_value("min_bandwidth_gbps", 47.0) for r in bw_rows):
+            return notice
+        lat_rows = rdma.get("latency_tests", []) or []
+        stale = any(r.get("max_allowed_us") != self._rdma_latency_verdict(r)[0] for r in lat_rows)
+        return notice if stale else ""
+
+    def _rdma_failure_reasons(self, rdma: dict) -> list[str]:
+        """Build failure reasons from the bandwidth, latency and ibping rows of an RDMA result."""
+        failures = []
+        for row in rdma.get("bandwidth_tests", []) or []:
+            threshold, status = self._rdma_bandwidth_verdict(row)
+            if status != "PASS":
+                failures.append(f"{row.get('test')} bandwidth {row.get('bandwidth_gbps', 0)}GB/s < {threshold:g}GB/s")
+        for row in rdma.get("latency_tests", []) or []:
+            threshold, status = self._rdma_latency_verdict(row)
+            if status != "PASS":
+                # The verdict also fails non-positive latencies, where "N us > limit" would be untrue.
+                measured = float(row.get("latency_us", 0) or 0) > 0
+                why = f"{row.get('latency_us', 0)}us > {threshold:g}us" if measured else "missing or non-positive"
+                failures.append(f"{row.get('test')} latency {why}")
+        for row in rdma.get("ibping_tests", []) or []:
+            if row.get("status") != "PASS":
+                failures.append(f"{row.get('test')} failed")
+        return failures
+
+    @staticmethod
+    def _overall_acceptance_verdict(summary_items: list[tuple[str, str]]) -> tuple[str, list[tuple[str, str]], list[str]]:
+        """Compute the machine verdict: PASS only if every required item is present and PASS.
+
+        Args:
+            summary_items: (item name, status text) pairs as built for the summary.
+
+        Returns:
+            (verdict, failures, missing): failures are the required (name, status)
+            pairs whose status does not start with "PASS"; missing are the required
+            names that have no entry at all.
+        """
+        required = (
+            "GPU Info", "Health Check", "Memory Bandwidth", "Compute Throughput",
+            "NVLink/NVSwitch", "NCCL", "Stress Test", "RDMA", "DCGM", "Training",
+        )
+        present = {name for name, _status in summary_items}
+        missing = [name for name in required if name not in present]
+        failures = []
+        for name, status in summary_items:
+            if name in required and not str(status).startswith("PASS"):
+                failures.append((name, status))
+        verdict = "FAIL" if (missing or failures) else "PASS"
+        return verdict, failures, missing
+
     def _build_summary(self, results: dict) -> list[tuple[str, str]]:
         """Build summary verdict list from results."""
         items = []
@@ -473,7 +739,7 @@ class ReportGenerator:
                 d2d = mem.get("d2d_bandwidth_gbps") or 0
                 items.append(("Memory Bandwidth", f"WARN ({d2d:.0f} GB/s via PyTorch fallback)"))
             else:
-                eff = mem.get("efficiency_pct") or 0
+                eff = mem.get("d2d_efficiency_pct") or mem.get("efficiency_pct") or 0
                 verdict = "PASS" if eff >= 80 else ("WARN" if eff >= 60 else "FAIL")
                 items.append(("Memory Bandwidth", f"{verdict} ({eff:.1f}%)"))
 
@@ -491,25 +757,43 @@ class ReportGenerator:
                     rank = {"PASS": 0, "WARN": 1, "FAIL": 2}
                     worst_status = "PASS"
                     worst_dt = None
+                    lowest_margin = None
                     for dt, thr in pass_thresholds.items():
                         val = per_dtype.get(dt)
                         if not isinstance(val, (int, float)):
                             continue
                         if val >= thr:
                             st = "PASS"
-                        elif val >= thr * 0.9:
-                            st = "WARN"
                         else:
                             st = "FAIL"
+                        margin = val / thr if thr else 0
+                        if lowest_margin is None or margin < lowest_margin:
+                            lowest_margin = margin
+                            worst_dt = dt
                         if rank[st] > rank[worst_status]:
                             worst_status = st
-                            worst_dt = dt
                     if worst_dt:
-                        items.append((
-                            "Compute Throughput",
-                            f"{worst_status} (worst {worst_dt.upper()} "
-                            f"{per_dtype[worst_dt]:.0f} vs >= {pass_thresholds[worst_dt]})"
-                        ))
+                        consistency = comp.get("consistency", {}) or {}
+                        failed_consistency = [
+                            (dt, row)
+                            for dt, row in consistency.items()
+                            if not row.get("passed", False)
+                        ]
+                        if failed_consistency:
+                            worst_status = "FAIL"
+                            fail_dt, fail_row = failed_consistency[0]
+                            items.append((
+                                "Compute Throughput",
+                                f"FAIL ({fail_dt.upper()} spread "
+                                f"{fail_row.get('spread_pct', 0):.2f}% > "
+                                f"{fail_row.get('max_allowed_pct', 3)}%)"
+                            ))
+                        else:
+                            items.append((
+                                "Compute Throughput",
+                                f"{worst_status} (worst {worst_dt.upper()} "
+                                f"{per_dtype[worst_dt]:.0f} vs >= {pass_thresholds[worst_dt]})"
+                            ))
                     else:
                         items.append(("Compute Throughput", f"{worst_status}"))
                 else:
@@ -521,11 +805,32 @@ class ReportGenerator:
                     else:
                         items.append(("Compute Throughput", "N/A"))
 
+        # NVLink/NVSwitch
+        if "nvlink" in results:
+            nvl = results["nvlink"]
+            if nvl.get("error"):
+                items.append(("NVLink/NVSwitch", f"ERROR: {nvl['error']}"))
+            elif nvl.get("passed"):
+                items.append(("NVLink/NVSwitch", "PASS"))
+            else:
+                items.append(("NVLink/NVSwitch", "FAIL"))
+
+        if "dcgm" in results:
+            d = results["dcgm"]
+            if d.get("error"):
+                items.append(("DCGM", f"ERROR: {d['error']}"))
+            elif d.get("passed"):
+                items.append(("DCGM", "PASS"))
+            else:
+                items.append(("DCGM", "FAIL"))
+
         # NCCL
         if "nccl" in results:
             n = results["nccl"]
             if n.get("error"):
                 items.append(("NCCL", f"ERROR: {n['error']}"))
+            elif n.get("source") == "torchrun_fallback":
+                items.append(("NCCL", "FAIL (no nccl-tests bus BW)"))
             elif n.get("passed"):
                 items.append(("NCCL", "PASS"))
             else:
@@ -559,7 +864,7 @@ class ReportGenerator:
             if t.get("error"):
                 items.append(("Training", f"ERROR: {t['error']}"))
             else:
-                tps = t.get("throughput_tokens_per_sec", 0)
-                items.append(("Training", f"PASS ({tps:.0f} tokens/sec)"))
+                status, detail, _missing = self._training_verdict(t)
+                items.append(("Training", f"{status} ({detail})"))
 
         return items
diff --git a/modules/stress_test.py b/modules/stress_test.py
index 8b69d1c..460b3b1 100644
--- a/modules/stress_test.py
+++ b/modules/stress_test.py
@@ -1,9 +1,10 @@
-"""GPU stress test module — wraps gpu-burn for long-running stability tests."""
+"""GPU stress test module — gpu-burn or PyTorch GEMM with telemetry."""
 
 import glob
 import os
 import shutil
 import subprocess
+import threading
 import time
 from datetime import datetime
 
@@ -46,7 +47,7 @@ class StressTest:
         memory_pct = cfg.get("memory_pct", 90)
         target_gpus = cfg.get("gpus", "all")
 
-        gpu_burn = self._find_gpu_burn()
+        gpu_burn = self._find_gpu_burn() if cfg.get("use_gpu_burn", False) else ""
 
         if gpu_burn:
             # Try gpu-burn first
@@ -60,7 +61,7 @@ class StressTest:
             
             return result
 
-        self.console.print("[yellow]gpu_burn not found, using PyTorch stress test[/yellow]")
+        self.console.print("[yellow]Using PyTorch stress test[/yellow]")
         return self._run_pytorch_stress(duration_sec, memory_pct)
 
     def _run_gpu_burn(self, gpu_burn: str, duration: int,
@@ -77,12 +78,26 @@ class StressTest:
         cmd.append(str(duration))
 
         t0 = time.time()
+        xid_before = self._collect_xid_events()
+        interval = int(self.stress_cfg.get("telemetry_interval_sec", 1))
+        telemetry = []
+        stop_sampling = threading.Event()
+        sampler = threading.Thread(
+            target=self._sample_telemetry,
+            args=(telemetry, stop_sampling, interval),
+            daemon=True,
+        )
+        sampler.start()
         try:
             r = subprocess.run(cmd, capture_output=True, text=True, timeout=duration + 120)
             elapsed = round(time.time() - t0, 1)
+            stop_sampling.set()
+            sampler.join(timeout=interval + 1)
 
             output = r.stdout + r.stderr
-            passed = r.returncode == 0
+            xid_events = self._new_xid_events(xid_before, self._collect_xid_events())
+            telemetry_summary = self._evaluate_telemetry(telemetry, [], xid_events)
+            passed = r.returncode == 0 and telemetry_summary.get("passed", False)
 
             gpu_results = []
             for line in output.split("\n"):
@@ -96,25 +111,36 @@ class StressTest:
                 "duration_sec": duration,
                 "elapsed_sec": elapsed,
                 "gpu_results": gpu_results,
+                "telemetry": telemetry_summary,
                 "raw_output_tail": output[-500:] if output else "",
                 "timestamp": datetime.now().isoformat(),
             }
 
         except subprocess.TimeoutExpired:
+            stop_sampling.set()
             return {
                 "source": "gpu-burn",
                 "passed": False,
                 "duration_sec": duration,
                 "error": "timeout",
+                "telemetry": self._evaluate_telemetry(
+                    telemetry, [], self._new_xid_events(xid_before, self._collect_xid_events())
+                ),
                 "timestamp": datetime.now().isoformat(),
             }
         except Exception as e:
+            stop_sampling.set()
             return {
                 "source": "gpu-burn",
                 "passed": False,
                 "error": str(e),
+                "telemetry": self._evaluate_telemetry(
+                    telemetry, [], self._new_xid_events(xid_before, self._collect_xid_events())
+                ),
                 "timestamp": datetime.now().isoformat(),
             }
+        finally:
+            stop_sampling.set()
 
     def _run_pytorch_stress(self, duration: int, memory_pct: int = 90) -> dict:
         try:
@@ -127,58 +153,79 @@ class StressTest:
         gpu_count = torch.cuda.device_count()
         self.console.print(f"[cyan]PyTorch Stress Test ({duration}s, {gpu_count} GPUs, target {memory_pct}% memory)[/cyan]")
 
+        dtype_name = self.stress_cfg.get("dtype", "bf16")
+        matrix_size = int(self.stress_cfg.get("matrix_size", 8192))
+        interval = int(self.stress_cfg.get("telemetry_interval_sec", 1))
+        dtype_map = {"fp16": torch.float16, "bf16": torch.bfloat16, "fp32": torch.float32}
+        dtype = dtype_map.get(dtype_name, torch.bfloat16)
+
         gpu_status = {}
+        telemetry = []
+        stop_sampling = threading.Event()
         t0 = time.time()
+        xid_before = self._collect_xid_events()
 
         try:
+            sampler = threading.Thread(
+                target=self._sample_telemetry,
+                args=(telemetry, stop_sampling, interval),
+                daemon=True,
+            )
+            sampler.start()
             tensors = {}
+            ballast = {}
+            pass_tflops = []
             for i in range(gpu_count):
                 with torch.cuda.device(i):
-                    # Get actual free memory (accounting for other processes)
                     free_mem, total_mem = torch.cuda.mem_get_info(i)
-                    
-                    # Calculate allocation from configured memory_pct
-                    target_mem = int(total_mem * memory_pct / 100)
-                    
-                    # Cap at actual free memory with 5% safety margin
-                    alloc_bytes = min(target_mem, int(free_mem * 0.95))
-                    
-                    # matmul(A, A.T) needs 2x input memory (input + output)
-                    mem_side = int((alloc_bytes / 4 / 2) ** 0.5)
-                    # Cap compute matrix so a single matmul completes in ~2s on H100/H200
-                    # (FP32 ≈ 67 TFLOPS → 2*4096³/67e12 ≈ 2s). Without this cap, a 141GB
-                    # HBM yields side ≈ 131K → single matmul ~68s × 8 GPUs serial → loop
-                    # overshoots a 60s duration request by 10×+.
-                    MAX_COMPUTE_SIDE = 4096
-                    side = min(mem_side, MAX_COMPUTE_SIDE)
-
-                    actual_mem_mb = side * side * 4 / 1024 / 1024
+                    side = matrix_size
+                    elem = torch.tensor([], dtype=dtype).element_size()
+                    compute_bytes = side * side * elem * 3
+                    target_mem = min(int(total_mem * memory_pct / 100), int(free_mem * 0.90))
+                    ballast_bytes = max(0, target_mem - compute_bytes)
+                    if ballast_bytes:
+                        ballast_elems = ballast_bytes // 2
+                        ballast[i] = torch.empty(ballast_elems, device=f"cuda:{i}", dtype=torch.float16)
+                    actual_mem_mb = (compute_bytes + ballast_bytes) / 1024 / 1024
                     total_mem_mb = total_mem / 1024 / 1024
                     free_mem_mb = free_mem / 1024 / 1024
-                    
+
                     self.console.print(
                         f"  [dim]GPU {i}: total {total_mem_mb:.0f}MB, free {free_mem_mb:.0f}MB, "
                         f"alloc {actual_mem_mb:.0f}MB ({actual_mem_mb/total_mem_mb*100:.0f}%) - "
-                        f"matrix {side}x{side}[/dim]"
+                        f"{dtype_name} matrix {side}x{side}[/dim]"
+                    )
+                    tensors[i] = (
+                        torch.randn(side, side, device=f"cuda:{i}", dtype=dtype),
+                        torch.randn(side, side, device=f"cuda:{i}", dtype=dtype),
+                        torch.empty(side, side, device=f"cuda:{i}", dtype=dtype),
                     )
-                    tensors[i] = torch.randn(side, side, device=f"cuda:{i}", dtype=torch.float32)
 
             self.console.print(f"\n[cyan]Starting stress test for {duration} seconds...[/cyan]")
             
             elapsed_check = 0
             while time.time() - t0 < duration:
+                loop_start = time.perf_counter()
                 # Dispatch matmul on all GPUs in parallel — do NOT synchronize between
                 # GPUs, otherwise the 8 GPUs run serially and overshoot the duration.
                 for i in range(gpu_count):
                     with torch.cuda.device(i):
-                        tensors[i] = torch.matmul(tensors[i], tensors[i].T)
+                        a, b, out = tensors[i]
+                        torch.matmul(a, b, out=out)
                 # Single sync per pass — waits for all 8 streams concurrently
                 for i in range(gpu_count):
                     with torch.cuda.device(i):
                         torch.cuda.synchronize()
+                loop_elapsed = time.perf_counter() - loop_start
+                current_elapsed = time.time() - t0
+                if loop_elapsed > 0:
+                    flops = gpu_count * 2 * (matrix_size ** 3)
+                    pass_tflops.append({
+                        "elapsed_sec": current_elapsed,
+                        "tflops": flops / loop_elapsed / 1e12,
+                    })
 
                 # Show progress every 10 seconds
-                current_elapsed = time.time() - t0
                 if int(current_elapsed) != int(elapsed_check) and int(current_elapsed) % 10 == 0:
                     self.console.print(f"  [dim]Running {int(current_elapsed)}s / {duration}s[/dim]")
                     elapsed_check = current_elapsed
@@ -198,21 +245,196 @@ class StressTest:
                 "duration_sec": duration,
                 "error": error_msg,
                 "gpu_status": gpu_status,
+                "telemetry": self._evaluate_telemetry(
+                    telemetry, pass_tflops if "pass_tflops" in locals() else [],
+                    self._new_xid_events(xid_before, self._collect_xid_events()),
+                ),
             }
         finally:
+            stop_sampling.set()
             tensors.clear()
+            ballast.clear()
             torch.cuda.empty_cache()
 
         elapsed = round(time.time() - t0, 1)
+        xid_events = self._new_xid_events(xid_before, self._collect_xid_events())
+        telemetry_summary = self._evaluate_telemetry(telemetry, pass_tflops, xid_events)
+        passed = all(v == "PASS" for v in gpu_status.values()) and telemetry_summary.get("passed", False)
         return {
             "source": "pytorch",
-            "passed": True,
+            "passed": passed,
             "duration_sec": duration,
             "elapsed_sec": elapsed,
             "gpu_status": gpu_status,
+            "telemetry": telemetry_summary,
             "timestamp": datetime.now().isoformat(),
         }
 
+    def _sample_telemetry(self, telemetry: list, stop_event: threading.Event, interval: int):
+        """Poll nvidia-smi until `stop_event` is set, waiting `interval` seconds between polls.
+
+        Every poll that exits 0 and parses cleanly appends one
+        {"time": <time.time()>, "gpus": [{"index", "temp_c", "power_w", "throttle"}, ...]}
+        entry to `telemetry`; a poll that raises or exits non-zero appends nothing.
+        """
+        fields = "index,temperature.gpu,power.draw,clocks_throttle_reasons.active"
+        cmd = ["nvidia-smi", f"--query-gpu={fields}", "--format=csv,noheader,nounits"]
+        while not stop_event.is_set():
+            try:
+                proc = subprocess.run(cmd, capture_output=True, text=True, timeout=10)
+                if proc.returncode == 0:
+                    stamp = time.time()
+                    rows = ([col.strip() for col in raw.split(",")] for raw in proc.stdout.splitlines())
+                    gpus = [
+                        {"index": int(c[0]), "temp_c": float(c[1]), "power_w": float(c[2]), "throttle": c[3]}
+                        for c in rows if len(c) >= 4
+                    ]
+                    telemetry.append({"time": stamp, "gpus": gpus})
+            except Exception:
+                pass
+            stop_event.wait(interval)
+
+    def _collect_xid_events(self) -> list[str]:
+        """Return the stripped dmesg lines containing "XID" in any letter case.
+
+        An empty list is returned when dmesg exits non-zero or cannot be run at all.
+        """
+        # NOTE(review): a bare "XID" substring test also matches unrelated kernel lines that merely
+        # contain "xid"; presumably only NVIDIA "NVRM: Xid" events are wanted — confirm intent.
+        try:
+            proc = subprocess.run(["dmesg", "--color=never"], capture_output=True, text=True, timeout=10)
+            if proc.returncode != 0:
+                return []
+            matches = (entry for entry in proc.stdout.splitlines() if "XID" in entry.upper())
+            return [entry.strip() for entry in matches]
+        except Exception:
+            return []
+
+    @staticmethod
+    def _new_xid_events(before: list[str], after: list[str]) -> list[str]:
+        known = frozenset(before)  # hashed once so each membership test below is O(1)
+        return [entry for entry in after if entry not in known]
+
+    def _evaluate_telemetry(self, telemetry: list, pass_tflops: list, xid_events: list[str] | None = None) -> dict:
+        """Score sampled telemetry against the thresholds in `self.stress_cfg`.
+
+        Args:
+            telemetry: samples shaped {"time": ..., "gpus": [{"index", "temp_c", "power_w", "throttle"}]}.
+            pass_tflops: per-pass {"elapsed_sec", "tflops"} dicts, or bare numbers.
+            xid_events: XID log lines attributed to this run; None is treated as none.
+
+        Returns:
+            A summary dict whose "passed" is True exactly when its "failures" list is empty.
+        """
+        cfg = self.stress_cfg
+        max_temp = float(cfg.get("max_temp_c", 80))
+        max_delta = float(cfg.get("max_temp_delta_c", 5))
+        min_power = float(cfg.get("min_power_watts", 630))
+        max_jitter = float(cfg.get("max_tflops_jitter_pct", 5))
+        require_jitter = bool(cfg.get("require_tflops_jitter", True))
+        duration = float(cfg.get("duration_sec", 60))
+        requested_warmup = float(cfg.get("warmup_sec", 60))
+        warmup_sec = min(requested_warmup, max(0.0, duration * 0.2))
+        min_steady_samples = int(cfg.get("min_steady_samples", 10))
+        temps = {}
+        powers = {}
+        throttle_bad = []
+        xid_events = xid_events or []
+        first_time = telemetry[0].get("time", 0) if telemetry else 0
+        steady_telemetry = [s for s in telemetry if s.get("time", 0) - first_time >= warmup_sec]
+        # With too few post-warmup samples, judge every sample instead of a thin slice.
+        evaluation_samples = steady_telemetry if len(steady_telemetry) >= min_steady_samples else telemetry
+        for sample in evaluation_samples:
+            for g in sample.get("gpus", []):
+                idx = g["index"]
+                temps.setdefault(idx, []).append(g["temp_c"])
+                powers.setdefault(idx, []).append(g["power_w"])
+                try:
+                    bitmask = int(str(g["throttle"]), 16)
+                except ValueError:
+                    bitmask = 0
+                real_throttle = bitmask & ~0x1  # bit 0 is ignored; the remaining bits count as throttling
+                if real_throttle:
+                    throttle_bad.append({
+                        "gpu": idx,
+                        "throttle": g["throttle"],
+                        "real_throttle": f"0x{real_throttle:x}",
+                    })
+        max_temps = {idx: max(vals) for idx, vals in temps.items() if vals}
+        avg_powers = {idx: sum(vals) / len(vals) for idx, vals in powers.items() if vals}
+        temp_delta = (max(max_temps.values()) - min(max_temps.values())) if len(max_temps) >= 2 else 0
+        jitter = 0
+        steady_tflops = []
+        for item in pass_tflops:
+            if isinstance(item, dict):
+                if float(item.get("elapsed_sec", 0)) >= warmup_sec:
+                    steady_tflops.append(float(item.get("tflops", 0)))
+            else:
+                steady_tflops.append(float(item))
+        if len(steady_tflops) < 2 and pass_tflops:
+            steady_tflops = [
+                float(item.get("tflops", 0)) if isinstance(item, dict) else float(item)
+                for item in pass_tflops
+            ]
+        if steady_tflops:
+            mean = sum(steady_tflops) / len(steady_tflops)
+            jitter = max(abs(v - mean) / mean * 100 for v in steady_tflops) if mean else 0
+        failures = []
+        temp_failures = {idx: v for idx, v in max_temps.items() if v > max_temp}
+        power_failures = {idx: v for idx, v in avg_powers.items() if v < min_power}
+        if not evaluation_samples:
+            failures.append("no telemetry samples available for evaluation")
+        if temp_failures:
+            failures.append(
+                "max temperature above threshold: "
+                + ", ".join(f"GPU {idx} {val:.1f}C" for idx, val in sorted(temp_failures.items()))
+            )
+        if temp_delta > max_delta:
+            failures.append(f"GPU temperature delta {temp_delta:.1f}C exceeds {max_delta:.1f}C")
+        if power_failures:
+            failures.append(
+                "average steady-state power below threshold: "
+                + ", ".join(f"GPU {idx} {val:.1f}W" for idx, val in sorted(power_failures.items()))
+            )
+        if throttle_bad:
+            failures.append(
+                f"non-idle throttle reasons observed in {len(throttle_bad)} samples "
+                f"(first: GPU {throttle_bad[0]['gpu']} {throttle_bad[0]['real_throttle']})"
+            )
+        if xid_events:
+            failures.append(f"{len(xid_events)} new XID/NVRM XID events observed")
+        if require_jitter and len(steady_tflops) < 2:
+            failures.append(
+                f"insufficient steady TFLOPS samples for jitter evaluation: {len(steady_tflops)} < 2"
+            )
+        if jitter > max_jitter:
+            failures.append(f"TFLOPS jitter {jitter:.2f}% exceeds {max_jitter:.2f}%")
+        passed = not failures  # one source of truth: every failing check above recorded a reason
+        return {
+            "passed": passed,
+            "samples": len(telemetry),
+            "steady_samples": len(evaluation_samples),
+            "warmup_sec": round(warmup_sec, 1),
+            "max_temp_c": {k: round(v, 1) for k, v in max_temps.items()},
+            "avg_power_w": {k: round(v, 1) for k, v in avg_powers.items()},
+            "temp_delta_c": round(temp_delta, 1),
+            "throttle_events": throttle_bad[:20],
+            "throttle_event_count": len(throttle_bad),
+            "xid_events": xid_events[-20:],
+            "tflops_jitter_pct": round(jitter, 2),
+            "steady_tflops_samples": len(steady_tflops),
+            "failures": failures,
+            "thresholds": {
+                "max_temp_c": max_temp,
+                "max_temp_delta_c": max_delta,
+                "min_power_w": min_power,
+                "max_tflops_jitter_pct": max_jitter,
+                "require_tflops_jitter": require_jitter,
+                "warmup_sec": requested_warmup,
+                "min_steady_samples": min_steady_samples,
+            },
+        }
+
     @staticmethod
     def print_results(results: dict, console: Console = None):
         c = console or Console()
@@ -245,5 +467,21 @@ class StressTest:
                 color = "green" if status == "PASS" else "red"
                 c.print(f"    GPU {gid}: [{color}]{status}[/{color}]")
 
+        telemetry = results.get("telemetry") or {}
+        if telemetry:
+            c.print("\n  Telemetry:")
+            c.print(f"    Samples: {telemetry.get('samples', 0)} total, {telemetry.get('steady_samples', 0)} evaluated after {telemetry.get('warmup_sec', 0)}s warmup")
+            c.print(f"    Avg steady power: {telemetry.get('avg_power_w', {})}")
+            c.print(f"    Max steady temp: {telemetry.get('max_temp_c', {})}")
+            c.print(f"    Temp delta: {telemetry.get('temp_delta_c', 'N/A')} C")
+            c.print(f"    TFLOPS jitter: {telemetry.get('tflops_jitter_pct', 'N/A')}%")
+            c.print(f"    Throttle events: {telemetry.get('throttle_event_count', len(telemetry.get('throttle_events', [])))}")
+            c.print(f"    XID events: {len(telemetry.get('xid_events', []))}")
+            failures = telemetry.get("failures", [])
+            if failures:
+                c.print("  [red]Failure reasons:[/red]")
+                for reason in failures:
+                    c.print(f"    [red]- {reason}[/red]")
+
         if results.get("error"):
             c.print(f"  [red]Error: {results['error']}[/red]")
diff --git a/modules/training_sim.py b/modules/training_sim.py
index dc7f5a3..af93850 100644
--- a/modules/training_sim.py
+++ b/modules/training_sim.py
@@ -1,8 +1,13 @@
 """Training simulation module - LLM training workload with PyTorch."""
 
+import json
+import os
+import sys
+import tempfile
 import time
 import subprocess
 import shutil
+import math
 from datetime import datetime
 from typing import Optional
 
@@ -36,6 +41,7 @@ class TrainingSim:
         batch_size = self.train_cfg.get("batch_size", 8)
         seq_length = self.train_cfg.get("seq_length", 2048)
         num_steps = self.train_cfg.get("num_steps", 50)
+        warmup_steps = int(self.train_cfg.get("warmup_steps", 5))
         dtype_str = self.train_cfg.get("dtype", "bf16")
 
         dtype_map = {
@@ -47,7 +53,13 @@ class TrainingSim:
 
         self.console.print(f"[cyan]Training Simulation[/cyan]")
         self.console.print(f"  Model: {model_name} | Batch: {batch_size} | Seq: {seq_length} | "
-                           f"DType: {dtype_str} | Steps: {num_steps} | GPUs: {gpu_count}")
+                           f"DType: {dtype_str} | Steps: {num_steps} | Warmup: {warmup_steps} | GPUs: {gpu_count}")
+
+        if self.train_cfg.get("mode", "ddp") == "ddp" and gpu_count > 1:
+            ddp_result = self._run_synthetic_ddp(gpu_count, batch_size, seq_length, num_steps, dtype_str)
+            if ddp_result.get("passed") or not self.train_cfg.get("allow_fallback", False):
+                return ddp_result
+            self.console.print("[yellow]DDP synthetic training failed, falling back to single-process synthetic path[/yellow]")
 
         try:
             from transformers import AutoModelForCausalLM, AutoTokenizer
@@ -87,9 +99,10 @@ class TrainingSim:
                 BarColumn(), TextColumn("{task.completed}/{task.total}"),
                 TimeElapsedColumn(), console=self.console,
             ) as progress:
-                task = progress.add_task("Training steps...", total=num_steps)
+                total_steps = num_steps + warmup_steps
+                task = progress.add_task("Training steps...", total=total_steps)
 
-                for step in range(num_steps):
+                for step in range(total_steps):
                     torch.cuda.synchronize()
                     t0 = time.perf_counter()
 
@@ -119,8 +132,15 @@ class TrainingSim:
 
                     progress.advance(task)
 
-            avg_step_time = sum(step_times) / len(step_times)
+            measured_steps = step_times[warmup_steps:] if len(step_times) > warmup_steps else step_times
+            avg_step_time = sum(measured_steps) / len(measured_steps)
             throughput = batch_size * seq_length / avg_step_time
+            jitter = self._jitter_pct(measured_steps)
+            peak_mem = round(max(mem_usage) if mem_usage else 0, 2)
+            final_loss = float(loss.item()) if hasattr(loss, "item") else float("nan")
+            passed = self._acceptance_pass(throughput, jitter, peak_mem, final_loss)
+            if self.train_cfg.get("require_distributed", True):
+                passed = False
 
             return {
                 "model": model_name,
@@ -130,11 +150,18 @@ class TrainingSim:
                 "batch_size": batch_size,
                 "seq_length": seq_length,
                 "num_steps": num_steps,
+                "warmup_steps": warmup_steps,
+                "total_steps": total_steps,
                 "avg_step_time_ms": round(avg_step_time * 1000, 1),
                 "throughput_tokens_per_sec": round(throughput, 0),
                 "throughput_samples_per_sec": round(batch_size / avg_step_time, 2),
-                "peak_memory_gb": round(max(mem_usage) if mem_usage else 0, 2),
-                "final_loss": round(loss.item(), 4) if hasattr(loss, 'item') else None,
+                "peak_memory_gb": peak_mem,
+                "final_loss": round(final_loss, 4),
+                "step_jitter_pct": round(jitter, 2),
+                "distributed_mode": "device_map",
+                "loss_finite": math.isfinite(final_loss),
+                "passed": passed,
+                "acceptance_gap": "8-GPU DDP was not used" if self.train_cfg.get("require_distributed", True) else "",
                 "timestamp": datetime.now().isoformat(),
             }
 
@@ -142,6 +169,196 @@ class TrainingSim:
             self.console.print(f"[yellow]Model loading failed: {e}[/yellow]")
             return self._run_synthetic(gpu_count, batch_size, seq_length, num_steps, dtype)
 
+    def _run_synthetic_ddp(self, gpu_count: int, batch_size: int, seq_length: int,
+                           num_steps: int, dtype_str: str) -> dict:
+        """Run the 1.5B synthetic Transformer under DDP via torchrun, one process per GPU.
+
+        ``batch_size`` is the requested global batch; every rank trains on
+        ``max(1, batch_size // gpu_count)`` samples, and throughput is derived from the
+        batch that was actually processed. ``num_steps`` measured steps follow the
+        ``warmup_steps`` configured in ``train_cfg``.
+
+        Returns:
+            Rank 0's metrics plus ``passed``/``timestamp``, or a failure dict with ``error``
+            when torchrun is missing, times out, exits non-zero, or emits no parsable result.
+        """
+        def failure(error: str) -> dict:
+            # Common result shape for every outcome in which no metrics could be collected.
+            return {
+                "model": "synthetic_transformer_1.5b",
+                "gpu_count": gpu_count,
+                "distributed_mode": "ddp",
+                "passed": False,
+                "error": error,
+                "timestamp": datetime.now().isoformat(),
+            }
+
+        torchrun = os.path.join(os.path.dirname(sys.executable), "torchrun")
+        if not os.path.isfile(torchrun):
+            torchrun = shutil.which("torchrun") or ""
+        if not torchrun:
+            return failure("torchrun not found")
+
+        script = r'''
+import json
+import math
+import os
+import time
+import torch
+import torch.distributed as dist
+from torch.nn.parallel import DistributedDataParallel as DDP
+
+def main():
+    local_rank = int(os.environ["LOCAL_RANK"])
+    world_size = int(os.environ["WORLD_SIZE"])
+    torch.cuda.set_device(local_rank)
+    dist.init_process_group("nccl")
+
+    global_batch = int(os.environ["TRAIN_BATCH_SIZE"])
+    local_batch = max(1, global_batch // world_size)
+    effective_batch = local_batch * world_size  # samples really processed per step; can differ from global_batch
+    seq_length = int(os.environ["TRAIN_SEQ_LENGTH"])
+    num_steps = int(os.environ["TRAIN_NUM_STEPS"])
+    warmup_steps = int(os.environ.get("TRAIN_WARMUP_STEPS", "5"))
+    total_steps = num_steps + warmup_steps
+    dtype_name = os.environ.get("TRAIN_DTYPE", "bf16")
+    dtype = {"fp16": torch.float16, "bf16": torch.bfloat16, "fp32": torch.float32}.get(dtype_name, torch.bfloat16)
+
+    hidden_size = 4096
+    num_layers = 6
+    num_heads = 32
+    vocab_size = 32000
+
+    class SyntheticTransformer(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.embed = torch.nn.Embedding(vocab_size, hidden_size)
+            self.layers = torch.nn.ModuleList([
+                torch.nn.TransformerEncoderLayer(
+                    d_model=hidden_size,
+                    nhead=num_heads,
+                    dim_feedforward=hidden_size * 4,
+                    batch_first=True,
+                    dtype=dtype,
+                ) for _ in range(num_layers)
+            ])
+            self.head = torch.nn.Linear(hidden_size, vocab_size, dtype=dtype)
+
+        def forward(self, x):
+            h = self.embed(x).to(dtype)
+            for layer in self.layers:
+                h = layer(h)
+            return self.head(h)
+
+    model = SyntheticTransformer().cuda()
+    total_params = sum(p.numel() for p in model.parameters())
+    model = DDP(model, device_ids=[local_rank], output_device=local_rank)
+    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
+    input_ids = torch.randint(0, vocab_size, (local_batch, seq_length), device="cuda")
+    step_times = []
+    last_loss = torch.tensor(float("nan"), device="cuda")
+    torch.cuda.reset_peak_memory_stats(local_rank)
+
+    for _ in range(total_steps):
+        torch.cuda.synchronize()
+        t0 = time.perf_counter()
+        with torch.amp.autocast("cuda", dtype=dtype, enabled=dtype in (torch.float16, torch.bfloat16)):
+            logits = model(input_ids)
+            loss = torch.nn.functional.cross_entropy(logits.reshape(-1, vocab_size), input_ids.reshape(-1))
+        loss.backward()
+        optimizer.step()
+        optimizer.zero_grad(set_to_none=True)
+        torch.cuda.synchronize()
+        step_times.append(time.perf_counter() - t0)
+        last_loss = loss.detach()
+
+    peak_mem = torch.tensor(torch.cuda.max_memory_allocated(local_rank) / 1024**3, device="cuda")
+    dist.all_reduce(peak_mem, op=dist.ReduceOp.MAX)
+    finite = torch.tensor(1 if math.isfinite(float(last_loss.item())) else 0, device="cuda")
+    dist.all_reduce(finite, op=dist.ReduceOp.MIN)
+
+    if dist.get_rank() == 0:
+        measured_steps = step_times[warmup_steps:] if len(step_times) > warmup_steps else step_times
+        avg_step = sum(measured_steps) / len(measured_steps)
+        jitter = max(abs(v - avg_step) / avg_step * 100 for v in measured_steps) if avg_step else 0.0
+        throughput = effective_batch * seq_length / avg_step if avg_step else 0.0
+        print("TRAINING_DDP_JSON=" + json.dumps({
+            "model": "synthetic_transformer_1.5b",
+            "total_params_m": round(total_params / 1e6, 1),
+            "num_layers": num_layers,
+            "hidden_size": hidden_size,
+            "gpu_count": world_size,
+            "dtype": dtype_name,
+            "batch_size": global_batch,
+            "local_batch_size": local_batch,
+            "seq_length": seq_length,
+            "num_steps": num_steps,
+            "warmup_steps": warmup_steps,
+            "total_steps": total_steps,
+            "avg_step_time_ms": round(avg_step * 1000, 1),
+            "throughput_tokens_per_sec": round(throughput, 0),
+            "throughput_samples_per_sec": round(effective_batch / avg_step, 2) if avg_step else 0,
+            "peak_memory_gb": round(float(peak_mem.item()), 2),
+            "final_loss": round(float(last_loss.item()), 4),
+            "step_jitter_pct": round(jitter, 2),
+            "distributed_mode": "ddp",
+            "loss_finite": bool(int(finite.item())),
+        }), flush=True)
+    dist.destroy_process_group()
+
+if __name__ == "__main__":
+    main()
+'''
+        with tempfile.NamedTemporaryFile("w", suffix="_training_ddp.py", delete=False) as tmp:
+            tmp.write(script)
+
+        env = {
+            **os.environ,
+            "TRAIN_BATCH_SIZE": str(batch_size),
+            "TRAIN_SEQ_LENGTH": str(seq_length),
+            "TRAIN_NUM_STEPS": str(num_steps),
+            "TRAIN_WARMUP_STEPS": str(int(self.train_cfg.get("warmup_steps", 5))),
+            "TRAIN_DTYPE": dtype_str,
+            "NCCL_DEBUG": os.environ.get("NCCL_DEBUG", "WARN"),
+        }
+        cmd = [torchrun, f"--nproc_per_node={gpu_count}", tmp.name]
+        self.console.print(f"  Running synthetic 1.5B DDP via torchrun ({gpu_count} processes)...")
+        try:
+            timeout = int(self.train_cfg.get("timeout_sec", max(600, num_steps * 180)))
+            r = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout, env=env)
+        except subprocess.TimeoutExpired:
+            return failure("training_ddp_timeout")
+        finally:
+            try:
+                os.unlink(tmp.name)
+            except OSError:
+                pass  # best-effort cleanup of the worker script; runs on every exit path
+
+        marker = "TRAINING_DDP_JSON="
+        payload = None
+        for line in (r.stdout + "\n" + r.stderr).splitlines():
+            if marker in line:
+                payload = line.split(marker, 1)[1].strip()
+        if r.returncode != 0 or not payload:
+            return failure((r.stderr or r.stdout or "training_ddp_failed")[-1000:])
+        try:
+            result = json.loads(payload)
+        except json.JSONDecodeError as exc:  # malformed marker line: fail the test, do not crash
+            return failure(f"invalid training_ddp result: {exc}")
+
+        loss_value = float(result.get("final_loss", "nan"))
+        passed = self._acceptance_pass(
+            float(result.get("throughput_tokens_per_sec", 0)),
+            float(result.get("step_jitter_pct", 999)),
+            float(result.get("peak_memory_gb", 999)),
+            loss_value,
+        ) and bool(result.get("loss_finite", False)) and result.get("gpu_count") == gpu_count
+        result.update({
+            "passed": passed,
+            "timestamp": datetime.now().isoformat(),
+        })
+        return result
+
     def _run_synthetic(self, gpu_count, batch_size, seq_length, num_steps, dtype) -> dict:
         self.console.print("  Running synthetic training benchmark...")
 
@@ -170,11 +387,17 @@ class TrainingSim:
                     h = layer(h)
                 return self.head(h)
 
-        model = SyntheticTransformer().cuda()
+        model = SyntheticTransformer()
         total_params = sum(p.numel() for p in model.parameters())
 
         self.console.print(f"  Synthetic params: {total_params / 1e6:.1f}M")
 
+        distributed_mode = "single_gpu"
+        if gpu_count > 1:
+            model = torch.nn.DataParallel(model).cuda()
+            distributed_mode = "data_parallel"
+        else:
+            model = model.cuda()
         model.train()
         optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
 
@@ -183,14 +406,17 @@ class TrainingSim:
         step_times = []
         mem_usage = []
 
+        warmup_steps = int(self.train_cfg.get("warmup_steps", 5))
+        total_steps = num_steps + warmup_steps
+
         with Progress(
             SpinnerColumn(), TextColumn("[progress.description]{task.description}"),
             BarColumn(), TextColumn("{task.completed}/{task.total}"),
             TimeElapsedColumn(), console=self.console,
         ) as progress:
-            task = progress.add_task("Synthetic training...", total=num_steps)
+            task = progress.add_task("Synthetic training...", total=total_steps)
 
-            for step in range(num_steps):
+            for step in range(total_steps):
                 torch.cuda.synchronize()
                 t0 = time.perf_counter()
 
@@ -206,14 +432,22 @@ class TrainingSim:
                 elapsed = time.perf_counter() - t0
                 step_times.append(elapsed)
 
-                mem_used = torch.cuda.max_memory_allocated() / 1024**3
+                mem_used = max(torch.cuda.max_memory_allocated(i) for i in range(gpu_count)) / 1024**3
                 mem_usage.append(mem_used)
-                torch.cuda.reset_peak_memory_stats()
+                for i in range(gpu_count):
+                    torch.cuda.reset_peak_memory_stats(i)
 
                 progress.advance(task)
 
-        avg_step_time = sum(step_times) / len(step_times)
+        measured_steps = step_times[warmup_steps:] if len(step_times) > warmup_steps else step_times
+        avg_step_time = sum(measured_steps) / len(measured_steps)
         throughput = batch_size * seq_length / avg_step_time
+        jitter = self._jitter_pct(measured_steps)
+        peak_mem = round(max(mem_usage) if mem_usage else 0, 2)
+        final_loss = float(loss.item())
+        passed = self._acceptance_pass(throughput, jitter, peak_mem, final_loss)
+        if self.train_cfg.get("require_distributed", True):
+            passed = False
 
         return {
             "model": "synthetic_transformer",
@@ -225,14 +459,36 @@ class TrainingSim:
             "batch_size": batch_size,
             "seq_length": seq_length,
             "num_steps": num_steps,
+            "warmup_steps": warmup_steps,
+            "total_steps": total_steps,
             "avg_step_time_ms": round(avg_step_time * 1000, 1),
             "throughput_tokens_per_sec": round(throughput, 0),
             "throughput_samples_per_sec": round(batch_size / avg_step_time, 2),
-            "peak_memory_gb": round(max(mem_usage) if mem_usage else 0, 2),
-            "final_loss": round(loss.item(), 4),
+            "peak_memory_gb": peak_mem,
+            "final_loss": round(final_loss, 4),
+            "step_jitter_pct": round(jitter, 2),
+            "distributed_mode": distributed_mode,
+            "loss_finite": math.isfinite(final_loss),
+            "passed": passed,
+            "acceptance_gap": "8-GPU DDP was not used" if self.train_cfg.get("require_distributed", True) else "",
             "timestamp": datetime.now().isoformat(),
         }
 
+    @staticmethod
+    def _jitter_pct(step_times: list[float]) -> float:
+        """Return the largest deviation of any step time from the mean, as a percentage of the mean."""
+        average = sum(step_times) / len(step_times) if step_times else 0.0
+        # No samples or a zero mean would divide by zero, so both report zero jitter.
+        return max(abs(t - average) / average * 100 for t in step_times) if average else 0.0
+
+    def _acceptance_pass(self, throughput: float, jitter: float, peak_mem: float, loss_value: float) -> bool:
+        """Return True only if throughput, jitter, peak memory and loss all meet the ``train_cfg`` thresholds."""
+        cfg = self.train_cfg
+        if not throughput >= float(cfg.get("min_tokens_per_sec", 45000)):  # written as "not >=" so NaN fails too
+            return False
+        within_limits = jitter <= float(cfg.get("max_step_jitter_pct", 3)) and peak_mem <= float(cfg.get("max_peak_memory_gb", 70))
+        return within_limits and math.isfinite(loss_value)
+
     @staticmethod
     def print_results(results: dict, console: Console = None):
         c = console or Console()
@@ -254,11 +510,15 @@ class TrainingSim:
             ("Batch Size", str(results.get("batch_size", "N/A"))),
             ("Seq Length", str(results.get("seq_length", "N/A"))),
             ("Steps", str(results.get("num_steps", "N/A"))),
+            ("Warmup Steps", str(results.get("warmup_steps", "N/A"))),
             ("Avg Step Time", f"{results.get('avg_step_time_ms', 'N/A')} ms"),
             ("Throughput", f"{results.get('throughput_tokens_per_sec', 'N/A')} tokens/s"),
             ("Samples/sec", f"{results.get('throughput_samples_per_sec', 'N/A')}"),
             ("Peak Memory", f"{results.get('peak_memory_gb', 'N/A')} GB"),
             ("Final Loss", str(results.get("final_loss", "N/A"))),
+            ("Step Jitter", f"{results.get('step_jitter_pct', 'N/A')}%"),
+            ("Distributed Mode", results.get("distributed_mode", "N/A")),
+            ("Verdict", "PASS" if results.get("passed") else "FAIL"),
         ]
         for label, val in metrics:
             table.add_row(label, str(val))
diff --git a/reports_all_aikubeworker0016.json b/reports_all_aikubeworker0016.json
new file mode 100644
index 0000000..d3db53f
--- /dev/null
+++ b/reports_all_aikubeworker0016.json
@@ -0,0 +1,921 @@
+{
+  "timestamp": "2026-05-22T15:49:02.368516",
+  "gpu_info": {
+    "driver_version": "580.159.03",
+    "cuda_version": "13.0",
+    "gpu_count": 8,
+    "gpus": [
+      {
+        "index": 0,
+        "name": "NVIDIA H100 80GB HBM3",
+        "uuid": "GPU-dfbc9513-255d-4fe7-2b77-7b1ec3972e75",
+        "pci_bus_id": "00000000:18:00.0",
+        "pcie_link_gen": 5,
+        "pcie_link_width": 16,
+        "vram_total_mb": 81559,
+        "vram_used_mb": 4,
+        "vram_free_mb": 81076,
+        "power_draw": 69.98,
+        "power_limit": 700.0,
+        "clock_sm": 345,
+        "clock_mem": 2619,
+        "temperature": 21,
+        "fan_speed": 0,
+        "persistence_mode": false,
+        "compute_mode": "Default",
+        "serial_number": "1651924016120",
+        "ecc_errors_single": 0,
+        "ecc_errors_double": 0
+      },
+      {
+        "index": 1,
+        "name": "NVIDIA H100 80GB HBM3",
+        "uuid": "GPU-bb845ef7-d7b5-f011-9395-ea74274e2282",
+        "pci_bus_id": "00000000:2A:00.0",
+        "pcie_link_gen": 5,
+        "pcie_link_width": 16,
+        "vram_total_mb": 81559,
+        "vram_used_mb": 4,
+        "vram_free_mb": 81076,
+        "power_draw": 67.54,
+        "power_limit": 700.0,
+        "clock_sm": 345,
+        "clock_mem": 2619,
+        "temperature": 21,
+        "fan_speed": 0,
+        "persistence_mode": false,
+        "compute_mode": "Default",
+        "serial_number": "1651924015483",
+        "ecc_errors_single": 0,
+        "ecc_errors_double": 0
+      },
+      {
+        "index": 2,
+        "name": "NVIDIA H100 80GB HBM3",
+        "uuid": "GPU-3720cf13-2a34-be38-27be-0a7adc4addc4",
+        "pci_bus_id": "00000000:3A:00.0",
+        "pcie_link_gen": 5,
+        "pcie_link_width": 16,
+        "vram_total_mb": 81559,
+        "vram_used_mb": 4,
+        "vram_free_mb": 81076,
+        "power_draw": 66.82,
+        "power_limit": 700.0,
+        "clock_sm": 345,
+        "clock_mem": 2619,
+        "temperature": 22,
+        "fan_speed": 0,
+        "persistence_mode": false,
+        "compute_mode": "Default",
+        "serial_number": "1651924025595",
+        "ecc_errors_single": 0,
+        "ecc_errors_double": 0
+      },
+      {
+        "index": 3,
+        "name": "NVIDIA H100 80GB HBM3",
+        "uuid": "GPU-87080b2d-ac43-be0d-d574-c193078850ae",
+        "pci_bus_id": "00000000:5D:00.0",
+        "pcie_link_gen": 5,
+        "pcie_link_width": 16,
+        "vram_total_mb": 81559,
+        "vram_used_mb": 4,
+        "vram_free_mb": 81076,
+        "power_draw": 67.02,
+        "power_limit": 700.0,
+        "clock_sm": 345,
+        "clock_mem": 2619,
+        "temperature": 21,
+        "fan_speed": 0,
+        "persistence_mode": false,
+        "compute_mode": "Default",
+        "serial_number": "1651924016862",
+        "ecc_errors_single": 0,
+        "ecc_errors_double": 0
+      },
+      {
+        "index": 4,
+        "name": "NVIDIA H100 80GB HBM3",
+        "uuid": "GPU-599bd883-cc5c-a5dd-6c33-c15f7049da48",
+        "pci_bus_id": "00000000:9A:00.0",
+        "pcie_link_gen": 5,
+        "pcie_link_width": 16,
+        "vram_total_mb": 81559,
+        "vram_used_mb": 4,
+        "vram_free_mb": 81076,
+        "power_draw": 67.24,
+        "power_limit": 700.0,
+        "clock_sm": 345,
+        "clock_mem": 2619,
+        "temperature": 21,
+        "fan_speed": 0,
+        "persistence_mode": false,
+        "compute_mode": "Default",
+        "serial_number": "1651924025670",
+        "ecc_errors_single": 0,
+        "ecc_errors_double": 0
+      },
+      {
+        "index": 5,
+        "name": "NVIDIA H100 80GB HBM3",
+        "uuid": "GPU-a1c6bba4-61b0-e623-06c9-9c88635e26fe",
+        "pci_bus_id": "00000000:AB:00.0",
+        "pcie_link_gen": 5,
+        "pcie_link_width": 16,
+        "vram_total_mb": 81559,
+        "vram_used_mb": 4,
+        "vram_free_mb": 81076,
+        "power_draw": 69.31,
+        "power_limit": 700.0,
+        "clock_sm": 345,
+        "clock_mem": 2619,
+        "temperature": 23,
+        "fan_speed": 0,
+        "persistence_mode": false,
+        "compute_mode": "Default",
+        "serial_number": "1651924027166",
+        "ecc_errors_single": 0,
+        "ecc_errors_double": 0
+      },
+      {
+        "index": 6,
+        "name": "NVIDIA H100 80GB HBM3",
+        "uuid": "GPU-98745a0c-39bd-3e56-d6ca-54ba3647ab6d",
+        "pci_bus_id": "00000000:BA:00.0",
+        "pcie_link_gen": 5,
+        "pcie_link_width": 16,
+        "vram_total_mb": 81559,
+        "vram_used_mb": 4,
+        "vram_free_mb": 81076,
+        "power_draw": 67.84,
+        "power_limit": 700.0,
+        "clock_sm": 345,
+        "clock_mem": 2619,
+        "temperature": 21,
+        "fan_speed": 0,
+        "persistence_mode": false,
+        "compute_mode": "Default",
+        "serial_number": "1651924026234",
+        "ecc_errors_single": 0,
+        "ecc_errors_double": 0
+      },
+      {
+        "index": 7,
+        "name": "NVIDIA H100 80GB HBM3",
+        "uuid": "GPU-8c73bd8b-666b-357e-ac5d-c75ac7a759db",
+        "pci_bus_id": "00000000:DB:00.0",
+        "pcie_link_gen": 5,
+        "pcie_link_width": 16,
+        "vram_total_mb": 81559,
+        "vram_used_mb": 4,
+        "vram_free_mb": 81076,
+        "power_draw": 66.21,
+        "power_limit": 700.0,
+        "clock_sm": 345,
+        "clock_mem": 2619,
+        "temperature": 21,
+        "fan_speed": 0,
+        "persistence_mode": false,
+        "compute_mode": "Default",
+        "serial_number": "1651924027255",
+        "ecc_errors_single": 0,
+        "ecc_errors_double": 0
+      }
+    ],
+    "topology": "\t\u001b[4mGPU0\tGPU1\tGPU2\tGPU3\tGPU4\tGPU5\tGPU6\tGPU7\tNIC0\tNIC1\tNIC2\tNIC3\tNIC4\tNIC5\tNIC6\tNIC7\tNIC8\tNIC9\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\u001b[0m\nGPU0\t X \tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\tPIX\tNODE\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t0-55,112-167\t0\t\tN/A\nGPU1\tNV18\t X \tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\tNODE\tPIX\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t0-55,112-167\t0\t\tN/A\nGPU2\tNV18\tNV18\t X \tNV18\tNV18\tNV18\tNV18\tNV18\tNODE\tNODE\tPIX\tPIX\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t0-55,112-167\t0\t\tN/A\nGPU3\tNV18\tNV18\tNV18\t X \tNV18\tNV18\tNV18\tNV18\tNODE\tNODE\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t0-55,112-167\t0\t\tN/A\nGPU4\tNV18\tNV18\tNV18\tNV18\t X \tNV18\tNV18\tNV18\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tPIX\tNODE\tNODE\tNODE\t56-111,168-223\t1\t\tN/A\nGPU5\tNV18\tNV18\tNV18\tNV18\tNV18\t X \tNV18\tNV18\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\tPIX\tNODE\tNODE\t56-111,168-223\t1\t\tN/A\nGPU6\tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\t X \tNV18\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tPIX\tPIX\t56-111,168-223\t1\t\tN/A\nGPU7\tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\t X \tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tNODE\tNODE\t56-111,168-223\t1\t\tN/A\nNIC0\tPIX\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t X \tNODE\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC1\tNODE\tPIX\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tNODE\t X \tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC2\tNODE\tNODE\tPIX\tNODE\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\t X \tPIX\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC3\tNODE\tNODE\tPIX\tNODE\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tPIX\t X \tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC4\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tNODE\tNODE\t X \tPIX\tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC5\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tNODE\tNODE\tPIX\t X \tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC6\tSYS\tSYS\tSYS\tSYS\tPIX\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\t X 
\tNODE\tNODE\tNODE\t\t\t\t\nNIC7\tSYS\tSYS\tSYS\tSYS\tNODE\tPIX\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\t X \tNODE\tNODE\t\t\t\t\nNIC8\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tPIX\tNODE\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\t X \tPIX\t\t\t\t\nNIC9\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tPIX\tNODE\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tPIX\t X \t\t\t\t\n\nLegend:\n\n  X    = Self\n  SYS  = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n  NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n  PHB  = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n  PXB  = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n  PIX  = Connection traversing at most a single PCIe bridge\n  NV#  = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n  NIC0: mlx5_0\n  NIC1: mlx5_1\n  NIC2: mlx5_2\n  NIC3: mlx5_3\n  NIC4: mlx5_4\n  NIC5: mlx5_5\n  NIC6: mlx5_6\n  NIC7: mlx5_7\n  NIC8: mlx5_8\n  NIC9: mlx5_9\n\n",
+    "timestamp": "2026-05-22T15:49:09.197459",
+    "detected_gpu_type": "h100",
+    "gpu_label": "H100 SXM5"
+  },
+  "health": {
+    "passed": true,
+    "gpu_health": [
+      {
+        "index": 0,
+        "status": "WARN",
+        "checks": {
+          "temperature": {
+            "value": 21,
+            "status": "PASS",
+            "threshold": 75
+          },
+          "power": {
+            "value": 69.86,
+            "limit": 700.0,
+            "status": "PASS"
+          },
+          "ecc_errors": {
+            "single": 0,
+            "double": 0,
+            "status": "PASS"
+          },
+          "memory_errors": {
+            "status": "PASS"
+          },
+          "pcie_link": {
+            "gen": 5,
+            "width": 16,
+            "status": "PASS"
+          },
+          "clock_speed": {
+            "sm": 345,
+            "mem": 2619,
+            "status": "PASS"
+          },
+          "throttling": {
+            "status": "PASS",
+            "reasons": []
+          },
+          "persistence_mode": {
+            "enabled": false,
+            "status": "WARN"
+          }
+        }
+      },
+      {
+        "index": 1,
+        "status": "WARN",
+        "checks": {
+          "temperature": {
+            "value": 21,
+            "status": "PASS",
+            "threshold": 75
+          },
+          "power": {
+            "value": 67.48,
+            "limit": 700.0,
+            "status": "PASS"
+          },
+          "ecc_errors": {
+            "single": 0,
+            "double": 0,
+            "status": "PASS"
+          },
+          "memory_errors": {
+            "status": "PASS"
+          },
+          "pcie_link": {
+            "gen": 5,
+            "width": 16,
+            "status": "PASS"
+          },
+          "clock_speed": {
+            "sm": 345,
+            "mem": 2619,
+            "status": "PASS"
+          },
+          "throttling": {
+            "status": "PASS",
+            "reasons": []
+          },
+          "persistence_mode": {
+            "enabled": false,
+            "status": "WARN"
+          }
+        }
+      },
+      {
+        "index": 2,
+        "status": "WARN",
+        "checks": {
+          "temperature": {
+            "value": 22,
+            "status": "PASS",
+            "threshold": 75
+          },
+          "power": {
+            "value": 66.76,
+            "limit": 700.0,
+            "status": "PASS"
+          },
+          "ecc_errors": {
+            "single": 0,
+            "double": 0,
+            "status": "PASS"
+          },
+          "memory_errors": {
+            "status": "PASS"
+          },
+          "pcie_link": {
+            "gen": 5,
+            "width": 16,
+            "status": "PASS"
+          },
+          "clock_speed": {
+            "sm": 345,
+            "mem": 2619,
+            "status": "PASS"
+          },
+          "throttling": {
+            "status": "PASS",
+            "reasons": []
+          },
+          "persistence_mode": {
+            "enabled": false,
+            "status": "WARN"
+          }
+        }
+      },
+      {
+        "index": 3,
+        "status": "WARN",
+        "checks": {
+          "temperature": {
+            "value": 21,
+            "status": "PASS",
+            "threshold": 75
+          },
+          "power": {
+            "value": 67.06,
+            "limit": 700.0,
+            "status": "PASS"
+          },
+          "ecc_errors": {
+            "single": 0,
+            "double": 0,
+            "status": "PASS"
+          },
+          "memory_errors": {
+            "status": "PASS"
+          },
+          "pcie_link": {
+            "gen": 5,
+            "width": 16,
+            "status": "PASS"
+          },
+          "clock_speed": {
+            "sm": 345,
+            "mem": 2619,
+            "status": "PASS"
+          },
+          "throttling": {
+            "status": "PASS",
+            "reasons": []
+          },
+          "persistence_mode": {
+            "enabled": false,
+            "status": "WARN"
+          }
+        }
+      },
+      {
+        "index": 4,
+        "status": "WARN",
+        "checks": {
+          "temperature": {
+            "value": 21,
+            "status": "PASS",
+            "threshold": 75
+          },
+          "power": {
+            "value": 67.23,
+            "limit": 700.0,
+            "status": "PASS"
+          },
+          "ecc_errors": {
+            "single": 0,
+            "double": 0,
+            "status": "PASS"
+          },
+          "memory_errors": {
+            "status": "PASS"
+          },
+          "pcie_link": {
+            "gen": 5,
+            "width": 16,
+            "status": "PASS"
+          },
+          "clock_speed": {
+            "sm": 345,
+            "mem": 2619,
+            "status": "PASS"
+          },
+          "throttling": {
+            "status": "PASS",
+            "reasons": []
+          },
+          "persistence_mode": {
+            "enabled": false,
+            "status": "WARN"
+          }
+        }
+      },
+      {
+        "index": 5,
+        "status": "WARN",
+        "checks": {
+          "temperature": {
+            "value": 23,
+            "status": "PASS",
+            "threshold": 75
+          },
+          "power": {
+            "value": 69.27,
+            "limit": 700.0,
+            "status": "PASS"
+          },
+          "ecc_errors": {
+            "single": 0,
+            "double": 0,
+            "status": "PASS"
+          },
+          "memory_errors": {
+            "status": "PASS"
+          },
+          "pcie_link": {
+            "gen": 5,
+            "width": 16,
+            "status": "PASS"
+          },
+          "clock_speed": {
+            "sm": 345,
+            "mem": 2619,
+            "status": "PASS"
+          },
+          "throttling": {
+            "status": "PASS",
+            "reasons": []
+          },
+          "persistence_mode": {
+            "enabled": false,
+            "status": "WARN"
+          }
+        }
+      },
+      {
+        "index": 6,
+        "status": "WARN",
+        "checks": {
+          "temperature": {
+            "value": 21,
+            "status": "PASS",
+            "threshold": 75
+          },
+          "power": {
+            "value": 67.81,
+            "limit": 700.0,
+            "status": "PASS"
+          },
+          "ecc_errors": {
+            "single": 0,
+            "double": 0,
+            "status": "PASS"
+          },
+          "memory_errors": {
+            "status": "PASS"
+          },
+          "pcie_link": {
+            "gen": 5,
+            "width": 16,
+            "status": "PASS"
+          },
+          "clock_speed": {
+            "sm": 345,
+            "mem": 2619,
+            "status": "PASS"
+          },
+          "throttling": {
+            "status": "PASS",
+            "reasons": []
+          },
+          "persistence_mode": {
+            "enabled": false,
+            "status": "WARN"
+          }
+        }
+      },
+      {
+        "index": 7,
+        "status": "WARN",
+        "checks": {
+          "temperature": {
+            "value": 21,
+            "status": "PASS",
+            "threshold": 75
+          },
+          "power": {
+            "value": 66.3,
+            "limit": 700.0,
+            "status": "PASS"
+          },
+          "ecc_errors": {
+            "single": 0,
+            "double": 0,
+            "status": "PASS"
+          },
+          "memory_errors": {
+            "status": "PASS"
+          },
+          "pcie_link": {
+            "gen": 5,
+            "width": 16,
+            "status": "PASS"
+          },
+          "clock_speed": {
+            "sm": 345,
+            "mem": 2619,
+            "status": "PASS"
+          },
+          "throttling": {
+            "status": "PASS",
+            "reasons": []
+          },
+          "persistence_mode": {
+            "enabled": false,
+            "status": "WARN"
+          }
+        }
+      }
+    ],
+    "system_health": {
+      "nvidia_persistenced": {
+        "installed": true,
+        "running": false
+      },
+      "hugepages": {
+        "configured": false,
+        "count": 0
+      },
+      "swap": {
+        "enabled": true
+      },
+      "transparent_hugepage": "madvise",
+      "file_descriptors": {
+        "soft": 1024,
+        "max": 1048576
+      },
+      "infiniband_devices": [
+        "mlx5_4",
+        "mlx5_2",
+        "mlx5_0",
+        "mlx5_9",
+        "mlx5_7",
+        "mlx5_5",
+        "mlx5_3",
+        "mlx5_1",
+        "mlx5_8",
+        "mlx5_6"
+      ],
+      "rdma_devices": [
+        "abi_version",
+        "uverbs4",
+        "uverbs2",
+        "uverbs0",
+        "uverbs9",
+        "uverbs7",
+        "uverbs5",
+        "uverbs3",
+        "uverbs1",
+        "uverbs8",
+        "uverbs6"
+      ],
+      "nccl_env_vars": {}
+    },
+    "timestamp": "2026-05-22T15:49:11.294816",
+    "detected_gpu_type": "h100"
+  },
+  "memory_bench": {
+    "memory": {
+      "source": "nvbandwidth",
+      "h2d_bandwidth_gbps": 55.5,
+      "d2h_bandwidth_gbps": 55.3,
+      "d2d_bandwidth_gbps": 486.5,
+      "h2d_peak_gbps": 64,
+      "d2h_peak_gbps": 64,
+      "d2d_peak_gbps": 450.0,
+      "h2d_efficiency_pct": 86.7,
+      "d2h_efficiency_pct": 86.4,
+      "d2d_efficiency_pct": 108.1,
+      "peak_bandwidth_gbps": 3400,
+      "efficiency_pct": 108.1,
+      "results_by_test": {
+        "h2d": 55.5,
+        "d2h": 55.3,
+        "d2d_write": 397.4,
+        "d2d_read": 395.1,
+        "d2d_bidir": 486.5
+      },
+      "per_gpu": []
+    }
+  },
+  "compute_bench": {
+    "compute": {
+      "per_dtype_tflops": {
+        "fp32": 51.9,
+        "tf32": 357.0,
+        "fp16": 664.0,
+        "bf16": 700.1,
+        "fp8": 1116.2
+      },
+      "peak_tflops": {
+        "fp32": 67,
+        "tf32": 495,
+        "fp16": 990,
+        "bf16": 990,
+        "fp8": 1979
+      },
+      "efficiency_pct": {
+        "fp32": 77.5,
+        "tf32": 72.1,
+        "fp16": 67.1,
+        "bf16": 70.7,
+        "fp8": 56.4
+      },
+      "pass_thresholds_tflops": {
+        "fp32": 54,
+        "tf32": 444,
+        "fp16": 734,
+        "bf16": 745,
+        "fp8": 1400
+      },
+      "per_gpu": [
+        {
+          "index": 0,
+          "fp32": 51.9,
+          "tf32": 357.0,
+          "fp16": 664.0,
+          "bf16": 700.1,
+          "fp8": 1116.2
+        },
+        {
+          "index": 1,
+          "fp32": 51.9,
+          "tf32": 357.0,
+          "fp16": 664.0,
+          "bf16": 700.1,
+          "fp8": 1116.2
+        },
+        {
+          "index": 2,
+          "fp32": 51.9,
+          "tf32": 357.0,
+          "fp16": 664.0,
+          "bf16": 700.1,
+          "fp8": 1116.2
+        },
+        {
+          "index": 3,
+          "fp32": 51.9,
+          "tf32": 357.0,
+          "fp16": 664.0,
+          "bf16": 700.1,
+          "fp8": 1116.2
+        },
+        {
+          "index": 4,
+          "fp32": 51.9,
+          "tf32": 357.0,
+          "fp16": 664.0,
+          "bf16": 700.1,
+          "fp8": 1116.2
+        },
+        {
+          "index": 5,
+          "fp32": 51.9,
+          "tf32": 357.0,
+          "fp16": 664.0,
+          "bf16": 700.1,
+          "fp8": 1116.2
+        },
+        {
+          "index": 6,
+          "fp32": 51.9,
+          "tf32": 357.0,
+          "fp16": 664.0,
+          "bf16": 700.1,
+          "fp8": 1116.2
+        },
+        {
+          "index": 7,
+          "fp32": 51.9,
+          "tf32": 357.0,
+          "fp16": 664.0,
+          "bf16": 700.1,
+          "fp8": 1116.2
+        }
+      ],
+      "matrix_size": 8192,
+      "warmup": 50,
+      "iterations": 500
+    }
+  },
+  "nccl": {
+    "passed": false,
+    "source": "torchrun_fallback",
+    "tests": {
+      "NCCL version 2.21.5+cuda12.4": {
+        "status": "FAIL",
+        "error": null
+      },
+      "allreduce": {
+        "status": "PASS",
+        "error": null
+      },
+      "broadcast": {
+        "status": "PASS",
+        "error": null
+      },
+      "allgather": {
+        "status": "PASS",
+        "error": null
+      },
+      "reducescatter": {
+        "status": "PASS",
+        "error": null
+      },
+      "alltoall": {
+        "status": "PASS",
+        "error": null
+      }
+    },
+    "gpu_count": 8
+  },
+  "stress": {
+    "source": "pytorch",
+    "passed": true,
+    "duration_sec": 60,
+    "elapsed_sec": 60.0,
+    "gpu_status": {
+      "0": "PASS",
+      "1": "PASS",
+      "2": "PASS",
+      "3": "PASS",
+      "4": "PASS",
+      "5": "PASS",
+      "6": "PASS",
+      "7": "PASS"
+    },
+    "timestamp": "2026-05-22T15:51:56.803540"
+  },
+  "rdma": {
+    "passed": false,
+    "devices": [
+      {
+        "name": "mlx5_0",
+        "ports": [
+          {
+            "port": "1",
+            "rate": "400 Gb/sec (4X NDR)",
+            "state": "4: ACTIVE",
+            "phys_state": "5: LinkUp",
+            "gid": "fe80:0000:0000:0000:58a2:e103:0088:81e0"
+          }
+        ]
+      },
+      {
+        "name": "mlx5_1",
+        "ports": [
+          {
+            "port": "1",
+            "rate": "400 Gb/sec (4X NDR)",
+            "state": "4: ACTIVE",
+            "phys_state": "5: LinkUp",
+            "gid": "fe80:0000:0000:0000:9c63:c003:0054:e00a"
+          }
+        ]
+      },
+      {
+        "name": "mlx5_2",
+        "ports": [
+          {
+            "port": "1",
+            "rate": "25 Gb/sec (1X EDR)",
+            "state": "4: ACTIVE",
+            "phys_state": "5: LinkUp",
+            "gid": "fe80:0000:0000:0000:a02d:75ff:feae:2bcf"
+          }
+        ]
+      },
+      {
+        "name": "mlx5_3",
+        "ports": [
+          {
+            "port": "1",
+            "rate": "25 Gb/sec (1X EDR)",
+            "state": "1: DOWN",
+            "phys_state": "3: Disabled",
+            "gid": "fe80:0000:0000:0000:c670:bdff:fefd:5bd9"
+          }
+        ]
+      },
+      {
+        "name": "mlx5_4",
+        "ports": [
+          {
+            "port": "1",
+            "rate": "100 Gb/sec (2X HDR)",
+            "state": "4: ACTIVE",
+            "phys_state": "5: LinkUp",
+            "gid": "fe80:0000:0000:0000:9c63:c003:005f:58ec"
+          }
+        ]
+      },
+      {
+        "name": "mlx5_5",
+        "ports": [
+          {
+            "port": "1",
+            "rate": "100 Gb/sec (2X HDR)",
+            "state": "4: ACTIVE",
+            "phys_state": "5: LinkUp",
+            "gid": "fe80:0000:0000:0000:9c63:c003:005f:58ed"
+          }
+        ]
+      },
+      {
+        "name": "mlx5_6",
+        "ports": [
+          {
+            "port": "1",
+            "rate": "400 Gb/sec (4X NDR)",
+            "state": "4: ACTIVE",
+            "phys_state": "5: LinkUp",
+            "gid": "fe80:0000:0000:0000:9c63:c003:0055:0e56"
+          }
+        ]
+      },
+      {
+        "name": "mlx5_7",
+        "ports": [
+          {
+            "port": "1",
+            "rate": "400 Gb/sec (4X NDR)",
+            "state": "4: ACTIVE",
+            "phys_state": "5: LinkUp",
+            "gid": "fe80:0000:0000:0000:a088:c203:00f0:286c"
+          }
+        ]
+      },
+      {
+        "name": "mlx5_8",
+        "ports": [
+          {
+            "port": "1",
+            "rate": "25 Gb/sec (1X EDR)",
+            "state": "4: ACTIVE",
+            "phys_state": "5: LinkUp",
+            "gid": "fe80:0000:0000:0000:a02d:75ff:feae:2bcf"
+          }
+        ]
+      },
+      {
+        "name": "mlx5_9",
+        "ports": [
+          {
+            "port": "1",
+            "rate": "25 Gb/sec (1X EDR)",
+            "state": "1: DOWN",
+            "phys_state": "3: Disabled",
+            "gid": "fe80:0000:0000:0000:c670:bdff:fefd:569d"
+          }
+        ]
+      }
+    ],
+    "bandwidth_tests": [
+      {
+        "test": "ib_write_bw",
+        "status": "WARN",
+        "bandwidth_gbps": 0.13,
+        "min_required_gbps": 50
+      },
+      {
+        "test": "ib_read_bw",
+        "status": "WARN",
+        "bandwidth_gbps": 0.13,
+        "min_required_gbps": 50
+      }
+    ],
+    "latency_tests": [
+      {
+        "test": "ib_write_lat",
+        "status": "PASS",
+        "latency_us": 4.1,
+        "max_allowed_us": 10
+      },
+      {
+        "test": "ib_read_lat",
+        "status": "WARN",
+        "latency_us": 16.0,
+        "max_allowed_us": 10
+      }
+    ],
+    "timestamp": "2026-05-22T15:52:03.507540"
+  },
+  "training": {
+    "model": "synthetic_transformer",
+    "total_params_m": 1470.5,
+    "num_layers": 6,
+    "hidden_size": 4096,
+    "gpu_count": 8,
+    "dtype": "bfloat16",
+    "batch_size": 8,
+    "seq_length": 2048,
+    "num_steps": 50,
+    "avg_step_time_ms": 312.3,
+    "throughput_tokens_per_sec": 52471.0,
+    "throughput_samples_per_sec": 25.62,
+    "peak_memory_gb": 27.31,
+    "final_loss": 0.0041,
+    "timestamp": "2026-05-22T15:52:32.650522"
+  }
+}
\ No newline at end of file
diff --git a/reports_all_aikubeworker0016.md b/reports_all_aikubeworker0016.md
new file mode 100644
index 0000000..80dda75
--- /dev/null
+++ b/reports_all_aikubeworker0016.md
@@ -0,0 +1,157 @@
+# GPU Test Report
+
+- **Date:** 2026-05-22T15:49:02.368516
+- **Host:** aikubeworker0016
+- **GPU:** NVIDIA H100 80GB HBM3 x8
+- **Driver:** 580.159.03 | **CUDA:** 13.0
+
+## Overall Acceptance Verdict
+
+**Result: FAIL**
+
+Failed or unverified items:
+- Compute Throughput: FAIL (worst FP32 52 vs >= 54)
+- NCCL: FAIL (no nccl-tests bus BW)
+- RDMA: FAIL
+- Training: UNVERIFIED (52471 tokens/sec; legacy result lacks explicit acceptance verdict)
+
+Missing required evidence:
+- NVLink/NVSwitch
+- DCGM
+
+## Summary
+
+| Test | Result |
+|------|--------|
+| GPU Info | PASS (8 GPUs detected) |
+| Health Check | PASS |
+| Memory Bandwidth | PASS (108.1%) |
+| Compute Throughput | FAIL (worst FP32 52 vs >= 54) |
+| NCCL | FAIL (no nccl-tests bus BW) |
+| Stress Test | PASS |
+| RDMA | FAIL |
+| Training | UNVERIFIED (52471 tokens/sec; legacy result lacks explicit acceptance verdict) |
+
+## GPU Information
+
+| GPU | Model | VRAM | Temp | Power | SM Clock |
+|-----|-------|------|------|-------|----------|
+| 0 | NVIDIA H100 80GB HBM3 | 81559 MB | 21C | 70/700W | 345 MHz |
+| 1 | NVIDIA H100 80GB HBM3 | 81559 MB | 21C | 68/700W | 345 MHz |
+| 2 | NVIDIA H100 80GB HBM3 | 81559 MB | 22C | 67/700W | 345 MHz |
+| 3 | NVIDIA H100 80GB HBM3 | 81559 MB | 21C | 67/700W | 345 MHz |
+| 4 | NVIDIA H100 80GB HBM3 | 81559 MB | 21C | 67/700W | 345 MHz |
+| 5 | NVIDIA H100 80GB HBM3 | 81559 MB | 23C | 69/700W | 345 MHz |
+| 6 | NVIDIA H100 80GB HBM3 | 81559 MB | 21C | 68/700W | 345 MHz |
+| 7 | NVIDIA H100 80GB HBM3 | 81559 MB | 21C | 66/700W | 345 MHz |
+
+## Health Check
+
+**Overall: PASS**
+
+| GPU | Temp | Power | ECC | PCIe | Throttle | Status |
+|-----|------|-------|-----|------|----------|--------|
+| 0 | 21C PASS | 70W PASS | S:0 D:0 | Gen5x16 | PASS | **WARN** |
+| 1 | 21C PASS | 67W PASS | S:0 D:0 | Gen5x16 | PASS | **WARN** |
+| 2 | 22C PASS | 67W PASS | S:0 D:0 | Gen5x16 | PASS | **WARN** |
+| 3 | 21C PASS | 67W PASS | S:0 D:0 | Gen5x16 | PASS | **WARN** |
+| 4 | 21C PASS | 67W PASS | S:0 D:0 | Gen5x16 | PASS | **WARN** |
+| 5 | 23C PASS | 69W PASS | S:0 D:0 | Gen5x16 | PASS | **WARN** |
+| 6 | 21C PASS | 68W PASS | S:0 D:0 | Gen5x16 | PASS | **WARN** |
+| 7 | 21C PASS | 66W PASS | S:0 D:0 | Gen5x16 | PASS | **WARN** |
+
+## Memory Bandwidth
+
+Source: nvbandwidth
+
+| Metric | Value | Peak | Efficiency |
+|--------|-------|------|------------|
+| H2D (PCIe) | 55.5 GB/s | 64 GB/s | 86.7% |
+| D2H (PCIe) | 55.3 GB/s | 64 GB/s | 86.4% |
+| D2D (NVLink) | 486.5 GB/s | 450 GB/s | 108.1% |
+
+**Verdict: PASS** (D2D efficiency 108.1%)
+
+## Compute Throughput
+
+| DType | Achieved (TFLOPS) | Peak | Threshold | Status |
+|-------|-------------------|------|------------|--------|
+| FP32 | 51.9 | 67 | >= 54 | FAIL |
+| TF32 | 357.0 | 495 | >= 444 | FAIL |
+| FP16 | 664.0 | 990 | >= 734 | FAIL |
+| BF16 | 700.1 | 990 | >= 745 | FAIL |
+| FP8 | 1116.2 | 1979 | >= 1400 | FAIL |
+
+**Verdict: FAIL** (absolute TFLOPS thresholds; worst efficiency 56.4%)
+
+### Compute Per-GPU TFLOPS
+
+| GPU | FP32 | TF32 | FP16 | BF16 | FP8 |
+|---|---|---|---|---|---|
+| 0 | 51.9 | 357.0 | 664.0 | 700.1 | 1116.2 |
+| 1 | 51.9 | 357.0 | 664.0 | 700.1 | 1116.2 |
+| 2 | 51.9 | 357.0 | 664.0 | 700.1 | 1116.2 |
+| 3 | 51.9 | 357.0 | 664.0 | 700.1 | 1116.2 |
+| 4 | 51.9 | 357.0 | 664.0 | 700.1 | 1116.2 |
+| 5 | 51.9 | 357.0 | 664.0 | 700.1 | 1116.2 |
+| 6 | 51.9 | 357.0 | 664.0 | 700.1 | 1116.2 |
+| 7 | 51.9 | 357.0 | 664.0 | 700.1 | 1116.2 |
+
+## NCCL Multi-GPU
+
+Source: torchrun_fallback | GPUs: 8
+
+> Functional NCCL smoke test only: nccl-tests bus bandwidth was not measured, so this result does not satisfy production acceptance.
+
+| Operation | Bus BW (GB/s) | Threshold | Status |
+|-----------|---------------|-----------|--------|
+| NCCL version 2.21.5+cuda12.4 | 0.0 | >= 0 | FAIL |
+| allreduce | 0.0 | >= 0 | PASS |
+| broadcast | 0.0 | >= 0 | PASS |
+| allgather | 0.0 | >= 0 | PASS |
+| reducescatter | 0.0 | >= 0 | PASS |
+| alltoall | 0.0 | >= 0 | PASS |
+
+**Overall: FAIL**
+
+## Stress Test
+
+- **Source:** pytorch
+- **Duration:** 60s (requested 60s)
+- **Result: PASS**
+
+## RDMA/InfiniBand
+
+> Legacy RDMA result re-evaluated against the current PDF acceptance thresholds; the old WARN statuses and the old 50 GB/s / 10 us limits are not used for the verdict.
+
+| Test | Value | Threshold | Status |
+|------|-------|-----------|--------|
+| ib_write_bw | 0.1 GB/s | >= 47 GB/s | FAIL |
+| ib_read_bw | 0.1 GB/s | >= 47 GB/s | FAIL |
+| ib_write_lat | 4.10 us | <= 2 us | FAIL |
+| ib_read_lat | 16.00 us | <= 3.5 us | FAIL |
+
+- **Failure reasons:**
+  - ib_write_bw bandwidth 0.13GB/s < 47GB/s
+  - ib_read_bw bandwidth 0.13GB/s < 47GB/s
+  - ib_write_lat latency 4.1us > 2us
+  - ib_read_lat latency 16.0us > 3.5us
+**Overall: FAIL**
+
+## Training Simulation
+
+| Metric | Value |
+|--------|-------|
+| Model | synthetic_transformer |
+| Params | 1470.5M |
+| Throughput | 52471 tokens/sec |
+| Avg Step Time | 312.3 ms |
+| Peak Memory | 27.3 GB |
+| Final Loss | 0.0041 |
+| Step Jitter | N/A |
+| Distributed Mode | N/A |
+| Acceptance Gaps | missing passed, step_jitter_pct, distributed_mode, loss_finite |
+| Verdict | UNVERIFIED (52471 tokens/sec; legacy result lacks explicit acceptance verdict) |
+
+---
+*Generated by GPU Test Suite v0.2.0*
\ No newline at end of file
diff --git a/reports_dcgm_r3_aikubeworker0012_20260522_200338.md b/reports_dcgm_r3_aikubeworker0012_20260522_200338.md
new file mode 100644
index 0000000..1663b83
--- /dev/null
+++ b/reports_dcgm_r3_aikubeworker0012_20260522_200338.md
@@ -0,0 +1,65 @@
+# GPU Test Report
+
+- **Date:** 2026-05-22T20:26:56.947796
+- **Host:** aikubeworker0012
+
+## Overall Acceptance Verdict
+
+**Result: FAIL**
+
+Missing required evidence:
+- GPU Info
+- Health Check
+- Memory Bandwidth
+- Compute Throughput
+- NVLink/NVSwitch
+- NCCL
+- Stress Test
+- RDMA
+- Training
+
+## Summary
+
+| Test | Result |
+|------|--------|
+| DCGM | PASS |
+
+## DCGM Diagnostic
+
+**Overall: PASS**
+
+| Subtest | Status |
+|---------|--------|
+| Hardware/nvbandwidth/GPU6 | PASS |
+| Hardware/nvbandwidth/GPU7 | PASS |
+| Hardware/nvbandwidth/summary | PASS |
+| Integration/pcie/GPU0 | PASS |
+| Integration/pcie/GPU1 | PASS |
+| Integration/pcie/GPU2 | PASS |
+| Integration/pcie/GPU3 | PASS |
+| Integration/pcie/GPU4 | PASS |
+| Integration/pcie/GPU5 | PASS |
+| Integration/pcie/GPU6 | PASS |
+| Integration/pcie/GPU7 | PASS |
+| Integration/pcie/summary | PASS |
+| Stress/targeted_stress/GPU0 | PASS |
+| Stress/targeted_stress/GPU1 | PASS |
+| Stress/targeted_stress/GPU2 | PASS |
+| Stress/targeted_stress/GPU3 | PASS |
+| Stress/targeted_stress/GPU4 | PASS |
+| Stress/targeted_stress/GPU5 | PASS |
+| Stress/targeted_stress/GPU6 | PASS |
+| Stress/targeted_stress/GPU7 | PASS |
+| Stress/targeted_stress/summary | PASS |
+| Stress/targeted_power/GPU0 | PASS |
+| Stress/targeted_power/GPU1 | PASS |
+| Stress/targeted_power/GPU2 | PASS |
+| Stress/targeted_power/GPU3 | PASS |
+| Stress/targeted_power/GPU4 | PASS |
+| Stress/targeted_power/GPU5 | PASS |
+| Stress/targeted_power/GPU6 | PASS |
+| Stress/targeted_power/GPU7 | PASS |
+| Stress/targeted_power/summary | PASS |
+
+---
+*Generated by GPU Test Suite v0.2.0*
\ No newline at end of file
diff --git a/reports_dcgm_r3_aikubeworker0016_20260522_200538.md b/reports_dcgm_r3_aikubeworker0016_20260522_200538.md
new file mode 100644
index 0000000..f51b5bf
--- /dev/null
+++ b/reports_dcgm_r3_aikubeworker0016_20260522_200538.md
@@ -0,0 +1,65 @@
+# GPU Test Report
+
+- **Date:** 2026-05-22T20:28:58.716266
+- **Host:** aikubeworker0016
+
+## Overall Acceptance Verdict
+
+**Result: FAIL**
+
+Missing required evidence:
+- GPU Info
+- Health Check
+- Memory Bandwidth
+- Compute Throughput
+- NVLink/NVSwitch
+- NCCL
+- Stress Test
+- RDMA
+- Training
+
+## Summary
+
+| Test | Result |
+|------|--------|
+| DCGM | PASS |
+
+## DCGM Diagnostic
+
+**Overall: PASS**
+
+| Subtest | Status |
+|---------|--------|
+| Hardware/nvbandwidth/GPU6 | PASS |
+| Hardware/nvbandwidth/GPU7 | PASS |
+| Hardware/nvbandwidth/summary | PASS |
+| Integration/pcie/GPU0 | PASS |
+| Integration/pcie/GPU1 | PASS |
+| Integration/pcie/GPU2 | PASS |
+| Integration/pcie/GPU3 | PASS |
+| Integration/pcie/GPU4 | PASS |
+| Integration/pcie/GPU5 | PASS |
+| Integration/pcie/GPU6 | PASS |
+| Integration/pcie/GPU7 | PASS |
+| Integration/pcie/summary | PASS |
+| Stress/targeted_stress/GPU0 | PASS |
+| Stress/targeted_stress/GPU1 | PASS |
+| Stress/targeted_stress/GPU2 | PASS |
+| Stress/targeted_stress/GPU3 | PASS |
+| Stress/targeted_stress/GPU4 | PASS |
+| Stress/targeted_stress/GPU5 | PASS |
+| Stress/targeted_stress/GPU6 | PASS |
+| Stress/targeted_stress/GPU7 | PASS |
+| Stress/targeted_stress/summary | PASS |
+| Stress/targeted_power/GPU0 | PASS |
+| Stress/targeted_power/GPU1 | PASS |
+| Stress/targeted_power/GPU2 | PASS |
+| Stress/targeted_power/GPU3 | PASS |
+| Stress/targeted_power/GPU4 | PASS |
+| Stress/targeted_power/GPU5 | PASS |
+| Stress/targeted_power/GPU6 | PASS |
+| Stress/targeted_power/GPU7 | PASS |
+| Stress/targeted_power/summary | PASS |
+
+---
+*Generated by GPU Test Suite v0.2.0*
\ No newline at end of file
diff --git a/reports_nvbandwidth_aikubeworker0012.json b/reports_nvbandwidth_aikubeworker0012.json
new file mode 100644
index 0000000..05a0587
--- /dev/null
+++ b/reports_nvbandwidth_aikubeworker0012.json
@@ -0,0 +1,70 @@
+{
+  "benchmark": {
+    "memory": {
+      "source": "nvbandwidth",
+      "h2d_bandwidth_gbps": 55.5,
+      "d2h_bandwidth_gbps": 54.8,
+      "d2d_bandwidth_gbps": 0.0,
+      "h2d_peak_gbps": 64,
+      "d2h_peak_gbps": 64,
+      "d2d_peak_gbps": 450.0,
+      "h2d_efficiency_pct": 86.7,
+      "d2h_efficiency_pct": 85.6,
+      "d2d_efficiency_pct": null,
+      "peak_bandwidth_gbps": 3400,
+      "efficiency_pct": null,
+      "results_by_test": {
+        "h2d": 55.5,
+        "d2h": 54.8,
+        "d2d_write": 0.0,
+        "d2d_read": 0.0,
+        "d2d_bidir": 0.0
+      },
+      "per_gpu": []
+    },
+    "compute": {
+      "per_dtype_tflops": {
+        "fp32": 52.2,
+        "tf32": 360.7,
+        "fp16": 680.0,
+        "bf16": 707.6,
+        "fp8": 1142.4
+      },
+      "peak_tflops": {
+        "fp32": 67,
+        "tf32": 495,
+        "fp16": 990,
+        "bf16": 990,
+        "fp8": 1979
+      },
+      "efficiency_pct": {
+        "fp32": 77.9,
+        "tf32": 72.9,
+        "fp16": 68.7,
+        "bf16": 71.5,
+        "fp8": 57.7
+      },
+      "pass_thresholds_tflops": {
+        "fp32": 54,
+        "tf32": 444,
+        "fp16": 734,
+        "bf16": 745,
+        "fp8": 1400
+      },
+      "per_gpu": [
+        {
+          "index": 0,
+          "fp32": 52.2,
+          "tf32": 360.7,
+          "fp16": 680.0,
+          "bf16": 707.6,
+          "fp8": 1142.4
+        }
+      ],
+      "matrix_size": 8192,
+      "warmup": 50,
+      "iterations": 500
+    }
+  },
+  "timestamp": "2026-05-22T15:35:16.675924"
+}
\ No newline at end of file
diff --git a/reports_nvbandwidth_aikubeworker0012.md b/reports_nvbandwidth_aikubeworker0012.md
new file mode 100644
index 0000000..bf571ab
--- /dev/null
+++ b/reports_nvbandwidth_aikubeworker0012.md
@@ -0,0 +1,38 @@
+# GPU Test Report
+
+- **Date:** 2026-05-22 15:37:12
+- **Host:** aikubeworker0012
+
+## Summary
+
+| Test | Result |
+|------|--------|
+| Memory Bandwidth | FAIL (0.0%) |
+| Compute Throughput | FAIL (worst TF32 361 vs >= 444) |
+
+## Memory Bandwidth
+
+Source: nvbandwidth
+
+| Metric | Value | Peak | Efficiency |
+|--------|-------|------|------------|
+| H2D (PCIe) | 55.5 GB/s | 64 GB/s | 86.7% |
+| D2H (PCIe) | 54.8 GB/s | 64 GB/s | 85.6% |
+| D2D (NVLink) | 0.0 GB/s | 450 GB/s | 0.0% |
+
+**Verdict: FAIL** (D2D efficiency 0.0%)
+
+## Compute Throughput
+
+| DType | Achieved (TFLOPS) | Peak | Threshold | Status |
+|-------|-------------------|------|------------|--------|
+| FP32 | 52.2 | 67 | >= 54 | WARN |
+| TF32 | 360.7 | 495 | >= 444 | FAIL |
+| FP16 | 680.0 | 990 | >= 734 | WARN |
+| BF16 | 707.6 | 990 | >= 745 | WARN |
+| FP8 | 1142.4 | 1979 | >= 1400 | FAIL |
+
+**Verdict: FAIL** (absolute TFLOPS thresholds; worst efficiency 57.7%)
+
+---
+*Generated by GPU Test Suite v0.2.0*
\ No newline at end of file
diff --git a/reports_nvbandwidth_aikubeworker0016.json b/reports_nvbandwidth_aikubeworker0016.json
new file mode 100644
index 0000000..34ac61c
--- /dev/null
+++ b/reports_nvbandwidth_aikubeworker0016.json
@@ -0,0 +1,70 @@
+{
+  "benchmark": {
+    "memory": {
+      "source": "nvbandwidth",
+      "h2d_bandwidth_gbps": 55.5,
+      "d2h_bandwidth_gbps": 55.0,
+      "d2d_bandwidth_gbps": 0.0,
+      "h2d_peak_gbps": 64,
+      "d2h_peak_gbps": 64,
+      "d2d_peak_gbps": 450.0,
+      "h2d_efficiency_pct": 86.7,
+      "d2h_efficiency_pct": 85.9,
+      "d2d_efficiency_pct": null,
+      "peak_bandwidth_gbps": 3400,
+      "efficiency_pct": null,
+      "results_by_test": {
+        "h2d": 55.5,
+        "d2h": 55.0,
+        "d2d_write": 0.0,
+        "d2d_read": 0.0,
+        "d2d_bidir": 0.0
+      },
+      "per_gpu": []
+    },
+    "compute": {
+      "per_dtype_tflops": {
+        "fp32": 52.2,
+        "tf32": 357.5,
+        "fp16": 665.3,
+        "bf16": 697.1,
+        "fp8": 1138.8
+      },
+      "peak_tflops": {
+        "fp32": 67,
+        "tf32": 495,
+        "fp16": 990,
+        "bf16": 990,
+        "fp8": 1979
+      },
+      "efficiency_pct": {
+        "fp32": 77.9,
+        "tf32": 72.2,
+        "fp16": 67.2,
+        "bf16": 70.4,
+        "fp8": 57.5
+      },
+      "pass_thresholds_tflops": {
+        "fp32": 54,
+        "tf32": 444,
+        "fp16": 734,
+        "bf16": 745,
+        "fp8": 1400
+      },
+      "per_gpu": [
+        {
+          "index": 0,
+          "fp32": 52.2,
+          "tf32": 357.5,
+          "fp16": 665.3,
+          "bf16": 697.1,
+          "fp8": 1138.8
+        }
+      ],
+      "matrix_size": 8192,
+      "warmup": 50,
+      "iterations": 500
+    }
+  },
+  "timestamp": "2026-05-22T15:35:19.219299"
+}
\ No newline at end of file
diff --git a/reports_nvbandwidth_aikubeworker0016.md b/reports_nvbandwidth_aikubeworker0016.md
new file mode 100644
index 0000000..01320cf
--- /dev/null
+++ b/reports_nvbandwidth_aikubeworker0016.md
@@ -0,0 +1,38 @@
+# GPU Test Report
+
+- **Date:** 2026-05-22 15:37:18
+- **Host:** aikubeworker0016
+
+## Summary
+
+| Test | Result |
+|------|--------|
+| Memory Bandwidth | FAIL (0.0%) |
+| Compute Throughput | FAIL (worst TF32 358 vs >= 444) |
+
+## Memory Bandwidth
+
+Source: nvbandwidth
+
+| Metric | Value | Peak | Efficiency |
+|--------|-------|------|------------|
+| H2D (PCIe) | 55.5 GB/s | 64 GB/s | 86.7% |
+| D2H (PCIe) | 55.0 GB/s | 64 GB/s | 85.9% |
+| D2D (NVLink) | 0.0 GB/s | 450 GB/s | 0.0% |
+
+**Verdict: FAIL** (D2D efficiency 0.0%)
+
+## Compute Throughput
+
+| DType | Achieved (TFLOPS) | Peak | Threshold | Status |
+|-------|-------------------|------|------------|--------|
+| FP32 | 52.2 | 67 | >= 54 | WARN |
+| TF32 | 357.5 | 495 | >= 444 | FAIL |
+| FP16 | 665.3 | 990 | >= 734 | WARN |
+| BF16 | 697.1 | 990 | >= 745 | WARN |
+| FP8 | 1138.8 | 1979 | >= 1400 | FAIL |
+
+**Verdict: FAIL** (absolute TFLOPS thresholds; worst efficiency 57.5%)
+
+---
+*Generated by GPU Test Suite v0.2.0*
\ No newline at end of file
diff --git a/reports_rdma_aikubeworker0012.json b/reports_rdma_aikubeworker0012.json
new file mode 100644
index 0000000..93d7644
--- /dev/null
+++ b/reports_rdma_aikubeworker0012.json
@@ -0,0 +1,157 @@
+{
+  "rdma": {
+    "passed": false,
+    "devices": [
+      {
+        "name": "mlx5_0",
+        "ports": [
+          {
+            "port": "1",
+            "rate": "400 Gb/sec (4X NDR)",
+            "state": "4: ACTIVE",
+            "phys_state": "5: LinkUp",
+            "gid": "fe80:0000:0000:0000:58a2:e103:0093:3898"
+          }
+        ]
+      },
+      {
+        "name": "mlx5_1",
+        "ports": [
+          {
+            "port": "1",
+            "rate": "400 Gb/sec (4X NDR)",
+            "state": "4: ACTIVE",
+            "phys_state": "5: LinkUp",
+            "gid": "fe80:0000:0000:0000:58a2:e103:0093:3db0"
+          }
+        ]
+      },
+      {
+        "name": "mlx5_2",
+        "ports": [
+          {
+            "port": "1",
+            "rate": "25 Gb/sec (1X EDR)",
+            "state": "4: ACTIVE",
+            "phys_state": "5: LinkUp",
+            "gid": "fe80:0000:0000:0000:5c3f:b8ff:fe5e:7832"
+          }
+        ]
+      },
+      {
+        "name": "mlx5_3",
+        "ports": [
+          {
+            "port": "1",
+            "rate": "25 Gb/sec (1X EDR)",
+            "state": "1: DOWN",
+            "phys_state": "3: Disabled",
+            "gid": "fe80:0000:0000:0000:5e25:73ff:fe4e:eac1"
+          }
+        ]
+      },
+      {
+        "name": "mlx5_4",
+        "ports": [
+          {
+            "port": "1",
+            "rate": "100 Gb/sec (2X HDR)",
+            "state": "4: ACTIVE",
+            "phys_state": "5: LinkUp",
+            "gid": "fe80:0000:0000:0000:9c63:c003:005f:63cc"
+          }
+        ]
+      },
+      {
+        "name": "mlx5_5",
+        "ports": [
+          {
+            "port": "1",
+            "rate": "100 Gb/sec (2X HDR)",
+            "state": "4: ACTIVE",
+            "phys_state": "5: LinkUp",
+            "gid": "fe80:0000:0000:0000:9c63:c003:005f:63cd"
+          }
+        ]
+      },
+      {
+        "name": "mlx5_6",
+        "ports": [
+          {
+            "port": "1",
+            "rate": "400 Gb/sec (4X NDR)",
+            "state": "4: ACTIVE",
+            "phys_state": "5: LinkUp",
+            "gid": "fe80:0000:0000:0000:58a2:e103:0093:3bf4"
+          }
+        ]
+      },
+      {
+        "name": "mlx5_7",
+        "ports": [
+          {
+            "port": "1",
+            "rate": "400 Gb/sec (4X NDR)",
+            "state": "4: ACTIVE",
+            "phys_state": "5: LinkUp",
+            "gid": "fe80:0000:0000:0000:58a2:e103:0093:3e28"
+          }
+        ]
+      },
+      {
+        "name": "mlx5_8",
+        "ports": [
+          {
+            "port": "1",
+            "rate": "25 Gb/sec (1X EDR)",
+            "state": "4: ACTIVE",
+            "phys_state": "5: LinkUp",
+            "gid": "fe80:0000:0000:0000:5c3f:b8ff:fe5e:7832"
+          }
+        ]
+      },
+      {
+        "name": "mlx5_9",
+        "ports": [
+          {
+            "port": "1",
+            "rate": "25 Gb/sec (1X EDR)",
+            "state": "1: DOWN",
+            "phys_state": "3: Disabled",
+            "gid": "fe80:0000:0000:0000:5e25:73ff:fe63:1717"
+          }
+        ]
+      }
+    ],
+    "bandwidth_tests": [
+      {
+        "test": "ib_write_bw",
+        "status": "WARN",
+        "bandwidth_gbps": 0.13,
+        "min_required_gbps": 50
+      },
+      {
+        "test": "ib_read_bw",
+        "status": "WARN",
+        "bandwidth_gbps": 0.13,
+        "min_required_gbps": 50
+      }
+    ],
+    "latency_tests": [
+      {
+        "test": "ib_write_lat",
+        "status": "PASS",
+        "latency_us": 4.53,
+        "max_allowed_us": 10
+      },
+      {
+        "test": "ib_read_lat",
+        "status": "WARN",
+        "latency_us": 16.0,
+        "max_allowed_us": 10
+      }
+    ],
+    "timestamp": "2026-05-22T15:41:20.534115"
+  },
+  "timestamp": "2026-05-22T15:41:20.544589"
+}
\ No newline at end of file
diff --git a/reports_rdma_aikubeworker0016.json b/reports_rdma_aikubeworker0016.json
new file mode 100644
index 0000000..5e98f8a
--- /dev/null
+++ b/reports_rdma_aikubeworker0016.json
@@ -0,0 +1,157 @@
+{
+  "rdma": {
+    "passed": false,
+    "devices": [
+      {
+        "name": "mlx5_0",
+        "ports": [
+          {
+            "port": "1",
+            "rate": "400 Gb/sec (4X NDR)",
+            "state": "4: ACTIVE",
+            "phys_state": "5: LinkUp",
+            "gid": "fe80:0000:0000:0000:58a2:e103:0088:81e0"
+          }
+        ]
+      },
+      {
+        "name": "mlx5_1",
+        "ports": [
+          {
+            "port": "1",
+            "rate": "400 Gb/sec (4X NDR)",
+            "state": "4: ACTIVE",
+            "phys_state": "5: LinkUp",
+            "gid": "fe80:0000:0000:0000:9c63:c003:0054:e00a"
+          }
+        ]
+      },
+      {
+        "name": "mlx5_2",
+        "ports": [
+          {
+            "port": "1",
+            "rate": "25 Gb/sec (1X EDR)",
+            "state": "4: ACTIVE",
+            "phys_state": "5: LinkUp",
+            "gid": "fe80:0000:0000:0000:a02d:75ff:feae:2bcf"
+          }
+        ]
+      },
+      {
+        "name": "mlx5_3",
+        "ports": [
+          {
+            "port": "1",
+            "rate": "25 Gb/sec (1X EDR)",
+            "state": "1: DOWN",
+            "phys_state": "3: Disabled",
+            "gid": "fe80:0000:0000:0000:c670:bdff:fefd:5bd9"
+          }
+        ]
+      },
+      {
+        "name": "mlx5_4",
+        "ports": [
+          {
+            "port": "1",
+            "rate": "100 Gb/sec (2X HDR)",
+            "state": "4: ACTIVE",
+            "phys_state": "5: LinkUp",
+            "gid": "fe80:0000:0000:0000:9c63:c003:005f:58ec"
+          }
+        ]
+      },
+      {
+        "name": "mlx5_5",
+        "ports": [
+          {
+            "port": "1",
+            "rate": "100 Gb/sec (2X HDR)",
+            "state": "4: ACTIVE",
+            "phys_state": "5: LinkUp",
+            "gid": "fe80:0000:0000:0000:9c63:c003:005f:58ed"
+          }
+        ]
+      },
+      {
+        "name": "mlx5_6",
+        "ports": [
+          {
+            "port": "1",
+            "rate": "400 Gb/sec (4X NDR)",
+            "state": "4: ACTIVE",
+            "phys_state": "5: LinkUp",
+            "gid": "fe80:0000:0000:0000:9c63:c003:0055:0e56"
+          }
+        ]
+      },
+      {
+        "name": "mlx5_7",
+        "ports": [
+          {
+            "port": "1",
+            "rate": "400 Gb/sec (4X NDR)",
+            "state": "4: ACTIVE",
+            "phys_state": "5: LinkUp",
+            "gid": "fe80:0000:0000:0000:a088:c203:00f0:286c"
+          }
+        ]
+      },
+      {
+        "name": "mlx5_8",
+        "ports": [
+          {
+            "port": "1",
+            "rate": "25 Gb/sec (1X EDR)",
+            "state": "4: ACTIVE",
+            "phys_state": "5: LinkUp",
+            "gid": "fe80:0000:0000:0000:a02d:75ff:feae:2bcf"
+          }
+        ]
+      },
+      {
+        "name": "mlx5_9",
+        "ports": [
+          {
+            "port": "1",
+            "rate": "25 Gb/sec (1X EDR)",
+            "state": "1: DOWN",
+            "phys_state": "3: Disabled",
+            "gid": "fe80:0000:0000:0000:c670:bdff:fefd:569d"
+          }
+        ]
+      }
+    ],
+    "bandwidth_tests": [
+      {
+        "test": "ib_write_bw",
+        "status": "WARN",
+        "bandwidth_gbps": 0.13,
+        "min_required_gbps": 50
+      },
+      {
+        "test": "ib_read_bw",
+        "status": "WARN",
+        "bandwidth_gbps": 0.13,
+        "min_required_gbps": 50
+      }
+    ],
+    "latency_tests": [
+      {
+        "test": "ib_write_lat",
+        "status": "PASS",
+        "latency_us": 4.22,
+        "max_allowed_us": 10
+      },
+      {
+        "test": "ib_read_lat",
+        "status": "WARN",
+        "latency_us": 16.0,
+        "max_allowed_us": 10
+      }
+    ],
+    "timestamp": "2026-05-22T15:41:07.851101"
+  },
+  "timestamp": "2026-05-22T15:41:07.861558"
+}
\ No newline at end of file
diff --git a/reports_rdma_counter_aikubeworker0012_20260522_194808.md b/reports_rdma_counter_aikubeworker0012_20260522_194808.md
new file mode 100644
index 0000000..f254bef
--- /dev/null
+++ b/reports_rdma_counter_aikubeworker0012_20260522_194808.md
@@ -0,0 +1,62 @@
+# GPU Test Report
+
+- **Date:** 2026-05-22T19:48:26.622179
+- **Host:** aikubeworker0012
+
+## Overall Acceptance Verdict
+
+**Result: FAIL**
+
+Failed or unverified items:
+- RDMA: FAIL
+
+Missing required evidence:
+- GPU Info
+- Health Check
+- Memory Bandwidth
+- Compute Throughput
+- NVLink/NVSwitch
+- NCCL
+- Stress Test
+- DCGM
+- Training
+
+## Summary
+
+| Test | Result |
+|------|--------|
+| RDMA | FAIL |
+
+## RDMA/InfiniBand
+
+### RDMA Port Checks
+
+| Device | Port | State | Rate | Required | Status |
+|--------|------|-------|------|----------|--------|
+| mlx5_0 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS |
+| mlx5_1 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS |
+| mlx5_4 | 1 | 4: ACTIVE | 100 Gb/sec (2X HDR) | >= 400Gbps ACTIVE | FAIL |
+| mlx5_5 | 1 | 4: ACTIVE | 100 Gb/sec (2X HDR) | >= 400Gbps ACTIVE | FAIL |
+| mlx5_6 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS |
+| mlx5_7 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS |
+
+| Test | Value | Threshold | Status |
+|------|-------|-----------|--------|
+| ib_write_bw | 49.3 GB/s | >= 47 GB/s | PASS |
+| ib_read_bw | 39.2 GB/s | >= 47 GB/s | FAIL |
+| ib_write_lat | 4.49 us | <= 2 us | FAIL |
+| ib_read_lat | 16.00 us | <= 3.5 us | FAIL |
+| ibping | target=0x58 count=5 | 0% packet loss | PASS |
+
+- **PFC/ECN/CNP/congestion counters checked:** 146
+- **PFC/ECN/CNP/congestion non-zero:** no
+- **Failure reasons:**
+  - mlx5_4 port 1 state/rate failed (4: ACTIVE, 100 Gb/sec (2X HDR); required >= 400.0Gbps ACTIVE)
+  - mlx5_5 port 1 state/rate failed (4: ACTIVE, 100 Gb/sec (2X HDR); required >= 400.0Gbps ACTIVE)
+  - ib_read_bw bandwidth 39.21GB/s < 47GB/s
+  - ib_write_lat latency 4.49us > 2.0us
+  - ib_read_lat latency 16.0us > 3.5us
+**Overall: FAIL**
+
+---
+*Generated by GPU Test Suite v0.2.0*
\ No newline at end of file
diff --git a/reports_rdma_counter_aikubeworker0016_20260522_194828.md b/reports_rdma_counter_aikubeworker0016_20260522_194828.md
new file mode 100644
index 0000000..a72f917
--- /dev/null
+++ b/reports_rdma_counter_aikubeworker0016_20260522_194828.md
@@ -0,0 +1,62 @@
+# GPU Test Report
+
+- **Date:** 2026-05-22T19:48:45.899570
+- **Host:** aikubeworker0016
+
+## Overall Acceptance Verdict
+
+**Result: FAIL**
+
+Failed or unverified items:
+- RDMA: FAIL
+
+Missing required evidence:
+- GPU Info
+- Health Check
+- Memory Bandwidth
+- Compute Throughput
+- NVLink/NVSwitch
+- NCCL
+- Stress Test
+- DCGM
+- Training
+
+## Summary
+
+| Test | Result |
+|------|--------|
+| RDMA | FAIL |
+
+## RDMA/InfiniBand
+
+### RDMA Port Checks
+
+| Device | Port | State | Rate | Required | Status |
+|--------|------|-------|------|----------|--------|
+| mlx5_0 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS |
+| mlx5_1 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS |
+| mlx5_4 | 1 | 4: ACTIVE | 100 Gb/sec (2X HDR) | >= 400Gbps ACTIVE | FAIL |
+| mlx5_5 | 1 | 4: ACTIVE | 100 Gb/sec (2X HDR) | >= 400Gbps ACTIVE | FAIL |
+| mlx5_6 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS |
+| mlx5_7 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS |
+
+| Test | Value | Threshold | Status |
+|------|-------|-----------|--------|
+| ib_write_bw | 48.1 GB/s | >= 47 GB/s | PASS |
+| ib_read_bw | 40.3 GB/s | >= 47 GB/s | FAIL |
+| ib_write_lat | 4.28 us | <= 2 us | FAIL |
+| ib_read_lat | 16.00 us | <= 3.5 us | FAIL |
+| ibping | target=0x4b count=5 | 0% packet loss | PASS |
+
+- **PFC/ECN/CNP/congestion counters checked:** 146
+- **PFC/ECN/CNP/congestion non-zero:** no
+- **Failure reasons:**
+  - mlx5_4 port 1 state/rate failed (4: ACTIVE, 100 Gb/sec (2X HDR); required >= 400.0Gbps ACTIVE)
+  - mlx5_5 port 1 state/rate failed (4: ACTIVE, 100 Gb/sec (2X HDR); required >= 400.0Gbps ACTIVE)
+  - ib_read_bw bandwidth 40.3GB/s < 47GB/s
+  - ib_write_lat latency 4.28us > 2.0us
+  - ib_read_lat latency 16.0us > 3.5us
+**Overall: FAIL**
+
+---
+*Generated by GPU Test Suite v0.2.0*
\ No newline at end of file
diff --git a/reports_rdma_cross_node_mlx5_0_20260523.md b/reports_rdma_cross_node_mlx5_0_20260523.md
new file mode 100644
index 0000000..dfdfb8a
--- /dev/null
+++ b/reports_rdma_cross_node_mlx5_0_20260523.md
@@ -0,0 +1,50 @@
+# RDMA Cross-node Evidence Report
+
+- **Date:** 2026-05-23 Asia/Shanghai
+- **Scope:** `aikubeworker0012` <-> `aikubeworker0016`, single rail `mlx5_0`, port 1
+- **Client/server bootstrap IPs:** `172.72.8.12` and `172.72.8.16`
+- **Bandwidth message size:** 4MB
+- **Latency message size:** 8B
+- **Iterations:** 1000
+
+## Port Evidence
+
+| Host | Device | State | Rate | Link | LID |
+|---|---|---|---|---|---|
+| aikubeworker0012 | mlx5_0/1 | ACTIVE | 400 Gb/sec (4X NDR) | InfiniBand | 0x58 |
+| aikubeworker0016 | mlx5_0/1 | ACTIVE | 400 Gb/sec (4X NDR) | InfiniBand | 0x4b |
+
+## Cross-node Perftest Results
+
+| Direction | Test | Value | PDF Threshold | Status |
+|---|---|---:|---:|---|
+| 0016 -> 0012 | ib_write_bw | 49.35 GB/s | >= 47 GB/s | PASS |
+| 0016 -> 0012 | ib_read_bw | 44.36 GB/s | >= 47 GB/s | FAIL |
+| 0016 -> 0012 | ib_write_lat avg | 2.17 us | <= 2.0 us | FAIL |
+| 0016 -> 0012 | ib_read_lat avg | 4.05 us | <= 3.5 us | FAIL |
+| 0012 -> 0016 | ib_write_bw | 48.38 GB/s | >= 47 GB/s | PASS |
+| 0012 -> 0016 | ib_read_bw | 44.37 GB/s | >= 47 GB/s | FAIL |
+| 0012 -> 0016 | ib_write_lat avg | 2.13 us | <= 2.0 us | FAIL |
+| 0012 -> 0016 | ib_read_lat avg | 4.08 us | <= 3.5 us | FAIL |
+
+## Bidirectional ibping
+
+| Direction | Target LID | Result |
+|---|---|---|
+| 0016 -> 0012 | 0x58 | 5 transmitted, 5 received, 0% packet loss; avg 0.005 ms |
+| 0012 -> 0016 | 0x4b | 5 transmitted, 5 received, 0% packet loss; avg 0.005 ms |
+
+## Fabric Counters
+
+| Host | PFC/ECN/CNP/congestion Counters Checked | Non-zero Counters | Status |
+|---|---:|---:|---|
+| aikubeworker0012 | 146 | 0 | PASS |
+| aikubeworker0016 | 146 | 0 | PASS |
+
+## Verdict
+
+**RDMA cross-node verdict: FAIL**
+
+Reason: bidirectional connectivity is good, PFC/ECN/CNP/congestion counters are clean, and write bandwidth passes. However read bandwidth is below 47 GB/s in both directions, write latency is slightly above 2.0 us in both directions, and read latency is above 3.5 us in both directions.
+
+Note: `modules/rdma_test.py` was corrected on 2026-05-23 to parse the `t_avg[usec]` column of `ib_write_lat` / `ib_read_lat` output rather than the 99.9th-percentile column. Older reports that show `ib_read_lat` around 16 us were therefore produced by the previous parser, not the current one.
diff --git a/reports_rdma_single_node_summary.md b/reports_rdma_single_node_summary.md
new file mode 100644
index 0000000..c1c95de
--- /dev/null
+++ b/reports_rdma_single_node_summary.md
@@ -0,0 +1,73 @@
+# Single-node RDMA/IB Report
+
+Generated: 2026-05-22 23:41 Asia/Shanghai
+
+Scope: project CLI `gpu_tester.py --test rdma --report --format json`, run separately on each host.
+
+Important note: the current repository RDMA test is single-node only. In `modules/rdma_test.py`, the perftest client connects to `localhost`, so this report validates local IB device discovery and local perftest behavior. It does not validate cross-node RDMA bandwidth between `aikubeworker0012` and `aikubeworker0016`.
+
+## Summary
+
+| Host | Devices Found | Active 400G Ports | Active 100G Ports | Active 25G Ports | Down Ports | Overall |
+| --- | ---: | --- | --- | --- | --- | --- |
+| aikubeworker0012 / 172.72.8.12 | 10 | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | mlx5_4, mlx5_5 | mlx5_2, mlx5_8 | mlx5_3, mlx5_9 | WARN |
+| aikubeworker0016 / 172.72.8.16 | 10 | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | mlx5_4, mlx5_5 | mlx5_2, mlx5_8 | mlx5_3, mlx5_9 | WARN |
+
+## Bandwidth
+
+The bandwidth numbers below come from the repo's `localhost` (loopback) RDMA perftest path, so they do not measure cross-node link bandwidth.
+
+| Host | ib_write_bw | Threshold | Status | ib_read_bw | Threshold | Status |
+| --- | ---: | ---: | --- | ---: | ---: | --- |
+| aikubeworker0012 | 0.13 GB/s | 50 GB/s | WARN | 0.13 GB/s | 50 GB/s | WARN |
+| aikubeworker0016 | 0.13 GB/s | 50 GB/s | WARN | 0.13 GB/s | 50 GB/s | WARN |
+
+## Latency
+
+| Host | ib_write_lat | Limit | Status | ib_read_lat | Limit | Status |
+| --- | ---: | ---: | --- | ---: | ---: | --- |
+| aikubeworker0012 | 4.53 us | 10 us | PASS | 16.00 us | 10 us | WARN |
+| aikubeworker0016 | 4.22 us | 10 us | PASS | 16.00 us | 10 us | WARN |
+
+## Device Inventory
+
+### aikubeworker0012
+
+| Device | Port | State | Physical State | Rate |
+| --- | --- | --- | --- | --- |
+| mlx5_0 | 1 | ACTIVE | LinkUp | 400 Gb/sec (4X NDR) |
+| mlx5_1 | 1 | ACTIVE | LinkUp | 400 Gb/sec (4X NDR) |
+| mlx5_2 | 1 | ACTIVE | LinkUp | 25 Gb/sec (1X EDR) |
+| mlx5_3 | 1 | DOWN | Disabled | 25 Gb/sec (1X EDR) |
+| mlx5_4 | 1 | ACTIVE | LinkUp | 100 Gb/sec (2X HDR) |
+| mlx5_5 | 1 | ACTIVE | LinkUp | 100 Gb/sec (2X HDR) |
+| mlx5_6 | 1 | ACTIVE | LinkUp | 400 Gb/sec (4X NDR) |
+| mlx5_7 | 1 | ACTIVE | LinkUp | 400 Gb/sec (4X NDR) |
+| mlx5_8 | 1 | ACTIVE | LinkUp | 25 Gb/sec (1X EDR) |
+| mlx5_9 | 1 | DOWN | Disabled | 25 Gb/sec (1X EDR) |
+
+### aikubeworker0016
+
+| Device | Port | State | Physical State | Rate |
+| --- | --- | --- | --- | --- |
+| mlx5_0 | 1 | ACTIVE | LinkUp | 400 Gb/sec (4X NDR) |
+| mlx5_1 | 1 | ACTIVE | LinkUp | 400 Gb/sec (4X NDR) |
+| mlx5_2 | 1 | ACTIVE | LinkUp | 25 Gb/sec (1X EDR) |
+| mlx5_3 | 1 | DOWN | Disabled | 25 Gb/sec (1X EDR) |
+| mlx5_4 | 1 | ACTIVE | LinkUp | 100 Gb/sec (2X HDR) |
+| mlx5_5 | 1 | ACTIVE | LinkUp | 100 Gb/sec (2X HDR) |
+| mlx5_6 | 1 | ACTIVE | LinkUp | 400 Gb/sec (4X NDR) |
+| mlx5_7 | 1 | ACTIVE | LinkUp | 400 Gb/sec (4X NDR) |
+| mlx5_8 | 1 | ACTIVE | LinkUp | 25 Gb/sec (1X EDR) |
+| mlx5_9 | 1 | DOWN | Disabled | 25 Gb/sec (1X EDR) |
+
+## Files
+
+Raw JSON:
+
+- `reports_rdma_aikubeworker0012.json`
+- `reports_rdma_aikubeworker0016.json`
+
+Markdown summary:
+
+- `reports_rdma_single_node_summary.md`
diff --git a/reports_single_gpu_aikubeworker0012.json b/reports_single_gpu_aikubeworker0012.json
new file mode 100644
index 0000000..6cc5a37
--- /dev/null
+++ b/reports_single_gpu_aikubeworker0012.json
@@ -0,0 +1,292 @@
+{
+  "timestamp": "2026-05-22T15:26:26.973586",
+  "gpu_info": {
+    "driver_version": "580.159.03",
+    "cuda_version": "13.0",
+    "gpu_count": 8,
+    "gpus": [
+      {
+        "index": 0,
+        "name": "NVIDIA H100 80GB HBM3",
+        "uuid": "GPU-7658c03c-7659-9886-041e-545c21d53e12",
+        "pci_bus_id": "00000000:18:00.0",
+        "pcie_link_gen": 5,
+        "pcie_link_width": 16,
+        "vram_total_mb": 81559,
+        "vram_used_mb": 4,
+        "vram_free_mb": 81076,
+        "power_draw": 69.72,
+        "power_limit": 700.0,
+        "clock_sm": 345,
+        "clock_mem": 2619,
+        "temperature": 25,
+        "fan_speed": 0,
+        "persistence_mode": false,
+        "compute_mode": "Default",
+        "serial_number": "1654923030411",
+        "ecc_errors_single": 0,
+        "ecc_errors_double": 0
+      },
+      {
+        "index": 1,
+        "name": "NVIDIA H100 80GB HBM3",
+        "uuid": "GPU-6392d40b-893b-9fc2-4284-a3f1d8c4d7f1",
+        "pci_bus_id": "00000000:2A:00.0",
+        "pcie_link_gen": 5,
+        "pcie_link_width": 16,
+        "vram_total_mb": 81559,
+        "vram_used_mb": 0,
+        "vram_free_mb": 81079,
+        "power_draw": 73.17,
+        "power_limit": 700.0,
+        "clock_sm": 345,
+        "clock_mem": 2619,
+        "temperature": 25,
+        "fan_speed": 0,
+        "persistence_mode": false,
+        "compute_mode": "Default",
+        "serial_number": "1654724063165",
+        "ecc_errors_single": 0,
+        "ecc_errors_double": 0
+      },
+      {
+        "index": 2,
+        "name": "NVIDIA H100 80GB HBM3",
+        "uuid": "GPU-2ae38735-10de-fb0b-fb20-9d1b5b434558",
+        "pci_bus_id": "00000000:3A:00.0",
+        "pcie_link_gen": 5,
+        "pcie_link_width": 16,
+        "vram_total_mb": 81559,
+        "vram_used_mb": 0,
+        "vram_free_mb": 81079,
+        "power_draw": 68.71,
+        "power_limit": 700.0,
+        "clock_sm": 345,
+        "clock_mem": 2619,
+        "temperature": 26,
+        "fan_speed": 0,
+        "persistence_mode": false,
+        "compute_mode": "Default",
+        "serial_number": "1654823036530",
+        "ecc_errors_single": 0,
+        "ecc_errors_double": 0
+      },
+      {
+        "index": 3,
+        "name": "NVIDIA H100 80GB HBM3",
+        "uuid": "GPU-ec62123f-0c48-6dbd-49e4-8b231b3fed0e",
+        "pci_bus_id": "00000000:5D:00.0",
+        "pcie_link_gen": 5,
+        "pcie_link_width": 16,
+        "vram_total_mb": 81559,
+        "vram_used_mb": 0,
+        "vram_free_mb": 81079,
+        "power_draw": 69.73,
+        "power_limit": 700.0,
+        "clock_sm": 345,
+        "clock_mem": 2619,
+        "temperature": 25,
+        "fan_speed": 0,
+        "persistence_mode": false,
+        "compute_mode": "Default",
+        "serial_number": "1654923021638",
+        "ecc_errors_single": 0,
+        "ecc_errors_double": 0
+      },
+      {
+        "index": 4,
+        "name": "NVIDIA H100 80GB HBM3",
+        "uuid": "GPU-b64fc270-109e-1543-fb0c-be7feecf14f1",
+        "pci_bus_id": "00000000:9A:00.0",
+        "pcie_link_gen": 5,
+        "pcie_link_width": 16,
+        "vram_total_mb": 81559,
+        "vram_used_mb": 0,
+        "vram_free_mb": 81079,
+        "power_draw": 68.84,
+        "power_limit": 700.0,
+        "clock_sm": 345,
+        "clock_mem": 2619,
+        "temperature": 24,
+        "fan_speed": 0,
+        "persistence_mode": false,
+        "compute_mode": "Default",
+        "serial_number": "1655023033179",
+        "ecc_errors_single": 0,
+        "ecc_errors_double": 0
+      },
+      {
+        "index": 5,
+        "name": "NVIDIA H100 80GB HBM3",
+        "uuid": "GPU-15ab7baf-9010-7cf3-5462-eeb09f8dbe65",
+        "pci_bus_id": "00000000:AB:00.0",
+        "pcie_link_gen": 5,
+        "pcie_link_width": 16,
+        "vram_total_mb": 81559,
+        "vram_used_mb": 0,
+        "vram_free_mb": 81079,
+        "power_draw": 69.94,
+        "power_limit": 700.0,
+        "clock_sm": 345,
+        "clock_mem": 2619,
+        "temperature": 27,
+        "fan_speed": 0,
+        "persistence_mode": false,
+        "compute_mode": "Default",
+        "serial_number": "1655023034225",
+        "ecc_errors_single": 0,
+        "ecc_errors_double": 0
+      },
+      {
+        "index": 6,
+        "name": "NVIDIA H100 80GB HBM3",
+        "uuid": "GPU-225f6f3c-6fef-d1e2-5428-d90f665fb3d3",
+        "pci_bus_id": "00000000:BA:00.0",
+        "pcie_link_gen": 5,
+        "pcie_link_width": 16,
+        "vram_total_mb": 81559,
+        "vram_used_mb": 0,
+        "vram_free_mb": 81079,
+        "power_draw": 70.46,
+        "power_limit": 700.0,
+        "clock_sm": 345,
+        "clock_mem": 2619,
+        "temperature": 25,
+        "fan_speed": 0,
+        "persistence_mode": false,
+        "compute_mode": "Default",
+        "serial_number": "1654923078278",
+        "ecc_errors_single": 0,
+        "ecc_errors_double": 0
+      },
+      {
+        "index": 7,
+        "name": "NVIDIA H100 80GB HBM3",
+        "uuid": "GPU-79aeb6a8-c00c-6edb-956f-779ef56950a3",
+        "pci_bus_id": "00000000:DB:00.0",
+        "pcie_link_gen": 5,
+        "pcie_link_width": 16,
+        "vram_total_mb": 81559,
+        "vram_used_mb": 0,
+        "vram_free_mb": 81079,
+        "power_draw": 71.76,
+        "power_limit": 700.0,
+        "clock_sm": 345,
+        "clock_mem": 2619,
+        "temperature": 24,
+        "fan_speed": 0,
+        "persistence_mode": false,
+        "compute_mode": "Default",
+        "serial_number": "1654024031464",
+        "ecc_errors_single": 0,
+        "ecc_errors_double": 0
+      }
+    ],
+    "topology": "\t\u001b[4mGPU0\tGPU1\tGPU2\tGPU3\tGPU4\tGPU5\tGPU6\tGPU7\tNIC0\tNIC1\tNIC2\tNIC3\tNIC4\tNIC5\tNIC6\tNIC7\tNIC8\tNIC9\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\u001b[0m\nGPU0\t X \tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\tPIX\tNODE\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t0-55,112-167\t0\t\tN/A\nGPU1\tNV18\t X \tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\tNODE\tPIX\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t0-55,112-167\t0\t\tN/A\nGPU2\tNV18\tNV18\t X \tNV18\tNV18\tNV18\tNV18\tNV18\tNODE\tNODE\tPIX\tPIX\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t0-55,112-167\t0\t\tN/A\nGPU3\tNV18\tNV18\tNV18\t X \tNV18\tNV18\tNV18\tNV18\tNODE\tNODE\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t0-55,112-167\t0\t\tN/A\nGPU4\tNV18\tNV18\tNV18\tNV18\t X \tNV18\tNV18\tNV18\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tPIX\tNODE\tNODE\tNODE\t56-111,168-223\t1\t\tN/A\nGPU5\tNV18\tNV18\tNV18\tNV18\tNV18\t X \tNV18\tNV18\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\tPIX\tNODE\tNODE\t56-111,168-223\t1\t\tN/A\nGPU6\tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\t X \tNV18\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tPIX\tPIX\t56-111,168-223\t1\t\tN/A\nGPU7\tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\t X \tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tNODE\tNODE\t56-111,168-223\t1\t\tN/A\nNIC0\tPIX\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t X \tNODE\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC1\tNODE\tPIX\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tNODE\t X \tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC2\tNODE\tNODE\tPIX\tNODE\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\t X \tPIX\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC3\tNODE\tNODE\tPIX\tNODE\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tPIX\t X \tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC4\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tNODE\tNODE\t X \tPIX\tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC5\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tNODE\tNODE\tPIX\t X \tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC6\tSYS\tSYS\tSYS\tSYS\tPIX\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\t X 
\tNODE\tNODE\tNODE\t\t\t\t\nNIC7\tSYS\tSYS\tSYS\tSYS\tNODE\tPIX\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\t X \tNODE\tNODE\t\t\t\t\nNIC8\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tPIX\tNODE\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\t X \tPIX\t\t\t\t\nNIC9\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tPIX\tNODE\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tPIX\t X \t\t\t\t\n\nLegend:\n\n  X    = Self\n  SYS  = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n  NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n  PHB  = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n  PXB  = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n  PIX  = Connection traversing at most a single PCIe bridge\n  NV#  = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n  NIC0: mlx5_0\n  NIC1: mlx5_1\n  NIC2: mlx5_2\n  NIC3: mlx5_3\n  NIC4: mlx5_4\n  NIC5: mlx5_5\n  NIC6: mlx5_6\n  NIC7: mlx5_7\n  NIC8: mlx5_8\n  NIC9: mlx5_9\n\n",
+    "timestamp": "2026-05-22T15:26:34.187409",
+    "detected_gpu_type": "h100",
+    "gpu_label": "H100 SXM5"
+  },
+  "memory_bench": {
+    "memory": {
+      "source": "pytorch",
+      "h2d_bandwidth_gbps": 11.8,
+      "d2h_bandwidth_gbps": 9.9,
+      "d2d_bandwidth_gbps": 829.1,
+      "peak_bandwidth_gbps": 3400,
+      "efficiency_pct": 24.4,
+      "test_sizes_mb": [
+        1,
+        4,
+        16,
+        64,
+        256,
+        1024,
+        4096
+      ],
+      "bandwidth_by_size": {
+        "1": {
+          "h2d_gbps": 3.8,
+          "d2h_gbps": 1.4,
+          "d2d_gbps": 40.6
+        },
+        "4": {
+          "h2d_gbps": 7.6,
+          "d2h_gbps": 9.9,
+          "d2d_gbps": 141.5
+        },
+        "16": {
+          "h2d_gbps": 11.0,
+          "d2h_gbps": 1.9,
+          "d2d_gbps": 450.3
+        },
+        "64": {
+          "h2d_gbps": 11.8,
+          "d2h_gbps": 1.4,
+          "d2d_gbps": 726.5
+        },
+        "256": {
+          "h2d_gbps": 9.0,
+          "d2h_gbps": 1.4,
+          "d2d_gbps": 793.8
+        },
+        "1024": {
+          "h2d_gbps": 5.5,
+          "d2h_gbps": 1.4,
+          "d2d_gbps": 821.2
+        },
+        "4096": {
+          "h2d_gbps": 5.9,
+          "d2h_gbps": 1.4,
+          "d2d_gbps": 829.1
+        }
+      },
+      "per_gpu": []
+    }
+  },
+  "compute_bench": {
+    "compute": {
+      "per_dtype_tflops": {
+        "fp32": 52.0,
+        "tf32": 362.3,
+        "fp16": 691.0,
+        "bf16": 713.0,
+        "fp8": 1148.8
+      },
+      "peak_tflops": {
+        "fp32": 67,
+        "tf32": 495,
+        "fp16": 990,
+        "bf16": 990,
+        "fp8": 1979
+      },
+      "efficiency_pct": {
+        "fp32": 77.6,
+        "tf32": 73.2,
+        "fp16": 69.8,
+        "bf16": 72.0,
+        "fp8": 58.0
+      },
+      "pass_thresholds_tflops": {
+        "fp32": 54,
+        "tf32": 444,
+        "fp16": 734,
+        "bf16": 745,
+        "fp8": 1400
+      },
+      "per_gpu": [
+        {
+          "index": 0,
+          "fp32": 52.0,
+          "tf32": 362.3,
+          "fp16": 691.0,
+          "bf16": 713.0,
+          "fp8": 1148.8
+        }
+      ],
+      "matrix_size": 8192,
+      "warmup": 50,
+      "iterations": 500
+    }
+  }
+}
\ No newline at end of file
diff --git a/reports_single_gpu_aikubeworker0012.md b/reports_single_gpu_aikubeworker0012.md
new file mode 100644
index 0000000..3a6c3c9
--- /dev/null
+++ b/reports_single_gpu_aikubeworker0012.md
@@ -0,0 +1,54 @@
+# GPU Test Report
+
+- **Date:** 2026-05-22 15:27:51
+- **Host:** aikubeworker0012
+- **GPU:** NVIDIA H100 80GB HBM3 x8
+- **Driver:** 580.159.03 | **CUDA:** 13.0
+
+## Summary
+
+| Test | Result |
+|------|--------|
+| GPU Info | PASS (8 GPUs detected) |
+| Memory Bandwidth | WARN (829 GB/s via PyTorch fallback) |
+| Compute Throughput | FAIL (worst TF32 362 vs >= 444) |
+
+## GPU Information
+
+| GPU | Model | VRAM | Temp | Power | SM Clock |
+|-----|-------|------|------|-------|----------|
+| 0 | NVIDIA H100 80GB HBM3 | 81559 MB | 25C | 70/700W | 345 MHz |
+| 1 | NVIDIA H100 80GB HBM3 | 81559 MB | 25C | 73/700W | 345 MHz |
+| 2 | NVIDIA H100 80GB HBM3 | 81559 MB | 26C | 69/700W | 345 MHz |
+| 3 | NVIDIA H100 80GB HBM3 | 81559 MB | 25C | 70/700W | 345 MHz |
+| 4 | NVIDIA H100 80GB HBM3 | 81559 MB | 24C | 69/700W | 345 MHz |
+| 5 | NVIDIA H100 80GB HBM3 | 81559 MB | 27C | 70/700W | 345 MHz |
+| 6 | NVIDIA H100 80GB HBM3 | 81559 MB | 25C | 70/700W | 345 MHz |
+| 7 | NVIDIA H100 80GB HBM3 | 81559 MB | 24C | 72/700W | 345 MHz |
+
+## Memory Bandwidth
+
+Source: pytorch
+
+| Metric | Value | Peak | Efficiency |
+|--------|-------|------|------------|
+| H2D (PCIe) | 11.8 GB/s | 0 GB/s | 0.0% |
+| D2H (PCIe) | 9.9 GB/s | 0 GB/s | 0.0% |
+| D2D (NVLink) | 829.1 GB/s | 3400 GB/s | 24.4% |
+
+**Verdict: WARN** (D2D 829.1 GB/s via PyTorch fallback; nvbandwidth unavailable — figure is indicative only, not a true HBM peak)
+
+## Compute Throughput
+
+| DType | Achieved (TFLOPS) | Peak | Threshold | Status |
+|-------|-------------------|------|------------|--------|
+| FP32 | 52.0 | 67 | >= 54 | WARN |
+| TF32 | 362.3 | 495 | >= 444 | FAIL |
+| FP16 | 691.0 | 990 | >= 734 | WARN |
+| BF16 | 713.0 | 990 | >= 745 | WARN |
+| FP8 | 1148.8 | 1979 | >= 1400 | FAIL |
+
+**Verdict: FAIL** (absolute TFLOPS thresholds; worst efficiency 58.0%)
+
+---
+*Generated by GPU Test Suite v0.2.0*
\ No newline at end of file
diff --git a/reports_single_gpu_aikubeworker0016.json b/reports_single_gpu_aikubeworker0016.json
new file mode 100644
index 0000000..4b3c442
--- /dev/null
+++ b/reports_single_gpu_aikubeworker0016.json
@@ -0,0 +1,292 @@
+{
+  "timestamp": "2026-05-22T15:26:29.511252",
+  "gpu_info": {
+    "driver_version": "580.159.03",
+    "cuda_version": "13.0",
+    "gpu_count": 8,
+    "gpus": [
+      {
+        "index": 0,
+        "name": "NVIDIA H100 80GB HBM3",
+        "uuid": "GPU-dfbc9513-255d-4fe7-2b77-7b1ec3972e75",
+        "pci_bus_id": "00000000:18:00.0",
+        "pcie_link_gen": 5,
+        "pcie_link_width": 16,
+        "vram_total_mb": 81559,
+        "vram_used_mb": 4,
+        "vram_free_mb": 81076,
+        "power_draw": 69.81,
+        "power_limit": 700.0,
+        "clock_sm": 345,
+        "clock_mem": 2619,
+        "temperature": 20,
+        "fan_speed": 0,
+        "persistence_mode": false,
+        "compute_mode": "Default",
+        "serial_number": "1651924016120",
+        "ecc_errors_single": 0,
+        "ecc_errors_double": 0
+      },
+      {
+        "index": 1,
+        "name": "NVIDIA H100 80GB HBM3",
+        "uuid": "GPU-bb845ef7-d7b5-f011-9395-ea74274e2282",
+        "pci_bus_id": "00000000:2A:00.0",
+        "pcie_link_gen": 5,
+        "pcie_link_width": 16,
+        "vram_total_mb": 81559,
+        "vram_used_mb": 0,
+        "vram_free_mb": 81079,
+        "power_draw": 67.45,
+        "power_limit": 700.0,
+        "clock_sm": 345,
+        "clock_mem": 2619,
+        "temperature": 20,
+        "fan_speed": 0,
+        "persistence_mode": false,
+        "compute_mode": "Default",
+        "serial_number": "1651924015483",
+        "ecc_errors_single": 0,
+        "ecc_errors_double": 0
+      },
+      {
+        "index": 2,
+        "name": "NVIDIA H100 80GB HBM3",
+        "uuid": "GPU-3720cf13-2a34-be38-27be-0a7adc4addc4",
+        "pci_bus_id": "00000000:3A:00.0",
+        "pcie_link_gen": 5,
+        "pcie_link_width": 16,
+        "vram_total_mb": 81559,
+        "vram_used_mb": 0,
+        "vram_free_mb": 81079,
+        "power_draw": 66.69,
+        "power_limit": 700.0,
+        "clock_sm": 345,
+        "clock_mem": 2619,
+        "temperature": 21,
+        "fan_speed": 0,
+        "persistence_mode": false,
+        "compute_mode": "Default",
+        "serial_number": "1651924025595",
+        "ecc_errors_single": 0,
+        "ecc_errors_double": 0
+      },
+      {
+        "index": 3,
+        "name": "NVIDIA H100 80GB HBM3",
+        "uuid": "GPU-87080b2d-ac43-be0d-d574-c193078850ae",
+        "pci_bus_id": "00000000:5D:00.0",
+        "pcie_link_gen": 5,
+        "pcie_link_width": 16,
+        "vram_total_mb": 81559,
+        "vram_used_mb": 0,
+        "vram_free_mb": 81079,
+        "power_draw": 66.86,
+        "power_limit": 700.0,
+        "clock_sm": 345,
+        "clock_mem": 2619,
+        "temperature": 20,
+        "fan_speed": 0,
+        "persistence_mode": false,
+        "compute_mode": "Default",
+        "serial_number": "1651924016862",
+        "ecc_errors_single": 0,
+        "ecc_errors_double": 0
+      },
+      {
+        "index": 4,
+        "name": "NVIDIA H100 80GB HBM3",
+        "uuid": "GPU-599bd883-cc5c-a5dd-6c33-c15f7049da48",
+        "pci_bus_id": "00000000:9A:00.0",
+        "pcie_link_gen": 5,
+        "pcie_link_width": 16,
+        "vram_total_mb": 81559,
+        "vram_used_mb": 0,
+        "vram_free_mb": 81079,
+        "power_draw": 67.07,
+        "power_limit": 700.0,
+        "clock_sm": 345,
+        "clock_mem": 2619,
+        "temperature": 20,
+        "fan_speed": 0,
+        "persistence_mode": false,
+        "compute_mode": "Default",
+        "serial_number": "1651924025670",
+        "ecc_errors_single": 0,
+        "ecc_errors_double": 0
+      },
+      {
+        "index": 5,
+        "name": "NVIDIA H100 80GB HBM3",
+        "uuid": "GPU-a1c6bba4-61b0-e623-06c9-9c88635e26fe",
+        "pci_bus_id": "00000000:AB:00.0",
+        "pcie_link_gen": 5,
+        "pcie_link_width": 16,
+        "vram_total_mb": 81559,
+        "vram_used_mb": 0,
+        "vram_free_mb": 81079,
+        "power_draw": 69.12,
+        "power_limit": 700.0,
+        "clock_sm": 345,
+        "clock_mem": 2619,
+        "temperature": 22,
+        "fan_speed": 0,
+        "persistence_mode": false,
+        "compute_mode": "Default",
+        "serial_number": "1651924027166",
+        "ecc_errors_single": 0,
+        "ecc_errors_double": 0
+      },
+      {
+        "index": 6,
+        "name": "NVIDIA H100 80GB HBM3",
+        "uuid": "GPU-98745a0c-39bd-3e56-d6ca-54ba3647ab6d",
+        "pci_bus_id": "00000000:BA:00.0",
+        "pcie_link_gen": 5,
+        "pcie_link_width": 16,
+        "vram_total_mb": 81559,
+        "vram_used_mb": 0,
+        "vram_free_mb": 81079,
+        "power_draw": 67.61,
+        "power_limit": 700.0,
+        "clock_sm": 345,
+        "clock_mem": 2619,
+        "temperature": 20,
+        "fan_speed": 0,
+        "persistence_mode": false,
+        "compute_mode": "Default",
+        "serial_number": "1651924026234",
+        "ecc_errors_single": 0,
+        "ecc_errors_double": 0
+      },
+      {
+        "index": 7,
+        "name": "NVIDIA H100 80GB HBM3",
+        "uuid": "GPU-8c73bd8b-666b-357e-ac5d-c75ac7a759db",
+        "pci_bus_id": "00000000:DB:00.0",
+        "pcie_link_gen": 5,
+        "pcie_link_width": 16,
+        "vram_total_mb": 81559,
+        "vram_used_mb": 0,
+        "vram_free_mb": 81079,
+        "power_draw": 66.19,
+        "power_limit": 700.0,
+        "clock_sm": 345,
+        "clock_mem": 2619,
+        "temperature": 20,
+        "fan_speed": 0,
+        "persistence_mode": false,
+        "compute_mode": "Default",
+        "serial_number": "1651924027255",
+        "ecc_errors_single": 0,
+        "ecc_errors_double": 0
+      }
+    ],
+    "topology": "\t\u001b[4mGPU0\tGPU1\tGPU2\tGPU3\tGPU4\tGPU5\tGPU6\tGPU7\tNIC0\tNIC1\tNIC2\tNIC3\tNIC4\tNIC5\tNIC6\tNIC7\tNIC8\tNIC9\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\u001b[0m\nGPU0\t X \tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\tPIX\tNODE\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t0-55,112-167\t0\t\tN/A\nGPU1\tNV18\t X \tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\tNODE\tPIX\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t0-55,112-167\t0\t\tN/A\nGPU2\tNV18\tNV18\t X \tNV18\tNV18\tNV18\tNV18\tNV18\tNODE\tNODE\tPIX\tPIX\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t0-55,112-167\t0\t\tN/A\nGPU3\tNV18\tNV18\tNV18\t X \tNV18\tNV18\tNV18\tNV18\tNODE\tNODE\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t0-55,112-167\t0\t\tN/A\nGPU4\tNV18\tNV18\tNV18\tNV18\t X \tNV18\tNV18\tNV18\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tPIX\tNODE\tNODE\tNODE\t56-111,168-223\t1\t\tN/A\nGPU5\tNV18\tNV18\tNV18\tNV18\tNV18\t X \tNV18\tNV18\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\tPIX\tNODE\tNODE\t56-111,168-223\t1\t\tN/A\nGPU6\tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\t X \tNV18\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tPIX\tPIX\t56-111,168-223\t1\t\tN/A\nGPU7\tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\t X \tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tNODE\tNODE\t56-111,168-223\t1\t\tN/A\nNIC0\tPIX\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t X \tNODE\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC1\tNODE\tPIX\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tNODE\t X \tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC2\tNODE\tNODE\tPIX\tNODE\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\t X \tPIX\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC3\tNODE\tNODE\tPIX\tNODE\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tPIX\t X \tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC4\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tNODE\tNODE\t X \tPIX\tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC5\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tNODE\tNODE\tPIX\t X \tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC6\tSYS\tSYS\tSYS\tSYS\tPIX\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\t X 
\tNODE\tNODE\tNODE\t\t\t\t\nNIC7\tSYS\tSYS\tSYS\tSYS\tNODE\tPIX\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\t X \tNODE\tNODE\t\t\t\t\nNIC8\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tPIX\tNODE\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\t X \tPIX\t\t\t\t\nNIC9\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tPIX\tNODE\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tPIX\t X \t\t\t\t\n\nLegend:\n\n  X    = Self\n  SYS  = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n  NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n  PHB  = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n  PXB  = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n  PIX  = Connection traversing at most a single PCIe bridge\n  NV#  = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n  NIC0: mlx5_0\n  NIC1: mlx5_1\n  NIC2: mlx5_2\n  NIC3: mlx5_3\n  NIC4: mlx5_4\n  NIC5: mlx5_5\n  NIC6: mlx5_6\n  NIC7: mlx5_7\n  NIC8: mlx5_8\n  NIC9: mlx5_9\n\n",
+    "timestamp": "2026-05-22T15:26:36.627805",
+    "detected_gpu_type": "h100",
+    "gpu_label": "H100 SXM5"
+  },
+  "memory_bench": {
+    "memory": {
+      "source": "pytorch",
+      "h2d_bandwidth_gbps": 11.8,
+      "d2h_bandwidth_gbps": 10.1,
+      "d2d_bandwidth_gbps": 829.0,
+      "peak_bandwidth_gbps": 3400,
+      "efficiency_pct": 24.4,
+      "test_sizes_mb": [
+        1,
+        4,
+        16,
+        64,
+        256,
+        1024,
+        4096
+      ],
+      "bandwidth_by_size": {
+        "1": {
+          "h2d_gbps": 3.6,
+          "d2h_gbps": 1.4,
+          "d2d_gbps": 40.3
+        },
+        "4": {
+          "h2d_gbps": 7.7,
+          "d2h_gbps": 10.1,
+          "d2d_gbps": 159.5
+        },
+        "16": {
+          "h2d_gbps": 10.9,
+          "d2h_gbps": 1.9,
+          "d2d_gbps": 439.5
+        },
+        "64": {
+          "h2d_gbps": 11.8,
+          "d2h_gbps": 1.4,
+          "d2d_gbps": 740.5
+        },
+        "256": {
+          "h2d_gbps": 9.0,
+          "d2h_gbps": 1.4,
+          "d2d_gbps": 792.1
+        },
+        "1024": {
+          "h2d_gbps": 8.4,
+          "d2h_gbps": 1.4,
+          "d2d_gbps": 818.9
+        },
+        "4096": {
+          "h2d_gbps": 6.1,
+          "d2h_gbps": 1.4,
+          "d2d_gbps": 829.0
+        }
+      },
+      "per_gpu": []
+    }
+  },
+  "compute_bench": {
+    "compute": {
+      "per_dtype_tflops": {
+        "fp32": 51.9,
+        "tf32": 357.8,
+        "fp16": 667.2,
+        "bf16": 699.1,
+        "fp8": 1146.2
+      },
+      "peak_tflops": {
+        "fp32": 67,
+        "tf32": 495,
+        "fp16": 990,
+        "bf16": 990,
+        "fp8": 1979
+      },
+      "efficiency_pct": {
+        "fp32": 77.5,
+        "tf32": 72.3,
+        "fp16": 67.4,
+        "bf16": 70.6,
+        "fp8": 57.9
+      },
+      "pass_thresholds_tflops": {
+        "fp32": 54,
+        "tf32": 444,
+        "fp16": 734,
+        "bf16": 745,
+        "fp8": 1400
+      },
+      "per_gpu": [
+        {
+          "index": 0,
+          "fp32": 51.9,
+          "tf32": 357.8,
+          "fp16": 667.2,
+          "bf16": 699.1,
+          "fp8": 1146.2
+        }
+      ],
+      "matrix_size": 8192,
+      "warmup": 50,
+      "iterations": 500
+    }
+  }
+}
\ No newline at end of file
diff --git a/reports_single_gpu_aikubeworker0016.md b/reports_single_gpu_aikubeworker0016.md
new file mode 100644
index 0000000..49f9f45
--- /dev/null
+++ b/reports_single_gpu_aikubeworker0016.md
@@ -0,0 +1,54 @@
+# GPU Test Report
+
+- **Date:** 2026-05-22 15:27:53
+- **Host:** aikubeworker0016
+- **GPU:** NVIDIA H100 80GB HBM3 x8
+- **Driver:** 580.159.03 | **CUDA:** 13.0
+
+## Summary
+
+| Test | Result |
+|------|--------|
+| GPU Info | PASS (8 GPUs detected) |
+| Memory Bandwidth | WARN (829 GB/s via PyTorch fallback) |
+| Compute Throughput | FAIL (worst TF32 358 vs >= 444) |
+
+## GPU Information
+
+| GPU | Model | VRAM | Temp | Power | SM Clock |
+|-----|-------|------|------|-------|----------|
+| 0 | NVIDIA H100 80GB HBM3 | 81559 MB | 20C | 70/700W | 345 MHz |
+| 1 | NVIDIA H100 80GB HBM3 | 81559 MB | 20C | 67/700W | 345 MHz |
+| 2 | NVIDIA H100 80GB HBM3 | 81559 MB | 21C | 67/700W | 345 MHz |
+| 3 | NVIDIA H100 80GB HBM3 | 81559 MB | 20C | 67/700W | 345 MHz |
+| 4 | NVIDIA H100 80GB HBM3 | 81559 MB | 20C | 67/700W | 345 MHz |
+| 5 | NVIDIA H100 80GB HBM3 | 81559 MB | 22C | 69/700W | 345 MHz |
+| 6 | NVIDIA H100 80GB HBM3 | 81559 MB | 20C | 68/700W | 345 MHz |
+| 7 | NVIDIA H100 80GB HBM3 | 81559 MB | 20C | 66/700W | 345 MHz |
+
+## Memory Bandwidth
+
+Source: pytorch
+
+| Metric | Value | Peak | Efficiency |
+|--------|-------|------|------------|
+| H2D (PCIe) | 11.8 GB/s | N/A | N/A |
+| D2H (PCIe) | 10.1 GB/s | N/A | N/A |
+| D2D (NVLink) | 829.0 GB/s | 3400 GB/s | 24.4% |
+
+**Verdict: WARN** (D2D 829.0 GB/s via PyTorch fallback; nvbandwidth unavailable — figure is indicative only, not a true HBM peak)
+
+## Compute Throughput
+
+| DType | Achieved (TFLOPS) | Peak | Threshold | Status |
+|-------|-------------------|------|------------|--------|
+| FP32 | 51.9 | 67 | >= 54 | WARN |
+| TF32 | 357.8 | 495 | >= 444 | FAIL |
+| FP16 | 667.2 | 990 | >= 734 | WARN |
+| BF16 | 699.1 | 990 | >= 745 | WARN |
+| FP8 | 1146.2 | 1979 | >= 1400 | FAIL |
+
+**Verdict: FAIL** (absolute TFLOPS thresholds; worst efficiency 57.9%)
+
+---
+*Generated by GPU Test Suite v0.2.0*
\ No newline at end of file
diff --git a/reports_stress_smoke_reasons_aikubeworker0012.json b/reports_stress_smoke_reasons_aikubeworker0012.json
new file mode 100644
index 0000000..2722c96
--- /dev/null
+++ b/reports_stress_smoke_reasons_aikubeworker0012.json
@@ -0,0 +1,165 @@
+{
+  "stress": {
+    "source": "pytorch",
+    "passed": false,
+    "duration_sec": 45,
+    "elapsed_sec": 45.4,
+    "gpu_status": {
+      "0": "PASS",
+      "1": "PASS",
+      "2": "PASS",
+      "3": "PASS",
+      "4": "PASS",
+      "5": "PASS",
+      "6": "PASS",
+      "7": "PASS"
+    },
+    "telemetry": {
+      "passed": false,
+      "samples": 39,
+      "steady_samples": 31,
+      "warmup_sec": 9.0,
+      "max_temp_c": {
+        "0": 59.0,
+        "1": 58.0,
+        "2": 65.0,
+        "3": 54.0,
+        "4": 59.0,
+        "5": 66.0,
+        "6": 62.0,
+        "7": 55.0
+      },
+      "avg_power_w": {
+        "0": 697.0,
+        "1": 697.4,
+        "2": 697.9,
+        "3": 698.0,
+        "4": 697.8,
+        "5": 697.6,
+        "6": 697.9,
+        "7": 698.2
+      },
+      "temp_delta_c": 12.0,
+      "throttle_events": [
+        {
+          "gpu": 0,
+          "throttle": "0x0000000000000004",
+          "real_throttle": "0x4"
+        },
+        {
+          "gpu": 1,
+          "throttle": "0x0000000000000004",
+          "real_throttle": "0x4"
+        },
+        {
+          "gpu": 2,
+          "throttle": "0x0000000000000004",
+          "real_throttle": "0x4"
+        },
+        {
+          "gpu": 3,
+          "throttle": "0x0000000000000004",
+          "real_throttle": "0x4"
+        },
+        {
+          "gpu": 4,
+          "throttle": "0x0000000000000004",
+          "real_throttle": "0x4"
+        },
+        {
+          "gpu": 5,
+          "throttle": "0x0000000000000004",
+          "real_throttle": "0x4"
+        },
+        {
+          "gpu": 6,
+          "throttle": "0x0000000000000004",
+          "real_throttle": "0x4"
+        },
+        {
+          "gpu": 7,
+          "throttle": "0x0000000000000004",
+          "real_throttle": "0x4"
+        },
+        {
+          "gpu": 0,
+          "throttle": "0x0000000000000004",
+          "real_throttle": "0x4"
+        },
+        {
+          "gpu": 1,
+          "throttle": "0x0000000000000004",
+          "real_throttle": "0x4"
+        },
+        {
+          "gpu": 2,
+          "throttle": "0x0000000000000004",
+          "real_throttle": "0x4"
+        },
+        {
+          "gpu": 3,
+          "throttle": "0x0000000000000004",
+          "real_throttle": "0x4"
+        },
+        {
+          "gpu": 4,
+          "throttle": "0x0000000000000004",
+          "real_throttle": "0x4"
+        },
+        {
+          "gpu": 5,
+          "throttle": "0x0000000000000004",
+          "real_throttle": "0x4"
+        },
+        {
+          "gpu": 6,
+          "throttle": "0x0000000000000004",
+          "real_throttle": "0x4"
+        },
+        {
+          "gpu": 7,
+          "throttle": "0x0000000000000004",
+          "real_throttle": "0x4"
+        },
+        {
+          "gpu": 0,
+          "throttle": "0x0000000000000004",
+          "real_throttle": "0x4"
+        },
+        {
+          "gpu": 1,
+          "throttle": "0x0000000000000004",
+          "real_throttle": "0x4"
+        },
+        {
+          "gpu": 2,
+          "throttle": "0x0000000000000004",
+          "real_throttle": "0x4"
+        },
+        {
+          "gpu": 3,
+          "throttle": "0x0000000000000004",
+          "real_throttle": "0x4"
+        }
+      ],
+      "throttle_event_count": 248,
+      "xid_events": [],
+      "tflops_jitter_pct": 4.07,
+      "steady_tflops_samples": 781,
+      "failures": [
+        "GPU temperature delta 12.0C exceeds 5.0C",
+        "non-idle throttle reasons observed in 248 samples (first: GPU 0 0x4)"
+      ],
+      "thresholds": {
+        "max_temp_c": 80.0,
+        "max_temp_delta_c": 5.0,
+        "min_power_w": 630.0,
+        "max_tflops_jitter_pct": 5.0,
+        "warmup_sec": 10.0,
+        "min_steady_samples": 10
+      }
+    },
+    "timestamp": "2026-05-22T17:52:09.074859"
+  },
+  "timestamp": "2026-05-22T17:52:09.082873"
+}
\ No newline at end of file
diff --git a/reports_stress_smoke_reasons_aikubeworker0012.md b/reports_stress_smoke_reasons_aikubeworker0012.md
new file mode 100644
index 0000000..cea30e2
--- /dev/null
+++ b/reports_stress_smoke_reasons_aikubeworker0012.md
@@ -0,0 +1,29 @@
+# GPU Test Report
+
+- **Date:** 2026-05-22T17:52:09.082873
+- **Host:** aikubeworker0012
+
+## Summary
+
+| Test | Result |
+|------|--------|
+| Stress Test | FAIL |
+
+## Stress Test
+
+- **Source:** pytorch
+- **Duration:** 45s (requested 45s)
+- **Telemetry samples:** 39
+- **Max temp (C, per GPU):** GPU0 59.0, GPU1 58.0, GPU2 65.0, GPU3 54.0, GPU4 59.0, GPU5 66.0, GPU6 62.0, GPU7 55.0
+- **Avg power (W, per GPU):** GPU0 697.0, GPU1 697.4, GPU2 697.9, GPU3 698.0, GPU4 697.8, GPU5 697.6, GPU6 697.9, GPU7 698.2
+- **Temp delta (across GPUs):** 12.0 C
+- **TFLOPS jitter:** 4.07%
+- **Throttle events:** 248
+- **XID events:** 0
+- **Failure reasons:**
+  - GPU temperature delta 12.0C exceeds 5.0C
+  - non-idle throttle reasons observed in 248 samples (first: GPU 0 0x4, i.e. NVML SW power cap)
+- **Result: FAIL**
+
+---
+*Generated by GPU Test Suite v0.2.0*
\ No newline at end of file
diff --git a/reports_stress_smoke_reasons_aikubeworker0016.json b/reports_stress_smoke_reasons_aikubeworker0016.json
new file mode 100644
index 0000000..8d39f58
--- /dev/null
+++ b/reports_stress_smoke_reasons_aikubeworker0016.json
@@ -0,0 +1,165 @@
+{
+  "stress": {
+    "source": "pytorch",
+    "passed": false,
+    "duration_sec": 45,
+    "elapsed_sec": 45.4,
+    "gpu_status": {
+      "0": "PASS",
+      "1": "PASS",
+      "2": "PASS",
+      "3": "PASS",
+      "4": "PASS",
+      "5": "PASS",
+      "6": "PASS",
+      "7": "PASS"
+    },
+    "telemetry": {
+      "passed": false,
+      "samples": 39,
+      "steady_samples": 31,
+      "warmup_sec": 9.0,
+      "max_temp_c": {
+        "0": 50.0,
+        "1": 56.0,
+        "2": 57.0,
+        "3": 52.0,
+        "4": 51.0,
+        "5": 58.0,
+        "6": 53.0,
+        "7": 51.0
+      },
+      "avg_power_w": {
+        "0": 698.3,
+        "1": 698.5,
+        "2": 697.6,
+        "3": 697.9,
+        "4": 697.8,
+        "5": 698.0,
+        "6": 697.5,
+        "7": 698.0
+      },
+      "temp_delta_c": 8.0,
+      "throttle_events": [
+        {
+          "gpu": 0,
+          "throttle": "0x0000000000000004",
+          "real_throttle": "0x4"
+        },
+        {
+          "gpu": 1,
+          "throttle": "0x0000000000000004",
+          "real_throttle": "0x4"
+        },
+        {
+          "gpu": 2,
+          "throttle": "0x0000000000000004",
+          "real_throttle": "0x4"
+        },
+        {
+          "gpu": 3,
+          "throttle": "0x0000000000000004",
+          "real_throttle": "0x4"
+        },
+        {
+          "gpu": 4,
+          "throttle": "0x0000000000000004",
+          "real_throttle": "0x4"
+        },
+        {
+          "gpu": 5,
+          "throttle": "0x0000000000000004",
+          "real_throttle": "0x4"
+        },
+        {
+          "gpu": 6,
+          "throttle": "0x0000000000000004",
+          "real_throttle": "0x4"
+        },
+        {
+          "gpu": 7,
+          "throttle": "0x0000000000000004",
+          "real_throttle": "0x4"
+        },
+        {
+          "gpu": 0,
+          "throttle": "0x0000000000000004",
+          "real_throttle": "0x4"
+        },
+        {
+          "gpu": 1,
+          "throttle": "0x0000000000000004",
+          "real_throttle": "0x4"
+        },
+        {
+          "gpu": 2,
+          "throttle": "0x0000000000000004",
+          "real_throttle": "0x4"
+        },
+        {
+          "gpu": 3,
+          "throttle": "0x0000000000000004",
+          "real_throttle": "0x4"
+        },
+        {
+          "gpu": 4,
+          "throttle": "0x0000000000000004",
+          "real_throttle": "0x4"
+        },
+        {
+          "gpu": 5,
+          "throttle": "0x0000000000000004",
+          "real_throttle": "0x4"
+        },
+        {
+          "gpu": 6,
+          "throttle": "0x0000000000000004",
+          "real_throttle": "0x4"
+        },
+        {
+          "gpu": 7,
+          "throttle": "0x0000000000000004",
+          "real_throttle": "0x4"
+        },
+        {
+          "gpu": 0,
+          "throttle": "0x0000000000000004",
+          "real_throttle": "0x4"
+        },
+        {
+          "gpu": 1,
+          "throttle": "0x0000000000000004",
+          "real_throttle": "0x4"
+        },
+        {
+          "gpu": 2,
+          "throttle": "0x0000000000000004",
+          "real_throttle": "0x4"
+        },
+        {
+          "gpu": 3,
+          "throttle": "0x0000000000000004",
+          "real_throttle": "0x4"
+        }
+      ],
+      "throttle_event_count": 248,
+      "xid_events": [],
+      "tflops_jitter_pct": 3.77,
+      "steady_tflops_samples": 787,
+      "failures": [
+        "GPU temperature delta 8.0C exceeds 5.0C",
+        "non-idle throttle reasons observed in 248 samples (first: GPU 0 0x4)"
+      ],
+      "thresholds": {
+        "max_temp_c": 80.0,
+        "max_temp_delta_c": 5.0,
+        "min_power_w": 630.0,
+        "max_tflops_jitter_pct": 5.0,
+        "warmup_sec": 10.0,
+        "min_steady_samples": 10
+      }
+    },
+    "timestamp": "2026-05-22T17:53:02.058687"
+  },
+  "timestamp": "2026-05-22T17:53:02.066792"
+}
\ No newline at end of file
diff --git a/reports_stress_smoke_reasons_aikubeworker0016.md b/reports_stress_smoke_reasons_aikubeworker0016.md
new file mode 100644
index 0000000..9f9c3ab
--- /dev/null
+++ b/reports_stress_smoke_reasons_aikubeworker0016.md
@@ -0,0 +1,29 @@
+# GPU Test Report
+
+- **Date:** 2026-05-22T17:53:02.066792
+- **Host:** aikubeworker0016
+
+## Summary
+
+| Test | Result |
+|------|--------|
+| Stress Test | FAIL |
+
+## Stress Test
+
+- **Source:** pytorch
+- **Duration:** 45s (requested 45s)
+- **Telemetry samples:** 39
+- **Max temp (C, per GPU):** GPU0 50.0, GPU1 56.0, GPU2 57.0, GPU3 52.0, GPU4 51.0, GPU5 58.0, GPU6 53.0, GPU7 51.0
+- **Avg power (W, per GPU):** GPU0 698.3, GPU1 698.5, GPU2 697.6, GPU3 697.9, GPU4 697.8, GPU5 698.0, GPU6 697.5, GPU7 698.0
+- **Temp delta (across GPUs):** 8.0 C
+- **TFLOPS jitter:** 3.77%
+- **Throttle events:** 248
+- **XID events:** 0
+- **Failure reasons:**
+  - GPU temperature delta 8.0C exceeds 5.0C
+  - non-idle throttle reasons observed in 248 samples (first: GPU 0 0x4, i.e. NVML SW power cap)
+- **Result: FAIL**
+
+---
+*Generated by GPU Test Suite v0.2.0*
\ No newline at end of file
diff --git a/reports_test_all_latest_aikubeworker0012_20260522_203246.md b/reports_test_all_latest_aikubeworker0012_20260522_203246.md
new file mode 100644
index 0000000..8853d18
--- /dev/null
+++ b/reports_test_all_latest_aikubeworker0012_20260522_203246.md
@@ -0,0 +1,322 @@
+# GPU Test Report
+
+- **Date:** 2026-05-22T20:32:51.687830
+- **Host:** aikubeworker0012
+- **GPU:** NVIDIA H100 80GB HBM3 x8
+- **Driver:** 580.159.03 | **CUDA:** 13.0
+
+## Overall Acceptance Verdict
+
+**Result: FAIL**
+
+Failed or unverified items:
+- Compute Throughput: FAIL (FP16 spread 3.04% > 3%)
+- NCCL: FAIL
+- Stress Test: FAIL
+- RDMA: FAIL
+
+## Summary
+
+| Test | Result |
+|------|--------|
+| GPU Info | PASS (8 GPUs detected) |
+| Health Check | PASS |
+| Memory Bandwidth | PASS (108.1%) |
+| Compute Throughput | FAIL (FP16 spread 3.04% > 3%) |
+| NVLink/NVSwitch | PASS |
+| DCGM | PASS |
+| NCCL | FAIL |
+| Stress Test | FAIL |
+| RDMA | FAIL |
+| Training | PASS (216498 tokens/sec) |
+
+## GPU Information
+
+| GPU | Model | VRAM | Temp | Power | SM Clock |
+|-----|-------|------|------|-------|----------|
+| 0 | NVIDIA H100 80GB HBM3 | 81559 MB | 25C | 69/700W | 345 MHz |
+| 1 | NVIDIA H100 80GB HBM3 | 81559 MB | 25C | 73/700W | 345 MHz |
+| 2 | NVIDIA H100 80GB HBM3 | 81559 MB | 26C | 69/700W | 345 MHz |
+| 3 | NVIDIA H100 80GB HBM3 | 81559 MB | 24C | 69/700W | 345 MHz |
+| 4 | NVIDIA H100 80GB HBM3 | 81559 MB | 24C | 69/700W | 345 MHz |
+| 5 | NVIDIA H100 80GB HBM3 | 81559 MB | 27C | 70/700W | 345 MHz |
+| 6 | NVIDIA H100 80GB HBM3 | 81559 MB | 25C | 70/700W | 345 MHz |
+| 7 | NVIDIA H100 80GB HBM3 | 81559 MB | 24C | 71/700W | 345 MHz |
+
+## Health Check
+
+**Overall: PASS**
+
+| GPU | Temp | Power | ECC | PCIe | Throttle | Status |
+|-----|------|-------|-----|------|----------|--------|
+| 0 | 25C PASS | 69W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** |
+| 1 | 25C PASS | 73W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** |
+| 2 | 26C PASS | 69W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** |
+| 3 | 24C PASS | 70W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** |
+| 4 | 24C PASS | 69W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** |
+| 5 | 27C PASS | 70W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** |
+| 6 | 25C PASS | 70W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** |
+| 7 | 24C PASS | 71W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** |
+
+## Memory Bandwidth
+
+Source: nvbandwidth
+
+| Metric | Value | Peak | Efficiency |
+|--------|-------|------|------------|
+| H2D (PCIe) | 55.4 GB/s | 64 GB/s | 86.6% |
+| D2H (PCIe) | 54.0 GB/s | 64 GB/s | 84.4% |
+| D2D (NVLink) | 486.5 GB/s | 450 GB/s | 108.1% |
+
+**Verdict: PASS** (D2D efficiency 108.1%)
+
+## Compute Throughput
+
+| DType | Achieved (TFLOPS) | Peak | Threshold | Status |
+|-------|-------------------|------|------------|--------|
+| FP32 | 51.9 | 67 | >= 54 | FAIL |
+| TF32 | 364.9 | 495 | >= 444 | FAIL |
+| FP16 | 680.0 | 990 | >= 734 | FAIL |
+| BF16 | 713.2 | 990 | >= 745 | FAIL |
+| FP8 | 1170.4 | 1979 | >= 1400 | FAIL |
+| FP64 | 46.9 | 67 | >= 63 | FAIL |
+| INT8 | 100.4 | 1979 | >= 1536 | FAIL |
+
+**Verdict: FAIL** (absolute TFLOPS thresholds; worst efficiency 5.1%)
+
+### Compute Consistency
+
+| DType | Min | Mean | Max | Spread | Limit | Status |
+|-------|-----|------|-----|--------|-------|--------|
+| FP32 | 51.9 | 52.0 | 52.1 | 0.38% | <= 3% | PASS |
+| TF32 | 361.0 | 364.9 | 369.0 | 2.19% | <= 3% | PASS |
+| FP16 | 667.3 | 680.0 | 688.0 | 3.04% | <= 3% | FAIL |
+| BF16 | 703.0 | 713.3 | 735.7 | 4.58% | <= 3% | FAIL |
+| FP8 | 1156.9 | 1170.5 | 1186.1 | 2.49% | <= 3% | PASS |
+| FP64 | 45.9 | 46.9 | 47.5 | 3.41% | <= 3% | FAIL |
+| INT8 | 100.4 | 100.4 | 100.4 | 0.00% | <= 3% | PASS |
+
+### Compute Per-GPU TFLOPS
+
+| GPU | FP32 | TF32 | FP16 | BF16 | FP8 | FP64 | INT8 |
+|---|---|---|---|---|---|---|---|
+| 0 | 52.0 | 369.0 | 688.0 | 735.7 | 1186.1 | 47.5 | 100.4 |
+| 1 | 51.9 | 365.6 | 675.3 | 711.6 | 1171.0 | 47.0 | 100.4 |
+| 2 | 51.9 | 364.9 | 685.7 | 715.3 | 1175.3 | 47.1 | 100.4 |
+| 3 | 51.9 | 364.0 | 679.9 | 704.0 | 1167.6 | 47.4 | 100.4 |
+| 4 | 51.9 | 367.7 | 681.2 | 719.0 | 1178.0 | 46.6 | 100.4 |
+| 5 | 52.0 | 364.3 | 680.8 | 712.3 | 1165.5 | 46.8 | 100.4 |
+| 6 | 52.1 | 362.9 | 681.8 | 703.0 | 1156.9 | 46.9 | 100.4 |
+| 7 | 51.9 | 361.0 | 667.3 | 705.3 | 1163.2 | 45.9 | 100.4 |
+
+## NVLink/NVSwitch
+
+**Overall: PASS**
+
+| GPU | Active Links | Issues |
+|-----|--------------|--------|
+| 0 | 18/18 | OK |
+| 1 | 18/18 | OK |
+| 2 | 18/18 | OK |
+| 3 | 18/18 | OK |
+| 4 | 18/18 | OK |
+| 5 | 18/18 | OK |
+| 6 | 18/18 | OK |
+| 7 | 18/18 | OK |
+
+## DCGM Diagnostic
+
+**Overall: PASS**
+
+| Subtest | Status |
+|---------|--------|
+| Deployment/software/GPU0 | PASS |
+| Deployment/software/GPU1 | PASS |
+| Deployment/software/GPU2 | PASS |
+| Deployment/software/GPU3 | PASS |
+| Deployment/software/GPU4 | PASS |
+| Deployment/software/GPU5 | PASS |
+| Deployment/software/GPU6 | PASS |
+| Deployment/software/GPU7 | PASS |
+| Deployment/software/summary | PASS |
+| Hardware/memory/GPU0 | PASS |
+| Hardware/memory/GPU1 | PASS |
+| Hardware/memory/GPU2 | PASS |
+| Hardware/memory/GPU3 | PASS |
+| Hardware/memory/GPU4 | PASS |
+| Hardware/memory/GPU5 | PASS |
+| Hardware/memory/GPU6 | PASS |
+| Hardware/memory/GPU7 | PASS |
+| Hardware/memory/summary | PASS |
+| Hardware/diagnostic/GPU0 | PASS |
+| Hardware/diagnostic/GPU1 | PASS |
+| Hardware/diagnostic/GPU2 | PASS |
+| Hardware/diagnostic/GPU3 | PASS |
+| Hardware/diagnostic/GPU4 | PASS |
+| Hardware/diagnostic/GPU5 | PASS |
+| Hardware/diagnostic/GPU6 | PASS |
+| Hardware/diagnostic/GPU7 | PASS |
+| Hardware/diagnostic/summary | PASS |
+| Hardware/nvbandwidth/GPU0 | PASS |
+| Hardware/nvbandwidth/GPU1 | PASS |
+| Hardware/nvbandwidth/GPU2 | PASS |
+| Hardware/nvbandwidth/GPU3 | PASS |
+| Hardware/nvbandwidth/GPU4 | PASS |
+| Hardware/nvbandwidth/GPU5 | PASS |
+| Hardware/nvbandwidth/GPU6 | PASS |
+| Hardware/nvbandwidth/GPU7 | PASS |
+| Hardware/nvbandwidth/summary | PASS |
+| Integration/pcie/GPU0 | PASS |
+| Integration/pcie/GPU1 | PASS |
+| Integration/pcie/GPU2 | PASS |
+| Integration/pcie/GPU3 | PASS |
+| Integration/pcie/GPU4 | PASS |
+| Integration/pcie/GPU5 | PASS |
+| Integration/pcie/GPU6 | PASS |
+| Integration/pcie/GPU7 | PASS |
+| Integration/pcie/summary | PASS |
+| Stress/targeted_stress/GPU0 | PASS |
+| Stress/targeted_stress/GPU1 | PASS |
+| Stress/targeted_stress/GPU2 | PASS |
+| Stress/targeted_stress/GPU3 | PASS |
+| Stress/targeted_stress/GPU4 | PASS |
+| Stress/targeted_stress/GPU5 | PASS |
+| Stress/targeted_stress/GPU6 | PASS |
+| Stress/targeted_stress/GPU7 | PASS |
+| Stress/targeted_stress/summary | PASS |
+| Stress/targeted_power/GPU0 | PASS |
+| Stress/targeted_power/GPU1 | PASS |
+| Stress/targeted_power/GPU2 | PASS |
+| Stress/targeted_power/GPU3 | PASS |
+| Stress/targeted_power/GPU4 | PASS |
+| Stress/targeted_power/GPU5 | PASS |
+| Stress/targeted_power/GPU6 | PASS |
+| Stress/targeted_power/GPU7 | PASS |
+| Stress/targeted_power/summary | PASS |
+
+## NCCL Multi-GPU
+
+Source: nccl-tests | GPUs: 8
+
+| Operation | Bus BW (GB/s) | Threshold | Status |
+|-----------|---------------|-----------|--------|
+| allreduce | 472.3 | >= 405 | FAIL |
+| alltoall | 343.3 | >= 315 | FAIL |
+| broadcast | 364.1 | >= 360 | FAIL |
+| reducescatter | 352.8 | >= 405 | FAIL |
+| allgather | 366.4 | >= 405 | FAIL |
+| sendrecv | 369.0 | >= 360 | FAIL |
+
+### NCCL allreduce by size
+
+| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status |
+|------|---------------------|-------|------|--------|-----------|--------|
+| 1M | 24.9, 25.0, 24.7 | 24.7 | 24.9 | 0.50% | >= 405 | FAIL |
+| 256M | 421.6, 421.8, 421.6 | 421.6 | 421.7 | 0.02% | >= 405 | PASS |
+| 2G | 472.8, 472.7, 471.5 | 471.5 | 472.3 | 0.13% | >= 405 | PASS |
+
+### NCCL alltoall by size
+
+| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status |
+|------|---------------------|-------|------|--------|-----------|--------|
+| 1M | 8.1, 8.0, 8.0 | 8.0 | 8.0 | 0.59% | >= 315 | FAIL |
+| 256M | 305.3, 314.9, 313.1 | 305.3 | 311.1 | 1.34% | >= 315 | FAIL |
+| 2G | 342.1, 342.5, 345.4 | 342.1 | 343.3 | 0.43% | >= 315 | PASS |
+
+### NCCL broadcast by size
+
+| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status |
+|------|---------------------|-------|------|--------|-----------|--------|
+| 1M | 14.5, 14.6, 14.2 | 14.2 | 14.4 | 1.18% | >= 360 | FAIL |
+| 256M | 344.2, 345.9, 344.6 | 344.2 | 344.9 | 0.21% | >= 360 | FAIL |
+| 2G | 364.2, 364.0, 364.1 | 364.0 | 364.1 | 0.02% | >= 360 | PASS |
+
+### NCCL reducescatter by size
+
+| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status |
+|------|---------------------|-------|------|--------|-----------|--------|
+| 1M | 14.1, 13.8, 14.2 | 13.8 | 14.0 | 1.21% | >= 405 | FAIL |
+| 256M | 328.6, 328.3, 328.2 | 328.2 | 328.4 | 0.05% | >= 405 | FAIL |
+| 2G | 352.6, 352.4, 353.3 | 352.4 | 352.8 | 0.11% | >= 405 | FAIL |
+
+### NCCL allgather by size
+
+| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status |
+|------|---------------------|-------|------|--------|-----------|--------|
+| 1M | 14.6, 14.3, 14.4 | 14.3 | 14.4 | 0.86% | >= 405 | FAIL |
+| 256M | 350.5, 350.4, 349.9 | 349.9 | 350.3 | 0.07% | >= 405 | FAIL |
+| 2G | 366.3, 366.6, 366.2 | 366.2 | 366.4 | 0.05% | >= 405 | FAIL |
+
+### NCCL sendrecv by size
+
+| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status |
+|------|---------------------|-------|------|--------|-----------|--------|
+| 1M | 18.4, 18.4, 18.4 | 18.4 | 18.4 | 0.00% | >= 360 | FAIL |
+| 256M | 350.9, 351.6, 351.4 | 350.9 | 351.3 | 0.08% | >= 360 | FAIL |
+| 2G | 368.9, 369.1, 368.9 | 368.9 | 369.0 | 0.03% | >= 360 | PASS |
+
+**Overall: FAIL**
+
+## Stress Test
+
+- **Source:** pytorch
+- **Duration:** 1800s (requested 1800s)
+- **Telemetry samples:** 1266
+- **Max temp:** {0: 60.0, 1: 60.0, 2: 68.0, 3: 56.0, 4: 60.0, 5: 68.0, 6: 64.0, 7: 56.0}
+- **Avg power:** {0: 697.7, 1: 697.5, 2: 697.1, 3: 697.8, 4: 697.8, 5: 697.9, 6: 697.7, 7: 698.3}
+- **Temp delta:** 12.0 C
+- **TFLOPS jitter:** 4.37%
+- **Steady TFLOPS samples:** 37672
+- **Throttle events:** 9712
+- **XID events:** 0
+- **Failure reasons:**
+  - GPU temperature delta 12.0C exceeds 5.0C
+  - non-idle throttle reasons observed in 9712 samples (first: GPU 0 0x4)
+- **Result: FAIL**
+
+## RDMA/InfiniBand
+
+### RDMA Port Checks
+
+| Device | Port | State | Rate | Required | Status |
+|--------|------|-------|------|----------|--------|
+| mlx5_0 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS |
+| mlx5_1 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS |
+| mlx5_4 | 1 | 4: ACTIVE | 100 Gb/sec (2X HDR) | >= 400Gbps ACTIVE | FAIL |
+| mlx5_5 | 1 | 4: ACTIVE | 100 Gb/sec (2X HDR) | >= 400Gbps ACTIVE | FAIL |
+| mlx5_6 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS |
+| mlx5_7 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS |
+
+| Test | Value | Threshold | Status |
+|------|-------|-----------|--------|
+| ib_write_bw | 49.5 GB/s | >= 47 GB/s | PASS |
+| ib_read_bw | 39.1 GB/s | >= 47 GB/s | FAIL |
+| ib_write_lat | 1.25 us | <= 2 us | PASS |
+| ib_read_lat | 2.60 us | <= 3.5 us | PASS |
+| ibping | local_loopback target=0x58 count=5 | 0% packet loss | PASS |
+
+- **PFC/ECN/CNP/congestion counters checked:** 146
+- **PFC/ECN/CNP/congestion non-zero:** no
+- **Failure reasons:**
+  - mlx5_4 port 1 state/rate failed (4: ACTIVE, 100 Gb/sec (2X HDR); required >= 400.0Gbps ACTIVE)
+  - mlx5_5 port 1 state/rate failed (4: ACTIVE, 100 Gb/sec (2X HDR); required >= 400.0Gbps ACTIVE)
+  - ib_read_bw bandwidth 39.12GB/s < 47GB/s
+**Overall: FAIL**
+
+## Training Simulation
+
+| Metric | Value |
+|--------|-------|
+| Model | synthetic_transformer_1.5b |
+| Params | 1470.5M |
+| Throughput | 216498 tokens/sec |
+| Avg Step Time | 75.7 ms |
+| Warmup Steps | 5 |
+| Peak Memory | 18.1 GB |
+| Final Loss | 0.0039 |
+| Step Jitter | 1.89% |
+| Distributed Mode | ddp |
+| Verdict | PASS (216498 tokens/sec) |
+
+---
+*Generated by GPU Test Suite v0.2.0*
\ No newline at end of file
diff --git a/reports_test_all_latest_aikubeworker0016_20260522_203447.md b/reports_test_all_latest_aikubeworker0016_20260522_203447.md
new file mode 100644
index 0000000..3a4077f
--- /dev/null
+++ b/reports_test_all_latest_aikubeworker0016_20260522_203447.md
@@ -0,0 +1,322 @@
+# GPU Test Report
+
+- **Date:** 2026-05-22T20:34:52.129246
+- **Host:** aikubeworker0016
+- **GPU:** NVIDIA H100 80GB HBM3 x8
+- **Driver:** 580.159.03 | **CUDA:** 13.0
+
+## Overall Acceptance Verdict
+
+**Result: FAIL**
+
+Failed or unverified items:
+- Compute Throughput: FAIL (BF16 spread 3.44% > 3%)
+- NCCL: FAIL
+- Stress Test: FAIL
+- RDMA: FAIL
+
+## Summary
+
+| Test | Result |
+|------|--------|
+| GPU Info | PASS (8 GPUs detected) |
+| Health Check | PASS |
+| Memory Bandwidth | PASS (108.1%) |
+| Compute Throughput | FAIL (BF16 spread 3.44% > 3%) |
+| NVLink/NVSwitch | PASS |
+| DCGM | PASS |
+| NCCL | FAIL |
+| Stress Test | FAIL |
+| RDMA | FAIL |
+| Training | PASS (216683 tokens/sec) |
+
+## GPU Information
+
+| GPU | Model | VRAM | Temp | Power | SM Clock |
+|-----|-------|------|------|-------|----------|
+| 0 | NVIDIA H100 80GB HBM3 | 81559 MB | 20C | 70/700W | 345 MHz |
+| 1 | NVIDIA H100 80GB HBM3 | 81559 MB | 21C | 68/700W | 345 MHz |
+| 2 | NVIDIA H100 80GB HBM3 | 81559 MB | 21C | 67/700W | 345 MHz |
+| 3 | NVIDIA H100 80GB HBM3 | 81559 MB | 20C | 67/700W | 345 MHz |
+| 4 | NVIDIA H100 80GB HBM3 | 81559 MB | 20C | 68/700W | 345 MHz |
+| 5 | NVIDIA H100 80GB HBM3 | 81559 MB | 22C | 69/700W | 345 MHz |
+| 6 | NVIDIA H100 80GB HBM3 | 81559 MB | 20C | 68/700W | 345 MHz |
+| 7 | NVIDIA H100 80GB HBM3 | 81559 MB | 20C | 66/700W | 345 MHz |
+
+## Health Check
+
+**Overall: PASS**
+
+| GPU | Temp | Power | ECC | PCIe | Throttle | Status |
+|-----|------|-------|-----|------|----------|--------|
+| 0 | 20C PASS | 70W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** |
+| 1 | 21C PASS | 68W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** |
+| 2 | 21C PASS | 67W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** |
+| 3 | 20C PASS | 67W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** |
+| 4 | 20C PASS | 68W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** |
+| 5 | 22C PASS | 69W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** |
+| 6 | 20C PASS | 68W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** |
+| 7 | 20C PASS | 66W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** |
+
+## Memory Bandwidth
+
+Source: nvbandwidth
+
+| Metric | Value | Peak | Efficiency |
+|--------|-------|------|------------|
+| H2D (PCIe) | 55.4 GB/s | 64 GB/s | 86.6% |
+| D2H (PCIe) | 54.4 GB/s | 64 GB/s | 85.0% |
+| D2D (NVLink) | 486.6 GB/s | 450 GB/s | 108.1% |
+
+**Verdict: PASS** (D2D efficiency 108.1%)
+
+## Compute Throughput
+
+| DType | Achieved (TFLOPS) | Peak | Threshold | Status |
+|-------|-------------------|------|------------|--------|
+| FP32 | 52.1 | 67 | >= 54 | FAIL |
+| TF32 | 366.7 | 495 | >= 444 | FAIL |
+| FP16 | 682.7 | 990 | >= 734 | FAIL |
+| BF16 | 717.3 | 990 | >= 745 | FAIL |
+| FP8 | 1173.5 | 1979 | >= 1400 | FAIL |
+| FP64 | 47.4 | 67 | >= 63 | FAIL |
+| INT8 | 100.4 | 1979 | >= 1536 | FAIL |
+
+**Verdict: FAIL** (absolute TFLOPS thresholds; worst efficiency 5.1%)
+
+### Compute Consistency
+
+| DType | Min | Mean | Max | Spread | Limit | Status |
+|-------|-----|------|-----|--------|-------|--------|
+| FP32 | 51.9 | 52.1 | 52.2 | 0.58% | <= 3% | PASS |
+| TF32 | 362.3 | 366.7 | 369.2 | 1.88% | <= 3% | PASS |
+| FP16 | 674.4 | 682.7 | 693.1 | 2.74% | <= 3% | PASS |
+| BF16 | 705.3 | 717.2 | 730.0 | 3.44% | <= 3% | FAIL |
+| FP8 | 1155.2 | 1173.5 | 1186.2 | 2.64% | <= 3% | PASS |
+| FP64 | 46.3 | 47.4 | 48.5 | 4.64% | <= 3% | FAIL |
+| INT8 | 100.4 | 100.4 | 100.4 | 0.00% | <= 3% | PASS |
+
+### Compute Per-GPU TFLOPS
+
+| GPU | FP32 | TF32 | FP16 | BF16 | FP8 | FP64 | INT8 |
+|---|---|---|---|---|---|---|---|
+| 0 | 52.2 | 362.3 | 674.4 | 714.3 | 1159.0 | 46.3 | 100.4 |
+| 1 | 51.9 | 366.5 | 674.7 | 721.4 | 1185.4 | 47.7 | 100.4 |
+| 2 | 52.2 | 367.4 | 693.1 | 730.0 | 1185.7 | 48.5 | 100.4 |
+| 3 | 52.2 | 367.8 | 682.2 | 708.2 | 1163.4 | 47.4 | 100.4 |
+| 4 | 52.0 | 366.4 | 686.9 | 714.1 | 1186.2 | 47.3 | 100.4 |
+| 5 | 52.0 | 369.2 | 679.9 | 721.1 | 1155.2 | 47.3 | 100.4 |
+| 6 | 51.9 | 365.1 | 677.7 | 705.3 | 1169.0 | 47.0 | 100.4 |
+| 7 | 52.2 | 369.0 | 692.8 | 723.5 | 1184.3 | 47.6 | 100.4 |
+
+## NVLink/NVSwitch
+
+**Overall: PASS**
+
+| GPU | Active Links | Issues |
+|-----|--------------|--------|
+| 0 | 18/18 | OK |
+| 1 | 18/18 | OK |
+| 2 | 18/18 | OK |
+| 3 | 18/18 | OK |
+| 4 | 18/18 | OK |
+| 5 | 18/18 | OK |
+| 6 | 18/18 | OK |
+| 7 | 18/18 | OK |
+
+## DCGM Diagnostic
+
+**Overall: PASS**
+
+| Subtest | Status |
+|---------|--------|
+| Deployment/software/GPU0 | PASS |
+| Deployment/software/GPU1 | PASS |
+| Deployment/software/GPU2 | PASS |
+| Deployment/software/GPU3 | PASS |
+| Deployment/software/GPU4 | PASS |
+| Deployment/software/GPU5 | PASS |
+| Deployment/software/GPU6 | PASS |
+| Deployment/software/GPU7 | PASS |
+| Deployment/software/summary | PASS |
+| Hardware/memory/GPU0 | PASS |
+| Hardware/memory/GPU1 | PASS |
+| Hardware/memory/GPU2 | PASS |
+| Hardware/memory/GPU3 | PASS |
+| Hardware/memory/GPU4 | PASS |
+| Hardware/memory/GPU5 | PASS |
+| Hardware/memory/GPU6 | PASS |
+| Hardware/memory/GPU7 | PASS |
+| Hardware/memory/summary | PASS |
+| Hardware/diagnostic/GPU0 | PASS |
+| Hardware/diagnostic/GPU1 | PASS |
+| Hardware/diagnostic/GPU2 | PASS |
+| Hardware/diagnostic/GPU3 | PASS |
+| Hardware/diagnostic/GPU4 | PASS |
+| Hardware/diagnostic/GPU5 | PASS |
+| Hardware/diagnostic/GPU6 | PASS |
+| Hardware/diagnostic/GPU7 | PASS |
+| Hardware/diagnostic/summary | PASS |
+| Hardware/nvbandwidth/GPU0 | PASS |
+| Hardware/nvbandwidth/GPU1 | PASS |
+| Hardware/nvbandwidth/GPU2 | PASS |
+| Hardware/nvbandwidth/GPU3 | PASS |
+| Hardware/nvbandwidth/GPU4 | PASS |
+| Hardware/nvbandwidth/GPU5 | PASS |
+| Hardware/nvbandwidth/GPU6 | PASS |
+| Hardware/nvbandwidth/GPU7 | PASS |
+| Hardware/nvbandwidth/summary | PASS |
+| Integration/pcie/GPU0 | PASS |
+| Integration/pcie/GPU1 | PASS |
+| Integration/pcie/GPU2 | PASS |
+| Integration/pcie/GPU3 | PASS |
+| Integration/pcie/GPU4 | PASS |
+| Integration/pcie/GPU5 | PASS |
+| Integration/pcie/GPU6 | PASS |
+| Integration/pcie/GPU7 | PASS |
+| Integration/pcie/summary | PASS |
+| Stress/targeted_stress/GPU0 | PASS |
+| Stress/targeted_stress/GPU1 | PASS |
+| Stress/targeted_stress/GPU2 | PASS |
+| Stress/targeted_stress/GPU3 | PASS |
+| Stress/targeted_stress/GPU4 | PASS |
+| Stress/targeted_stress/GPU5 | PASS |
+| Stress/targeted_stress/GPU6 | PASS |
+| Stress/targeted_stress/GPU7 | PASS |
+| Stress/targeted_stress/summary | PASS |
+| Stress/targeted_power/GPU0 | PASS |
+| Stress/targeted_power/GPU1 | PASS |
+| Stress/targeted_power/GPU2 | PASS |
+| Stress/targeted_power/GPU3 | PASS |
+| Stress/targeted_power/GPU4 | PASS |
+| Stress/targeted_power/GPU5 | PASS |
+| Stress/targeted_power/GPU6 | PASS |
+| Stress/targeted_power/GPU7 | PASS |
+| Stress/targeted_power/summary | PASS |
+
+## NCCL Multi-GPU
+
+Source: nccl-tests | GPUs: 8
+
+| Operation | Bus BW (GB/s) | Threshold | Status |
+|-----------|---------------|-----------|--------|
+| allreduce | 472.4 | >= 405 | FAIL |
+| alltoall | 344.3 | >= 315 | FAIL |
+| broadcast | 363.6 | >= 360 | FAIL |
+| reducescatter | 353.1 | >= 405 | FAIL |
+| allgather | 366.4 | >= 405 | FAIL |
+| sendrecv | 368.9 | >= 360 | FAIL |
+
+### NCCL allreduce by size
+
+| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status |
+|------|---------------------|-------|------|--------|-----------|--------|
+| 1M | 24.9, 24.4, 24.9 | 24.4 | 24.7 | 0.95% | >= 405 | FAIL |
+| 256M | 421.9, 421.1, 421.9 | 421.1 | 421.6 | 0.09% | >= 405 | PASS |
+| 2G | 472.6, 472.0, 472.5 | 472.0 | 472.4 | 0.06% | >= 405 | PASS |
+
+### NCCL alltoall by size
+
+| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status |
+|------|---------------------|-------|------|--------|-----------|--------|
+| 1M | 7.9, 7.8, 8.1 | 7.8 | 7.9 | 1.57% | >= 315 | FAIL |
+| 256M | 298.7, 312.7, 303.2 | 298.7 | 304.9 | 1.91% | >= 315 | FAIL |
+| 2G | 342.2, 345.4, 345.2 | 342.2 | 344.3 | 0.43% | >= 315 | PASS |
+
+### NCCL broadcast by size
+
+| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status |
+|------|---------------------|-------|------|--------|-----------|--------|
+| 1M | 14.5, 14.3, 14.4 | 14.3 | 14.4 | 0.57% | >= 360 | FAIL |
+| 256M | 344.1, 344.3, 344.8 | 344.1 | 344.4 | 0.09% | >= 360 | FAIL |
+| 2G | 364.0, 363.6, 363.3 | 363.3 | 363.6 | 0.08% | >= 360 | PASS |
+
+### NCCL reducescatter by size
+
+| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status |
+|------|---------------------|-------|------|--------|-----------|--------|
+| 1M | 14.0, 14.2, 14.3 | 14.0 | 14.2 | 0.88% | >= 405 | FAIL |
+| 256M | 328.8, 328.7, 328.4 | 328.4 | 328.6 | 0.05% | >= 405 | FAIL |
+| 2G | 351.9, 353.8, 353.6 | 351.9 | 353.1 | 0.24% | >= 405 | FAIL |
+
+### NCCL allgather by size
+
+| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status |
+|------|---------------------|-------|------|--------|-----------|--------|
+| 1M | 14.4, 13.9, 14.0 | 13.9 | 14.1 | 1.53% | >= 405 | FAIL |
+| 256M | 350.2, 350.4, 350.7 | 350.2 | 350.4 | 0.06% | >= 405 | FAIL |
+| 2G | 366.9, 366.4, 366.0 | 366.0 | 366.4 | 0.10% | >= 405 | FAIL |
+
+### NCCL sendrecv by size
+
+| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status |
+|------|---------------------|-------|------|--------|-----------|--------|
+| 1M | 18.4, 18.3, 18.5 | 18.3 | 18.4 | 0.44% | >= 360 | FAIL |
+| 256M | 351.1, 351.4, 351.3 | 351.1 | 351.3 | 0.04% | >= 360 | FAIL |
+| 2G | 368.9, 368.8, 368.9 | 368.8 | 368.9 | 0.01% | >= 360 | PASS |
+
+**Overall: FAIL**
+
+## Stress Test
+
+- **Source:** pytorch
+- **Duration:** 1800s (requested 1800s)
+- **Telemetry samples:** 1295
+- **Max temp:** {0: 51.0, 1: 59.0, 2: 61.0, 3: 53.0, 4: 53.0, 5: 62.0, 6: 56.0, 7: 52.0}
+- **Avg power:** {0: 698.8, 1: 697.8, 2: 698.1, 3: 697.9, 4: 697.9, 5: 698.2, 6: 698.0, 7: 697.8}
+- **Temp delta:** 11.0 C
+- **TFLOPS jitter:** 3.4%
+- **Steady TFLOPS samples:** 37874
+- **Throttle events:** 9944
+- **XID events:** 0
+- **Failure reasons:**
+  - GPU temperature delta 11.0C exceeds 5.0C
+  - non-idle throttle reasons observed in 9944 samples (first: GPU 0 0x4)
+- **Result: FAIL**
+
+## RDMA/InfiniBand
+
+### RDMA Port Checks
+
+| Device | Port | State | Rate | Required | Status |
+|--------|------|-------|------|----------|--------|
+| mlx5_0 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS |
+| mlx5_1 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS |
+| mlx5_4 | 1 | 4: ACTIVE | 100 Gb/sec (2X HDR) | >= 400Gbps ACTIVE | FAIL |
+| mlx5_5 | 1 | 4: ACTIVE | 100 Gb/sec (2X HDR) | >= 400Gbps ACTIVE | FAIL |
+| mlx5_6 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS |
+| mlx5_7 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS |
+
+| Test | Value | Threshold | Status |
+|------|-------|-----------|--------|
+| ib_write_bw | 48.6 GB/s | >= 47 GB/s | PASS |
+| ib_read_bw | 40.3 GB/s | >= 47 GB/s | FAIL |
+| ib_write_lat | 1.29 us | <= 2 us | PASS |
+| ib_read_lat | 2.59 us | <= 3.5 us | PASS |
+| ibping | local_loopback target=0x4b count=5 | 0% packet loss | PASS |
+
+- **PFC/ECN/CNP/congestion counters checked:** 146
+- **PFC/ECN/CNP/congestion non-zero:** no
+- **Failure reasons:**
+  - mlx5_4 port 1 state/rate failed (4: ACTIVE, 100 Gb/sec (2X HDR); required >= 400.0Gbps ACTIVE)
+  - mlx5_5 port 1 state/rate failed (4: ACTIVE, 100 Gb/sec (2X HDR); required >= 400.0Gbps ACTIVE)
+  - ib_read_bw bandwidth 40.29GB/s < 47GB/s
+**Overall: FAIL**
+
+## Training Simulation
+
+| Metric | Value |
+|--------|-------|
+| Model | synthetic_transformer_1.5b |
+| Params | 1470.5M |
+| Throughput | 216683 tokens/sec |
+| Avg Step Time | 75.6 ms |
+| Warmup Steps | 5 |
+| Peak Memory | 18.1 GB |
+| Final Loss | 0.0039 |
+| Step Jitter | 1.2% |
+| Distributed Mode | ddp |
+| Verdict | PASS (216683 tokens/sec) |
+
+---
+*Generated by GPU Test Suite v0.2.0*
\ No newline at end of file
diff --git a/reports_test_all_latest_summary_cn_20260523.md b/reports_test_all_latest_summary_cn_20260523.md
new file mode 100644
index 0000000..9ef9449
--- /dev/null
+++ b/reports_test_all_latest_summary_cn_20260523.md
@@ -0,0 +1,101 @@
+# H100 单节点 test all 中文汇总
+
+生成时间：2026-05-23  
+测试范围：`aikubeworker0012`、`aikubeworker0016` 单节点 `python gpu_tester.py --test all --report --format md`
+
+原始报告：
+
+- `reports_test_all_latest_aikubeworker0012_20260522_203246.md`
+- `reports_test_all_latest_aikubeworker0016_20260522_203447.md`
+
+## 总结论
+
+| 机器 | Suite | PDF 验收结论 | 主要失败项 |
+|---|---:|---|---|
+| aikubeworker0012 | 6/10 PASS | FAIL | Compute、NCCL、Stress、RDMA |
+| aikubeworker0016 | 6/10 PASS | FAIL | Compute、NCCL、Stress、RDMA |
+
+按 PDF 口径，任一必测子项 FAIL，则整机 FAIL。因此两台机器当前都不通过生产验收。
+
+## 通过项
+
+| 项目 | aikubeworker0012 | aikubeworker0016 | 说明 |
+|---|---|---|---|
+| GPU Info | PASS | PASS | 8 张 H100 |
+| Health | PASS | PASS | 温度、空闲功耗、ECC、PCIe、空闲 throttle 正常 |
+| Memory Bandwidth | PASS | PASS | D2D 效率均约 108.1% |
+| NVLink/NVSwitch | PASS | PASS | 8 卡均 18/18 links |
+| DCGM diag -r 3 | PASS | PASS | software、memory、diagnostic、nvbandwidth、pcie、targeted stress/power 全 PASS |
+| Training Simulation | PASS | PASS | 8 卡 DDP synthetic 1.5B，loss finite |
+
+Training 结果：
+
+| 机器 | Throughput | Step jitter | Peak memory | Verdict |
+|---|---:|---:|---:|---|
+| aikubeworker0012 | 216498 tokens/s | 1.89% | 18.08 GB | PASS |
+| aikubeworker0016 | 216683 tokens/s | 1.20% | 18.08 GB | PASS |
+
+## 失败项
+
+### Compute
+
+两台机器都未达到当前 H100 绝对 TFLOPS 阈值，且部分 dtype 的跨 GPU spread 超过 3%。
+
+| 机器 | 代表性失败 |
+|---|---|
+| aikubeworker0012 | FP16 spread 3.04%，BF16 spread 4.58%，FP64 spread 3.41%；FP32/TF32/FP16/BF16/FP8/FP64/INT8 绝对阈值均 FAIL |
+| aikubeworker0016 | BF16 spread 3.44%，FP64 spread 4.64%；FP32/TF32/FP16/BF16/FP8/FP64/INT8 绝对阈值均 FAIL |
+
+### NCCL
+
+NCCL 已经使用真实 `nccl-tests` bus BW，不是 torchrun fallback。失败主要来自小 size 以及部分 256M/2G op 未达阈值。
+
+| 机器 | allreduce best | alltoall best | broadcast best | reducescatter best | allgather best | sendrecv best | Verdict |
+|---|---:|---:|---:|---:|---:|---:|---|
+| aikubeworker0012 | 472.3 | 343.3 | 364.1 | 352.8 | 366.4 | 369.0 | FAIL |
+| aikubeworker0016 | 472.4 | 344.3 | 363.6 | 353.1 | 366.4 | 368.9 | FAIL |
+
+关键原因：
+
+- `1M` size 在所有 op 上都明显低于阈值。
+- `reducescatter`、`allgather` 的 256M 和 2G 均低于 405 GB/s 阈值。
+- `broadcast`、`sendrecv` 的 256M 低于 360 GB/s 阈值，`alltoall` 的 256M 低于 315 GB/s 阈值。
+
+### Stress
+
+两台机器的 1800 秒 PyTorch BF16 GEMM 压力测试均跑满，但 telemetry 判定 FAIL。
+
+| 机器 | 平均稳态功耗 | 最高温度范围 | 温差 | TFLOPS jitter | throttle events | XID | Verdict |
+|---|---|---|---:|---:|---:|---:|---|
+| aikubeworker0012 | 约 697-698W/GPU | 56-68C | 12C | 4.37% | 9712 | 0 | FAIL |
+| aikubeworker0016 | 约 698W/GPU | 51-62C | 11C | 3.40% | 9944 | 0 | FAIL |
+
+失败原因：
+
+- GPU 间温差超过 5C 阈值。
+- 观测到大量非 idle throttle，首个原因是 `0x4`，即 `sw_power_cap`。
+
+### RDMA/InfiniBand
+
+本轮 `test all` 是单节点 RDMA 路径，`ibping` 显示为 `local_loopback`。这份结果不能替代跨节点 RDMA 验收，但仍反映单节点 perftest read bandwidth 未达标。
+
+| 机器 | ib_write_bw | ib_read_bw | ib_write_lat | ib_read_lat | Verdict |
+|---|---:|---:|---:|---:|---|
+| aikubeworker0012 | 49.5 GB/s PASS | 39.1 GB/s FAIL | 1.25 us PASS | 2.60 us PASS | FAIL |
+| aikubeworker0016 | 48.6 GB/s PASS | 40.3 GB/s FAIL | 1.29 us PASS | 2.59 us PASS | FAIL |
+
+另外，两台机器都有 `mlx5_4`、`mlx5_5` 处于 ACTIVE 但速率为 100 Gb/sec，低于当前 400G 端口阈值，因此 RDMA port check 也有 FAIL。
+
+## 当前阻塞
+
+1. Compute 阈值口径较严，当前实测绝对 TFLOPS 全 dtype 未达配置阈值，尤其 INT8 路径仅约 100 TFLOPS。
+2. NCCL 真实 bus BW 已可测，但多 op/size 未达 PDF 阈值。
+3. Stress 负载可跑满 30 分钟，但温差和 `sw_power_cap` throttle 导致 FAIL。
+4. 单节点 RDMA read bandwidth 未达 47 GB/s，且部分 IB 端口速率低于 400G。
+5. 跨节点 RDMA 需要继续使用单独 server/client 报告；不能把本轮 `local_loopback` 当作跨节点验收。
+
+## 状态判断
+
+脚本能力已经基本补齐到 PDF 验收口径，以下各项均已跑通：真实 nccl-tests、30 分钟 stress telemetry、NVLink、DCGM r3、RDMA perftest/ibping/counter、逐 GPU compute、8 卡 DDP training，以及“任一 FAIL 即整机 FAIL”的最终判定。
+
+当前剩余问题主要不是脚本缺项，而是两台机器的实际验收数据有多项未达标。
diff --git a/reports_test_all_pdf_aikubeworker0012_20260522_182656.md b/reports_test_all_pdf_aikubeworker0012_20260522_182656.md
new file mode 100644
index 0000000..283d875
--- /dev/null
+++ b/reports_test_all_pdf_aikubeworker0012_20260522_182656.md
@@ -0,0 +1,259 @@
+# GPU Test Report
+
+- **Date:** 2026-05-22T18:27:01.103760
+- **Host:** aikubeworker0012
+- **GPU:** NVIDIA H100 80GB HBM3 x8
+- **Driver:** 580.159.03 | **CUDA:** 13.0
+
+## Overall Acceptance Verdict
+
+**Result: FAIL**
+
+Failed or unverified items:
+- Compute Throughput: FAIL (worst FP32 52 vs >= 54)
+- DCGM: ERROR: dcgmi diag -r 3 timeout after 1200s
+- NCCL: FAIL
+- Stress Test: FAIL
+- RDMA: FAIL
+- Training: FAIL (188741 tokens/sec)
+
+## Summary
+
+| Test | Result |
+|------|--------|
+| GPU Info | PASS (8 GPUs detected) |
+| Health Check | PASS |
+| Memory Bandwidth | PASS (108.1%) |
+| Compute Throughput | FAIL (worst FP32 52 vs >= 54) |
+| NVLink/NVSwitch | PASS |
+| DCGM | ERROR: dcgmi diag -r 3 timeout after 1200s |
+| NCCL | FAIL |
+| Stress Test | FAIL |
+| RDMA | FAIL |
+| Training | FAIL (188741 tokens/sec) |
+
+## GPU Information
+
+| GPU | Model | VRAM | Temp | Power | SM Clock |
+|-----|-------|------|------|-------|----------|
+| 0 | NVIDIA H100 80GB HBM3 | 81559 MB | 25C | 70/700W | 345 MHz |
+| 1 | NVIDIA H100 80GB HBM3 | 81559 MB | 25C | 73/700W | 345 MHz |
+| 2 | NVIDIA H100 80GB HBM3 | 81559 MB | 26C | 69/700W | 345 MHz |
+| 3 | NVIDIA H100 80GB HBM3 | 81559 MB | 24C | 70/700W | 345 MHz |
+| 4 | NVIDIA H100 80GB HBM3 | 81559 MB | 24C | 69/700W | 345 MHz |
+| 5 | NVIDIA H100 80GB HBM3 | 81559 MB | 27C | 70/700W | 345 MHz |
+| 6 | NVIDIA H100 80GB HBM3 | 81559 MB | 25C | 71/700W | 345 MHz |
+| 7 | NVIDIA H100 80GB HBM3 | 81559 MB | 24C | 72/700W | 345 MHz |
+
+## Health Check
+
+**Overall: PASS**
+
+| GPU | Temp | Power | ECC | PCIe | Throttle | Status |
+|-----|------|-------|-----|------|----------|--------|
+| 0 | 25C PASS | 70W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** |
+| 1 | 25C PASS | 73W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** |
+| 2 | 26C PASS | 69W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** |
+| 3 | 24C PASS | 70W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** |
+| 4 | 24C PASS | 69W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** |
+| 5 | 27C PASS | 70W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** |
+| 6 | 25C PASS | 71W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** |
+| 7 | 24C PASS | 72W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** |
+
+## Memory Bandwidth
+
+Source: nvbandwidth
+
+| Metric | Value | Peak | Efficiency |
+|--------|-------|------|------------|
+| H2D (PCIe) | 55.5 GB/s | 64 GB/s | 86.7% |
+| D2H (PCIe) | 54.3 GB/s | 64 GB/s | 84.8% |
+| D2D (NVLink) | 486.6 GB/s | 450 GB/s | 108.1% |
+
+**Verdict: PASS** (D2D efficiency 108.1%)
+
+## Compute Throughput
+
+| DType | Achieved (TFLOPS) | Peak | Threshold | Status |
+|-------|-------------------|------|------------|--------|
+| FP32 | 52.0 | 67 | >= 54 | FAIL |
+| TF32 | 364.8 | 495 | >= 444 | FAIL |
+| FP16 | 685.0 | 990 | >= 734 | FAIL |
+| BF16 | 715.9 | 990 | >= 745 | FAIL |
+| FP8 | 1166.6 | 1979 | >= 1400 | FAIL |
+| FP64 | 46.9 | 0 | >= 63 | FAIL |
+| INT8 | 100.4 | 0 | >= 1536 | FAIL |
+
+**Verdict: FAIL** (absolute TFLOPS thresholds; worst efficiency 58.9%)
+
+### Compute Consistency
+
+| DType | Min | Mean | Max | Spread | Limit | Status |
+|-------|-----|------|-----|--------|-------|--------|
+| FP32 | 51.9 | 52.0 | 52.2 | 0.58% | <= 3% | PASS |
+| TF32 | 360.9 | 364.9 | 368.2 | 2.00% | <= 3% | PASS |
+| FP16 | 676.0 | 685.0 | 689.9 | 2.03% | <= 3% | PASS |
+| BF16 | 697.3 | 715.9 | 730.2 | 4.60% | <= 3% | FAIL |
+| FP8 | 1141.8 | 1166.6 | 1180.3 | 3.30% | <= 3% | FAIL |
+| FP64 | 45.8 | 46.9 | 47.7 | 4.05% | <= 3% | FAIL |
+| INT8 | 100.4 | 100.4 | 100.4 | 0.00% | <= 3% | PASS |
+
+### Compute Per-GPU TFLOPS
+
+| GPU | FP32 | TF32 | FP16 | BF16 | FP8 | FP64 | INT8 |
+|---|---|---|---|---|---|---|---|
+| 0 | 51.9 | 368.2 | 689.5 | 730.2 | 1180.3 | 47.1 | 100.4 |
+| 1 | 51.9 | 366.8 | 688.7 | 721.6 | 1170.1 | 47.7 | 100.4 |
+| 2 | 51.9 | 366.3 | 689.9 | 711.3 | 1167.8 | 47.2 | 100.4 |
+| 3 | 51.9 | 363.0 | 677.6 | 699.2 | 1176.3 | 46.6 | 100.4 |
+| 4 | 52.2 | 365.3 | 685.0 | 725.4 | 1163.0 | 46.8 | 100.4 |
+| 5 | 52.1 | 363.9 | 684.2 | 725.0 | 1172.1 | 46.9 | 100.4 |
+| 6 | 51.9 | 364.4 | 688.8 | 717.3 | 1161.2 | 46.9 | 100.4 |
+| 7 | 51.9 | 360.9 | 676.0 | 697.3 | 1141.8 | 45.8 | 100.4 |
+
+## NVLink/NVSwitch
+
+**Overall: PASS**
+
+| GPU | Active Links | Issues |
+|-----|--------------|--------|
+| 0 | 18/18 | OK |
+| 1 | 18/18 | OK |
+| 2 | 18/18 | OK |
+| 3 | 18/18 | OK |
+| 4 | 18/18 | OK |
+| 5 | 18/18 | OK |
+| 6 | 18/18 | OK |
+| 7 | 18/18 | OK |
+
+## DCGM Diagnostic
+
+**Overall: FAIL** (dcgmi diag -r 3 timeout after 1200s)
+
+## NCCL Multi-GPU
+
+Source: nccl-tests | GPUs: 8
+
+| Operation | Bus BW (GB/s) | Threshold | Status |
+|-----------|---------------|-----------|--------|
+| allreduce | 472.4 | >= 405 | FAIL |
+| alltoall | 344.4 | >= 315 | FAIL |
+| broadcast | 363.8 | >= 360 | FAIL |
+| reducescatter | 353.0 | >= 405 | FAIL |
+| allgather | 366.4 | >= 405 | FAIL |
+| sendrecv | 368.9 | >= 360 | FAIL |
+
+### NCCL allreduce by size
+
+| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status |
+|------|---------------------|-------|------|--------|-----------|--------|
+| 1M | 24.0, 24.9, 24.7 | 24.0 | 24.5 | 1.57% | >= 405 | FAIL |
+| 256M | 421.4, 421.7, 421.4 | 421.4 | 421.5 | 0.03% | >= 405 | PASS |
+| 2G | 471.8, 473.0, 472.3 | 471.8 | 472.4 | 0.10% | >= 405 | PASS |
+
+### NCCL alltoall by size
+
+| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status |
+|------|---------------------|-------|------|--------|-----------|--------|
+| 1M | 8.1, 8.0, 8.0 | 8.0 | 8.0 | 0.59% | >= 315 | FAIL |
+| 256M | 312.3, 310.9, 319.2 | 310.9 | 314.1 | 1.15% | >= 315 | FAIL |
+| 2G | 343.1, 346.2, 344.0 | 343.1 | 344.4 | 0.38% | >= 315 | PASS |
+
+### NCCL broadcast by size
+
+| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status |
+|------|---------------------|-------|------|--------|-----------|--------|
+| 1M | 14.6, 13.6, 14.5 | 13.6 | 14.2 | 3.16% | >= 360 | FAIL |
+| 256M | 343.8, 344.2, 344.5 | 343.8 | 344.2 | 0.08% | >= 360 | FAIL |
+| 2G | 363.5, 363.3, 364.7 | 363.3 | 363.8 | 0.17% | >= 360 | PASS |
+
+### NCCL reducescatter by size
+
+| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status |
+|------|---------------------|-------|------|--------|-----------|--------|
+| 1M | 14.1, 14.3, 14.3 | 14.1 | 14.2 | 0.66% | >= 405 | FAIL |
+| 256M | 328.1, 328.3, 328.3 | 328.1 | 328.2 | 0.03% | >= 405 | FAIL |
+| 2G | 354.0, 352.6, 352.3 | 352.3 | 353.0 | 0.21% | >= 405 | FAIL |
+
+### NCCL allgather by size
+
+| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status |
+|------|---------------------|-------|------|--------|-----------|--------|
+| 1M | 14.5, 14.5, 14.3 | 14.3 | 14.4 | 0.65% | >= 405 | FAIL |
+| 256M | 350.7, 350.7, 350.5 | 350.5 | 350.6 | 0.03% | >= 405 | FAIL |
+| 2G | 366.6, 366.3, 366.3 | 366.3 | 366.4 | 0.04% | >= 405 | FAIL |
+
+### NCCL sendrecv by size
+
+| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status |
+|------|---------------------|-------|------|--------|-----------|--------|
+| 1M | 18.5, 18.4, 18.1 | 18.1 | 18.3 | 0.93% | >= 360 | FAIL |
+| 256M | 352.3, 350.6, 350.5 | 350.5 | 351.1 | 0.24% | >= 360 | FAIL |
+| 2G | 368.8, 369.0, 368.8 | 368.8 | 368.9 | 0.03% | >= 360 | PASS |
+
+**Overall: FAIL**
+
+## Stress Test
+
+- **Source:** pytorch
+- **Duration:** 1800s (requested 1800s)
+- **Telemetry samples:** 1541
+- **Max temp:** {0: 60.0, 1: 60.0, 2: 68.0, 3: 56.0, 4: 60.0, 5: 68.0, 6: 65.0, 7: 56.0}
+- **Avg power:** {0: 697.7, 1: 697.4, 2: 697.2, 3: 697.7, 4: 697.5, 5: 698.0, 6: 697.8, 7: 698.4}
+- **Temp delta:** 12.0 C
+- **TFLOPS jitter:** 3.16%
+- **Steady TFLOPS samples:** 37676
+- **Throttle events:** 11912
+- **XID events:** 0
+- **Failure reasons:**
+  - GPU temperature delta 12.0C exceeds 5.0C
+  - non-idle throttle reasons observed in 11912 samples (first: GPU 0 0x4)
+- **Result: FAIL**
+
+## RDMA/InfiniBand
+
+### RDMA Port Checks
+
+| Device | Port | State | Rate | Required | Status |
+|--------|------|-------|------|----------|--------|
+| mlx5_0 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS |
+| mlx5_1 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS |
+| mlx5_4 | 1 | 4: ACTIVE | 100 Gb/sec (2X HDR) | >= 400Gbps ACTIVE | FAIL |
+| mlx5_5 | 1 | 4: ACTIVE | 100 Gb/sec (2X HDR) | >= 400Gbps ACTIVE | FAIL |
+| mlx5_6 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS |
+| mlx5_7 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS |
+
+| Test | Value | Threshold | Status |
+|------|-------|-----------|--------|
+| ib_write_bw | 49.2 GB/s | >= 47 GB/s | PASS |
+| ib_read_bw | 39.1 GB/s | >= 47 GB/s | FAIL |
+| ib_write_lat | 5.68 us | <= 2 us | FAIL |
+| ib_read_lat | 16.00 us | <= 3.5 us | FAIL |
+| ibping | target=0x58 count=5 | 0% packet loss | PASS |
+
+- **PFC/ECN/CNP/congestion counters checked:** 0
+- **PFC/ECN/CNP/congestion non-zero:** no
+- **Failure reasons:**
+  - mlx5_4 port 1 state/rate failed (4: ACTIVE, 100 Gb/sec (2X HDR); required >= 400.0Gbps ACTIVE)
+  - mlx5_5 port 1 state/rate failed (4: ACTIVE, 100 Gb/sec (2X HDR); required >= 400.0Gbps ACTIVE)
+  - ib_read_bw bandwidth 39.11GB/s < 47GB/s
+  - ib_write_lat latency 5.68us > 2.0us
+  - ib_read_lat latency 16.0us > 3.5us
+**Overall: FAIL**
+
+## Training Simulation
+
+| Metric | Value |
+|--------|-------|
+| Model | synthetic_transformer_1.5b |
+| Params | 1470.5M |
+| Throughput | 188741 tokens/sec |
+| Avg Step Time | 86.8 ms |
+| Peak Memory | 18.1 GB |
+| Final Loss | 0.0041 |
+| Step Jitter | 626.74% |
+| Distributed Mode | ddp |
+| Verdict | FAIL (188741 tokens/sec) |
+
+---
+*Generated by GPU Test Suite v0.2.0*
\ No newline at end of file
diff --git a/reports_test_all_pdf_aikubeworker0016_20260522_182856.md b/reports_test_all_pdf_aikubeworker0016_20260522_182856.md
new file mode 100644
index 0000000..dbee788
--- /dev/null
+++ b/reports_test_all_pdf_aikubeworker0016_20260522_182856.md
@@ -0,0 +1,259 @@
+# GPU Test Report
+
+- **Date:** 2026-05-22T18:29:01.245683
+- **Host:** aikubeworker0016
+- **GPU:** NVIDIA H100 80GB HBM3 x8
+- **Driver:** 580.159.03 | **CUDA:** 13.0
+
+## Overall Acceptance Verdict
+
+**Result: FAIL**
+
+Failed or unverified items:
+- Compute Throughput: FAIL (worst FP32 52 vs >= 54)
+- DCGM: ERROR: dcgmi diag -r 3 timeout after 1200s
+- NCCL: FAIL
+- Stress Test: FAIL
+- RDMA: FAIL
+- Training: FAIL (193836 tokens/sec)
+
+## Summary
+
+| Test | Result |
+|------|--------|
+| GPU Info | PASS (8 GPUs detected) |
+| Health Check | PASS |
+| Memory Bandwidth | PASS (108.1%) |
+| Compute Throughput | FAIL (worst FP32 52 vs >= 54) |
+| NVLink/NVSwitch | PASS |
+| DCGM | ERROR: dcgmi diag -r 3 timeout after 1200s |
+| NCCL | FAIL |
+| Stress Test | FAIL |
+| RDMA | FAIL |
+| Training | FAIL (193836 tokens/sec) |
+
+## GPU Information
+
+| GPU | Model | VRAM | Temp | Power | SM Clock |
+|-----|-------|------|------|-------|----------|
+| 0 | NVIDIA H100 80GB HBM3 | 81559 MB | 19C | 70/700W | 345 MHz |
+| 1 | NVIDIA H100 80GB HBM3 | 81559 MB | 20C | 67/700W | 345 MHz |
+| 2 | NVIDIA H100 80GB HBM3 | 81559 MB | 20C | 67/700W | 345 MHz |
+| 3 | NVIDIA H100 80GB HBM3 | 81559 MB | 19C | 67/700W | 345 MHz |
+| 4 | NVIDIA H100 80GB HBM3 | 81559 MB | 19C | 67/700W | 345 MHz |
+| 5 | NVIDIA H100 80GB HBM3 | 81559 MB | 21C | 69/700W | 345 MHz |
+| 6 | NVIDIA H100 80GB HBM3 | 81559 MB | 19C | 68/700W | 345 MHz |
+| 7 | NVIDIA H100 80GB HBM3 | 81559 MB | 19C | 66/700W | 345 MHz |
+
+## Health Check
+
+**Overall: PASS**
+
+| GPU | Temp | Power | ECC | PCIe | Throttle | Status |
+|-----|------|-------|-----|------|----------|--------|
+| 0 | 19C PASS | 70W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** |
+| 1 | 20C PASS | 67W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** |
+| 2 | 20C PASS | 67W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** |
+| 3 | 19C PASS | 67W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** |
+| 4 | 19C PASS | 67W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** |
+| 5 | 21C PASS | 69W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** |
+| 6 | 19C PASS | 68W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** |
+| 7 | 19C PASS | 66W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** |
+
+## Memory Bandwidth
+
+Source: nvbandwidth
+
+| Metric | Value | Peak | Efficiency |
+|--------|-------|------|------------|
+| H2D (PCIe) | 55.5 GB/s | 64 GB/s | 86.7% |
+| D2H (PCIe) | 54.7 GB/s | 64 GB/s | 85.5% |
+| D2D (NVLink) | 486.6 GB/s | 450 GB/s | 108.1% |
+
+**Verdict: PASS** (D2D efficiency 108.1%)
+
+## Compute Throughput
+
+| DType | Achieved (TFLOPS) | Peak | Threshold | Status |
+|-------|-------------------|------|------------|--------|
+| FP32 | 52.0 | 67 | >= 54 | FAIL |
+| TF32 | 366.2 | 495 | >= 444 | FAIL |
+| FP16 | 684.8 | 990 | >= 734 | FAIL |
+| BF16 | 720.7 | 990 | >= 745 | FAIL |
+| FP8 | 1180.3 | 1979 | >= 1400 | FAIL |
+| FP64 | 47.3 | 0 | >= 63 | FAIL |
+| INT8 | 100.5 | 0 | >= 1536 | FAIL |
+
+**Verdict: FAIL** (absolute TFLOPS thresholds; worst efficiency 59.6%)
+
+### Compute Consistency
+
+| DType | Min | Mean | Max | Spread | Limit | Status |
+|-------|-----|------|-----|--------|-------|--------|
+| FP32 | 51.9 | 52.0 | 52.2 | 0.58% | <= 3% | PASS |
+| TF32 | 361.1 | 366.2 | 368.9 | 2.13% | <= 3% | PASS |
+| FP16 | 672.6 | 684.8 | 695.0 | 3.27% | <= 3% | FAIL |
+| BF16 | 703.6 | 720.7 | 734.2 | 4.25% | <= 3% | FAIL |
+| FP8 | 1158.6 | 1180.3 | 1241.8 | 7.05% | <= 3% | FAIL |
+| FP64 | 46.7 | 47.3 | 48.0 | 2.75% | <= 3% | PASS |
+| INT8 | 100.4 | 100.5 | 101.1 | 0.70% | <= 3% | PASS |
+
+### Compute Per-GPU TFLOPS
+
+| GPU | FP32 | TF32 | FP16 | BF16 | FP8 | FP64 | INT8 |
+|---|---|---|---|---|---|---|---|
+| 0 | 51.9 | 361.1 | 673.3 | 703.6 | 1158.6 | 46.7 | 100.4 |
+| 1 | 52.0 | 367.0 | 684.0 | 725.7 | 1184.3 | 47.3 | 100.4 |
+| 2 | 52.2 | 368.7 | 695.0 | 734.2 | 1197.7 | 48.0 | 100.4 |
+| 3 | 51.9 | 367.8 | 688.0 | 708.1 | 1174.8 | 47.3 | 100.4 |
+| 4 | 52.0 | 365.2 | 688.4 | 718.2 | 1160.5 | 47.0 | 101.1 |
+| 5 | 52.1 | 368.9 | 684.2 | 733.7 | 1160.5 | 47.3 | 100.4 |
+| 6 | 51.9 | 364.0 | 672.6 | 715.6 | 1164.4 | 47.1 | 100.4 |
+| 7 | 51.9 | 367.0 | 692.5 | 726.5 | 1241.8 | 47.6 | 100.4 |
+
+## NVLink/NVSwitch
+
+**Overall: PASS**
+
+| GPU | Active Links | Issues |
+|-----|--------------|--------|
+| 0 | 18/18 | OK |
+| 1 | 18/18 | OK |
+| 2 | 18/18 | OK |
+| 3 | 18/18 | OK |
+| 4 | 18/18 | OK |
+| 5 | 18/18 | OK |
+| 6 | 18/18 | OK |
+| 7 | 18/18 | OK |
+
+## DCGM Diagnostic
+
+**Overall: FAIL** (dcgmi diag -r 3 timeout after 1200s)
+
+## NCCL Multi-GPU
+
+Source: nccl-tests | GPUs: 8
+
+| Operation | Bus BW (GB/s) | Threshold | Status |
+|-----------|---------------|-----------|--------|
+| allreduce | 472.5 | >= 405 | FAIL |
+| alltoall | 344.2 | >= 315 | FAIL |
+| broadcast | 363.8 | >= 360 | FAIL |
+| reducescatter | 352.5 | >= 405 | FAIL |
+| allgather | 366.8 | >= 405 | FAIL |
+| sendrecv | 369.0 | >= 360 | FAIL |
+
+### NCCL allreduce by size
+
+| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status |
+|------|---------------------|-------|------|--------|-----------|--------|
+| 1M | 24.7, 24.1, 24.5 | 24.1 | 24.4 | 1.02% | >= 405 | FAIL |
+| 256M | 421.8, 422.1, 421.4 | 421.4 | 421.8 | 0.07% | >= 405 | PASS |
+| 2G | 472.8, 472.2, 472.6 | 472.2 | 472.5 | 0.05% | >= 405 | PASS |
+
+### NCCL alltoall by size
+
+| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status |
+|------|---------------------|-------|------|--------|-----------|--------|
+| 1M | 8.0, 8.0, 7.9 | 7.9 | 8.0 | 0.59% | >= 315 | FAIL |
+| 256M | 326.8, 315.4, 315.8 | 315.4 | 319.3 | 1.65% | >= 315 | PASS |
+| 2G | 344.2, 343.8, 344.6 | 343.8 | 344.2 | 0.09% | >= 315 | PASS |
+
+### NCCL broadcast by size
+
+| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status |
+|------|---------------------|-------|------|--------|-----------|--------|
+| 1M | 14.4, 14.2, 14.1 | 14.1 | 14.2 | 0.88% | >= 360 | FAIL |
+| 256M | 345.3, 344.9, 344.4 | 344.4 | 344.9 | 0.11% | >= 360 | FAIL |
+| 2G | 363.6, 363.9, 363.8 | 363.6 | 363.8 | 0.03% | >= 360 | PASS |
+
+### NCCL reducescatter by size
+
+| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status |
+|------|---------------------|-------|------|--------|-----------|--------|
+| 1M | 14.3, 14.1, 14.1 | 14.1 | 14.2 | 0.67% | >= 405 | FAIL |
+| 256M | 328.2, 328.3, 328.4 | 328.2 | 328.3 | 0.02% | >= 405 | FAIL |
+| 2G | 352.2, 352.7, 352.6 | 352.2 | 352.5 | 0.06% | >= 405 | FAIL |
+
+### NCCL allgather by size
+
+| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status |
+|------|---------------------|-------|------|--------|-----------|--------|
+| 1M | 14.2, 14.5, 14.3 | 14.2 | 14.3 | 0.87% | >= 405 | FAIL |
+| 256M | 350.6, 350.6, 350.5 | 350.5 | 350.6 | 0.01% | >= 405 | FAIL |
+| 2G | 367.0, 366.8, 366.5 | 366.5 | 366.8 | 0.06% | >= 405 | FAIL |
+
+### NCCL sendrecv by size
+
+| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status |
+|------|---------------------|-------|------|--------|-----------|--------|
+| 1M | 18.4, 18.2, 18.6 | 18.2 | 18.4 | 0.89% | >= 360 | FAIL |
+| 256M | 350.7, 350.8, 351.1 | 350.7 | 350.9 | 0.05% | >= 360 | FAIL |
+| 2G | 369.0, 369.0, 368.9 | 368.9 | 369.0 | 0.01% | >= 360 | PASS |
+
+**Overall: FAIL**
+
+## Stress Test
+
+- **Source:** pytorch
+- **Duration:** 1800s (requested 1800s)
+- **Telemetry samples:** 1541
+- **Max temp:** {0: 51.0, 1: 59.0, 2: 62.0, 3: 53.0, 4: 53.0, 5: 62.0, 6: 57.0, 7: 53.0}
+- **Avg power:** {0: 698.7, 1: 698.0, 2: 698.1, 3: 697.9, 4: 697.7, 5: 698.2, 6: 698.0, 7: 697.7}
+- **Temp delta:** 11.0 C
+- **TFLOPS jitter:** 3.05%
+- **Steady TFLOPS samples:** 37841
+- **Throttle events:** 11912
+- **XID events:** 0
+- **Failure reasons:**
+  - GPU temperature delta 11.0C exceeds 5.0C
+  - non-idle throttle reasons observed in 11912 samples (first: GPU 0 0x4)
+- **Result: FAIL**
+
+## RDMA/InfiniBand
+
+### RDMA Port Checks
+
+| Device | Port | State | Rate | Required | Status |
+|--------|------|-------|------|----------|--------|
+| mlx5_0 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS |
+| mlx5_1 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS |
+| mlx5_4 | 1 | 4: ACTIVE | 100 Gb/sec (2X HDR) | >= 400Gbps ACTIVE | FAIL |
+| mlx5_5 | 1 | 4: ACTIVE | 100 Gb/sec (2X HDR) | >= 400Gbps ACTIVE | FAIL |
+| mlx5_6 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS |
+| mlx5_7 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS |
+
+| Test | Value | Threshold | Status |
+|------|-------|-----------|--------|
+| ib_write_bw | 48.4 GB/s | >= 47 GB/s | PASS |
+| ib_read_bw | 40.3 GB/s | >= 47 GB/s | FAIL |
+| ib_write_lat | 2.44 us | <= 2 us | FAIL |
+| ib_read_lat | 16.00 us | <= 3.5 us | FAIL |
+| ibping | target=0x4b count=5 | 0% packet loss | PASS |
+
+- **PFC/ECN/CNP/congestion counters checked:** 0
+- **PFC/ECN/CNP/congestion non-zero:** no
+- **Failure reasons:**
+  - mlx5_4 port 1 state/rate failed (4: ACTIVE, 100 Gb/sec (2X HDR); required >= 400.0Gbps ACTIVE)
+  - mlx5_5 port 1 state/rate failed (4: ACTIVE, 100 Gb/sec (2X HDR); required >= 400.0Gbps ACTIVE)
+  - ib_read_bw bandwidth 40.29GB/s < 47GB/s
+  - ib_write_lat latency 2.44us > 2.0us
+  - ib_read_lat latency 16.0us > 3.5us
+**Overall: FAIL**
+
+## Training Simulation
+
+| Metric | Value |
+|--------|-------|
+| Model | synthetic_transformer_1.5b |
+| Params | 1470.5M |
+| Throughput | 193836 tokens/sec |
+| Avg Step Time | 84.5 ms |
+| Peak Memory | 18.1 GB |
+| Final Loss | 0.004 |
+| Step Jitter | 521.24% |
+| Distributed Mode | ddp |
+| Verdict | FAIL (193836 tokens/sec) |
+
+---
+*Generated by GPU Test Suite v0.2.0*
\ No newline at end of file
diff --git a/reports_training_warmup_aikubeworker0012_20260522_194528.md b/reports_training_warmup_aikubeworker0012_20260522_194528.md
new file mode 100644
index 0000000..948e866
--- /dev/null
+++ b/reports_training_warmup_aikubeworker0012_20260522_194528.md
@@ -0,0 +1,43 @@
+# GPU Test Report
+
+- **Date:** 2026-05-22T19:46:07.450315
+- **Host:** aikubeworker0012
+
+## Overall Acceptance Verdict
+
+**Result: FAIL**
+
+Missing required evidence:
+- GPU Info
+- Health Check
+- Memory Bandwidth
+- Compute Throughput
+- NVLink/NVSwitch
+- NCCL
+- Stress Test
+- RDMA
+- DCGM
+
+## Summary
+
+| Test | Result |
+|------|--------|
+| Training | PASS (216654 tokens/sec) |
+
+## Training Simulation
+
+| Metric | Value |
+|--------|-------|
+| Model | synthetic_transformer_1.5b |
+| Params | 1470.5M |
+| Throughput | 216654 tokens/sec |
+| Avg Step Time | 75.6 ms |
+| Warmup Steps | 5 |
+| Peak Memory | 18.1 GB |
+| Final Loss | 0.0039 |
+| Step Jitter | 0.87% |
+| Distributed Mode | ddp |
+| Verdict | PASS (216654 tokens/sec) |
+
+---
+*Generated by GPU Test Suite v0.2.0*
\ No newline at end of file
diff --git a/reports_training_warmup_aikubeworker0016_20260522_194609.md b/reports_training_warmup_aikubeworker0016_20260522_194609.md
new file mode 100644
index 0000000..61570ca
--- /dev/null
+++ b/reports_training_warmup_aikubeworker0016_20260522_194609.md
@@ -0,0 +1,43 @@
+# GPU Test Report
+
+- **Date:** 2026-05-22T19:46:48.023650
+- **Host:** aikubeworker0016
+
+## Overall Acceptance Verdict
+
+**Result: FAIL**
+
+Missing required evidence:
+- GPU Info
+- Health Check
+- Memory Bandwidth
+- Compute Throughput
+- NVLink/NVSwitch
+- NCCL
+- Stress Test
+- RDMA
+- DCGM
+
+## Summary
+
+| Test | Result |
+|------|--------|
+| Training | PASS (217236 tokens/sec) |
+
+## Training Simulation
+
+| Metric | Value |
+|--------|-------|
+| Model | synthetic_transformer_1.5b |
+| Params | 1470.5M |
+| Throughput | 217236 tokens/sec |
+| Avg Step Time | 75.4 ms |
+| Warmup Steps | 5 |
+| Peak Memory | 18.1 GB |
+| Final Loss | 0.0039 |
+| Step Jitter | 1.23% |
+| Distributed Mode | ddp |
+| Verdict | PASS (217236 tokens/sec) |
+
+---
+*Generated by GPU Test Suite v0.2.0*
\ No newline at end of file
diff --git a/test_all_aikubeworker0016_中文结果与验收差距.md b/test_all_aikubeworker0016_中文结果与验收差距.md
new file mode 100644
index 0000000..d05e25a
--- /dev/null
+++ b/test_all_aikubeworker0016_中文结果与验收差距.md
@@ -0,0 +1,73 @@
+# aikubeworker0016 `test all` 中文结果与 H100 验收差距
+
+测试命令：
+
+```bash
+/root/gpu-test-venv/bin/python gpu_tester.py --test all --report --format json --output reports_all/test_all.json
+```
+
+测试机器：`aikubeworker0016 / 172.72.8.16`
+
+原始结果：`reports_all_aikubeworker0016.json`
+
+## 先说结论
+
+项目输出里最后显示 `Suite complete: 8/8 tests passed`，但这个结论不能直接当成生产验收 PASS。
+
+原因是当前 `all` 的汇总逻辑主要看模块有没有抛 `error`，没有把 `nccl.passed=false` 和 `rdma.passed=false` 当成整套失败。因此按 PDF 的生产验收口径，这台机器目前不能算完整验收通过。
+
+## 本次 `test all` 实际结果
+
+| 模块 | 当前结果 | 关键数据 | 按 PDF 验收看 |
+| --- | --- | --- | --- |
+| GPU 信息 | 已覆盖 | 8 张 H100，Driver 580.159.03，CUDA 13.0 | 基础信息 OK，但 NVLink 链路专项不足 |
+| 健康检查 | PASS | health.passed=true | 基础健康 OK，但缺 retired pages、AER/Replay、fabricmanager 日志、stress 期间采样 |
+| Memory | 有结果 | H2D 55.5 GB/s，D2H 55.3 GB/s，D2D 486.5 GB/s | 单项看起来不错，但缺 8x8 P2P 矩阵验收 |
+| Compute | 有结果 | FP32 51.9，TF32 357.0，FP16 664.0，BF16 700.1，FP8 1116.2 TFLOPS | 对 PDF 绝对门槛不全通过 |
+| NCCL | 实际不合格 | source=torchrun_fallback，`nccl.passed=false`，无 bus BW 性能数据 | 不满足 PDF NCCL 性能验收 |
+| Stress | PASS | PyTorch fallback，60 秒，8 GPU 状态 PASS | 不满足 PDF 的 30/60 分钟 burn-in；负载只有约 64MB/卡，压力明显不够 |
+| RDMA/IB | 实际不合格 | ib_write_bw/read_bw 0.13 GB/s WARN；write_lat 4.10us PASS；read_lat 16us WARN | 当前是 localhost 单节点口径，不满足 PDF RDMA 生产验收 |
+| Training | 有结果 | synthetic 1.47B，52471 tokens/s，peak 27.31GB，loss 0.0041 | tokens/s 过线，但代码实际不是 8 卡分布式训练验收 |
+
+## Compute 对 PDF 门槛的判断
+
+PDF H100 PASS 门槛：
+
+| DType | 本次结果 | PDF PASS 门槛 | 判断 |
+| --- | ---: | ---: | --- |
+| FP32 | 51.9 TFLOPS | >= 54 | WARN |
+| TF32 | 357.0 TFLOPS | >= 444 | FAIL |
+| FP16 | 664.0 TFLOPS | >= 734 | WARN |
+| BF16 | 700.1 TFLOPS | >= 745 | WARN |
+| FP8 | 1116.2 TFLOPS | >= 1400 | FAIL |
+| FP64 | 未测 | >= 63 | 缺失 |
+| INT8 | 未测 | >= 1536 | 缺失 |
+
+说明：PDF 里 WARN 区间是 PASS 门槛的 90%-100%。FP32、FP16、BF16 落在该区间内，所以是 WARN；TF32 和 FP8 低于 PASS 门槛的 90%，所以按 PDF 是 FAIL。
+
+## 如果只执行当前仓库 `test all`，少了什么
+
+1. 少 NVLink 专项验收：没有逐卡检查 18 条链路、25GB/s 速率、CRC/Replay/Recovery error = 0。
+2. 少 DCGM 诊断：没有 `dcgmi diag -r 3`。
+3. 少长时间 burn-in：当前是 60 秒，不是 30/60 分钟。
+4. 少 stress 期间 1 秒级采样：温度、功耗、throttle、XID、TFLOPS 抖动都没按 PDF 统计。
+5. 少真正 NCCL 性能：当前退化到 torchrun fallback，没有 `nccl-tests` bus BW。
+6. 少 NCCL 全操作和三档消息：PDF 要 AllReduce/AllGather/ReduceScatter/Broadcast/SendRecv/AllToAll，且 1MB/256MB/2GB 都过线。
+7. 少 NCCL 重复 3 次取最差值和标准差 <=3%。
+8. 少完整 P2P 8x8 矩阵：没有非对角均值、最小值、偏差判断。
+9. 少逐 GPU compute 一致性：没有真正分别测 8 卡同 dtype 极差/均值 <=3%。
+10. 少 FP64 和 INT8。
+11. 少 RDMA 生产口径：当前 `localhost`，64KB message，阈值 10us；PDF 要 4MB BW、8B latency、write/read >=47GB/s、write_lat <=2us、read_lat <=3.5us。
+12. 少 PFC/ECN 错误计数和 ibping 双向。
+13. 少真正 8 卡分布式 Training Simulation 验收。
+14. 少严格最终 verdict：当前代码会把 `passed=false` 的模块也计入“通过”，这是验收逻辑漏洞。
+
+## 建议
+
+`test all` 可以继续作为快速初筛跑，但如果目标是对齐 `H100_production_acceptance.pdf`，需要把它升级成“生产验收模式”。优先级如下：
+
+1. 先修汇总 verdict：任何子模块 `passed=false` 必须导致整机 FAIL。
+2. 先装好 `nccl-tests` 和 `gpu-burn`，否则 NCCL/Stress 都不是生产口径。
+3. 增加 NVLink、DCGM、长时间 telemetry、P2P 矩阵。
+4. 改 RDMA 为生产参数，且支持跨节点。
+5. 改 compute/training 为逐 GPU/8 卡分布式验收。
-- 
2.47.2


From 4b17bafd531a6013d93d49887e6e98447b4d26ca Mon Sep 17 00:00:00 2001
From: cs <shi.chen@robotics.cc>
Date: Sat, 23 May 2026 13:03:26 +0800
Subject: [PATCH 02/41] Add multi-node NCCL sweep test

---
 README.md                                     |  39 +-
 configs/default.yaml                          |  46 ++
 gpu_tester.py                                 |  55 ++-
 modules/report.py                             |  50 ++
 ...node_nccl_smoke_256m_aikubeworker0012.json | 439 ++++++++++++++++++
 ...tinode_nccl_smoke_256m_aikubeworker0012.md |  50 ++
 6 files changed, 667 insertions(+), 12 deletions(-)
 create mode 100644 reports_multinode_nccl_smoke_256m_aikubeworker0012.json
 create mode 100644 reports_multinode_nccl_smoke_256m_aikubeworker0012.md

diff --git a/README.md b/README.md
index 1af08c4..eed4791 100644
--- a/README.md
+++ b/README.md
@@ -375,6 +375,27 @@ nccl:
   repeats: 3
   max_stddev_pct: 3
 
+multinode_nccl:
+  enabled: false                        # true 时纳入 --test all
+  hosts:
+    - {name: nccl-gpu-1, addr: 172.72.8.12, slots: 8}
+    - {name: nccl-gpu-2, addr: 172.72.8.16, slots: 8}
+  tests: [all_reduce_perf, alltoall_perf]
+  topologies:
+    - {nodes: 2, gpus_per_node: 8}
+  mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun
+  extra_ld_library_path:                # 传给远端 rank 的 MPI/NCCL/CUDA 库路径
+    - /usr/mpi/gcc/openmpi-4.1.9a1/lib
+    - /root/gpu-test-venv/lib/python3.10/site-packages/nvidia/nccl/lib
+    - /usr/local/cuda-12.4/targets/x86_64-linux/lib
+  begin_size: 1k
+  end_size: 16g
+  step_factor: 2
+  warmup_iters: 10
+  socket_ifname: bond0
+  ib_gid_index: 3
+  ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7
+
 stress:
   duration_sec: 1800                   # 压力测试时长
   use_gpu_burn: false                  # 默认走 PyTorch GEMM stress
@@ -539,16 +560,14 @@ report:
 └── 异常: 检查 IB 线缆、交换机配置、子网管理器
 
 步骤 3: 多节点 NCCL 测试
-├── 在每个节点上配置:
-│   export MASTER_ADDR=<主节点IP>
-│   export MASTER_PORT=29500
-│   export NCCL_SOCKET_IFNAME=ib0    # IB 网卡名
-│   export NCCL_DEBUG=INFO
-├── 运行 nccl-tests 手动测试:
-│   mpirun -np <总GPU数> -hostfile hosts \
-│     /opt/gpu-test-tools/nccl-tests/build/all_reduce_perf \
-│     -b 8 -e 256M -f 2 -g 1 -w 5 -n 20
-└── 确认: 多节点 AllReduce 带宽正常
+├── 在发起节点确认 mpirun、nccl-tests、跨节点 root SSH 可用
+├── 配置 configs/default.yaml 的 multinode_nccl.hosts / IB 参数
+├── 执行 PDF 风格 sweep:
+│   python3 gpu_tester.py --test multinode-nccl --report --format md
+├── 默认命令口径:
+│   mpirun -H <node1>:8,<node2>:8 --map-by ppr:8:node -np 16 \
+│     all_reduce_perf/alltoall_perf -b 1k -e 16g -f 2 -g 1 -w 10
+└── 确认: Peak Bus BW、Peak Size、wrong_count 正常
 
 步骤 4: 训练验证
 ├── python3 gpu_tester.py --test training
diff --git a/configs/default.yaml b/configs/default.yaml
index a432c11..09a3921 100644
--- a/configs/default.yaml
+++ b/configs/default.yaml
@@ -48,6 +48,52 @@ nccl:
   test_allgather: false
   test_sendrecv: false
 
+multinode_nccl:
+  enabled: false
+  mode: sweep
+  hosts:
+    - name: nccl-gpu-1
+      addr: 172.72.8.12
+      slots: 8
+    - name: nccl-gpu-2
+      addr: 172.72.8.16
+      slots: 8
+  ssh_user: root
+  ssh_preflight: true
+  mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun
+  mpi_ld_preload: null
+  extra_ld_library_path:
+    - /usr/mpi/gcc/openmpi-4.1.9a1/lib
+    - /root/gpu-test-venv/lib/python3.10/site-packages/nvidia/nccl/lib
+    - /usr/local/cuda-12.4/targets/x86_64-linux/lib
+  nccl_tests_dir: null  # null = tools.install_dir/nccl-tests/build
+  tests:
+    - all_reduce_perf
+    - alltoall_perf
+  topologies:
+    - nodes: 2
+      gpus_per_node: 8
+  begin_size: 1k
+  end_size: 16g
+  step_factor: 2
+  warmup_iters: 10
+  gpus_per_rank: 1
+  timeout_sec: 1800
+  socket_ifname: bond0
+  ib_gid_index: 3
+  ib_sl: 5
+  ib_tc: 136
+  ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7
+  ib_timeout: 22
+  qps_per_connection: 4
+  min_nchannels: 4
+  net_plugin: none
+  nvls_enable: 1
+  split_data_on_qps: 1
+  min_peak_busbw_gbps:
+    allreduce: 480
+    alltoall: 75
+
 stress:
   duration_sec: 600           # 10 min — reaches thermal steady state, validates throttle/jitter beyond warmup
   use_doubles: false
diff --git a/gpu_tester.py b/gpu_tester.py
index 15bc694..35d89de 100644
--- a/gpu_tester.py
+++ b/gpu_tester.py
@@ -28,6 +28,7 @@ from modules.stress_test import StressTest
 from modules.rdma_test import RDMATest
 from modules.nvlink_test import NVLinkTest
 from modules.dcgm_test import DCGMTest
+from modules.multinode_nccl_test import MultiNodeNCCLTest
 from modules.report import ReportGenerator
 from modules.gpu_specs import detect_gpu_type, get_gpu_specs, get_gpu_label, get_supported_gpus, validate_driver_compatibility
 
@@ -55,6 +56,44 @@ DEFAULT_CONFIG = {
         "repeats": 3,
         "max_stddev_pct": 3,
     },
+    "multinode_nccl": {
+        "enabled": False,
+        "mode": "sweep",
+        "hosts": [
+            {"name": "nccl-gpu-1", "addr": "172.72.8.12", "slots": 8},
+            {"name": "nccl-gpu-2", "addr": "172.72.8.16", "slots": 8},
+        ],
+        "ssh_user": "root",
+        "ssh_preflight": True,
+        "mpirun_path": "/usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun",
+        "mpi_ld_preload": None,
+        "extra_ld_library_path": [
+            "/usr/mpi/gcc/openmpi-4.1.9a1/lib",
+            "/root/gpu-test-venv/lib/python3.10/site-packages/nvidia/nccl/lib",
+            "/usr/local/cuda-12.4/targets/x86_64-linux/lib",
+        ],
+        "nccl_tests_dir": None,
+        "tests": ["all_reduce_perf", "alltoall_perf"],
+        "topologies": [{"nodes": 2, "gpus_per_node": 8}],
+        "begin_size": "1k",
+        "end_size": "16g",
+        "step_factor": 2,
+        "warmup_iters": 10,
+        "gpus_per_rank": 1,
+        "timeout_sec": 1800,
+        "socket_ifname": "bond0",
+        "ib_gid_index": 3,
+        "ib_sl": 5,
+        "ib_tc": 136,
+        "ib_hca": "mlx5_0,mlx5_1,mlx5_6,mlx5_7",
+        "ib_timeout": 22,
+        "qps_per_connection": 4,
+        "min_nchannels": 4,
+        "net_plugin": "none",
+        "nvls_enable": 1,
+        "split_data_on_qps": 1,
+        "min_peak_busbw_gbps": {"allreduce": 480, "alltoall": 75},
+    },
     "stress": {
         "duration_sec": 1800,
         "production_duration_sec": 1800,
@@ -191,7 +230,8 @@ def interactive_menu(config: dict):
         ("8", "NVLink/NVSwitch Test", "nvlink"),
         ("9", "DCGM Diagnostic", "dcgm"),
         ("10", "Training Simulation", "training"),
-        ("11", "Full Test Suite (All Tests)", "all"),
+        ("11", "Multi-node NCCL Test", "multinode_nccl"),
+        ("12", "Full Test Suite (All Tests)", "all"),
         ("0", "Generate Report", "report"),
     ]
 
@@ -218,6 +258,7 @@ def interactive_menu(config: dict):
             "nvlink": "NVLink links, speed, and error counters",
             "dcgm": "DCGM diag -r 3 production diagnostic",
             "training": "Simulate LLM training with PyTorch",
+            "multinode_nccl": "Cross-node NCCL via mpirun/nccl-tests",
             "all": "Run all tests sequentially",
             "report": "Export results to JSON/HTML",
         }
@@ -326,6 +367,12 @@ def _run_test(test_name: str, config: dict, console: Console) -> dict:
             m.print_results(result)
             return result
 
+        elif test_name == "multinode_nccl":
+            m = MultiNodeNCCLTest(config)
+            result = m.run()
+            m.print_results(result)
+            return result
+
         elif test_name == "all":
             return _run_full_suite(config, console)
 
@@ -356,6 +403,8 @@ def _run_full_suite(config: dict, console: Console) -> dict:
         ("dcgm", "DCGM Diagnostic", DCGMTest),
         ("training", "Training Simulation", TrainingSim),
     ]
+    if (config.get("multinode_nccl", {}) or {}).get("enabled"):
+        tests.append(("multinode_nccl", "Multi-node NCCL Test", MultiNodeNCCLTest))
 
     for i, (key, name, mod_cls) in enumerate(tests, 1):
         console.print(f"\n[bold cyan][{i}/{len(tests)}] {name}[/bold cyan]")
@@ -435,6 +484,7 @@ Examples:
    python gpu_tester.py --test benchmark --type memory
    python gpu_tester.py --test benchmark --type compute --dtype fp16
    python gpu_tester.py --test nccl            # NCCL test
+   python gpu_tester.py --test multinode-nccl  # Cross-node NCCL test
    python gpu_tester.py --test nvlink          # NVLink/NVSwitch test
    python gpu_tester.py --test dcgm            # DCGM diagnostic
    python gpu_tester.py --test training        # Training sim
@@ -442,7 +492,7 @@ Examples:
    python gpu_tester.py --report --format json --output report.json
         """,
     )
-    parser.add_argument("--test", choices=["gpu-info", "health", "benchmark", "nccl", "stress", "rdma", "nvlink", "dcgm", "training", "all"],
+    parser.add_argument("--test", choices=["gpu-info", "health", "benchmark", "nccl", "multinode-nccl", "stress", "rdma", "nvlink", "dcgm", "training", "all"],
                         help="Run a specific test")
     parser.add_argument("--type", choices=["memory", "compute"], help="Benchmark type (with --test benchmark)")
     parser.add_argument("--dtype", choices=["fp32", "tf32", "fp16", "bf16", "fp8", "fp64", "int8"],
@@ -499,6 +549,7 @@ Examples:
         "health": "health",
         "benchmark": None,
         "nccl": "nccl",
+        "multinode-nccl": "multinode_nccl",
         "stress": "stress",
         "rdma": "rdma",
         "nvlink": "nvlink",
diff --git a/modules/report.py b/modules/report.py
index 2f6f1ec..b82170b 100644
--- a/modules/report.py
+++ b/modules/report.py
@@ -464,6 +464,47 @@ class ReportGenerator:
             passed = nccl.get("passed", False)
             lines.append(f"**Overall: {'PASS' if passed else 'FAIL'}**\n")
 
+        multinode = results.get("multinode_nccl")
+        if multinode and not multinode.get("error"):
+            lines.append("## Multi-node NCCL / Cross Leaf\n")
+            lines.append(f"Source: {multinode.get('source', 'unknown')} | Mode: {multinode.get('mode', 'unknown')}\n")
+            hosts = multinode.get("hosts", [])
+            if hosts:
+                host_text = ", ".join(f"{h.get('name') or h.get('addr')}({h.get('addr')})" for h in hosts)
+                lines.append(f"- **Hosts:** {host_text}")
+            preflight = multinode.get("preflight", {})
+            if preflight.get("checks"):
+                failed_checks = [c for c in preflight["checks"] if c.get("status") == "FAIL"]
+                warn_checks = [c for c in preflight["checks"] if c.get("status") == "WARN"]
+                lines.append(f"- **Preflight:** {'PASS' if not failed_checks else 'FAIL'}"
+                             f"{f' ({len(warn_checks)} warnings)' if warn_checks else ''}")
+            lines.append("")
+            for op, data in (multinode.get("tests") or {}).items():
+                lines.append(f"### Multi-node NCCL {op}\n")
+                lines.append("| Topology | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |")
+                lines.append("|----------|-------------|-----------|------------|-----------|--------|")
+                for topo in data.get("topologies", []):
+                    threshold = topo.get("min_required_gbps", 0) or 0
+                    threshold_text = f">= {threshold:.0f} GB/s" if threshold else "-"
+                    lines.append(
+                        f"| {topo.get('label', '')} | {topo.get('peak_busbw_gbps', 0):.2f} GB/s | "
+                        f"{topo.get('peak_size', '')} | {topo.get('avg_busbw_gbps', 0):.2f} GB/s | "
+                        f"{threshold_text} | {topo.get('status', '?')} |"
+                    )
+                lines.append("")
+            lines.append(f"**Overall: {'PASS' if multinode.get('passed') else 'FAIL'}**\n")
+        elif multinode and multinode.get("error"):
+            lines.append("## Multi-node NCCL / Cross Leaf\n")
+            lines.append(f"**Overall: FAIL** ({multinode.get('error')})\n")
+            preflight = multinode.get("preflight", {})
+            if preflight.get("checks"):
+                lines.append("| Check | Status | Detail |")
+                lines.append("|-------|--------|--------|")
+                for check in preflight["checks"]:
+                    detail = str(check.get("detail", "")).replace("\n", " ")
+                    lines.append(f"| {check.get('name', '')} | {check.get('status', '')} | {detail} |")
+                lines.append("")
+
         # --- Stress Test ---
         stress = results.get("stress")
         if stress and not stress.get("error"):
@@ -836,6 +877,15 @@ class ReportGenerator:
             else:
                 items.append(("NCCL", "FAIL"))
 
+        if "multinode_nccl" in results:
+            mn = results["multinode_nccl"]
+            if mn.get("error"):
+                items.append(("Multi-node NCCL", f"ERROR: {mn['error']}"))
+            elif mn.get("passed"):
+                items.append(("Multi-node NCCL", "PASS"))
+            else:
+                items.append(("Multi-node NCCL", "FAIL"))
+
         # Stress
         if "stress" in results:
             s = results["stress"]
diff --git a/reports_multinode_nccl_smoke_256m_aikubeworker0012.json b/reports_multinode_nccl_smoke_256m_aikubeworker0012.json
new file mode 100644
index 0000000..72c30ce
--- /dev/null
+++ b/reports_multinode_nccl_smoke_256m_aikubeworker0012.json
@@ -0,0 +1,439 @@
+{
+  "multinode_nccl": {
+    "passed": false,
+    "source": "nccl-tests-mpirun",
+    "mode": "sweep",
+    "hosts": [
+      {
+        "name": "nccl-gpu-1",
+        "addr": "172.72.8.12",
+        "slots": 8
+      },
+      {
+        "name": "nccl-gpu-2",
+        "addr": "172.72.8.16",
+        "slots": 8
+      }
+    ],
+    "preflight": {
+      "checks": [
+        {
+          "name": "mpirun",
+          "status": "PASS",
+          "detail": "/usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun"
+        },
+        {
+          "name": "hosts",
+          "status": "PASS",
+          "detail": "2 configured"
+        },
+        {
+          "name": "all_reduce_perf",
+          "status": "PASS",
+          "detail": "/opt/gpu-test-tools/nccl-tests/build/all_reduce_perf"
+        },
+        {
+          "name": "alltoall_perf",
+          "status": "PASS",
+          "detail": "/opt/gpu-test-tools/nccl-tests/build/alltoall_perf"
+        },
+        {
+          "name": "ssh 172.72.8.12",
+          "status": "WARN",
+          "detail": "Host key verification failed."
+        },
+        {
+          "name": "ssh 172.72.8.16",
+          "status": "PASS",
+          "detail": "aikubeworker0016"
+        }
+      ],
+      "passed": true
+    },
+    "tests": {
+      "allreduce": {
+        "binary": "/opt/gpu-test-tools/nccl-tests/build/all_reduce_perf",
+        "topologies": [
+          {
+            "label": "2 nodes x 8 GPUs",
+            "nodes": 2,
+            "gpus_per_node": 8,
+            "ranks": 16,
+            "hosts": [
+              {
+                "name": "nccl-gpu-1",
+                "addr": "172.72.8.12",
+                "slots": 8
+              },
+              {
+                "name": "nccl-gpu-2",
+                "addr": "172.72.8.16",
+                "slots": 8
+              }
+            ],
+            "command": "/usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun --allow-run-as-root --mca btl_openib_warn_no_device_params_found 0 --mca btl_tcp_if_include bond0 -H 172.72.8.12:8,172.72.8.16:8 --map-by ppr:8:node -np 16 -x NCCL_DEBUG=WARN -x NCCL_SOCKET_IFNAME=bond0 -x NCCL_IB_GID_INDEX=3 -x NCCL_IB_SL=5 -x NCCL_IB_TC=136 -x NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7 -x NCCL_IB_TIMEOUT=22 -x NCCL_IB_QPS_PER_CONNECTION=4 -x NCCL_MIN_NCHANNELS=4 -x NCCL_NET_PLUGIN=none -x NCCL_NVLS_ENABLE=1 -x NCCL_IB_SPLIT_DATA_ON_QPS=1 -x LD_LIBRARY_PATH=/usr/mpi/gcc/openmpi-4.1.9a1/lib:/root/gpu-test-venv/lib/python3.10/site-packages/nvidia/nccl/lib:/usr/local/cuda-12.4/targets/x86_64-linux/lib /opt/gpu-test-tools/nccl-tests/build/all_reduce_perf -b 1k -e 256M -g 1 -f 2 -w 2",
+            "returncode": 0,
+            "status": "FAIL",
+            "peak_busbw_gbps": 39.32,
+            "peak_algbw_gbps": 20.97,
+            "peak_size": "4M",
+            "avg_busbw_gbps": 9.1,
+            "min_required_gbps": 100.0,
+            "wrong_count": 0,
+            "by_size": [
+              {
+                "size_bytes": 1024,
+                "size": "1K",
+                "time_us": 80.32,
+                "algbw_gbps": 0.01,
+                "busbw_gbps": 0.02,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 2048,
+                "size": "2K",
+                "time_us": 35.79,
+                "algbw_gbps": 0.06,
+                "busbw_gbps": 0.11,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 4096,
+                "size": "4K",
+                "time_us": 37.49,
+                "algbw_gbps": 0.11,
+                "busbw_gbps": 0.2,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 8192,
+                "size": "8K",
+                "time_us": 40.32,
+                "algbw_gbps": 0.2,
+                "busbw_gbps": 0.38,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 16384,
+                "size": "16K",
+                "time_us": 43.04,
+                "algbw_gbps": 0.38,
+                "busbw_gbps": 0.71,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 32768,
+                "size": "32K",
+                "time_us": 43.32,
+                "algbw_gbps": 0.76,
+                "busbw_gbps": 1.42,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 65536,
+                "size": "64K",
+                "time_us": 47.45,
+                "algbw_gbps": 1.38,
+                "busbw_gbps": 2.59,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 131072,
+                "size": "128K",
+                "time_us": 89.3,
+                "algbw_gbps": 1.47,
+                "busbw_gbps": 2.75,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 262144,
+                "size": "256K",
+                "time_us": 165.38,
+                "algbw_gbps": 1.59,
+                "busbw_gbps": 2.97,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 524288,
+                "size": "512K",
+                "time_us": 4292.69,
+                "algbw_gbps": 0.12,
+                "busbw_gbps": 0.23,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 1048576,
+                "size": "1M",
+                "time_us": 139.29,
+                "algbw_gbps": 7.53,
+                "busbw_gbps": 14.12,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 2097152,
+                "size": "2M",
+                "time_us": 4195.12,
+                "algbw_gbps": 0.5,
+                "busbw_gbps": 0.94,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 4194304,
+                "size": "4M",
+                "time_us": 199.99,
+                "algbw_gbps": 20.97,
+                "busbw_gbps": 39.32,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 8388608,
+                "size": "8M",
+                "time_us": 6159.0,
+                "algbw_gbps": 1.36,
+                "busbw_gbps": 2.55,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 16777216,
+                "size": "16M",
+                "time_us": 6336.73,
+                "algbw_gbps": 2.65,
+                "busbw_gbps": 4.96,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 33554432,
+                "size": "32M",
+                "time_us": 12623.3,
+                "algbw_gbps": 2.66,
+                "busbw_gbps": 4.98,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 67108864,
+                "size": "64M",
+                "time_us": 17005.6,
+                "algbw_gbps": 3.95,
+                "busbw_gbps": 7.4,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 134217728,
+                "size": "128M",
+                "time_us": 23826.7,
+                "algbw_gbps": 5.63,
+                "busbw_gbps": 10.56,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 268435456,
+                "size": "256M",
+                "time_us": 47356.5,
+                "algbw_gbps": 5.67,
+                "busbw_gbps": 10.63,
+                "wrong": 0
+              }
+            ],
+            "stderr_tail": "",
+            "stdout_tail": "   6.25       0\n     1048576        262144     float     sum      -1   139.29    7.53   14.12       0  3552.34    0.30    0.55       0\n     2097152        524288     float     sum      -1  4195.12    0.50    0.94       0   158.81   13.21   24.76       0\n     4194304       1048576     float     sum      -1   199.99   20.97   39.32       0  3623.39    1.16    2.17       0\n     8388608       2097152     float     sum      -1  6159.00    1.36    2.55       0   324.45   25.85   48.48       0\n    16777216       4194304     float     sum      -1  6336.73    2.65    4.96       0   600.96   27.92   52.35       0\n    33554432       8388608     float     sum      -1  12623.3    2.66    4.98       0   949.39   35.34   66.27       0\n    67108864      16777216     float     sum      -1  17005.6    3.95    7.40       0  17175.5    3.91    7.33       0\n   134217728      33554432     float     sum      -1  23826.7    5.63   10.56       0  25793.0    5.20    9.76       0\n   268435456      67108864     float     sum      -1  47356.5    5.67   10.63       0  43195.8    6.21   11.65       0\n# Out of bounds values : 0 OK\n# Avg bus bandwidth    : 9.0956 \n#\n# Collective test concluded: all_reduce_perf\n#\n\n",
+            "started_at": "2026-05-23T04:59:28.584786",
+            "finished_at": "2026-05-23T04:59:54.886123"
+          }
+        ]
+      },
+      "alltoall": {
+        "binary": "/opt/gpu-test-tools/nccl-tests/build/alltoall_perf",
+        "topologies": [
+          {
+            "label": "2 nodes x 8 GPUs",
+            "nodes": 2,
+            "gpus_per_node": 8,
+            "ranks": 16,
+            "hosts": [
+              {
+                "name": "nccl-gpu-1",
+                "addr": "172.72.8.12",
+                "slots": 8
+              },
+              {
+                "name": "nccl-gpu-2",
+                "addr": "172.72.8.16",
+                "slots": 8
+              }
+            ],
+            "command": "/usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun --allow-run-as-root --mca btl_openib_warn_no_device_params_found 0 --mca btl_tcp_if_include bond0 -H 172.72.8.12:8,172.72.8.16:8 --map-by ppr:8:node -np 16 -x NCCL_DEBUG=WARN -x NCCL_SOCKET_IFNAME=bond0 -x NCCL_IB_GID_INDEX=3 -x NCCL_IB_SL=5 -x NCCL_IB_TC=136 -x NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7 -x NCCL_IB_TIMEOUT=22 -x NCCL_IB_QPS_PER_CONNECTION=4 -x NCCL_MIN_NCHANNELS=4 -x NCCL_NET_PLUGIN=none -x NCCL_NVLS_ENABLE=1 -x NCCL_IB_SPLIT_DATA_ON_QPS=1 -x LD_LIBRARY_PATH=/usr/mpi/gcc/openmpi-4.1.9a1/lib:/root/gpu-test-venv/lib/python3.10/site-packages/nvidia/nccl/lib:/usr/local/cuda-12.4/targets/x86_64-linux/lib /opt/gpu-test-tools/nccl-tests/build/alltoall_perf -b 1k -e 256M -g 1 -f 2 -w 2",
+            "returncode": 0,
+            "status": "FAIL",
+            "peak_busbw_gbps": 8.64,
+            "peak_algbw_gbps": 9.21,
+            "peak_size": "2M",
+            "avg_busbw_gbps": 2.19,
+            "min_required_gbps": 20.0,
+            "wrong_count": 0,
+            "by_size": [
+              {
+                "size_bytes": 1024,
+                "size": "1K",
+                "time_us": 58.44,
+                "algbw_gbps": 0.02,
+                "busbw_gbps": 0.02,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 2048,
+                "size": "2K",
+                "time_us": 47.2,
+                "algbw_gbps": 0.04,
+                "busbw_gbps": 0.04,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 4096,
+                "size": "4K",
+                "time_us": 47.68,
+                "algbw_gbps": 0.09,
+                "busbw_gbps": 0.08,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 8192,
+                "size": "8K",
+                "time_us": 48.78,
+                "algbw_gbps": 0.17,
+                "busbw_gbps": 0.16,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 16384,
+                "size": "16K",
+                "time_us": 79.34,
+                "algbw_gbps": 0.21,
+                "busbw_gbps": 0.19,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 32768,
+                "size": "32K",
+                "time_us": 68.8,
+                "algbw_gbps": 0.48,
+                "busbw_gbps": 0.45,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 65536,
+                "size": "64K",
+                "time_us": 49.86,
+                "algbw_gbps": 1.31,
+                "busbw_gbps": 1.23,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 131072,
+                "size": "128K",
+                "time_us": 52.89,
+                "algbw_gbps": 2.48,
+                "busbw_gbps": 2.32,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 262144,
+                "size": "256K",
+                "time_us": 3861.98,
+                "algbw_gbps": 0.07,
+                "busbw_gbps": 0.06,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 524288,
+                "size": "512K",
+                "time_us": 83.38,
+                "algbw_gbps": 6.29,
+                "busbw_gbps": 5.89,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 1048576,
+                "size": "1M",
+                "time_us": 182.32,
+                "algbw_gbps": 5.75,
+                "busbw_gbps": 5.39,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 2097152,
+                "size": "2M",
+                "time_us": 227.67,
+                "algbw_gbps": 9.21,
+                "busbw_gbps": 8.64,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 4194304,
+                "size": "4M",
+                "time_us": 6482.39,
+                "algbw_gbps": 0.65,
+                "busbw_gbps": 0.61,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 8388608,
+                "size": "8M",
+                "time_us": 10348.9,
+                "algbw_gbps": 0.81,
+                "busbw_gbps": 0.76,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 16777216,
+                "size": "16M",
+                "time_us": 18616.5,
+                "algbw_gbps": 0.9,
+                "busbw_gbps": 0.84,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 33554432,
+                "size": "32M",
+                "time_us": 17170.7,
+                "algbw_gbps": 1.95,
+                "busbw_gbps": 1.83,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 67108864,
+                "size": "64M",
+                "time_us": 35735.6,
+                "algbw_gbps": 1.88,
+                "busbw_gbps": 1.76,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 134217728,
+                "size": "128M",
+                "time_us": 69388.5,
+                "algbw_gbps": 1.93,
+                "busbw_gbps": 1.81,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 268435456,
+                "size": "256M",
+                "time_us": 96873.9,
+                "algbw_gbps": 2.77,
+                "busbw_gbps": 2.6,
+                "wrong": 0
+              }
+            ],
+            "stderr_tail": "",
+            "stdout_tail": "56    6.85    6.42    N/A\n     1048576         16384     float    none      -1   182.32    5.75    5.39       0   169.19    6.20    5.81    N/A\n     2097152         32768     float    none      -1   227.67    9.21    8.64       0  3664.15    0.57    0.54    N/A\n     4194304         65536     float    none      -1  6482.39    0.65    0.61       0   553.24    7.58    7.11    N/A\n     8388608        131072     float    none      -1  10348.9    0.81    0.76       0   803.01   10.45    9.79    N/A\n    16777216        262144     float    none      -1  18616.5    0.90    0.84       0  4237.22    3.96    3.71    N/A\n    33554432        524288     float    none      -1  17170.7    1.95    1.83       0  20849.4    1.61    1.51    N/A\n    67108864       1048576     float    none      -1  35735.6    1.88    1.76       0  34524.7    1.94    1.82    N/A\n   134217728       2097152     float    none      -1  69388.5    1.93    1.81       0  63535.3    2.11    1.98    N/A\n   268435456       4194304     float    none      -1  96873.9    2.77    2.60       0   100742    2.66    2.50    N/A\n# Out of bounds values : 0 OK\n# Avg bus bandwidth    : 2.19061 \n#\n# Collective test concluded: alltoall_perf\n#\n\n",
+            "started_at": "2026-05-23T04:59:54.886310",
+            "finished_at": "2026-05-23T05:00:28.796555"
+          }
+        ]
+      }
+    },
+    "timestamp": "2026-05-23T05:00:28.796580"
+  },
+  "timestamp": "2026-05-23T05:00:28.807561",
+  "hostname": "aikubeworker0012"
+}
\ No newline at end of file
diff --git a/reports_multinode_nccl_smoke_256m_aikubeworker0012.md b/reports_multinode_nccl_smoke_256m_aikubeworker0012.md
new file mode 100644
index 0000000..57fea2a
--- /dev/null
+++ b/reports_multinode_nccl_smoke_256m_aikubeworker0012.md
@@ -0,0 +1,50 @@
+# GPU Test Report
+
+- **Date:** 2026-05-23T05:00:28.807561
+- **Host:** aikubeworker0012
+
+## Overall Acceptance Verdict
+
+**Result: FAIL**
+
+Missing required evidence:
+- GPU Info
+- Health Check
+- Memory Bandwidth
+- Compute Throughput
+- NVLink/NVSwitch
+- NCCL
+- Stress Test
+- RDMA
+- DCGM
+- Training
+
+## Summary
+
+| Test | Result |
+|------|--------|
+| Multi-node NCCL | FAIL |
+
+## Multi-node NCCL / Cross Leaf
+
+Source: nccl-tests-mpirun | Mode: sweep
+
+- **Hosts:** nccl-gpu-1(172.72.8.12), nccl-gpu-2(172.72.8.16)
+- **Preflight:** PASS (1 warnings)
+
+### Multi-node NCCL allreduce
+
+| Topology | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
+|----------|-------------|-----------|------------|-----------|--------|
+| 2 nodes x 8 GPUs | 39.32 GB/s | 4M | 9.10 GB/s | >= 100 GB/s | FAIL |
+
+### Multi-node NCCL alltoall
+
+| Topology | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
+|----------|-------------|-----------|------------|-----------|--------|
+| 2 nodes x 8 GPUs | 8.64 GB/s | 2M | 2.19 GB/s | >= 20 GB/s | FAIL |
+
+**Overall: FAIL**
+
+---
+*Generated by GPU Test Suite v0.2.0*
\ No newline at end of file
-- 
2.47.2


From 4b93fc785f2e5208fe5df97adddbc3a88937c3b4 Mon Sep 17 00:00:00 2001
From: cs <shi.chen@robotics.cc>
Date: Sat, 23 May 2026 15:39:15 +0800
Subject: [PATCH 03/41] Add multinode NCCL diagnostic report

---
 configs/default.yaml                          |   1 +
 configs/multinode_nccl_diagnostic.yaml        |  60 ++++++++
 modules/report.py                             |  23 +++
 reports_multinode_nccl_diagnosis_20260523.md  | 134 ++++++++++++++++++
 ..._multinode_nccl_diagnostic_2x8_debug_v2.md |  66 +++++++++
 5 files changed, 284 insertions(+)
 create mode 100644 configs/multinode_nccl_diagnostic.yaml
 create mode 100644 reports_multinode_nccl_diagnosis_20260523.md
 create mode 100644 reports_multinode_nccl_diagnostic_2x8_debug_v2.md

diff --git a/configs/default.yaml b/configs/default.yaml
index 09a3921..7951089 100644
--- a/configs/default.yaml
+++ b/configs/default.yaml
@@ -90,6 +90,7 @@ multinode_nccl:
   net_plugin: none
   nvls_enable: 1
   split_data_on_qps: 1
+  extra_env: {}
   min_peak_busbw_gbps:
     allreduce: 480
     alltoall: 75
diff --git a/configs/multinode_nccl_diagnostic.yaml b/configs/multinode_nccl_diagnostic.yaml
new file mode 100644
index 0000000..6afdc19
--- /dev/null
+++ b/configs/multinode_nccl_diagnostic.yaml
@@ -0,0 +1,60 @@
+tools:
+  install_dir: /opt/gpu-test-tools
+
+report:
+  output_dir: ./reports
+  format: md
+
+multinode_nccl:
+  enabled: true
+  mode: diagnostic
+  hosts:
+    - name: nccl-gpu-1
+      addr: 172.72.8.12
+      slots: 8
+    - name: nccl-gpu-2
+      addr: 172.72.8.16
+      slots: 8
+  ssh_user: root
+  ssh_preflight: true
+  mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun
+  mpi_ld_preload: null
+  extra_ld_library_path:
+    - /usr/mpi/gcc/openmpi-4.1.9a1/lib
+    - /root/gpu-test-venv/lib/python3.10/site-packages/nvidia/nccl/lib
+    - /usr/local/cuda-12.4/targets/x86_64-linux/lib
+  nccl_tests_dir: null
+  tests:
+    - all_reduce_perf
+    - alltoall_perf
+  topologies:
+    - nodes: 2
+      gpus_per_node: 8
+      label: 2 nodes x 8 GPUs diagnostic
+  begin_size: 256M
+  end_size: 256M
+  step_factor: 2
+  warmup_iters: 1
+  iters: 3
+  gpus_per_rank: 1
+  timeout_sec: 600
+  debug: INFO
+  socket_ifname: bond0
+  ib_gid_index: 3
+  ib_sl: 5
+  ib_tc: 136
+  ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7
+  ib_timeout: 22
+  qps_per_connection: 4
+  min_nchannels: 4
+  net_plugin: none
+  nvls_enable: 1
+  split_data_on_qps: 1
+  extra_env:
+    NCCL_DEBUG_SUBSYS: INIT,NET
+    NCCL_NET_GDR_LEVEL: 5
+    NCCL_NET_GDR_READ: 1
+    NCCL_DMABUF_ENABLE: 0
+  min_peak_busbw_gbps:
+    allreduce: 480
+    alltoall: 75
diff --git a/modules/report.py b/modules/report.py
index b82170b..c9e1b8d 100644
--- a/modules/report.py
+++ b/modules/report.py
@@ -492,6 +492,29 @@ class ReportGenerator:
                         f"{threshold_text} | {topo.get('status', '?')} |"
                     )
                 lines.append("")
+                diag_rows = []
+                for topo in data.get("topologies", []):
+                    net = topo.get("network") or {}
+                    if net:
+                        diag_rows.append((topo, net))
+                if diag_rows:
+                    lines.append("| Topology | NCCL Network | GPU Direct RDMA | GDR Disabled HCAs |")
+                    lines.append("|----------|--------------|-----------------|-------------------|")
+                    for topo, net in diag_rows:
+                        networks = ", ".join(net.get("networks") or []) or "unknown"
+                        gdr = net.get("gpu_direct_rdma", "UNKNOWN")
+                        disabled = ", ".join(net.get("gdr_disabled_hcas") or []) or "-"
+                        lines.append(f"| {topo.get('label', '')} | {networks} | {gdr} | {disabled} |")
+                    lines.append("")
+                failed_topos = [topo for topo in data.get("topologies", []) if topo.get("status") == "FAIL"]
+                if failed_topos:
+                    lines.append("| Topology | Return Code | Error / Output Tail |")
+                    lines.append("|----------|-------------|---------------------|")
+                    for topo in failed_topos:
+                        tail = topo.get("error") or topo.get("stderr_tail") or topo.get("stdout_tail") or ""
+                        tail = str(tail).replace("\n", " ").replace("|", "\\|")[-240:]
+                        lines.append(f"| {topo.get('label', '')} | {topo.get('returncode', '')} | {tail} |")
+                    lines.append("")
             lines.append(f"**Overall: {'PASS' if multinode.get('passed') else 'FAIL'}**\n")
         elif multinode and multinode.get("error"):
             lines.append("## Multi-node NCCL / Cross Leaf\n")
diff --git a/reports_multinode_nccl_diagnosis_20260523.md b/reports_multinode_nccl_diagnosis_20260523.md
new file mode 100644
index 0000000..37cb75e
--- /dev/null
+++ b/reports_multinode_nccl_diagnosis_20260523.md
@@ -0,0 +1,134 @@
+# 多机多卡 NCCL 诊断报告
+
+- 日期：2026-05-23
+- 测试入口：`nccl-gpu-1` / `aikubeworker0012` / `172.72.8.12`
+- 对端节点：`nccl-gpu-2` / `aikubeworker0016` / `172.72.8.16`
+- 诊断配置：`configs/multinode_nccl_diagnostic.yaml`
+- 原始脚本报告：`reports_multinode_nccl_diagnostic_2x8_debug_v2.md`
+
+## 当前结论
+
+这不是单纯 “IB 不通” 的问题。底层 CUDA RDMA perftest 可以跑到接近单端口 400Gb/s 的水平，但 NCCL 在实际 2 节点通信时把 GPU Direct RDMA 禁用了，导致 NCCL 带宽显著低于验收阈值。
+
+同时，`nccl-gpu-2` 的 SSH 入口不稳定，会造成 `mpirun` 拉起远端 rank 失败。这个问题会直接影响 alltoall 等多机测试的稳定性，需要和 NCCL GDR 问题一起处理。
+
+## 已完成的修正
+
+1. 修正 `mpirun` 使用路径，避开系统 `/usr/bin/mpirun` 与 DOCA OpenMPI 动态库混用导致的崩溃。
+2. 补充 `LD_LIBRARY_PATH`，确保 `mpirun`、CUDA、pip 安装的 NCCL 动态库可同时解析。
+3. 将 NCCL HCA 限定到 400Gb/s 活跃端口：`mlx5_0,mlx5_1,mlx5_6,mlx5_7`。
+4. 在脚本中加入 multi-node NCCL 网络诊断解析，报告会展示 `NCCL Network`、`GPU Direct RDMA`、`GDR Disabled HCAs`。
+5. 增加 `multinode_nccl.extra_env`，可以在配置里快速试 NCCL 环境变量，不需要改代码。
+6. 增加诊断配置 `configs/multinode_nccl_diagnostic.yaml`，固定跑 2 节点 x 8 GPU、256M、`NCCL_DEBUG=INFO` 和 `NCCL_DEBUG_SUBSYS=INIT,NET`。
+
+## 关键证据
+
+### 1. CUDA RDMA perftest 通过
+
+命令类型：
+
+```bash
+CUDA_VISIBLE_DEVICES=0 ib_write_bw -d mlx5_0 -i 1 --use_cuda=0 -s 4194304 -F --report_gbits 172.72.8.16
+```
+
+结果：
+
+| 测试 | 设备 | GPU | 平均带宽 | 结论 |
+|------|------|-----|----------|------|
+| `ib_write_bw --use_cuda` | `mlx5_0` | GPU0 | `387.16 Gb/s` | PASS |
+
+解释：GPU 内存参与 RDMA 写带宽测试可以接近 400Gb/s，说明 `nvidia_peermem`/经典 GPUDirect RDMA 路径并非完全不可用。
+
+### 2. CUDA DMA-BUF 路径不可用
+
+命令类型：
+
+```bash
+CUDA_VISIBLE_DEVICES=0 ib_write_bw -d mlx5_0 -i 1 --use_cuda=0 --use_cuda_dmabuf -s 4194304 -F --report_gbits 172.72.8.16
+```
+
+结果：
+
+| 测试 | 输出 | 结论 |
+|------|------|------|
+| `ib_write_bw --use_cuda_dmabuf` | `DMA-BUF is not supported on this GPU` | FAIL |
+
+解释：当前环境不能走 CUDA DMA-BUF RDMA。后续应优先确认 NCCL 是否能稳定走经典 `nvidia_peermem` 路径。
+
+### 3. NCCL 单卡跨节点仍禁用 GDR
+
+已经尝试：
+
+- `NCCL_NET_GDR_LEVEL=SYS`
+- `NCCL_NET_GDR_LEVEL=5`
+- `NCCL_NET_GDR_READ=1`
+- `NCCL_DMABUF_ENABLE=0`
+- `NCCL_IB_CUDA_SUPPORT=1`
+- `NCCL_IB_HCA=mlx5_0`
+
+结果仍显示：
+
+```text
+NCCL INFO Using network IB
+NCCL INFO NET/IB : GPU Direct RDMA Disabled for HCA 0 'mlx5_0'
+```
+
+256M allreduce 约 `13.4 GB/s`，明显低于单个 400Gb/s（约 50 GB/s）IB 端口的能力。
+
+### 4. 脚本 2 节点 x 8 GPU 诊断结果
+
+原始报告：`reports_multinode_nccl_diagnostic_2x8_debug_v2.md`
+
+| Operation | Topology | Peak Bus BW | Threshold | Status | NCCL Network | GPU Direct RDMA |
+|-----------|----------|-------------|-----------|--------|--------------|-----------------|
+| allreduce | 2 nodes x 8 GPUs | `68.69 GB/s` | `>= 480 GB/s` | FAIL | IB | DISABLED |
+| alltoall | 2 nodes x 8 GPUs | `0.00 GB/s` | `>= 75 GB/s` | FAIL | unknown | UNKNOWN |
+
+allreduce 失败原因是带宽不达标，且报告捕获到 GDR 被 NCCL 禁用：
+
+| GDR Disabled HCAs |
+|-------------------|
+| `mlx5_0, mlx5_1, mlx5_6, mlx5_7` |
+
+alltoall 失败原因这轮不是性能本身，而是 `mpirun` 阶段受 SSH/网络发现影响失败，报告尾部显示：
+
+```text
+lack of common network interfaces and/or no route found between them
+```
+
+## 当前阻塞
+
+### 阻塞 1：NCCL 禁用 GPU Direct RDMA
+
+现象：
+
+- IB 能被 NCCL 识别：`Using network IB`
+- 400Gb/s HCA 被 NCCL 选中：`mlx5_0, mlx5_1, mlx5_6, mlx5_7`
+- 但 NCCL 明确禁用 GDR：`GPU Direct RDMA Disabled`
+- perftest 的经典 CUDA RDMA 又能跑到 `387.16 Gb/s`
+
+判断：底层 RDMA 能力存在，但 NCCL 的 GDR 判定/注册路径没有打通。优先排查 NCCL 与 NVIDIA driver、OFED、`nvidia_peermem`、NCCL net plugin/内部 IB 后端之间的兼容性。
+
+### 阻塞 2：`nccl-gpu-2` SSH 不稳定
+
+现象：
+
+- 多次出现：`kex_exchange_identification: Connection closed by remote host`
+- MCP 直连 `nccl-gpu-2` 也会失败或长时间超时
+- `mpirun` 依赖 SSH 拉起远端 rank，因此 SSH 抖动会让 alltoall 这类测试直接没有有效输出
+
+判断：需要先处理 `aikubeworker0016` 的 SSHD/连接限制/MaxStartups/安全策略，否则多机测试无法稳定复现。
+
+## 建议下一步
+
+1. 先修 `nccl-gpu-2` SSH 稳定性：检查 `sshd_config` 的 `MaxStartups`、连接限制、安全审计组件，以及是否有过多半开 SSH 会话。
+2. 对两台机器分别确认 `nvidia_peermem` 参数、OFED 版本、NVIDIA driver 版本一致性。
+3. 在两台机器上测试是否需要切换 `nvidia_peermem peerdirect_support` 模式，并在变更前确认没有正在运行的业务任务。
+4. 尝试安装或启用匹配当前 OFED/driver 的 NCCL net plugin；当前日志显示 `No plugin found (libnccl-net.so)`，NCCL 使用的是 internal network plugin。
+5. SSH 稳定后重跑完整多机配置：2 节点 x 8 GPU，至少覆盖 `all_reduce_perf` 和 `alltoall_perf`，消息大小从 `1K` 到 `16G`。
+
+## 当前可交付物
+
+- `configs/multinode_nccl_diagnostic.yaml`：多机多卡诊断配置
+- `reports_multinode_nccl_diagnostic_2x8_debug_v2.md`：脚本生成的原始 2x8 诊断报告
+- `reports_multinode_nccl_diagnosis_20260523.md`：本中文诊断总结
diff --git a/reports_multinode_nccl_diagnostic_2x8_debug_v2.md b/reports_multinode_nccl_diagnostic_2x8_debug_v2.md
new file mode 100644
index 0000000..2076245
--- /dev/null
+++ b/reports_multinode_nccl_diagnostic_2x8_debug_v2.md
@@ -0,0 +1,66 @@
+# GPU Test Report
+
+- **Date:** 2026-05-23T07:37:41.426792
+- **Host:** aikubeworker0012
+
+## Overall Acceptance Verdict
+
+**Result: FAIL**
+
+Missing required evidence:
+- GPU Info
+- Health Check
+- Memory Bandwidth
+- Compute Throughput
+- NVLink/NVSwitch
+- NCCL
+- Stress Test
+- RDMA
+- DCGM
+- Training
+
+## Summary
+
+| Test | Result |
+|------|--------|
+| Multi-node NCCL | FAIL |
+
+## Multi-node NCCL / Cross Leaf
+
+Source: nccl-tests-mpirun | Mode: diagnostic
+
+- **Hosts:** nccl-gpu-1(172.72.8.12), nccl-gpu-2(172.72.8.16)
+- **Preflight:** PASS (1 warnings)
+
+### Multi-node NCCL allreduce
+
+| Topology | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
+|----------|-------------|-----------|------------|-----------|--------|
+| 2 nodes x 8 GPUs diagnostic | 68.69 GB/s | 256M | 68.21 GB/s | >= 480 GB/s | FAIL |
+
+| Topology | NCCL Network | GPU Direct RDMA | GDR Disabled HCAs |
+|----------|--------------|-----------------|-------------------|
+| 2 nodes x 8 GPUs diagnostic | IB | DISABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 |
+
+| Topology | Return Code | Error / Output Tail |
+|----------|-------------|---------------------|
+| 2 nodes x 8 GPUs diagnostic | 0 |  aikubeworker0012:2139504:2139504 [0] NCCL INFO comm 0x55646d15f590 rank 0 nranks 16 cudaDev 0 busId 18000 - Destroy COMPLETE # Out of bounds values : 0 OK # Avg bus bandwidth    : 68.2135  # # Collective test concluded: all_reduce_perf #   |
+
+### Multi-node NCCL alltoall
+
+| Topology | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
+|----------|-------------|-----------|------------|-----------|--------|
+| 2 nodes x 8 GPUs diagnostic | 0.00 GB/s |  | 0.00 GB/s | >= 75 GB/s | FAIL |
+
+| Topology | NCCL Network | GPU Direct RDMA | GDR Disabled HCAs |
+|----------|--------------|-----------------|-------------------|
+| 2 nodes x 8 GPUs diagnostic | unknown | UNKNOWN | - |
+
+| Topology | Return Code | Error / Output Tail |
+|----------|-------------|---------------------|
+| 2 nodes x 8 GPUs diagnostic | 255 |  lack of common network interfaces and/or no route found between   them. Please check network connectivity (including firewalls   and network routing requirements). --------------------------------------------------------------------------  |
+
+**Overall: FAIL**
+
+---
+*Generated by GPU Test Suite v0.2.0*
\ No newline at end of file
-- 
2.47.2


From c660e04c99fa4b603ed74ac7497a56dc838629e4 Mon Sep 17 00:00:00 2001
From: cs <shi.chen@robotics.cc>
Date: Sat, 23 May 2026 15:49:14 +0800
Subject: [PATCH 04/41] Stabilize multinode NCCL launch diagnostics

---
 configs/default.yaml                          |  2 +
 configs/multinode_nccl_diagnostic.yaml        |  2 +
 reports_multinode_nccl_diagnosis_20260523.md  | 78 ++++++++++++++-----
 ...ts_multinode_nccl_diagnostic_2x8_sshfix.md | 66 ++++++++++++++++
 4 files changed, 130 insertions(+), 18 deletions(-)
 create mode 100644 reports_multinode_nccl_diagnostic_2x8_sshfix.md

diff --git a/configs/default.yaml b/configs/default.yaml
index 7951089..b3956a4 100644
--- a/configs/default.yaml
+++ b/configs/default.yaml
@@ -80,6 +80,8 @@ multinode_nccl:
   gpus_per_rank: 1
   timeout_sec: 1800
   socket_ifname: bond0
+  oob_tcp_ifname: bond0
+  plm_rsh_args: "-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o ServerAliveInterval=30"
   ib_gid_index: 3
   ib_sl: 5
   ib_tc: 136
diff --git a/configs/multinode_nccl_diagnostic.yaml b/configs/multinode_nccl_diagnostic.yaml
index 6afdc19..3741b37 100644
--- a/configs/multinode_nccl_diagnostic.yaml
+++ b/configs/multinode_nccl_diagnostic.yaml
@@ -40,6 +40,8 @@ multinode_nccl:
   timeout_sec: 600
   debug: INFO
   socket_ifname: bond0
+  oob_tcp_ifname: bond0
+  plm_rsh_args: "-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o ServerAliveInterval=30"
   ib_gid_index: 3
   ib_sl: 5
   ib_tc: 136
diff --git a/reports_multinode_nccl_diagnosis_20260523.md b/reports_multinode_nccl_diagnosis_20260523.md
index 37cb75e..6468215 100644
--- a/reports_multinode_nccl_diagnosis_20260523.md
+++ b/reports_multinode_nccl_diagnosis_20260523.md
@@ -4,13 +4,13 @@
 - 测试入口：`nccl-gpu-1` / `aikubeworker0012` / `172.72.8.12`
 - 对端节点：`nccl-gpu-2` / `aikubeworker0016` / `172.72.8.16`
 - 诊断配置：`configs/multinode_nccl_diagnostic.yaml`
-- 原始脚本报告：`reports_multinode_nccl_diagnostic_2x8_debug_v2.md`
+- 原始脚本报告：`reports_multinode_nccl_diagnostic_2x8_sshfix.md`
 
 ## 当前结论
 
 这不是单纯 “IB 不通” 的问题。底层 CUDA RDMA perftest 可以跑到接近单端口 400Gb/s 的水平，但 NCCL 在实际 2 节点通信时把 GPU Direct RDMA 禁用了，导致 NCCL 带宽显著低于验收阈值。
 
-同时，`nccl-gpu-2` 的 SSH 入口不稳定，会造成 `mpirun` 拉起远端 rank 失败。这个问题会直接影响 alltoall 等多机测试的稳定性，需要和 NCCL GDR 问题一起处理。
+同时，`nccl-gpu-2` 的 SSH 入口曾因未认证连接过多触发 `MaxStartups` 随机拒绝，导致 `mpirun` 拉起远端 rank 失败。已经做了临时 SSHD 缓解并拿到有效的 2 节点 x 8 GPU allreduce/alltoall 报告；当前剩余核心问题是 NCCL GDR 仍被禁用。
 
 ## 已完成的修正
 
@@ -20,6 +20,8 @@
 4. 在脚本中加入 multi-node NCCL 网络诊断解析，报告会展示 `NCCL Network`、`GPU Direct RDMA`、`GDR Disabled HCAs`。
 5. 增加 `multinode_nccl.extra_env`，可以在配置里快速试 NCCL 环境变量，不需要改代码。
 6. 增加诊断配置 `configs/multinode_nccl_diagnostic.yaml`，固定跑 2 节点 x 8 GPU、256M、`NCCL_DEBUG=INFO` 和 `NCCL_DEBUG_SUBSYS=INIT,NET`。
+7. 在 `nccl-gpu-2` 上临时提高 SSHD `MaxStartups` 并缩短 `LoginGraceTime`，缓解未认证连接过多导致的 SSH 随机拒绝。
+8. 将 OpenMPI OOB TCP 控制通道固定到 `bond0`，并加入 `plm_rsh_args`，减少 `mpirun` 远端启动受 SSH/host key/接口选择影响的概率。
 
 ## 关键证据
 
@@ -77,12 +79,12 @@ NCCL INFO NET/IB : GPU Direct RDMA Disabled for HCA 0 'mlx5_0'
 
 ### 4. 脚本 2 节点 x 8 GPU 诊断结果
 
-原始报告：`reports_multinode_nccl_diagnostic_2x8_debug_v2.md`
+原始报告：`reports_multinode_nccl_diagnostic_2x8_sshfix.md`
 
 | Operation | Topology | Peak Bus BW | Threshold | Status | NCCL Network | GPU Direct RDMA |
 |-----------|----------|-------------|-----------|--------|--------------|-----------------|
-| allreduce | 2 nodes x 8 GPUs | `68.69 GB/s` | `>= 480 GB/s` | FAIL | IB | DISABLED |
-| alltoall | 2 nodes x 8 GPUs | `0.00 GB/s` | `>= 75 GB/s` | FAIL | unknown | UNKNOWN |
+| allreduce | 2 nodes x 8 GPUs | `67.42 GB/s` | `>= 480 GB/s` | FAIL | IB | DISABLED |
+| alltoall | 2 nodes x 8 GPUs | `9.56 GB/s` | `>= 75 GB/s` | FAIL | IB | DISABLED |
 
 allreduce 失败原因是带宽不达标，且报告捕获到 GDR 被 NCCL 禁用：
 
@@ -90,12 +92,51 @@ allreduce 失败原因是带宽不达标，且报告捕获到 GDR 被 NCCL 禁
 |-------------------|
 | `mlx5_0, mlx5_1, mlx5_6, mlx5_7` |
 
-alltoall 失败原因这轮不是性能本身，而是 `mpirun` 阶段受 SSH/网络发现影响失败，报告尾部显示：
+allreduce 和 alltoall 本轮均正常完成，`returncode=0`、`wrong=0`，失败原因是带宽低于阈值，不是正确性失败。
+
+### 5. SSHD MaxStartups 阻塞已临时缓解
+
+`nccl-gpu-2` 曾显示：
 
 ```text
-lack of common network interfaces and/or no route found between them
+sshd: /usr/sbin/sshd -D [listener] 52 of 10-100 startups
+maxstartups 10:30:100
 ```
 
+同时存在大量 `sshd: unknown [priv]` / `sshd: unknown [net]` 未认证连接，来源主要是 `172.239.10.85`。这会触发 OpenSSH `MaxStartups` 随机拒绝，直接表现为：
+
+```text
+kex_exchange_identification: Connection closed by remote host
+```
+
+已临时改为：
+
+```text
+MaxStartups 120:30:240
+LoginGraceTime 20
+```
+
+改完后从 0012 连续 SSH 0016 5 次成功，2 节点 `mpirun hostname` 成功，2 节点 x 8 GPU allreduce/alltoall 也都能跑出有效结果。
+
+### 6. `nvidia_peermem` legacy 模式实验无效
+
+两台机器默认参数一致：
+
+| 参数 | 值 |
+|------|----|
+| `nvidia_peermem` version | `580.159.03` |
+| `peerdirect_support` | `0` |
+| `persistent_api_support` | `1` |
+| OFED | `OFED-internal-26.01-1.0.0` |
+
+临时切换两台机器到 `peerdirect_support=1` 后，2 节点 x 1 GPU NCCL 仍显示：
+
+```text
+NET/IB : GPU Direct RDMA Disabled for HCA 0 'mlx5_0'
+```
+
+带宽仍约 `13.4 GB/s`。测试后已经恢复默认 `peerdirect_support=0,persistent_api_support=1`。
+
 ## 当前阻塞
 
 ### 阻塞 1：NCCL 禁用 GPU Direct RDMA
@@ -109,26 +150,27 @@ lack of common network interfaces and/or no route found between them
 
 判断：底层 RDMA 能力存在，但 NCCL 的 GDR 判定/注册路径没有打通。优先排查 NCCL 与 NVIDIA driver、OFED、`nvidia_peermem`、NCCL net plugin/内部 IB 后端之间的兼容性。
 
-### 阻塞 2：`nccl-gpu-2` SSH 不稳定
+### 阻塞 2：`nccl-gpu-2` SSH 存在外部连接压力
 
 现象：
 
-- 多次出现：`kex_exchange_identification: Connection closed by remote host`
-- MCP 直连 `nccl-gpu-2` 也会失败或长时间超时
-- `mpirun` 依赖 SSH 拉起远端 rank，因此 SSH 抖动会让 alltoall 这类测试直接没有有效输出
+- 多次出现过：`kex_exchange_identification: Connection closed by remote host`
+- 根因是未认证连接过多触发 `MaxStartups`
+- 当前已经通过临时 SSHD 配置缓解，并拿到了有效 2x8 报告
+- 但如果外部连接压力持续，仍建议从网络侧或安全策略侧处理来源连接
 
-判断：需要先处理 `aikubeworker0016` 的 SSHD/连接限制/MaxStartups/安全策略，否则多机测试无法稳定复现。
+判断：这不再阻塞当前报告产出，但属于环境稳定性风险。
 
 ## 建议下一步
 
-1. 先修 `nccl-gpu-2` SSH 稳定性：检查 `sshd_config` 的 `MaxStartups`、连接限制、安全审计组件，以及是否有过多半开 SSH 会话。
-2. 对两台机器分别确认 `nvidia_peermem` 参数、OFED 版本、NVIDIA driver 版本一致性。
-3. 在两台机器上测试是否需要切换 `nvidia_peermem peerdirect_support` 模式，并在变更前确认没有正在运行的业务任务。
-4. 尝试安装或启用匹配当前 OFED/driver 的 NCCL net plugin；当前日志显示 `No plugin found (libnccl-net.so)`，NCCL 使用的是 internal network plugin。
-5. SSH 稳定后重跑完整多机配置：2 节点 x 8 GPU，至少覆盖 `all_reduce_perf` 和 `alltoall_perf`，消息大小从 `1K` 到 `16G`。
+1. 从网络/安全侧处理 `172.239.10.85` 等来源的 SSH 未认证连接压力，或者保留更高的 `MaxStartups` 配置作为测试窗口临时策略。
+2. 尝试安装或启用匹配当前 OFED/driver 的 NCCL net plugin；当前日志显示 `No plugin found (libnccl-net.so)`，NCCL 使用的是 internal network plugin。
+3. 用同版本软件栈补测 `nccl-tests` + NCCL net plugin 后的 GDR 状态，核心判据是报告里 `GPU Direct RDMA` 从 `DISABLED` 变成未禁用，且 2x8 带宽显著抬升。
+4. 如果仍禁用 GDR，再继续查 NVIDIA driver 580.159.03、OFED 26.01、NCCL 2.21.5 与 H100/IB NDR 组合的兼容矩阵。
+5. GDR 修复后重跑完整多机配置：2 节点 x 8 GPU，至少覆盖 `all_reduce_perf` 和 `alltoall_perf`，消息大小从 `1K` 到 `16G`。
 
 ## 当前可交付物
 
 - `configs/multinode_nccl_diagnostic.yaml`：多机多卡诊断配置
-- `reports_multinode_nccl_diagnostic_2x8_debug_v2.md`：脚本生成的原始 2x8 诊断报告
+- `reports_multinode_nccl_diagnostic_2x8_sshfix.md`：脚本生成的原始 2x8 诊断报告
 - `reports_multinode_nccl_diagnosis_20260523.md`：本中文诊断总结
diff --git a/reports_multinode_nccl_diagnostic_2x8_sshfix.md b/reports_multinode_nccl_diagnostic_2x8_sshfix.md
new file mode 100644
index 0000000..1872c50
--- /dev/null
+++ b/reports_multinode_nccl_diagnostic_2x8_sshfix.md
@@ -0,0 +1,66 @@
+# GPU Test Report
+
+- **Date:** 2026-05-23T07:46:11.464439
+- **Host:** aikubeworker0012
+
+## Overall Acceptance Verdict
+
+**Result: FAIL**
+
+Missing required evidence:
+- GPU Info
+- Health Check
+- Memory Bandwidth
+- Compute Throughput
+- NVLink/NVSwitch
+- NCCL
+- Stress Test
+- RDMA
+- DCGM
+- Training
+
+## Summary
+
+| Test | Result |
+|------|--------|
+| Multi-node NCCL | FAIL |
+
+## Multi-node NCCL / Cross Leaf
+
+Source: nccl-tests-mpirun | Mode: diagnostic
+
+- **Hosts:** nccl-gpu-1(172.72.8.12), nccl-gpu-2(172.72.8.16)
+- **Preflight:** PASS
+
+### Multi-node NCCL allreduce
+
+| Topology | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
+|----------|-------------|-----------|------------|-----------|--------|
+| 2 nodes x 8 GPUs diagnostic | 67.42 GB/s | 256M | 67.50 GB/s | >= 480 GB/s | FAIL |
+
+| Topology | NCCL Network | GPU Direct RDMA | GDR Disabled HCAs |
+|----------|--------------|-----------------|-------------------|
+| 2 nodes x 8 GPUs diagnostic | IB | DISABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 |
+
+| Topology | Return Code | Error / Output Tail |
+|----------|-------------|---------------------|
+| 2 nodes x 8 GPUs diagnostic | 0 | orker0016:986293:986293 [1] NCCL INFO comm 0x563abe94c350 rank 9 nranks 16 cudaDev 1 busId 2a000 - Destroy COMPLETE aikubeworker0016:986292:986292 [0] NCCL INFO comm 0x560ffac51160 rank 8 nranks 16 cudaDev 0 busId 18000 - Destroy COMPLETE   |
+
+### Multi-node NCCL alltoall
+
+| Topology | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
+|----------|-------------|-----------|------------|-----------|--------|
+| 2 nodes x 8 GPUs diagnostic | 9.56 GB/s | 256M | 9.55 GB/s | >= 75 GB/s | FAIL |
+
+| Topology | NCCL Network | GPU Direct RDMA | GDR Disabled HCAs |
+|----------|--------------|-----------------|-------------------|
+| 2 nodes x 8 GPUs diagnostic | IB | DISABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 |
+
+| Topology | Return Code | Error / Output Tail |
+|----------|-------------|---------------------|
+| 2 nodes x 8 GPUs diagnostic | 0 | TE aikubeworker0012:2141982:2141982 [4] NCCL INFO comm 0x55d0bf9c6a00 rank 4 nranks 16 cudaDev 4 busId 9a000 - Destroy COMPLETE # Out of bounds values : 0 OK # Avg bus bandwidth    : 9.55234  # # Collective test concluded: alltoall_perf #   |
+
+**Overall: FAIL**
+
+---
+*Generated by GPU Test Suite v0.2.0*
\ No newline at end of file
-- 
2.47.2


From 1f907e969177070089012155b833c12e801c9361 Mon Sep 17 00:00:00 2001
From: cs <shi.chen@robotics.cc>
Date: Sat, 23 May 2026 15:58:21 +0800
Subject: [PATCH 05/41] Validate NCCL 2.27 multinode GDR performance

---
 configs/multinode_nccl_nccl227_16g.yaml       |  62 +++++++++++
 .../multinode_nccl_nccl227_diagnostic.yaml    |  62 +++++++++++
 configs/multinode_nccl_nccl227_sweep.yaml     |  62 +++++++++++
 modules/report.py                             |   7 +-
 reports_multinode_nccl_16g_2x8_nccl227.md     |  66 ++++++++++++
 reports_multinode_nccl_diagnosis_20260523.md  | 101 +++++++++++++++---
 ...ultinode_nccl_diagnostic_2x8_nccl227_v2.md |  66 ++++++++++++
 reports_multinode_nccl_sweep_2x8_nccl227.md   |  66 ++++++++++++
 8 files changed, 474 insertions(+), 18 deletions(-)
 create mode 100644 configs/multinode_nccl_nccl227_16g.yaml
 create mode 100644 configs/multinode_nccl_nccl227_diagnostic.yaml
 create mode 100644 configs/multinode_nccl_nccl227_sweep.yaml
 create mode 100644 reports_multinode_nccl_16g_2x8_nccl227.md
 create mode 100644 reports_multinode_nccl_diagnostic_2x8_nccl227_v2.md
 create mode 100644 reports_multinode_nccl_sweep_2x8_nccl227.md

diff --git a/configs/multinode_nccl_nccl227_16g.yaml b/configs/multinode_nccl_nccl227_16g.yaml
new file mode 100644
index 0000000..e7b718f
--- /dev/null
+++ b/configs/multinode_nccl_nccl227_16g.yaml
@@ -0,0 +1,62 @@
+tools:
+  install_dir: /opt/gpu-test-tools
+
+report:
+  output_dir: ./reports
+  format: md
+
+multinode_nccl:
+  enabled: true
+  mode: large-message-nccl-2.27.7
+  hosts:
+    - name: nccl-gpu-1
+      addr: 172.72.8.12
+      slots: 8
+    - name: nccl-gpu-2
+      addr: 172.72.8.16
+      slots: 8
+  ssh_user: root
+  ssh_preflight: true
+  mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun
+  mpi_ld_preload: null
+  extra_ld_library_path:
+    - /usr/mpi/gcc/openmpi-4.1.9a1/lib
+    - /tmp/nccl-2.27.7-cuda12.4/usr/lib/x86_64-linux-gnu
+    - /usr/local/cuda-12.4/targets/x86_64-linux/lib
+  nccl_tests_dir: null
+  tests:
+    - all_reduce_perf
+    - alltoall_perf
+  topologies:
+    - nodes: 2
+      gpus_per_node: 8
+      label: 2 nodes x 8 GPUs NCCL 2.27.7 16G
+  begin_size: 16G
+  end_size: 16G
+  step_factor: 2
+  warmup_iters: 1
+  iters: 3
+  gpus_per_rank: 1
+  timeout_sec: 1200
+  debug: INFO
+  socket_ifname: bond0
+  oob_tcp_ifname: bond0
+  plm_rsh_args: "-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o ServerAliveInterval=30"
+  ib_gid_index: 3
+  ib_sl: 5
+  ib_tc: 136
+  ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7
+  ib_timeout: 22
+  qps_per_connection: 4
+  min_nchannels: 4
+  net_plugin: none
+  nvls_enable: 1
+  split_data_on_qps: 1
+  extra_env:
+    NCCL_DEBUG_SUBSYS: INIT,NET
+    NCCL_NET_GDR_LEVEL: 5
+    NCCL_NET_GDR_READ: 1
+    NCCL_DMABUF_ENABLE: 0
+  min_peak_busbw_gbps:
+    allreduce: 480
+    alltoall: 75
diff --git a/configs/multinode_nccl_nccl227_diagnostic.yaml b/configs/multinode_nccl_nccl227_diagnostic.yaml
new file mode 100644
index 0000000..8a769ad
--- /dev/null
+++ b/configs/multinode_nccl_nccl227_diagnostic.yaml
@@ -0,0 +1,62 @@
+tools:
+  install_dir: /opt/gpu-test-tools
+
+report:
+  output_dir: ./reports
+  format: md
+
+multinode_nccl:
+  enabled: true
+  mode: diagnostic-nccl-2.27.7
+  hosts:
+    - name: nccl-gpu-1
+      addr: 172.72.8.12
+      slots: 8
+    - name: nccl-gpu-2
+      addr: 172.72.8.16
+      slots: 8
+  ssh_user: root
+  ssh_preflight: true
+  mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun
+  mpi_ld_preload: null
+  extra_ld_library_path:
+    - /usr/mpi/gcc/openmpi-4.1.9a1/lib
+    - /tmp/nccl-2.27.7-cuda12.4/usr/lib/x86_64-linux-gnu
+    - /usr/local/cuda-12.4/targets/x86_64-linux/lib
+  nccl_tests_dir: null
+  tests:
+    - all_reduce_perf
+    - alltoall_perf
+  topologies:
+    - nodes: 2
+      gpus_per_node: 8
+      label: 2 nodes x 8 GPUs NCCL 2.27.7
+  begin_size: 256M
+  end_size: 256M
+  step_factor: 2
+  warmup_iters: 1
+  iters: 3
+  gpus_per_rank: 1
+  timeout_sec: 600
+  debug: INFO
+  socket_ifname: bond0
+  oob_tcp_ifname: bond0
+  plm_rsh_args: "-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o ServerAliveInterval=30"
+  ib_gid_index: 3
+  ib_sl: 5
+  ib_tc: 136
+  ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7
+  ib_timeout: 22
+  qps_per_connection: 4
+  min_nchannels: 4
+  net_plugin: none
+  nvls_enable: 1
+  split_data_on_qps: 1
+  extra_env:
+    NCCL_DEBUG_SUBSYS: INIT,NET
+    NCCL_NET_GDR_LEVEL: 5
+    NCCL_NET_GDR_READ: 1
+    NCCL_DMABUF_ENABLE: 0
+  min_peak_busbw_gbps:
+    allreduce: 480
+    alltoall: 75
diff --git a/configs/multinode_nccl_nccl227_sweep.yaml b/configs/multinode_nccl_nccl227_sweep.yaml
new file mode 100644
index 0000000..3dcbf36
--- /dev/null
+++ b/configs/multinode_nccl_nccl227_sweep.yaml
@@ -0,0 +1,62 @@
+tools:
+  install_dir: /opt/gpu-test-tools
+
+report:
+  output_dir: ./reports
+  format: md
+
+multinode_nccl:
+  enabled: true
+  mode: sweep-nccl-2.27.7
+  hosts:
+    - name: nccl-gpu-1
+      addr: 172.72.8.12
+      slots: 8
+    - name: nccl-gpu-2
+      addr: 172.72.8.16
+      slots: 8
+  ssh_user: root
+  ssh_preflight: true
+  mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun
+  mpi_ld_preload: null
+  extra_ld_library_path:
+    - /usr/mpi/gcc/openmpi-4.1.9a1/lib
+    - /tmp/nccl-2.27.7-cuda12.4/usr/lib/x86_64-linux-gnu
+    - /usr/local/cuda-12.4/targets/x86_64-linux/lib
+  nccl_tests_dir: null
+  tests:
+    - all_reduce_perf
+    - alltoall_perf
+  topologies:
+    - nodes: 2
+      gpus_per_node: 8
+      label: 2 nodes x 8 GPUs NCCL 2.27.7 sweep
+  begin_size: 1M
+  end_size: 4G
+  step_factor: 4
+  warmup_iters: 2
+  iters: 5
+  gpus_per_rank: 1
+  timeout_sec: 1200
+  debug: INFO
+  socket_ifname: bond0
+  oob_tcp_ifname: bond0
+  plm_rsh_args: "-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o ServerAliveInterval=30"
+  ib_gid_index: 3
+  ib_sl: 5
+  ib_tc: 136
+  ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7
+  ib_timeout: 22
+  qps_per_connection: 4
+  min_nchannels: 4
+  net_plugin: none
+  nvls_enable: 1
+  split_data_on_qps: 1
+  extra_env:
+    NCCL_DEBUG_SUBSYS: INIT,NET
+    NCCL_NET_GDR_LEVEL: 5
+    NCCL_NET_GDR_READ: 1
+    NCCL_DMABUF_ENABLE: 0
+  min_peak_busbw_gbps:
+    allreduce: 480
+    alltoall: 75
diff --git a/modules/report.py b/modules/report.py
index c9e1b8d..acca41e 100644
--- a/modules/report.py
+++ b/modules/report.py
@@ -498,13 +498,14 @@ class ReportGenerator:
                     if net:
                         diag_rows.append((topo, net))
                 if diag_rows:
-                    lines.append("| Topology | NCCL Network | GPU Direct RDMA | GDR Disabled HCAs |")
-                    lines.append("|----------|--------------|-----------------|-------------------|")
+                    lines.append("| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |")
+                    lines.append("|----------|--------------|-----------------|------------------|-------------------|")
                     for topo, net in diag_rows:
                         networks = ", ".join(net.get("networks") or []) or "unknown"
                         gdr = net.get("gpu_direct_rdma", "UNKNOWN")
+                        enabled = ", ".join(net.get("gdr_enabled_hcas") or []) or "-"
                         disabled = ", ".join(net.get("gdr_disabled_hcas") or []) or "-"
-                        lines.append(f"| {topo.get('label', '')} | {networks} | {gdr} | {disabled} |")
+                        lines.append(f"| {topo.get('label', '')} | {networks} | {gdr} | {enabled} | {disabled} |")
                     lines.append("")
                 failed_topos = [topo for topo in data.get("topologies", []) if topo.get("status") == "FAIL"]
                 if failed_topos:
diff --git a/reports_multinode_nccl_16g_2x8_nccl227.md b/reports_multinode_nccl_16g_2x8_nccl227.md
new file mode 100644
index 0000000..394f191
--- /dev/null
+++ b/reports_multinode_nccl_16g_2x8_nccl227.md
@@ -0,0 +1,66 @@
+# GPU Test Report
+
+- **Date:** 2026-05-23T07:56:26.791384
+- **Host:** aikubeworker0012
+
+## Overall Acceptance Verdict
+
+**Result: FAIL**
+
+Missing required evidence:
+- GPU Info
+- Health Check
+- Memory Bandwidth
+- Compute Throughput
+- NVLink/NVSwitch
+- NCCL
+- Stress Test
+- RDMA
+- DCGM
+- Training
+
+## Summary
+
+| Test | Result |
+|------|--------|
+| Multi-node NCCL | FAIL |
+
+## Multi-node NCCL / Cross Leaf
+
+Source: nccl-tests-mpirun | Mode: large-message-nccl-2.27.7
+
+- **Hosts:** nccl-gpu-1(172.72.8.12), nccl-gpu-2(172.72.8.16)
+- **Preflight:** PASS
+
+### Multi-node NCCL allreduce
+
+| Topology | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
+|----------|-------------|-----------|------------|-----------|--------|
+| 2 nodes x 8 GPUs NCCL 2.27.7 16G | 237.86 GB/s | 16G | 238.56 GB/s | >= 480 GB/s | FAIL |
+
+| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
+|----------|--------------|-----------------|------------------|-------------------|
+| 2 nodes x 8 GPUs NCCL 2.27.7 16G | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
+
+| Topology | Return Code | Error / Output Tail |
+|----------|-------------|---------------------|
+| 2 nodes x 8 GPUs NCCL 2.27.7 16G | 0 | aikubeworker0016:1019342:1020412 [4] NCCL INFO comm 0x559f14871c30 rank 12 nranks 16 cudaDev 4 busId 9a000 - Destroy COMPLETE # Out of bounds values : 0 OK # Avg bus bandwidth    : 238.555  # # Collective test concluded: all_reduce_perf #   |
+
+### Multi-node NCCL alltoall
+
+| Topology | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
+|----------|-------------|-----------|------------|-----------|--------|
+| 2 nodes x 8 GPUs NCCL 2.27.7 16G | 28.62 GB/s | 16G | 28.62 GB/s | >= 75 GB/s | FAIL |
+
+| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
+|----------|--------------|-----------------|------------------|-------------------|
+| 2 nodes x 8 GPUs NCCL 2.27.7 16G | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
+
+| Topology | Return Code | Error / Output Tail |
+|----------|-------------|---------------------|
+| 2 nodes x 8 GPUs NCCL 2.27.7 16G | 0 | E aikubeworker0016:1020609:1021756 [5] NCCL INFO comm 0x55f920e55d90 rank 13 nranks 16 cudaDev 5 busId ab000 - Destroy COMPLETE # Out of bounds values : 0 OK # Avg bus bandwidth    : 28.6222  # # Collective test concluded: alltoall_perf #   |
+
+**Overall: FAIL**
+
+---
+*Generated by GPU Test Suite v0.2.0*
\ No newline at end of file
diff --git a/reports_multinode_nccl_diagnosis_20260523.md b/reports_multinode_nccl_diagnosis_20260523.md
index 6468215..bc20b72 100644
--- a/reports_multinode_nccl_diagnosis_20260523.md
+++ b/reports_multinode_nccl_diagnosis_20260523.md
@@ -8,9 +8,11 @@
 
 ## 当前结论
 
-这不是单纯 “IB 不通” 的问题。底层 CUDA RDMA perftest 可以跑到接近单端口 400Gb/s 的水平，但 NCCL 在实际 2 节点通信时把 GPU Direct RDMA 禁用了，导致 NCCL 带宽显著低于验收阈值。
+这不是单纯 “IB 不通” 的问题。底层 CUDA RDMA perftest 可以跑到接近单端口 400Gb/s 的水平；最初使用 pip 包里的 NCCL 2.21.5 时，NCCL 在实际 2 节点通信中把 GPU Direct RDMA 禁用了，导致带宽显著偏低。
 
-同时，`nccl-gpu-2` 的 SSH 入口曾因未认证连接过多触发 `MaxStartups` 随机拒绝，导致 `mpirun` 拉起远端 rank 失败。已经做了临时 SSHD 缓解并拿到有效的 2 节点 x 8 GPU allreduce/alltoall 报告；当前剩余核心问题是 NCCL GDR 仍被禁用。
+后续临时切换到 apt 包解压出的 NCCL 2.27.7+cuda12.4 后，NCCL GDR 已经恢复启用，2 节点 x 8 GPU allreduce 从 `67.42 GB/s` 提升到 `237.86 GB/s`，alltoall 从 `9.56 GB/s` 提升到 `28.62 GB/s`。当前剩余问题不再是 GDR disabled，而是 GDR enabled 后仍低于当前配置里的验收阈值。
+
+同时，`nccl-gpu-2` 的 SSH 入口曾因未认证连接过多触发 `MaxStartups` 随机拒绝，导致 `mpirun` 拉起远端 rank 失败。已经做了临时 SSHD 缓解并拿到有效的 2 节点 x 8 GPU allreduce/alltoall 报告。
 
 ## 已完成的修正
 
@@ -22,6 +24,8 @@
 6. 增加诊断配置 `configs/multinode_nccl_diagnostic.yaml`，固定跑 2 节点 x 8 GPU、256M、`NCCL_DEBUG=INFO` 和 `NCCL_DEBUG_SUBSYS=INIT,NET`。
 7. 在 `nccl-gpu-2` 上临时提高 SSHD `MaxStartups` 并缩短 `LoginGraceTime`，缓解未认证连接过多导致的 SSH 随机拒绝。
 8. 将 OpenMPI OOB TCP 控制通道固定到 `bond0`，并加入 `plm_rsh_args`，减少 `mpirun` 远端启动受 SSH/host key/接口选择影响的概率。
+9. 从 NVIDIA apt 源下载但不安装 `libnccl2=2.27.7-1+cuda12.4`，解压到两台机器 `/tmp/nccl-2.27.7-cuda12.4`，用 `LD_LIBRARY_PATH` 临时覆盖 NCCL 运行库验证。
+10. 增强报告解析，能够区分 `GPU Direct RDMA ENABLED` 和 `DISABLED`，并列出 enabled/disabled HCA。
 
 ## 关键证据
 
@@ -59,6 +63,8 @@ CUDA_VISIBLE_DEVICES=0 ib_write_bw -d mlx5_0 -i 1 --use_cuda=0 --use_cuda_dmabuf
 
 ### 3. NCCL 单卡跨节点仍禁用 GDR
 
+以下尝试均在 pip NCCL 2.21.5 下进行。
+
 已经尝试：
 
 - `NCCL_NET_GDR_LEVEL=SYS`
@@ -77,9 +83,27 @@ NCCL INFO NET/IB : GPU Direct RDMA Disabled for HCA 0 'mlx5_0'
 
 256M allreduce 约 `13.4 GB/s`，明显低于 400Gb/s IB 端口能力。
 
+### 3.1 NCCL 2.27.7 恢复 GDR
+
+临时使用：
+
+```bash
+LD_LIBRARY_PATH=/usr/mpi/gcc/openmpi-4.1.9a1/lib:/tmp/nccl-2.27.7-cuda12.4/usr/lib/x86_64-linux-gnu:/usr/local/cuda-12.4/targets/x86_64-linux/lib
+```
+
+2 节点 x 1 GPU 日志显示：
+
+```text
+NCCL version 2.27.7+cuda12.4
+NET/IB : GPU Direct RDMA Enabled for HCA 0 'mlx5_0'
+Channel ... via NET/IB/0/GDRDMA
+```
+
+256M allreduce 从 NCCL 2.21.5 的约 `13.4 GB/s` 提升到 `45.2 GB/s`。判断：NCCL 2.21.5 与当前 driver/OFED/H100 组合存在 GDR 判定或注册路径兼容问题；升级 NCCL 是有效修复方向。
+
 ### 4. 脚本 2 节点 x 8 GPU 诊断结果
 
-原始报告：`reports_multinode_nccl_diagnostic_2x8_sshfix.md`
+原始报告：`reports_multinode_nccl_diagnostic_2x8_sshfix.md`，使用 pip NCCL 2.21.5。
 
 | Operation | Topology | Peak Bus BW | Threshold | Status | NCCL Network | GPU Direct RDMA |
 |-----------|----------|-------------|-----------|--------|--------------|-----------------|
@@ -94,6 +118,31 @@ allreduce 失败原因是带宽不达标，且报告捕获到 GDR 被 NCCL 禁
 
 allreduce 和 alltoall 本轮均正常完成，`returncode=0`、`wrong=0`，失败原因是带宽低于阈值，不是正确性失败。
 
+### 4.1 NCCL 2.27.7 诊断结果
+
+256M 诊断报告：`reports_multinode_nccl_diagnostic_2x8_nccl227_v2.md`
+
+| Operation | Topology | Peak Bus BW | Threshold | Status | NCCL Network | GPU Direct RDMA |
+|-----------|----------|-------------|-----------|--------|--------------|-----------------|
+| allreduce | 2 nodes x 8 GPUs | `212.19 GB/s` | `>= 480 GB/s` | FAIL | IB | ENABLED |
+| alltoall | 2 nodes x 8 GPUs | `28.37 GB/s` | `>= 75 GB/s` | FAIL | IB | ENABLED |
+
+1M 到 4G sweep 报告：`reports_multinode_nccl_sweep_2x8_nccl227.md`
+
+| Operation | Peak Bus BW | Peak Size | Threshold | Status | GPU Direct RDMA |
+|-----------|-------------|-----------|-----------|--------|-----------------|
+| allreduce | `237.26 GB/s` | `4G` | `>= 480 GB/s` | FAIL | ENABLED |
+| alltoall | `28.78 GB/s` | `1G` | `>= 75 GB/s` | FAIL | ENABLED |
+
+16G 大包报告：`reports_multinode_nccl_16g_2x8_nccl227.md`
+
+| Operation | Peak Bus BW | Peak Size | Threshold | Status | GPU Direct RDMA |
+|-----------|-------------|-----------|-----------|--------|-----------------|
+| allreduce | `237.86 GB/s` | `16G` | `>= 480 GB/s` | FAIL | ENABLED |
+| alltoall | `28.62 GB/s` | `16G` | `>= 75 GB/s` | FAIL | ENABLED |
+
+解释：NCCL 2.27.7 已经修复 GDR 禁用问题，且性能提升明显；但在当前跨节点/跨 Leaf 环境和当前阈值下仍不达标。allreduce 约稳定在 `238 GB/s`，alltoall 约稳定在 `28-29 GB/s`。
+
 ### 5. SSHD MaxStartups 阻塞已临时缓解
 
 `nccl-gpu-2` 曾显示：
@@ -109,13 +158,20 @@ maxstartups 10:30:100
 kex_exchange_identification: Connection closed by remote host
 ```
 
-已临时改为：
+先临时改为：
 
 ```text
 MaxStartups 120:30:240
 LoginGraceTime 20
 ```
 
+后续外部未认证连接继续上涨到 `110 of 120-240 startups`，测试窗口进一步临时改为：
+
+```text
+MaxStartups 500:30:1000
+LoginGraceTime 5
+```
+
 改完后从 0012 连续 SSH 0016 5 次成功，2 节点 `mpirun hostname` 成功，2 节点 x 8 GPU allreduce/alltoall 也都能跑出有效结果。
 
 ### 6. `nvidia_peermem` legacy 模式实验无效
@@ -139,18 +195,27 @@ NET/IB : GPU Direct RDMA Disabled for HCA 0 'mlx5_0'
 
 ## 当前阻塞
 
-### 阻塞 1：NCCL 禁用 GPU Direct RDMA
+### 阻塞 1：当前生产 NCCL 版本过旧，GDR 被禁用
 
 现象：
 
-- IB 能被 NCCL 识别：`Using network IB`
-- 400Gb/s HCA 被 NCCL 选中：`mlx5_0, mlx5_1, mlx5_6, mlx5_7`
-- 但 NCCL 明确禁用 GDR：`GPU Direct RDMA Disabled`
-- perftest 的经典 CUDA RDMA 又能跑到 `387.16 Gb/s`
+- pip NCCL 2.21.5：`GPU Direct RDMA Disabled`，2x8 allreduce `67.42 GB/s`
+- 临时 NCCL 2.27.7：`GPU Direct RDMA Enabled`，2x8 allreduce `237.86 GB/s`
+- 因此，生产测试环境应避免继续使用 pip NCCL 2.21.5 作为多机 NCCL 验收运行库
 
-判断：底层 RDMA 能力存在，但 NCCL 的 GDR 判定/注册路径没有打通。优先排查 NCCL 与 NVIDIA driver、OFED、`nvidia_peermem`、NCCL net plugin/内部 IB 后端之间的兼容性。
+判断：底层 RDMA 能力存在，GDR 禁用主要由旧 NCCL 版本触发。建议正式安装并固定 NCCL 2.27.7+cuda12.4 或更新的已验证版本。
 
-### 阻塞 2：`nccl-gpu-2` SSH 存在外部连接压力
+### 阻塞 2：GDR enabled 后带宽仍低于当前阈值
+
+现象：
+
+- 2x8 16G allreduce：`237.86 GB/s`，阈值 `>= 480 GB/s`
+- 2x8 16G alltoall：`28.62 GB/s`，阈值 `>= 75 GB/s`
+- 已使用 4 个 400Gb/s HCA：`mlx5_0, mlx5_1, mlx5_6, mlx5_7`
+
+判断：需要确认当前 PDF/config 阈值是否适用于跨 Leaf 两节点场景；如果阈值确实要求跨 Leaf 也达到这些数值，则还需要继续查链路聚合、多 rail 使用、交换网络、NCCL net plugin/SHARP 或 rail mapping。
+
+### 阻塞 3：`nccl-gpu-2` SSH 存在外部连接压力
 
 现象：
 
@@ -164,13 +229,19 @@ NET/IB : GPU Direct RDMA Disabled for HCA 0 'mlx5_0'
 ## 建议下一步
 
 1. 从网络/安全侧处理 `172.239.10.85` 等来源的 SSH 未认证连接压力，或者保留更高的 `MaxStartups` 配置作为测试窗口临时策略。
-2. 尝试安装或启用匹配当前 OFED/driver 的 NCCL net plugin；当前日志显示 `No plugin found (libnccl-net.so)`，NCCL 使用的是 internal network plugin。
-3. 用同版本软件栈补测 `nccl-tests` + NCCL net plugin 后的 GDR 状态，核心判据是报告里 `GPU Direct RDMA` 从 `DISABLED` 变成未禁用，且 2x8 带宽显著抬升。
-4. 如果仍禁用 GDR，再继续查 NVIDIA driver 580.159.03、OFED 26.01、NCCL 2.21.5 与 H100/IB NDR 组合的兼容矩阵。
-5. GDR 修复后重跑完整多机配置：2 节点 x 8 GPU，至少覆盖 `all_reduce_perf` 和 `alltoall_perf`，消息大小从 `1K` 到 `16G`。
+2. 正式安装并固定已验证的 NCCL 2.27.7+cuda12.4 或更新版本，不要依赖 pip NCCL 2.21.5；当前 `/tmp/nccl-2.27.7-cuda12.4` 只是临时解压验证。
+3. 尝试安装或启用匹配当前 OFED/driver 的 NCCL net plugin/SHARP；当前日志显示 `Could not find: libnccl-net.so`，NCCL 使用的是 internal IB plugin。
+4. 核对跨 Leaf 链路的 rail mapping、交换机端口速率、路由和拥塞计数，确认 4 个 400Gb/s HCA 是否都在跨节点通信中充分利用。
+5. 确认当前 `allreduce >= 480 GB/s`、`alltoall >= 75 GB/s` 阈值是否应直接用于跨 Leaf 两节点场景；如果是，继续按链路和 NCCL rail 聚合方向排查。
 
 ## 当前可交付物
 
 - `configs/multinode_nccl_diagnostic.yaml`：多机多卡诊断配置
+- `configs/multinode_nccl_nccl227_diagnostic.yaml`：NCCL 2.27.7 256M 诊断配置
+- `configs/multinode_nccl_nccl227_sweep.yaml`：NCCL 2.27.7 1M 到 4G sweep 配置
+- `configs/multinode_nccl_nccl227_16g.yaml`：NCCL 2.27.7 16G 大包配置
 - `reports_multinode_nccl_diagnostic_2x8_sshfix.md`：脚本生成的原始 2x8 诊断报告
+- `reports_multinode_nccl_diagnostic_2x8_nccl227_v2.md`：NCCL 2.27.7 256M 诊断报告
+- `reports_multinode_nccl_sweep_2x8_nccl227.md`：NCCL 2.27.7 1M 到 4G sweep 报告
+- `reports_multinode_nccl_16g_2x8_nccl227.md`：NCCL 2.27.7 16G 大包报告
 - `reports_multinode_nccl_diagnosis_20260523.md`：本中文诊断总结
diff --git a/reports_multinode_nccl_diagnostic_2x8_nccl227_v2.md b/reports_multinode_nccl_diagnostic_2x8_nccl227_v2.md
new file mode 100644
index 0000000..1b188d5
--- /dev/null
+++ b/reports_multinode_nccl_diagnostic_2x8_nccl227_v2.md
@@ -0,0 +1,66 @@
+# GPU Test Report
+
+- **Date:** 2026-05-23T07:53:24.460277
+- **Host:** aikubeworker0012
+
+## Overall Acceptance Verdict
+
+**Result: FAIL**
+
+Missing required evidence:
+- GPU Info
+- Health Check
+- Memory Bandwidth
+- Compute Throughput
+- NVLink/NVSwitch
+- NCCL
+- Stress Test
+- RDMA
+- DCGM
+- Training
+
+## Summary
+
+| Test | Result |
+|------|--------|
+| Multi-node NCCL | FAIL |
+
+## Multi-node NCCL / Cross Leaf
+
+Source: nccl-tests-mpirun | Mode: diagnostic-nccl-2.27.7
+
+- **Hosts:** nccl-gpu-1(172.72.8.12), nccl-gpu-2(172.72.8.16)
+- **Preflight:** PASS
+
+### Multi-node NCCL allreduce
+
+| Topology | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
+|----------|-------------|-----------|------------|-----------|--------|
+| 2 nodes x 8 GPUs NCCL 2.27.7 | 212.19 GB/s | 256M | 211.75 GB/s | >= 480 GB/s | FAIL |
+
+| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
+|----------|--------------|-----------------|------------------|-------------------|
+| 2 nodes x 8 GPUs NCCL 2.27.7 | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
+
+| Topology | Return Code | Error / Output Tail |
+|----------|-------------|---------------------|
+| 2 nodes x 8 GPUs NCCL 2.27.7 | 0 | 0016:1009332:1009965 [2] NCCL INFO comm 0x56388eec2e40 rank 10 nranks 16 cudaDev 2 busId 3a000 - Destroy COMPLETE aikubeworker0012:2144366:2144531 [5] NCCL INFO comm 0x556e4fcf5280 rank 5 nranks 16 cudaDev 5 busId ab000 - Destroy COMPLETE   |
+
+### Multi-node NCCL alltoall
+
+| Topology | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
+|----------|-------------|-----------|------------|-----------|--------|
+| 2 nodes x 8 GPUs NCCL 2.27.7 | 28.37 GB/s | 256M | 28.32 GB/s | >= 75 GB/s | FAIL |
+
+| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
+|----------|--------------|-----------------|------------------|-------------------|
+| 2 nodes x 8 GPUs NCCL 2.27.7 | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
+
+| Topology | Return Code | Error / Output Tail |
+|----------|-------------|---------------------|
+| 2 nodes x 8 GPUs NCCL 2.27.7 | 0 | 0012:2144547:2144713 [4] NCCL INFO comm 0x55896a1dae20 rank 4 nranks 16 cudaDev 4 busId 9a000 - Destroy COMPLETE aikubeworker0016:1010164:1010881 [2] NCCL INFO comm 0x565344db7790 rank 10 nranks 16 cudaDev 2 busId 3a000 - Destroy COMPLETE   |
+
+**Overall: FAIL**
+
+---
+*Generated by GPU Test Suite v0.2.0*
\ No newline at end of file
diff --git a/reports_multinode_nccl_sweep_2x8_nccl227.md b/reports_multinode_nccl_sweep_2x8_nccl227.md
new file mode 100644
index 0000000..701492b
--- /dev/null
+++ b/reports_multinode_nccl_sweep_2x8_nccl227.md
@@ -0,0 +1,66 @@
+# GPU Test Report
+
+- **Date:** 2026-05-23T07:54:48.990378
+- **Host:** aikubeworker0012
+
+## Overall Acceptance Verdict
+
+**Result: FAIL**
+
+Missing required evidence:
+- GPU Info
+- Health Check
+- Memory Bandwidth
+- Compute Throughput
+- NVLink/NVSwitch
+- NCCL
+- Stress Test
+- RDMA
+- DCGM
+- Training
+
+## Summary
+
+| Test | Result |
+|------|--------|
+| Multi-node NCCL | FAIL |
+
+## Multi-node NCCL / Cross Leaf
+
+Source: nccl-tests-mpirun | Mode: sweep-nccl-2.27.7
+
+- **Hosts:** nccl-gpu-1(172.72.8.12), nccl-gpu-2(172.72.8.16)
+- **Preflight:** PASS
+
+### Multi-node NCCL allreduce
+
+| Topology | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
+|----------|-------------|-----------|------------|-----------|--------|
+| 2 nodes x 8 GPUs NCCL 2.27.7 sweep | 237.26 GB/s | 4G | 150.62 GB/s | >= 480 GB/s | FAIL |
+
+| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
+|----------|--------------|-----------------|------------------|-------------------|
+| 2 nodes x 8 GPUs NCCL 2.27.7 sweep | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
+
+| Topology | Return Code | Error / Output Tail |
+|----------|-------------|---------------------|
+| 2 nodes x 8 GPUs NCCL 2.27.7 sweep | 0 |  aikubeworker0012:2145024:2145189 [0] NCCL INFO comm 0x561f7dc1f780 rank 0 nranks 16 cudaDev 0 busId 18000 - Destroy COMPLETE # Out of bounds values : 0 OK # Avg bus bandwidth    : 150.624  # # Collective test concluded: all_reduce_perf #   |
+
+### Multi-node NCCL alltoall
+
+| Topology | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
+|----------|-------------|-----------|------------|-----------|--------|
+| 2 nodes x 8 GPUs NCCL 2.27.7 sweep | 28.78 GB/s | 1G | 23.57 GB/s | >= 75 GB/s | FAIL |
+
+| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
+|----------|--------------|-----------------|------------------|-------------------|
+| 2 nodes x 8 GPUs NCCL 2.27.7 sweep | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
+
+| Topology | Return Code | Error / Output Tail |
+|----------|-------------|---------------------|
+| 2 nodes x 8 GPUs NCCL 2.27.7 sweep | 0 | r0012:2145213:2145384 [7] NCCL INFO comm 0x558d54228110 rank 7 nranks 16 cudaDev 7 busId db000 - Destroy COMPLETE aikubeworker0016:1014703:1015544 [0] NCCL INFO comm 0x55ed6d99d8e0 rank 8 nranks 16 cudaDev 0 busId 18000 - Destroy COMPLETE   |
+
+**Overall: FAIL**
+
+---
+*Generated by GPU Test Suite v0.2.0*
\ No newline at end of file
-- 
2.47.2


From 6c9f049b71c39f95bb1ddd5268e323c8fa133494 Mon Sep 17 00:00:00 2001
From: cs <shi.chen@robotics.cc>
Date: Sat, 23 May 2026 16:12:32 +0800
Subject: [PATCH 06/41] Tune multinode NCCL auto parameters

---
 configs/default.yaml                          |  6 +-
 configs/multinode_nccl_diagnostic.yaml        |  6 +-
 configs/multinode_nccl_nccl227_16g.yaml       |  6 +-
 configs/multinode_nccl_nccl227_auto_16g.yaml  | 62 +++++++++++++++++
 .../multinode_nccl_nccl227_diagnostic.yaml    |  6 +-
 configs/multinode_nccl_nccl227_sweep.yaml     |  6 +-
 ...rts_multinode_nccl_16g_2x8_nccl227_auto.md | 66 +++++++++++++++++++
 reports_multinode_nccl_diagnosis_20260523.md  | 54 ++++++++++++---
 8 files changed, 187 insertions(+), 25 deletions(-)
 create mode 100644 configs/multinode_nccl_nccl227_auto_16g.yaml
 create mode 100644 reports_multinode_nccl_16g_2x8_nccl227_auto.md

diff --git a/configs/default.yaml b/configs/default.yaml
index b3956a4..cd214e4 100644
--- a/configs/default.yaml
+++ b/configs/default.yaml
@@ -87,11 +87,11 @@ multinode_nccl:
   ib_tc: 136
   ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7
   ib_timeout: 22
-  qps_per_connection: 4
-  min_nchannels: 4
+  qps_per_connection: null
+  min_nchannels: null
   net_plugin: none
   nvls_enable: 1
-  split_data_on_qps: 1
+  split_data_on_qps: null
   extra_env: {}
   min_peak_busbw_gbps:
     allreduce: 480
diff --git a/configs/multinode_nccl_diagnostic.yaml b/configs/multinode_nccl_diagnostic.yaml
index 3741b37..0e6479d 100644
--- a/configs/multinode_nccl_diagnostic.yaml
+++ b/configs/multinode_nccl_diagnostic.yaml
@@ -47,11 +47,11 @@ multinode_nccl:
   ib_tc: 136
   ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7
   ib_timeout: 22
-  qps_per_connection: 4
-  min_nchannels: 4
+  qps_per_connection: null
+  min_nchannels: null
   net_plugin: none
   nvls_enable: 1
-  split_data_on_qps: 1
+  split_data_on_qps: null
   extra_env:
     NCCL_DEBUG_SUBSYS: INIT,NET
     NCCL_NET_GDR_LEVEL: 5
diff --git a/configs/multinode_nccl_nccl227_16g.yaml b/configs/multinode_nccl_nccl227_16g.yaml
index e7b718f..c5552fe 100644
--- a/configs/multinode_nccl_nccl227_16g.yaml
+++ b/configs/multinode_nccl_nccl227_16g.yaml
@@ -47,11 +47,11 @@ multinode_nccl:
   ib_tc: 136
   ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7
   ib_timeout: 22
-  qps_per_connection: 4
-  min_nchannels: 4
+  qps_per_connection: null
+  min_nchannels: null
   net_plugin: none
   nvls_enable: 1
-  split_data_on_qps: 1
+  split_data_on_qps: null
   extra_env:
     NCCL_DEBUG_SUBSYS: INIT,NET
     NCCL_NET_GDR_LEVEL: 5
diff --git a/configs/multinode_nccl_nccl227_auto_16g.yaml b/configs/multinode_nccl_nccl227_auto_16g.yaml
new file mode 100644
index 0000000..2492989
--- /dev/null
+++ b/configs/multinode_nccl_nccl227_auto_16g.yaml
@@ -0,0 +1,62 @@
+tools:
+  install_dir: /opt/gpu-test-tools
+
+report:
+  output_dir: ./reports
+  format: md
+
+multinode_nccl:
+  enabled: true
+  mode: large-message-nccl-2.27.7-auto
+  hosts:
+    - name: nccl-gpu-1
+      addr: 172.72.8.12
+      slots: 8
+    - name: nccl-gpu-2
+      addr: 172.72.8.16
+      slots: 8
+  ssh_user: root
+  ssh_preflight: true
+  mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun
+  mpi_ld_preload: null
+  extra_ld_library_path:
+    - /usr/mpi/gcc/openmpi-4.1.9a1/lib
+    - /tmp/nccl-2.27.7-cuda12.4/usr/lib/x86_64-linux-gnu
+    - /usr/local/cuda-12.4/targets/x86_64-linux/lib
+  nccl_tests_dir: null
+  tests:
+    - all_reduce_perf
+    - alltoall_perf
+  topologies:
+    - nodes: 2
+      gpus_per_node: 8
+      label: 2 nodes x 8 GPUs NCCL 2.27.7 auto 16G
+  begin_size: 16G
+  end_size: 16G
+  step_factor: 2
+  warmup_iters: 1
+  iters: 3
+  gpus_per_rank: 1
+  timeout_sec: 1200
+  debug: INFO
+  socket_ifname: bond0
+  oob_tcp_ifname: bond0
+  plm_rsh_args: "-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o ServerAliveInterval=30"
+  ib_gid_index: 3
+  ib_sl: 5
+  ib_tc: 136
+  ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7
+  ib_timeout: 22
+  qps_per_connection: null
+  min_nchannels: null
+  net_plugin: none
+  nvls_enable: 1
+  split_data_on_qps: null
+  extra_env:
+    NCCL_DEBUG_SUBSYS: INIT,NET
+    NCCL_NET_GDR_LEVEL: 5
+    NCCL_NET_GDR_READ: 1
+    NCCL_DMABUF_ENABLE: 0
+  min_peak_busbw_gbps:
+    allreduce: 480
+    alltoall: 75
diff --git a/configs/multinode_nccl_nccl227_diagnostic.yaml b/configs/multinode_nccl_nccl227_diagnostic.yaml
index 8a769ad..5465772 100644
--- a/configs/multinode_nccl_nccl227_diagnostic.yaml
+++ b/configs/multinode_nccl_nccl227_diagnostic.yaml
@@ -47,11 +47,11 @@ multinode_nccl:
   ib_tc: 136
   ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7
   ib_timeout: 22
-  qps_per_connection: 4
-  min_nchannels: 4
+  qps_per_connection: null
+  min_nchannels: null
   net_plugin: none
   nvls_enable: 1
-  split_data_on_qps: 1
+  split_data_on_qps: null
   extra_env:
     NCCL_DEBUG_SUBSYS: INIT,NET
     NCCL_NET_GDR_LEVEL: 5
diff --git a/configs/multinode_nccl_nccl227_sweep.yaml b/configs/multinode_nccl_nccl227_sweep.yaml
index 3dcbf36..da96ef1 100644
--- a/configs/multinode_nccl_nccl227_sweep.yaml
+++ b/configs/multinode_nccl_nccl227_sweep.yaml
@@ -47,11 +47,11 @@ multinode_nccl:
   ib_tc: 136
   ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7
   ib_timeout: 22
-  qps_per_connection: 4
-  min_nchannels: 4
+  qps_per_connection: null
+  min_nchannels: null
   net_plugin: none
   nvls_enable: 1
-  split_data_on_qps: 1
+  split_data_on_qps: null
   extra_env:
     NCCL_DEBUG_SUBSYS: INIT,NET
     NCCL_NET_GDR_LEVEL: 5
diff --git a/reports_multinode_nccl_16g_2x8_nccl227_auto.md b/reports_multinode_nccl_16g_2x8_nccl227_auto.md
new file mode 100644
index 0000000..0481813
--- /dev/null
+++ b/reports_multinode_nccl_16g_2x8_nccl227_auto.md
@@ -0,0 +1,66 @@
+# GPU Test Report
+
+- **Date:** 2026-05-23T08:09:56.340954
+- **Host:** aikubeworker0012
+
+## Overall Acceptance Verdict
+
+**Result: FAIL**
+
+Missing required evidence:
+- GPU Info
+- Health Check
+- Memory Bandwidth
+- Compute Throughput
+- NVLink/NVSwitch
+- NCCL
+- Stress Test
+- RDMA
+- DCGM
+- Training
+
+## Summary
+
+| Test | Result |
+|------|--------|
+| Multi-node NCCL | FAIL |
+
+## Multi-node NCCL / Cross Leaf
+
+Source: nccl-tests-mpirun | Mode: large-message-nccl-2.27.7-auto
+
+- **Hosts:** nccl-gpu-1(172.72.8.12), nccl-gpu-2(172.72.8.16)
+- **Preflight:** PASS
+
+### Multi-node NCCL allreduce
+
+| Topology | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
+|----------|-------------|-----------|------------|-----------|--------|
+| 2 nodes x 8 GPUs NCCL 2.27.7 auto 16G | 354.60 GB/s | 16G | 354.57 GB/s | >= 480 GB/s | FAIL |
+
+| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
+|----------|--------------|-----------------|------------------|-------------------|
+| 2 nodes x 8 GPUs NCCL 2.27.7 auto 16G | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
+
+| Topology | Return Code | Error / Output Tail |
+|----------|-------------|---------------------|
+| 2 nodes x 8 GPUs NCCL 2.27.7 auto 16G | 0 | 0012:2149404:2149572 [7] NCCL INFO comm 0x560bd3541a30 rank 7 nranks 16 cudaDev 7 busId db000 - Destroy COMPLETE aikubeworker0016:1066162:1066981 [5] NCCL INFO comm 0x55e73208e200 rank 13 nranks 16 cudaDev 5 busId ab000 - Destroy COMPLETE   |
+
+### Multi-node NCCL alltoall
+
+| Topology | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
+|----------|-------------|-----------|------------|-----------|--------|
+| 2 nodes x 8 GPUs NCCL 2.27.7 auto 16G | 30.01 GB/s | 16G | 30.02 GB/s | >= 75 GB/s | FAIL |
+
+| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
+|----------|--------------|-----------------|------------------|-------------------|
+| 2 nodes x 8 GPUs NCCL 2.27.7 auto 16G | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
+
+| Topology | Return Code | Error / Output Tail |
+|----------|-------------|---------------------|
+| 2 nodes x 8 GPUs NCCL 2.27.7 auto 16G | 0 | r0012:2149589:2149764 [7] NCCL INFO comm 0x55fef234b7c0 rank 7 nranks 16 cudaDev 7 busId db000 - Destroy COMPLETE aikubeworker0012:2149588:2149765 [6] NCCL INFO comm 0x5637718f1dd0 rank 6 nranks 16 cudaDev 6 busId ba000 - Destroy COMPLETE   |
+
+**Overall: FAIL**
+
+---
+*Generated by GPU Test Suite v0.2.0*
\ No newline at end of file
diff --git a/reports_multinode_nccl_diagnosis_20260523.md b/reports_multinode_nccl_diagnosis_20260523.md
index bc20b72..79325a3 100644
--- a/reports_multinode_nccl_diagnosis_20260523.md
+++ b/reports_multinode_nccl_diagnosis_20260523.md
@@ -3,14 +3,16 @@
 - 日期：2026-05-23
 - 测试入口：`nccl-gpu-1` / `aikubeworker0012` / `172.72.8.12`
 - 对端节点：`nccl-gpu-2` / `aikubeworker0016` / `172.72.8.16`
-- 诊断配置：`configs/multinode_nccl_diagnostic.yaml`
-- 原始脚本报告：`reports_multinode_nccl_diagnostic_2x8_sshfix.md`
+- 诊断配置：`configs/multinode_nccl_nccl227_auto_16g.yaml`
+- 当前最佳原始脚本报告：`reports_multinode_nccl_16g_2x8_nccl227_auto.md`
 
 ## 当前结论
 
 这不是单纯 “IB 不通” 的问题。底层 CUDA RDMA perftest 可以跑到接近单端口 400Gb/s 的水平；最初使用 pip 包里的 NCCL 2.21.5 时，NCCL 在实际 2 节点通信中把 GPU Direct RDMA 禁用了，导致带宽显著偏低。
 
-后续临时切换到 apt 包解压出的 NCCL 2.27.7+cuda12.4 后，NCCL GDR 已经恢复启用，2 节点 x 8 GPU allreduce 从 `67.42 GB/s` 提升到 `237.86 GB/s`，alltoall 从 `9.56 GB/s` 提升到 `28.62 GB/s`。当前剩余问题不再是 GDR disabled，而是 GDR enabled 后仍低于当前配置里的验收阈值。
+后续临时切换到 apt 包解压出的 NCCL 2.27.7+cuda12.4 后，NCCL GDR 已经恢复启用，2 节点 x 8 GPU allreduce 从 `67.42 GB/s` 提升到 `237.86 GB/s`，alltoall 从 `9.56 GB/s` 提升到 `28.62 GB/s`。
+
+继续 tuning 后发现，配置里固定的 `NCCL_MIN_NCHANNELS=4`、`NCCL_IB_QPS_PER_CONNECTION=4`、`NCCL_IB_SPLIT_DATA_ON_QPS=1` 会明显压低 16G allreduce。去掉这些固定参数、让 NCCL 2.27.7 自动选择后，正式脚本报告中 2 节点 x 8 GPU allreduce 提升到 `354.60 GB/s`，alltoall 小幅提升到 `30.01 GB/s`。当前剩余问题不再是 GDR disabled，而是 GDR enabled 且 NCCL 自动调参后，仍低于当前配置里的验收阈值。
 
 同时，`nccl-gpu-2` 的 SSH 入口曾因未认证连接过多触发 `MaxStartups` 随机拒绝，导致 `mpirun` 拉起远端 rank 失败。已经做了临时 SSHD 缓解并拿到有效的 2 节点 x 8 GPU allreduce/alltoall 报告。
 
@@ -26,6 +28,7 @@
 8. 将 OpenMPI OOB TCP 控制通道固定到 `bond0`，并加入 `plm_rsh_args`，减少 `mpirun` 远端启动受 SSH/host key/接口选择影响的概率。
 9. 从 NVIDIA apt 源下载但不安装 `libnccl2=2.27.7-1+cuda12.4`，解压到两台机器 `/tmp/nccl-2.27.7-cuda12.4`，用 `LD_LIBRARY_PATH` 临时覆盖 NCCL 运行库验证。
 10. 增强报告解析，能够区分 `GPU Direct RDMA ENABLED` 和 `DISABLED`，并列出 enabled/disabled HCA。
+11. 将 multi-node NCCL 配置中的 `qps_per_connection`、`min_nchannels`、`split_data_on_qps` 改为 `null`，避免默认导出会压低大包 allreduce 的固定 NCCL 参数。
 
 ## 关键证据
 
@@ -141,7 +144,35 @@ allreduce 和 alltoall 本轮均正常完成，`returncode=0`、`wrong=0`，失
 | allreduce | `237.86 GB/s` | `16G` | `>= 480 GB/s` | FAIL | ENABLED |
 | alltoall | `28.62 GB/s` | `16G` | `>= 75 GB/s` | FAIL | ENABLED |
 
-解释：NCCL 2.27.7 已经修复 GDR 禁用问题，且性能提升明显；但在当前跨节点/跨 Leaf 环境和当前阈值下仍不达标。allreduce 约稳定在 `238 GB/s`，alltoall 约稳定在 `28-29 GB/s`。
+解释：NCCL 2.27.7 已经修复 GDR 禁用问题，且性能提升明显；但在固定 `min_nchannels=4/qps=4/split=1` 的配置下仍不达标。allreduce 约稳定在 `238 GB/s`，alltoall 约稳定在 `28-29 GB/s`。
+
+### 4.2 NCCL 2.27.7 自动通道/QP 参数结果
+
+进一步对 16G 大包做 tuning，发现默认配置里锁定的参数会压低 allreduce：
+
+| 配置 | allreduce Avg Bus BW | alltoall Avg Bus BW | 结论 |
+|------|----------------------|---------------------|------|
+| NCCL 2.27.7 + 固定 `min_nchannels=4/qps=4/split=1` | `238.56 GB/s` | `28.62 GB/s` | GDR 已启用，但 allreduce 被压低 |
+| NCCL 2.27.7 + NCCL 自动选择 channel/QP | `354.57 GB/s` | `30.02 GB/s` | 当前最佳脚本结果 |
+
+正式脚本报告：`reports_multinode_nccl_16g_2x8_nccl227_auto.md`
+
+| Operation | Peak Bus BW | Avg Bus BW | Peak Size | Threshold | Status | GPU Direct RDMA |
+|-----------|-------------|------------|-----------|-----------|--------|-----------------|
+| allreduce | `354.60 GB/s` | `354.57 GB/s` | `16G` | `>= 480 GB/s` | FAIL | ENABLED |
+| alltoall | `30.01 GB/s` | `30.02 GB/s` | `16G` | `>= 75 GB/s` | FAIL | ENABLED |
+
+对比临时 tuning 命令：
+
+| 变量组合 | allreduce Avg Bus BW | alltoall Avg Bus BW |
+|----------|----------------------|---------------------|
+| baseline auto | `353.63 GB/s` | `30.05 GB/s` |
+| `NCCL_IB_MERGE_NICS=1` | `352.73 GB/s` | `30.07 GB/s` |
+| `NCCL_CROSS_NIC=1` | `354.68 GB/s` | `30.05 GB/s` |
+| `NCCL_IB_QPS_PER_CONNECTION=8` + `NCCL_IB_SPLIT_DATA_ON_QPS=0` | `350.91 GB/s` | `29.41 GB/s` |
+| `NCCL_MIN_NCHANNELS=16` + `NCCL_MAX_NCHANNELS=16` | `354.32 GB/s` | `30.06 GB/s` |
+
+解释：allreduce 的主要提升来自取消不合适的固定参数，而不是 `MERGE_NICS` 或 `CROSS_NIC`。alltoall 对这些参数不敏感，当前基本稳定在 `30 GB/s` 左右。
 
 ### 5. SSHD MaxStartups 阻塞已临时缓解
 
@@ -205,12 +236,12 @@ NET/IB : GPU Direct RDMA Disabled for HCA 0 'mlx5_0'
 
 判断：底层 RDMA 能力存在，GDR 禁用主要由旧 NCCL 版本触发。建议正式安装并固定 NCCL 2.27.7+cuda12.4 或更新的已验证版本。
 
-### 阻塞 2：GDR enabled 后带宽仍低于当前阈值
+### 阻塞 2：GDR enabled 且 NCCL 自动调参后带宽仍低于当前阈值
 
 现象：
 
-- 2x8 16G allreduce：`237.86 GB/s`，阈值 `>= 480 GB/s`
-- 2x8 16G alltoall：`28.62 GB/s`，阈值 `>= 75 GB/s`
+- 2x8 16G allreduce：`354.60 GB/s`，阈值 `>= 480 GB/s`
+- 2x8 16G alltoall：`30.01 GB/s`，阈值 `>= 75 GB/s`
 - 已使用 4 个 400Gb/s HCA：`mlx5_0, mlx5_1, mlx5_6, mlx5_7`
 
 判断：需要确认当前 PDF/config 阈值是否适用于跨 Leaf 两节点场景；如果阈值确实要求跨 Leaf 也达到这些数值，则还需要继续查链路聚合、多 rail 使用、交换网络、NCCL net plugin/SHARP 或 rail mapping。
@@ -230,9 +261,10 @@ NET/IB : GPU Direct RDMA Disabled for HCA 0 'mlx5_0'
 
 1. 从网络/安全侧处理 `172.239.10.85` 等来源的 SSH 未认证连接压力，或者保留更高的 `MaxStartups` 配置作为测试窗口临时策略。
 2. 正式安装并固定已验证的 NCCL 2.27.7+cuda12.4 或更新版本，不要依赖 pip NCCL 2.21.5；当前 `/tmp/nccl-2.27.7-cuda12.4` 只是临时解压验证。
-3. 尝试安装或启用匹配当前 OFED/driver 的 NCCL net plugin/SHARP；当前日志显示 `Could not find: libnccl-net.so`，NCCL 使用的是 internal IB plugin。
-4. 核对跨 Leaf 链路的 rail mapping、交换机端口速率、路由和拥塞计数，确认 4 个 400Gb/s HCA 是否都在跨节点通信中充分利用。
-5. 确认当前 `allreduce >= 480 GB/s`、`alltoall >= 75 GB/s` 阈值是否应直接用于跨 Leaf 两节点场景；如果是，继续按链路和 NCCL rail 聚合方向排查。
+3. multi-node NCCL 默认不要固定 `NCCL_MIN_NCHANNELS=4`、`NCCL_IB_QPS_PER_CONNECTION=4`、`NCCL_IB_SPLIT_DATA_ON_QPS=1`；当前脚本配置已改成 `null`，让 NCCL 自动选择。
+4. 尝试安装或启用匹配当前 OFED/driver 的 NCCL net plugin/SHARP；当前日志显示 `Could not find: libnccl-net.so`，NCCL 使用的是 internal IB plugin。
+5. 核对跨 Leaf 链路的 rail mapping、交换机端口速率、路由和拥塞计数，确认 4 个 400Gb/s HCA 是否都在跨节点通信中充分利用。
+6. 确认当前 `allreduce >= 480 GB/s`、`alltoall >= 75 GB/s` 阈值是否应直接用于跨 Leaf 两节点场景；如果是，继续按链路和 NCCL rail 聚合方向排查。
 
 ## 当前可交付物
 
@@ -240,8 +272,10 @@ NET/IB : GPU Direct RDMA Disabled for HCA 0 'mlx5_0'
 - `configs/multinode_nccl_nccl227_diagnostic.yaml`：NCCL 2.27.7 256M 诊断配置
 - `configs/multinode_nccl_nccl227_sweep.yaml`：NCCL 2.27.7 1M 到 4G sweep 配置
 - `configs/multinode_nccl_nccl227_16g.yaml`：NCCL 2.27.7 16G 大包配置
+- `configs/multinode_nccl_nccl227_auto_16g.yaml`：NCCL 2.27.7 16G 自动 channel/QP 配置
 - `reports_multinode_nccl_diagnostic_2x8_sshfix.md`：脚本生成的原始 2x8 诊断报告
 - `reports_multinode_nccl_diagnostic_2x8_nccl227_v2.md`：NCCL 2.27.7 256M 诊断报告
 - `reports_multinode_nccl_sweep_2x8_nccl227.md`：NCCL 2.27.7 1M 到 4G sweep 报告
 - `reports_multinode_nccl_16g_2x8_nccl227.md`：NCCL 2.27.7 16G 大包报告
+- `reports_multinode_nccl_16g_2x8_nccl227_auto.md`：NCCL 2.27.7 16G 自动 channel/QP 原始报告
 - `reports_multinode_nccl_diagnosis_20260523.md`：本中文诊断总结
-- 
2.47.2


From aa05ccab2e7535e1d465d84a08df69a3d2684bd9 Mon Sep 17 00:00:00 2001
From: cs <shi.chen@robotics.cc>
Date: Sat, 23 May 2026 16:35:24 +0800
Subject: [PATCH 07/41] Add NCCL PDF matrix topology report

---
 .../multinode_nccl_nccl227_pdf_matrix.yaml    | 88 +++++++++++++++++++
 modules/report.py                             |  7 +-
 reports_multinode_nccl_diagnosis_20260523.md  | 71 +++++++++++++--
 reports_multinode_nccl_pdf_matrix_nccl227.md  | 83 +++++++++++++++++
 4 files changed, 238 insertions(+), 11 deletions(-)
 create mode 100644 configs/multinode_nccl_nccl227_pdf_matrix.yaml
 create mode 100644 reports_multinode_nccl_pdf_matrix_nccl227.md

diff --git a/configs/multinode_nccl_nccl227_pdf_matrix.yaml b/configs/multinode_nccl_nccl227_pdf_matrix.yaml
new file mode 100644
index 0000000..34ce13e
--- /dev/null
+++ b/configs/multinode_nccl_nccl227_pdf_matrix.yaml
@@ -0,0 +1,88 @@
+tools:
+  install_dir: /opt/gpu-test-tools
+
+report:
+  output_dir: ./reports
+  format: md
+
+multinode_nccl:
+  enabled: true
+  mode: cross-leaf-pdf-matrix-nccl-2.27.7
+  hosts:
+    - name: nccl-gpu-1
+      addr: 172.72.8.12
+      slots: 8
+    - name: nccl-gpu-2
+      addr: 172.72.8.16
+      slots: 8
+  ssh_user: root
+  ssh_preflight: true
+  mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun
+  mpi_ld_preload: null
+  extra_ld_library_path:
+    - /usr/mpi/gcc/openmpi-4.1.9a1/lib
+    - /tmp/nccl-2.27.7-cuda12.4/usr/lib/x86_64-linux-gnu
+    - /usr/local/cuda-12.4/targets/x86_64-linux/lib
+  nccl_tests_dir: null
+  tests:
+    - all_reduce_perf
+    - alltoall_perf
+  topologies:
+    - nodes: 2
+      gpus_per_node: 1
+      label: 2 nodes x 1 GPU (PDF 2 machines 2 GPUs)
+      min_peak_busbw_gbps:
+        allreduce: 48.90
+        alltoall: 27.25
+    - nodes: 2
+      gpus_per_node: 2
+      label: 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs)
+      min_peak_busbw_gbps:
+        allreduce: 136.93
+        alltoall: 54.41
+    - nodes: 2
+      gpus_per_node: 4
+      label: 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs)
+      cuda_visible_devices: 0,1,4,5
+      op_env:
+        alltoall:
+          NCCL_IB_QPS_PER_CONNECTION: 4
+          NCCL_MIN_NCHANNELS: 4
+          NCCL_IB_SPLIT_DATA_ON_QPS: 1
+      min_peak_busbw_gbps:
+        allreduce: 335.48
+        alltoall: 73.73
+    - nodes: 2
+      gpus_per_node: 8
+      label: 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs)
+      min_peak_busbw_gbps:
+        allreduce: 491.84
+        alltoall: 76.54
+  begin_size: 16G
+  end_size: 16G
+  step_factor: 2
+  warmup_iters: 10
+  gpus_per_rank: 1
+  timeout_sec: 1800
+  debug: INFO
+  socket_ifname: bond0
+  oob_tcp_ifname: bond0
+  plm_rsh_args: "-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o ServerAliveInterval=30"
+  ib_gid_index: 3
+  ib_sl: 5
+  ib_tc: 136
+  ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7
+  ib_timeout: 22
+  qps_per_connection: null
+  min_nchannels: null
+  net_plugin: none
+  nvls_enable: 1
+  split_data_on_qps: null
+  extra_env:
+    NCCL_DEBUG_SUBSYS: INIT,NET
+    NCCL_NET_GDR_LEVEL: 5
+    NCCL_NET_GDR_READ: 1
+    NCCL_DMABUF_ENABLE: 0
+  min_peak_busbw_gbps:
+    allreduce: 0
+    alltoall: 0
diff --git a/modules/report.py b/modules/report.py
index acca41e..b10d1a0 100644
--- a/modules/report.py
+++ b/modules/report.py
@@ -481,13 +481,14 @@ class ReportGenerator:
             lines.append("")
             for op, data in (multinode.get("tests") or {}).items():
                 lines.append(f"### Multi-node NCCL {op}\n")
-                lines.append("| Topology | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |")
-                lines.append("|----------|-------------|-----------|------------|-----------|--------|")
+                lines.append("| Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |")
+                lines.append("|----------|----------------------|-------------|-----------|------------|-----------|--------|")
                 for topo in data.get("topologies", []):
                     threshold = topo.get("min_required_gbps", 0) or 0
                     threshold_text = f">= {threshold:.0f} GB/s" if threshold else "-"
+                    cuda_visible = topo.get("cuda_visible_devices") or "-"
                     lines.append(
-                        f"| {topo.get('label', '')} | {topo.get('peak_busbw_gbps', 0):.2f} GB/s | "
+                        f"| {topo.get('label', '')} | {cuda_visible} | {topo.get('peak_busbw_gbps', 0):.2f} GB/s | "
                         f"{topo.get('peak_size', '')} | {topo.get('avg_busbw_gbps', 0):.2f} GB/s | "
                         f"{threshold_text} | {topo.get('status', '?')} |"
                     )
diff --git a/reports_multinode_nccl_diagnosis_20260523.md b/reports_multinode_nccl_diagnosis_20260523.md
index 79325a3..fccf1b7 100644
--- a/reports_multinode_nccl_diagnosis_20260523.md
+++ b/reports_multinode_nccl_diagnosis_20260523.md
@@ -14,6 +14,8 @@
 
 继续 tuning 后发现，配置里固定的 `NCCL_MIN_NCHANNELS=4`、`NCCL_IB_QPS_PER_CONNECTION=4`、`NCCL_IB_SPLIT_DATA_ON_QPS=1` 会明显压低 16G allreduce。去掉这些固定参数、让 NCCL 2.27.7 自动选择后，正式脚本报告中 2 节点 x 8 GPU allreduce 提升到 `354.60 GB/s`，alltoall 小幅提升到 `30.01 GB/s`。当前剩余问题不再是 GDR disabled，而是 GDR enabled 且 NCCL 自动调参后，仍低于当前配置里的验收阈值。
 
+按 `sx算力节点跨Leaf NCCL测试报告.pdf` 的矩阵继续对齐后，发现 2 机 4 卡档位的核心问题是默认 GPU 选择不符合 GPU-NIC 亲和性。显式选择 `CUDA_VISIBLE_DEVICES=0,1,4,5` 后，2 机 4 卡 allreduce 可以恢复到 `333-335 GB/s` 区间，接近 PDF 的 `335.48 GB/s`；alltoall 配合 PDF 固定 NCCL 参数可到 `72.93 GB/s`，接近 PDF 的 `73.73 GB/s`。但 2 机 8 卡档位仍只有 allreduce `354.02 GB/s`、alltoall `30.04 GB/s`，与 PDF 的 `491.84/76.54 GB/s` 差距明显。
+
 同时，`nccl-gpu-2` 的 SSH 入口曾因未认证连接过多触发 `MaxStartups` 随机拒绝，导致 `mpirun` 拉起远端 rank 失败。已经做了临时 SSHD 缓解并拿到有效的 2 节点 x 8 GPU allreduce/alltoall 报告。
 
 ## 已完成的修正
@@ -29,6 +31,8 @@
 9. 从 NVIDIA apt 源下载但不安装 `libnccl2=2.27.7-1+cuda12.4`，解压到两台机器 `/tmp/nccl-2.27.7-cuda12.4`，用 `LD_LIBRARY_PATH` 临时覆盖 NCCL 运行库验证。
 10. 增强报告解析，能够区分 `GPU Direct RDMA ENABLED` 和 `DISABLED`，并列出 enabled/disabled HCA。
 11. 将 multi-node NCCL 配置中的 `qps_per_connection`、`min_nchannels`、`split_data_on_qps` 改为 `null`，避免默认导出会压低大包 allreduce 的固定 NCCL 参数。
+12. 增加 topology 级 `cuda_visible_devices`、`env`、`op_env` 配置能力，支持按 GPU/NIC 亲和性和不同 NCCL op 分别设置环境变量。
+13. 生成 PDF 矩阵式原始报告 `reports_multinode_nccl_pdf_matrix_nccl227.md`，覆盖 2 机 1/2/4/8 GPU per node。
 
 ## 关键证据
 
@@ -224,6 +228,50 @@ NET/IB : GPU Direct RDMA Disabled for HCA 0 'mlx5_0'
 
 带宽仍约 `13.4 GB/s`。测试后已经恢复默认 `peerdirect_support=0,persistent_api_support=1`。
 
+### 7. PDF 矩阵对齐与 GPU-NIC 亲和性
+
+参考 PDF 的跨 Leaf 命令覆盖 2 机 2/4/8/16 卡矩阵，并使用：
+
+- `NCCL_IB_GID_INDEX=3`
+- `NCCL_IB_SL=5`
+- `NCCL_IB_TC=136`
+- `NCCL_SOCKET_IFNAME=bond0`
+- `NCCL_IB_TIMEOUT=22`
+- `NCCL_NET_PLUGIN=none`
+- `NCCL_NVLS_ENABLE=1`
+
+本环境与 PDF 参考机器有一个关键硬件差异：当前两台机器只有 `mlx5_0,mlx5_1,mlx5_6,mlx5_7` 是 400Gb/s NDR；`mlx5_4,mlx5_5` 是 100Gb/s HDR；`mlx5_2,mlx5_8` 是 25Gb/s；`mlx5_3,mlx5_9` 为 DOWN。参考 PDF 的命令列出了更多 HCA，但在当前节点上它们无法等价地当作 8 条 400G rail 使用。
+
+`nvidia-smi topo -m` 显示：
+
+| GPU | 最近的 400G HCA |
+|-----|-----------------|
+| GPU0 | `mlx5_0` |
+| GPU1 | `mlx5_1` |
+| GPU4 | `mlx5_6` |
+| GPU5 | `mlx5_7` |
+
+默认 2 机 4 卡会选择 GPU0/1/2/3，其中 GPU2 最近的是 25G/down 端口，GPU3 没有直接对应 400G rail。因此 2 机 4 卡默认 allreduce 只有约 `168 GB/s`。显式设置 `CUDA_VISIBLE_DEVICES=0,1,4,5` 后：
+
+| 场景 | allreduce | alltoall | 说明 |
+|------|-----------|----------|------|
+| 默认 GPU0/1/2/3 | `167.89 GB/s` | `39.68 GB/s` | GPU/NIC 亲和性错误 |
+| `CUDA_VISIBLE_DEVICES=0,1,4,5` + auto NCCL | `335.34 GB/s` | `63.90 GB/s` | allreduce 接近 PDF |
+| `CUDA_VISIBLE_DEVICES=0,1,4,5` + PDF 固定参数 | `225.29 GB/s` | `73.10 GB/s` | alltoall 接近 PDF，但 allreduce 被压低 |
+
+因此当前脚本支持按 op 配环境变量：4 卡 allreduce 用 auto，4 卡 alltoall 用 PDF 固定参数。
+
+矩阵式正式报告：`reports_multinode_nccl_pdf_matrix_nccl227.md`
+
+| Topology | allreduce | PDF Reference | Status | alltoall | PDF Reference | Status |
+|----------|-----------|---------------|--------|----------|---------------|--------|
+| 2 nodes x 1 GPU | `47.23 GB/s` | `48.90 GB/s` | FAIL | `24.84 GB/s` | `27.25 GB/s` | FAIL |
+| 2 nodes x 2 GPUs | `136.97 GB/s` | `136.93 GB/s` | PASS | `47.67 GB/s` | `54.41 GB/s` | FAIL |
+| 2 nodes x 4 GPUs | `333.22 GB/s` | `335.48 GB/s` | FAIL | `72.93 GB/s` | `73.73 GB/s` | FAIL |
+| 2 nodes x 8 GPUs | `354.02 GB/s` | `491.84 GB/s` | FAIL | `30.04 GB/s` | `76.54 GB/s` | FAIL |
+
+解释：2 机 4 卡档位已经基本定位并修复到接近 PDF；2 机 8 卡档位不是简单 GPU 顺序问题。尝试调整 8 卡 `CUDA_VISIBLE_DEVICES` 顺序、加入 100G/25G active HCA、以及套 PDF 固定参数都没有改善；固定参数反而会把 8 卡 allreduce 从约 `354 GB/s` 压到约 `239 GB/s`。
+
 ## 当前阻塞
 
 ### 阻塞 1：当前生产 NCCL 版本过旧，GDR 被禁用
@@ -236,15 +284,18 @@ NET/IB : GPU Direct RDMA Disabled for HCA 0 'mlx5_0'
 
 判断：底层 RDMA 能力存在，GDR 禁用主要由旧 NCCL 版本触发。建议正式安装并固定 NCCL 2.27.7+cuda12.4 或更新的已验证版本。
 
-### 阻塞 2：GDR enabled 且 NCCL 自动调参后带宽仍低于当前阈值
+### 阻塞 2：2 机 8 GPU 档位仍低于 PDF 参考值
 
 现象：
 
-- 2x8 16G allreduce：`354.60 GB/s`，阈值 `>= 480 GB/s`
-- 2x8 16G alltoall：`30.01 GB/s`，阈值 `>= 75 GB/s`
+- 2x8 16G allreduce：`354.02 GB/s`，PDF 参考 `491.84 GB/s`
+- 2x8 16G alltoall：`30.04 GB/s`，PDF 参考 `76.54 GB/s`
 - 已使用 4 个 400Gb/s HCA：`mlx5_0, mlx5_1, mlx5_6, mlx5_7`
+- 加入 `mlx5_4,mlx5_5` 100G HCA 或 `mlx5_2,mlx5_8` 25G HCA 基本无收益
+- 调整 8 卡 `CUDA_VISIBLE_DEVICES` 顺序基本无收益
+- 套 PDF 固定参数会让 8 卡 allreduce 明显变差
 
-判断：需要确认当前 PDF/config 阈值是否适用于跨 Leaf 两节点场景；如果阈值确实要求跨 Leaf 也达到这些数值，则还需要继续查链路聚合、多 rail 使用、交换网络、NCCL net plugin/SHARP 或 rail mapping。
+判断：2 机 8 GPU 档位的剩余差距更像硬件 rail 数量/交换网络/路由/拥塞/NCCL net plugin 能力问题，不再是旧 NCCL GDR disabled 或 4 卡 GPU 选择问题。
 
 ### 阻塞 3：`nccl-gpu-2` SSH 存在外部连接压力
 
@@ -261,10 +312,12 @@ NET/IB : GPU Direct RDMA Disabled for HCA 0 'mlx5_0'
 
 1. 从网络/安全侧处理 `172.239.10.85` 等来源的 SSH 未认证连接压力，或者保留更高的 `MaxStartups` 配置作为测试窗口临时策略。
 2. 正式安装并固定已验证的 NCCL 2.27.7+cuda12.4 或更新版本，不要依赖 pip NCCL 2.21.5；当前 `/tmp/nccl-2.27.7-cuda12.4` 只是临时解压验证。
-3. multi-node NCCL 默认不要固定 `NCCL_MIN_NCHANNELS=4`、`NCCL_IB_QPS_PER_CONNECTION=4`、`NCCL_IB_SPLIT_DATA_ON_QPS=1`；当前脚本配置已改成 `null`，让 NCCL 自动选择。
-4. 尝试安装或启用匹配当前 OFED/driver 的 NCCL net plugin/SHARP；当前日志显示 `Could not find: libnccl-net.so`，NCCL 使用的是 internal IB plugin。
-5. 核对跨 Leaf 链路的 rail mapping、交换机端口速率、路由和拥塞计数，确认 4 个 400Gb/s HCA 是否都在跨节点通信中充分利用。
-6. 确认当前 `allreduce >= 480 GB/s`、`alltoall >= 75 GB/s` 阈值是否应直接用于跨 Leaf 两节点场景；如果是，继续按链路和 NCCL rail 聚合方向排查。
+3. 4 卡 per node 测试应显式使用 `CUDA_VISIBLE_DEVICES=0,1,4,5`，避免默认 GPU0/1/2/3 落到错误 GPU/NIC 亲和性。
+4. 4 卡 allreduce 建议继续让 NCCL 自动选择 channel/QP；4 卡 alltoall 如果要贴近 PDF，可单独套 `NCCL_IB_QPS_PER_CONNECTION=4`、`NCCL_MIN_NCHANNELS=4`、`NCCL_IB_SPLIT_DATA_ON_QPS=1`。
+5. 8 卡 per node 不建议套上述固定参数，会降低 allreduce；继续用 auto。
+6. 尝试安装或启用匹配当前 OFED/driver 的 NCCL net plugin/SHARP；当前日志显示 `Could not find: libnccl-net.so`，NCCL 使用的是 internal IB plugin。
+7. 核对跨 Leaf 链路的 rail mapping、交换机端口速率、路由和拥塞计数，确认 4 个 400Gb/s HCA 是否都在跨节点通信中充分利用。
+8. 确认当前 PDF 的 `491.84/76.54 GB/s` 是否要求当前这两台节点在只有 4 条 400G rail 的形态下也达到；如果要求一致，需要网络/硬件侧继续介入。
 
 ## 当前可交付物
 
@@ -273,9 +326,11 @@ NET/IB : GPU Direct RDMA Disabled for HCA 0 'mlx5_0'
 - `configs/multinode_nccl_nccl227_sweep.yaml`：NCCL 2.27.7 1M 到 4G sweep 配置
 - `configs/multinode_nccl_nccl227_16g.yaml`：NCCL 2.27.7 16G 大包配置
 - `configs/multinode_nccl_nccl227_auto_16g.yaml`：NCCL 2.27.7 16G 自动 channel/QP 配置
+- `configs/multinode_nccl_nccl227_pdf_matrix.yaml`：按 PDF 矩阵和 GPU 亲和性优化后的跨 Leaf 配置
 - `reports_multinode_nccl_diagnostic_2x8_sshfix.md`：脚本生成的原始 2x8 诊断报告
 - `reports_multinode_nccl_diagnostic_2x8_nccl227_v2.md`：NCCL 2.27.7 256M 诊断报告
 - `reports_multinode_nccl_sweep_2x8_nccl227.md`：NCCL 2.27.7 1M 到 4G sweep 报告
 - `reports_multinode_nccl_16g_2x8_nccl227.md`：NCCL 2.27.7 16G 大包报告
 - `reports_multinode_nccl_16g_2x8_nccl227_auto.md`：NCCL 2.27.7 16G 自动 channel/QP 原始报告
+- `reports_multinode_nccl_pdf_matrix_nccl227.md`：NCCL 2.27.7 PDF 矩阵式原始报告
 - `reports_multinode_nccl_diagnosis_20260523.md`：本中文诊断总结
diff --git a/reports_multinode_nccl_pdf_matrix_nccl227.md b/reports_multinode_nccl_pdf_matrix_nccl227.md
new file mode 100644
index 0000000..a18fb0d
--- /dev/null
+++ b/reports_multinode_nccl_pdf_matrix_nccl227.md
@@ -0,0 +1,83 @@
+# GPU Test Report
+
+- **Date:** 2026-05-23T08:32:58.113416
+- **Host:** aikubeworker0012
+
+## Overall Acceptance Verdict
+
+**Result: FAIL**
+
+Missing required evidence:
+- GPU Info
+- Health Check
+- Memory Bandwidth
+- Compute Throughput
+- NVLink/NVSwitch
+- NCCL
+- Stress Test
+- RDMA
+- DCGM
+- Training
+
+## Summary
+
+| Test | Result |
+|------|--------|
+| Multi-node NCCL | FAIL |
+
+## Multi-node NCCL / Cross Leaf
+
+Source: nccl-tests-mpirun | Mode: cross-leaf-pdf-matrix-nccl-2.27.7
+
+- **Hosts:** nccl-gpu-1(172.72.8.12), nccl-gpu-2(172.72.8.16)
+- **Preflight:** PASS
+
+### Multi-node NCCL allreduce
+
+| Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
+|----------|----------------------|-------------|-----------|------------|-----------|--------|
+| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | - | 47.23 GB/s | 16G | 47.24 GB/s | >= 49 GB/s | FAIL |
+| 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs) | - | 136.97 GB/s | 16G | 137.17 GB/s | >= 137 GB/s | PASS |
+| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | 0,1,4,5 | 333.22 GB/s | 16G | 333.24 GB/s | >= 335 GB/s | FAIL |
+| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | - | 354.02 GB/s | 16G | 353.92 GB/s | >= 492 GB/s | FAIL |
+
+| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
+|----------|--------------|-----------------|------------------|-------------------|
+| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
+| 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
+| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
+| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
+
+| Topology | Return Code | Error / Output Tail |
+|----------|-------------|---------------------|
+| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | 0 | E aikubeworker0012:2157248:2157325 [0] NCCL INFO comm 0x5595f28bf420 rank 0 nranks 2 cudaDev 0 busId 18000 - Destroy COMPLETE # Out of bounds values : 0 OK # Avg bus bandwidth    : 47.2399  # # Collective test concluded: all_reduce_perf #   |
+| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | 0 | ker0012:2157429:2157526 [3] NCCL INFO comm 0x55a8a0147090 rank 3 nranks 8 cudaDev 3 busId ab000 - Destroy COMPLETE aikubeworker0012:2157427:2157524 [1] NCCL INFO comm 0x55b1b0f86630 rank 1 nranks 8 cudaDev 1 busId 2a000 - Destroy COMPLETE   |
+| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | 0 |  aikubeworker0016:1138578:1139592 [0] NCCL INFO comm 0x556eff26c190 rank 8 nranks 16 cudaDev 0 busId 18000 - Destroy COMPLETE # Out of bounds values : 0 OK # Avg bus bandwidth    : 353.915  # # Collective test concluded: all_reduce_perf #   |
+
+### Multi-node NCCL alltoall
+
+| Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
+|----------|----------------------|-------------|-----------|------------|-----------|--------|
+| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | - | 24.84 GB/s | 16G | 24.89 GB/s | >= 27 GB/s | FAIL |
+| 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs) | - | 47.67 GB/s | 16G | 47.91 GB/s | >= 54 GB/s | FAIL |
+| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | 0,1,4,5 | 72.93 GB/s | 16G | 72.97 GB/s | >= 74 GB/s | FAIL |
+| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | - | 30.04 GB/s | 16G | 30.04 GB/s | >= 77 GB/s | FAIL |
+
+| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
+|----------|--------------|-----------------|------------------|-------------------|
+| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
+| 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
+| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
+| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
+
+| Topology | Return Code | Error / Output Tail |
+|----------|-------------|---------------------|
+| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | 0 | ETE aikubeworker0012:2157727:2157802 [0] NCCL INFO comm 0x55a0349b02b0 rank 0 nranks 2 cudaDev 0 busId 18000 - Destroy COMPLETE # Out of bounds values : 0 OK # Avg bus bandwidth    : 24.8897  # # Collective test concluded: alltoall_perf #   |
+| 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs) | 0 | ETE aikubeworker0016:1141290:1142410 [0] NCCL INFO comm 0x55fabbea6410 rank 2 nranks 4 cudaDev 0 busId 18000 - Destroy COMPLETE # Out of bounds values : 0 OK # Avg bus bandwidth    : 47.9094  # # Collective test concluded: alltoall_perf #   |
+| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | 0 | ETE aikubeworker0012:2158071:2158172 [0] NCCL INFO comm 0x563312baa7f0 rank 0 nranks 8 cudaDev 0 busId 18000 - Destroy COMPLETE # Out of bounds values : 0 OK # Avg bus bandwidth    : 72.9657  # # Collective test concluded: alltoall_perf #   |
+| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | 0 | 016:1143717:1145948 [7] NCCL INFO comm 0x5558cc9de640 rank 15 nranks 16 cudaDev 7 busId db000 - Destroy COMPLETE aikubeworker0016:1143713:1145946 [3] NCCL INFO comm 0x55c1af080e60 rank 11 nranks 16 cudaDev 3 busId 5d000 - Destroy COMPLETE   |
+
+**Overall: FAIL**
+
+---
+*Generated by GPU Test Suite v0.2.0*
\ No newline at end of file
-- 
2.47.2


From e756f0b7b4dc3d817731ee064541e6dfc69dded8 Mon Sep 17 00:00:00 2001
From: cs <shi.chen@robotics.cc>
Date: Sat, 23 May 2026 16:42:27 +0800
Subject: [PATCH 08/41] Document NCCL rail saturation evidence

---
 ...s_multinode_nccl_counter_probe_20260523.md | 80 +++++++++++++++++++
 reports_multinode_nccl_diagnosis_20260523.md  | 38 +++++++++
 2 files changed, 118 insertions(+)
 create mode 100644 reports_multinode_nccl_counter_probe_20260523.md

diff --git a/reports_multinode_nccl_counter_probe_20260523.md b/reports_multinode_nccl_counter_probe_20260523.md
new file mode 100644
index 0000000..9ccc52c
--- /dev/null
+++ b/reports_multinode_nccl_counter_probe_20260523.md
@@ -0,0 +1,80 @@
+# 多机 NCCL 8 卡链路计数器探测
+
+- 日期：2026-05-23
+- 主机：`aikubeworker0012` / `172.72.8.12`，`aikubeworker0016` / `172.72.8.16`
+- NCCL：临时 `2.27.7+cuda12.4`
+- HCA：`mlx5_0,mlx5_1,mlx5_6,mlx5_7`
+- HCA 速率：每节点 4 x 400Gb/s NDR，理论单向合计约 `200 GB/s`
+
+## 结论
+
+8 卡 allreduce 的 NCCL `algbw` 已经到 `189 GB/s` 左右，接近当前每节点 4 条 400G rail 的理论单向合计 `200 GB/s`。因此 PDF 参考的 `491.84 GB/s busbw` 对应 `262 GB/s algbw`，在当前 4 x 400G rail 形态下不太可能达到，除非实际可用跨节点 rail 数量或网络能力高于当前节点暴露的 4 条 400G。
+
+8 卡 alltoall 仍只有 `30 GB/s busbw`，不是 HCA 顺序导致。HCA 顺序 sweep 都稳定在 `30.02-30.07 GB/s`。计数器显示 alltoall 流量主要压在 `mlx5_0` 和 `mlx5_6` 上，`mlx5_1` 和 `mlx5_7` 只有约三分之一流量，说明剩余问题更像 NCCL alltoall rail 分布、路由、拥塞、NCCL net plugin/SHARP 或网络侧策略问题。
+
+## 8 卡 allreduce
+
+NCCL 输出：
+
+| Metric | Value |
+|--------|-------|
+| `algbw` | `189.16 / 189.07 GB/s` |
+| `busbw` | `354.68 / 354.52 GB/s` |
+| `Avg bus bandwidth` | `354.597 GB/s` |
+
+allreduce busbw 换算关系约为：
+
+```text
+busbw = algbw * 2 * (nranks - 1) / nranks
+      = algbw * 1.875  # nranks=16
+```
+
+因此：
+
+| 项 | busbw | 换算 algbw |
+|----|-------|------------|
+| 当前测试 | `354.60 GB/s` | `189.12 GB/s` |
+| PDF 参考 | `491.84 GB/s` | `262.31 GB/s` |
+
+当前 `189.12 GB/s algbw` 已接近 4 条 400Gb/s rail 的理论单向总带宽（`4 x 400 Gb/s = 1600 Gb/s = 200 GB/s`）。
+
+## 8 卡 alltoall
+
+NCCL 输出：
+
+| Metric | Value |
+|--------|-------|
+| `algbw` | `32.04 / 32.05 GB/s` |
+| `busbw` | `30.03 / 30.04 GB/s` |
+| `Avg bus bandwidth` | `30.0389 GB/s` |
+
+同一测试窗口内，端口计数器增量显示流量不均衡：
+
+| Host | HCA | Xmit GB | Recv GB |
+|------|-----|---------|---------|
+| 172.72.8.12 | `mlx5_0` | `885.54` | `885.51` |
+| 172.72.8.12 | `mlx5_1` | `295.19` | `295.19` |
+| 172.72.8.12 | `mlx5_6` | `885.53` | `885.51` |
+| 172.72.8.12 | `mlx5_7` | `295.19` | `295.19` |
+| 172.72.8.16 | `mlx5_0` | `885.51` | `885.54` |
+| 172.72.8.16 | `mlx5_1` | `295.19` | `295.19` |
+| 172.72.8.16 | `mlx5_6` | `885.51` | `885.53` |
+| 172.72.8.16 | `mlx5_7` | `295.19` | `295.19` |
+
+## HCA 顺序 sweep
+
+8 卡 alltoall 对 HCA 顺序不敏感：
+
+| `NCCL_IB_HCA` | Avg Bus BW |
+|---------------|------------|
+| `mlx5_0,mlx5_1,mlx5_6,mlx5_7` | `30.0367 GB/s` |
+| `mlx5_0,mlx5_6,mlx5_1,mlx5_7` | `30.0696 GB/s` |
+| `mlx5_0,mlx5_7,mlx5_1,mlx5_6` | `30.0397 GB/s` |
+| `mlx5_1,mlx5_0,mlx5_7,mlx5_6` | `30.0413 GB/s` |
+| `mlx5_6,mlx5_7,mlx5_0,mlx5_1` | `30.0230 GB/s` |
+
+## 判断
+
+1. 8 卡 allreduce 当前不是软件参数小调能解决的问题，性能已经贴近当前 4 条 400G rail 的物理带宽上限。
+2. 8 卡 alltoall 仍明显异常，且不是 HCA 顺序问题；需要继续从 NCCL alltoall rail 分布、网络路由/拥塞、NCCL net plugin/SHARP、交换机侧策略排查。
+3. 如果验收必须达到 PDF 的 2 机 16 卡 `491.84/76.54 GB/s`，需要确认当前两台机器是否具备与 PDF 参考环境同等的有效跨节点 rail 数量和交换网络能力。
diff --git a/reports_multinode_nccl_diagnosis_20260523.md b/reports_multinode_nccl_diagnosis_20260523.md
index fccf1b7..42d7b52 100644
--- a/reports_multinode_nccl_diagnosis_20260523.md
+++ b/reports_multinode_nccl_diagnosis_20260523.md
@@ -272,6 +272,36 @@ NET/IB : GPU Direct RDMA Disabled for HCA 0 'mlx5_0'
 
 解释：2 机 4 卡档位已经基本定位并修复到接近 PDF；2 机 8 卡档位不是简单 GPU 顺序问题。尝试调整 8 卡 `CUDA_VISIBLE_DEVICES` 顺序、加入 100G/25G active HCA、以及套 PDF 固定参数都没有改善；固定参数反而会把 8 卡 allreduce 从约 `354 GB/s` 压到约 `239 GB/s`。
 
+### 8. 8 卡链路计数器与物理上限判断
+
+计数器探测报告：`reports_multinode_nccl_counter_probe_20260523.md`
+
+当前 2 机 8 GPU allreduce 输出：
+
+| Metric | Value |
+|--------|-------|
+| `algbw` | `189.16 / 189.07 GB/s` |
+| `busbw` | `354.68 / 354.52 GB/s` |
+| `Avg bus bandwidth` | `354.597 GB/s` |
+
+allreduce 在 16 ranks 下的换算关系约为：
+
+```text
+busbw = algbw * 2 * (nranks - 1) / nranks = algbw * 1.875
+```
+
+因此 PDF 参考 `491.84 GB/s busbw` 对应约 `262.31 GB/s algbw`。但当前节点可用的 400G HCA 是 `mlx5_0,mlx5_1,mlx5_6,mlx5_7`，每节点 4 条 400Gb/s，理论单向合计约 `200 GB/s`。当前 allreduce `189 GB/s algbw` 已经接近这个物理上限，所以 8 卡 allreduce 剩余差距基本不能靠 NCCL 参数小调解决。
+
+8 卡 alltoall 当前仍只有：
+
+| Metric | Value |
+|--------|-------|
+| `algbw` | `32.04 / 32.05 GB/s` |
+| `busbw` | `30.03 / 30.04 GB/s` |
+| `Avg bus bandwidth` | `30.0389 GB/s` |
+
+同一测试窗口内端口计数器显示 alltoall 流量分布不均衡：`mlx5_0` 和 `mlx5_6` 的流量约 `885 GB`，`mlx5_1` 和 `mlx5_7` 约 `295 GB`，约为三倍差距。继续调换 `NCCL_IB_HCA` 顺序后，8 卡 alltoall 仍稳定在 `30.02-30.07 GB/s`，说明不是简单 HCA 列表顺序问题。
+
 ## 当前阻塞
 
 ### 阻塞 1：当前生产 NCCL 版本过旧，GDR 被禁用
@@ -297,6 +327,12 @@ NET/IB : GPU Direct RDMA Disabled for HCA 0 'mlx5_0'
 
 判断：2 机 8 GPU 档位的剩余差距更像硬件 rail 数量/交换网络/路由/拥塞/NCCL net plugin 能力问题，不再是旧 NCCL GDR disabled 或 4 卡 GPU 选择问题。
 
+补充证据：
+
+- 8 卡 allreduce `algbw ~= 189 GB/s`，接近当前 4 x 400G HCA 的理论单向合计 `200 GB/s`
+- PDF 8 卡 allreduce `491.84 GB/s busbw` 反推需要约 `262 GB/s algbw`，超过当前 4 x 400G 的物理单向总带宽
+- 8 卡 alltoall 端口计数器显示 rail 分布不均，且 HCA 顺序 sweep 无改善
+
 ### 阻塞 3：`nccl-gpu-2` SSH 存在外部连接压力
 
 现象：
@@ -318,6 +354,7 @@ NET/IB : GPU Direct RDMA Disabled for HCA 0 'mlx5_0'
 6. 尝试安装或启用匹配当前 OFED/driver 的 NCCL net plugin/SHARP；当前日志显示 `Could not find: libnccl-net.so`，NCCL 使用的是 internal IB plugin。
 7. 核对跨 Leaf 链路的 rail mapping、交换机端口速率、路由和拥塞计数，确认 4 个 400Gb/s HCA 是否都在跨节点通信中充分利用。
 8. 确认当前 PDF 的 `491.84/76.54 GB/s` 是否要求当前这两台节点在只有 4 条 400G rail 的形态下也达到；如果要求一致，需要网络/硬件侧继续介入。
+9. 对 8 卡 alltoall，重点查 NCCL rail 分布、交换机 ECMP/自适应路由、拥塞计数、SHARP/NCCL net plugin，而不是继续调 `NCCL_IB_HCA` 顺序。
 
 ## 当前可交付物
 
@@ -333,4 +370,5 @@ NET/IB : GPU Direct RDMA Disabled for HCA 0 'mlx5_0'
 - `reports_multinode_nccl_16g_2x8_nccl227.md`：NCCL 2.27.7 16G 大包报告
 - `reports_multinode_nccl_16g_2x8_nccl227_auto.md`：NCCL 2.27.7 16G 自动 channel/QP 原始报告
 - `reports_multinode_nccl_pdf_matrix_nccl227.md`：NCCL 2.27.7 PDF 矩阵式原始报告
+- `reports_multinode_nccl_counter_probe_20260523.md`：8 卡链路计数器与 HCA 顺序 sweep 报告
 - `reports_multinode_nccl_diagnosis_20260523.md`：本中文诊断总结
-- 
2.47.2


From ce363b2f7abb31b82bf77431690a8bf0af3afd98 Mon Sep 17 00:00:00 2001
From: cs <shi.chen@robotics.cc>
Date: Sat, 23 May 2026 16:43:25 +0800
Subject: [PATCH 09/41] Document missing NCCL network plugin

---
 ...s_multinode_nccl_counter_probe_20260523.md |  1 +
 reports_multinode_nccl_diagnosis_20260523.md  | 24 +++++++++++++++++++
 2 files changed, 25 insertions(+)

diff --git a/reports_multinode_nccl_counter_probe_20260523.md b/reports_multinode_nccl_counter_probe_20260523.md
index 9ccc52c..debc0bc 100644
--- a/reports_multinode_nccl_counter_probe_20260523.md
+++ b/reports_multinode_nccl_counter_probe_20260523.md
@@ -78,3 +78,4 @@ NCCL 输出：
 1. 8 卡 allreduce 当前不是软件参数小调能解决的问题，性能已经贴近当前 4 条 400G rail 的物理带宽上限。
 2. 8 卡 alltoall 仍明显异常，且不是 HCA 顺序问题；需要继续从 NCCL alltoall rail 分布、网络路由/拥塞、NCCL net plugin/SHARP、交换机侧策略排查。
 3. 如果验收必须达到 PDF 的 2 机 16 卡 `491.84/76.54 GB/s`，需要确认当前两台机器是否具备与 PDF 参考环境同等的有效跨节点 rail 数量和交换网络能力。
+4. 两台机器当前均未发现 `libnccl-net.so` 或 SHARP/HCOLL 包，NCCL 使用 internal IB plugin；如果目标值依赖 NCCL net plugin/SHARP，需要先补齐对应运行环境。
diff --git a/reports_multinode_nccl_diagnosis_20260523.md b/reports_multinode_nccl_diagnosis_20260523.md
index 42d7b52..fce5084 100644
--- a/reports_multinode_nccl_diagnosis_20260523.md
+++ b/reports_multinode_nccl_diagnosis_20260523.md
@@ -302,6 +302,29 @@ busbw = algbw * 2 * (nranks - 1) / nranks = algbw * 1.875
 
 同一测试窗口内端口计数器显示 alltoall 流量分布不均衡：`mlx5_0` 和 `mlx5_6` 的流量约 `885 GB`，`mlx5_1` 和 `mlx5_7` 约 `295 GB`，约为三倍差距。继续调换 `NCCL_IB_HCA` 顺序后，8 卡 alltoall 仍稳定在 `30.02-30.07 GB/s`，说明不是简单 HCA 列表顺序问题。
 
+### 9. NCCL net plugin / SHARP 状态
+
+两台机器上均未找到：
+
+- `libnccl-net.so`
+- `libsharp*`
+- SHARP/HCOLL 相关 deb 包
+
+当前仅看到 UCX 包：
+
+```text
+ucx 1.20.0-1.20260211.d9a4f352d.2601100
+```
+
+apt 源里与 NCCL 直接相关的包只有：
+
+```text
+libnccl2
+libnccl-dev
+```
+
+因此当前 NCCL 日志里的 `Could not find: libnccl-net.so` 是真实环境缺失，不是脚本漏配路径。当前运行走的是 NCCL internal IB plugin；如果要继续追 8 卡 alltoall 或 PDF 2 机 16 卡参考值，需要补齐匹配当前 OFED/driver/CUDA/NCCL 的 NCCL net plugin/SHARP 环境，或由网络侧确认该集群不依赖这些组件也能达到目标值。
+
 ## 当前阻塞
 
 ### 阻塞 1：当前生产 NCCL 版本过旧，GDR 被禁用
@@ -332,6 +355,7 @@ busbw = algbw * 2 * (nranks - 1) / nranks = algbw * 1.875
 - 8 卡 allreduce `algbw ~= 189 GB/s`，接近当前 4 x 400G HCA 的理论单向合计 `200 GB/s`
 - PDF 8 卡 allreduce `491.84 GB/s busbw` 反推需要约 `262 GB/s algbw`，超过当前 4 x 400G 的物理单向总带宽
 - 8 卡 alltoall 端口计数器显示 rail 分布不均，且 HCA 顺序 sweep 无改善
+- 当前环境缺失 NCCL net plugin/SHARP，NCCL 只能使用 internal IB plugin
 
 ### 阻塞 3：`nccl-gpu-2` SSH 存在外部连接压力
 
-- 
2.47.2


From a64e964e3cf470f62d8e5a5827e40f6a90687489 Mon Sep 17 00:00:00 2001
From: cs <shi.chen@robotics.cc>
Date: Sat, 23 May 2026 16:46:15 +0800
Subject: [PATCH 10/41] Add raw RDMA rail bandwidth evidence

---
 ...s_multinode_nccl_counter_probe_20260523.md | 29 ++++++++++++++++---
 reports_multinode_nccl_diagnosis_20260523.md  | 13 +++++++++
 2 files changed, 38 insertions(+), 4 deletions(-)

diff --git a/reports_multinode_nccl_counter_probe_20260523.md b/reports_multinode_nccl_counter_probe_20260523.md
index debc0bc..784b5c4 100644
--- a/reports_multinode_nccl_counter_probe_20260523.md
+++ b/reports_multinode_nccl_counter_probe_20260523.md
@@ -10,8 +10,28 @@
 
 8 卡 allreduce 的 NCCL `algbw` 已经到 `189 GB/s` 左右，接近当前每节点 4 条 400G rail 的理论单向合计 `200 GB/s`。因此 PDF 参考的 `491.84 GB/s busbw` 对应 `262 GB/s algbw`，在当前 4 x 400G rail 形态下不太可能达到，除非实际可用跨节点 rail 数量或网络能力高于当前节点暴露的 4 条 400G。
 
+裸 RDMA 并发 perftest 也验证了这 4 条 400G rail 本身可以同时工作：4 个 HCA 并发 `ib_write_bw` 合计 `1476.95 Gb/s`，即 `184.62 GB/s`。这与 NCCL 8 卡 allreduce 换算出的 `189 GB/s algbw` 处于同一水平（相差不到 3%），说明 allreduce 已经接近裸网络可用带宽。
+
 8 卡 alltoall 仍只有 `30 GB/s busbw`，不是 HCA 顺序导致。HCA 顺序 sweep 都稳定在 `30.02-30.07 GB/s`。计数器显示 alltoall 流量主要压在 `mlx5_0` 和 `mlx5_6` 上，`mlx5_1` 和 `mlx5_7` 只有约三分之一流量，说明剩余问题更像 NCCL alltoall rail 分布、路由、拥塞、NCCL net plugin/SHARP 或网络侧策略问题。
 
+## 裸 RDMA 4 rail 并发
+
+命令类型：
+
+```bash
+ib_write_bw -d <mlx5_X> -i 1 -p <port> -s 4194304 -n 5000 -F --report_gbits
+```
+
+结果：
+
+| HCA | BW average |
+|-----|------------|
+| `mlx5_0` | `387.16 Gb/s` |
+| `mlx5_1` | `387.07 Gb/s` |
+| `mlx5_6` | `355.02 Gb/s` |
+| `mlx5_7` | `347.70 Gb/s` |
+| Total | `1476.95 Gb/s` / `184.62 GB/s` |
+
 ## 8 卡 allreduce
 
 NCCL 输出：
@@ -75,7 +95,8 @@ NCCL 输出：
 
 ## 判断
 
-1. 8 卡 allreduce 当前不是软件参数小调能解决的问题，性能已经贴近当前 4 条 400G rail 的物理带宽上限。
-2. 8 卡 alltoall 仍明显异常，且不是 HCA 顺序问题；需要继续从 NCCL alltoall rail 分布、网络路由/拥塞、NCCL net plugin/SHARP、交换机侧策略排查。
-3. 如果验收必须达到 PDF 的 2 机 16 卡 `491.84/76.54 GB/s`，需要确认当前两台机器是否具备与 PDF 参考环境同等的有效跨节点 rail 数量和交换网络能力。
-4. 两台机器当前均未发现 `libnccl-net.so` 或 SHARP/HCOLL 包，NCCL 使用 internal IB plugin；如果目标值依赖 NCCL net plugin/SHARP，需要先补齐对应运行环境。
+1. 裸 RDMA 4 rail 可以并发跑到约 `184.62 GB/s`，网络基础带宽不是单 rail 瓶颈。
+2. 8 卡 allreduce 当前不是软件参数小调能解决的问题，性能已经贴近当前 4 条 400G rail 的物理带宽上限。
+3. 8 卡 alltoall 仍明显异常，且不是 HCA 顺序问题；需要继续从 NCCL alltoall rail 分布、网络路由/拥塞、NCCL net plugin/SHARP、交换机侧策略排查。
+4. 如果验收必须达到 PDF 的 2 机 16 卡 `491.84/76.54 GB/s`，需要确认当前两台机器是否具备与 PDF 参考环境同等的有效跨节点 rail 数量和交换网络能力。
+5. 两台机器当前均未发现 `libnccl-net.so` 或 SHARP/HCOLL 包，NCCL 使用 internal IB plugin；如果目标值依赖 NCCL net plugin/SHARP，需要先补齐对应运行环境。
diff --git a/reports_multinode_nccl_diagnosis_20260523.md b/reports_multinode_nccl_diagnosis_20260523.md
index fce5084..8253caf 100644
--- a/reports_multinode_nccl_diagnosis_20260523.md
+++ b/reports_multinode_nccl_diagnosis_20260523.md
@@ -292,6 +292,18 @@ busbw = algbw * 2 * (nranks - 1) / nranks = algbw * 1.875
 
 因此 PDF 参考 `491.84 GB/s busbw` 对应约 `262.31 GB/s algbw`。但当前节点可用的 400G HCA 是 `mlx5_0,mlx5_1,mlx5_6,mlx5_7`，每节点 4 条 400Gb/s，理论单向合计约 `200 GB/s`。当前 allreduce `189 GB/s algbw` 已经接近这个物理上限，所以 8 卡 allreduce 剩余差距基本不能靠 NCCL 参数小调解决。
 
+裸 RDMA 4 rail 并发 `ib_write_bw` 也验证了底层 4 条 400G rail 可以同时工作：
+
+| HCA | BW average |
+|-----|------------|
+| `mlx5_0` | `387.16 Gb/s` |
+| `mlx5_1` | `387.07 Gb/s` |
+| `mlx5_6` | `355.02 Gb/s` |
+| `mlx5_7` | `347.70 Gb/s` |
+| Total | `1476.95 Gb/s` / `184.62 GB/s` |
+
+这个裸 RDMA 总带宽与 NCCL 8 卡 allreduce 的 `189 GB/s algbw` 接近，进一步说明 allreduce 已经贴近当前网络形态可提供的实际带宽。
+
 8 卡 alltoall 当前仍只有：
 
 | Metric | Value |
@@ -353,6 +365,7 @@ libnccl-dev
 补充证据：
 
 - 8 卡 allreduce `algbw ~= 189 GB/s`，接近当前 4 x 400G HCA 的理论单向合计 `200 GB/s`
+- 裸 RDMA 4 rail 并发 `ib_write_bw` 合计 `1476.95 Gb/s` / `184.62 GB/s`
 - PDF 8 卡 allreduce `491.84 GB/s busbw` 反推需要约 `262 GB/s algbw`，超过当前 4 x 400G 的物理单向总带宽
 - 8 卡 alltoall 端口计数器显示 rail 分布不均，且 HCA 顺序 sweep 无改善
 - 当前环境缺失 NCCL net plugin/SHARP，NCCL 只能使用 internal IB plugin
-- 
2.47.2


From 619a471634f3435fff56f836040a4650234969fb Mon Sep 17 00:00:00 2001
From: cs <shi.chen@robotics.cc>
Date: Sat, 23 May 2026 17:00:03 +0800
Subject: [PATCH 11/41] Tune multinode alltoall PXN behavior

---
 .../multinode_nccl_nccl227_pdf_matrix.yaml    |  3 ++
 ...multinode_nccl_alltoall_tuning_20260523.md | 51 +++++++++++++++++++
 reports_multinode_nccl_diagnosis_20260523.md  | 23 +++++++--
 reports_multinode_nccl_pdf_matrix_nccl227.md  | 33 ++++++------
 4 files changed, 90 insertions(+), 20 deletions(-)
 create mode 100644 reports_multinode_nccl_alltoall_tuning_20260523.md

diff --git a/configs/multinode_nccl_nccl227_pdf_matrix.yaml b/configs/multinode_nccl_nccl227_pdf_matrix.yaml
index 34ce13e..00a3220 100644
--- a/configs/multinode_nccl_nccl227_pdf_matrix.yaml
+++ b/configs/multinode_nccl_nccl227_pdf_matrix.yaml
@@ -55,6 +55,9 @@ multinode_nccl:
     - nodes: 2
       gpus_per_node: 8
       label: 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs)
+      op_env:
+        alltoall:
+          NCCL_PXN_DISABLE: 1
       min_peak_busbw_gbps:
         allreduce: 491.84
         alltoall: 76.54
diff --git a/reports_multinode_nccl_alltoall_tuning_20260523.md b/reports_multinode_nccl_alltoall_tuning_20260523.md
new file mode 100644
index 0000000..d26630a
--- /dev/null
+++ b/reports_multinode_nccl_alltoall_tuning_20260523.md
@@ -0,0 +1,51 @@
+# 多机 NCCL 8 卡 alltoall 网络参数 sweep
+
+- 日期：2026-05-23
+- 主机：`aikubeworker0012` / `172.72.8.12`，`aikubeworker0016` / `172.72.8.16`
+- NCCL：临时 `2.27.7+cuda12.4`
+- 测试：2 nodes x 8 GPUs，`alltoall_perf -b 16G -e 16G`
+- HCA：`mlx5_0,mlx5_1,mlx5_6,mlx5_7`
+
+## 结论
+
+`NCCL_PXN_DISABLE=1` 是本轮唯一有效正向参数，可以把 8 卡 alltoall 从约 `30.06 GB/s` 提升到约 `37.24 GB/s`。纳入正式 PDF 矩阵配置后，8 卡 alltoall 原始报告结果为 `36.70 GB/s peak` / `36.74 GB/s avg`。
+
+这个提升有实际价值，但仍远低于 PDF 参考 `76.54 GB/s`。其他参数没有改善，部分明显变差：
+
+| Case | Avg Bus BW | 结论 |
+|------|------------|------|
+| baseline | `30.0633 GB/s` | 基线 |
+| `NCCL_PXN_DISABLE=1` | `37.2421 GB/s` | 有效提升 |
+| `NCCL_P2P_PXN_LEVEL=0` | `20.1205 GB/s` | 明显变差 |
+| `NCCL_P2P_PXN_LEVEL=1` | `30.0588 GB/s` | 无改善 |
+| `NCCL_P2P_PXN_LEVEL=2` | `30.0437 GB/s` | 无改善 |
+| `NCCL_NET_SHARED_COMMS=0` | `27.3889 GB/s` | 变差 |
+| `NCCL_NET_SHARED_BUFFERS=0` | `28.2389 GB/s` | 变差 |
+| `NCCL_NET_SHARED_COMMS=0 NCCL_NET_SHARED_BUFFERS=0` | `28.2279 GB/s` | 变差 |
+| `NCCL_NCHANNELS_PER_NET_PEER=2` | `30.0281 GB/s` | 无改善 |
+| `NCCL_NCHANNELS_PER_NET_PEER=4` | `29.9802 GB/s` | 无改善 |
+| `NCCL_IB_ADAPTIVE_ROUTING=1 NCCL_IB_AR_THRESHOLD=0` | `30.0526 GB/s` | 无改善 |
+| `NCCL_IB_ADAPTIVE_ROUTING=0` | `30.0535 GB/s` | 无改善 |
+| `NCCL_IB_PCI_RELAXED_ORDERING=0` | 未完成 | 明显异常，不建议 |
+
+## 正式配置更新
+
+`configs/multinode_nccl_nccl227_pdf_matrix.yaml` 已对 2 nodes x 8 GPUs 的 alltoall 增加：
+
+```yaml
+op_env:
+  alltoall:
+    NCCL_PXN_DISABLE: 1
+```
+
+正式矩阵报告：`reports_multinode_nccl_pdf_matrix_nccl227.md`
+
+| Topology | alltoall Peak Bus BW | alltoall Avg Bus BW | PDF Reference | Status |
+|----------|----------------------|---------------------|---------------|--------|
+| 2 nodes x 8 GPUs | `36.70 GB/s` | `36.74 GB/s` | `76.54 GB/s` | FAIL |
+
+## 判断
+
+1. PXN 在当前拓扑下对 8 卡 alltoall 有负面影响，禁用后有约 `22-24%` 提升。
+2. 禁用 PXN 后仍只有 PDF 目标的一半左右，剩余差距不是单一 NCCL 环境变量可以补齐。
+3. 后续重点仍应放在 NCCL net plugin/SHARP、交换网络策略、路由/拥塞和 alltoall rail 分布。
diff --git a/reports_multinode_nccl_diagnosis_20260523.md b/reports_multinode_nccl_diagnosis_20260523.md
index 8253caf..732a6ac 100644
--- a/reports_multinode_nccl_diagnosis_20260523.md
+++ b/reports_multinode_nccl_diagnosis_20260523.md
@@ -16,6 +16,8 @@
 
 按 `sx算力节点跨Leaf NCCL测试报告.pdf` 的矩阵继续对齐后，发现 2 机 4 卡档位的核心问题是默认 GPU 选择不符合 GPU-NIC 亲和性。显式选择 `CUDA_VISIBLE_DEVICES=0,1,4,5` 后，2 机 4 卡 allreduce 可以恢复到 `333-335 GB/s` 区间，接近 PDF 的 `335.48 GB/s`；alltoall 配合 PDF 固定 NCCL 参数可到 `72.93 GB/s`，接近 PDF 的 `73.73 GB/s`。但 2 机 8 卡档位仍只有 allreduce `354.02 GB/s`、alltoall `30.04 GB/s`，与 PDF 的 `491.84/76.54 GB/s` 差距明显。
 
+进一步 sweep 8 卡 alltoall 网络参数后，`NCCL_PXN_DISABLE=1` 是唯一有效正向项。正式矩阵配置已对 2 机 8 GPU 的 alltoall 单独加入该变量，8 卡 alltoall 从约 `30.04 GB/s` 提升到 `36.70 GB/s` peak / `36.74 GB/s` avg，但仍低于 PDF 参考 `76.54 GB/s`。
+
 同时，`nccl-gpu-2` 的 SSH 入口曾因未认证连接过多触发 `MaxStartups` 随机拒绝，导致 `mpirun` 拉起远端 rank 失败。已经做了临时 SSHD 缓解并拿到有效的 2 节点 x 8 GPU allreduce/alltoall 报告。
 
 ## 已完成的修正
@@ -33,6 +35,7 @@
 11. 将 multi-node NCCL 配置中的 `qps_per_connection`、`min_nchannels`、`split_data_on_qps` 改为 `null`，避免默认导出会压低大包 allreduce 的固定 NCCL 参数。
 12. 增加 topology 级 `cuda_visible_devices`、`env`、`op_env` 配置能力，支持按 GPU/NIC 亲和性和不同 NCCL op 分别设置环境变量。
 13. 生成 PDF 矩阵式原始报告 `reports_multinode_nccl_pdf_matrix_nccl227.md`，覆盖 2 机 1/2/4/8 GPU per node。
+14. 对 8 卡 alltoall 做 NCCL 网络参数 sweep，并将有效项 `NCCL_PXN_DISABLE=1` 固化到 PDF 矩阵配置。
 
 ## 关键证据
 
@@ -265,13 +268,23 @@ NET/IB : GPU Direct RDMA Disabled for HCA 0 'mlx5_0'
 
 | Topology | allreduce | PDF Reference | Status | alltoall | PDF Reference | Status |
 |----------|-----------|---------------|--------|----------|---------------|--------|
-| 2 nodes x 1 GPU | `47.23 GB/s` | `48.90 GB/s` | FAIL | `24.84 GB/s` | `27.25 GB/s` | FAIL |
-| 2 nodes x 2 GPUs | `136.97 GB/s` | `136.93 GB/s` | PASS | `47.67 GB/s` | `54.41 GB/s` | FAIL |
-| 2 nodes x 4 GPUs | `333.22 GB/s` | `335.48 GB/s` | FAIL | `72.93 GB/s` | `73.73 GB/s` | FAIL |
-| 2 nodes x 8 GPUs | `354.02 GB/s` | `491.84 GB/s` | FAIL | `30.04 GB/s` | `76.54 GB/s` | FAIL |
+| 2 nodes x 1 GPU | `47.26 GB/s` | `48.90 GB/s` | FAIL | `24.87 GB/s` | `27.25 GB/s` | FAIL |
+| 2 nodes x 2 GPUs | `136.36 GB/s` | `136.93 GB/s` | FAIL | `47.69 GB/s` | `54.41 GB/s` | FAIL |
+| 2 nodes x 4 GPUs | `333.23 GB/s` | `335.48 GB/s` | FAIL | `72.82 GB/s` | `73.73 GB/s` | FAIL |
+| 2 nodes x 8 GPUs | `353.47 GB/s` | `491.84 GB/s` | FAIL | `36.70 GB/s` | `76.54 GB/s` | FAIL |
 
 解释：2 机 4 卡档位已经基本定位并修复到接近 PDF；2 机 8 卡档位不是简单 GPU 顺序问题。尝试调整 8 卡 `CUDA_VISIBLE_DEVICES` 顺序、加入 100G/25G active HCA、以及套 PDF 固定参数都没有改善；固定参数反而会把 8 卡 allreduce 从约 `354 GB/s` 压到约 `239 GB/s`。
 
+8 卡 alltoall 目前的最佳软件侧改动是 `NCCL_PXN_DISABLE=1`：
+
+| Case | 8 卡 alltoall Avg Bus BW |
+|------|--------------------------|
+| baseline | `30.06 GB/s` |
+| `NCCL_PXN_DISABLE=1` | `37.24 GB/s` |
+| 正式矩阵报告 | `36.74 GB/s` |
+
+其他变量如 `NCCL_P2P_PXN_LEVEL`、`NCCL_NET_SHARED_COMMS`、`NCCL_NET_SHARED_BUFFERS`、`NCCL_NCHANNELS_PER_NET_PEER`、`NCCL_IB_ADAPTIVE_ROUTING` 均无改善或变差。
+
 ### 8. 8 卡链路计数器与物理上限判断
 
 计数器探测报告：`reports_multinode_nccl_counter_probe_20260523.md`
@@ -369,6 +382,7 @@ libnccl-dev
 - PDF 8 卡 allreduce `491.84 GB/s busbw` 反推需要约 `262 GB/s algbw`，超过当前 4 x 400G 的物理单向总带宽
 - 8 卡 alltoall 端口计数器显示 rail 分布不均，且 HCA 顺序 sweep 无改善
 - 当前环境缺失 NCCL net plugin/SHARP，NCCL 只能使用 internal IB plugin
+- `NCCL_PXN_DISABLE=1` 可将 8 卡 alltoall 提升到约 `36.7 GB/s`，但仍不到 PDF 参考值的一半
 
 ### 阻塞 3：`nccl-gpu-2` SSH 存在外部连接压力
 
@@ -408,4 +422,5 @@ libnccl-dev
 - `reports_multinode_nccl_16g_2x8_nccl227_auto.md`：NCCL 2.27.7 16G 自动 channel/QP 原始报告
 - `reports_multinode_nccl_pdf_matrix_nccl227.md`：NCCL 2.27.7 PDF 矩阵式原始报告
 - `reports_multinode_nccl_counter_probe_20260523.md`：8 卡链路计数器与 HCA 顺序 sweep 报告
+- `reports_multinode_nccl_alltoall_tuning_20260523.md`：8 卡 alltoall NCCL 网络参数 sweep 报告
 - `reports_multinode_nccl_diagnosis_20260523.md`：本中文诊断总结
diff --git a/reports_multinode_nccl_pdf_matrix_nccl227.md b/reports_multinode_nccl_pdf_matrix_nccl227.md
index a18fb0d..c04d023 100644
--- a/reports_multinode_nccl_pdf_matrix_nccl227.md
+++ b/reports_multinode_nccl_pdf_matrix_nccl227.md
@@ -1,6 +1,6 @@
 # GPU Test Report
 
-- **Date:** 2026-05-23T08:32:58.113416
+- **Date:** 2026-05-23T08:58:19.911230
 - **Host:** aikubeworker0012
 
 ## Overall Acceptance Verdict
@@ -36,10 +36,10 @@ Source: nccl-tests-mpirun | Mode: cross-leaf-pdf-matrix-nccl-2.27.7
 
 | Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
 |----------|----------------------|-------------|-----------|------------|-----------|--------|
-| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | - | 47.23 GB/s | 16G | 47.24 GB/s | >= 49 GB/s | FAIL |
-| 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs) | - | 136.97 GB/s | 16G | 137.17 GB/s | >= 137 GB/s | PASS |
-| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | 0,1,4,5 | 333.22 GB/s | 16G | 333.24 GB/s | >= 335 GB/s | FAIL |
-| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | - | 354.02 GB/s | 16G | 353.92 GB/s | >= 492 GB/s | FAIL |
+| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | - | 47.26 GB/s | 16G | 47.19 GB/s | >= 49 GB/s | FAIL |
+| 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs) | - | 136.36 GB/s | 16G | 136.69 GB/s | >= 137 GB/s | FAIL |
+| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | 0,1,4,5 | 333.23 GB/s | 16G | 333.45 GB/s | >= 335 GB/s | FAIL |
+| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | - | 353.47 GB/s | 16G | 353.86 GB/s | >= 492 GB/s | FAIL |
 
 | Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
 |----------|--------------|-----------------|------------------|-------------------|
@@ -50,18 +50,19 @@ Source: nccl-tests-mpirun | Mode: cross-leaf-pdf-matrix-nccl-2.27.7
 
 | Topology | Return Code | Error / Output Tail |
 |----------|-------------|---------------------|
-| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | 0 | E aikubeworker0012:2157248:2157325 [0] NCCL INFO comm 0x5595f28bf420 rank 0 nranks 2 cudaDev 0 busId 18000 - Destroy COMPLETE # Out of bounds values : 0 OK # Avg bus bandwidth    : 47.2399  # # Collective test concluded: all_reduce_perf #   |
-| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | 0 | ker0012:2157429:2157526 [3] NCCL INFO comm 0x55a8a0147090 rank 3 nranks 8 cudaDev 3 busId ab000 - Destroy COMPLETE aikubeworker0012:2157427:2157524 [1] NCCL INFO comm 0x55b1b0f86630 rank 1 nranks 8 cudaDev 1 busId 2a000 - Destroy COMPLETE   |
-| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | 0 |  aikubeworker0016:1138578:1139592 [0] NCCL INFO comm 0x556eff26c190 rank 8 nranks 16 cudaDev 0 busId 18000 - Destroy COMPLETE # Out of bounds values : 0 OK # Avg bus bandwidth    : 353.915  # # Collective test concluded: all_reduce_perf #   |
+| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | 0 | TE aikubeworker0012:2165982:2166060 [0] NCCL INFO comm 0x55d452f2df80 rank 0 nranks 2 cudaDev 0 busId 18000 - Destroy COMPLETE # Out of bounds values : 0 OK # Avg bus bandwidth    : 47.189  # # Collective test concluded: all_reduce_perf #   |
+| 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs) | 0 | ker0016:1221425:1222411 [0] NCCL INFO comm 0x56437384f040 rank 2 nranks 4 cudaDev 0 busId 18000 - Destroy COMPLETE aikubeworker0016:1221427:1222412 [1] NCCL INFO comm 0x55ab9313f950 rank 3 nranks 4 cudaDev 1 busId 2a000 - Destroy COMPLETE   |
+| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | 0 | E aikubeworker0012:2166160:2166257 [0] NCCL INFO comm 0x557243829d50 rank 0 nranks 8 cudaDev 0 busId 18000 - Destroy COMPLETE # Out of bounds values : 0 OK # Avg bus bandwidth    : 333.449  # # Collective test concluded: all_reduce_perf #   |
+| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | 0 | r0012:2166272:2166442 [5] NCCL INFO comm 0x55721e270960 rank 5 nranks 16 cudaDev 5 busId ab000 - Destroy COMPLETE aikubeworker0012:2166268:2166447 [1] NCCL INFO comm 0x5644fafd24e0 rank 1 nranks 16 cudaDev 1 busId 2a000 - Destroy COMPLETE   |
 
 ### Multi-node NCCL alltoall
 
 | Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
 |----------|----------------------|-------------|-----------|------------|-----------|--------|
-| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | - | 24.84 GB/s | 16G | 24.89 GB/s | >= 27 GB/s | FAIL |
-| 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs) | - | 47.67 GB/s | 16G | 47.91 GB/s | >= 54 GB/s | FAIL |
-| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | 0,1,4,5 | 72.93 GB/s | 16G | 72.97 GB/s | >= 74 GB/s | FAIL |
-| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | - | 30.04 GB/s | 16G | 30.04 GB/s | >= 77 GB/s | FAIL |
+| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | - | 24.87 GB/s | 16G | 24.93 GB/s | >= 27 GB/s | FAIL |
+| 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs) | - | 47.69 GB/s | 16G | 47.93 GB/s | >= 54 GB/s | FAIL |
+| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | 0,1,4,5 | 72.82 GB/s | 16G | 72.87 GB/s | >= 74 GB/s | FAIL |
+| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | - | 36.70 GB/s | 16G | 36.74 GB/s | >= 77 GB/s | FAIL |
 
 | Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
 |----------|--------------|-----------------|------------------|-------------------|
@@ -72,10 +73,10 @@ Source: nccl-tests-mpirun | Mode: cross-leaf-pdf-matrix-nccl-2.27.7
 
 | Topology | Return Code | Error / Output Tail |
 |----------|-------------|---------------------|
-| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | 0 | ETE aikubeworker0012:2157727:2157802 [0] NCCL INFO comm 0x55a0349b02b0 rank 0 nranks 2 cudaDev 0 busId 18000 - Destroy COMPLETE # Out of bounds values : 0 OK # Avg bus bandwidth    : 24.8897  # # Collective test concluded: alltoall_perf #   |
-| 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs) | 0 | ETE aikubeworker0016:1141290:1142410 [0] NCCL INFO comm 0x55fabbea6410 rank 2 nranks 4 cudaDev 0 busId 18000 - Destroy COMPLETE # Out of bounds values : 0 OK # Avg bus bandwidth    : 47.9094  # # Collective test concluded: alltoall_perf #   |
-| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | 0 | ETE aikubeworker0012:2158071:2158172 [0] NCCL INFO comm 0x563312baa7f0 rank 0 nranks 8 cudaDev 0 busId 18000 - Destroy COMPLETE # Out of bounds values : 0 OK # Avg bus bandwidth    : 72.9657  # # Collective test concluded: alltoall_perf #   |
-| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | 0 | 016:1143717:1145948 [7] NCCL INFO comm 0x5558cc9de640 rank 15 nranks 16 cudaDev 7 busId db000 - Destroy COMPLETE aikubeworker0016:1143713:1145946 [3] NCCL INFO comm 0x55c1af080e60 rank 11 nranks 16 cudaDev 3 busId 5d000 - Destroy COMPLETE   |
+| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | 0 | ETE aikubeworker0012:2166458:2166534 [0] NCCL INFO comm 0x5603baefb150 rank 0 nranks 2 cudaDev 0 busId 18000 - Destroy COMPLETE # Out of bounds values : 0 OK # Avg bus bandwidth    : 24.9304  # # Collective test concluded: alltoall_perf #   |
+| 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs) | 0 | ETE aikubeworker0012:2166543:2166743 [0] NCCL INFO comm 0x5569d31d4f50 rank 0 nranks 4 cudaDev 0 busId 18000 - Destroy COMPLETE # Out of bounds values : 0 OK # Avg bus bandwidth    : 47.9258  # # Collective test concluded: alltoall_perf #   |
+| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | 0 | ker0016:1227342:1228382 [1] NCCL INFO comm 0x55cdec231780 rank 5 nranks 8 cudaDev 1 busId 2a000 - Destroy COMPLETE aikubeworker0016:1227344:1228381 [3] NCCL INFO comm 0x563c7ed39680 rank 7 nranks 8 cudaDev 3 busId ab000 - Destroy COMPLETE   |
+| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | 0 | TE aikubeworker0012:2166925:2167127 [7] NCCL INFO comm 0x560553b91250 rank 7 nranks 16 cudaDev 7 busId db000 - Destroy COMPLETE # Out of bounds values : 0 OK # Avg bus bandwidth    : 36.7382  # # Collective test concluded: alltoall_perf #   |
 
 **Overall: FAIL**
 
-- 
2.47.2


From 2e194ded14fbd6d872e4a81474e406b848194a0a Mon Sep 17 00:00:00 2001
From: cs <shi.chen@robotics.cc>
Date: Sat, 23 May 2026 17:03:02 +0800
Subject: [PATCH 12/41] Document PXN alltoall rail balancing

---
 ...multinode_nccl_alltoall_tuning_20260523.md | 37 ++++++++++++++++++-
 ...s_multinode_nccl_counter_probe_20260523.md | 20 +++++++++-
 reports_multinode_nccl_diagnosis_20260523.md  | 13 ++++++-
 3 files changed, 65 insertions(+), 5 deletions(-)

diff --git a/reports_multinode_nccl_alltoall_tuning_20260523.md b/reports_multinode_nccl_alltoall_tuning_20260523.md
index d26630a..60fadd2 100644
--- a/reports_multinode_nccl_alltoall_tuning_20260523.md
+++ b/reports_multinode_nccl_alltoall_tuning_20260523.md
@@ -10,6 +10,8 @@
 
 `NCCL_PXN_DISABLE=1` 是本轮唯一有效正向参数，可以把 8 卡 alltoall 从约 `30.06 GB/s` 提升到约 `37.24 GB/s`。纳入正式 PDF 矩阵配置后，8 卡 alltoall 原始报告结果为 `36.70 GB/s peak` / `36.74 GB/s avg`。
 
+补充计数器探测显示，`NCCL_PXN_DISABLE=1` 的实际作用是把 alltoall 流量重新均匀分配到 4 条 400G rail 上。baseline 下 `mlx5_0/6` 与 `mlx5_1/7` 的流量约为 3:1；禁用 PXN 后四条 HCA 均为约 `590.98 GB`。但每条 rail 的实际吞吐仍只有约 `19.82 GB/s`（约 `158.6 Gb/s`），没有打满 400G rail。
+
 这个提升有实际价值，但仍远低于 PDF 参考 `76.54 GB/s`。其他参数没有改善，部分明显变差：
 
 | Case | Avg Bus BW | 结论 |
@@ -28,6 +30,36 @@
 | `NCCL_IB_ADAPTIVE_ROUTING=0` | `30.0535 GB/s` | 无改善 |
 | `NCCL_IB_PCI_RELAXED_ORDERING=0` | 未完成 | 明显异常，不建议 |
 
+## PXN disabled 端口计数器
+
+`NCCL_PXN_DISABLE=1` 后，8 卡 alltoall 输出：
+
+| Metric | Value |
+|--------|-------|
+| `algbw` | `39.37 / 39.46 GB/s` |
+| `busbw` | `36.91 / 37.00 GB/s` |
+| `Avg bus bandwidth` | `36.9518 GB/s` |
+
+端口计数器：
+
+| Host | HCA | Xmit GB | Recv GB | Xmit GB/s | Recv GB/s |
+|------|-----|---------|---------|-----------|-----------|
+| 172.72.8.12 | `mlx5_0` | `590.98` | `590.91` | `19.82` | `19.82` |
+| 172.72.8.12 | `mlx5_1` | `590.98` | `590.98` | `19.82` | `19.82` |
+| 172.72.8.12 | `mlx5_6` | `590.98` | `590.90` | `19.82` | `19.82` |
+| 172.72.8.12 | `mlx5_7` | `590.98` | `590.98` | `19.82` | `19.82` |
+| 172.72.8.16 | `mlx5_0` | `590.94` | `590.98` | `19.82` | `19.82` |
+| 172.72.8.16 | `mlx5_1` | `590.94` | `590.98` | `19.82` | `19.82` |
+| 172.72.8.16 | `mlx5_6` | `590.94` | `590.98` | `19.82` | `19.82` |
+| 172.72.8.16 | `mlx5_7` | `590.94` | `590.98` | `19.82` | `19.82` |
+
+对比 baseline：
+
+| Case | Rail 分布 | Avg Bus BW |
+|------|-----------|------------|
+| baseline | `mlx5_0/6` 约 `885 GB`，`mlx5_1/7` 约 `295 GB` | `30.04 GB/s` |
+| `NCCL_PXN_DISABLE=1` | 四条 HCA 均约 `591 GB` | `36.95 GB/s` |
+
 ## 正式配置更新
 
 `configs/multinode_nccl_nccl227_pdf_matrix.yaml` 已对 2 nodes x 8 GPUs 的 alltoall 增加：
@@ -47,5 +79,6 @@ op_env:
 ## 判断
 
 1. PXN 在当前拓扑下对 8 卡 alltoall 有负面影响，禁用后有约 `22-24%` 提升。
-2. 禁用 PXN 后仍只有 PDF 目标的一半左右，剩余差距不是单一 NCCL 环境变量可以补齐。
-3. 后续重点仍应放在 NCCL net plugin/SHARP、交换网络策略、路由/拥塞和 alltoall rail 分布。
+2. 禁用 PXN 可以修复 rail 分布不均衡，但无法打满每条 400G rail。
+3. 禁用 PXN 后仍只有 PDF 目标的一半左右，剩余差距不是单一 NCCL 环境变量可以补齐。
+4. 后续重点仍应放在 NCCL net plugin/SHARP、交换网络策略、路由/拥塞和 NCCL internal alltoall 实现效率。
diff --git a/reports_multinode_nccl_counter_probe_20260523.md b/reports_multinode_nccl_counter_probe_20260523.md
index 784b5c4..c13b5b7 100644
--- a/reports_multinode_nccl_counter_probe_20260523.md
+++ b/reports_multinode_nccl_counter_probe_20260523.md
@@ -14,6 +14,8 @@
 
 8 卡 alltoall 仍只有 `30 GB/s busbw`，不是 HCA 顺序导致。HCA 顺序 sweep 都稳定在 `30.02-30.07 GB/s`。计数器显示 alltoall 流量主要压在 `mlx5_0` 和 `mlx5_6` 上，`mlx5_1` 和 `mlx5_7` 只有约三分之一流量，说明剩余问题更像 NCCL alltoall rail 分布、路由、拥塞、NCCL net plugin/SHARP 或网络侧策略问题。
 
+补充测试显示，`NCCL_PXN_DISABLE=1` 可以把 alltoall 流量均匀分配到四条 HCA，并将 busbw 提升到约 `36.95 GB/s`。不过每条 400G rail 仍只有约 `19.82 GB/s`（约 `158.6 Gb/s`），没有达到裸 RDMA 单 rail `347-387 Gb/s` 的能力。
+
 ## 裸 RDMA 4 rail 并发
 
 命令类型：
@@ -93,10 +95,24 @@ NCCL 输出：
 | `mlx5_1,mlx5_0,mlx5_7,mlx5_6` | `30.0413 GB/s` |
 | `mlx5_6,mlx5_7,mlx5_0,mlx5_1` | `30.0230 GB/s` |
 
+## PXN disabled alltoall 计数器
+
+`NCCL_PXN_DISABLE=1` 后：
+
+| Metric | Value |
+|--------|-------|
+| `Avg bus bandwidth` | `36.9518 GB/s` |
+| 每条 HCA 流量 | 约 `590.94-590.98 GB` |
+| 每条 HCA 吞吐 | 约 `19.82 GB/s` |
+| 每节点 4 HCA 合计吞吐 | 约 `79.29 GB/s` |
+
+判断：禁用 PXN 可以修复 rail 分布不均衡，但不能让 alltoall 打满当前 4 条 400G rail。
+
 ## 判断
 
 1. 裸 RDMA 4 rail 可以并发跑到约 `184.62 GB/s`，网络基础带宽不是单 rail 瓶颈。
 2. 8 卡 allreduce 当前不是软件参数小调能解决的问题，性能已经贴近当前 4 条 400G rail 的物理带宽上限。
 3. 8 卡 alltoall 仍明显异常，且不是 HCA 顺序问题；需要继续从 NCCL alltoall rail 分布、网络路由/拥塞、NCCL net plugin/SHARP、交换机侧策略排查。
-4. 如果验收必须达到 PDF 的 2 机 16 卡 `491.84/76.54 GB/s`，需要确认当前两台机器是否具备与 PDF 参考环境同等的有效跨节点 rail 数量和交换网络能力。
-5. 两台机器当前均未发现 `libnccl-net.so` 或 SHARP/HCOLL 包，NCCL 使用 internal IB plugin；如果目标值依赖 NCCL net plugin/SHARP，需要先补齐对应运行环境。
+4. `NCCL_PXN_DISABLE=1` 可改善 8 卡 alltoall 的 rail 均衡性和性能，但无法补齐到 PDF 目标。
+5. 如果验收必须达到 PDF 的 2 机 16 卡 `491.84/76.54 GB/s`，需要确认当前两台机器是否具备与 PDF 参考环境同等的有效跨节点 rail 数量和交换网络能力。
+6. 两台机器当前均未发现 `libnccl-net.so` 或 SHARP/HCOLL 包，NCCL 使用 internal IB plugin；如果目标值依赖 NCCL net plugin/SHARP，需要先补齐对应运行环境。
diff --git a/reports_multinode_nccl_diagnosis_20260523.md b/reports_multinode_nccl_diagnosis_20260523.md
index 732a6ac..7612e91 100644
--- a/reports_multinode_nccl_diagnosis_20260523.md
+++ b/reports_multinode_nccl_diagnosis_20260523.md
@@ -285,6 +285,15 @@ NET/IB : GPU Direct RDMA Disabled for HCA 0 'mlx5_0'
 
 其他变量如 `NCCL_P2P_PXN_LEVEL`、`NCCL_NET_SHARED_COMMS`、`NCCL_NET_SHARED_BUFFERS`、`NCCL_NCHANNELS_PER_NET_PEER`、`NCCL_IB_ADAPTIVE_ROUTING` 均无改善或变差。
 
+PXN disabled 计数器显示该参数确实修复了 rail 分布：
+
+| Case | Rail 分布 | Avg Bus BW |
+|------|-----------|------------|
+| baseline | `mlx5_0/6` 约 `885 GB`，`mlx5_1/7` 约 `295 GB` | `30.04 GB/s` |
+| `NCCL_PXN_DISABLE=1` | 四条 HCA 均约 `591 GB` | `36.95 GB/s` |
+
+但禁用 PXN 后每条 400G rail 仍只有约 `19.82 GB/s`，没有接近裸 RDMA 单 rail 的 `347-387 Gb/s`。因此它解决的是 rail 分布不均衡的一部分，不是全部 alltoall 性能问题。
+
 ### 8. 8 卡链路计数器与物理上限判断
 
 计数器探测报告：`reports_multinode_nccl_counter_probe_20260523.md`
@@ -327,6 +336,8 @@ busbw = algbw * 2 * (nranks - 1) / nranks = algbw * 1.875
 
 同一测试窗口内端口计数器显示 alltoall 流量分布不均衡：`mlx5_0` 和 `mlx5_6` 的流量约 `885 GB`，`mlx5_1` 和 `mlx5_7` 约 `295 GB`，约为三倍差距。继续调换 `NCCL_IB_HCA` 顺序后，8 卡 alltoall 仍稳定在 `30.02-30.07 GB/s`，说明不是简单 HCA 列表顺序问题。
 
+`NCCL_PXN_DISABLE=1` 后，端口流量变为四条 HCA 均约 `591 GB`，alltoall `Avg bus bandwidth` 提升到 `36.9518 GB/s`，但每条 rail 吞吐仍只有约 `19.82 GB/s`。
+
 ### 9. NCCL net plugin / SHARP 状态
 
 两台机器上均未找到：
@@ -382,7 +393,7 @@ libnccl-dev
 - PDF 8 卡 allreduce `491.84 GB/s busbw` 反推需要约 `262 GB/s algbw`，超过当前 4 x 400G 的物理单向总带宽
 - 8 卡 alltoall 端口计数器显示 rail 分布不均，且 HCA 顺序 sweep 无改善
 - 当前环境缺失 NCCL net plugin/SHARP，NCCL 只能使用 internal IB plugin
-- `NCCL_PXN_DISABLE=1` 可将 8 卡 alltoall 提升到约 `36.7 GB/s`，但仍不到 PDF 参考值的一半
+- `NCCL_PXN_DISABLE=1` 可将 8 卡 alltoall 提升到约 `36.7 GB/s`，并修复 rail 分布不均，但仍不到 PDF 参考值的一半
 
 ### 阻塞 3：`nccl-gpu-2` SSH 存在外部连接压力
 
-- 
2.47.2


From edc469cee995fa899a22e63fe59122016594f6cb Mon Sep 17 00:00:00 2001
From: cs <shi.chen@robotics.cc>
Date: Sat, 23 May 2026 17:13:03 +0800
Subject: [PATCH 13/41] Document NCCL alltoall counter probe

---
 ...multinode_nccl_alltoall_tuning_20260523.md | 18 +++++-
 ...s_multinode_nccl_counter_probe_20260523.md | 61 ++++++++++++++++++-
 reports_multinode_nccl_diagnosis_20260523.md  | 24 ++++++--
 3 files changed, 94 insertions(+), 9 deletions(-)

diff --git a/reports_multinode_nccl_alltoall_tuning_20260523.md b/reports_multinode_nccl_alltoall_tuning_20260523.md
index 60fadd2..f8d6515 100644
--- a/reports_multinode_nccl_alltoall_tuning_20260523.md
+++ b/reports_multinode_nccl_alltoall_tuning_20260523.md
@@ -10,7 +10,9 @@
 
 `NCCL_PXN_DISABLE=1` 是本轮唯一有效正向参数，可以把 8 卡 alltoall 从约 `30.06 GB/s` 提升到约 `37.24 GB/s`。纳入正式 PDF 矩阵配置后，8 卡 alltoall 原始报告结果为 `36.70 GB/s peak` / `36.74 GB/s avg`。
 
-补充计数器探测显示，`NCCL_PXN_DISABLE=1` 的实际作用是把 alltoall 流量重新均匀分配到 4 条 400G rail 上。baseline 下 `mlx5_0/6` 与 `mlx5_1/7` 的流量约为 3:1；禁用 PXN 后四条 HCA 均为约 `590.98 GB`。但每条 rail 的实际吞吐仍只有约 `19.82 GB/s`，没有打满 400G rail。
+补充计数器探测显示，`NCCL_PXN_DISABLE=1` 的实际作用是把 alltoall 流量重新均匀分配到 4 条 400G rail 上。baseline 下 `mlx5_0/6` 与 `mlx5_1/7` 的流量约为 3:1；禁用 PXN 后四条 HCA 均衡。但每条 rail 的实际吞吐仍只有约 `19-20 GB/s`，没有打满 400G rail。
+
+复测错误/拥塞 counter 后，没有看到 discard、链路错误、RoCE 重传、slow restart 或 packet sequence error 增长；主要非零异常是部分端口 `port_xmit_wait`。所以当前不支持“链路坏包/重传导致慢”的判断，更像发送等待/credit 等待、交换侧调度/拥塞控制，或 NCCL internal alltoall 通信模式效率不足。
 
 这个提升有实际价值，但仍远低于 PDF 参考 `76.54 GB/s`。其他参数没有改善，部分明显变差：
 
@@ -60,6 +62,18 @@
 | baseline | `mlx5_0/6` 约 `885 GB`，`mlx5_1/7` 约 `295 GB` | `30.04 GB/s` |
 | `NCCL_PXN_DISABLE=1` | 四条 HCA 均约 `591 GB` | `36.95 GB/s` |
 
+### 错误/等待 counter 复测
+
+PXN disabled 复测结果：
+
+| 观察项 | 结果 |
+|--------|------|
+| `Avg bus bandwidth` | `36.4512 GB/s` |
+| 每条 HCA 流量 | 约 `712.18-712.28 GiB`，四条 rail 均衡 |
+| discard / rcv error / symbol error / link down / link recovery | `0` 增量 |
+| RoCE retrans / slow restart / packet sequence error / out of sequence | `0` 增量 |
+| `port_xmit_wait` | `mlx5_1`、`mlx5_7` 有增长，约 `15.65M-23.49M` |
+
 ## 正式配置更新
 
 `configs/multinode_nccl_nccl227_pdf_matrix.yaml` 已对 2 nodes x 8 GPUs 的 alltoall 增加：
@@ -81,4 +95,4 @@ op_env:
 1. PXN 在当前拓扑下对 8 卡 alltoall 有负面影响，禁用后有约 `22-24%` 提升。
 2. 禁用 PXN 可以修复 rail 分布不均衡，但无法打满每条 400G rail。
 3. 禁用 PXN 后仍只有 PDF 目标的一半左右，剩余差距不是单一 NCCL 环境变量可以补齐。
-4. 后续重点仍应放在 NCCL net plugin/SHARP、交换网络策略、路由/拥塞和 NCCL internal alltoall 实现效率。
+4. 后续重点仍应放在 NCCL net plugin/SHARP、交换网络策略、credit/拥塞等待和 NCCL internal alltoall 实现效率。
diff --git a/reports_multinode_nccl_counter_probe_20260523.md b/reports_multinode_nccl_counter_probe_20260523.md
index c13b5b7..5579df8 100644
--- a/reports_multinode_nccl_counter_probe_20260523.md
+++ b/reports_multinode_nccl_counter_probe_20260523.md
@@ -14,7 +14,9 @@
 
 8 卡 alltoall 仍只有 `30 GB/s busbw`，不是 HCA 顺序导致。HCA 顺序 sweep 都稳定在 `30.02-30.07 GB/s`。计数器显示 alltoall 流量主要压在 `mlx5_0` 和 `mlx5_6` 上，`mlx5_1` 和 `mlx5_7` 只有约三分之一流量，说明剩余问题更像 NCCL alltoall rail 分布、路由、拥塞、NCCL net plugin/SHARP 或网络侧策略问题。
 
-补充测试显示，`NCCL_PXN_DISABLE=1` 可以把 alltoall 流量均匀分配到四条 HCA，并将 busbw 提升到约 `36.95 GB/s`。不过每条 400G rail 仍只有约 `19.82 GB/s`，没有达到裸 RDMA 单 rail 能力。
+补充测试显示，`NCCL_PXN_DISABLE=1` 可以把 alltoall 流量均匀分配到四条 HCA，并将 busbw 提升到约 `36.5-37.0 GB/s`。不过每条 400G rail 仍只有约 `19-20 GB/s`，没有达到裸 RDMA 单 rail 能力。
+
+进一步抓 `counters`/`hw_counters` 后，未看到 discard、CRC/符号错误、packet sequence error、RoCE retrans、slow restart 等错误类计数增长；只看到部分端口 `port_xmit_wait` 增长。也就是说，PXN disabled 后剩余问题不是明显的链路坏包/重传，而更像发送等待、信用/拥塞等待、交换网络调度或 NCCL internal alltoall 通信模式效率问题。
 
 ## 裸 RDMA 4 rail 并发
 
@@ -108,11 +110,66 @@ NCCL 输出：
 
 判断：禁用 PXN 可以修复 rail 分布不均衡，但不能让 alltoall 打满当前 4 条 400G rail。
 
+### PXN disabled 错误/拥塞 counter 复测
+
+复测命令仍为 2 nodes x 8 GPUs，`alltoall_perf -b 16G -e 16G -w 10 -n 10`，并使用：
+
+```bash
+NCCL_PXN_DISABLE=1
+NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_6,mlx5_7
+NCCL_NET_PLUGIN=none
+NCCL_NET_GDR_LEVEL=5
+NCCL_NET_GDR_READ=1
+NCCL_DMABUF_ENABLE=0
+```
+
+NCCL 输出：
+
+| Metric | Value |
+|--------|-------|
+| `algbw` | `39.04 / 38.72 GB/s` |
+| `busbw` | `36.60 / 36.30 GB/s` |
+| `Avg bus bandwidth` | `36.4512 GB/s` |
+
+流量分布保持均衡：
+
+| Host | HCA | Xmit GiB | Recv GiB |
+|------|-----|----------|----------|
+| aikubeworker0012 | `mlx5_0` | `712.28` | `712.19` |
+| aikubeworker0012 | `mlx5_1` | `712.27` | `712.27` |
+| aikubeworker0012 | `mlx5_6` | `712.28` | `712.18` |
+| aikubeworker0012 | `mlx5_7` | `712.27` | `712.27` |
+| aikubeworker0016 | `mlx5_0` | `712.23` | `712.27` |
+| aikubeworker0016 | `mlx5_1` | `712.23` | `712.27` |
+| aikubeworker0016 | `mlx5_6` | `712.23` | `712.27` |
+| aikubeworker0016 | `mlx5_7` | `712.23` | `712.27` |
+
+错误类 counter 增量：
+
+| Counter group | Result |
+|---------------|--------|
+| `port_xmit_discards`, `port_rcv_errors`, `port_rcv_remote_physical_errors`, `port_rcv_switch_relay_errors` | `0` |
+| `symbol_error`, `link_error_recovery`, `link_downed`, `local_link_integrity_errors`, `excessive_buffer_overrun_errors` | `0` |
+| `roce_adp_retrans`, `roce_adp_retrans_to`, `roce_slow_restart*` | `0` |
+| `packet_seq_err`, `out_of_sequence`, `out_of_buffer`, `duplicate_request`, `implied_nak_seq_err` | `0` |
+| `local_ack_timeout_err`, `req_transport_retries_exceeded`, `rnr_nak_retry_err` | `0` |
+
+非零等待类 counter：
+
+| Host | HCA | `port_xmit_wait` delta |
+|------|-----|------------------------|
+| aikubeworker0012 | `mlx5_1` | `23,492,853` |
+| aikubeworker0012 | `mlx5_7` | `17,420,720` |
+| aikubeworker0016 | `mlx5_1` | `20,428,901` |
+| aikubeworker0016 | `mlx5_7` | `15,650,027` |
+
+判断：PXN disabled 后 alltoall 没有明显链路错误、重传或丢包证据；剩余性能缺口更偏向 `port_xmit_wait` 指向的发送等待/信用等待、交换网络拥塞控制/调度，或 NCCL internal alltoall 在当前拓扑下的通信模式效率。
+
 ## 判断
 
 1. 裸 RDMA 4 rail 可以并发跑到约 `184.62 GB/s`，网络基础带宽不是单 rail 瓶颈。
 2. 8 卡 allreduce 当前不是软件参数小调能解决的问题，性能已经贴近当前 4 条 400G rail 的物理带宽上限。
-3. 8 卡 alltoall 仍明显异常，且不是 HCA 顺序问题；需要继续从 NCCL alltoall rail 分布、网络路由/拥塞、NCCL net plugin/SHARP、交换机侧策略排查。
+3. 8 卡 alltoall 仍明显异常，且不是 HCA 顺序问题；PXN disabled 后 rail 已均衡，但仍出现 `port_xmit_wait`，需要继续从网络拥塞/信用等待、交换机侧策略、NCCL alltoall 模式、NCCL net plugin/SHARP 排查。
 4. `NCCL_PXN_DISABLE=1` 可改善 8 卡 alltoall 的 rail 均衡性和性能，但无法补齐到 PDF 目标。
 5. 如果验收必须达到 PDF 的 2 机 16 卡 `491.84/76.54 GB/s`，需要确认当前两台机器是否具备与 PDF 参考环境同等的有效跨节点 rail 数量和交换网络能力。
 6. 两台机器当前均未发现 `libnccl-net.so` 或 SHARP/HCOLL 包，NCCL 使用 internal IB plugin；如果目标值依赖 NCCL net plugin/SHARP，需要先补齐对应运行环境。
diff --git a/reports_multinode_nccl_diagnosis_20260523.md b/reports_multinode_nccl_diagnosis_20260523.md
index 7612e91..226c4f2 100644
--- a/reports_multinode_nccl_diagnosis_20260523.md
+++ b/reports_multinode_nccl_diagnosis_20260523.md
@@ -16,7 +16,7 @@
 
 按 `sx算力节点跨Leaf NCCL测试报告.pdf` 的矩阵继续对齐后，发现 2 机 4 卡档位的核心问题是默认 GPU 选择不符合 GPU-NIC 亲和性。显式选择 `CUDA_VISIBLE_DEVICES=0,1,4,5` 后，2 机 4 卡 allreduce 可以恢复到 `333-335 GB/s` 区间，接近 PDF 的 `335.48 GB/s`；alltoall 配合 PDF 固定 NCCL 参数可到 `72.93 GB/s`，接近 PDF 的 `73.73 GB/s`。但 2 机 8 卡档位仍只有 allreduce `354.02 GB/s`、alltoall `30.04 GB/s`，与 PDF 的 `491.84/76.54 GB/s` 差距明显。
 
-进一步 sweep 8 卡 alltoall 网络参数后，`NCCL_PXN_DISABLE=1` 是唯一有效正向项。正式矩阵配置已对 2 机 8 GPU 的 alltoall 单独加入该变量，8 卡 alltoall 从约 `30.04 GB/s` 提升到 `36.70 GB/s` peak / `36.74 GB/s` avg，但仍低于 PDF 参考 `76.54 GB/s`。
+进一步 sweep 8 卡 alltoall 网络参数后，`NCCL_PXN_DISABLE=1` 是唯一有效正向项。正式矩阵配置已对 2 机 8 GPU 的 alltoall 单独加入该变量，8 卡 alltoall 从约 `30.04 GB/s` 提升到 `36.70 GB/s` peak / `36.74 GB/s` avg，但仍低于 PDF 参考 `76.54 GB/s`。复测端口 counter 后，PXN disabled 下 4 条 rail 的流量已均衡，且没有明显链路错误、丢包、RoCE 重传或 slow restart；只在部分端口看到 `port_xmit_wait` 增长，剩余差距更像发送等待/信用等待、交换网络策略或 NCCL internal alltoall 通信模式效率问题。
 
 同时，`nccl-gpu-2` 的 SSH 入口曾因未认证连接过多触发 `MaxStartups` 随机拒绝，导致 `mpirun` 拉起远端 rank 失败。已经做了临时 SSHD 缓解并拿到有效的 2 节点 x 8 GPU allreduce/alltoall 报告。
 
@@ -36,6 +36,7 @@
 12. 增加 topology 级 `cuda_visible_devices`、`env`、`op_env` 配置能力，支持按 GPU/NIC 亲和性和不同 NCCL op 分别设置环境变量。
 13. 生成 PDF 矩阵式原始报告 `reports_multinode_nccl_pdf_matrix_nccl227.md`，覆盖 2 机 1/2/4/8 GPU per node。
 14. 对 8 卡 alltoall 做 NCCL 网络参数 sweep，并将有效项 `NCCL_PXN_DISABLE=1` 固化到 PDF 矩阵配置。
+15. 对 PXN disabled 后的 8 卡 alltoall 抓取 `counters`/`hw_counters` 增量，确认 rail 已均衡且无明显错误/重传，剩余异常主要伴随 `port_xmit_wait`。
 
 ## 关键证据
 
@@ -292,7 +293,19 @@ PXN disabled 计数器显示该参数确实修复了 rail 分布：
 | baseline | `mlx5_0/6` 约 `885 GB`，`mlx5_1/7` 约 `295 GB` | `30.04 GB/s` |
 | `NCCL_PXN_DISABLE=1` | 四条 HCA 均约 `591 GB` | `36.95 GB/s` |
 
-但禁用 PXN 后每条 400G rail 仍只有约 `19.82 GB/s`，没有接近裸 RDMA 单 rail 的 `347-387 Gb/s`。因此它解决的是 rail 分布不均衡的一部分，不是全部 alltoall 性能问题。
+但禁用 PXN 后每条 400G rail 仍只有约 `19-20 GB/s`（约 `152-160 Gb/s`），没有接近裸 RDMA 单 rail 的 `347-387 Gb/s`。因此它只解决了 rail 分布不均衡这一部分问题，而不是全部 alltoall 性能问题。
+
+复测 PXN disabled alltoall 时继续抓 `counters`/`hw_counters`：
+
+| 观察项 | 结果 |
+|--------|------|
+| alltoall `Avg bus bandwidth` | `36.4512 GB/s` |
+| 每条 HCA 流量 | 约 `712.18-712.28 GiB`，四条 rail 均衡 |
+| discard / rcv error / symbol error / link down / link recovery | `0` 增量 |
+| RoCE retrans / slow restart / packet sequence error / out of sequence | `0` 增量 |
+| `port_xmit_wait` | `mlx5_1`、`mlx5_7` 有增长，约 `15.65M-23.49M` |
+
+判断：当前没有明显坏链路、丢包或重传证据；`port_xmit_wait` 更像发送侧等待 credit/拥塞控制/交换侧调度，或者 NCCL internal alltoall 在当前拓扑下没有把 rail 吞吐打起来。
 
 ### 8. 8 卡链路计数器与物理上限判断
 
@@ -391,9 +404,10 @@ libnccl-dev
 - 8 卡 allreduce `algbw ~= 189 GB/s`，接近当前 4 x 400G HCA 的理论单向合计 `200 GB/s`
 - 裸 RDMA 4 rail 并发 `ib_write_bw` 合计 `1476.95 Gb/s` / `184.62 GB/s`
 - PDF 8 卡 allreduce `491.84 GB/s busbw` 反推需要约 `262 GB/s algbw`，超过当前 4 x 400G 的物理单向总带宽
-- 8 卡 alltoall 端口计数器显示 rail 分布不均，且 HCA 顺序 sweep 无改善
+- 8 卡 alltoall baseline 端口计数器显示 rail 分布不均，且 HCA 顺序 sweep 无改善
 - 当前环境缺失 NCCL net plugin/SHARP，NCCL 只能使用 internal IB plugin
 - `NCCL_PXN_DISABLE=1` 可将 8 卡 alltoall 提升到约 `36.7 GB/s`，并修复 rail 分布不均，但仍不到 PDF 参考值的一半
+- PXN disabled 复测没有看到 discard、链路错误、RoCE 重传、slow restart、packet sequence error 等错误类 counter 增长；主要异常信号是部分端口 `port_xmit_wait`
 
 ### 阻塞 3：`nccl-gpu-2` SSH 存在外部连接压力
 
@@ -414,9 +428,9 @@ libnccl-dev
 4. 4 卡 allreduce 建议继续让 NCCL 自动选择 channel/QP；4 卡 alltoall 如果要贴近 PDF，可单独套 `NCCL_IB_QPS_PER_CONNECTION=4`、`NCCL_MIN_NCHANNELS=4`、`NCCL_IB_SPLIT_DATA_ON_QPS=1`。
 5. 8 卡 per node 不建议套上述固定参数，会降低 allreduce；继续用 auto。
 6. 尝试安装或启用匹配当前 OFED/driver 的 NCCL net plugin/SHARP；当前日志显示 `Could not find: libnccl-net.so`，NCCL 使用的是 internal IB plugin。
-7. 核对跨 Leaf 链路的 rail mapping、交换机端口速率、路由和拥塞计数，确认 4 个 400Gb/s HCA 是否都在跨节点通信中充分利用。
+7. 核对跨 Leaf 链路的 rail mapping、交换机端口速率、路由、credit/拥塞等待与交换机侧队列计数，解释 PXN disabled 后 `port_xmit_wait` 增长但无错误/重传的原因。
 8. 确认当前 PDF 的 `491.84/76.54 GB/s` 是否要求当前这两台节点在只有 4 条 400G rail 的形态下也达到；如果要求一致，需要网络/硬件侧继续介入。
-9. 对 8 卡 alltoall，重点查 NCCL rail 分布、交换机 ECMP/自适应路由、拥塞计数、SHARP/NCCL net plugin，而不是继续调 `NCCL_IB_HCA` 顺序。
+9. 对 8 卡 alltoall，重点查交换机 ECMP/自适应路由、拥塞/credit 等待、SHARP/NCCL net plugin 和 NCCL internal alltoall 行为；`NCCL_IB_HCA` 顺序与 rail 分布本身已经不是当前主问题。
 
 ## 当前可交付物
 
-- 
2.47.2


From 1813c11bbfcf84b3b466c9f9b3d09b44d5cf2fcb Mon Sep 17 00:00:00 2001
From: cs <shi.chen@robotics.cc>
Date: Sat, 23 May 2026 17:17:22 +0800
Subject: [PATCH 14/41] Compare NCCL allreduce alltoall counters

---
 ...multinode_nccl_alltoall_tuning_20260523.md | 13 +++++-
 ...s_multinode_nccl_counter_probe_20260523.md | 40 +++++++++++++++++--
 reports_multinode_nccl_diagnosis_20260523.md  | 23 ++++++++---
 3 files changed, 66 insertions(+), 10 deletions(-)

diff --git a/reports_multinode_nccl_alltoall_tuning_20260523.md b/reports_multinode_nccl_alltoall_tuning_20260523.md
index f8d6515..aea43d9 100644
--- a/reports_multinode_nccl_alltoall_tuning_20260523.md
+++ b/reports_multinode_nccl_alltoall_tuning_20260523.md
@@ -12,7 +12,7 @@
 
 补充计数器探测显示，`NCCL_PXN_DISABLE=1` 的实际作用是把 alltoall 流量重新均匀分配到 4 条 400G rail 上。baseline 下 `mlx5_0/6` 与 `mlx5_1/7` 的流量约为 3:1；禁用 PXN 后四条 HCA 均衡。但每条 rail 的实际吞吐仍只有约 `19-20 GB/s`，没有打满 400G rail。
 
-复测错误/拥塞 counter 后，没有看到 discard、链路错误、RoCE 重传、slow restart 或 packet sequence error 增长；主要非零异常是部分端口 `port_xmit_wait`。所以当前不支持“链路坏包/重传导致慢”的判断，更像发送等待/credit 等待、交换侧调度/拥塞控制，或 NCCL internal alltoall 通信模式效率不足。
+复测错误/拥塞 counter 后，没有看到 discard、链路错误、RoCE 重传、slow restart 或 packet sequence error 增长；主要非零异常是部分端口 `port_xmit_wait`。不过 allreduce 对照在 `354 GB/s busbw` 时也会出现同类 `port_xmit_wait`，所以当前不支持“链路坏包/重传导致慢”的判断，也不能只用 `port_xmit_wait` 解释 alltoall 低吞吐。更可能的方向是 NCCL internal alltoall 通信模式效率、交换侧调度/拥塞控制，或缺少 NCCL net plugin/SHARP。
 
 这个提升有实际价值，但仍远低于 PDF 参考 `76.54 GB/s`。其他参数没有改善，部分明显变差：
 
@@ -74,6 +74,15 @@ PXN disabled 复测结果：
 | RoCE retrans / slow restart / packet sequence error / out of sequence | `0` 增量 |
 | `port_xmit_wait` | `mlx5_1`、`mlx5_7` 有增长，约 `15.65M-23.49M` |
 
+allreduce 对照：
+
+| 观察项 | 结果 |
+|--------|------|
+| `Avg bus bandwidth` | `354.366 GB/s` |
+| 每条 HCA 流量 | 约 `178.03-178.07 GiB`，四条 rail 均衡 |
+| 错误/重传类 counter | `0` 增量 |
+| `port_xmit_wait` | `mlx5_1`、`mlx5_7` 有增长，约 `6.11M-6.59M` |
+
 ## 正式配置更新
 
 `configs/multinode_nccl_nccl227_pdf_matrix.yaml` 已对 2 nodes x 8 GPUs 的 alltoall 增加：
@@ -95,4 +104,4 @@ op_env:
 1. PXN 在当前拓扑下对 8 卡 alltoall 有负面影响，禁用后有约 `22-24%` 提升。
 2. 禁用 PXN 可以修复 rail 分布不均衡，但无法打满每条 400G rail。
 3. 禁用 PXN 后仍只有 PDF 目标的一半左右，剩余差距不是单一 NCCL 环境变量可以补齐。
-4. 后续重点仍应放在 NCCL net plugin/SHARP、交换网络策略、credit/拥塞等待和 NCCL internal alltoall 实现效率。
+4. 后续重点仍应放在 NCCL net plugin/SHARP、交换网络策略和 NCCL internal alltoall 实现效率；`port_xmit_wait` 需要结合 allreduce 对照解读，不能单独作为 alltoall 根因。
diff --git a/reports_multinode_nccl_counter_probe_20260523.md b/reports_multinode_nccl_counter_probe_20260523.md
index 5579df8..9e42251 100644
--- a/reports_multinode_nccl_counter_probe_20260523.md
+++ b/reports_multinode_nccl_counter_probe_20260523.md
@@ -16,7 +16,7 @@
 
 补充测试显示，`NCCL_PXN_DISABLE=1` 可以把 alltoall 流量均匀分配到四条 HCA，并将 busbw 提升到约 `36.5-37.0 GB/s`。不过每条 400G rail 仍只有约 `19-20 GB/s`，没有达到裸 RDMA 单 rail 能力。
 
-进一步抓 `counters`/`hw_counters` 后，未看到 discard、CRC/符号错误、packet sequence error、RoCE retrans、slow restart 等错误类计数增长；只看到部分端口 `port_xmit_wait` 增长。也就是说，PXN disabled 后剩余问题不是明显的链路坏包/重传，而更像发送等待、信用/拥塞等待、交换网络调度或 NCCL internal alltoall 通信模式效率问题。
+进一步抓 `counters`/`hw_counters` 后，未看到 discard、CRC/符号错误、packet sequence error、RoCE retrans、slow restart 等错误类计数增长；只看到部分端口 `port_xmit_wait` 增长。对照 allreduce 后发现，allreduce 在 `354 GB/s busbw` 时也会出现同类 `port_xmit_wait`，因此 `port_xmit_wait` 不是 alltoall 低吞吐的充分解释，只能说明发送侧存在等待。剩余问题更像 NCCL internal alltoall 通信模式、交换网络调度/拥塞控制，或缺少 NCCL net plugin/SHARP 能力。
 
 ## 裸 RDMA 4 rail 并发
 
@@ -62,6 +62,40 @@ busbw = algbw * 2 * (nranks - 1) / nranks
 
 当前 `189.12 GB/s algbw` 已接近 `4 x 400Gb/s = 200 GB/s` 理论单向总带宽。
 
+### allreduce counter 对照
+
+对同样 2 nodes x 8 GPUs、同样 4 条 HCA 的 16G allreduce 复测 counter：
+
+| Metric | Value |
+|--------|-------|
+| `algbw` | `189.22 / 188.77 GB/s` |
+| `busbw` | `354.79 / 353.94 GB/s` |
+| `Avg bus bandwidth` | `354.366 GB/s` |
+
+流量分布：
+
+| Host | HCA | Xmit GiB | Recv GiB |
+|------|-----|----------|----------|
+| aikubeworker0012 | `mlx5_0` | `178.07` | `178.03` |
+| aikubeworker0012 | `mlx5_1` | `178.07` | `178.07` |
+| aikubeworker0012 | `mlx5_6` | `178.07` | `178.03` |
+| aikubeworker0012 | `mlx5_7` | `178.07` | `178.07` |
+| aikubeworker0016 | `mlx5_0` | `178.03` | `178.07` |
+| aikubeworker0016 | `mlx5_1` | `178.07` | `178.07` |
+| aikubeworker0016 | `mlx5_6` | `178.03` | `178.07` |
+| aikubeworker0016 | `mlx5_7` | `178.07` | `178.07` |
+
+错误类 counter 增量同样为 `0`，非零等待类 counter 为：
+
+| Host | HCA | `port_xmit_wait` delta |
+|------|-----|------------------------|
+| aikubeworker0012 | `mlx5_1` | `6,555,518` |
+| aikubeworker0012 | `mlx5_7` | `6,325,059` |
+| aikubeworker0016 | `mlx5_1` | `6,585,965` |
+| aikubeworker0016 | `mlx5_7` | `6,112,874` |
+
+判断：allreduce 在达到当前 4 x 400G rail 物理上限附近时也会出现 `port_xmit_wait`，所以这个 counter 不能单独解释 alltoall 只有 `36-37 GB/s`。alltoall 的问题更偏向通信模式效率或网络调度策略，而不是简单链路错误。
+
 ## 8 卡 alltoall
 
 NCCL 输出：
@@ -163,13 +197,13 @@ NCCL 输出：
 | aikubeworker0016 | `mlx5_1` | `20,428,901` |
 | aikubeworker0016 | `mlx5_7` | `15,650,027` |
 
-判断：PXN disabled 后 alltoall 没有明显链路错误、重传或丢包证据；剩余性能缺口更偏向 `port_xmit_wait` 指向的发送等待/信用等待、交换网络拥塞控制/调度，或 NCCL internal alltoall 在当前拓扑下的通信模式效率。
+判断：PXN disabled 后 alltoall 没有明显链路错误、重传或丢包证据。结合 allreduce 对照，`port_xmit_wait` 只能作为发送等待信号，不能单独解释 alltoall 低吞吐；剩余性能缺口更偏向 NCCL internal alltoall 在当前拓扑下的通信模式效率、交换网络调度/拥塞控制，或外部 NCCL net plugin/SHARP 缺失。
 
 ## 判断
 
 1. 裸 RDMA 4 rail 可以并发跑到约 `184.62 GB/s`，网络基础带宽不是单 rail 瓶颈。
 2. 8 卡 allreduce 当前不是软件参数小调能解决的问题，性能已经贴近当前 4 条 400G rail 的物理带宽上限。
-3. 8 卡 alltoall 仍明显异常，且不是 HCA 顺序问题；PXN disabled 后 rail 已均衡，但仍出现 `port_xmit_wait`，需要继续从网络拥塞/信用等待、交换机侧策略、NCCL alltoall 模式、NCCL net plugin/SHARP 排查。
+3. 8 卡 alltoall 仍明显异常，且不是 HCA 顺序问题；PXN disabled 后 rail 已均衡，`port_xmit_wait` 不是 alltoall 独有，需要继续从 NCCL alltoall 模式、交换机侧策略、NCCL net plugin/SHARP 排查。
 4. `NCCL_PXN_DISABLE=1` 可改善 8 卡 alltoall 的 rail 均衡性和性能，但无法补齐到 PDF 目标。
 5. 如果验收必须达到 PDF 的 2 机 16 卡 `491.84/76.54 GB/s`，需要确认当前两台机器是否具备与 PDF 参考环境同等的有效跨节点 rail 数量和交换网络能力。
 6. 两台机器当前均未发现 `libnccl-net.so` 或 SHARP/HCOLL 包，NCCL 使用 internal IB plugin；如果目标值依赖 NCCL net plugin/SHARP，需要先补齐对应运行环境。
diff --git a/reports_multinode_nccl_diagnosis_20260523.md b/reports_multinode_nccl_diagnosis_20260523.md
index 226c4f2..61e093d 100644
--- a/reports_multinode_nccl_diagnosis_20260523.md
+++ b/reports_multinode_nccl_diagnosis_20260523.md
@@ -16,7 +16,7 @@
 
 按 `sx算力节点跨Leaf NCCL测试报告.pdf` 的矩阵继续对齐后，发现 2 机 4 卡档位的核心问题是默认 GPU 选择不符合 GPU-NIC 亲和性。显式选择 `CUDA_VISIBLE_DEVICES=0,1,4,5` 后，2 机 4 卡 allreduce 可以恢复到 `333-335 GB/s` 区间，接近 PDF 的 `335.48 GB/s`；alltoall 配合 PDF 固定 NCCL 参数可到 `72.93 GB/s`，接近 PDF 的 `73.73 GB/s`。但 2 机 8 卡档位仍只有 allreduce `354.02 GB/s`、alltoall `30.04 GB/s`，与 PDF 的 `491.84/76.54 GB/s` 差距明显。
 
-进一步 sweep 8 卡 alltoall 网络参数后，`NCCL_PXN_DISABLE=1` 是唯一有效正向项。正式矩阵配置已对 2 机 8 GPU 的 alltoall 单独加入该变量，8 卡 alltoall 从约 `30.04 GB/s` 提升到 `36.70 GB/s` peak / `36.74 GB/s` avg，但仍低于 PDF 参考 `76.54 GB/s`。复测端口 counter 后，PXN disabled 下 4 条 rail 的流量已均衡，且没有明显链路错误、丢包、RoCE 重传或 slow restart；只在部分端口看到 `port_xmit_wait` 增长，剩余差距更像发送等待/信用等待、交换网络策略或 NCCL internal alltoall 通信模式效率问题。
+进一步 sweep 8 卡 alltoall 网络参数后，`NCCL_PXN_DISABLE=1` 是唯一有效正向项。正式矩阵配置已对 2 机 8 GPU 的 alltoall 单独加入该变量，8 卡 alltoall 从约 `30.04 GB/s` 提升到 `36.70 GB/s` peak / `36.74 GB/s` avg，但仍低于 PDF 参考 `76.54 GB/s`。复测端口 counter 后，PXN disabled 下 4 条 rail 的流量已均衡，且没有明显链路错误、丢包、RoCE 重传或 slow restart；同类 `port_xmit_wait` 在高吞吐 allreduce 中也会出现，因此它不是 alltoall 低吞吐的充分解释。剩余差距更像 NCCL internal alltoall 通信模式效率、交换网络策略，或缺少 NCCL net plugin/SHARP 能力。
 
 同时，`nccl-gpu-2` 的 SSH 入口曾因未认证连接过多触发 `MaxStartups` 随机拒绝，导致 `mpirun` 拉起远端 rank 失败。已经做了临时 SSHD 缓解并拿到有效的 2 节点 x 8 GPU allreduce/alltoall 报告。
 
@@ -36,7 +36,8 @@
 12. 增加 topology 级 `cuda_visible_devices`、`env`、`op_env` 配置能力，支持按 GPU/NIC 亲和性和不同 NCCL op 分别设置环境变量。
 13. 生成 PDF 矩阵式原始报告 `reports_multinode_nccl_pdf_matrix_nccl227.md`，覆盖 2 机 1/2/4/8 GPU per node。
 14. 对 8 卡 alltoall 做 NCCL 网络参数 sweep，并将有效项 `NCCL_PXN_DISABLE=1` 固化到 PDF 矩阵配置。
-15. 对 PXN disabled 后的 8 卡 alltoall 抓取 `counters`/`hw_counters` 增量，确认 rail 已均衡且无明显错误/重传，剩余异常主要伴随 `port_xmit_wait`。
+15. 对 PXN disabled 后的 8 卡 alltoall 抓取 `counters`/`hw_counters` 增量，确认 rail 已均衡且无明显错误/重传。
+16. 对同样 2x8 allreduce 抓 counter 对照，确认高吞吐 allreduce 也会出现 `port_xmit_wait`，因此该 counter 不是 alltoall 低吞吐的唯一根因。
 
 ## 关键证据
 
@@ -307,6 +308,17 @@ PXN disabled 计数器显示该参数确实修复了 rail 分布：
 
 判断：当前没有明显坏链路、丢包或重传证据；`port_xmit_wait` 更像发送侧等待 credit/拥塞控制/交换侧调度，或者 NCCL internal alltoall 在当前拓扑下没有把 rail 吞吐打起来。
 
+同样 2 nodes x 8 GPUs、同样 4 条 HCA 的 16G allreduce 对照：
+
+| 观察项 | 结果 |
+|--------|------|
+| allreduce `Avg bus bandwidth` | `354.366 GB/s` |
+| 每条 HCA 流量 | 约 `178.03-178.07 GiB`，四条 rail 均衡 |
+| 错误/重传类 counter | `0` 增量 |
+| `port_xmit_wait` | `mlx5_1`、`mlx5_7` 有增长，约 `6.11M-6.59M` |
+
+判断：allreduce 在接近物理上限时也会出现 `port_xmit_wait`，所以 alltoall 的核心问题不能只归因于该 counter。现在更应关注 NCCL alltoall 通信模式、交换网络策略、以及 NCCL net plugin/SHARP 能力差异。
+
 ### 8. 8 卡链路计数器与物理上限判断
 
 计数器探测报告：`reports_multinode_nccl_counter_probe_20260523.md`
@@ -407,7 +419,8 @@ libnccl-dev
 - 8 卡 alltoall baseline 端口计数器显示 rail 分布不均，且 HCA 顺序 sweep 无改善
 - 当前环境缺失 NCCL net plugin/SHARP，NCCL 只能使用 internal IB plugin
 - `NCCL_PXN_DISABLE=1` 可将 8 卡 alltoall 提升到约 `36.7 GB/s`，并修复 rail 分布不均，但仍不到 PDF 参考值的一半
-- PXN disabled 复测没有看到 discard、链路错误、RoCE 重传、slow restart、packet sequence error 等错误类 counter 增长；主要异常信号是部分端口 `port_xmit_wait`
+- PXN disabled 复测没有看到 discard、链路错误、RoCE 重传、slow restart、packet sequence error 等错误类 counter 增长
+- allreduce 对照同样出现 `port_xmit_wait` 但能跑到 `354.366 GB/s`，说明 `port_xmit_wait` 不是 alltoall 低吞吐的唯一根因
 
 ### 阻塞 3：`nccl-gpu-2` SSH 存在外部连接压力
 
@@ -428,9 +441,9 @@ libnccl-dev
 4. 4 卡 allreduce 建议继续让 NCCL 自动选择 channel/QP；4 卡 alltoall 如果要贴近 PDF，可单独套 `NCCL_IB_QPS_PER_CONNECTION=4`、`NCCL_MIN_NCHANNELS=4`、`NCCL_IB_SPLIT_DATA_ON_QPS=1`。
 5. 8 卡 per node 不建议套上述固定参数，会降低 allreduce；继续用 auto。
 6. 尝试安装或启用匹配当前 OFED/driver 的 NCCL net plugin/SHARP；当前日志显示 `Could not find: libnccl-net.so`，NCCL 使用的是 internal IB plugin。
-7. 核对跨 Leaf 链路的 rail mapping、交换机端口速率、路由、credit/拥塞等待与交换机侧队列计数，解释 PXN disabled 后 `port_xmit_wait` 增长但无错误/重传的原因。
+7. 核对跨 Leaf 链路的 rail mapping、交换机端口速率、路由、credit/拥塞等待与交换机侧队列计数；同时用 allreduce 对照避免把 `port_xmit_wait` 误判为 alltoall 独有根因。
 8. 确认当前 PDF 的 `491.84/76.54 GB/s` 是否要求当前这两台节点在只有 4 条 400G rail 的形态下也达到；如果要求一致，需要网络/硬件侧继续介入。
-9. 对 8 卡 alltoall，重点查交换机 ECMP/自适应路由、拥塞/credit 等待、SHARP/NCCL net plugin 和 NCCL internal alltoall 行为；`NCCL_IB_HCA` 顺序与 rail 分布本身已经不是当前主问题。
+9. 对 8 卡 alltoall，重点查 SHARP/NCCL net plugin、NCCL internal alltoall 行为、交换机 ECMP/自适应路由和拥塞/credit 等待；`NCCL_IB_HCA` 顺序与 rail 分布本身已经不是当前主问题。
 
 ## 当前可交付物
 
-- 
2.47.2


From 82c6316716e6ea554ad7167b2592bf9fcddc3c6a Mon Sep 17 00:00:00 2001
From: cs <shi.chen@robotics.cc>
Date: Sat, 23 May 2026 17:28:28 +0800
Subject: [PATCH 15/41] Document NCCL alltoall secondary sweep

---
 ...multinode_nccl_alltoall_tuning_20260523.md | 32 +++++++++++++++++--
 reports_multinode_nccl_diagnosis_20260523.md  | 23 +++++++++++--
 2 files changed, 50 insertions(+), 5 deletions(-)

diff --git a/reports_multinode_nccl_alltoall_tuning_20260523.md b/reports_multinode_nccl_alltoall_tuning_20260523.md
index aea43d9..d9711ff 100644
--- a/reports_multinode_nccl_alltoall_tuning_20260523.md
+++ b/reports_multinode_nccl_alltoall_tuning_20260523.md
@@ -14,7 +14,7 @@
 
 复测错误/拥塞 counter 后，没有看到 discard、链路错误、RoCE 重传、slow restart 或 packet sequence error 增长；主要非零异常是部分端口 `port_xmit_wait`。不过 allreduce 对照在 `354 GB/s busbw` 时也会出现同类 `port_xmit_wait`，所以当前不支持“链路坏包/重传导致慢”的判断，也不能只用 `port_xmit_wait` 解释 alltoall 低吞吐。更可能的方向是 NCCL internal alltoall 通信模式效率、交换侧调度/拥塞控制，或缺少 NCCL net plugin/SHARP。
 
-这个提升有实际价值，但仍远低于 PDF 参考 `76.54 GB/s`。其他参数没有改善，部分明显变差：
+这个提升有实际价值，但仍远低于 PDF 参考 `76.54 GB/s`。在 `NCCL_PXN_DISABLE=1` 之前做过一轮参数 sweep，其他参数没有改善，部分明显变差：
 
 | Case | Avg Bus BW | 结论 |
 |------|------------|------|
@@ -32,6 +32,31 @@
 | `NCCL_IB_ADAPTIVE_ROUTING=0` | `30.0535 GB/s` | 无改善 |
 | `NCCL_IB_PCI_RELAXED_ORDERING=0` | 未完成 | 明显异常，不建议 |
 
+在 `NCCL_PXN_DISABLE=1` 作为基线后又补跑了一轮叠加参数 sweep。短测窗口里 `NCCL_NVLS_ENABLE=0`、`NCCL_P2P_NET_CHUNKSIZE=4194304`（4M）有小幅波动式提升，但更长的 `-w 10 -n 10` 复测没有复现，不能作为稳定优化项。
+
+| Case | Avg Bus BW | 结论 |
+|------|------------|------|
+| `NCCL_PXN_DISABLE=1` | `37.0069 GB/s` | 短测基线 |
+| `+ NCCL_NVLS_ENABLE=0` | `37.2217 GB/s` | 小幅波动，不稳定 |
+| `+ NCCL_P2P_NET_CHUNKSIZE=4194304` | `37.2522 GB/s` | 小幅波动，不稳定 |
+| `+ NCCL_BUFFSIZE=8388608` | `37.0911 GB/s` | 无实质改善 |
+| `+ NCCL_MIN_NCHANNELS=16 NCCL_MAX_NCHANNELS=16` | `37.0189 GB/s` | 无实质改善 |
+| `+ NCCL_IB_AR_THRESHOLD=0` | `37.0843 GB/s` | 无实质改善 |
+| `+ NCCL_IB_QPS_PER_CONNECTION=4 NCCL_IB_SPLIT_DATA_ON_QPS=0` | `35.9847 GB/s` | 变差 |
+| `+ NCCL_IB_QPS_PER_CONNECTION=4 NCCL_IB_SPLIT_DATA_ON_QPS=1` | `29.8406 GB/s` | 明显变差 |
+| `+ NCCL_IB_QPS_PER_CONNECTION=8 NCCL_IB_SPLIT_DATA_ON_QPS=1` | `24.1183 GB/s` | 明显变差 |
+| `+ NCCL_NCHANNELS_PER_NET_PEER=8` | `29.8904 GB/s` | 明显变差 |
+
+长测复核：
+
+| Case | Avg Bus BW | 结论 |
+|------|------------|------|
+| `NCCL_PXN_DISABLE=1` | `32.7280 GB/s` | 当前窗口基线下滑 |
+| `+ NCCL_P2P_NET_CHUNKSIZE=4194304` | `31.9340 GB/s` | 未复现短测提升 |
+| `+ NCCL_NVLS_ENABLE=0 NCCL_P2P_NET_CHUNKSIZE=4194304` | `27.6585 GB/s` | 明显变差 |
+
+补充 ENV/INIT/NET 日志确认，性能波动时仍是 NCCL `2.27.7+cuda12.4`、4 条 400G HCA、GDR enabled、internal IB plugin；不是退回旧 NCCL、HCA 选择错误或 GDR 失效。
+
 ## PXN disabled 端口计数器
 
 `NCCL_PXN_DISABLE=1` 后，8 卡 alltoall 输出：
@@ -103,5 +128,6 @@ op_env:
 
 1. PXN 在当前拓扑下对 8 卡 alltoall 有负面影响，禁用后有约 `22-24%` 提升。
 2. 禁用 PXN 可以修复 rail 分布不均衡，但无法打满每条 400G rail。
-3. 禁用 PXN 后仍只有 PDF 目标的一半左右，剩余差距不是单一 NCCL 环境变量可以补齐。
-4. 后续重点仍应放在 NCCL net plugin/SHARP、交换网络策略和 NCCL internal alltoall 实现效率；`port_xmit_wait` 需要结合 allreduce 对照解读，不能单独作为 alltoall 根因。
+3. PXN disabled 基线上继续叠加 NVLS、P2P chunk、buffer、channel、QP/split、AR 等参数，没有稳定收益；QP/split 和 `NCCL_NCHANNELS_PER_NET_PEER=8` 反而明显变差。
+4. 禁用 PXN 后仍只有 PDF 目标的一半左右，剩余差距不是单一 NCCL 环境变量可以补齐。
+5. 后续重点仍应放在 NCCL net plugin/SHARP、交换网络策略和 NCCL internal alltoall 实现效率；`port_xmit_wait` 需要结合 allreduce 对照解读，不能单独作为 alltoall 根因。
diff --git a/reports_multinode_nccl_diagnosis_20260523.md b/reports_multinode_nccl_diagnosis_20260523.md
index 61e093d..5acbd5e 100644
--- a/reports_multinode_nccl_diagnosis_20260523.md
+++ b/reports_multinode_nccl_diagnosis_20260523.md
@@ -16,7 +16,7 @@
 
 按 `sx算力节点跨Leaf NCCL测试报告.pdf` 的矩阵继续对齐后，发现 2 机 4 卡档位的核心问题是默认 GPU 选择不符合 GPU-NIC 亲和性。显式选择 `CUDA_VISIBLE_DEVICES=0,1,4,5` 后，2 机 4 卡 allreduce 可以恢复到 `333-335 GB/s` 区间，接近 PDF 的 `335.48 GB/s`；alltoall 配合 PDF 固定 NCCL 参数可到 `72.93 GB/s`，接近 PDF 的 `73.73 GB/s`。但 2 机 8 卡档位仍只有 allreduce `354.02 GB/s`、alltoall `30.04 GB/s`，与 PDF 的 `491.84/76.54 GB/s` 差距明显。
 
-进一步 sweep 8 卡 alltoall 网络参数后，`NCCL_PXN_DISABLE=1` 是唯一有效正向项。正式矩阵配置已对 2 机 8 GPU 的 alltoall 单独加入该变量，8 卡 alltoall 从约 `30.04 GB/s` 提升到 `36.70 GB/s` peak / `36.74 GB/s` avg，但仍低于 PDF 参考 `76.54 GB/s`。复测端口 counter 后，PXN disabled 下 4 条 rail 的流量已均衡，且没有明显链路错误、丢包、RoCE 重传或 slow restart；同类 `port_xmit_wait` 在高吞吐 allreduce 中也会出现，因此它不是 alltoall 低吞吐的充分解释。剩余差距更像 NCCL internal alltoall 通信模式效率、交换网络策略，或缺少 NCCL net plugin/SHARP 能力。
+进一步 sweep 8 卡 alltoall 网络参数后，`NCCL_PXN_DISABLE=1` 是唯一有效正向项。正式矩阵配置已对 2 机 8 GPU 的 alltoall 单独加入该变量，8 卡 alltoall 从约 `30.04 GB/s` 提升到 `36.70 GB/s` peak / `36.74 GB/s` avg，但仍低于 PDF 参考 `76.54 GB/s`。复测端口 counter 后，PXN disabled 下 4 条 rail 的流量已均衡，且没有明显链路错误、丢包、RoCE 重传或 slow restart；同类 `port_xmit_wait` 在高吞吐 allreduce 中也会出现，因此它不是 alltoall 低吞吐的充分解释。继续在 PXN disabled 基线上叠加 NVLS、P2P chunk、buffer、channel、QP/split、AR 等参数，没有稳定收益。剩余差距更像 NCCL internal alltoall 通信模式效率、交换网络策略，或缺少 NCCL net plugin/SHARP 能力。
 
 同时，`nccl-gpu-2` 的 SSH 入口曾因未认证连接过多触发 `MaxStartups` 随机拒绝，导致 `mpirun` 拉起远端 rank 失败。已经做了临时 SSHD 缓解并拿到有效的 2 节点 x 8 GPU allreduce/alltoall 报告。
 
@@ -38,6 +38,7 @@
 14. 对 8 卡 alltoall 做 NCCL 网络参数 sweep，并将有效项 `NCCL_PXN_DISABLE=1` 固化到 PDF 矩阵配置。
 15. 对 PXN disabled 后的 8 卡 alltoall 抓取 `counters`/`hw_counters` 增量，确认 rail 已均衡且无明显错误/重传。
 16. 对同样 2x8 allreduce 抓 counter 对照，确认高吞吐 allreduce 也会出现 `port_xmit_wait`，因此该 counter 不是 alltoall 低吞吐的唯一根因。
+17. 在 PXN disabled 基线上继续 sweep NVLS、P2P chunk、buffer、channel、QP/split、AR 等参数，确认没有稳定收益，部分参数明显变差。
 
 ## 关键证据
 
@@ -319,6 +320,23 @@ PXN disabled 计数器显示该参数确实修复了 rail 分布：
 
 判断：allreduce 在接近物理上限时也会出现 `port_xmit_wait`，所以 alltoall 的核心问题不能只归因于该 counter。现在更应关注 NCCL alltoall 通信模式、交换网络策略、以及 NCCL net plugin/SHARP 能力差异。
 
+PXN disabled 基线上的二次参数 sweep：
+
+| Case | Avg Bus BW | 结论 |
+|------|------------|------|
+| `NCCL_PXN_DISABLE=1` | `37.0069 GB/s` | 短测基线 |
+| `+ NCCL_NVLS_ENABLE=0` | `37.2217 GB/s` | 小幅波动，不稳定 |
+| `+ NCCL_P2P_NET_CHUNKSIZE=4194304` | `37.2522 GB/s` | 小幅波动，不稳定 |
+| `+ NCCL_BUFFSIZE=8388608` | `37.0911 GB/s` | 无实质改善 |
+| `+ NCCL_MIN_NCHANNELS=16 NCCL_MAX_NCHANNELS=16` | `37.0189 GB/s` | 无实质改善 |
+| `+ NCCL_IB_AR_THRESHOLD=0` | `37.0843 GB/s` | 无实质改善 |
+| `+ NCCL_IB_QPS_PER_CONNECTION=4 NCCL_IB_SPLIT_DATA_ON_QPS=0` | `35.9847 GB/s` | 变差 |
+| `+ NCCL_IB_QPS_PER_CONNECTION=4 NCCL_IB_SPLIT_DATA_ON_QPS=1` | `29.8406 GB/s` | 明显变差 |
+| `+ NCCL_IB_QPS_PER_CONNECTION=8 NCCL_IB_SPLIT_DATA_ON_QPS=1` | `24.1183 GB/s` | 明显变差 |
+| `+ NCCL_NCHANNELS_PER_NET_PEER=8` | `29.8904 GB/s` | 明显变差 |
+
+长测复核没有复现 `NCCL_NVLS_ENABLE=0` / `NCCL_P2P_NET_CHUNKSIZE=4194304` 的短测小涨：同一环境确认仍为 NCCL `2.27.7+cuda12.4`、4 条 400G HCA、GDR enabled、internal IB plugin，但 baseline 窗口下滑到 `32.7280 GB/s`，`NCCL_P2P_NET_CHUNKSIZE=4194304` 为 `31.9340 GB/s`，`NCCL_NVLS_ENABLE=0` + `NCCL_P2P_NET_CHUNKSIZE=4194304` 为 `27.6585 GB/s`。因此这些参数不应固化到正式配置。
+
 ### 8. 8 卡链路计数器与物理上限判断
 
 计数器探测报告：`reports_multinode_nccl_counter_probe_20260523.md`
@@ -421,6 +439,7 @@ libnccl-dev
 - `NCCL_PXN_DISABLE=1` 可将 8 卡 alltoall 提升到约 `36.7 GB/s`，并修复 rail 分布不均，但仍不到 PDF 参考值的一半
 - PXN disabled 复测没有看到 discard、链路错误、RoCE 重传、slow restart、packet sequence error 等错误类 counter 增长
 - allreduce 对照同样出现 `port_xmit_wait` 但能跑到 `354.366 GB/s`，说明 `port_xmit_wait` 不是 alltoall 低吞吐的唯一根因
+- PXN disabled 基线上继续叠加 NVLS、P2P chunk、buffer、channel、QP/split、AR 等参数没有稳定收益；QP/split 和 `NCCL_NCHANNELS_PER_NET_PEER=8` 明显变差
 
 ### 阻塞 3：`nccl-gpu-2` SSH 存在外部连接压力
 
@@ -443,7 +462,7 @@ libnccl-dev
 6. 尝试安装或启用匹配当前 OFED/driver 的 NCCL net plugin/SHARP；当前日志显示 `Could not find: libnccl-net.so`，NCCL 使用的是 internal IB plugin。
 7. 核对跨 Leaf 链路的 rail mapping、交换机端口速率、路由、credit/拥塞等待与交换机侧队列计数；同时用 allreduce 对照避免把 `port_xmit_wait` 误判为 alltoall 独有根因。
 8. 确认当前 PDF 的 `491.84/76.54 GB/s` 是否要求当前这两台节点在只有 4 条 400G rail 的形态下也达到；如果要求一致，需要网络/硬件侧继续介入。
-9. 对 8 卡 alltoall，重点查 SHARP/NCCL net plugin、NCCL internal alltoall 行为、交换机 ECMP/自适应路由和拥塞/credit 等待；`NCCL_IB_HCA` 顺序与 rail 分布本身已经不是当前主问题。
+9. 8 卡 alltoall 当前不建议继续盲调 NCCL 环境变量；重点查 SHARP/NCCL net plugin、NCCL internal alltoall 行为、交换机 ECMP/自适应路由和拥塞/credit 等待；`NCCL_IB_HCA` 顺序与 rail 分布本身已经不是当前主问题。
 
 ## 当前可交付物
 
-- 
2.47.2


From 24a7bd5c1b399161ca12dde3fb67e38f674a9a24 Mon Sep 17 00:00:00 2001
From: cs <shi.chen@robotics.cc>
Date: Sat, 23 May 2026 17:32:03 +0800
Subject: [PATCH 16/41] Document NCCL graph comparison

---
 ...multinode_nccl_alltoall_tuning_20260523.md | 27 +++++++++++++++++++
 reports_multinode_nccl_diagnosis_20260523.md  | 19 ++++++++++++-
 2 files changed, 45 insertions(+), 1 deletion(-)

diff --git a/reports_multinode_nccl_alltoall_tuning_20260523.md b/reports_multinode_nccl_alltoall_tuning_20260523.md
index d9711ff..dcf75c4 100644
--- a/reports_multinode_nccl_alltoall_tuning_20260523.md
+++ b/reports_multinode_nccl_alltoall_tuning_20260523.md
@@ -57,6 +57,33 @@
 
 补充 ENV/INIT/NET 日志确认，性能波动时仍是 NCCL `2.27.7+cuda12.4`、4 条 400G HCA、GDR enabled、internal IB plugin；不是退回旧 NCCL、HCA 选择错误或 GDR 失效。
 
+## NCCL GRAPH/TUNING 对照
+
+为避免只看带宽结果，补抓了 allreduce 与 PXN disabled alltoall 的 `NCCL_DEBUG_SUBSYS=INIT,NET,GRAPH,TUNING,COLL` 日志。该日志采样使用短迭代，只用于看 NCCL 图和通道选择，不作为性能结论。
+
+共同点：
+
+| 观察项 | allreduce | alltoall + `NCCL_PXN_DISABLE=1` |
+|--------|-----------|----------------------------------|
+| NCCL version | `2.27.7+cuda12.4` | `2.27.7+cuda12.4` |
+| HCA | `mlx5_0,mlx5_1,mlx5_6,mlx5_7` | `mlx5_0,mlx5_1,mlx5_6,mlx5_7` |
+| GDR | enabled | enabled |
+| external net plugin | missing, internal IB | missing, internal IB |
+| channels | `16 coll / 16 nvls / 16 p2p` | `16 coll / 16 nvls / 16 p2p` |
+| p2p channels per peer | `2` | `2` |
+| P2P chunk | `131072` | `131072` |
+
+差异：
+
+| 观察项 | allreduce | alltoall + `NCCL_PXN_DISABLE=1` |
+|--------|-----------|----------------------------------|
+| Pattern 4 | `crossNic 0`, `type NVL/PXN`, `nChannels 8` | `crossNic 2`, `type NVL/PIX`, `nChannels 8` |
+| `NET/IB/*/GDRDMA` channel edge lines | `256` | `512` |
+| `P2P/CUMEM` channel edge lines | `0` | `224` |
+| total NET/P2P channel edge lines | `256` | `736` |
+
+判断：PXN disabled 后 4 条 IB/GDRDMA rail 都仍被使用，且通道数没有少；但 alltoall 的 NCCL graph 明显更复杂，并混入大量本机 `P2P/CUMEM` 路径。这个结果进一步支持：剩余差距不是 HCA/GDR 基础环境没有生效，而是 alltoall collective graph、P2P/NET 组合方式、internal IB plugin 能力或交换网络策略的问题。
+
 ## PXN disabled 端口计数器
 
 `NCCL_PXN_DISABLE=1` 后，8 卡 alltoall 输出：
diff --git a/reports_multinode_nccl_diagnosis_20260523.md b/reports_multinode_nccl_diagnosis_20260523.md
index 5acbd5e..6e769b5 100644
--- a/reports_multinode_nccl_diagnosis_20260523.md
+++ b/reports_multinode_nccl_diagnosis_20260523.md
@@ -16,7 +16,7 @@
 
 按 `sx算力节点跨Leaf NCCL测试报告.pdf` 的矩阵继续对齐后，发现 2 机 4 卡档位的核心问题是默认 GPU 选择不符合 GPU-NIC 亲和性。显式选择 `CUDA_VISIBLE_DEVICES=0,1,4,5` 后，2 机 4 卡 allreduce 可以恢复到 `333-335 GB/s` 区间，接近 PDF 的 `335.48 GB/s`；alltoall 配合 PDF 固定 NCCL 参数可到 `72.93 GB/s`，接近 PDF 的 `73.73 GB/s`。但 2 机 8 卡档位仍只有 allreduce `354.02 GB/s`、alltoall `30.04 GB/s`，与 PDF 的 `491.84/76.54 GB/s` 差距明显。
 
-进一步 sweep 8 卡 alltoall 网络参数后，`NCCL_PXN_DISABLE=1` 是唯一有效正向项。正式矩阵配置已对 2 机 8 GPU 的 alltoall 单独加入该变量，8 卡 alltoall 从约 `30.04 GB/s` 提升到 `36.70 GB/s` peak / `36.74 GB/s` avg，但仍低于 PDF 参考 `76.54 GB/s`。复测端口 counter 后，PXN disabled 下 4 条 rail 的流量已均衡，且没有明显链路错误、丢包、RoCE 重传或 slow restart；同类 `port_xmit_wait` 在高吞吐 allreduce 中也会出现，因此它不是 alltoall 低吞吐的充分解释。继续在 PXN disabled 基线上叠加 NVLS、P2P chunk、buffer、channel、QP/split、AR 等参数，没有稳定收益。剩余差距更像 NCCL internal alltoall 通信模式效率、交换网络策略，或缺少 NCCL net plugin/SHARP 能力。
+进一步 sweep 8 卡 alltoall 网络参数后，`NCCL_PXN_DISABLE=1` 是唯一有效正向项。正式矩阵配置已对 2 机 8 GPU 的 alltoall 单独加入该变量，8 卡 alltoall 从约 `30.04 GB/s` 提升到 `36.70 GB/s` peak / `36.74 GB/s` avg，但仍低于 PDF 参考 `76.54 GB/s`。复测端口 counter 后，PXN disabled 下 4 条 rail 的流量已均衡，且没有明显链路错误、丢包、RoCE 重传或 slow restart；同类 `port_xmit_wait` 在高吞吐 allreduce 中也会出现，因此它不是 alltoall 低吞吐的充分解释。继续在 PXN disabled 基线上叠加 NVLS、P2P chunk、buffer、channel、QP/split、AR 等参数，没有稳定收益。NCCL GRAPH/TUNING 日志显示 alltoall 的 channel graph 比 allreduce 复杂很多，且混入大量本机 `P2P/CUMEM` 路径，但 HCA/GDR/channel 基础状态一致。剩余差距更像 NCCL internal alltoall 通信模式效率、交换网络策略，或缺少 NCCL net plugin/SHARP 能力。
 
 同时，`nccl-gpu-2` 的 SSH 入口曾因未认证连接过多触发 `MaxStartups` 随机拒绝，导致 `mpirun` 拉起远端 rank 失败。已经做了临时 SSHD 缓解并拿到有效的 2 节点 x 8 GPU allreduce/alltoall 报告。
 
@@ -39,6 +39,7 @@
 15. 对 PXN disabled 后的 8 卡 alltoall 抓取 `counters`/`hw_counters` 增量，确认 rail 已均衡且无明显错误/重传。
 16. 对同样 2x8 allreduce 抓 counter 对照，确认高吞吐 allreduce 也会出现 `port_xmit_wait`，因此该 counter 不是 alltoall 低吞吐的唯一根因。
 17. 在 PXN disabled 基线上继续 sweep NVLS、P2P chunk、buffer、channel、QP/split、AR 等参数，确认没有稳定收益，部分参数明显变差。
+18. 抓取 allreduce 与 PXN disabled alltoall 的 `GRAPH/TUNING/COLL` 日志，确认两者 HCA/GDR/channel 基础状态一致，但 alltoall graph 明显更复杂。
 
 ## 关键证据
 
@@ -337,6 +338,21 @@ PXN disabled 基线上的二次参数 sweep：
 
 长测复核没有复现 `NVLS/P2P chunk` 的短测小涨：同一环境确认仍为 NCCL `2.27.7+cuda12.4`、4 条 400G HCA、GDR enabled、internal IB plugin，但 baseline 窗口下滑到 `32.7280 GB/s`，`P2P_NET_CHUNKSIZE=4M` 为 `31.9340 GB/s`，`NVLS_ENABLE=0 + P2P_NET_CHUNKSIZE=4M` 为 `27.6585 GB/s`。因此这些参数不应固化到正式配置。
 
+`GRAPH/TUNING/COLL` 日志对照：
+
+| 观察项 | allreduce | alltoall + `NCCL_PXN_DISABLE=1` |
+|--------|-----------|----------------------------------|
+| NCCL version | `2.27.7+cuda12.4` | `2.27.7+cuda12.4` |
+| HCA / GDR | 4 HCA, GDR enabled | 4 HCA, GDR enabled |
+| external net plugin | missing, internal IB | missing, internal IB |
+| channels | `16 coll / 16 nvls / 16 p2p` | `16 coll / 16 nvls / 16 p2p` |
+| Pattern 4 | `crossNic 0`, `type NVL/PXN`, `nChannels 8` | `crossNic 2`, `type NVL/PIX`, `nChannels 8` |
+| `NET/IB/*/GDRDMA` channel edge lines | `256` | `512` |
+| `P2P/CUMEM` channel edge lines | `0` | `224` |
+| total NET/P2P channel edge lines | `256` | `736` |
+
+判断：PXN disabled 后 4 条 IB/GDRDMA rail 和 16 个 p2p/coll/nvls channels 都仍在；但 alltoall graph 明显比 allreduce 复杂，并包含大量本机 P2P/CUMEM 边。这进一步说明问题不在 HCA/GDR 没生效，而在 alltoall collective graph、P2P/NET 组合方式、internal IB plugin 或交换网络策略。
+
 ### 8. 8 卡链路计数器与物理上限判断
 
 计数器探测报告：`reports_multinode_nccl_counter_probe_20260523.md`
@@ -440,6 +456,7 @@ libnccl-dev
 - PXN disabled 复测没有看到 discard、链路错误、RoCE 重传、slow restart、packet sequence error 等错误类 counter 增长
 - allreduce 对照同样出现 `port_xmit_wait` 但能跑到 `354.366 GB/s`，说明 `port_xmit_wait` 不是 alltoall 低吞吐的唯一根因
 - PXN disabled 基线上继续叠加 NVLS、P2P chunk、buffer、channel、QP/split、AR 等参数没有稳定收益；QP/split 和 `NCCL_NCHANNELS_PER_NET_PEER=8` 明显变差
+- NCCL GRAPH/TUNING 对照显示 alltoall 与 allreduce 的 HCA/GDR/channel 基础状态一致，但 alltoall channel edge 更多，并混入大量 `P2P/CUMEM` 本地路径
 
 ### 阻塞 3：`nccl-gpu-2` SSH 存在外部连接压力
 
-- 
2.47.2


From b55666948c4e0bb8a510caadbc20a045630a49e4 Mon Sep 17 00:00:00 2001
From: cs <shi.chen@robotics.cc>
Date: Sat, 23 May 2026 17:37:19 +0800
Subject: [PATCH 17/41] Add multinode NCCL deep diagnosis tools

---
 README.md                                    |  11 +
 docs/multinode_nccl_deep_diagnose_runbook.md | 201 +++++++++
 scripts/multinode_nccl_deep_diagnose.sh      | 425 +++++++++++++++++++
 3 files changed, 637 insertions(+)
 create mode 100644 docs/multinode_nccl_deep_diagnose_runbook.md
 create mode 100755 scripts/multinode_nccl_deep_diagnose.sh

diff --git a/README.md b/README.md
index eed4791..fd890d4 100644
--- a/README.md
+++ b/README.md
@@ -575,6 +575,17 @@ report:
 └── 确认: 训练 loss 正常下降
 ```
 
+#### 多节点 NCCL 深度诊断
+
+当 SOP-3 的多节点 NCCL 结果与验收 PDF 不一致时，可以在发起节点运行深度诊断脚本，复现 counter 抓取、GRAPH/TUNING 日志和 PXN disabled sweep：
+
+```bash
+bash scripts/multinode_nccl_deep_diagnose.sh preflight
+bash scripts/multinode_nccl_deep_diagnose.sh all
+```
+
+详细参数、输出目录和解读方法见 [docs/multinode_nccl_deep_diagnose_runbook.md](docs/multinode_nccl_deep_diagnose_runbook.md)。
+
 ---
 
 ### SOP-4: 故障诊断
diff --git a/docs/multinode_nccl_deep_diagnose_runbook.md b/docs/multinode_nccl_deep_diagnose_runbook.md
new file mode 100644
index 0000000..11a0629
--- /dev/null
+++ b/docs/multinode_nccl_deep_diagnose_runbook.md
@@ -0,0 +1,201 @@
+# 多机 NCCL 深度诊断 runbook
+
+本文档用于复现 2026-05-23 这轮 2 机 8 卡 NCCL 排查里的关键动作：counter 抓取、GRAPH/TUNING 日志、以及 PXN disabled 基线上的二次参数 sweep。
+
+## 适用场景
+
+当前默认参数面向：
+
+- `aikubeworker0012` / `172.72.8.12`
+- `aikubeworker0016` / `172.72.8.16`
+- 每节点 8 GPU
+- 每节点 4 条 400G HCA：`mlx5_0,mlx5_1,mlx5_6,mlx5_7`
+- NCCL 临时运行库：`/tmp/nccl-2.27.7-cuda12.4`
+- nccl-tests：`/data/nccl-tests-latest/build`
+- OpenMPI：`/usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun`
+
+脚本应在 coordinator 节点上执行，当前即 `aikubeworker0012`。
+
+## 快速运行
+
+```bash
+cd /root/test_gpu_scripts
+bash scripts/multinode_nccl_deep_diagnose.sh preflight
+bash scripts/multinode_nccl_deep_diagnose.sh all
+```
+
+默认输出目录为：
+
+```text
+/tmp/nccl_deep_diagnose_YYYYMMDD_HHMMSS
+```
+
+只跑单项：
+
+```bash
+# 轻量检查 SSH、mpirun、nccl-tests 和 HCA 路径
+bash scripts/multinode_nccl_deep_diagnose.sh preflight
+
+# allreduce counter 对照
+bash scripts/multinode_nccl_deep_diagnose.sh allreduce-counter
+
+# PXN disabled alltoall counter
+bash scripts/multinode_nccl_deep_diagnose.sh alltoall-counter
+
+# NCCL GRAPH/TUNING/COLL 对照
+bash scripts/multinode_nccl_deep_diagnose.sh graph
+
+# PXN disabled 基线上的二次参数 sweep
+bash scripts/multinode_nccl_deep_diagnose.sh pxn-sweep
+```
+
+## 常用参数覆盖
+
+```bash
+OUT_DIR=/tmp/my_nccl_diag \
+HOSTS=172.72.8.12:8,172.72.8.16:8 \
+PEER_HOST=172.72.8.16 \
+HCAS="mlx5_0 mlx5_1 mlx5_6 mlx5_7" \
+HCA_CSV=mlx5_0,mlx5_1,mlx5_6,mlx5_7 \
+bash scripts/multinode_nccl_deep_diagnose.sh all
+```
+
+如果 nccl-tests 或 NCCL 运行库路径变化：
+
+```bash
+NCCL_TESTS_DIR=/opt/gpu-test-tools/nccl-tests/build \
+NCCL_LD_LIBRARY_PATH=/usr/mpi/gcc/openmpi-4.1.9a1/lib:/path/to/nccl/lib:/usr/local/cuda/lib64 \
+bash scripts/multinode_nccl_deep_diagnose.sh graph
+```
+
+## 输出解读
+
+### preflight 模式
+
+典型输出文件：
+
+```text
+preflight.txt
+```
+
+该模式不跑 NCCL workload，只检查：
+
+- 本机和对端主机名。
+- OpenMPI `mpirun` 是否存在且可执行。
+- `all_reduce_perf` / `alltoall_perf` 是否存在且可执行。
+- 配置的 HCA 是否能在 `/sys/class/infiniband/<hca>/ports/1` 下读到 state/rate。
+- 发起节点到 `PEER_HOST` 的 root SSH 是否可用。
+
+如果这里出现 `MISSING`，先修环境；否则再跑 `all` 或单项诊断。
+
+### counter 模式
+
+典型输出文件：
+
+```text
+allreduce_counter/
+  allreduce.log
+  before.local
+  before.remote
+  after.local
+  after.remote
+  counter_delta.txt
+
+alltoall_pxn_counter/
+  alltoall_pxn.log
+  before.local
+  before.remote
+  after.local
+  after.remote
+  counter_delta.txt
+```
+
+重点看 `counter_delta.txt`：
+
+- `port_xmit_data` / `port_rcv_data`：端口流量，单位为 4-byte words，脚本同时换算 GiB。
+- `port_xmit_wait`：发送等待或 credit/拥塞等待信号。注意它不是 alltoall 独有根因，因为高吞吐 allreduce 也会出现。
+- `port_xmit_discards`、`port_rcv_errors`、`symbol_error`、`roce_adp_retrans`、`packet_seq_err` 等：错误、丢包、重传、链路异常类信号。
+
+当前已知基线：
+
+- allreduce 可到约 `354 GB/s busbw`，4 条 rail 均衡。
+- PXN disabled alltoall 通常在 `36-37 GB/s busbw` 附近，但有窗口波动。
+- alltoall PXN disabled 后 rail 均衡，且没有明显 error/retrans/slow restart。
+
+### graph 模式
+
+典型输出文件：
+
+```text
+graph/
+  allreduce.log
+  allreduce_summary.txt
+  alltoall_pxn.log
+  alltoall_pxn_summary.txt
+```
+
+重点看：
+
+- `nccl_version`
+- `plugin_missing`
+- `gdr_enabled_lines`
+- `pattern_counts`
+- `channel_summary`
+- `NET/IB/*/GDRDMA`
+- `P2P/CUMEM`
+- `channel_edge_lines`
+
+当前已知对照：
+
+| 观察项 | allreduce | alltoall + `NCCL_PXN_DISABLE=1` |
+|--------|-----------|----------------------------------|
+| HCA / GDR | 4 HCA, GDR enabled | 4 HCA, GDR enabled |
+| channels | `16 coll / 16 nvls / 16 p2p` | `16 coll / 16 nvls / 16 p2p` |
+| `NET/IB/*/GDRDMA` channel edge lines | `256` | `512` |
+| `P2P/CUMEM` channel edge lines | `0` | `224` |
+| total NET/P2P channel edge lines | `256` | `736` |
+
+判断边界：
+
+- 如果 HCA/GDR/channel 基础状态一致，但 alltoall graph 明显更复杂，问题更偏向 NCCL collective graph、P2P/NET 组合方式、internal IB plugin 或交换网络策略。
+- 如果 GDR disabled、HCA 不完整、plugin 路径变化，则不能直接与当前报告结论对比。
+
+### pxn-sweep 模式
+
+典型输出：
+
+```text
+pxn_sweep/
+  baseline.log
+  nvls_off.log
+  qps4_split1.log
+  qps8_split1.log
+  qps4_split0.log
+  channels16.log
+  buff8m.log
+  p2pchunk4m.log
+  netpeer8.log
+  ar0.log
+  summary.txt
+```
+
+当前结论：
+
+- `NCCL_PXN_DISABLE=1` 是已发现的唯一稳定正向项。
+- 在 PXN disabled 基线上继续叠加 NVLS、P2P chunk、buffer、channel、QP/split、AR，没有稳定收益。
+- QP/split 和 `NCCL_NCHANNELS_PER_NET_PEER=8` 在当前环境下明显变差。
+
+## 交接给网络/NCCL 环境侧的重点
+
+1. 当前不是旧 NCCL/GDR disabled 问题：NCCL `2.27.7` 下 4 条 HCA 都是 GDR enabled。
+2. 当前不是 rail 完全打偏问题：`NCCL_PXN_DISABLE=1` 后 alltoall 的 4 条 rail 已均衡。
+3. 当前不是明显坏链路/重传问题：未看到 discard、symbol error、RoCE retrans、slow restart、packet sequence error 等增长。
+4. allreduce 已接近当前 4 x 400G rail 的物理可用带宽；PDF 8 卡 allreduce 目标反推需要超过当前 4 rail 单向理论带宽。
+5. alltoall 剩余差距更像 NCCL internal alltoall graph、P2P/NET 组合方式、缺少 NCCL net plugin/SHARP，或交换网络策略/ECMP/拥塞控制问题。
+
+## 关联报告
+
+- `reports_multinode_nccl_diagnosis_20260523.md`
+- `reports_multinode_nccl_alltoall_tuning_20260523.md`
+- `reports_multinode_nccl_counter_probe_20260523.md`
+- `reports_multinode_nccl_pdf_matrix_nccl227.md`
diff --git a/scripts/multinode_nccl_deep_diagnose.sh b/scripts/multinode_nccl_deep_diagnose.sh
new file mode 100755
index 0000000..b16409c
--- /dev/null
+++ b/scripts/multinode_nccl_deep_diagnose.sh
@@ -0,0 +1,425 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Deep-diagnose multi-node NCCL behavior from the coordinator node.
+# Default values match the current 2-node H100 cross-leaf investigation.
+
+MODE="${1:-all}"
+
+MPI_BIN="${MPI_BIN:-/usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun}"
+NCCL_TESTS_DIR="${NCCL_TESTS_DIR:-/data/nccl-tests-latest/build}"
+HOSTS="${HOSTS:-172.72.8.12:8,172.72.8.16:8}"
+PEER_HOST="${PEER_HOST:-172.72.8.16}"
+SSH_USER="${SSH_USER:-root}"
+HCAS="${HCAS:-mlx5_0 mlx5_1 mlx5_6 mlx5_7}"
+HCA_CSV="${HCA_CSV:-mlx5_0,mlx5_1,mlx5_6,mlx5_7}"
+OUT_DIR="${OUT_DIR:-/tmp/nccl_deep_diagnose_$(date +%Y%m%d_%H%M%S)}"
+
+BEGIN_SIZE="${BEGIN_SIZE:-16G}"
+END_SIZE="${END_SIZE:-16G}"
+WARMUP_ITERS="${WARMUP_ITERS:-10}"
+ITERS="${ITERS:-10}"
+GRAPH_WARMUP_ITERS="${GRAPH_WARMUP_ITERS:-1}"
+GRAPH_ITERS="${GRAPH_ITERS:-1}"
+SWEEP_WARMUP_ITERS="${SWEEP_WARMUP_ITERS:-3}"
+SWEEP_ITERS="${SWEEP_ITERS:-5}"
+
+NCCL_LD_LIBRARY_PATH="${NCCL_LD_LIBRARY_PATH:-/usr/mpi/gcc/openmpi-4.1.9a1/lib:/tmp/nccl-2.27.7-cuda12.4/usr/lib/x86_64-linux-gnu:/usr/local/cuda-12.4/targets/x86_64-linux/lib}"
+DEFAULT_NCCL_DEBUG="${NCCL_DEBUG:-WARN}"
+
+COUNTERS="${COUNTERS:-port_xmit_data port_rcv_data port_xmit_packets port_rcv_packets port_xmit_wait port_xmit_discards port_rcv_errors port_rcv_remote_physical_errors port_rcv_switch_relay_errors port_xmit_constraint_errors port_rcv_constraint_errors symbol_error link_error_recovery link_downed local_link_integrity_errors excessive_buffer_overrun_errors VL15_dropped}"
+HW_COUNTERS="${HW_COUNTERS:-roce_adp_retrans roce_adp_retrans_to roce_slow_restart roce_slow_restart_cnps roce_slow_restart_trans packet_seq_err out_of_sequence out_of_buffer duplicate_request implied_nak_seq_err local_ack_timeout_err req_transport_retries_exceeded rnr_nak_retry_err rx_write_requests rx_read_requests}"
+
+mkdir -p "$OUT_DIR"
+
+mpi_base=(
+  "$MPI_BIN"
+  --allow-run-as-root
+  --mca btl_openib_warn_no_device_params_found 0
+  --mca btl_tcp_if_include bond0
+  --mca oob_tcp_if_include bond0
+  --mca plm_rsh_args "-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o BatchMode=yes -o ConnectTimeout=10"
+  -H "$HOSTS"
+  --map-by ppr:8:node
+  -np 16
+)
+
+base_exports=(
+  LD_LIBRARY_PATH
+  NCCL_IB_GID_INDEX NCCL_IB_SL NCCL_IB_TC NCCL_SOCKET_IFNAME
+  NCCL_DEBUG NCCL_DEBUG_SUBSYS NCCL_IB_TIMEOUT NCCL_IB_HCA
+  NCCL_NET_PLUGIN NCCL_NVLS_ENABLE NCCL_NET_GDR_LEVEL NCCL_NET_GDR_READ
+  NCCL_DMABUF_ENABLE NCCL_PXN_DISABLE NCCL_IB_QPS_PER_CONNECTION
+  NCCL_IB_SPLIT_DATA_ON_QPS NCCL_MIN_NCHANNELS NCCL_MAX_NCHANNELS
+  NCCL_BUFFSIZE NCCL_P2P_NET_CHUNKSIZE NCCL_NCHANNELS_PER_NET_PEER
+  NCCL_IB_AR_THRESHOLD
+)
+
+set_common_env() {
+  # Reset the NCCL environment to the shared baseline before every test case.
+  unset NCCL_DEBUG_SUBSYS NCCL_PXN_DISABLE NCCL_IB_QPS_PER_CONNECTION \
+    NCCL_IB_SPLIT_DATA_ON_QPS NCCL_MIN_NCHANNELS NCCL_MAX_NCHANNELS \
+    NCCL_BUFFSIZE NCCL_P2P_NET_CHUNKSIZE NCCL_NCHANNELS_PER_NET_PEER NCCL_IB_AR_THRESHOLD
+  : "${BASE_NCCL_NVLS_ENABLE:=${NCCL_NVLS_ENABLE:-1}}"  # latch caller value once; a case may export its own
+  export LD_LIBRARY_PATH="$NCCL_LD_LIBRARY_PATH"
+  export NCCL_IB_GID_INDEX="${NCCL_IB_GID_INDEX:-3}"
+  export NCCL_IB_SL="${NCCL_IB_SL:-5}"
+  export NCCL_IB_TC="${NCCL_IB_TC:-136}"
+  export NCCL_SOCKET_IFNAME="${NCCL_SOCKET_IFNAME:-bond0}"
+  export NCCL_DEBUG="$DEFAULT_NCCL_DEBUG"
+  export NCCL_IB_TIMEOUT="${NCCL_IB_TIMEOUT:-22}"
+  export NCCL_IB_HCA="$HCA_CSV"
+  export NCCL_NET_PLUGIN="${NCCL_NET_PLUGIN:-none}"
+  export NCCL_NVLS_ENABLE="$BASE_NCCL_NVLS_ENABLE"
+  export NCCL_NET_GDR_LEVEL="${NCCL_NET_GDR_LEVEL:-5}"
+  export NCCL_NET_GDR_READ="${NCCL_NET_GDR_READ:-1}"
+  export NCCL_DMABUF_ENABLE="${NCCL_DMABUF_ENABLE:-0}"
+}
+
+mpi_xargs() {
+  for name in "${base_exports[@]}"; do
+    if [[ -n "${!name+x}" ]]; then
+      printf -- '-x\n%s\n' "$name"
+    fi
+  done
+}
+
+run_nccl() {
+  local op="$1"
+  local bin="$2"
+  local log="$3"
+  local warmup="$4"
+  local iters="$5"
+  mapfile -t xargs < <(mpi_xargs)
+  "${mpi_base[@]}" "${xargs[@]}" \
+    "$bin" -b "$BEGIN_SIZE" -e "$END_SIZE" -g 1 -f 2 -w "$warmup" -n "$iters" \
+    >"$log" 2>&1
+  awk -v op="$op" '/Avg bus bandwidth/ {print op, $0}' "$log"
+}
+
+read_one_snapshot() {
+  local host_label="$1"
+  local out="$2"
+  : >"$out"
+  for hca in $HCAS; do
+    for c in $COUNTERS; do
+      local f="/sys/class/infiniband/$hca/ports/1/counters/$c"
+      if [[ -r "$f" ]]; then
+        printf '%s %s counters %s %s\n' "$host_label" "$hca" "$c" "$(cat "$f" 2>/dev/null || echo 0)" >>"$out"
+      fi
+    done
+    for c in $HW_COUNTERS; do
+      local f="/sys/class/infiniband/$hca/ports/1/hw_counters/$c"
+      if [[ -r "$f" ]]; then
+        printf '%s %s hw_counters %s %s\n' "$host_label" "$hca" "$c" "$(cat "$f" 2>/dev/null || echo 0)" >>"$out"
+      fi
+    done
+  done
+}
+
+read_remote_snapshot() {
+  local out="$1"
+  ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
+      -o BatchMode=yes -o ConnectTimeout=5 "${SSH_USER}@${PEER_HOST}" \
+      "HCAS='$HCAS' COUNTERS='$COUNTERS' HW_COUNTERS='$HW_COUNTERS' bash -s" <<'EOS' >"$out"
+for hca in $HCAS; do
+  for c in $COUNTERS; do
+    f="/sys/class/infiniband/$hca/ports/1/counters/$c"
+    if [ -r "$f" ]; then
+      printf '%s %s counters %s %s\n' "$HOSTNAME" "$hca" "$c" "$(cat "$f" 2>/dev/null || echo 0)"
+    fi
+  done
+  for c in $HW_COUNTERS; do
+    f="/sys/class/infiniband/$hca/ports/1/hw_counters/$c"
+    if [ -r "$f" ]; then
+      printf '%s %s hw_counters %s %s\n' "$HOSTNAME" "$hca" "$c" "$(cat "$f" 2>/dev/null || echo 0)"
+    fi
+  done
+done
+EOS
+}
+
+summarize_counter_delta() {
+  local before_a="$1"
+  local before_b="$2"
+  local after_a="$3"
+  local after_b="$4"
+  local out="$5"
+  python3 - "$before_a" "$before_b" "$after_a" "$after_b" >"$out" <<'PY'
+import pathlib
+import sys
+
+interesting = {
+    "port_xmit_wait", "port_xmit_discards", "port_rcv_errors",
+    "port_rcv_remote_physical_errors", "port_rcv_switch_relay_errors",
+    "port_xmit_constraint_errors", "port_rcv_constraint_errors",
+    "symbol_error", "link_error_recovery", "link_downed",
+    "local_link_integrity_errors", "excessive_buffer_overrun_errors",
+    "VL15_dropped", "roce_adp_retrans", "roce_adp_retrans_to",
+    "roce_slow_restart", "roce_slow_restart_cnps", "roce_slow_restart_trans",
+    "packet_seq_err", "out_of_sequence", "out_of_buffer",
+    "duplicate_request", "implied_nak_seq_err", "local_ack_timeout_err",
+    "req_transport_retries_exceeded", "rnr_nak_retry_err",
+}
+
+def load(path):
+    data = {}
+    for line in pathlib.Path(path).read_text().splitlines():
+        parts = line.split()
+        if len(parts) != 5:
+            continue
+        host, hca, kind, counter, value = parts
+        try:
+            data[(host, hca, kind, counter)] = int(value)
+        except ValueError:
+            pass
+    return data
+
+before = {}
+after = {}
+before.update(load(sys.argv[1]))
+before.update(load(sys.argv[2]))
+after.update(load(sys.argv[3]))
+after.update(load(sys.argv[4]))
+
+print("NONZERO_DELTAS")
+for key in sorted(set(before) | set(after)):
+    delta = after.get(key, 0) - before.get(key, 0)
+    if not delta:
+        continue
+    host, hca, kind, counter = key
+    if counter in {"port_xmit_data", "port_rcv_data"}:
+        gib = delta * 4 / (1024 ** 3)
+        print(f"{host} {hca} {kind} {counter} {delta} words4B {gib:.2f} GiB")
+    else:
+        print(f"{host} {hca} {kind} {counter} {delta}")
+
+print("ERROR_OR_CONGESTION_DELTAS")
+seen = False
+for key in sorted(set(before) | set(after)):
+    delta = after.get(key, 0) - before.get(key, 0)
+    if delta and key[3] in interesting:
+        seen = True
+        print(*key, delta)
+if not seen:
+    print("none")
+PY
+}
+
+run_counter_case() {
+  local op="$1"
+  local bin="$2"
+  local extra="${3:-}"
+  set_common_env
+  if [[ -n "$extra" ]]; then
+    eval "export $extra"
+  fi
+  local dir="$OUT_DIR/${op}_counter"
+  mkdir -p "$dir"
+  read_one_snapshot "$(hostname)" "$dir/before.local"
+  read_remote_snapshot "$dir/before.remote"
+  run_nccl "$op" "$bin" "$dir/${op}.log" "$WARMUP_ITERS" "$ITERS"
+  read_one_snapshot "$(hostname)" "$dir/after.local"
+  read_remote_snapshot "$dir/after.remote"
+  summarize_counter_delta "$dir/before.local" "$dir/before.remote" "$dir/after.local" "$dir/after.remote" "$dir/counter_delta.txt"
+  echo "$dir"
+}
+
+summarize_graph_log() {
+  local log="$1"
+  local out="$2"
+  python3 - "$log" >"$out" <<'PY'
+from pathlib import Path
+import collections
+import re
+import sys
+
+text = Path(sys.argv[1]).read_text(errors="ignore")
+print("avg_busbw", (re.findall(r"Avg bus bandwidth\s*:\s*([0-9.]+)", text) or ["NA"])[-1])
+print("nccl_version", sorted(set(re.findall(r"NCCL version ([^\s]+)", text))))
+print("plugin_missing", len(re.findall(r"Could not find: none libnccl-net-none\.so", text)))
+print("gdr_enabled_lines", len(re.findall(r"GPU Direct RDMA Enabled", text)))
+print("using_hca")
+for value, count in collections.Counter(re.findall(r"NET/IB : Using \[(.*?)\]; OOB", text)).most_common(4):
+    print(f"  {count} {value}")
+print("pattern_counts")
+patterns = re.findall(
+    r"Pattern (\d+), crossNic (\d+), nChannels (\d+), bw ([0-9.]+)/([0-9.]+), type ([^,]+), sameChannels (\d+)",
+    text,
+)
+for key, count in collections.Counter(patterns).most_common():
+    print(f"  {count} {key}")
+print("channel_summary")
+for value, count in collections.Counter(
+    re.findall(r"(\d+ coll channels, \d+ collnet channels, \d+ nvls channels, \d+ p2p channels, \d+ p2p channels per peer)", text)
+).most_common():
+    print(f"  {count} {value}")
+print("p2p_chunks", collections.Counter(re.findall(r"P2P Chunksize set to (\d+)", text)))
+print("check_p2p", collections.Counter(re.findall(r"Check P2P Type ([^\n]+)", text)))
+for token in ["NET/IB/0/GDRDMA", "NET/IB/1/GDRDMA", "NET/IB/2/GDRDMA", "NET/IB/3/GDRDMA", "P2P/CUMEM", "P2P/IPC", "SHM"]:
+    print(token, text.count(token))
+print("channel_edge_lines", len([line for line in text.splitlines() if "Channel " in line and ("via NET/IB" in line or "via P2P" in line)]))
+PY
+}
+
+run_graph_case() {
+  local op="$1"
+  local bin="$2"
+  local extra="${3:-}"
+  set_common_env
+  export NCCL_DEBUG=INFO
+  export NCCL_DEBUG_SUBSYS=INIT,NET,GRAPH,TUNING,COLL
+  if [[ -n "$extra" ]]; then
+    eval "export $extra"
+  fi
+  local dir="$OUT_DIR/graph"
+  mkdir -p "$dir"
+  local log="$dir/${op}.log"
+  run_nccl "$op" "$bin" "$log" "$GRAPH_WARMUP_ITERS" "$GRAPH_ITERS"
+  summarize_graph_log "$log" "$dir/${op}_summary.txt"
+  echo "$dir/${op}_summary.txt"
+}
+
+run_pxn_sweep() {
+  local dir="$OUT_DIR/pxn_sweep"
+  mkdir -p "$dir"
+  local cases=(
+    "baseline|"
+    "nvls_off|NCCL_NVLS_ENABLE=0"
+    "qps4_split1|NCCL_IB_QPS_PER_CONNECTION=4 NCCL_IB_SPLIT_DATA_ON_QPS=1"
+    "qps8_split1|NCCL_IB_QPS_PER_CONNECTION=8 NCCL_IB_SPLIT_DATA_ON_QPS=1"
+    "qps4_split0|NCCL_IB_QPS_PER_CONNECTION=4 NCCL_IB_SPLIT_DATA_ON_QPS=0"
+    "channels16|NCCL_MIN_NCHANNELS=16 NCCL_MAX_NCHANNELS=16"
+    "buff8m|NCCL_BUFFSIZE=8388608"
+    "p2pchunk4m|NCCL_P2P_NET_CHUNKSIZE=4194304"
+    "netpeer8|NCCL_NCHANNELS_PER_NET_PEER=8"
+    "ar0|NCCL_IB_AR_THRESHOLD=0"
+  )
+  : >"$dir/summary.txt"
+  for item in "${cases[@]}"; do
+    local name="${item%%|*}"
+    local extra="${item#*|}"
+    set_common_env
+    export NCCL_PXN_DISABLE=1
+    if [[ -n "$extra" ]]; then
+      eval "export $extra"
+    fi
+    local log="$dir/${name}.log"
+    {
+      echo "===== CASE $name ====="
+      echo "extra: ${extra:-none}"
+      run_nccl "alltoall" "$NCCL_TESTS_DIR/alltoall_perf" "$log" "$SWEEP_WARMUP_ITERS" "$SWEEP_ITERS"
+      awk '/Avg bus bandwidth/ {print}' "$log" | tail -1
+    } | tee -a "$dir/summary.txt"
+  done
+  echo "$dir/summary.txt"
+}
+
+run_preflight() {
+  set_common_env
+  local out="$OUT_DIR/preflight.txt"
+  {
+    echo "===== LOCAL ====="
+    echo "hostname: $(hostname)"
+    echo "mpirun: $MPI_BIN"
+    if [[ -x "$MPI_BIN" ]]; then
+      "$MPI_BIN" --version 2>&1 | sed -n '1p'
+    else
+      echo "MISSING executable: $MPI_BIN"
+    fi
+    for bin in "$NCCL_TESTS_DIR/all_reduce_perf" "$NCCL_TESTS_DIR/alltoall_perf"; do
+      if [[ -x "$bin" ]]; then
+        echo "OK executable: $bin"
+      else
+        echo "MISSING executable: $bin"
+      fi
+    done
+    for hca in $HCAS; do
+      local state="/sys/class/infiniband/$hca/ports/1/state"
+      local rate="/sys/class/infiniband/$hca/ports/1/rate"
+      if [[ -r "$state" ]]; then
+        echo "OK HCA: $hca state=$(cat "$state") rate=$(cat "$rate" 2>/dev/null || echo unknown)"
+      else
+        echo "MISSING HCA path: $hca"
+      fi
+    done
+
+    echo "===== REMOTE ====="
+    ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
+        -o BatchMode=yes -o ConnectTimeout=5 "${SSH_USER}@${PEER_HOST}" \
+        "MPI_BIN='$MPI_BIN' NCCL_TESTS_DIR='$NCCL_TESTS_DIR' HCAS='$HCAS' bash -s" <<'EOS'
+echo "hostname: $(hostname)"
+echo "mpirun: $MPI_BIN"
+if [ -x "$MPI_BIN" ]; then
+  "$MPI_BIN" --version 2>&1 | sed -n '1p'
+else
+  echo "MISSING executable: $MPI_BIN"
+fi
+for bin in "$NCCL_TESTS_DIR/all_reduce_perf" "$NCCL_TESTS_DIR/alltoall_perf"; do
+  if [ -x "$bin" ]; then
+    echo "OK executable: $bin"
+  else
+    echo "MISSING executable: $bin"
+  fi
+done
+for hca in $HCAS; do
+  state="/sys/class/infiniband/$hca/ports/1/state"
+  rate="/sys/class/infiniband/$hca/ports/1/rate"
+  if [ -r "$state" ]; then
+    echo "OK HCA: $hca state=$(cat "$state") rate=$(cat "$rate" 2>/dev/null || echo unknown)"
+  else
+    echo "MISSING HCA path: $hca"
+  fi
+done
+EOS
+  } | tee "$out"
+  echo "$out"
+}
+
+usage() {
+  cat <<EOF
+Usage: $0 [preflight|all|allreduce-counter|alltoall-counter|graph|pxn-sweep]
+
+Outputs are written to: $OUT_DIR
+
+Common overrides:
+  HOSTS, PEER_HOST, HCAS, HCA_CSV, MPI_BIN, NCCL_TESTS_DIR,
+  NCCL_LD_LIBRARY_PATH, BEGIN_SIZE, END_SIZE, WARMUP_ITERS, ITERS
+EOF
+}
+
+case "$MODE" in
+  preflight)
+    run_preflight
+    ;;
+  all)
+    run_preflight
+    run_counter_case allreduce "$NCCL_TESTS_DIR/all_reduce_perf" ""
+    run_counter_case alltoall_pxn "$NCCL_TESTS_DIR/alltoall_perf" "NCCL_PXN_DISABLE=1"
+    run_graph_case allreduce "$NCCL_TESTS_DIR/all_reduce_perf" ""
+    run_graph_case alltoall_pxn "$NCCL_TESTS_DIR/alltoall_perf" "NCCL_PXN_DISABLE=1"
+    run_pxn_sweep
+    ;;
+  allreduce-counter)
+    run_counter_case allreduce "$NCCL_TESTS_DIR/all_reduce_perf" ""
+    ;;
+  alltoall-counter)
+    run_counter_case alltoall_pxn "$NCCL_TESTS_DIR/alltoall_perf" "NCCL_PXN_DISABLE=1"
+    ;;
+  graph)
+    run_graph_case allreduce "$NCCL_TESTS_DIR/all_reduce_perf" ""
+    run_graph_case alltoall_pxn "$NCCL_TESTS_DIR/alltoall_perf" "NCCL_PXN_DISABLE=1"
+    ;;
+  pxn-sweep)
+    run_pxn_sweep
+    ;;
+  -h|--help|help)
+    usage
+    ;;
+  *)
+    usage
+    exit 2
+    ;;
+esac
+
+echo "OUT_DIR=$OUT_DIR"
-- 
2.47.2


From c183f5a9d1d5aaad93be9f127716fb6c8ed48178 Mon Sep 17 00:00:00 2001
From: cs <shi.chen@robotics.cc>
Date: Sat, 23 May 2026 18:51:41 +0800
Subject: [PATCH 18/41] Document NCCL deep diagnosis rerun

---
 ...ltinode_nccl_deep_diagnose_run_20260523.md | 125 ++++++++++++++++++
 1 file changed, 125 insertions(+)
 create mode 100644 reports_multinode_nccl_deep_diagnose_run_20260523.md

diff --git a/reports_multinode_nccl_deep_diagnose_run_20260523.md b/reports_multinode_nccl_deep_diagnose_run_20260523.md
new file mode 100644
index 0000000..a96c20d
--- /dev/null
+++ b/reports_multinode_nccl_deep_diagnose_run_20260523.md
@@ -0,0 +1,125 @@
+# 多节点 NCCL 深度诊断复跑报告 2026-05-23
+
+## 执行信息
+
+- 发起节点：`aikubeworker0012`
+- 对端节点：`aikubeworker0016`
+- 测试规模：2 节点 x 8 GPU
+- NCCL：`2.27.7+cuda12.4`
+- nccl-tests：`/data/nccl-tests-latest/build`
+- OpenMPI：`/usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun`
+- 远端产物目录：`/root/test_gpu_scripts/reports/nccl_deep_diag_20260523_103932`
+- 诊断脚本：`scripts/multinode_nccl_deep_diagnose.sh all`
+
+## Preflight
+
+两台机器均通过轻量环境检查：
+
+| 项目 | aikubeworker0012 | aikubeworker0016 |
+|---|---:|---:|
+| OpenMPI | `4.1.9a1` | `4.1.9a1` |
+| `all_reduce_perf` | OK | OK |
+| `alltoall_perf` | OK | OK |
+| `mlx5_0` | 400 Gb/sec ACTIVE | 400 Gb/sec ACTIVE |
+| `mlx5_1` | 400 Gb/sec ACTIVE | 400 Gb/sec ACTIVE |
+| `mlx5_6` | 400 Gb/sec ACTIVE | 400 Gb/sec ACTIVE |
+| `mlx5_7` | 400 Gb/sec ACTIVE | 400 Gb/sec ACTIVE |
+
+## 16G 核心结果
+
+| 测试 | 配置 | Avg Bus BW | 结论 |
+|---|---|---:|---|
+| allreduce | 自动参数 | `354.025 GB/s` | 稳定复现当前高位基线 |
+| alltoall | `NCCL_PXN_DISABLE=1` | `36.9377 GB/s` | 稳定复现当前瓶颈基线 |
+| graph allreduce | `NCCL_DEBUG=INFO` | `354.224 GB/s` | 与 counter run 一致 |
+| graph alltoall | `NCCL_PXN_DISABLE=1`, `NCCL_DEBUG=INFO` | `37.14 GB/s` | 与 counter run 一致 |
+
+对 PDF 目标的含义：
+
+- 2x8 allreduce 仍明显低于 PDF 2 机 16 GPU 目标 `491.84 GB/s`。
+- 2x8 alltoall 仍明显低于 PDF 2 机 16 GPU 目标 `76.54 GB/s`。
+- 本轮没有发现能把 8 卡 alltoall 推出 `36-37 GB/s` 平台的参数。
+
+## Counter 观察
+
+### Rail 流量
+
+allreduce 每条 rail 发送流量约 `178.03-178.07 GiB`，alltoall + PXN disabled 每条 rail 发送流量约 `712.23-712.28 GiB`。四条 400G rail 在两类测试中都均衡。
+
+### 错误/拥塞类计数
+
+本轮未看到 discard、symbol error、RoCE retrans、slow restart、packet sequence error 等硬错误增长。
+
+有增长的是 `port_xmit_wait`：
+
+| 测试 | 计数增长 |
+|---|---|
+| allreduce | `aikubeworker0016 mlx5_1 +6725565`, `mlx5_7 +6103180` |
+| alltoall + PXN disabled | `aikubeworker0016 mlx5_1 +20988680`, `mlx5_7 +16271960` |
+
+这说明 `port_xmit_wait` 不是 alltoall 独有现象；高吞吐 allreduce 也会出现。它可以作为交换网络/credit 等待的信号继续给网络侧看，但不能单独解释 alltoall 低带宽。
+
+## GRAPH/TUNING 对照
+
+| 观察项 | allreduce | alltoall + `NCCL_PXN_DISABLE=1` |
+|---|---:|---:|
+| `avg_busbw` | `354.224` | `37.14` |
+| `plugin_missing` | `16` | `16` |
+| GDR enabled lines | `1344` | `704` |
+| channel summary | `16 coll / 16 nvls / 16 p2p` | `16 coll / 16 nvls / 16 p2p` |
+| Pattern 4 | `crossNic 0`, `NVL/PXN` | `crossNic 2`, `NVL/PIX` |
+| `NET/IB/*/GDRDMA` lines | `256` | `512` |
+| `P2P/CUMEM` lines | `0` | `224` |
+| total NET/P2P edge lines | `256` | `736` |
+
+解释：
+
+- HCA、GDR、NCCL 版本和基础 channel 数量不是差异根因。
+- alltoall 的通信图明显更复杂，引入更多 NET/P2P 边，且 Pattern 4 从 allreduce 的 `NVL/PXN` 变成 `NVL/PIX`。
+- 这继续支持问题偏向 NCCL alltoall 图策略、internal IB plugin、缺少外部 `libnccl-net.so`/SHARP，或交换网络策略，而不是单纯链路坏、HCA 不通、GDR 没开。
+
+## PXN Disabled Sweep
+
+基线均为 `NCCL_PXN_DISABLE=1`，16G，2x8 GPU。
+
+| Case | 额外参数 | Avg Bus BW |
+|---|---|---:|
+| baseline | 无 | `36.8024` |
+| nvls_off | `NCCL_NVLS_ENABLE=0` | `36.8095` |
+| qps4_split1 | `NCCL_IB_QPS_PER_CONNECTION=4 NCCL_IB_SPLIT_DATA_ON_QPS=1` | `30.5464` |
+| qps8_split1 | `NCCL_IB_QPS_PER_CONNECTION=8 NCCL_IB_SPLIT_DATA_ON_QPS=1` | `23.9345` |
+| qps4_split0 | `NCCL_IB_QPS_PER_CONNECTION=4 NCCL_IB_SPLIT_DATA_ON_QPS=0` | `35.8679` |
+| channels16 | `NCCL_MIN_NCHANNELS=16 NCCL_MAX_NCHANNELS=16` | `37.1776` |
+| buff8m | `NCCL_BUFFSIZE=8388608` | `37.0265` |
+| p2pchunk4m | `NCCL_P2P_NET_CHUNKSIZE=4194304` | `37.0188` |
+| netpeer8 | `NCCL_NCHANNELS_PER_NET_PEER=8` | `31.103` |
+| ar0 | `NCCL_IB_AR_THRESHOLD=0` | `36.9965` |
+
+结论：
+
+- `channels16`、`buff8m`、`p2pchunk4m`、`ar0` 只有 0.5-1.0% 左右波动，不能视为有效优化。
+- `qps4_split1`、`qps8_split1`、`netpeer8` 明显负向。
+- 当前 8 卡 alltoall 不建议套用 PDF 固定 QP/split 参数。
+
+## 脚本修正验证
+
+复跑后发现脚本在 GRAPH 模式后会把 `NCCL_DEBUG=INFO` 继承到 sweep，导致 sweep 日志过大；同时 OpenMPI 会对未设置的 `-x` 变量打印 warning。
+
+已修正：
+
+- `set_common_env` 每个 case 重置到默认 `NCCL_DEBUG=WARN`。
+- `mpi_xargs` 只导出已经设置的环境变量。
+
+验证方式：
+
+- 本地 `bash -n scripts/multinode_nccl_deep_diagnose.sh` 通过。
+- 远端 1M tiny `all` 冒烟测试通过。
+- tiny 产物中 `could not find environment variable` 计数为 `0`。
+
+## 当前判断
+
+1. allreduce 的高位基线稳定，2x8 仍在 `354 GB/s` 左右。
+2. alltoall 即使 PXN disabled 并且 rail 均衡，也只能稳定在 `36-37 GB/s`。
+3. 未发现明显坏链路、重传、丢包、HCA 不通或 GDR disabled。
+4. 当前 4 条 400G rail 的硬件形态与 PDF 目标疑似不等价；PDF 2x8 allreduce 目标 `491.84 GB/s` 反推需要超过当前 4 rail 单向理论上限。
+5. alltoall 还需要从 NCCL net plugin/SHARP、交换机路径/ECMP/拥塞控制、以及 NCCL alltoall 图策略侧继续排。
-- 
2.47.2


From f64e85efafd75000512b38a1b2471200dddc8f74 Mon Sep 17 00:00:00 2001
From: cs <shi.chen@robotics.cc>
Date: Sat, 23 May 2026 18:54:35 +0800
Subject: [PATCH 19/41] Document NCCL environment equivalence gaps

---
 ...multinode_nccl_environment_gap_20260523.md | 168 ++++++++++++++++++
 1 file changed, 168 insertions(+)
 create mode 100644 reports_multinode_nccl_environment_gap_20260523.md

diff --git a/reports_multinode_nccl_environment_gap_20260523.md b/reports_multinode_nccl_environment_gap_20260523.md
new file mode 100644
index 0000000..c4a65a5
--- /dev/null
+++ b/reports_multinode_nccl_environment_gap_20260523.md
@@ -0,0 +1,168 @@
+# 多节点 NCCL 环境等价性缺口说明 2026-05-23
+
+## 目的
+
+这份文档用于回答一个核心问题：当前 `aikubeworker0012` / `aikubeworker0016` 是否具备与参考 PDF 的 2 机 16 GPU NCCL 目标相同的硬件和 NCCL 网络软件环境。
+
+结论先行：**当前环境不能证明与 PDF 参考环境等价**。主要差异有两类：
+
+1. 当前每节点只有 4 条可用于 NCCL 的 400G InfiniBand rail。
+2. 当前没有外部 NCCL net plugin / SHARP / HCOLL 组件，NCCL 使用 internal IB plugin。
+
+## 采集时间和节点
+
+采集时间：`2026-05-23T10:53:18+00:00` 至 `2026-05-23T10:53:21+00:00`
+
+| 节点 | SSH alias | 内网地址 | kernel |
+|---|---|---|---|
+| `aikubeworker0012` | `nccl-gpu-1` | `172.72.8.12` | `5.15.0-119-generic` |
+| `aikubeworker0016` | `nccl-gpu-2` | `172.72.8.16` | `5.15.0-119-generic` |
+
+## HCA / Rail 现状
+
+两台机器的 `/sys/class/infiniband/mlx5_*/ports/1` 结果一致：
+
+| HCA | State | Rate | Link layer | 对 NCCL 跨节点验收的含义 |
+|---|---|---:|---|---|
+| `mlx5_0` | ACTIVE | `400 Gb/sec (4X NDR)` | InfiniBand | 可作为 400G rail |
+| `mlx5_1` | ACTIVE | `400 Gb/sec (4X NDR)` | InfiniBand | 可作为 400G rail |
+| `mlx5_2` | ACTIVE | `25 Gb/sec (1X EDR)` | Ethernet | 不是 400G IB rail |
+| `mlx5_3` | DOWN | `25 Gb/sec (1X EDR)` | Ethernet | 不可用 |
+| `mlx5_4` | ACTIVE | `100 Gb/sec (2X HDR)` | InfiniBand | 不是 400G rail |
+| `mlx5_5` | ACTIVE | `100 Gb/sec (2X HDR)` | InfiniBand | 不是 400G rail |
+| `mlx5_6` | ACTIVE | `400 Gb/sec (4X NDR)` | InfiniBand | 可作为 400G rail |
+| `mlx5_7` | ACTIVE | `400 Gb/sec (4X NDR)` | InfiniBand | 可作为 400G rail |
+| `mlx5_8` | ACTIVE | `25 Gb/sec (1X EDR)` | Ethernet | 不是 400G IB rail |
+| `mlx5_9` | DOWN | `25 Gb/sec (1X EDR)` | Ethernet | 不可用 |
+
+因此当前推荐并实际使用的 HCA 列表是：
+
+```text
+NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_6,mlx5_7
+```
+
+这代表每节点 `4 x 400Gb/s`，理论单向原始带宽约：
+
+```text
+4 * 400Gb/s / 8 = 200 GB/s
+```
+
+## 与 PDF 目标的物理带宽关系
+
+参考 PDF 的 2 机 16 GPU 目标：
+
+| Operation | PDF Bus BW |
+|---|---:|
+| AllReduce | `491.84 GB/s` |
+| AllToAll | `76.54 GB/s` |
+
+NCCL allreduce 在 16 ranks 下，`busbw = algbw * 2 * (n - 1) / n = algbw * 1.875`。
+
+因此 PDF 的 allreduce `491.84 GB/s busbw` 反推：
+
+```text
+491.84 / 1.875 = 262.31 GB/s algbw
+```
+
+但当前 4 条 400G rail 的理论单向原始带宽约 `200 GB/s`（这里假设 2 节点 allreduce 走 tree 类算法，跨节点单向流量约等于 algbw，因此用 algbw 而不是 busbw 与 rail 带宽对比；该假设需结合 GRAPH 日志确认）。本项目实测 2x8 allreduce：
+
+| 测试 | Bus BW | 反推 Alg BW |
+|---|---:|---:|
+| 本轮深度诊断 allreduce | `354.025 GB/s` | `188.81 GB/s` |
+| 本轮 GRAPH allreduce | `354.224 GB/s` | `188.92 GB/s` |
+
+这已经接近当前 4 x 400G rail 的物理单向上限。除非 PDF 参考环境具备更多有效 400G rail、更高交换网络能力，或使用了当前缺失的网络加速组件，否则当前 2x8 allreduce 很难靠 NCCL 环境变量小调达到 `491.84 GB/s`。
+
+## GPU-NIC 亲和性影响
+
+`nvidia-smi topo -m` 显示的 NIC legend 两台一致：
+
+| NIC | HCA |
+|---|---|
+| NIC0 | `mlx5_0` |
+| NIC1 | `mlx5_1` |
+| NIC2 | `mlx5_2` |
+| NIC3 | `mlx5_3` |
+| NIC4 | `mlx5_4` |
+| NIC5 | `mlx5_5` |
+| NIC6 | `mlx5_6` |
+| NIC7 | `mlx5_7` |
+| NIC8 | `mlx5_8` |
+| NIC9 | `mlx5_9` |
+
+关键亲和关系：
+
+| GPU | 最近的有效 400G HCA |
+|---|---|
+| GPU0 | `mlx5_0` |
+| GPU1 | `mlx5_1` |
+| GPU4 | `mlx5_6` |
+| GPU5 | `mlx5_7` |
+
+这解释了为什么 2 机 4 GPU 档位需要使用：
+
+```text
+CUDA_VISIBLE_DEVICES=0,1,4,5
+```
+
+默认 GPU0/1/2/3 会把 GPU2/GPU3 放到非理想 NIC 亲和路径上，其中 GPU2 最近的 `mlx5_2/3` 不是可用 400G IB rail。
+
+## NCCL Net Plugin / SHARP 状态
+
+在两台节点上搜索：
+
+```text
+find /usr /opt /tmp /root -name 'libnccl-net*.so*' -o -name 'libsharp*.so*'
+```
+
+结果为空。
+
+两台节点包列表中能看到：
+
+| 包 | 版本/说明 |
+|---|---|
+| `doca-ofed` | `3.3.0-088000` |
+| `mlnx-ofed-kernel-dkms` | `26.01.OFED.26.01.1.0.0.1-1` |
+| `ucx` | `1.20.0-1.20260211...` |
+
+未看到：
+
+- `libnccl-net.so`
+- `libsharp*.so`
+- SHARP packages
+- HCOLL packages
+
+本轮 NCCL GRAPH 日志也显示 `plugin_missing=16`，说明 NCCL 只能走 internal IB plugin。
+
+## 当前 2x8 结果归因边界
+
+已经基本排除：
+
+- 不是 SSH / mpirun launch 问题：preflight 已通过。
+- 不是 HCA 完全不可用：4 条 400G rail 都 ACTIVE，allreduce 能跑到约 `354 GB/s busbw`。
+- 不是 GDR disabled：NCCL `2.27.7` 日志中 GDR enabled。
+- 不是 rail 完全打偏：`NCCL_PXN_DISABLE=1` 后 alltoall 四条 rail 流量均衡。
+- 不是明显坏链路/重传：counter 未见 discard、RoCE retrans、slow restart、packet sequence error 等增长。
+
+仍然成立的缺口：
+
+1. **2x8 allreduce 的 PDF 目标疑似超过当前 4 x 400G rail 物理能力。**
+2. **2x8 alltoall 即使 rail 均衡仍只有 `36-37 GB/s`，更像 NCCL alltoall 图策略、internal IB plugin 能力、缺少 SHARP/NCCL net plugin 或交换网络策略问题。**
+
+## 给网络/环境侧的确认清单
+
+请网络/环境侧确认以下问题：
+
+1. PDF 参考环境每节点实际参与 NCCL 的 400G rail 数量是多少？是否为 8 条 400G，而不是当前的 4 条 400G？
+2. PDF 命令中列出的 HCA 列表是否在参考环境中全部为 400G InfiniBand ACTIVE？
+3. PDF 参考环境是否启用了 NCCL net plugin、SHARP、HCOLL、UCX plugin 或交换机侧 SHARP aggregation？
+4. 当前交换网络是否开启 adaptive routing / ECMP / congestion control，是否存在跨 Leaf 场景下对 alltoall pattern 不友好的 hash 或路径限制？
+5. 当前 `mlx5_4/5` 为什么只有 100G，`mlx5_2/8` 为什么是 Ethernet 25G，`mlx5_3/9` 为什么 DOWN；这些是否符合机器采购和验收预期？
+6. 如果验收必须按 PDF 的 `491.84/76.54 GB/s`，是否需要更换到与 PDF 等价的 rail 数量/交换网络/软件栈再测。
+
+## 建议下一步
+
+1. 暂停继续盲调 NCCL 小参数；已有 sweep 显示收益不稳定或负向。
+2. 先让硬件/网络侧确认 rail 数量和速率是否与 PDF 等价。
+3. 如果确认硬件等价，再补齐 NCCL net plugin / SHARP 环境，并用 `scripts/multinode_nccl_deep_diagnose.sh graph` 复查 plugin 和 graph 变化。
+4. 如果硬件不等价，应调整验收阈值或改用与 PDF 等价的节点组合复测。
-- 
2.47.2


From 892f833ff4244c39485de69debd6dd5482abdb5a Mon Sep 17 00:00:00 2001
From: cs <shi.chen@robotics.cc>
Date: Sat, 23 May 2026 18:57:22 +0800
Subject: [PATCH 20/41] Add NCCL network handoff plan

---
 ...ts_multinode_nccl_handoff_plan_20260523.md | 150 ++++++++++++++++++
 1 file changed, 150 insertions(+)
 create mode 100644 reports_multinode_nccl_handoff_plan_20260523.md

diff --git a/reports_multinode_nccl_handoff_plan_20260523.md b/reports_multinode_nccl_handoff_plan_20260523.md
new file mode 100644
index 0000000..b13496b
--- /dev/null
+++ b/reports_multinode_nccl_handoff_plan_20260523.md
@@ -0,0 +1,150 @@
+# 多节点 NCCL 交接计划 2026-05-23
+
+## 当前一句话结论
+
+当前 2 机 x 8 卡（共 16 GPU）NCCL 已经排除旧 NCCL、GDR disabled、HCA 选择错误、SSH/mpirun launch、明显链路错误等问题；剩余差距集中在 **硬件 rail 数量是否与 PDF 等价**、**NCCL net plugin / SHARP 是否缺失**、以及 **alltoall 在当前跨 Leaf 网络下的图策略/交换路径效率**。
+
+## 已经验证的事实
+
+| 事实 | 当前证据 |
+|---|---|
+| 两台机器可用于 NCCL 的 400G IB rail 是 4 条 | `mlx5_0,mlx5_1,mlx5_6,mlx5_7` 均为 `400 Gb/sec (4X NDR)` |
+| 其他 HCA 不等价 | `mlx5_4/5` 为 100G IB，`mlx5_2/8` 为 25G Ethernet，`mlx5_3/9` DOWN |
+| NCCL 2.27.7 GDR 可用 | GRAPH/NET 日志中 GDR enabled |
+| allreduce 已接近当前 4 rail 物理上限 | `354 GB/s busbw`，反推 `189 GB/s algbw`，接近 4 x 400G 的 `200 GB/s` 单向原始带宽 |
+| alltoall PXN disabled 后 rail 均衡但仍低 | `36-37 GB/s busbw`，每条 rail 约 `19-20 GB/s` |
+| 没看到硬错误 | 未见 discard、RoCE retrans、slow restart、packet sequence error 等增长 |
+| 当前缺外部 NCCL 网络组件 | 未找到 `libnccl-net*.so*` / `libsharp*.so*`，未见 SHARP/HCOLL 包 |
+
+## PDF 目标与当前物理能力的冲突
+
+PDF 2 机 16 GPU allreduce 目标是：
+
+```text
+491.84 GB/s busbw
+```
+
+16 ranks allreduce 换算关系：
+
+```text
+busbw = algbw * 1.875
+```
+
+因此 PDF 目标反推：
+
+```text
+491.84 / 1.875 = 262.31 GB/s algbw
+```
+
+当前每节点 4 条 400G rail 的理论单向原始带宽：
+
+```text
+4 * 400Gb/s / 8 = 200 GB/s
+```
+
+所以如果 PDF 环境有更多有效 400G rail，或启用了 SHARP/NCCL net plugin，而当前环境没有，则当前节点不应直接按 PDF 2x8 目标判定。
+
+## 决策树
+
+### A. 如果验收坚持 PDF 原始阈值
+
+必须先证明当前环境与 PDF 等价：
+
+1. 每节点是否有 8 条 400G IB rail 可用？
+2. PDF 命令中的 HCA 在参考环境里是否全部是 400G IB ACTIVE？
+3. PDF 环境是否启用了 SHARP / NCCL net plugin / HCOLL / UCX plugin？
+4. 当前跨 Leaf 交换网络策略是否与 PDF 环境一致？
+
+如果任一项确认与 PDF 环境不一致或无法确认，应先补齐硬件/软件/网络环境再复测，不应继续靠 NCCL 小参数追 `491.84/76.54 GB/s`。
+
+### B. 如果验收按当前硬件形态重新定标
+
+建议把当前 2x8 allreduce 的可解释目标按 4 x 400G rail 物理能力重新评估：
+
+- allreduce 当前 `354 GB/s busbw`，反推 `189 GB/s algbw`，接近 `200 GB/s` 单向原始上限。
+- alltoall 当前 `36-37 GB/s` 仍偏低，需要作为独立问题继续排查。
+
+### C. 如果要继续优化 alltoall
+
+不要继续盲扫以下参数：
+
+- `NCCL_IB_QPS_PER_CONNECTION`
+- `NCCL_IB_SPLIT_DATA_ON_QPS`
+- `NCCL_NCHANNELS_PER_NET_PEER`
+- `NCCL_BUFFSIZE`
+- `NCCL_P2P_NET_CHUNKSIZE`
+- `NCCL_IB_AR_THRESHOLD`
+
+已有 sweep 表明它们没有稳定正收益，部分明显负向。
+
+优先做：
+
+1. 补齐并验证 `libnccl-net.so` / SHARP 环境。
+2. 让网络侧查跨 Leaf ECMP / adaptive routing / congestion control / credit wait。
+3. 用 `scripts/multinode_nccl_deep_diagnose.sh graph` 对比启用 plugin 前后的 NCCL graph。
+4. 如有等价 8 rail 节点，迁移同一脚本复测，确认 allreduce 物理上限是否抬升。
+
+## 给网络/硬件/环境侧的问题
+
+请直接确认下面这些问题：
+
+1. 这两台机器是否本来应该有 8 条 400G IB rail？如果是，为什么当前只有 4 条？
+2. `mlx5_4/5` 当前只有 100G，是配置、线缆、模块、交换机端口还是硬件限制？
+3. `mlx5_2/8` 为什么是 Ethernet 25G？是否预期不参与 IB NCCL？
+4. `mlx5_3/9` DOWN 是否符合预期？
+5. PDF 参考环境是否安装了 SHARP、HCOLL 或 NCCL net plugin？
+6. 当前交换机是否开启 adaptive routing，并且对 alltoall 这种多点到多点流量友好？
+7. 当前跨 Leaf 路径是否存在 ECMP hash 不均、PFC/credit wait、拥塞控制参数差异？
+
+## 后续复跑命令
+
+### 轻量检查
+
+```bash
+cd /root/test_gpu_scripts
+bash scripts/multinode_nccl_deep_diagnose.sh preflight
+```
+
+### 完整深度诊断
+
+```bash
+cd /root/test_gpu_scripts
+OUT_DIR=/root/test_gpu_scripts/reports/nccl_deep_diag_$(date +%Y%m%d_%H%M%S) \
+  bash scripts/multinode_nccl_deep_diagnose.sh all
+```
+
+### 启用新 NCCL plugin / SHARP 后的最小复核
+
+```bash
+cd /root/test_gpu_scripts
+OUT_DIR=/root/test_gpu_scripts/reports/nccl_deep_diag_plugin_check_$(date +%Y%m%d_%H%M%S) \
+  bash scripts/multinode_nccl_deep_diagnose.sh graph
+```
+
+复核重点：
+
+- `plugin_missing` 是否消失或明显减少。
+- NCCL 日志是否出现外部 net plugin。
+- alltoall graph 中 `P2P/CUMEM`、`NET/IB/*/GDRDMA`、`channel_edge_lines` 是否变化。
+- alltoall busbw 是否突破 `36-37 GB/s` 平台。
+
+## 关键文件
+
+| 文件 | 用途 |
+|---|---|
+| `reports_multinode_nccl_diagnosis_20260523.md` | 总诊断报告 |
+| `reports_multinode_nccl_deep_diagnose_run_20260523.md` | 本轮深度复跑结果 |
+| `reports_multinode_nccl_environment_gap_20260523.md` | 硬件/软件环境等价性缺口 |
+| `reports_multinode_nccl_counter_probe_20260523.md` | RDMA rail/counter 证据 |
+| `reports_multinode_nccl_alltoall_tuning_20260523.md` | alltoall 参数 sweep 和结论 |
+| `docs/multinode_nccl_deep_diagnose_runbook.md` | 诊断脚本 runbook |
+| `scripts/multinode_nccl_deep_diagnose.sh` | 可复跑诊断脚本 |
+
+## 当前建议
+
+当前不建议继续把精力放在 NCCL 环境变量微调上。更高价值的动作是：
+
+1. 确认 PDF 参考环境的 rail 数量、速率和 SHARP/plugin 状态。
+2. 补齐或明确排除 NCCL net plugin / SHARP。
+3. 让网络侧针对 alltoall 多点通信模式查跨 Leaf 路径和拥塞策略。
+4. 如果硬件不等价，调整验收阈值或换等价节点重测。
-- 
2.47.2


From ef56e5f15aaa21bbe6e6f31d543816e30d30bdc3 Mon Sep 17 00:00:00 2001
From: cs <shi.chen@robotics.cc>
Date: Sat, 23 May 2026 18:59:45 +0800
Subject: [PATCH 21/41] Add NCCL latest report index

---
 ...ts_multinode_nccl_latest_index_20260523.md | 144 ++++++++++++++++++
 1 file changed, 144 insertions(+)
 create mode 100644 reports_multinode_nccl_latest_index_20260523.md

diff --git a/reports_multinode_nccl_latest_index_20260523.md b/reports_multinode_nccl_latest_index_20260523.md
new file mode 100644
index 0000000..94d17b5
--- /dev/null
+++ b/reports_multinode_nccl_latest_index_20260523.md
@@ -0,0 +1,144 @@
+# 多节点 NCCL 最新索引 2026-05-23
+
+## 当前状态
+
+当前工作分支：`h100-acceptance-current`
+
+当前结论：
+
+- 2 机 4 GPU 档位（每节点 4 GPU）通过 GPU-NIC 亲和性修正后，已接近 PDF 参考值。
+- 2 机 8 GPU 档位（每节点 8 GPU，共 16 GPU）仍未达到 PDF 参考值：
+  - allreduce 当前约 `354 GB/s busbw`，PDF 目标 `491.84 GB/s`。
+  - alltoall 当前约 `36-37 GB/s busbw`，PDF 目标 `76.54 GB/s`。
+- 当前 2 机 8 GPU 剩余差距不再像是旧 NCCL、GDR disabled、HCA 顺序、SSH/mpirun 或明显坏链路问题。
+- 当前更像是硬件 rail 数量与 PDF 不等价、NCCL net plugin / SHARP 缺失、或跨 Leaf alltoall 网络/图策略问题。
+
+## 先看这三份
+
+| 顺序 | 文件 | 用途 |
+|---:|---|---|
+| 1 | `reports_multinode_nccl_handoff_plan_20260523.md` | 给网络/硬件/环境侧的交接计划，包含决策树、要问的问题和复跑命令 |
+| 2 | `reports_multinode_nccl_environment_gap_20260523.md` | 说明当前环境为什么不能证明与 PDF 等价，重点是 4 x 400G rail 和缺少 NCCL net plugin / SHARP |
+| 3 | `reports_multinode_nccl_deep_diagnose_run_20260523.md` | 本轮完整深度诊断复跑结果，包含 counter、GRAPH、PXN sweep |
+
+## 关键脚本
+
+| 文件 | 用途 |
+|---|---|
+| `scripts/multinode_nccl_deep_diagnose.sh` | 可复跑的多节点 NCCL 深度诊断脚本 |
+| `docs/multinode_nccl_deep_diagnose_runbook.md` | 诊断脚本中文 runbook |
+
+推荐先跑轻量检查：
+
+```bash
+cd /root/test_gpu_scripts
+bash scripts/multinode_nccl_deep_diagnose.sh preflight
+```
+
+完整复跑：
+
+```bash
+cd /root/test_gpu_scripts
+OUT_DIR=/root/test_gpu_scripts/reports/nccl_deep_diag_$(date +%Y%m%d_%H%M%S) \
+  bash scripts/multinode_nccl_deep_diagnose.sh all
+```
+
+启用 NCCL plugin / SHARP 后的最小复核：
+
+```bash
+cd /root/test_gpu_scripts
+OUT_DIR=/root/test_gpu_scripts/reports/nccl_deep_diag_plugin_check_$(date +%Y%m%d_%H%M%S) \
+  bash scripts/multinode_nccl_deep_diagnose.sh graph
+```
+
+## 远端机器上的最新同步文件
+
+三份关键报告已经同步到两台节点：
+
+```text
+/root/test_gpu_scripts/reports_multinode_nccl_handoff_plan_20260523.md
+/root/test_gpu_scripts/reports_multinode_nccl_environment_gap_20260523.md
+/root/test_gpu_scripts/reports_multinode_nccl_deep_diagnose_run_20260523.md
+```
+
+最新完整诊断产物目录在 `aikubeworker0012`：
+
+```text
+/root/test_gpu_scripts/reports/nccl_deep_diag_20260523_103932
+```
+
+该目录包含：
+
+- `preflight.txt`
+- `allreduce_counter/`
+- `alltoall_pxn_counter/`
+- `graph/`
+- `pxn_sweep/`
+
+## 当前证据摘要
+
+### HCA / rail
+
+两台节点当前有效 400G IB rail 一致：
+
+```text
+mlx5_0, mlx5_1, mlx5_6, mlx5_7
+```
+
+非等价 HCA：
+
+```text
+mlx5_4, mlx5_5: 100G InfiniBand
+mlx5_2, mlx5_8: 25G Ethernet
+mlx5_3, mlx5_9: DOWN
+```
+
+因此当前每节点可用于 NCCL 的 400G rail 是 4 条，理论单向原始带宽约 `200 GB/s`。
+
+PDF allreduce 目标 `491.84 GB/s busbw` 反推 `262.31 GB/s algbw`，超过当前 4 x 400G rail 的理论单向带宽。
+
+### NCCL / plugin
+
+当前两台节点没有找到：
+
+```text
+libnccl-net*.so*
+libsharp*.so*
+```
+
+也没有看到 SHARP/HCOLL 包。NCCL GRAPH 日志显示 `plugin_missing=16`，当前走 internal IB plugin。
+
+### 深度诊断
+
+本轮完整复跑：
+
+| 项目 | 结果 |
+|---|---:|
+| allreduce 16G | `354.025 GB/s` |
+| graph allreduce 16G | `354.224 GB/s` |
+| alltoall + PXN disabled 16G | `36.9377 GB/s` |
+| graph alltoall + PXN disabled 16G | `37.14 GB/s` |
+
+PXN disabled sweep 未发现有效参数：
+
+- `channels16`、`buff8m`、`p2pchunk4m`、`ar0` 只有小幅噪声级波动。
+- `qps4_split1`、`qps8_split1`、`netpeer8` 明显负向。
+
+## 历史/支撑报告
+
+| 文件 | 说明 |
+|---|---|
+| `reports_multinode_nccl_diagnosis_20260523.md` | 长版总诊断，包含从旧 NCCL/GDR disabled 到 PDF 矩阵对齐的全过程 |
+| `reports_multinode_nccl_pdf_matrix_nccl227.md` | 按 PDF 矩阵跑出的正式 raw report |
+| `reports_multinode_nccl_counter_probe_20260523.md` | RDMA rail 和 counter 证据 |
+| `reports_multinode_nccl_alltoall_tuning_20260523.md` | alltoall PXN 和参数 sweep 结论 |
+| `reports_rdma_single_node_summary.md` | 单节点 RDMA/HCA 速率摘要 |
+| `docs/multinode_nccl_concepts.md` | NCCL/RDMA 概念解释 |
+
+## 给下一位接手人的路线
+
+1. 先读 `reports_multinode_nccl_handoff_plan_20260523.md`。
+2. 用 `reports_multinode_nccl_environment_gap_20260523.md` 和硬件/网络侧确认当前节点是否应具备 8 条 400G rail。
+3. 如果硬件不等价，调整验收口径或换等价节点复测。
+4. 如果硬件确认等价，先补齐 NCCL net plugin / SHARP，再跑 `scripts/multinode_nccl_deep_diagnose.sh graph` 对比 plugin 前后。
+5. alltoall 继续排查时优先找网络路径/ECMP/adaptive routing/拥塞策略，不建议继续盲扫 NCCL 小参数。
-- 
2.47.2


From cadfbcfaa3ec28f9bd3ec44768859f9a1a0ad2e2 Mon Sep 17 00:00:00 2001
From: cs <shi.chen@robotics.cc>
Date: Sat, 23 May 2026 19:13:35 +0800
Subject: [PATCH 22/41] Add NCCL environment snapshot script

---
 ...ts_multinode_nccl_handoff_plan_20260523.md |   8 +
 ...ts_multinode_nccl_latest_index_20260523.md |  15 ++
 scripts/nccl_environment_snapshot.sh          | 169 ++++++++++++++++++
 3 files changed, 192 insertions(+)
 create mode 100644 scripts/nccl_environment_snapshot.sh

diff --git a/reports_multinode_nccl_handoff_plan_20260523.md b/reports_multinode_nccl_handoff_plan_20260523.md
index b13496b..fb4e354 100644
--- a/reports_multinode_nccl_handoff_plan_20260523.md
+++ b/reports_multinode_nccl_handoff_plan_20260523.md
@@ -105,6 +105,13 @@ cd /root/test_gpu_scripts
 bash scripts/multinode_nccl_deep_diagnose.sh preflight
 ```
 
+### 单节点环境等价性快照
+
+```bash
+cd /root/test_gpu_scripts
+bash scripts/nccl_environment_snapshot.sh reports/nccl_environment_snapshot_$(hostname)_$(date +%Y%m%d_%H%M%S).md
+```
+
 ### 完整深度诊断
 
 ```bash
@@ -139,6 +146,7 @@ OUT_DIR=/root/test_gpu_scripts/reports/nccl_deep_diag_plugin_check_$(date +%Y%m%
 | `reports_multinode_nccl_alltoall_tuning_20260523.md` | alltoall 参数 sweep 和结论 |
 | `docs/multinode_nccl_deep_diagnose_runbook.md` | 诊断脚本 runbook |
 | `scripts/multinode_nccl_deep_diagnose.sh` | 可复跑诊断脚本 |
+| `scripts/nccl_environment_snapshot.sh` | 单节点 HCA/plugin/topo 快照脚本 |
 
 ## 当前建议
 
diff --git a/reports_multinode_nccl_latest_index_20260523.md b/reports_multinode_nccl_latest_index_20260523.md
index 94d17b5..4ccbc23 100644
--- a/reports_multinode_nccl_latest_index_20260523.md
+++ b/reports_multinode_nccl_latest_index_20260523.md
@@ -26,6 +26,7 @@
 | 文件 | 用途 |
 |---|---|
 | `scripts/multinode_nccl_deep_diagnose.sh` | 可复跑的多节点 NCCL 深度诊断脚本 |
+| `scripts/nccl_environment_snapshot.sh` | 单节点 NCCL/RDMA 环境等价性快照脚本，不启动 NCCL workload |
 | `docs/multinode_nccl_deep_diagnose_runbook.md` | 诊断脚本中文 runbook |
 
 推荐先跑轻量检查：
@@ -35,6 +36,13 @@ cd /root/test_gpu_scripts
 bash scripts/multinode_nccl_deep_diagnose.sh preflight
 ```
 
+采集单节点环境快照：
+
+```bash
+cd /root/test_gpu_scripts
+bash scripts/nccl_environment_snapshot.sh reports/nccl_environment_snapshot_$(hostname)_$(date +%Y%m%d_%H%M%S).md
+```
+
 完整复跑：
 
 ```bash
@@ -75,6 +83,13 @@ OUT_DIR=/root/test_gpu_scripts/reports/nccl_deep_diag_plugin_check_$(date +%Y%m%
 - `graph/`
 - `pxn_sweep/`
 
+最新单节点环境快照：
+
+```text
+aikubeworker0012: /root/test_gpu_scripts/reports/nccl_environment_snapshot_aikubeworker0012_20260523_111142.md
+aikubeworker0016: /root/test_gpu_scripts/reports/nccl_environment_snapshot_aikubeworker0016_20260523_111143.md
+```
+
 ## 当前证据摘要
 
 ### HCA / rail
diff --git a/scripts/nccl_environment_snapshot.sh b/scripts/nccl_environment_snapshot.sh
new file mode 100644
index 0000000..77725ff
--- /dev/null
+++ b/scripts/nccl_environment_snapshot.sh
@@ -0,0 +1,169 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Collect a lightweight NCCL/RDMA environment snapshot on one node.
+# This script does not run NCCL workloads and is safe to use before deeper tests.
+
+HOST="$(hostname 2>/dev/null || echo unknown)"
+TS="$(date +%Y%m%d_%H%M%S)"
+OUT_FILE="${1:-${OUT_FILE:-/tmp/nccl_environment_snapshot_${HOST}_${TS}.md}}"
+PDF_ALLREDUCE_BUSBW="${PDF_ALLREDUCE_BUSBW:-491.84}"
+PDF_ALLTOALL_BUSBW="${PDF_ALLTOALL_BUSBW:-76.54}"
+PLUGIN_SEARCH_ROOTS="${PLUGIN_SEARCH_ROOTS:-/usr /opt /tmp /root}"
+
+mkdir -p "$(dirname "$OUT_FILE")"
+shopt -s nullglob
+
+have_cmd() {
+  command -v "$1" >/dev/null 2>&1
+}
+
+emit_cmd() {  # usage: emit_cmd TITLE CMD [ARGS...] -- append CMD's output to $OUT_FILE under a "### TITLE" heading
+  local title="$1"
+  shift  # the remaining "$@" is the command to run
+  {
+    echo
+    echo "### $title"
+    echo
+    echo '```text'
+    "$@" 2>&1 || true  # stderr is captured too; "|| true" keeps a failing probe from aborting the script under set -e
+    echo '```'
+  } >>"$OUT_FILE"
+}
+
+active_400g_hcas=()
+non_400g_rows=()
+
+{
+  echo "# NCCL/RDMA 环境快照"
+  echo
+  echo "- Host: \`$HOST\`"
+  echo "- Time: \`$(date -Is 2>/dev/null || date)\`"
+  echo "- Kernel: \`$(uname -r 2>/dev/null || echo unknown)\`"
+  echo
+  echo "## HCA / Port 状态"
+  echo
+  echo "| HCA | Port | State | Phys State | Rate | Link Layer | 400G IB Rail |"
+  echo "|---|---:|---|---|---:|---|---|"
+} >"$OUT_FILE"
+
+hca_paths=(/sys/class/infiniband/mlx5_*)  # nullglob is enabled earlier, so an unmatched glob leaves this array empty
+if ((${#hca_paths[@]})); then
+  for hca_path in "${hca_paths[@]}"; do
+    hca="$(basename "$hca_path")"
+    for port_path in "$hca_path"/ports/*; do
+      [[ -d "$port_path" ]] || continue
+      port="$(basename "$port_path")"
+      state="$(cat "$port_path/state" 2>/dev/null || echo NA)"  # unreadable sysfs attributes are reported as NA
+      phys_state="$(cat "$port_path/phys_state" 2>/dev/null || echo NA)"
+      rate="$(cat "$port_path/rate" 2>/dev/null || echo NA)"
+      layer="$(cat "$port_path/link_layer" 2>/dev/null || echo NA)"
+      is_400g="NO"
+      if [[ "$state" == *"ACTIVE"* && "$rate" == 400\ Gb/sec* && "$layer" == "InfiniBand" ]]; then  # a rail must be ACTIVE, 400 Gb/sec and InfiniBand
+        is_400g="YES"
+        active_400g_hcas+=("$hca")  # appended once per qualifying port
+      else
+        non_400g_rows+=("$hca port=$port state=$state rate=$rate layer=$layer")
+      fi
+      printf '| `%s` | `%s` | `%s` | `%s` | `%s` | `%s` | `%s` |\n' \
+        "$hca" "$port" "$state" "$phys_state" "$rate" "$layer" "$is_400g" >>"$OUT_FILE"
+    done
+  done
+else
+  printf '| N/A | N/A | `%s` | N/A | N/A | N/A | NO |\n' "/sys/class/infiniband/mlx5_* not found" >>"$OUT_FILE"  # placeholder table row when no mlx5 device exists
+fi
+
+{
+  echo
+  echo "## Rail 摘要"
+  echo
+  if ((${#active_400g_hcas[@]})); then
+    hca_csv="$(IFS=,; echo "${active_400g_hcas[*]}")"
+    echo "- Active 400G IB rail count: \`${#active_400g_hcas[@]}\`"
+    echo "- Candidate \`NCCL_IB_HCA\`: \`$hca_csv\`"
+    echo "- Theoretical one-way raw bandwidth: \`${#active_400g_hcas[@]} * 400Gb/s / 8 = $((${#active_400g_hcas[@]} * 50)) GB/s\`"
+  else
+    echo "- Active 400G IB rail count: \`0\`"
+    echo "- Candidate \`NCCL_IB_HCA\`: \`N/A\`"
+  fi
+  echo
+  echo "Non-400G / non-IB / down ports:"
+  echo
+  if ((${#non_400g_rows[@]})); then
+    for row in "${non_400g_rows[@]}"; do
+      echo "- \`$row\`"
+    done
+  else
+    echo "- none"
+  fi
+  echo
+  echo "## PDF 目标换算"
+  echo
+  echo "- PDF allreduce busbw target: \`${PDF_ALLREDUCE_BUSBW} GB/s\`"
+  echo "- PDF alltoall busbw target: \`${PDF_ALLTOALL_BUSBW} GB/s\`"
+} >>"$OUT_FILE"
+
+python3 - "$PDF_ALLREDUCE_BUSBW" "${#active_400g_hcas[@]}" >>"$OUT_FILE" <<'PY' || true
+import sys
+
+busbw = float(sys.argv[1])  # PDF allreduce bus-bandwidth target in GB/s, passed in by the shell wrapper
+rail_count = int(sys.argv[2])  # number of active 400G InfiniBand ports counted by the shell wrapper
+algbw = busbw / 1.875  # 16-rank allreduce: busbw = algbw * 2 * (n - 1) / n, and 2 * 15 / 16 = 1.875
+raw = rail_count * 50.0  # 400 Gb/s / 8 = 50 GB/s one-way raw bandwidth per rail
+print(f"- 16-rank allreduce implied algbw: `{algbw:.2f} GB/s`")
+if rail_count:
+    pct = algbw / raw * 100
+    print(f"- Implied algbw / current raw 400G rail bandwidth: `{pct:.1f}%`")
+    if algbw > raw:
+        print("- Interpretation: PDF allreduce target is above current 400G rail one-way raw bandwidth.")
+    else:
+        print("- Interpretation: PDF allreduce target is within current 400G rail one-way raw bandwidth.")
+else:
+    print("- Interpretation: no active 400G IB rail was detected.")
+PY
+
+{
+  echo
+  echo "## NCCL Net Plugin / SHARP 文件"
+  echo
+  echo '```text'
+} >>"$OUT_FILE"
+
+read -r -a plugin_roots <<<"$PLUGIN_SEARCH_ROOTS"
+find "${plugin_roots[@]}" \( -name 'libnccl-net*.so*' -o -name 'libsharp*.so*' \) \
+  2>/dev/null | sort >>"$OUT_FILE" || true
+
+if ! grep -q 'libnccl-net\|libsharp' "$OUT_FILE"; then
+  echo "none found under $PLUGIN_SEARCH_ROOTS" >>"$OUT_FILE"
+fi
+
+echo '```' >>"$OUT_FILE"
+
+if have_cmd dpkg; then  # list installed packages relevant to NCCL/RDMA, capped at the first 160 matching lines
+  emit_cmd "Relevant Debian packages" bash -lc "dpkg -l | grep -Ei 'nccl|sharp|hcoll|ucx|ofed|mlnx' | sed -n '1,160p'"
+else
+  emit_cmd "Relevant packages" bash -lc "echo 'dpkg not found'"
+fi
+
+if have_cmd nvidia-smi; then
+  emit_cmd "nvidia-smi topo -m" nvidia-smi topo -m
+else
+  emit_cmd "nvidia-smi topo -m" bash -lc "echo 'nvidia-smi not found'"
+fi
+
+if have_cmd ibstat; then
+  emit_cmd "ibstat" ibstat
+fi
+
+{
+  echo
+  echo "## 建议判断"
+  echo
+  echo "1. 如果 Active 400G IB rail 少于 PDF 参考环境，不能直接按 PDF 阈值判断等价。"
+  echo "2. 如果没有 \`libnccl-net*.so*\` / \`libsharp*.so*\`，NCCL 可能只能走 internal IB plugin。"
+  echo "3. 若要追 PDF 2x8 目标，请先确认 rail 数量、SHARP/NCCL net plugin、跨 Leaf 交换策略是否与 PDF 环境一致。"
+  echo
+  echo "Snapshot written to: \`$OUT_FILE\`"
+} >>"$OUT_FILE"
+
+echo "$OUT_FILE"
-- 
2.47.2


From 2c5c31e451d5c861d69b206992dd362e0cf112db Mon Sep 17 00:00:00 2001
From: cs <shi.chen@robotics.cc>
Date: Sat, 23 May 2026 19:16:40 +0800
Subject: [PATCH 23/41] Add single-node H100 all runner

---
 ...ts_multinode_nccl_handoff_plan_20260523.md |   8 ++
 ...ts_multinode_nccl_latest_index_20260523.md |   8 ++
 scripts/run_h100_single_node_all.sh           | 134 ++++++++++++++++++
 3 files changed, 150 insertions(+)
 create mode 100755 scripts/run_h100_single_node_all.sh

diff --git a/reports_multinode_nccl_handoff_plan_20260523.md b/reports_multinode_nccl_handoff_plan_20260523.md
index fb4e354..9b639ad 100644
--- a/reports_multinode_nccl_handoff_plan_20260523.md
+++ b/reports_multinode_nccl_handoff_plan_20260523.md
@@ -112,6 +112,13 @@ cd /root/test_gpu_scripts
 bash scripts/nccl_environment_snapshot.sh reports/nccl_environment_snapshot_$(hostname)_$(date +%Y%m%d_%H%M%S).md
 ```
 
+### 单节点 H100 原始 all 报告
+
+```bash
+cd /root/test_gpu_scripts
+bash scripts/run_h100_single_node_all.sh
+```
+
 ### 完整深度诊断
 
 ```bash
@@ -147,6 +154,7 @@ OUT_DIR=/root/test_gpu_scripts/reports/nccl_deep_diag_plugin_check_$(date +%Y%m%
 | `docs/multinode_nccl_deep_diagnose_runbook.md` | 诊断脚本 runbook |
 | `scripts/multinode_nccl_deep_diagnose.sh` | 可复跑诊断脚本 |
 | `scripts/nccl_environment_snapshot.sh` | 单节点 HCA/plugin/topo 快照脚本 |
+| `scripts/run_h100_single_node_all.sh` | 单节点原始 `test all` 报告入口 |
 
 ## 当前建议
 
diff --git a/reports_multinode_nccl_latest_index_20260523.md b/reports_multinode_nccl_latest_index_20260523.md
index 4ccbc23..2aa9bd3 100644
--- a/reports_multinode_nccl_latest_index_20260523.md
+++ b/reports_multinode_nccl_latest_index_20260523.md
@@ -27,8 +27,16 @@
 |---|---|
 | `scripts/multinode_nccl_deep_diagnose.sh` | 可复跑的多节点 NCCL 深度诊断脚本 |
 | `scripts/nccl_environment_snapshot.sh` | 单节点 NCCL/RDMA 环境等价性快照脚本，不启动 NCCL workload |
+| `scripts/run_h100_single_node_all.sh` | 单节点 H100 `test all` 原始报告入口，默认同时采环境快照 |
 | `docs/multinode_nccl_deep_diagnose_runbook.md` | 诊断脚本中文 runbook |
 
+单节点 H100 原始 all 报告：
+
+```bash
+cd /root/test_gpu_scripts
+bash scripts/run_h100_single_node_all.sh
+```
+
 推荐先跑轻量检查：
 
 ```bash
diff --git a/scripts/run_h100_single_node_all.sh b/scripts/run_h100_single_node_all.sh
new file mode 100755
index 0000000..91d25fe
--- /dev/null
+++ b/scripts/run_h100_single_node_all.sh
@@ -0,0 +1,134 @@
+#!/usr/bin/env bash
+set -uo pipefail
+
+# Run the single-node H100 acceptance suite and keep the raw report paths stable.
+# The suite itself still lives in gpu_tester.py; this wrapper only standardizes
+# snapshot/report naming for repeated machine-level runs.
+
+SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
+PROJECT_DIR="$(cd -- "$SCRIPT_DIR/.." >/dev/null 2>&1 && pwd)"
+
+PYTHON_BIN="${PYTHON_BIN:-/root/gpu-test-venv/bin/python}"
+CONFIG_FILE="${CONFIG_FILE:-$PROJECT_DIR/configs/default.yaml}"
+OUT_DIR="${OUT_DIR:-$PROJECT_DIR/reports}"
+FORMAT="${FORMAT:-md}"
+DRY_RUN=0
+SNAPSHOT=1
+
+usage() {
+  cat <<'EOF'
+Usage: run_h100_single_node_all.sh [options]
+
+Options:
+  --python PATH       Python executable (default: /root/gpu-test-venv/bin/python)
+  --config PATH       gpu_tester config file (default: configs/default.yaml)
+  --out-dir PATH      Report output directory (default: reports)
+  --format FORMAT     Report format: md, json, or html (default: md)
+  --no-snapshot       Do not run nccl_environment_snapshot.sh first
+  --dry-run           Print commands without running them
+  -h, --help          Show this help
+EOF
+}
+
+while (($#)); do  # consume CLI options; "${2?...}" aborts with a clear message when an option's value is missing
+  case "$1" in
+    --python)
+      PYTHON_BIN="${2?--python requires a PATH argument}"
+      shift 2
+      ;;
+    --config)
+      CONFIG_FILE="${2?--config requires a PATH argument}"
+      shift 2
+      ;;
+    --out-dir)
+      OUT_DIR="${2?--out-dir requires a PATH argument}"
+      shift 2
+      ;;
+    --format)
+      FORMAT="${2?--format requires md, json, or html}"
+      shift 2
+      ;;
+    --no-snapshot)
+      SNAPSHOT=0
+      shift
+      ;;
+    --dry-run)
+      DRY_RUN=1
+      shift
+      ;;
+    -h|--help)
+      usage
+      exit 0
+      ;;
+    *)
+      echo "Unknown argument: $1" >&2
+      usage >&2
+      exit 2
+      ;;
+  esac
+done
+
+if [[ "$FORMAT" != "md" && "$FORMAT" != "json" && "$FORMAT" != "html" ]]; then
+  echo "Unsupported format: $FORMAT" >&2
+  exit 2
+fi
+
+if [[ ! -x "$PYTHON_BIN" ]]; then
+  PYTHON_BIN="$(command -v python3 || true)"
+fi
+
+if [[ -z "$PYTHON_BIN" || ! -x "$PYTHON_BIN" ]]; then
+  echo "Python executable not found. Set --python or PYTHON_BIN." >&2
+  exit 1
+fi
+
+HOST="$(hostname 2>/dev/null || echo unknown)"
+TS="$(date +%Y%m%d_%H%M%S)"
+mkdir -p "$OUT_DIR"
+
+SNAPSHOT_FILE="$OUT_DIR/nccl_environment_snapshot_${HOST}_${TS}.md"
+REPORT_FILE="$OUT_DIR/h100_single_node_all_${HOST}_${TS}.${FORMAT}"
+
+snapshot_cmd=(bash "$PROJECT_DIR/scripts/nccl_environment_snapshot.sh" "$SNAPSHOT_FILE")
+test_cmd=(
+  "$PYTHON_BIN" "$PROJECT_DIR/gpu_tester.py"
+  --config "$CONFIG_FILE"
+  --test all
+  --report
+  --format "$FORMAT"
+  --output "$REPORT_FILE"
+)
+
+echo "Project: $PROJECT_DIR"
+echo "Host: $HOST"
+echo "Config: $CONFIG_FILE"
+echo "Report: $REPORT_FILE"
+if ((SNAPSHOT)); then
+  echo "Snapshot: $SNAPSHOT_FILE"
+fi
+
+if ((DRY_RUN)); then
+  if ((SNAPSHOT)); then
+    printf 'DRY RUN snapshot:'
+    printf ' %q' "${snapshot_cmd[@]}"
+    printf '\n'
+  fi
+  printf 'DRY RUN test:'
+  printf ' %q' "${test_cmd[@]}"
+  printf '\n'
+  exit 0
+fi
+
+if ((SNAPSHOT)); then
+  "${snapshot_cmd[@]}"
+fi
+
+"${test_cmd[@]}"
+status=$?
+
+echo "Report written to: $REPORT_FILE"
+if ((SNAPSHOT)); then
+  echo "Snapshot written to: $SNAPSHOT_FILE"
+fi
+
+exit "$status"
-- 
2.47.2


From 8923270ce038d70a2011a2c140ad60f0aa4fed2f Mon Sep 17 00:00:00 2001
From: cs <shi.chen@robotics.cc>
Date: Sat, 23 May 2026 19:21:58 +0800
Subject: [PATCH 24/41] Add multinode NCCL PDF matrix runner

---
 configs/multinode_nccl_nccl227_16g.yaml       |   2 +-
 configs/multinode_nccl_nccl227_auto_16g.yaml  |   2 +-
 .../multinode_nccl_nccl227_diagnostic.yaml    |   2 +-
 .../multinode_nccl_nccl227_pdf_matrix.yaml    |   2 +-
 configs/multinode_nccl_nccl227_sweep.yaml     |   2 +-
 docs/multinode_nccl_deep_diagnose_runbook.md  |  12 +-
 ...ts_multinode_nccl_handoff_plan_20260523.md |   9 ++
 ...ts_multinode_nccl_latest_index_20260523.md |   9 ++
 scripts/run_multinode_nccl_pdf_matrix.sh      | 142 ++++++++++++++++++
 9 files changed, 176 insertions(+), 6 deletions(-)
 create mode 100755 scripts/run_multinode_nccl_pdf_matrix.sh

diff --git a/configs/multinode_nccl_nccl227_16g.yaml b/configs/multinode_nccl_nccl227_16g.yaml
index c5552fe..5f57a4b 100644
--- a/configs/multinode_nccl_nccl227_16g.yaml
+++ b/configs/multinode_nccl_nccl227_16g.yaml
@@ -23,7 +23,7 @@ multinode_nccl:
     - /usr/mpi/gcc/openmpi-4.1.9a1/lib
     - /tmp/nccl-2.27.7-cuda12.4/usr/lib/x86_64-linux-gnu
     - /usr/local/cuda-12.4/targets/x86_64-linux/lib
-  nccl_tests_dir: null
+  nccl_tests_dir: /data/nccl-tests-latest/build
   tests:
     - all_reduce_perf
     - alltoall_perf
diff --git a/configs/multinode_nccl_nccl227_auto_16g.yaml b/configs/multinode_nccl_nccl227_auto_16g.yaml
index 2492989..f547bff 100644
--- a/configs/multinode_nccl_nccl227_auto_16g.yaml
+++ b/configs/multinode_nccl_nccl227_auto_16g.yaml
@@ -23,7 +23,7 @@ multinode_nccl:
     - /usr/mpi/gcc/openmpi-4.1.9a1/lib
     - /tmp/nccl-2.27.7-cuda12.4/usr/lib/x86_64-linux-gnu
     - /usr/local/cuda-12.4/targets/x86_64-linux/lib
-  nccl_tests_dir: null
+  nccl_tests_dir: /data/nccl-tests-latest/build
   tests:
     - all_reduce_perf
     - alltoall_perf
diff --git a/configs/multinode_nccl_nccl227_diagnostic.yaml b/configs/multinode_nccl_nccl227_diagnostic.yaml
index 5465772..64c0479 100644
--- a/configs/multinode_nccl_nccl227_diagnostic.yaml
+++ b/configs/multinode_nccl_nccl227_diagnostic.yaml
@@ -23,7 +23,7 @@ multinode_nccl:
     - /usr/mpi/gcc/openmpi-4.1.9a1/lib
     - /tmp/nccl-2.27.7-cuda12.4/usr/lib/x86_64-linux-gnu
     - /usr/local/cuda-12.4/targets/x86_64-linux/lib
-  nccl_tests_dir: null
+  nccl_tests_dir: /data/nccl-tests-latest/build
   tests:
     - all_reduce_perf
     - alltoall_perf
diff --git a/configs/multinode_nccl_nccl227_pdf_matrix.yaml b/configs/multinode_nccl_nccl227_pdf_matrix.yaml
index 00a3220..2c33573 100644
--- a/configs/multinode_nccl_nccl227_pdf_matrix.yaml
+++ b/configs/multinode_nccl_nccl227_pdf_matrix.yaml
@@ -23,7 +23,7 @@ multinode_nccl:
     - /usr/mpi/gcc/openmpi-4.1.9a1/lib
     - /tmp/nccl-2.27.7-cuda12.4/usr/lib/x86_64-linux-gnu
     - /usr/local/cuda-12.4/targets/x86_64-linux/lib
-  nccl_tests_dir: null
+  nccl_tests_dir: /data/nccl-tests-latest/build
   tests:
     - all_reduce_perf
     - alltoall_perf
diff --git a/configs/multinode_nccl_nccl227_sweep.yaml b/configs/multinode_nccl_nccl227_sweep.yaml
index da96ef1..f46a4ab 100644
--- a/configs/multinode_nccl_nccl227_sweep.yaml
+++ b/configs/multinode_nccl_nccl227_sweep.yaml
@@ -23,7 +23,7 @@ multinode_nccl:
     - /usr/mpi/gcc/openmpi-4.1.9a1/lib
     - /tmp/nccl-2.27.7-cuda12.4/usr/lib/x86_64-linux-gnu
     - /usr/local/cuda-12.4/targets/x86_64-linux/lib
-  nccl_tests_dir: null
+  nccl_tests_dir: /data/nccl-tests-latest/build
   tests:
     - all_reduce_perf
     - alltoall_perf
diff --git a/docs/multinode_nccl_deep_diagnose_runbook.md b/docs/multinode_nccl_deep_diagnose_runbook.md
index 11a0629..8bd082e 100644
--- a/docs/multinode_nccl_deep_diagnose_runbook.md
+++ b/docs/multinode_nccl_deep_diagnose_runbook.md
@@ -24,6 +24,16 @@ bash scripts/multinode_nccl_deep_diagnose.sh preflight
 bash scripts/multinode_nccl_deep_diagnose.sh all
 ```
 
+如果要按 PDF 参考矩阵跑正式多机多卡报告，使用：
+
+```bash
+cd /root/test_gpu_scripts
+bash scripts/run_multinode_nccl_pdf_matrix.sh
+```
+
+它会跑 2 机 x 1/2/4/8 GPU per node 的 `all_reduce_perf` 和 `alltoall_perf`，输出到
+`reports/multinode_nccl_pdf_matrix_YYYYMMDD_HHMMSS.md`。
+
 默认输出目录为：
 
 ```text
@@ -63,7 +73,7 @@ bash scripts/multinode_nccl_deep_diagnose.sh all
 如果 nccl-tests 或 NCCL 运行库路径变化：
 
 ```bash
-NCCL_TESTS_DIR=/opt/gpu-test-tools/nccl-tests/build \
+NCCL_TESTS_DIR=/data/nccl-tests-latest/build \
 NCCL_LD_LIBRARY_PATH=/usr/mpi/gcc/openmpi-4.1.9a1/lib:/path/to/nccl/lib:/usr/local/cuda/lib64 \
 bash scripts/multinode_nccl_deep_diagnose.sh graph
 ```
diff --git a/reports_multinode_nccl_handoff_plan_20260523.md b/reports_multinode_nccl_handoff_plan_20260523.md
index 9b639ad..6df9c66 100644
--- a/reports_multinode_nccl_handoff_plan_20260523.md
+++ b/reports_multinode_nccl_handoff_plan_20260523.md
@@ -119,6 +119,13 @@ cd /root/test_gpu_scripts
 bash scripts/run_h100_single_node_all.sh
 ```
 
+### 多机多卡 PDF 矩阵
+
+```bash
+cd /root/test_gpu_scripts
+bash scripts/run_multinode_nccl_pdf_matrix.sh
+```
+
 ### 完整深度诊断
 
 ```bash
@@ -155,6 +162,8 @@ OUT_DIR=/root/test_gpu_scripts/reports/nccl_deep_diag_plugin_check_$(date +%Y%m%
 | `scripts/multinode_nccl_deep_diagnose.sh` | 可复跑诊断脚本 |
 | `scripts/nccl_environment_snapshot.sh` | 单节点 HCA/plugin/topo 快照脚本 |
 | `scripts/run_h100_single_node_all.sh` | 单节点原始 `test all` 报告入口 |
+| `scripts/run_multinode_nccl_pdf_matrix.sh` | 多机多卡 PDF 矩阵报告入口 |
+| `configs/multinode_nccl_nccl227_pdf_matrix.yaml` | 多机多卡 PDF 矩阵配置 |
 
 ## 当前建议
 
diff --git a/reports_multinode_nccl_latest_index_20260523.md b/reports_multinode_nccl_latest_index_20260523.md
index 2aa9bd3..2d5b2ae 100644
--- a/reports_multinode_nccl_latest_index_20260523.md
+++ b/reports_multinode_nccl_latest_index_20260523.md
@@ -28,8 +28,17 @@
 | `scripts/multinode_nccl_deep_diagnose.sh` | 可复跑的多节点 NCCL 深度诊断脚本 |
 | `scripts/nccl_environment_snapshot.sh` | 单节点 NCCL/RDMA 环境等价性快照脚本，不启动 NCCL workload |
 | `scripts/run_h100_single_node_all.sh` | 单节点 H100 `test all` 原始报告入口，默认同时采环境快照 |
+| `scripts/run_multinode_nccl_pdf_matrix.sh` | 多机多卡 PDF 矩阵入口，跑 2 机 x 1/2/4/8 GPU per node 的 allreduce/alltoall |
+| `configs/multinode_nccl_nccl227_pdf_matrix.yaml` | 多机多卡 PDF 矩阵配置，固定 NCCL 2.27.7 和 `/data/nccl-tests-latest/build` |
 | `docs/multinode_nccl_deep_diagnose_runbook.md` | 诊断脚本中文 runbook |
 
+多机多卡 PDF 矩阵：
+
+```bash
+cd /root/test_gpu_scripts
+bash scripts/run_multinode_nccl_pdf_matrix.sh
+```
+
 单节点 H100 原始 all 报告：
 
 ```bash
diff --git a/scripts/run_multinode_nccl_pdf_matrix.sh b/scripts/run_multinode_nccl_pdf_matrix.sh
new file mode 100755
index 0000000..c61dcab
--- /dev/null
+++ b/scripts/run_multinode_nccl_pdf_matrix.sh
@@ -0,0 +1,142 @@
+#!/usr/bin/env bash
+set -uo pipefail
+
+# Run the formal cross-node NCCL PDF matrix for the current two-node H100 pair.
+# This wrapper standardizes the command, output naming, and preflight hook; the
+# actual benchmark implementation remains in gpu_tester.py / MultiNodeNCCLTest.
+
+SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
+PROJECT_DIR="$(cd -- "$SCRIPT_DIR/.." >/dev/null 2>&1 && pwd)"
+
+PYTHON_BIN="${PYTHON_BIN:-/root/gpu-test-venv/bin/python}"
+CONFIG_FILE="${CONFIG_FILE:-$PROJECT_DIR/configs/multinode_nccl_nccl227_pdf_matrix.yaml}"
+OUT_DIR="${OUT_DIR:-$PROJECT_DIR/reports}"
+FORMAT="${FORMAT:-md}"
+DRY_RUN=0
+RUN_PREFLIGHT=1
+PREFLIGHT_ONLY=0
+
+usage() {
+  cat <<'EOF'
+Usage: run_multinode_nccl_pdf_matrix.sh [options]
+
+Options:
+  --python PATH       Python executable (default: /root/gpu-test-venv/bin/python)
+  --config PATH       Matrix config file (default: configs/multinode_nccl_nccl227_pdf_matrix.yaml)
+  --out-dir PATH      Report output directory (default: reports)
+  --format FORMAT     Report format: md, json, or html (default: md)
+  --no-preflight      Skip scripts/multinode_nccl_deep_diagnose.sh preflight
+  --preflight-only    Run only the preflight check, not the matrix workload
+  --dry-run           Print commands without running them
+  -h, --help          Show this help
+EOF
+}
+
+while (($#)); do
+  case "$1" in
+    --python)
+      PYTHON_BIN="$2"
+      shift 2
+      ;;
+    --config)
+      CONFIG_FILE="$2"
+      shift 2
+      ;;
+    --out-dir)
+      OUT_DIR="$2"
+      shift 2
+      ;;
+    --format)
+      FORMAT="$2"
+      shift 2
+      ;;
+    --no-preflight)
+      RUN_PREFLIGHT=0
+      shift
+      ;;
+    --preflight-only)
+      PREFLIGHT_ONLY=1
+      shift
+      ;;
+    --dry-run)
+      DRY_RUN=1
+      shift
+      ;;
+    -h|--help)
+      usage
+      exit 0
+      ;;
+    *)
+      echo "Unknown argument: $1" >&2
+      usage >&2
+      exit 2
+      ;;
+  esac
+done
+
+if [[ "$FORMAT" != "md" && "$FORMAT" != "json" && "$FORMAT" != "html" ]]; then
+  echo "Unsupported format: $FORMAT" >&2
+  exit 2
+fi
+
+if [[ ! -x "$PYTHON_BIN" ]]; then
+  PYTHON_BIN="$(command -v python3 || true)"
+fi
+
+if [[ -z "$PYTHON_BIN" || ! -x "$PYTHON_BIN" ]]; then
+  echo "Python executable not found. Set --python or PYTHON_BIN." >&2
+  exit 1
+fi
+
+TS="$(date +%Y%m%d_%H%M%S)"
+mkdir -p "$OUT_DIR"
+
+REPORT_FILE="$OUT_DIR/multinode_nccl_pdf_matrix_${TS}.${FORMAT}"
+PREFLIGHT_CMD=(bash "$PROJECT_DIR/scripts/multinode_nccl_deep_diagnose.sh" preflight)
+MATRIX_CMD=(
+  "$PYTHON_BIN" "$PROJECT_DIR/gpu_tester.py"
+  --config "$CONFIG_FILE"
+  --test multinode-nccl
+  --report
+  --format "$FORMAT"
+  --output "$REPORT_FILE"
+)
+
+echo "Project: $PROJECT_DIR"
+echo "Config: $CONFIG_FILE"
+echo "Report: $REPORT_FILE"
+echo "Matrix: 2 nodes x {1,2,4,8} GPUs per node; all_reduce_perf + alltoall_perf; 16G"
+
+if ((DRY_RUN)); then
+  if ((RUN_PREFLIGHT)); then
+    printf 'DRY RUN preflight:'
+    printf ' %q' "${PREFLIGHT_CMD[@]}"
+    printf '\n'
+  fi
+  if ((PREFLIGHT_ONLY)); then
+    exit 0
+  fi
+  printf 'DRY RUN matrix:'
+  printf ' %q' "${MATRIX_CMD[@]}"
+  printf '\n'
+  exit 0
+fi
+
+if ((RUN_PREFLIGHT)); then
+  "${PREFLIGHT_CMD[@]}"
+  preflight_status=$?
+  if ((preflight_status != 0)); then
+    echo "Preflight failed with exit code $preflight_status" >&2
+    exit "$preflight_status"
+  fi
+fi
+
+if ((PREFLIGHT_ONLY)); then
+  exit 0
+fi
+
+"${MATRIX_CMD[@]}"
+status=$?
+
+echo "Report written to: $REPORT_FILE"
+exit "$status"
-- 
2.47.2


From c73d738557cc9ec68d7a25d82be100f70e1edb7b Mon Sep 17 00:00:00 2001
From: cs <shi.chen@robotics.cc>
Date: Sat, 23 May 2026 19:30:14 +0800
Subject: [PATCH 25/41] Record multinode NCCL PDF matrix run

---
 modules/report.py                             | 17 +++-
 ...ts_multinode_nccl_handoff_plan_20260523.md | 22 ++++-
 ...ts_multinode_nccl_latest_index_20260523.md | 29 ++++++-
 ...ltinode_nccl_pdf_matrix_20260523_112247.md | 84 +++++++++++++++++++
 ..._multinode_nccl_pdf_matrix_run_20260523.md | 63 ++++++++++++++
 5 files changed, 205 insertions(+), 10 deletions(-)
 create mode 100644 reports_multinode_nccl_pdf_matrix_20260523_112247.md
 create mode 100644 reports_multinode_nccl_pdf_matrix_run_20260523.md

diff --git a/modules/report.py b/modules/report.py
index b10d1a0..c905d0b 100644
--- a/modules/report.py
+++ b/modules/report.py
@@ -750,8 +750,14 @@ class ReportGenerator:
 
     @staticmethod
     def _overall_acceptance_verdict(summary_items: list[tuple[str, str]]) -> tuple[str, list[tuple[str, str]], list[str]]:
-        """PDF-style machine verdict: every required item must be present and PASS."""
-        required = [
+        """PDF-style verdict for the report scope.
+
+        Full-suite reports require every single-node acceptance item. Standalone
+        reports, such as `--test multinode-nccl`, should only judge the items
+        that were actually requested instead of reporting unrelated evidence as
+        missing.
+        """
+        single_node_required = [
             "GPU Info",
             "Health Check",
             "Memory Bandwidth",
@@ -764,6 +770,13 @@ class ReportGenerator:
             "Training",
         ]
         status_by_name = dict(summary_items)
+        present_single_node = [name for name in single_node_required if name in status_by_name]
+        if len(present_single_node) >= 3:
+            required = list(single_node_required)
+            if "Multi-node NCCL" in status_by_name:
+                required.append("Multi-node NCCL")
+        else:
+            required = list(status_by_name)
         missing = [name for name in required if name not in status_by_name]
         failures = [
             (name, status)
diff --git a/reports_multinode_nccl_handoff_plan_20260523.md b/reports_multinode_nccl_handoff_plan_20260523.md
index 6df9c66..25b78cf 100644
--- a/reports_multinode_nccl_handoff_plan_20260523.md
+++ b/reports_multinode_nccl_handoff_plan_20260523.md
@@ -11,8 +11,9 @@
 | 两台机器可用于 NCCL 的 400G IB rail 是 4 条 | `mlx5_0,mlx5_1,mlx5_6,mlx5_7` 均为 `400 Gb/sec (4X NDR)` |
 | 其他 HCA 不等价 | `mlx5_4/5` 为 100G IB，`mlx5_2/8` 为 25G Ethernet，`mlx5_3/9` DOWN |
 | NCCL 2.27.7 GDR 可用 | GRAPH/NET 日志中 GDR enabled |
-| allreduce 已接近当前 4 rail 物理上限 | `354 GB/s busbw`，反推 `189 GB/s algbw`，接近 4 x 400G 的 `200 GB/s` 单向原始带宽 |
-| alltoall PXN disabled 后 rail 均衡但仍低 | `36-37 GB/s busbw`，每条 rail 约 `19-20 GB/s` |
+| allreduce 已接近当前 4 rail 物理上限 | 最新 PDF matrix 2x8 为 `354.56 GB/s busbw`，反推 `189.10 GB/s algbw`，接近 4 x 400G 的 `200 GB/s` 单向原始带宽 |
+| alltoall PXN disabled 后 rail 均衡但仍低 | 最新 PDF matrix 2x8 为 `36.82 GB/s busbw`，每条 rail 约 `19-20 GB/s` |
+| 正式 PDF matrix 已复跑 | `reports_multinode_nccl_pdf_matrix_20260523_112247.md`，所有 case 正确性通过但性能阈值 FAIL |
 | 没看到硬错误 | 未见 discard、RoCE retrans、slow restart、packet sequence error 等增长 |
 | 当前缺外部 NCCL 网络组件 | 未找到 `libnccl-net*.so*` / `libsharp*.so*`，未见 SHARP/HCOLL 包 |
 
@@ -61,8 +62,19 @@ busbw = algbw * 1.875
 
 建议把当前 2x8 allreduce 的可解释目标按 4 x 400G rail 物理能力重新评估：
 
-- allreduce 当前 `354 GB/s busbw`，反推 `189 GB/s algbw`，接近 `200 GB/s` 单向原始上限。
-- alltoall 当前 `36-37 GB/s` 仍偏低，需要作为独立问题继续排查。
+- allreduce 当前 `354.56 GB/s busbw`，反推 `189.10 GB/s algbw`，接近 `200 GB/s` 单向原始上限。
+- alltoall 当前 `36.82 GB/s` 仍偏低，需要作为独立问题继续排查。
+
+## 最新 PDF matrix 结果
+
+| Topology | AllReduce | AllReduce Target | AllToAll | AllToAll Target |
+|---|---:|---:|---:|---:|
+| 2 nodes x 1 GPU | `47.15` | `48.90` | `24.85` | `27.25` |
+| 2 nodes x 2 GPUs | `136.62` | `136.93` | `47.71` | `54.41` |
+| 2 nodes x 4 GPUs | `335.19` | `335.48` | `72.63` | `73.73` |
+| 2 nodes x 8 GPUs | `354.56` | `491.84` | `36.82` | `76.54` |
+
+所有 case 的 return code 为 `0`，NCCL `Out of bounds values` 为 `0 OK`。因此本轮 FAIL 是性能阈值失败，不是 NCCL 正确性或启动链路失败。
 
 ### C. 如果要继续优化 alltoall
 
@@ -154,6 +166,8 @@ OUT_DIR=/root/test_gpu_scripts/reports/nccl_deep_diag_plugin_check_$(date +%Y%m%
 | 文件 | 用途 |
 |---|---|
 | `reports_multinode_nccl_diagnosis_20260523.md` | 总诊断报告 |
+| `reports_multinode_nccl_pdf_matrix_20260523_112247.md` | 最新多机多卡 PDF matrix 原始报告 |
+| `reports_multinode_nccl_pdf_matrix_run_20260523.md` | 最新多机多卡 PDF matrix 中文摘要 |
 | `reports_multinode_nccl_deep_diagnose_run_20260523.md` | 本轮深度复跑结果 |
 | `reports_multinode_nccl_environment_gap_20260523.md` | 硬件/软件环境等价性缺口 |
 | `reports_multinode_nccl_counter_probe_20260523.md` | RDMA rail/counter 证据 |
diff --git a/reports_multinode_nccl_latest_index_20260523.md b/reports_multinode_nccl_latest_index_20260523.md
index 2d5b2ae..ef9bf8c 100644
--- a/reports_multinode_nccl_latest_index_20260523.md
+++ b/reports_multinode_nccl_latest_index_20260523.md
@@ -6,10 +6,11 @@
 
 当前结论：
 
-- 2 机 4 GPU 档位通过 GPU-NIC 亲和性修正后，已接近 PDF 参考值。
+- 2026-05-23 `11:22` 已完成正式多机多卡 PDF matrix 复跑，原始报告为 `reports_multinode_nccl_pdf_matrix_20260523_112247.md`，中文结论为 `reports_multinode_nccl_pdf_matrix_run_20260523.md`。
+- 2 机 1/2/4 GPU per node 档位已接近 PDF 参考值，但严格按阈值仍 FAIL。
 - 2 机 8 GPU 档位仍未达到 PDF 参考值：
-  - allreduce 当前约 `354 GB/s busbw`，PDF 目标 `491.84 GB/s`。
-  - alltoall 当前约 `36-37 GB/s busbw`，PDF 目标 `76.54 GB/s`。
+  - allreduce 实测 `354.56 GB/s busbw`，PDF 目标 `491.84 GB/s`。
+  - alltoall 实测 `36.82 GB/s busbw`，PDF 目标 `76.54 GB/s`。
 - 当前 2 机 8 GPU 剩余差距不再像是旧 NCCL、GDR disabled、HCA 顺序、SSH/mpirun 或明显坏链路问题。
 - 当前更像是硬件 rail 数量与 PDF 不等价、NCCL net plugin / SHARP 缺失、或跨 Leaf alltoall 网络/图策略问题。
 
@@ -19,7 +20,8 @@
 |---:|---|---|
 | 1 | `reports_multinode_nccl_handoff_plan_20260523.md` | 给网络/硬件/环境侧的交接计划，包含决策树、要问的问题和复跑命令 |
 | 2 | `reports_multinode_nccl_environment_gap_20260523.md` | 说明当前环境为什么不能证明与 PDF 等价，重点是 4 x 400G rail 和缺少 NCCL net plugin / SHARP |
-| 3 | `reports_multinode_nccl_deep_diagnose_run_20260523.md` | 本轮完整深度诊断复跑结果，包含 counter、GRAPH、PXN sweep |
+| 3 | `reports_multinode_nccl_pdf_matrix_run_20260523.md` | 最新正式多机多卡 PDF matrix 结果摘要 |
+| 4 | `reports_multinode_nccl_deep_diagnose_run_20260523.md` | 本轮完整深度诊断复跑结果，包含 counter、GRAPH、PXN sweep |
 
 ## 关键脚本
 
@@ -107,6 +109,14 @@ aikubeworker0012: /root/test_gpu_scripts/reports/nccl_environment_snapshot_aikub
 aikubeworker0016: /root/test_gpu_scripts/reports/nccl_environment_snapshot_aikubeworker0016_20260523_111143.md
 ```
 
+最新多机多卡 PDF matrix：
+
+```text
+aikubeworker0012: /root/test_gpu_scripts/reports/multinode_nccl_pdf_matrix_20260523_112247.md
+local copy: reports_multinode_nccl_pdf_matrix_20260523_112247.md
+summary: reports_multinode_nccl_pdf_matrix_run_20260523.md
+```
+
 ## 当前证据摘要
 
 ### HCA / rail
@@ -142,6 +152,15 @@ libsharp*.so*
 
 ### 深度诊断
 
+正式 PDF matrix 复跑：
+
+| Topology | AllReduce | AllReduce Target | AllToAll | AllToAll Target |
+|---|---:|---:|---:|---:|
+| 2 nodes x 1 GPU | `47.15` | `48.90` | `24.85` | `27.25` |
+| 2 nodes x 2 GPUs | `136.62` | `136.93` | `47.71` | `54.41` |
+| 2 nodes x 4 GPUs | `335.19` | `335.48` | `72.63` | `73.73` |
+| 2 nodes x 8 GPUs | `354.56` | `491.84` | `36.82` | `76.54` |
+
 本轮完整复跑：
 
 | 项目 | 结果 |
@@ -162,6 +181,8 @@ PXN disabled sweep 未发现有效参数：
 |---|---|
 | `reports_multinode_nccl_diagnosis_20260523.md` | 长版总诊断，包含从旧 NCCL/GDR disabled 到 PDF 矩阵对齐的全过程 |
 | `reports_multinode_nccl_pdf_matrix_nccl227.md` | 按 PDF 矩阵跑出的正式 raw report |
+| `reports_multinode_nccl_pdf_matrix_20260523_112247.md` | 最新正式 PDF matrix 原始报告 |
+| `reports_multinode_nccl_pdf_matrix_run_20260523.md` | 最新正式 PDF matrix 中文摘要 |
 | `reports_multinode_nccl_counter_probe_20260523.md` | RDMA rail 和 counter 证据 |
 | `reports_multinode_nccl_alltoall_tuning_20260523.md` | alltoall PXN 和参数 sweep 结论 |
 | `reports_rdma_single_node_summary.md` | 单节点 RDMA/HCA 速率摘要 |
diff --git a/reports_multinode_nccl_pdf_matrix_20260523_112247.md b/reports_multinode_nccl_pdf_matrix_20260523_112247.md
new file mode 100644
index 0000000..e67c8a4
--- /dev/null
+++ b/reports_multinode_nccl_pdf_matrix_20260523_112247.md
@@ -0,0 +1,84 @@
+# GPU Test Report
+
+- **Date:** 2026-05-23T11:26:21.306224
+- **Host:** aikubeworker0012
+
+## Overall Acceptance Verdict
+
+**Result: FAIL**
+
+Missing required evidence:
+- GPU Info
+- Health Check
+- Memory Bandwidth
+- Compute Throughput
+- NVLink/NVSwitch
+- NCCL
+- Stress Test
+- RDMA
+- DCGM
+- Training
+
+## Summary
+
+| Test | Result |
+|------|--------|
+| Multi-node NCCL | FAIL |
+
+## Multi-node NCCL / Cross Leaf
+
+Source: nccl-tests-mpirun | Mode: cross-leaf-pdf-matrix-nccl-2.27.7
+
+- **Hosts:** nccl-gpu-1(172.72.8.12), nccl-gpu-2(172.72.8.16)
+- **Preflight:** PASS
+
+### Multi-node NCCL allreduce
+
+| Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
+|----------|----------------------|-------------|-----------|------------|-----------|--------|
+| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | - | 47.15 GB/s | 16G | 47.18 GB/s | >= 49 GB/s | FAIL |
+| 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs) | - | 136.62 GB/s | 16G | 136.67 GB/s | >= 137 GB/s | FAIL |
+| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | 0,1,4,5 | 335.19 GB/s | 16G | 334.85 GB/s | >= 335 GB/s | FAIL |
+| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | - | 354.56 GB/s | 16G | 354.21 GB/s | >= 492 GB/s | FAIL |
+
+| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
+|----------|--------------|-----------------|------------------|-------------------|
+| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
+| 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
+| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
+| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
+
+| Topology | Return Code | Error / Output Tail |
+|----------|-------------|---------------------|
+| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | 0 | ranks 2 cudaDev 0 busId 18000 - Destroy COMPLETE aikubeworker0016:1321368:1321509 [0] NCCL INFO comm 0x56428b645570 rank 1 nranks 2 cudaDev 0 busId 18000 - Destroy COMPLETE # Out of bounds values : 0 OK # Avg bus bandwidth    : 47.1841  #   |
+| 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs) | 0 | ranks 4 cudaDev 1 busId 2a000 - Destroy COMPLETE aikubeworker0012:2199872:2199936 [0] NCCL INFO comm 0x561da4512280 rank 0 nranks 4 cudaDev 0 busId 18000 - Destroy COMPLETE # Out of bounds values : 0 OK # Avg bus bandwidth    : 136.668  #   |
+| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | 0 | ranks 8 cudaDev 0 busId 18000 - Destroy COMPLETE aikubeworker0016:1321707:1321805 [0] NCCL INFO comm 0x562bad8777a0 rank 4 nranks 8 cudaDev 0 busId 18000 - Destroy COMPLETE # Out of bounds values : 0 OK # Avg bus bandwidth    : 334.846  #   |
+| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | 0 | nks 16 cudaDev 0 busId 18000 - Destroy COMPLETE aikubeworker0016:1321873:1322056 [0] NCCL INFO comm 0x55ba6708f500 rank 8 nranks 16 cudaDev 0 busId 18000 - Destroy COMPLETE # Out of bounds values : 0 OK # Avg bus bandwidth    : 354.211  #   |
+
+### Multi-node NCCL alltoall
+
+| Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
+|----------|----------------------|-------------|-----------|------------|-----------|--------|
+| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | - | 24.85 GB/s | 16G | 24.92 GB/s | >= 27 GB/s | FAIL |
+| 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs) | - | 47.71 GB/s | 16G | 47.93 GB/s | >= 54 GB/s | FAIL |
+| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | 0,1,4,5 | 72.63 GB/s | 16G | 72.67 GB/s | >= 74 GB/s | FAIL |
+| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | - | 36.82 GB/s | 16G | 36.86 GB/s | >= 77 GB/s | FAIL |
+
+| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
+|----------|--------------|-----------------|------------------|-------------------|
+| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
+| 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
+| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
+| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
+
+| Topology | Return Code | Error / Output Tail |
+|----------|-------------|---------------------|
+| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | 0 | nranks 2 cudaDev 0 busId 18000 - Destroy COMPLETE aikubeworker0016:1322113:1322193 [0] NCCL INFO comm 0x55b760411150 rank 1 nranks 2 cudaDev 0 busId 18000 - Destroy COMPLETE # Out of bounds values : 0 OK # Avg bus bandwidth    : 24.917  #   |
+| 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs) | 0 | ker0012:2200344:2200469 [1] NCCL INFO comm 0x55efef439da0 rank 1 nranks 4 cudaDev 1 busId 2a000 - Destroy COMPLETE aikubeworker0016:1322250:1322338 [1] NCCL INFO comm 0x558ecf546380 rank 3 nranks 4 cudaDev 1 busId 2a000 - Destroy COMPLETE   |
+| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | 0 | ranks 8 cudaDev 0 busId 18000 - Destroy COMPLETE aikubeworker0012:2200479:2200573 [0] NCCL INFO comm 0x55db60daef30 rank 0 nranks 8 cudaDev 0 busId 18000 - Destroy COMPLETE # Out of bounds values : 0 OK # Avg bus bandwidth    : 72.6664  #   |
+| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | 0 | r0012:2200587:2200767 [5] NCCL INFO comm 0x5556a6f71620 rank 5 nranks 16 cudaDev 5 busId ab000 - Destroy COMPLETE aikubeworker0012:2200588:2200772 [6] NCCL INFO comm 0x5585a1623170 rank 6 nranks 16 cudaDev 6 busId ba000 - Destroy COMPLETE   |
+
+**Overall: FAIL**
+
+---
+*Generated by GPU Test Suite v0.2.0*
\ No newline at end of file
diff --git a/reports_multinode_nccl_pdf_matrix_run_20260523.md b/reports_multinode_nccl_pdf_matrix_run_20260523.md
new file mode 100644
index 0000000..e04ac0d
--- /dev/null
+++ b/reports_multinode_nccl_pdf_matrix_run_20260523.md
@@ -0,0 +1,63 @@
+# 多机多卡 NCCL PDF 矩阵实测 2026-05-23
+
+执行节点：`aikubeworker0012`
+
+对端节点：`aikubeworker0016`
+
+原始报告：`reports_multinode_nccl_pdf_matrix_20260523_112247.md`
+
+远端报告：`/root/test_gpu_scripts/reports/multinode_nccl_pdf_matrix_20260523_112247.md`
+
+远端日志：`/root/test_gpu_scripts/reports/run_logs/multinode_nccl_pdf_matrix_20260523_112247.log`
+
+执行命令：
+
+```bash
+cd /root/test_gpu_scripts
+bash scripts/run_multinode_nccl_pdf_matrix.sh
+```
+
+## 结论
+
+本轮正式矩阵已跑通，`mpirun`、SSH、`nccl-tests`、GDRDMA、4 条 400G HCA 都可用；失败不是启动失败或功能错误，而是 bus bandwidth 未达到 PDF 阈值。
+
+所有 case 的 return code 都是 `0`，`Out of bounds values` 为 `0 OK`，说明 NCCL 正确性没有报错。FAIL 来自性能阈值。
+
+## Preflight
+
+| 项目 | 结果 |
+|---|---|
+| OpenMPI | PASS，`/usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun` |
+| all_reduce_perf | PASS，`/data/nccl-tests-latest/build/all_reduce_perf` |
+| alltoall_perf | PASS，`/data/nccl-tests-latest/build/alltoall_perf` |
+| SSH 172.72.8.12 | PASS |
+| SSH 172.72.8.16 | PASS |
+| HCA | 两端 `mlx5_0,mlx5_1,mlx5_6,mlx5_7` 均为 `400 Gb/sec (4X NDR)` ACTIVE |
+| NCCL network | IB |
+| GPU Direct RDMA | ENABLED |
+
+## AllReduce
+
+| Topology | Peak Bus BW | Avg Bus BW | PDF Threshold | Gap | Status |
+|---|---:|---:|---:|---:|---|
+| 2 nodes x 1 GPU | 47.15 GB/s | 47.18 GB/s | >= 48.90 GB/s | -1.75 GB/s | FAIL |
+| 2 nodes x 2 GPUs | 136.62 GB/s | 136.67 GB/s | >= 136.93 GB/s | -0.31 GB/s | FAIL |
+| 2 nodes x 4 GPUs | 335.19 GB/s | 334.85 GB/s | >= 335.48 GB/s | -0.29 GB/s | FAIL |
+| 2 nodes x 8 GPUs | 354.56 GB/s | 354.21 GB/s | >= 491.84 GB/s | -137.28 GB/s | FAIL |
+
+## AllToAll
+
+| Topology | Peak Bus BW | Avg Bus BW | PDF Threshold | Gap | Status |
+|---|---:|---:|---:|---:|---|
+| 2 nodes x 1 GPU | 24.85 GB/s | 24.92 GB/s | >= 27.25 GB/s | -2.40 GB/s | FAIL |
+| 2 nodes x 2 GPUs | 47.71 GB/s | 47.93 GB/s | >= 54.41 GB/s | -6.70 GB/s | FAIL |
+| 2 nodes x 4 GPUs | 72.63 GB/s | 72.67 GB/s | >= 73.73 GB/s | -1.10 GB/s | FAIL |
+| 2 nodes x 8 GPUs | 36.82 GB/s | 36.86 GB/s | >= 76.54 GB/s | -39.72 GB/s | FAIL |
+
+## 判断
+
+1. 2x2、2x4 的 AllReduce 已非常接近 PDF 阈值，差距分别只有 `0.31` 和 `0.29 GB/s`。
+2. 2x4 的 AllToAll 也接近阈值，差 `1.10 GB/s`。
+3. 2x8 是主要问题：AllReduce 只有 `354.56 / 491.84`，AllToAll 只有 `36.82 / 76.54`。
+4. 当前环境已经确认只有 4 条 400G IB rail 参与 NCCL，且没有发现外部 NCCL net plugin / SHARP；这仍是解释 2x8 目标不可达或严重掉速的最强证据。
+5. 本轮没有看到 GDR disabled 或 HCA 不可用，所以下一步不应继续纠结 SSH/mpirun/nccl-tests 启动链路，而应对齐 PDF 参考环境的 rail 数量、net plugin/SHARP、交换机跨 Leaf 策略。
-- 
2.47.2


From 7bc15742ea1cd5b38124448df63184c1cadda4d9 Mon Sep 17 00:00:00 2001
From: cs <shi.chen@robotics.cc>
Date: Sat, 23 May 2026 19:33:01 +0800
Subject: [PATCH 26/41] Clarify multinode NCCL report thresholds

---
 modules/report.py                             | 16 ++++++++--
 ...ltinode_nccl_pdf_matrix_20260523_112247.md | 31 +++++++------------
 2 files changed, 24 insertions(+), 23 deletions(-)

diff --git a/modules/report.py b/modules/report.py
index c905d0b..79640c7 100644
--- a/modules/report.py
+++ b/modules/report.py
@@ -439,7 +439,7 @@ class ReportGenerator:
                             if row.get("status") != "PASS"
                         ]
                         failed_sizes_text = ", ".join(failed_sizes) if failed_sizes else "-"
-                        lines.append(f"| {op} | {bw:.1f} | {failed_sizes_text} | >= {req:.0f} | {status} |")
+                        lines.append(f"| {op} | {bw:.1f} | {failed_sizes_text} | >= {_format_gbps(req)} | {status} |")
                     elif isinstance(data, dict) and data.get("error"):
                         lines.append(f"| {op} | - | - | - | ERROR: {data['error']} |")
                 lines.append("")
@@ -457,7 +457,7 @@ class ReportGenerator:
                             f"{row.get('worst_busbw_gbps', 0):.1f} | "
                             f"{row.get('mean_busbw_gbps', 0):.1f} | "
                             f"{row.get('stddev_pct', 0):.2f}% | "
-                            f">= {data.get('min_required_gbps', 0):.0f} | "
+                            f">= {_format_gbps(data.get('min_required_gbps', 0))} | "
                             f"{row.get('status', '?')} |"
                         )
                     lines.append("")
@@ -485,7 +485,7 @@ class ReportGenerator:
                 lines.append("|----------|----------------------|-------------|-----------|------------|-----------|--------|")
                 for topo in data.get("topologies", []):
                     threshold = topo.get("min_required_gbps", 0) or 0
-                    threshold_text = f">= {threshold:.0f} GB/s" if threshold else "-"
+                    threshold_text = f">= {_format_gbps(threshold)} GB/s" if threshold else "-"
                     cuda_visible = topo.get("cuda_visible_devices") or "-"
                     lines.append(
                         f"| {topo.get('label', '')} | {cuda_visible} | {topo.get('peak_busbw_gbps', 0):.2f} GB/s | "
@@ -956,3 +956,13 @@ class ReportGenerator:
                 items.append(("Training", f"{status} ({detail})"))
 
         return items
+
+
+def _format_gbps(value) -> str:
+    try:
+        numeric = float(value)
+    except (TypeError, ValueError):
+        return str(value)
+    if numeric.is_integer():
+        return f"{numeric:.0f}"
+    return f"{numeric:.2f}"
diff --git a/reports_multinode_nccl_pdf_matrix_20260523_112247.md b/reports_multinode_nccl_pdf_matrix_20260523_112247.md
index e67c8a4..8d07aef 100644
--- a/reports_multinode_nccl_pdf_matrix_20260523_112247.md
+++ b/reports_multinode_nccl_pdf_matrix_20260523_112247.md
@@ -7,17 +7,8 @@
 
 **Result: FAIL**
 
-Missing required evidence:
-- GPU Info
-- Health Check
-- Memory Bandwidth
-- Compute Throughput
-- NVLink/NVSwitch
-- NCCL
-- Stress Test
-- RDMA
-- DCGM
-- Training
+Failed or unverified items:
+- Multi-node NCCL: FAIL
 
 ## Summary
 
@@ -36,10 +27,10 @@ Source: nccl-tests-mpirun | Mode: cross-leaf-pdf-matrix-nccl-2.27.7
 
 | Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
 |----------|----------------------|-------------|-----------|------------|-----------|--------|
-| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | - | 47.15 GB/s | 16G | 47.18 GB/s | >= 49 GB/s | FAIL |
-| 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs) | - | 136.62 GB/s | 16G | 136.67 GB/s | >= 137 GB/s | FAIL |
-| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | 0,1,4,5 | 335.19 GB/s | 16G | 334.85 GB/s | >= 335 GB/s | FAIL |
-| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | - | 354.56 GB/s | 16G | 354.21 GB/s | >= 492 GB/s | FAIL |
+| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | - | 47.15 GB/s | 16G | 47.18 GB/s | >= 48.90 GB/s | FAIL |
+| 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs) | - | 136.62 GB/s | 16G | 136.67 GB/s | >= 136.93 GB/s | FAIL |
+| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | 0,1,4,5 | 335.19 GB/s | 16G | 334.85 GB/s | >= 335.48 GB/s | FAIL |
+| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | - | 354.56 GB/s | 16G | 354.21 GB/s | >= 491.84 GB/s | FAIL |
 
 | Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
 |----------|--------------|-----------------|------------------|-------------------|
@@ -59,10 +50,10 @@ Source: nccl-tests-mpirun | Mode: cross-leaf-pdf-matrix-nccl-2.27.7
 
 | Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
 |----------|----------------------|-------------|-----------|------------|-----------|--------|
-| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | - | 24.85 GB/s | 16G | 24.92 GB/s | >= 27 GB/s | FAIL |
-| 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs) | - | 47.71 GB/s | 16G | 47.93 GB/s | >= 54 GB/s | FAIL |
-| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | 0,1,4,5 | 72.63 GB/s | 16G | 72.67 GB/s | >= 74 GB/s | FAIL |
-| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | - | 36.82 GB/s | 16G | 36.86 GB/s | >= 77 GB/s | FAIL |
+| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | - | 24.85 GB/s | 16G | 24.92 GB/s | >= 27.25 GB/s | FAIL |
+| 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs) | - | 47.71 GB/s | 16G | 47.93 GB/s | >= 54.41 GB/s | FAIL |
+| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | 0,1,4,5 | 72.63 GB/s | 16G | 72.67 GB/s | >= 73.73 GB/s | FAIL |
+| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | - | 36.82 GB/s | 16G | 36.86 GB/s | >= 76.54 GB/s | FAIL |
 
 | Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
 |----------|--------------|-----------------|------------------|-------------------|
@@ -81,4 +72,4 @@ Source: nccl-tests-mpirun | Mode: cross-leaf-pdf-matrix-nccl-2.27.7
 **Overall: FAIL**
 
 ---
-*Generated by GPU Test Suite v0.2.0*
\ No newline at end of file
+*Generated by GPU Test Suite v0.2.0*
-- 
2.47.2


From 098d1715f2a4d78153aab14f8087dc47509143c4 Mon Sep 17 00:00:00 2001
From: cs <shi.chen@robotics.cc>
Date: Sat, 23 May 2026 19:36:53 +0800
Subject: [PATCH 27/41] Archive multinode NCCL raw artifacts

---
 docs/multinode_nccl_deep_diagnose_runbook.md    |  8 ++++++++
 modules/report.py                               |  2 ++
 reports_multinode_nccl_handoff_plan_20260523.md |  2 +-
 reports_multinode_nccl_latest_index_20260523.md | 10 +++++++++-
 scripts/run_multinode_nccl_pdf_matrix.sh        |  7 ++++++-
 5 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/docs/multinode_nccl_deep_diagnose_runbook.md b/docs/multinode_nccl_deep_diagnose_runbook.md
index 8bd082e..433d1ce 100644
--- a/docs/multinode_nccl_deep_diagnose_runbook.md
+++ b/docs/multinode_nccl_deep_diagnose_runbook.md
@@ -34,6 +34,14 @@ bash scripts/run_multinode_nccl_pdf_matrix.sh
 它会跑 2 机 x 1/2/4/8 GPU per node 的 `all_reduce_perf` 和 `alltoall_perf`，输出到
 `reports/multinode_nccl_pdf_matrix_YYYYMMDD_HHMMSS.md`。
 
+同时会生成：
+
+```text
+reports/multinode_nccl_pdf_matrix_YYYYMMDD_HHMMSS_artifacts/
+```
+
+每个 case 保存完整 `*.cmd.txt`、`*.stdout.txt`、`*.stderr.txt` 和解析后的 `*.json`，用于复核原始 NCCL 输出。
+
 默认输出目录为：
 
 ```text
diff --git a/modules/report.py b/modules/report.py
index 79640c7..8411521 100644
--- a/modules/report.py
+++ b/modules/report.py
@@ -468,6 +468,8 @@ class ReportGenerator:
         if multinode and not multinode.get("error"):
             lines.append("## Multi-node NCCL / Cross Leaf\n")
             lines.append(f"Source: {multinode.get('source', 'unknown')} | Mode: {multinode.get('mode', 'unknown')}\n")
+            if multinode.get("artifact_dir"):
+                lines.append(f"- **Artifacts:** `{multinode.get('artifact_dir')}`")
             hosts = multinode.get("hosts", [])
             if hosts:
                 host_text = ", ".join(f"{h.get('name') or h.get('addr')}({h.get('addr')})" for h in hosts)
diff --git a/reports_multinode_nccl_handoff_plan_20260523.md b/reports_multinode_nccl_handoff_plan_20260523.md
index 25b78cf..05df781 100644
--- a/reports_multinode_nccl_handoff_plan_20260523.md
+++ b/reports_multinode_nccl_handoff_plan_20260523.md
@@ -176,7 +176,7 @@ OUT_DIR=/root/test_gpu_scripts/reports/nccl_deep_diag_plugin_check_$(date +%Y%m%
 | `scripts/multinode_nccl_deep_diagnose.sh` | 可复跑诊断脚本 |
 | `scripts/nccl_environment_snapshot.sh` | 单节点 HCA/plugin/topo 快照脚本 |
 | `scripts/run_h100_single_node_all.sh` | 单节点原始 `test all` 报告入口 |
-| `scripts/run_multinode_nccl_pdf_matrix.sh` | 多机多卡 PDF 矩阵报告入口 |
+| `scripts/run_multinode_nccl_pdf_matrix.sh` | 多机多卡 PDF 矩阵报告入口；复跑时额外归档每个 case 的完整 `cmd/stdout/stderr/json` |
 | `configs/multinode_nccl_nccl227_pdf_matrix.yaml` | 多机多卡 PDF 矩阵配置 |
 
 ## 当前建议
diff --git a/reports_multinode_nccl_latest_index_20260523.md b/reports_multinode_nccl_latest_index_20260523.md
index ef9bf8c..1aa52ef 100644
--- a/reports_multinode_nccl_latest_index_20260523.md
+++ b/reports_multinode_nccl_latest_index_20260523.md
@@ -30,7 +30,7 @@
 | `scripts/multinode_nccl_deep_diagnose.sh` | 可复跑的多节点 NCCL 深度诊断脚本 |
 | `scripts/nccl_environment_snapshot.sh` | 单节点 NCCL/RDMA 环境等价性快照脚本，不启动 NCCL workload |
 | `scripts/run_h100_single_node_all.sh` | 单节点 H100 `test all` 原始报告入口，默认同时采环境快照 |
-| `scripts/run_multinode_nccl_pdf_matrix.sh` | 多机多卡 PDF 矩阵入口，跑 2 机 x 1/2/4/8 GPU per node 的 allreduce/alltoall |
+| `scripts/run_multinode_nccl_pdf_matrix.sh` | 多机多卡 PDF 矩阵入口，跑 2 机 x 1/2/4/8 GPU per node 的 allreduce/alltoall，并归档每个 case 的 command/stdout/stderr/parsed JSON |
 | `configs/multinode_nccl_nccl227_pdf_matrix.yaml` | 多机多卡 PDF 矩阵配置，固定 NCCL 2.27.7 和 `/data/nccl-tests-latest/build` |
 | `docs/multinode_nccl_deep_diagnose_runbook.md` | 诊断脚本中文 runbook |
 
@@ -117,6 +117,14 @@ local copy: reports_multinode_nccl_pdf_matrix_20260523_112247.md
 summary: reports_multinode_nccl_pdf_matrix_run_20260523.md
 ```
 
+下一次用 `scripts/run_multinode_nccl_pdf_matrix.sh` 复跑时，还会生成：
+
+```text
+/root/test_gpu_scripts/reports/multinode_nccl_pdf_matrix_YYYYMMDD_HHMMSS_artifacts/
+```
+
+目录内按 case 保存完整 `cmd/stdout/stderr/json`，用于给网络/硬件侧复核原始 NCCL 输出。
+
 ## 当前证据摘要
 
 ### HCA / rail
diff --git a/scripts/run_multinode_nccl_pdf_matrix.sh b/scripts/run_multinode_nccl_pdf_matrix.sh
index c61dcab..572ce04 100755
--- a/scripts/run_multinode_nccl_pdf_matrix.sh
+++ b/scripts/run_multinode_nccl_pdf_matrix.sh
@@ -92,6 +92,7 @@ TS="$(date +%Y%m%d_%H%M%S)"
 mkdir -p "$OUT_DIR"
 
 REPORT_FILE="$OUT_DIR/multinode_nccl_pdf_matrix_${TS}.${FORMAT}"
+ARTIFACT_DIR="$OUT_DIR/multinode_nccl_pdf_matrix_${TS}_artifacts"
 PREFLIGHT_CMD=(bash "$PROJECT_DIR/scripts/multinode_nccl_deep_diagnose.sh" preflight)
 MATRIX_CMD=(
   "$PYTHON_BIN" "$PROJECT_DIR/gpu_tester.py"
@@ -105,6 +106,7 @@ MATRIX_CMD=(
 echo "Project: $PROJECT_DIR"
 echo "Config: $CONFIG_FILE"
 echo "Report: $REPORT_FILE"
+echo "Artifacts: $ARTIFACT_DIR"
 echo "Matrix: 2 nodes x {1,2,4,8} GPUs per node; all_reduce_perf + alltoall_perf; 16G"
 
 if ((DRY_RUN)); then
@@ -117,6 +119,7 @@ if ((DRY_RUN)); then
     exit 0
   fi
   printf 'DRY RUN matrix:'
+  printf ' MULTINODE_NCCL_ARTIFACT_DIR=%q' "$ARTIFACT_DIR"
   printf ' %q' "${MATRIX_CMD[@]}"
   printf '\n'
   exit 0
@@ -135,8 +138,10 @@ if ((PREFLIGHT_ONLY)); then
   exit 0
 fi
 
-"${MATRIX_CMD[@]}"
+mkdir -p "$ARTIFACT_DIR"
+MULTINODE_NCCL_ARTIFACT_DIR="$ARTIFACT_DIR" "${MATRIX_CMD[@]}"
 status=$?
 
 echo "Report written to: $REPORT_FILE"
+echo "Artifacts written to: $ARTIFACT_DIR"
 exit "$status"
-- 
2.47.2


From 4d066391298309eec230a74a3d4603e08dde5af1 Mon Sep 17 00:00:00 2001
From: cs <shi.chen@robotics.cc>
Date: Sat, 23 May 2026 19:45:03 +0800
Subject: [PATCH 28/41] Record multinode NCCL artifacts run

---
 ...ts_multinode_nccl_handoff_plan_20260523.md | 23 +++---
 ...ts_multinode_nccl_latest_index_20260523.md | 25 ++++---
 ...ltinode_nccl_pdf_matrix_20260523_113803.md | 75 +++++++++++++++++++
 ...trix_artifacts_manifest_20260523_113803.md | 33 ++++++++
 ..._multinode_nccl_pdf_matrix_run_20260523.md | 32 ++++----
 5 files changed, 154 insertions(+), 34 deletions(-)
 create mode 100644 reports_multinode_nccl_pdf_matrix_20260523_113803.md
 create mode 100644 reports_multinode_nccl_pdf_matrix_artifacts_manifest_20260523_113803.md

diff --git a/reports_multinode_nccl_handoff_plan_20260523.md b/reports_multinode_nccl_handoff_plan_20260523.md
index 05df781..2393c25 100644
--- a/reports_multinode_nccl_handoff_plan_20260523.md
+++ b/reports_multinode_nccl_handoff_plan_20260523.md
@@ -11,9 +11,10 @@
 | 两台机器可用于 NCCL 的 400G IB rail 是 4 条 | `mlx5_0,mlx5_1,mlx5_6,mlx5_7` 均为 `400 Gb/sec (4X NDR)` |
 | 其他 HCA 不等价 | `mlx5_4/5` 为 100G IB，`mlx5_2/8` 为 25G Ethernet，`mlx5_3/9` DOWN |
 | NCCL 2.27.7 GDR 可用 | GRAPH/NET 日志中 GDR enabled |
-| allreduce 已接近当前 4 rail 物理上限 | 最新 PDF matrix 2x8 为 `354.56 GB/s busbw`，反推 `189.10 GB/s algbw`，接近 4 x 400G 的 `200 GB/s` 单向原始带宽 |
-| alltoall PXN disabled 后 rail 均衡但仍低 | 最新 PDF matrix 2x8 为 `36.82 GB/s busbw`，每条 rail 约 `19-20 GB/s` |
-| 正式 PDF matrix 已复跑 | `reports_multinode_nccl_pdf_matrix_20260523_112247.md`，所有 case 正确性通过但性能阈值 FAIL |
+| allreduce 已接近当前 4 rail 物理上限 | 最新 PDF matrix 2x8 为 `353.85 GB/s busbw`，反推 `188.72 GB/s algbw`，接近 4 x 400G 的 `200 GB/s` 单向原始带宽 |
+| alltoall PXN disabled 后 rail 均衡但仍低 | 最新 PDF matrix 2x8 为 `36.83 GB/s busbw`，每条 rail 约 `19-20 GB/s` |
+| 正式 PDF matrix 已复跑 | `reports_multinode_nccl_pdf_matrix_20260523_113803.md`，所有 case 正确性通过；除 2x2 allreduce 外，性能阈值仍 FAIL |
+| 原始 artifacts 已归档 | `/root/test_gpu_scripts/reports/multinode_nccl_pdf_matrix_20260523_113803_artifacts`，每个 case 有完整 `cmd/stdout/stderr/json` |
 | 没看到硬错误 | 未见 discard、RoCE retrans、slow restart、packet sequence error 等增长 |
 | 当前缺外部 NCCL 网络组件 | 未找到 `libnccl-net*.so*` / `libsharp*.so*`，未见 SHARP/HCOLL 包 |
 
@@ -62,17 +63,17 @@ busbw = algbw * 1.875
 
 建议把当前 2x8 allreduce 的可解释目标按 4 x 400G rail 物理能力重新评估：
 
-- allreduce 当前 `354.56 GB/s busbw`，反推 `189.10 GB/s algbw`，接近 `200 GB/s` 单向原始上限。
-- alltoall 当前 `36.82 GB/s` 仍偏低，需要作为独立问题继续排查。
+- allreduce 当前 `353.85 GB/s busbw`，反推 `188.72 GB/s algbw`，接近 `200 GB/s` 单向原始上限。
+- alltoall 当前 `36.83 GB/s` 仍偏低，需要作为独立问题继续排查。
 
 ## 最新 PDF matrix 结果
 
 | Topology | AllReduce | AllReduce Target | AllToAll | AllToAll Target |
 |---|---:|---:|---:|---:|
-| 2 nodes x 1 GPU | `47.15` | `48.90` | `24.85` | `27.25` |
-| 2 nodes x 2 GPUs | `136.62` | `136.93` | `47.71` | `54.41` |
-| 2 nodes x 4 GPUs | `335.19` | `335.48` | `72.63` | `73.73` |
-| 2 nodes x 8 GPUs | `354.56` | `491.84` | `36.82` | `76.54` |
+| 2 nodes x 1 GPU | `47.29` | `48.90` | `24.85` | `27.25` |
+| 2 nodes x 2 GPUs | `137.16` | `136.93` | `47.76` | `54.41` |
+| 2 nodes x 4 GPUs | `335.07` | `335.48` | `72.74` | `73.73` |
+| 2 nodes x 8 GPUs | `353.85` | `491.84` | `36.83` | `76.54` |
 
 所有 case 的 return code 为 `0`，NCCL `Out of bounds values` 为 `0 OK`。因此本轮 FAIL 是性能阈值失败，不是 NCCL 正确性或启动链路失败。
 
@@ -166,8 +167,10 @@ OUT_DIR=/root/test_gpu_scripts/reports/nccl_deep_diag_plugin_check_$(date +%Y%m%
 | 文件 | 用途 |
 |---|---|
 | `reports_multinode_nccl_diagnosis_20260523.md` | 总诊断报告 |
-| `reports_multinode_nccl_pdf_matrix_20260523_112247.md` | 最新多机多卡 PDF matrix 原始报告 |
+| `reports_multinode_nccl_pdf_matrix_20260523_112247.md` | 上一次多机多卡 PDF matrix 原始报告 |
+| `reports_multinode_nccl_pdf_matrix_20260523_113803.md` | 最新带 artifacts 的多机多卡 PDF matrix 原始报告 |
 | `reports_multinode_nccl_pdf_matrix_run_20260523.md` | 最新多机多卡 PDF matrix 中文摘要 |
+| `reports_multinode_nccl_pdf_matrix_artifacts_manifest_20260523_113803.md` | 最新 artifacts manifest 和 checksum |
 | `reports_multinode_nccl_deep_diagnose_run_20260523.md` | 本轮深度复跑结果 |
 | `reports_multinode_nccl_environment_gap_20260523.md` | 硬件/软件环境等价性缺口 |
 | `reports_multinode_nccl_counter_probe_20260523.md` | RDMA rail/counter 证据 |
diff --git a/reports_multinode_nccl_latest_index_20260523.md b/reports_multinode_nccl_latest_index_20260523.md
index 1aa52ef..3864273 100644
--- a/reports_multinode_nccl_latest_index_20260523.md
+++ b/reports_multinode_nccl_latest_index_20260523.md
@@ -6,11 +6,11 @@
 
 当前结论：
 
-- 2026-05-23 `11:22` 已完成正式多机多卡 PDF matrix 复跑，原始报告为 `reports_multinode_nccl_pdf_matrix_20260523_112247.md`，中文结论为 `reports_multinode_nccl_pdf_matrix_run_20260523.md`。
+- 2026-05-23 `11:38` 已完成带 artifacts 的正式多机多卡 PDF matrix 复跑，原始报告为 `reports_multinode_nccl_pdf_matrix_20260523_113803.md`，中文结论为 `reports_multinode_nccl_pdf_matrix_run_20260523.md`，artifact manifest 为 `reports_multinode_nccl_pdf_matrix_artifacts_manifest_20260523_113803.md`。
 - 2 机 1/2/4 GPU per node 档位已接近 PDF 参考值，但严格按阈值仍 FAIL。
 - 2 机 8 GPU 档位仍未达到 PDF 参考值：
-  - allreduce 实测 `354.56 GB/s busbw`，PDF 目标 `491.84 GB/s`。
-  - alltoall 实测 `36.82 GB/s busbw`，PDF 目标 `76.54 GB/s`。
+  - allreduce 实测 `353.85 GB/s busbw`，PDF 目标 `491.84 GB/s`。
+  - alltoall 实测 `36.83 GB/s busbw`，PDF 目标 `76.54 GB/s`。
 - 当前 2 机 8 GPU 剩余差距不再像是旧 NCCL、GDR disabled、HCA 顺序、SSH/mpirun 或明显坏链路问题。
 - 当前更像是硬件 rail 数量与 PDF 不等价、NCCL net plugin / SHARP 缺失、或跨 Leaf alltoall 网络/图策略问题。
 
@@ -112,9 +112,12 @@ aikubeworker0016: /root/test_gpu_scripts/reports/nccl_environment_snapshot_aikub
 最新多机多卡 PDF matrix：
 
 ```text
-aikubeworker0012: /root/test_gpu_scripts/reports/multinode_nccl_pdf_matrix_20260523_112247.md
-local copy: reports_multinode_nccl_pdf_matrix_20260523_112247.md
+aikubeworker0012: /root/test_gpu_scripts/reports/multinode_nccl_pdf_matrix_20260523_113803.md
+artifacts: /root/test_gpu_scripts/reports/multinode_nccl_pdf_matrix_20260523_113803_artifacts
+artifacts tar: /root/test_gpu_scripts/reports/multinode_nccl_pdf_matrix_20260523_113803_artifacts.tar.gz
+local copy: reports_multinode_nccl_pdf_matrix_20260523_113803.md
 summary: reports_multinode_nccl_pdf_matrix_run_20260523.md
+manifest: reports_multinode_nccl_pdf_matrix_artifacts_manifest_20260523_113803.md
 ```
 
 下一次用 `scripts/run_multinode_nccl_pdf_matrix.sh` 复跑时，还会生成：
@@ -164,10 +167,10 @@ libsharp*.so*
 
 | Topology | AllReduce | AllReduce Target | AllToAll | AllToAll Target |
 |---|---:|---:|---:|---:|
-| 2 nodes x 1 GPU | `47.15` | `48.90` | `24.85` | `27.25` |
-| 2 nodes x 2 GPUs | `136.62` | `136.93` | `47.71` | `54.41` |
-| 2 nodes x 4 GPUs | `335.19` | `335.48` | `72.63` | `73.73` |
-| 2 nodes x 8 GPUs | `354.56` | `491.84` | `36.82` | `76.54` |
+| 2 nodes x 1 GPU | `47.29` | `48.90` | `24.85` | `27.25` |
+| 2 nodes x 2 GPUs | `137.16` | `136.93` | `47.76` | `54.41` |
+| 2 nodes x 4 GPUs | `335.07` | `335.48` | `72.74` | `73.73` |
+| 2 nodes x 8 GPUs | `353.85` | `491.84` | `36.83` | `76.54` |
 
 本轮完整复跑：
 
@@ -189,8 +192,10 @@ PXN disabled sweep 未发现有效参数：
 |---|---|
 | `reports_multinode_nccl_diagnosis_20260523.md` | 长版总诊断，包含从旧 NCCL/GDR disabled 到 PDF 矩阵对齐的全过程 |
 | `reports_multinode_nccl_pdf_matrix_nccl227.md` | 按 PDF 矩阵跑出的正式 raw report |
-| `reports_multinode_nccl_pdf_matrix_20260523_112247.md` | 最新正式 PDF matrix 原始报告 |
+| `reports_multinode_nccl_pdf_matrix_20260523_112247.md` | 上一次正式 PDF matrix 原始报告 |
+| `reports_multinode_nccl_pdf_matrix_20260523_113803.md` | 最新带 artifacts 的正式 PDF matrix 原始报告 |
 | `reports_multinode_nccl_pdf_matrix_run_20260523.md` | 最新正式 PDF matrix 中文摘要 |
+| `reports_multinode_nccl_pdf_matrix_artifacts_manifest_20260523_113803.md` | 最新 artifacts manifest 和 checksum |
 | `reports_multinode_nccl_counter_probe_20260523.md` | RDMA rail 和 counter 证据 |
 | `reports_multinode_nccl_alltoall_tuning_20260523.md` | alltoall PXN 和参数 sweep 结论 |
 | `reports_rdma_single_node_summary.md` | 单节点 RDMA/HCA 速率摘要 |
diff --git a/reports_multinode_nccl_pdf_matrix_20260523_113803.md b/reports_multinode_nccl_pdf_matrix_20260523_113803.md
new file mode 100644
index 0000000..06b509e
--- /dev/null
+++ b/reports_multinode_nccl_pdf_matrix_20260523_113803.md
@@ -0,0 +1,75 @@
+# GPU Test Report
+
+- **Date:** 2026-05-23T11:41:35.567886
+- **Host:** aikubeworker0012
+
+## Overall Acceptance Verdict
+
+**Result: FAIL**
+
+Failed or unverified items:
+- Multi-node NCCL: FAIL
+
+## Summary
+
+| Test | Result |
+|------|--------|
+| Multi-node NCCL | FAIL |
+
+## Multi-node NCCL / Cross Leaf
+
+Source: nccl-tests-mpirun | Mode: cross-leaf-pdf-matrix-nccl-2.27.7
+
+- **Artifacts:** `/root/test_gpu_scripts/reports/multinode_nccl_pdf_matrix_20260523_113803_artifacts`
+- **Hosts:** nccl-gpu-1(172.72.8.12), nccl-gpu-2(172.72.8.16)
+- **Preflight:** PASS
+
+### Multi-node NCCL allreduce
+
+| Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
+|----------|----------------------|-------------|-----------|------------|-----------|--------|
+| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | - | 47.29 GB/s | 16G | 47.26 GB/s | >= 48.90 GB/s | FAIL |
+| 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs) | - | 137.16 GB/s | 16G | 137.13 GB/s | >= 136.93 GB/s | PASS |
+| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | 0,1,4,5 | 335.07 GB/s | 16G | 335.02 GB/s | >= 335.48 GB/s | FAIL |
+| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | - | 353.85 GB/s | 16G | 353.85 GB/s | >= 491.84 GB/s | FAIL |
+
+| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
+|----------|--------------|-----------------|------------------|-------------------|
+| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
+| 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
+| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
+| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
+
+| Topology | Return Code | Error / Output Tail |
+|----------|-------------|---------------------|
+| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | 0 | ranks 2 cudaDev 0 busId 18000 - Destroy COMPLETE aikubeworker0012:2203142:2203200 [0] NCCL INFO comm 0x55e463572510 rank 0 nranks 2 cudaDev 0 busId 18000 - Destroy COMPLETE # Out of bounds values : 0 OK # Avg bus bandwidth    : 47.2628  #   |
+| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | 0 | ranks 8 cudaDev 0 busId 18000 - Destroy COMPLETE aikubeworker0012:2203280:2203363 [0] NCCL INFO comm 0x55e2f3808c60 rank 0 nranks 8 cudaDev 0 busId 18000 - Destroy COMPLETE # Out of bounds values : 0 OK # Avg bus bandwidth    : 335.021  #   |
+| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | 0 | nks 16 cudaDev 0 busId 18000 - Destroy COMPLETE aikubeworker0012:2203376:2203528 [0] NCCL INFO comm 0x55a5166a30c0 rank 0 nranks 16 cudaDev 0 busId 18000 - Destroy COMPLETE # Out of bounds values : 0 OK # Avg bus bandwidth    : 353.854  #   |
+
+### Multi-node NCCL alltoall
+
+| Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
+|----------|----------------------|-------------|-----------|------------|-----------|--------|
+| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | - | 24.85 GB/s | 16G | 24.90 GB/s | >= 27.25 GB/s | FAIL |
+| 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs) | - | 47.76 GB/s | 16G | 47.98 GB/s | >= 54.41 GB/s | FAIL |
+| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | 0,1,4,5 | 72.74 GB/s | 16G | 72.80 GB/s | >= 73.73 GB/s | FAIL |
+| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | - | 36.83 GB/s | 16G | 36.85 GB/s | >= 76.54 GB/s | FAIL |
+
+| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
+|----------|--------------|-----------------|------------------|-------------------|
+| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
+| 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
+| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
+| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
+
+| Topology | Return Code | Error / Output Tail |
+|----------|-------------|---------------------|
+| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | 0 | ranks 2 cudaDev 0 busId 18000 - Destroy COMPLETE aikubeworker0012:2203543:2203602 [0] NCCL INFO comm 0x55af2a804ba0 rank 0 nranks 2 cudaDev 0 busId 18000 - Destroy COMPLETE # Out of bounds values : 0 OK # Avg bus bandwidth    : 24.9006  #   |
+| 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs) | 0 | ker0012:2203610:2203792 [1] NCCL INFO comm 0x55e99a564500 rank 1 nranks 4 cudaDev 1 busId 2a000 - Destroy COMPLETE aikubeworker0016:1325607:1325696 [0] NCCL INFO comm 0x55eaaa7389c0 rank 2 nranks 4 cudaDev 0 busId 18000 - Destroy COMPLETE   |
+| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | 0 | ranks 8 cudaDev 0 busId 18000 - Destroy COMPLETE aikubeworker0016:1325765:1325869 [3] NCCL INFO comm 0x55cb0f1c9c10 rank 7 nranks 8 cudaDev 3 busId ab000 - Destroy COMPLETE # Out of bounds values : 0 OK # Avg bus bandwidth    : 72.7968  #   |
+| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | 0 | 0016:1325927:1326140 [2] NCCL INFO comm 0x5627d2adee20 rank 10 nranks 16 cudaDev 2 busId 3a000 - Destroy COMPLETE aikubeworker0016:1325926:1326135 [1] NCCL INFO comm 0x55c00c344ea0 rank 9 nranks 16 cudaDev 1 busId 2a000 - Destroy COMPLETE   |
+
+**Overall: FAIL**
+
+---
+*Generated by GPU Test Suite v0.2.0*
\ No newline at end of file
diff --git a/reports_multinode_nccl_pdf_matrix_artifacts_manifest_20260523_113803.md b/reports_multinode_nccl_pdf_matrix_artifacts_manifest_20260523_113803.md
new file mode 100644
index 0000000..a398123
--- /dev/null
+++ b/reports_multinode_nccl_pdf_matrix_artifacts_manifest_20260523_113803.md
@@ -0,0 +1,33 @@
+# 多机多卡 NCCL PDF Matrix Artifacts Manifest 2026-05-23
+
+- Remote report: `reports/multinode_nccl_pdf_matrix_20260523_113803.md`
+- Remote artifact dir: `reports/multinode_nccl_pdf_matrix_20260523_113803_artifacts`
+- Remote artifact tar: `reports/multinode_nccl_pdf_matrix_20260523_113803_artifacts.tar.gz`
+- Case count: `8`
+- Artifact files: `32`
+
+## Case Summary
+
+| Case | Peak Bus BW | Avg Bus BW | Threshold | Wrong | Return Code | Status |
+|---|---:|---:|---:|---:|---:|---|
+| `allreduce_2x1_2_nodes_x_1_GPU_PDF_2_machines_2_GPUs` | 47.29 | 47.26 | 48.90 | 0 | 0 | FAIL |
+| `allreduce_2x2_2_nodes_x_2_GPUs_PDF_2_machines_4_GPUs` | 137.16 | 137.13 | 136.93 | 0 | 0 | PASS |
+| `allreduce_2x4_2_nodes_x_4_GPUs_PDF_2_machines_8_GPUs` | 335.07 | 335.02 | 335.48 | 0 | 0 | FAIL |
+| `allreduce_2x8_2_nodes_x_8_GPUs_PDF_2_machines_16_GPUs` | 353.85 | 353.85 | 491.84 | 0 | 0 | FAIL |
+| `alltoall_2x1_2_nodes_x_1_GPU_PDF_2_machines_2_GPUs` | 24.85 | 24.90 | 27.25 | 0 | 0 | FAIL |
+| `alltoall_2x2_2_nodes_x_2_GPUs_PDF_2_machines_4_GPUs` | 47.76 | 47.98 | 54.41 | 0 | 0 | FAIL |
+| `alltoall_2x4_2_nodes_x_4_GPUs_PDF_2_machines_8_GPUs` | 72.74 | 72.80 | 73.73 | 0 | 0 | FAIL |
+| `alltoall_2x8_2_nodes_x_8_GPUs_PDF_2_machines_16_GPUs` | 36.83 | 36.85 | 76.54 | 0 | 0 | FAIL |
+
+## Checksums
+
+```text
+682ac637460472d464a0d56ccc0f3335ed7f79a270157a403ebec23b8d9feceb  reports/multinode_nccl_pdf_matrix_20260523_113803.md
+7371fcaf7269f92eb1544e5e63573ebf77f4ae38f668b5b22169ca86e6d603ee  reports/multinode_nccl_pdf_matrix_20260523_113803_artifacts.tar.gz
+```
+
+Per-file artifact checksums are on the remote node at:
+
+```text
+reports/multinode_nccl_pdf_matrix_20260523_113803_artifacts.sha256
+```
diff --git a/reports_multinode_nccl_pdf_matrix_run_20260523.md b/reports_multinode_nccl_pdf_matrix_run_20260523.md
index e04ac0d..0006ea7 100644
--- a/reports_multinode_nccl_pdf_matrix_run_20260523.md
+++ b/reports_multinode_nccl_pdf_matrix_run_20260523.md
@@ -4,11 +4,15 @@
 
 对端节点：`aikubeworker0016`
 
-原始报告：`reports_multinode_nccl_pdf_matrix_20260523_112247.md`
+原始报告：`reports_multinode_nccl_pdf_matrix_20260523_113803.md`
 
-远端报告：`/root/test_gpu_scripts/reports/multinode_nccl_pdf_matrix_20260523_112247.md`
+远端报告：`/root/test_gpu_scripts/reports/multinode_nccl_pdf_matrix_20260523_113803.md`
 
-远端日志：`/root/test_gpu_scripts/reports/run_logs/multinode_nccl_pdf_matrix_20260523_112247.log`
+远端 artifacts：`/root/test_gpu_scripts/reports/multinode_nccl_pdf_matrix_20260523_113803_artifacts`
+
+远端 artifacts tar：`/root/test_gpu_scripts/reports/multinode_nccl_pdf_matrix_20260523_113803_artifacts.tar.gz`
+
+Artifacts manifest：`reports_multinode_nccl_pdf_matrix_artifacts_manifest_20260523_113803.md`
 
 执行命令：
 
@@ -40,24 +44,24 @@ bash scripts/run_multinode_nccl_pdf_matrix.sh
 
 | Topology | Peak Bus BW | Avg Bus BW | PDF Threshold | Gap | Status |
 |---|---:|---:|---:|---:|---|
-| 2 nodes x 1 GPU | 47.15 GB/s | 47.18 GB/s | >= 48.90 GB/s | -1.75 GB/s | FAIL |
-| 2 nodes x 2 GPUs | 136.62 GB/s | 136.67 GB/s | >= 136.93 GB/s | -0.31 GB/s | FAIL |
-| 2 nodes x 4 GPUs | 335.19 GB/s | 334.85 GB/s | >= 335.48 GB/s | -0.29 GB/s | FAIL |
-| 2 nodes x 8 GPUs | 354.56 GB/s | 354.21 GB/s | >= 491.84 GB/s | -137.28 GB/s | FAIL |
+| 2 nodes x 1 GPU | 47.29 GB/s | 47.26 GB/s | >= 48.90 GB/s | -1.61 GB/s | FAIL |
+| 2 nodes x 2 GPUs | 137.16 GB/s | 137.13 GB/s | >= 136.93 GB/s | +0.23 GB/s | PASS |
+| 2 nodes x 4 GPUs | 335.07 GB/s | 335.02 GB/s | >= 335.48 GB/s | -0.41 GB/s | FAIL |
+| 2 nodes x 8 GPUs | 353.85 GB/s | 353.85 GB/s | >= 491.84 GB/s | -137.99 GB/s | FAIL |
 
 ## AllToAll
 
 | Topology | Peak Bus BW | Avg Bus BW | PDF Threshold | Gap | Status |
 |---|---:|---:|---:|---:|---|
-| 2 nodes x 1 GPU | 24.85 GB/s | 24.92 GB/s | >= 27.25 GB/s | -2.40 GB/s | FAIL |
-| 2 nodes x 2 GPUs | 47.71 GB/s | 47.93 GB/s | >= 54.41 GB/s | -6.70 GB/s | FAIL |
-| 2 nodes x 4 GPUs | 72.63 GB/s | 72.67 GB/s | >= 73.73 GB/s | -1.10 GB/s | FAIL |
-| 2 nodes x 8 GPUs | 36.82 GB/s | 36.86 GB/s | >= 76.54 GB/s | -39.72 GB/s | FAIL |
+| 2 nodes x 1 GPU | 24.85 GB/s | 24.90 GB/s | >= 27.25 GB/s | -2.40 GB/s | FAIL |
+| 2 nodes x 2 GPUs | 47.76 GB/s | 47.98 GB/s | >= 54.41 GB/s | -6.65 GB/s | FAIL |
+| 2 nodes x 4 GPUs | 72.74 GB/s | 72.80 GB/s | >= 73.73 GB/s | -0.99 GB/s | FAIL |
+| 2 nodes x 8 GPUs | 36.83 GB/s | 36.85 GB/s | >= 76.54 GB/s | -39.71 GB/s | FAIL |
 
 ## 判断
 
-1. 2x2、2x4 的 AllReduce 已非常接近 PDF 阈值，差距分别只有 `0.31` 和 `0.29 GB/s`。
-2. 2x4 的 AllToAll 也接近阈值，差 `1.10 GB/s`。
-3. 2x8 是主要问题：AllReduce 只有 `354.56 / 491.84`，AllToAll 只有 `36.82 / 76.54`。
+1. 2x2 的 AllReduce 本次过线，2x4 的 AllReduce 非常接近 PDF 阈值，差 `0.41 GB/s`。
+2. 2x4 的 AllToAll 也接近阈值，差 `0.99 GB/s`。
+3. 2x8 是主要问题：AllReduce 只有 `353.85 / 491.84`，AllToAll 只有 `36.83 / 76.54`。
 4. 当前环境已经确认只有 4 条 400G IB rail 参与 NCCL，且没有发现外部 NCCL net plugin / SHARP；这仍是解释 2x8 目标不可达或严重掉速的最强证据。
 5. 本轮没有看到 GDR disabled 或 HCA 不可用，所以下一步不应继续纠结 SSH/mpirun/nccl-tests 启动链路，而应对齐 PDF 参考环境的 rail 数量、net plugin/SHARP、交换机跨 Leaf 策略。
-- 
2.47.2


From e0cb796b0c28c25eafa6c860ce301eb9a82be84d Mon Sep 17 00:00:00 2001
From: cs <shi.chen@robotics.cc>
Date: Sat, 23 May 2026 19:50:51 +0800
Subject: [PATCH 29/41] Analyze multinode NCCL artifact signals

---
 ..._nccl_artifact_signal_analysis_20260523.md | 141 ++++++++++++++++++
 ...ts_multinode_nccl_handoff_plan_20260523.md |   2 +
 ...ts_multinode_nccl_latest_index_20260523.md |   8 +-
 3 files changed, 149 insertions(+), 2 deletions(-)
 create mode 100644 reports_multinode_nccl_artifact_signal_analysis_20260523.md

diff --git a/reports_multinode_nccl_artifact_signal_analysis_20260523.md b/reports_multinode_nccl_artifact_signal_analysis_20260523.md
new file mode 100644
index 0000000..1d8bc64
--- /dev/null
+++ b/reports_multinode_nccl_artifact_signal_analysis_20260523.md
@@ -0,0 +1,141 @@
+# 多机多卡 NCCL Artifacts 信号分析 2026-05-23
+
+## 分析对象
+
+- 本地 artifacts 解包目录：`/private/tmp/nccl_artifacts_113803/multinode_nccl_pdf_matrix_20260523_113803_artifacts`
+- 远端原始报告：`/root/test_gpu_scripts/reports/multinode_nccl_pdf_matrix_20260523_113803.md`
+- 远端 artifacts：`/root/test_gpu_scripts/reports/multinode_nccl_pdf_matrix_20260523_113803_artifacts`
+- 远端 artifacts tar：`/root/test_gpu_scripts/reports/multinode_nccl_pdf_matrix_20260523_113803_artifacts.tar.gz`
+- 本地 manifest：`reports_multinode_nccl_pdf_matrix_artifacts_manifest_20260523_113803.md`
+
+这份文档只看最新正式 PDF matrix 复跑产生的原始 `cmd/stdout/stderr/json`，目的是回答：当前多机多卡 NCCL 是否真的走了 IB/GDRDMA，是否用到了正确 HCA，是否有 SHARP/外部 NCCL net plugin 信号，以及 2x8 失败更像卡在哪一层。
+
+## 一句话结论
+
+最新 artifacts 证明本轮多机多卡测试不是 launch 失败、不是回退 TCP、不是 GDRDMA 没开，也不是 HCA 名字选错；所有 case 都走 `IB`，都识别并启用了 `mlx5_0,mlx5_1,mlx5_6,mlx5_7` 这 4 条 400G rail，NCCL 正确性 `wrong=0`。当前主要缺口仍然是：环境没有外部 NCCL net plugin / SHARP 证据，且 2x8 档位的 PDF 阈值明显高于当前 4 rail 环境可解释能力，alltoall 还存在独立的跨 Leaf 多点通信效率问题。
+
+## Artifacts 信号表
+
+| Case | Peak | Threshold | Status | Plugin missing | NET/IB using | Using network IB | HCA set | GDR HCA set | GDRDMA edges | P2P/CUMEM | SHARP/CollNet | stdout KB |
+|---|---:|---:|---|---:|---:|---:|---|---|---:|---:|---:|---:|
+| allreduce_2x1 1_GPU | 47.29 | 48.90 | FAIL | 2 | 2 | 2 | mlx5_0,mlx5_1,mlx5_6,mlx5_7 | mlx5_0,mlx5_1,mlx5_6,mlx5_7 | 16 | 0 | 0 | 24 |
+| allreduce_2x2 2_GPUs | 137.16 | 136.93 | PASS | 4 | 4 | 4 | mlx5_0,mlx5_1,mlx5_6,mlx5_7 | mlx5_0,mlx5_1,mlx5_6,mlx5_7 | 32 | 32 | 0 | 68 |
+| allreduce_2x4 4_GPUs | 335.07 | 335.48 | FAIL | 8 | 8 | 8 | mlx5_0,mlx5_1,mlx5_6,mlx5_7 | mlx5_0,mlx5_1,mlx5_6,mlx5_7 | 256 | 0 | 0 | 259 |
+| allreduce_2x8 8_GPUs | 353.85 | 491.84 | FAIL | 16 | 16 | 16 | mlx5_0,mlx5_1,mlx5_6,mlx5_7 | mlx5_0,mlx5_1,mlx5_6,mlx5_7 | 256 | 0 | 0 | 410 |
+| alltoall_2x1 1_GPU | 24.85 | 27.25 | FAIL | 2 | 2 | 2 | mlx5_0,mlx5_1,mlx5_6,mlx5_7 | mlx5_0,mlx5_1,mlx5_6,mlx5_7 | 8 | 0 | 0 | 19 |
+| alltoall_2x2 2_GPUs | 47.76 | 54.41 | FAIL | 4 | 4 | 4 | mlx5_0,mlx5_1,mlx5_6,mlx5_7 | mlx5_0,mlx5_1,mlx5_6,mlx5_7 | 24 | 8 | 0 | 52 |
+| alltoall_2x4 4_GPUs | 72.74 | 73.73 | FAIL | 8 | 8 | 8 | mlx5_0,mlx5_1,mlx5_6,mlx5_7 | mlx5_0,mlx5_1,mlx5_6,mlx5_7 | 80 | 48 | 0 | 200 |
+| alltoall_2x8 8_GPUs | 36.83 | 76.54 | FAIL | 16 | 16 | 16 | mlx5_0,mlx5_1,mlx5_6,mlx5_7 | mlx5_0,mlx5_1,mlx5_6,mlx5_7 | 512 | 224 | 0 | 603 |
+
+字段解释：
+
+- `Plugin missing`：日志里的 `NET/Plugin: Could not find: none libnccl-net-none.so.` 次数。当前命令显式设置了 `NCCL_NET_PLUGIN=none`，所以这个信号表示没有使用外部 NCCL net plugin，而不是 NCCL 没有网络。
+- `NET/IB using`：日志里的 `NET/IB : Using ...` 次数，说明每个 rank 初始化时看到的 IB HCA 列表。
+- `Using network IB`：NCCL 最终选择了 `IB` 网络。
+- `GDR HCA set`：出现 `GPU Direct RDMA Enabled for HCA ...` 的 HCA 集合。
+- `GDRDMA edges`：NCCL graph/connection 中经由 `NET/IB/*/GDRDMA` 的跨节点边数量。
+- `P2P/CUMEM`：节点内 GPU 间路径信号，不是跨节点 IB。
+- `SHARP/CollNet`：日志中 `SHARP`、`CollNet`、`HCOLL` 相关信号计数。当前为 0。
+
+## 已排除的问题
+
+### 1. 不是 TCP 回退
+
+所有 8 个 case 都有 `Using network IB`，且每个 rank 均有 `NET/IB : Using ...`。这说明 NCCL 通信路径不是 socket/TCP 回退。
+
+### 2. 不是 HCA 名字选错
+
+所有 case 的 HCA 集合都一致：
+
+```text
+mlx5_0, mlx5_1, mlx5_6, mlx5_7
+```
+
+这与当前配置里的 `NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_6,mlx5_7` 一致，也与前面环境快照中确认的 4 条 400G IB rail 一致。
+
+### 3. 不是 GDRDMA 没开
+
+所有 case 都出现 `GPU Direct RDMA Enabled for HCA ...`，并且跨节点连接里有 `NET/IB/*/GDRDMA` 边。2x8 alltoall 甚至有 512 条 `GDRDMA/Shared` 边，所以不能简单判断为 GDRDMA 被关掉。
+
+### 4. 不是 NCCL 正确性失败
+
+最新 manifest 中 8 个 case 全部：
+
+```text
+returncode = 0
+wrong_count = 0
+```
+
+因此当前 FAIL 是严格 PDF 性能阈值失败，不是结果错误。
+
+## 仍然成立的缺口
+
+### 1. 外部 NCCL net plugin / SHARP 仍缺证据
+
+当前命令中显式设置：
+
+```text
+NCCL_NET_PLUGIN=none
+```
+
+所有 case 均出现 `NET/Plugin: Could not find: none libnccl-net-none.so.`，同时 `SHARP/CollNet` 信号计数为 0。结合前面的环境检查没有找到 `libnccl-net*.so*` / `libsharp*.so*`，当前环境不能证明与 PDF 参考环境的软件栈等价。
+
+### 2. 2x8 allreduce 更像被 4 rail 物理能力卡住
+
+2x8 allreduce：
+
+```text
+当前 busbw = 353.85 GB/s
+PDF 阈值 = 491.84 GB/s
+```
+
+16 rank allreduce 的换算关系是：
+
+```text
+busbw = algbw * 2 * (n - 1) / n = algbw * 1.875  (n = 16)
+```
+
+当前实测反推：
+
+```text
+353.85 / 1.875 = 188.72 GB/s algbw
+```
+
+当前每节点 4 条 400G rail 的理论单向原始带宽约：
+
+```text
+4 * 400 Gb/s / 8 = 200 GB/s
+```
+
+所以 allreduce 已经接近 4 rail 的可解释上限；如果 PDF 阈值来自更多 400G rail 或带 SHARP/plugin 的环境，就不应直接用该阈值判定当前节点不合格。
+
+### 3. 2x8 alltoall 是独立重点问题
+
+2x8 alltoall：
+
+```text
+当前 busbw = 36.83 GB/s
+PDF 阈值 = 76.54 GB/s
+```
+
+alltoall 和 allreduce 使用同一组 HCA，同样走 IB/GDRDMA，但 2x8 alltoall 下降明显。这个现象更像多点到多点流量在当前跨 Leaf 网络、ECMP/adaptive routing、拥塞控制或 NCCL graph 策略下效率不够，而不是单纯 HCA 没起来。
+
+## 下一步建议
+
+1. 先不要继续盲扫 NCCL 小参数。已有 artifacts 说明基础链路已经起来，继续微调环境变量的收益大概率很低。
+2. 向硬件/网络侧确认 PDF 参考环境每节点是否有 8 条 400G rail，以及是否启用了 SHARP、HCOLL 或外部 NCCL net plugin。
+3. 如果验收坚持 PDF 原阈值，应先补齐 plugin/SHARP 或换等价 8 rail 节点复测。
+4. 如果当前硬件形态就是 4 条 400G rail，则 allreduce 阈值应重新定标；alltoall 单独作为跨 Leaf 多点通信效率问题继续排查。
+5. 补齐 plugin/SHARP 后，先去掉或修改当前命令里显式设置的 `NCCL_NET_PLUGIN=none`（否则 NCCL 仍不会加载外部 net plugin，`Plugin missing` 信号也不会消失），再优先复跑：
+
+```bash
+cd /root/test_gpu_scripts
+bash scripts/run_multinode_nccl_pdf_matrix.sh
+```
+
+并对比新旧 artifacts 中：
+
+- `Plugin missing` 是否消失。
+- 是否出现外部 net plugin、SHARP 或 CollNet 信号。
+- 2x8 allreduce 是否突破当前 `353-354 GB/s` 平台。
+- 2x8 alltoall 是否突破当前 `36-37 GB/s` 平台。
diff --git a/reports_multinode_nccl_handoff_plan_20260523.md b/reports_multinode_nccl_handoff_plan_20260523.md
index 2393c25..e91ff01 100644
--- a/reports_multinode_nccl_handoff_plan_20260523.md
+++ b/reports_multinode_nccl_handoff_plan_20260523.md
@@ -15,6 +15,7 @@
 | alltoall PXN disabled 后 rail 均衡但仍低 | 最新 PDF matrix 2x8 为 `36.83 GB/s busbw`，每条 rail 约 `19-20 GB/s` |
 | 正式 PDF matrix 已复跑 | `reports_multinode_nccl_pdf_matrix_20260523_113803.md`，所有 case 正确性通过；除 2x2 allreduce 外，性能阈值仍 FAIL |
 | 原始 artifacts 已归档 | `/root/test_gpu_scripts/reports/multinode_nccl_pdf_matrix_20260523_113803_artifacts`，每个 case 有完整 `cmd/stdout/stderr/json` |
+| artifacts 信号已分析 | `reports_multinode_nccl_artifact_signal_analysis_20260523.md`，确认所有 case 都走 IB/GDRDMA 和 4 条 400G HCA，未见 SHARP/CollNet |
 | 没看到硬错误 | 未见 discard、RoCE retrans、slow restart、packet sequence error 等增长 |
 | 当前缺外部 NCCL 网络组件 | 未找到 `libnccl-net*.so*` / `libsharp*.so*`，未见 SHARP/HCOLL 包 |
 
@@ -171,6 +172,7 @@ OUT_DIR=/root/test_gpu_scripts/reports/nccl_deep_diag_plugin_check_$(date +%Y%m%
 | `reports_multinode_nccl_pdf_matrix_20260523_113803.md` | 最新带 artifacts 的多机多卡 PDF matrix 原始报告 |
 | `reports_multinode_nccl_pdf_matrix_run_20260523.md` | 最新多机多卡 PDF matrix 中文摘要 |
 | `reports_multinode_nccl_pdf_matrix_artifacts_manifest_20260523_113803.md` | 最新 artifacts manifest 和 checksum |
+| `reports_multinode_nccl_artifact_signal_analysis_20260523.md` | 最新 artifacts 的 IB/GDRDMA/HCA/plugin/SHARP 信号分析 |
 | `reports_multinode_nccl_deep_diagnose_run_20260523.md` | 本轮深度复跑结果 |
 | `reports_multinode_nccl_environment_gap_20260523.md` | 硬件/软件环境等价性缺口 |
 | `reports_multinode_nccl_counter_probe_20260523.md` | RDMA rail/counter 证据 |
diff --git a/reports_multinode_nccl_latest_index_20260523.md b/reports_multinode_nccl_latest_index_20260523.md
index 3864273..2ff15e1 100644
--- a/reports_multinode_nccl_latest_index_20260523.md
+++ b/reports_multinode_nccl_latest_index_20260523.md
@@ -7,6 +7,7 @@
 当前结论：
 
 - 2026-05-23 `11:38` 已完成带 artifacts 的正式多机多卡 PDF matrix 复跑，原始报告为 `reports_multinode_nccl_pdf_matrix_20260523_113803.md`，中文结论为 `reports_multinode_nccl_pdf_matrix_run_20260523.md`，artifact manifest 为 `reports_multinode_nccl_pdf_matrix_artifacts_manifest_20260523_113803.md`。
+- 已补充 artifacts 信号分析：`reports_multinode_nccl_artifact_signal_analysis_20260523.md`。结论是所有 case 都走 `IB`，都使用 `mlx5_0,mlx5_1,mlx5_6,mlx5_7`，都有 GDRDMA 信号，但没有 SHARP/CollNet/外部 NCCL net plugin 证据。
 - 2 机 1/2/4 GPU per node 档位已接近 PDF 参考值，但严格按阈值仍 FAIL。
 - 2 机 8 GPU 档位仍未达到 PDF 参考值：
   - allreduce 实测 `353.85 GB/s busbw`，PDF 目标 `491.84 GB/s`。
@@ -20,8 +21,9 @@
 |---:|---|---|
 | 1 | `reports_multinode_nccl_handoff_plan_20260523.md` | 给网络/硬件/环境侧的交接计划，包含决策树、要问的问题和复跑命令 |
 | 2 | `reports_multinode_nccl_environment_gap_20260523.md` | 说明当前环境为什么不能证明与 PDF 等价，重点是 4 x 400G rail 和缺少 NCCL net plugin / SHARP |
-| 3 | `reports_multinode_nccl_pdf_matrix_run_20260523.md` | 最新正式多机多卡 PDF matrix 结果摘要 |
-| 4 | `reports_multinode_nccl_deep_diagnose_run_20260523.md` | 本轮完整深度诊断复跑结果，包含 counter、GRAPH、PXN sweep |
+| 3 | `reports_multinode_nccl_artifact_signal_analysis_20260523.md` | 最新 artifacts 信号分析，确认 IB/GDRDMA/HCA 使用情况和 plugin/SHARP 缺口 |
+| 4 | `reports_multinode_nccl_pdf_matrix_run_20260523.md` | 最新正式多机多卡 PDF matrix 结果摘要 |
+| 5 | `reports_multinode_nccl_deep_diagnose_run_20260523.md` | 本轮完整深度诊断复跑结果，包含 counter、GRAPH、PXN sweep |
 
 ## 关键脚本
 
@@ -85,6 +87,7 @@ OUT_DIR=/root/test_gpu_scripts/reports/nccl_deep_diag_plugin_check_$(date +%Y%m%
 ```text
 /root/test_gpu_scripts/reports_multinode_nccl_handoff_plan_20260523.md
 /root/test_gpu_scripts/reports_multinode_nccl_environment_gap_20260523.md
+/root/test_gpu_scripts/reports_multinode_nccl_artifact_signal_analysis_20260523.md
 /root/test_gpu_scripts/reports_multinode_nccl_deep_diagnose_run_20260523.md
 ```
 
@@ -196,6 +199,7 @@ PXN disabled sweep 未发现有效参数：
 | `reports_multinode_nccl_pdf_matrix_20260523_113803.md` | 最新带 artifacts 的正式 PDF matrix 原始报告 |
 | `reports_multinode_nccl_pdf_matrix_run_20260523.md` | 最新正式 PDF matrix 中文摘要 |
 | `reports_multinode_nccl_pdf_matrix_artifacts_manifest_20260523_113803.md` | 最新 artifacts manifest 和 checksum |
+| `reports_multinode_nccl_artifact_signal_analysis_20260523.md` | 最新 artifacts 的 IB/GDRDMA/HCA/plugin/SHARP 信号分析 |
 | `reports_multinode_nccl_counter_probe_20260523.md` | RDMA rail 和 counter 证据 |
 | `reports_multinode_nccl_alltoall_tuning_20260523.md` | alltoall PXN 和参数 sweep 结论 |
 | `reports_rdma_single_node_summary.md` | 单节点 RDMA/HCA 速率摘要 |
-- 
2.47.2


From c2db68f608dd34d2adb01595986b6a5bb4cb83f0 Mon Sep 17 00:00:00 2001
From: cs <shi.chen@robotics.cc>
Date: Sat, 23 May 2026 20:07:47 +0800
Subject: [PATCH 30/41] Add multinode NCCL all collectives run

---
 ...node_nccl_nccl227_all_collectives_2x8.yaml |  72 +++++++++
 ...de_nccl_all_collectives_20260523_120144.md |  98 ++++++++++++
 ...inode_nccl_all_collectives_run_20260523.md |  49 ++++++
 ...ts_multinode_nccl_handoff_plan_20260523.md |  14 ++
 ...ts_multinode_nccl_latest_index_20260523.md |  35 ++++-
 scripts/run_multinode_nccl_all_collectives.sh | 147 ++++++++++++++++++
 6 files changed, 413 insertions(+), 2 deletions(-)
 create mode 100644 configs/multinode_nccl_nccl227_all_collectives_2x8.yaml
 create mode 100644 reports_multinode_nccl_all_collectives_20260523_120144.md
 create mode 100644 reports_multinode_nccl_all_collectives_run_20260523.md
 create mode 100755 scripts/run_multinode_nccl_all_collectives.sh

diff --git a/configs/multinode_nccl_nccl227_all_collectives_2x8.yaml b/configs/multinode_nccl_nccl227_all_collectives_2x8.yaml
new file mode 100644
index 0000000..1e5d464
--- /dev/null
+++ b/configs/multinode_nccl_nccl227_all_collectives_2x8.yaml
@@ -0,0 +1,72 @@
+tools:
+  install_dir: /opt/gpu-test-tools
+
+report:
+  output_dir: ./reports
+  format: md
+
+multinode_nccl:
+  enabled: true
+  mode: cross-leaf-all-collectives-nccl-2.27.7
+  hosts:
+    - name: nccl-gpu-1
+      addr: 172.72.8.12
+      slots: 8
+    - name: nccl-gpu-2
+      addr: 172.72.8.16
+      slots: 8
+  ssh_user: root
+  ssh_preflight: true
+  mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun
+  mpi_ld_preload: null
+  extra_ld_library_path:
+    - /usr/mpi/gcc/openmpi-4.1.9a1/lib
+    - /tmp/nccl-2.27.7-cuda12.4/usr/lib/x86_64-linux-gnu
+    - /usr/local/cuda-12.4/targets/x86_64-linux/lib
+  nccl_tests_dir: /data/nccl-tests-latest/build
+  tests:
+    - all_reduce_perf
+    - alltoall_perf
+    - broadcast_perf
+    - reduce_scatter_perf
+    - all_gather_perf
+    - sendrecv_perf
+  topologies:
+    - nodes: 2
+      gpus_per_node: 8
+      label: 2 nodes x 8 GPUs (all collectives evidence run)
+      op_env:
+        alltoall:
+          NCCL_PXN_DISABLE: 1
+  begin_size: 16G
+  end_size: 16G
+  step_factor: 2
+  warmup_iters: 10
+  gpus_per_rank: 1
+  timeout_sec: 1800
+  debug: INFO
+  socket_ifname: bond0
+  oob_tcp_ifname: bond0
+  plm_rsh_args: "-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o ServerAliveInterval=30"
+  ib_gid_index: 3
+  ib_sl: 5
+  ib_tc: 136
+  ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7
+  ib_timeout: 22
+  qps_per_connection: null
+  min_nchannels: null
+  net_plugin: none
+  nvls_enable: 1
+  split_data_on_qps: null
+  extra_env:
+    NCCL_DEBUG_SUBSYS: INIT,NET
+    NCCL_NET_GDR_LEVEL: 5
+    NCCL_NET_GDR_READ: 1
+    NCCL_DMABUF_ENABLE: 0
+  min_peak_busbw_gbps:
+    allreduce: 491.84
+    alltoall: 76.54
+    broadcast: 0
+    reducescatter: 0
+    allgather: 0
+    sendrecv: 0
diff --git a/reports_multinode_nccl_all_collectives_20260523_120144.md b/reports_multinode_nccl_all_collectives_20260523_120144.md
new file mode 100644
index 0000000..2b1d604
--- /dev/null
+++ b/reports_multinode_nccl_all_collectives_20260523_120144.md
@@ -0,0 +1,98 @@
+# GPU Test Report
+
+- **Date:** 2026-05-23T12:04:48.257734
+- **Host:** aikubeworker0012
+
+## Overall Acceptance Verdict
+
+**Result: FAIL**
+
+Failed or unverified items:
+- Multi-node NCCL: FAIL
+
+## Summary
+
+| Test | Result |
+|------|--------|
+| Multi-node NCCL | FAIL |
+
+## Multi-node NCCL / Cross Leaf
+
+Source: nccl-tests-mpirun | Mode: cross-leaf-all-collectives-nccl-2.27.7
+
+- **Artifacts:** `/root/test_gpu_scripts/reports/multinode_nccl_all_collectives_20260523_120144_artifacts`
+- **Hosts:** nccl-gpu-1(172.72.8.12), nccl-gpu-2(172.72.8.16)
+- **Preflight:** PASS
+
+### Multi-node NCCL allreduce
+
+| Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
+|----------|----------------------|-------------|-----------|------------|-----------|--------|
+| 2 nodes x 8 GPUs (all collectives evidence run) | - | 354.27 GB/s | 16G | 354.45 GB/s | >= 491.84 GB/s | FAIL |
+
+| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
+|----------|--------------|-----------------|------------------|-------------------|
+| 2 nodes x 8 GPUs (all collectives evidence run) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
+
+| Topology | Return Code | Error / Output Tail |
+|----------|-------------|---------------------|
+| 2 nodes x 8 GPUs (all collectives evidence run) | 0 | nks 16 cudaDev 0 busId 18000 - Destroy COMPLETE aikubeworker0012:2208791:2208941 [0] NCCL INFO comm 0x557970d9f5f0 rank 0 nranks 16 cudaDev 0 busId 18000 - Destroy COMPLETE # Out of bounds values : 0 OK # Avg bus bandwidth    : 354.452  #   |
+
+### Multi-node NCCL alltoall
+
+| Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
+|----------|----------------------|-------------|-----------|------------|-----------|--------|
+| 2 nodes x 8 GPUs (all collectives evidence run) | - | 37.00 GB/s | 16G | 37.14 GB/s | >= 76.54 GB/s | FAIL |
+
+| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
+|----------|--------------|-----------------|------------------|-------------------|
+| 2 nodes x 8 GPUs (all collectives evidence run) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
+
+| Topology | Return Code | Error / Output Tail |
+|----------|-------------|---------------------|
+| 2 nodes x 8 GPUs (all collectives evidence run) | 0 | r0012:2208962:2209141 [5] NCCL INFO comm 0x564c4f9c4a30 rank 5 nranks 16 cudaDev 5 busId ab000 - Destroy COMPLETE aikubeworker0012:2208963:2209143 [6] NCCL INFO comm 0x56328e52f270 rank 6 nranks 16 cudaDev 6 busId ba000 - Destroy COMPLETE   |
+
+### Multi-node NCCL broadcast
+
+| Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
+|----------|----------------------|-------------|-----------|------------|-----------|--------|
+| 2 nodes x 8 GPUs (all collectives evidence run) | - | 191.65 GB/s | 16G | 190.25 GB/s | - | PASS |
+
+| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
+|----------|--------------|-----------------|------------------|-------------------|
+| 2 nodes x 8 GPUs (all collectives evidence run) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
+
+### Multi-node NCCL reducescatter
+
+| Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
+|----------|----------------------|-------------|-----------|------------|-----------|--------|
+| 2 nodes x 8 GPUs (all collectives evidence run) | - | 192.75 GB/s | 16G | 192.74 GB/s | - | PASS |
+
+| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
+|----------|--------------|-----------------|------------------|-------------------|
+| 2 nodes x 8 GPUs (all collectives evidence run) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
+
+### Multi-node NCCL allgather
+
+| Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
+|----------|----------------------|-------------|-----------|------------|-----------|--------|
+| 2 nodes x 8 GPUs (all collectives evidence run) | - | 192.14 GB/s | 16G | 192.47 GB/s | - | PASS |
+
+| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
+|----------|--------------|-----------------|------------------|-------------------|
+| 2 nodes x 8 GPUs (all collectives evidence run) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
+
+### Multi-node NCCL sendrecv
+
+| Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
+|----------|----------------------|-------------|-----------|------------|-----------|--------|
+| 2 nodes x 8 GPUs (all collectives evidence run) | - | 26.98 GB/s | 16G | 26.97 GB/s | - | PASS |
+
+| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
+|----------|--------------|-----------------|------------------|-------------------|
+| 2 nodes x 8 GPUs (all collectives evidence run) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
+
+**Overall: FAIL**
+
+---
+*Generated by GPU Test Suite v0.2.0*
\ No newline at end of file
diff --git a/reports_multinode_nccl_all_collectives_run_20260523.md b/reports_multinode_nccl_all_collectives_run_20260523.md
new file mode 100644
index 0000000..9468190
--- /dev/null
+++ b/reports_multinode_nccl_all_collectives_run_20260523.md
@@ -0,0 +1,49 @@
+# 多机多卡 NCCL 六项 Collective 补测结果 2026-05-23
+
+## 测试对象
+
+- 节点：`nccl-gpu-1(172.72.8.12)` + `nccl-gpu-2(172.72.8.16)`
+- 拓扑：`2 nodes x 8 GPUs`
+- NCCL：`2.27.7`
+- nccl-tests：`/data/nccl-tests-latest/build`
+- 配置：`configs/multinode_nccl_nccl227_all_collectives_2x8.yaml`
+- 入口：`scripts/run_multinode_nccl_all_collectives.sh`
+- 远端报告：`/root/test_gpu_scripts/reports/multinode_nccl_all_collectives_20260523_120144.md`
+- 远端 artifacts：`/root/test_gpu_scripts/reports/multinode_nccl_all_collectives_20260523_120144_artifacts`
+- 本地报告：`reports_multinode_nccl_all_collectives_20260523_120144.md`
+
+## 一句话结论
+
+这次补测已经把单机 `test all` 中的 6 个 NCCL collective 扩展到了多机 2x8 场景：`allreduce/alltoall/broadcast/reducescatter/allgather/sendrecv` 都能跑通，`returncode=0`、`wrong_count=0`，并且都走 `IB + GDRDMA`。按已知 PDF 2x8 阈值，`allreduce` 和 `alltoall` 仍 FAIL；新增的 4 项（`broadcast/reducescatter/allgather/sendrecv`）当前没有 PDF 跨节点阈值，因此只作为证据采集项，不判定生产验收性能。
+
+## 结果表
+
+| Operation | Peak Bus BW | Threshold | Correctness | Network | Status |
+|---|---:|---:|---|---|---|
+| allreduce | `354.27 GB/s` | `>= 491.84 GB/s` | `wrong=0` | `IB/GDRDMA` | FAIL |
+| alltoall | `37.00 GB/s` | `>= 76.54 GB/s` | `wrong=0` | `IB/GDRDMA` | FAIL |
+| broadcast | `191.65 GB/s` | 未配置 | `wrong=0` | `IB/GDRDMA` | PASS evidence |
+| reducescatter | `192.75 GB/s` | 未配置 | `wrong=0` | `IB/GDRDMA` | PASS evidence |
+| allgather | `192.14 GB/s` | 未配置 | `wrong=0` | `IB/GDRDMA` | PASS evidence |
+| sendrecv | `26.98 GB/s` | 未配置 | `wrong=0` | `IB/GDRDMA` | PASS evidence |
+
+## 怎么解读
+
+1. 这次不是替代 PDF matrix，而是补齐多机多卡 collective 覆盖面。
+2. `allreduce/alltoall` 继续沿用已知 PDF 2x8 阈值，所以报告整体是 `FAIL`。
+3. `broadcast/reducescatter/allgather/sendrecv` 当前只能证明“多机 2x8 能跑通、正确性校验 wrong=0、走 IB/GDRDMA”，还不能证明生产性能达标，因为手头的 PDF matrix 没有给出这 4 项的跨节点阈值。
+4. 新增 4 项的带宽大致呈现两个层次：
+   - `broadcast/reducescatter/allgather` 在 `191-193 GB/s`，接近当前 4 x 400G rail 的单向原始上限（4 x 400 Gb/s = 1600 Gb/s ≈ 200 GB/s）。
+   - `sendrecv` 只有 `26.98 GB/s`，需要结合 sendrecv 的 traffic pattern 单独解读，不能直接和 allreduce busbw 混比。
+
+## 校验信息
+
+```text
+06c565281813c4260da9cfee8f0b0289b61b3be95c01dd670c71fa1a441133e3  reports/multinode_nccl_all_collectives_20260523_120144.md
+020eb35ddc5933da78b5c00c1b6fc25b11b23c4505300276d9736fbe8a35519b  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/allgather_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json
+47f68b7510df3b472e7ac0ec2fb53dcefbe687bb4de0c889f8947cc652d09e61  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/allreduce_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json
+fa2828cdfcb86e6715a17c8bf45de10ce421c12f0877efff9bafb218b2f00df3  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/alltoall_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json
+077fec1bf498fd202e2866f1cf6fb4502ac8d1bafba156f213453b21f6a6df2b  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/broadcast_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json
+be24943eb4b63e304cee41831adeb23ffbbc0e890ff19b067e06d6a4b48b2d90  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/reducescatter_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json
+4560364922a85d21827357b906491aae8283c6148ff1c0e0f0dc379a68307fdd  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/sendrecv_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json
+```
diff --git a/reports_multinode_nccl_handoff_plan_20260523.md b/reports_multinode_nccl_handoff_plan_20260523.md
index e91ff01..80b27c5 100644
--- a/reports_multinode_nccl_handoff_plan_20260523.md
+++ b/reports_multinode_nccl_handoff_plan_20260523.md
@@ -16,6 +16,7 @@
 | 正式 PDF matrix 已复跑 | `reports_multinode_nccl_pdf_matrix_20260523_113803.md`，所有 case 正确性通过；除 2x2 allreduce 外，性能阈值仍 FAIL |
 | 原始 artifacts 已归档 | `/root/test_gpu_scripts/reports/multinode_nccl_pdf_matrix_20260523_113803_artifacts`，每个 case 有完整 `cmd/stdout/stderr/json` |
 | artifacts 信号已分析 | `reports_multinode_nccl_artifact_signal_analysis_20260523.md`，确认所有 case 都走 IB/GDRDMA 和 4 条 400G HCA，未见 SHARP/CollNet |
+| 多机六项 collective 已补测 | `reports_multinode_nccl_all_collectives_run_20260523.md`，2x8 下 6 项均正确性通过，allreduce/alltoall 按 PDF 阈值仍 FAIL |
 | 没看到硬错误 | 未见 discard、RoCE retrans、slow restart、packet sequence error 等增长 |
 | 当前缺外部 NCCL 网络组件 | 未找到 `libnccl-net*.so*` / `libsharp*.so*`，未见 SHARP/HCOLL 包 |
 
@@ -140,6 +141,15 @@ cd /root/test_gpu_scripts
 bash scripts/run_multinode_nccl_pdf_matrix.sh
 ```
 
+### 多机多卡 2x8 六项 collective 补测
+
+```bash
+cd /root/test_gpu_scripts
+bash scripts/run_multinode_nccl_all_collectives.sh
+```
+
+说明：这个入口用于补齐单机 `test all` 中已有、但多机 PDF matrix 还没覆盖的 NCCL collective。已知 PDF 2x8 阈值仍用于 `allreduce/alltoall`；新增的 `broadcast/reducescatter/allgather/sendrecv` 暂作为证据采集项，不强行套 PDF allreduce/alltoall 阈值。
+
 ### 完整深度诊断
 
 ```bash
@@ -173,6 +183,8 @@ OUT_DIR=/root/test_gpu_scripts/reports/nccl_deep_diag_plugin_check_$(date +%Y%m%
 | `reports_multinode_nccl_pdf_matrix_run_20260523.md` | 最新多机多卡 PDF matrix 中文摘要 |
 | `reports_multinode_nccl_pdf_matrix_artifacts_manifest_20260523_113803.md` | 最新 artifacts manifest 和 checksum |
 | `reports_multinode_nccl_artifact_signal_analysis_20260523.md` | 最新 artifacts 的 IB/GDRDMA/HCA/plugin/SHARP 信号分析 |
+| `reports_multinode_nccl_all_collectives_20260523_120144.md` | 最新多机多卡 2x8 六项 collective 原始报告 |
+| `reports_multinode_nccl_all_collectives_run_20260523.md` | 最新多机多卡 2x8 六项 collective 中文摘要 |
 | `reports_multinode_nccl_deep_diagnose_run_20260523.md` | 本轮深度复跑结果 |
 | `reports_multinode_nccl_environment_gap_20260523.md` | 硬件/软件环境等价性缺口 |
 | `reports_multinode_nccl_counter_probe_20260523.md` | RDMA rail/counter 证据 |
@@ -182,7 +194,9 @@ OUT_DIR=/root/test_gpu_scripts/reports/nccl_deep_diag_plugin_check_$(date +%Y%m%
 | `scripts/nccl_environment_snapshot.sh` | 单节点 HCA/plugin/topo 快照脚本 |
 | `scripts/run_h100_single_node_all.sh` | 单节点原始 `test all` 报告入口 |
 | `scripts/run_multinode_nccl_pdf_matrix.sh` | 多机多卡 PDF 矩阵报告入口；复跑时额外归档每个 case 的完整 `cmd/stdout/stderr/json` |
+| `scripts/run_multinode_nccl_all_collectives.sh` | 多机多卡 2x8 六项 collective 补测入口；复跑时额外归档每个 case 的完整 `cmd/stdout/stderr/json` |
 | `configs/multinode_nccl_nccl227_pdf_matrix.yaml` | 多机多卡 PDF 矩阵配置 |
+| `configs/multinode_nccl_nccl227_all_collectives_2x8.yaml` | 多机多卡 2x8 六项 collective 补测配置 |
 
 ## 当前建议
 
diff --git a/reports_multinode_nccl_latest_index_20260523.md b/reports_multinode_nccl_latest_index_20260523.md
index 2ff15e1..ebc3481 100644
--- a/reports_multinode_nccl_latest_index_20260523.md
+++ b/reports_multinode_nccl_latest_index_20260523.md
@@ -8,6 +8,7 @@
 
 - 2026-05-23 `11:38` 已完成带 artifacts 的正式多机多卡 PDF matrix 复跑，原始报告为 `reports_multinode_nccl_pdf_matrix_20260523_113803.md`，中文结论为 `reports_multinode_nccl_pdf_matrix_run_20260523.md`，artifact manifest 为 `reports_multinode_nccl_pdf_matrix_artifacts_manifest_20260523_113803.md`。
 - 已补充 artifacts 信号分析：`reports_multinode_nccl_artifact_signal_analysis_20260523.md`。结论是所有 case 都走 `IB`，都使用 `mlx5_0,mlx5_1,mlx5_6,mlx5_7`，都有 GDRDMA 信号，但没有 SHARP/CollNet/外部 NCCL net plugin 证据。
+- 已补充并实跑多机多卡 2x8 六项 collective：`reports_multinode_nccl_all_collectives_run_20260523.md`。新增 `broadcast/reducescatter/allgather/sendrecv` 均 `returncode=0`、`wrong=0`、走 `IB/GDRDMA`；已知 PDF 阈值项 `allreduce/alltoall` 仍 FAIL。
 - 2 机 1/2/4 GPU per node 档位已接近 PDF 参考值，但严格按阈值仍 FAIL。
 - 2 机 8 GPU 档位仍未达到 PDF 参考值：
   - allreduce 实测 `353.85 GB/s busbw`，PDF 目标 `491.84 GB/s`。
@@ -22,8 +23,9 @@
 | 1 | `reports_multinode_nccl_handoff_plan_20260523.md` | 给网络/硬件/环境侧的交接计划，包含决策树、要问的问题和复跑命令 |
 | 2 | `reports_multinode_nccl_environment_gap_20260523.md` | 说明当前环境为什么不能证明与 PDF 等价，重点是 4 x 400G rail 和缺少 NCCL net plugin / SHARP |
 | 3 | `reports_multinode_nccl_artifact_signal_analysis_20260523.md` | 最新 artifacts 信号分析，确认 IB/GDRDMA/HCA 使用情况和 plugin/SHARP 缺口 |
-| 4 | `reports_multinode_nccl_pdf_matrix_run_20260523.md` | 最新正式多机多卡 PDF matrix 结果摘要 |
-| 5 | `reports_multinode_nccl_deep_diagnose_run_20260523.md` | 本轮完整深度诊断复跑结果，包含 counter、GRAPH、PXN sweep |
+| 4 | `reports_multinode_nccl_all_collectives_run_20260523.md` | 多机多卡 2x8 六项 collective 补测结果，补齐单机 test all 的 NCCL 覆盖面 |
+| 5 | `reports_multinode_nccl_pdf_matrix_run_20260523.md` | 最新正式多机多卡 PDF matrix 结果摘要 |
+| 6 | `reports_multinode_nccl_deep_diagnose_run_20260523.md` | 本轮完整深度诊断复跑结果，包含 counter、GRAPH、PXN sweep |
 
 ## 关键脚本
 
@@ -33,7 +35,9 @@
 | `scripts/nccl_environment_snapshot.sh` | 单节点 NCCL/RDMA 环境等价性快照脚本，不启动 NCCL workload |
 | `scripts/run_h100_single_node_all.sh` | 单节点 H100 `test all` 原始报告入口，默认同时采环境快照 |
 | `scripts/run_multinode_nccl_pdf_matrix.sh` | 多机多卡 PDF 矩阵入口，跑 2 机 x 1/2/4/8 GPU per node 的 allreduce/alltoall，并归档每个 case 的 command/stdout/stderr/parsed JSON |
+| `scripts/run_multinode_nccl_all_collectives.sh` | 多机多卡 2x8 六项 collective 补测入口，跑 allreduce/alltoall/broadcast/reducescatter/allgather/sendrecv，并归档每个 case |
 | `configs/multinode_nccl_nccl227_pdf_matrix.yaml` | 多机多卡 PDF 矩阵配置，固定 NCCL 2.27.7 和 `/data/nccl-tests-latest/build` |
+| `configs/multinode_nccl_nccl227_all_collectives_2x8.yaml` | 多机多卡 2x8 六项 collective 补测配置，allreduce/alltoall 保留 PDF 阈值，新增 4 项暂按证据采集 |
 | `docs/multinode_nccl_deep_diagnose_runbook.md` | 诊断脚本中文 runbook |
 
 多机多卡 PDF 矩阵：
@@ -43,6 +47,13 @@ cd /root/test_gpu_scripts
 bash scripts/run_multinode_nccl_pdf_matrix.sh
 ```
 
+多机多卡 2x8 六项 collective 补测：
+
+```bash
+cd /root/test_gpu_scripts
+bash scripts/run_multinode_nccl_all_collectives.sh
+```
+
 单节点 H100 原始 all 报告：
 
 ```bash
@@ -88,6 +99,7 @@ OUT_DIR=/root/test_gpu_scripts/reports/nccl_deep_diag_plugin_check_$(date +%Y%m%
 /root/test_gpu_scripts/reports_multinode_nccl_handoff_plan_20260523.md
 /root/test_gpu_scripts/reports_multinode_nccl_environment_gap_20260523.md
 /root/test_gpu_scripts/reports_multinode_nccl_artifact_signal_analysis_20260523.md
+/root/test_gpu_scripts/reports_multinode_nccl_all_collectives_run_20260523.md
 /root/test_gpu_scripts/reports_multinode_nccl_deep_diagnose_run_20260523.md
 ```
 
@@ -123,6 +135,15 @@ summary: reports_multinode_nccl_pdf_matrix_run_20260523.md
 manifest: reports_multinode_nccl_pdf_matrix_artifacts_manifest_20260523_113803.md
 ```
 
+最新多机多卡 2x8 六项 collective 补测：
+
+```text
+aikubeworker0012: /root/test_gpu_scripts/reports/multinode_nccl_all_collectives_20260523_120144.md
+artifacts: /root/test_gpu_scripts/reports/multinode_nccl_all_collectives_20260523_120144_artifacts
+local copy: reports_multinode_nccl_all_collectives_20260523_120144.md
+summary: reports_multinode_nccl_all_collectives_run_20260523.md
+```
+
 下一次用 `scripts/run_multinode_nccl_pdf_matrix.sh` 复跑时，还会生成：
 
 ```text
@@ -131,6 +152,14 @@ manifest: reports_multinode_nccl_pdf_matrix_artifacts_manifest_20260523_113803.m
 
 目录内按 case 保存完整 `cmd/stdout/stderr/json`，用于给网络/硬件侧复核原始 NCCL 输出。
 
+下一次用 `scripts/run_multinode_nccl_all_collectives.sh` 补测时，还会生成：
+
+```text
+/root/test_gpu_scripts/reports/multinode_nccl_all_collectives_YYYYMMDD_HHMMSS_artifacts/
+```
+
+目录内按 6 个 collective 保存完整 `cmd/stdout/stderr/json`。该入口用于补齐单节点 `test all` 中已有、但多机 PDF matrix 未覆盖的 `broadcast/reducescatter/allgather/sendrecv` 证据；已知 PDF 2x8 阈值仍用于 `allreduce/alltoall`。
+
 ## 当前证据摘要
 
 ### HCA / rail
@@ -200,6 +229,8 @@ PXN disabled sweep 未发现有效参数：
 | `reports_multinode_nccl_pdf_matrix_run_20260523.md` | 最新正式 PDF matrix 中文摘要 |
 | `reports_multinode_nccl_pdf_matrix_artifacts_manifest_20260523_113803.md` | 最新 artifacts manifest 和 checksum |
 | `reports_multinode_nccl_artifact_signal_analysis_20260523.md` | 最新 artifacts 的 IB/GDRDMA/HCA/plugin/SHARP 信号分析 |
+| `reports_multinode_nccl_all_collectives_20260523_120144.md` | 最新多机多卡 2x8 六项 collective 原始报告 |
+| `reports_multinode_nccl_all_collectives_run_20260523.md` | 最新多机多卡 2x8 六项 collective 中文摘要 |
 | `reports_multinode_nccl_counter_probe_20260523.md` | RDMA rail 和 counter 证据 |
 | `reports_multinode_nccl_alltoall_tuning_20260523.md` | alltoall PXN 和参数 sweep 结论 |
 | `reports_rdma_single_node_summary.md` | 单节点 RDMA/HCA 速率摘要 |
diff --git a/scripts/run_multinode_nccl_all_collectives.sh b/scripts/run_multinode_nccl_all_collectives.sh
new file mode 100755
index 0000000..819e893
--- /dev/null
+++ b/scripts/run_multinode_nccl_all_collectives.sh
@@ -0,0 +1,147 @@
+#!/usr/bin/env bash
+set -uo pipefail
+
+# Run a two-node, eight-GPU-per-node NCCL evidence pass across the six
+# collectives used by the single-node H100 acceptance flow.
+
+SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
+PROJECT_DIR="$(cd -- "$SCRIPT_DIR/.." >/dev/null 2>&1 && pwd)"
+
+PYTHON_BIN="${PYTHON_BIN:-/root/gpu-test-venv/bin/python}"
+CONFIG_FILE="${CONFIG_FILE:-$PROJECT_DIR/configs/multinode_nccl_nccl227_all_collectives_2x8.yaml}"
+OUT_DIR="${OUT_DIR:-$PROJECT_DIR/reports}"
+FORMAT="${FORMAT:-md}"
+DRY_RUN=0
+RUN_PREFLIGHT=1
+PREFLIGHT_ONLY=0
+
+usage() {
+  cat <<'EOF'
+Usage: run_multinode_nccl_all_collectives.sh [options]
+
+Options:
+  --python PATH       Python executable (default: /root/gpu-test-venv/bin/python)
+  --config PATH       Config file (default: configs/multinode_nccl_nccl227_all_collectives_2x8.yaml)
+  --out-dir PATH      Report output directory (default: reports)
+  --format FORMAT     Report format: md, json, or html (default: md)
+  --no-preflight      Skip scripts/multinode_nccl_deep_diagnose.sh preflight
+  --preflight-only    Run only the preflight check, not the workload
+  --dry-run           Print commands without running them
+  -h, --help          Show this help
+EOF
+}
+
+while (($#)); do
+  case "$1" in
+    --python)
+      PYTHON_BIN="$2"
+      shift 2
+      ;;
+    --config)
+      CONFIG_FILE="$2"
+      shift 2
+      ;;
+    --out-dir)
+      OUT_DIR="$2"
+      shift 2
+      ;;
+    --format)
+      FORMAT="$2"
+      shift 2
+      ;;
+    --no-preflight)
+      RUN_PREFLIGHT=0
+      shift
+      ;;
+    --preflight-only)
+      PREFLIGHT_ONLY=1
+      shift
+      ;;
+    --dry-run)
+      DRY_RUN=1
+      shift
+      ;;
+    -h|--help)
+      usage
+      exit 0
+      ;;
+    *)
+      echo "Unknown argument: $1" >&2
+      usage >&2
+      exit 2
+      ;;
+  esac
+done
+
+if [[ "$FORMAT" != "md" && "$FORMAT" != "json" && "$FORMAT" != "html" ]]; then
+  echo "Unsupported format: $FORMAT" >&2
+  exit 2
+fi
+
+if [[ ! -x "$PYTHON_BIN" ]]; then
+  PYTHON_BIN="$(command -v python3 || true)"
+fi
+
+if [[ -z "$PYTHON_BIN" || ! -x "$PYTHON_BIN" ]]; then
+  echo "Python executable not found. Set --python or PYTHON_BIN." >&2
+  exit 1
+fi
+
+TS="$(date +%Y%m%d_%H%M%S)"
+mkdir -p "$OUT_DIR"
+
+REPORT_FILE="$OUT_DIR/multinode_nccl_all_collectives_${TS}.${FORMAT}"
+ARTIFACT_DIR="$OUT_DIR/multinode_nccl_all_collectives_${TS}_artifacts"
+PREFLIGHT_CMD=(bash "$PROJECT_DIR/scripts/multinode_nccl_deep_diagnose.sh" preflight)
+RUN_CMD=(
+  "$PYTHON_BIN" "$PROJECT_DIR/gpu_tester.py"
+  --config "$CONFIG_FILE"
+  --test multinode-nccl
+  --report
+  --format "$FORMAT"
+  --output "$REPORT_FILE"
+)
+
+echo "Project: $PROJECT_DIR"
+echo "Config: $CONFIG_FILE"
+echo "Report: $REPORT_FILE"
+echo "Artifacts: $ARTIFACT_DIR"
+echo "Collectives: allreduce, alltoall, broadcast, reducescatter, allgather, sendrecv"
+echo "Topology: 2 nodes x 8 GPUs per node; 16G"
+
+if ((DRY_RUN)); then
+  if ((RUN_PREFLIGHT)); then
+    printf 'DRY RUN preflight:'
+    printf ' %q' "${PREFLIGHT_CMD[@]}"
+    printf '\n'
+  fi
+  if ((PREFLIGHT_ONLY)); then
+    exit 0
+  fi
+  printf 'DRY RUN workload:'
+  printf ' MULTINODE_NCCL_ARTIFACT_DIR=%q' "$ARTIFACT_DIR"
+  printf ' %q' "${RUN_CMD[@]}"
+  printf '\n'
+  exit 0
+fi
+
+if ((RUN_PREFLIGHT)); then
+  "${PREFLIGHT_CMD[@]}"
+  preflight_status=$?
+  if ((preflight_status != 0)); then
+    echo "Preflight failed with exit code $preflight_status" >&2
+    exit "$preflight_status"
+  fi
+fi
+
+if ((PREFLIGHT_ONLY)); then
+  exit 0
+fi
+
+mkdir -p "$ARTIFACT_DIR"
+MULTINODE_NCCL_ARTIFACT_DIR="$ARTIFACT_DIR" "${RUN_CMD[@]}"
+status=$?
+
+echo "Report written to: $REPORT_FILE"
+echo "Artifacts written to: $ARTIFACT_DIR"
+exit "$status"
-- 
2.47.2


From 90c46e40b326cd07c1c03869af84d6e17f8543dc Mon Sep 17 00:00:00 2001
From: cs <shi.chen@robotics.cc>
Date: Sat, 23 May 2026 20:11:22 +0800
Subject: [PATCH 31/41] Archive all-collectives NCCL artifacts

---
 ...llectives_20260523_120144_artifacts.sha256 | 24 ++++++++++
 ..._collectives_20260523_120144_bundle.sha256 |  2 +
 ...ives_artifacts_manifest_20260523_120144.md | 46 +++++++++++++++++++
 ...ts_multinode_nccl_handoff_plan_20260523.md |  2 +
 ...ts_multinode_nccl_latest_index_20260523.md | 10 +++-
 5 files changed, 82 insertions(+), 2 deletions(-)
 create mode 100644 reports_multinode_nccl_all_collectives_20260523_120144_artifacts.sha256
 create mode 100644 reports_multinode_nccl_all_collectives_20260523_120144_bundle.sha256
 create mode 100644 reports_multinode_nccl_all_collectives_artifacts_manifest_20260523_120144.md

diff --git a/reports_multinode_nccl_all_collectives_20260523_120144_artifacts.sha256 b/reports_multinode_nccl_all_collectives_20260523_120144_artifacts.sha256
new file mode 100644
index 0000000..0264ba3
--- /dev/null
+++ b/reports_multinode_nccl_all_collectives_20260523_120144_artifacts.sha256
@@ -0,0 +1,24 @@
+efa4a915bdf4943aef5d88c402c24eb2c60848e5f440f58058a1e99217b07e0d  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/allgather_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.cmd.txt
+020eb35ddc5933da78b5c00c1b6fc25b11b23c4505300276d9736fbe8a35519b  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/allgather_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json
+e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/allgather_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.stderr.txt
+903772b675d9a9f7b04e061a25a90f97bf7844dddb5f3809bc9c501f4d6c783d  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/allgather_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.stdout.txt
+b7ea7350b3703d4b31389d92b375562bd04a50b40fe16a6c8d037b134a51dbd5  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/allreduce_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.cmd.txt
+47f68b7510df3b472e7ac0ec2fb53dcefbe687bb4de0c889f8947cc652d09e61  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/allreduce_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json
+e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/allreduce_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.stderr.txt
+6889180431d639e414e188e1dbc586157565e8506255731b7b38d221d0f72919  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/allreduce_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.stdout.txt
+6ecbd8473d987d2a7839135029902bd629403eb407a7873502a49be26fa1c947  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/alltoall_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.cmd.txt
+fa2828cdfcb86e6715a17c8bf45de10ce421c12f0877efff9bafb218b2f00df3  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/alltoall_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json
+e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/alltoall_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.stderr.txt
+2eae24183754f8d084945d9857b84033ebccf1a2e606931b4f4fc19c5e2e876f  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/alltoall_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.stdout.txt
+277e900dc1efa8f036616226dbc30cb616ba97337e929ad8b1a14c12484867b3  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/broadcast_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.cmd.txt
+077fec1bf498fd202e2866f1cf6fb4502ac8d1bafba156f213453b21f6a6df2b  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/broadcast_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json
+e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/broadcast_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.stderr.txt
+727c69ad6111b891c25360bd9e97ce15f2e7a36d5ff61ae88a7577ecb61c895f  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/broadcast_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.stdout.txt
+8bec99a952eeb26fa3c6d89cbf2331393923fd4f0fae153b8efe3da239c0a09f  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/reducescatter_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.cmd.txt
+be24943eb4b63e304cee41831adeb23ffbbc0e890ff19b067e06d6a4b48b2d90  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/reducescatter_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json
+e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/reducescatter_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.stderr.txt
+a8220b6a4fe3ae037837919a181452e0fc735f58f27fafff07ea431b09b905de  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/reducescatter_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.stdout.txt
+ead794f19e1d2d780cf1840c124b6e0955c70c8b157feb47c4826599d5643b39  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/sendrecv_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.cmd.txt
+4560364922a85d21827357b906491aae8283c6148ff1c0e0f0dc379a68307fdd  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/sendrecv_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json
+e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/sendrecv_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.stderr.txt
+ade548ee5fdbe2d1fce461237b5b713cc2af24e6c2857bbbd73837f28551af27  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/sendrecv_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.stdout.txt
diff --git a/reports_multinode_nccl_all_collectives_20260523_120144_bundle.sha256 b/reports_multinode_nccl_all_collectives_20260523_120144_bundle.sha256
new file mode 100644
index 0000000..3097f81
--- /dev/null
+++ b/reports_multinode_nccl_all_collectives_20260523_120144_bundle.sha256
@@ -0,0 +1,2 @@
+06c565281813c4260da9cfee8f0b0289b61b3be95c01dd670c71fa1a441133e3  reports/multinode_nccl_all_collectives_20260523_120144.md
+fa5961d47a5905da6ebc6c726421d73ddc2314a316a8f578683d31fe69c256e5  reports/multinode_nccl_all_collectives_20260523_120144_artifacts.tar.gz
diff --git a/reports_multinode_nccl_all_collectives_artifacts_manifest_20260523_120144.md b/reports_multinode_nccl_all_collectives_artifacts_manifest_20260523_120144.md
new file mode 100644
index 0000000..b1fc9b5
--- /dev/null
+++ b/reports_multinode_nccl_all_collectives_artifacts_manifest_20260523_120144.md
@@ -0,0 +1,46 @@
+# 多机多卡 NCCL 六项 Collective Artifacts Manifest 2026-05-23
+
+- Remote report: `reports/multinode_nccl_all_collectives_20260523_120144.md`
+- Remote artifact dir: `reports/multinode_nccl_all_collectives_20260523_120144_artifacts`
+- Remote artifact tar: `reports/multinode_nccl_all_collectives_20260523_120144_artifacts.tar.gz`
+- Remote bundle checksum: `reports/multinode_nccl_all_collectives_20260523_120144_bundle.sha256`
+- Remote per-file checksum: `reports/multinode_nccl_all_collectives_20260523_120144_artifacts.sha256`
+- Local report copy: `reports_multinode_nccl_all_collectives_20260523_120144.md`
+- Local artifact tar copy: `/private/tmp/multinode_nccl_all_collectives_20260523_120144_artifacts.tar.gz`
+- Case count: `6`
+- Artifact files: `24`
+
+## Case Summary
+
+| Case | Peak Bus BW | Avg Bus BW | Threshold | Wrong | Return Code | Status |
+|---|---:|---:|---:|---:|---:|---|
+| `allreduce_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run` | 354.27 | 354.45 | 491.84 | 0 | 0 | FAIL |
+| `alltoall_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run` | 37.00 | 37.14 | 76.54 | 0 | 0 | FAIL |
+| `broadcast_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run` | 191.65 | 190.25 | 0.00 | 0 | 0 | PASS |
+| `reducescatter_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run` | 192.75 | 192.74 | 0.00 | 0 | 0 | PASS |
+| `allgather_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run` | 192.14 | 192.47 | 0.00 | 0 | 0 | PASS |
+| `sendrecv_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run` | 26.98 | 26.97 | 0.00 | 0 | 0 | PASS |
+
+## Bundle Checksums
+
+```text
+06c565281813c4260da9cfee8f0b0289b61b3be95c01dd670c71fa1a441133e3  reports/multinode_nccl_all_collectives_20260523_120144.md
+fa5961d47a5905da6ebc6c726421d73ddc2314a316a8f578683d31fe69c256e5  reports/multinode_nccl_all_collectives_20260523_120144_artifacts.tar.gz
+```
+
+## Per-file Checksums (JSON files only)
+
+```text
+020eb35ddc5933da78b5c00c1b6fc25b11b23c4505300276d9736fbe8a35519b  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/allgather_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json
+47f68b7510df3b472e7ac0ec2fb53dcefbe687bb4de0c889f8947cc652d09e61  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/allreduce_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json
+fa2828cdfcb86e6715a17c8bf45de10ce421c12f0877efff9bafb218b2f00df3  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/alltoall_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json
+077fec1bf498fd202e2866f1cf6fb4502ac8d1bafba156f213453b21f6a6df2b  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/broadcast_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json
+be24943eb4b63e304cee41831adeb23ffbbc0e890ff19b067e06d6a4b48b2d90  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/reducescatter_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json
+4560364922a85d21827357b906491aae8283c6148ff1c0e0f0dc379a68307fdd  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/sendrecv_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json
+```
+
+完整逐文件 checksum 已保存为：
+
+```text
+reports_multinode_nccl_all_collectives_20260523_120144_artifacts.sha256
+```
diff --git a/reports_multinode_nccl_handoff_plan_20260523.md b/reports_multinode_nccl_handoff_plan_20260523.md
index 80b27c5..69bae84 100644
--- a/reports_multinode_nccl_handoff_plan_20260523.md
+++ b/reports_multinode_nccl_handoff_plan_20260523.md
@@ -17,6 +17,7 @@
 | 原始 artifacts 已归档 | `/root/test_gpu_scripts/reports/multinode_nccl_pdf_matrix_20260523_113803_artifacts`，每个 case 有完整 `cmd/stdout/stderr/json` |
 | artifacts 信号已分析 | `reports_multinode_nccl_artifact_signal_analysis_20260523.md`，确认所有 case 都走 IB/GDRDMA 和 4 条 400G HCA，未见 SHARP/CollNet |
 | 多机六项 collective 已补测 | `reports_multinode_nccl_all_collectives_run_20260523.md`，2x8 下 6 项均正确性通过，allreduce/alltoall 按 PDF 阈值仍 FAIL |
+| 六项 collective artifacts 已归档 | `reports_multinode_nccl_all_collectives_artifacts_manifest_20260523_120144.md`，远端 tar 为 `reports/multinode_nccl_all_collectives_20260523_120144_artifacts.tar.gz` |
 | 没看到硬错误 | 未见 discard、RoCE retrans、slow restart、packet sequence error 等增长 |
 | 当前缺外部 NCCL 网络组件 | 未找到 `libnccl-net*.so*` / `libsharp*.so*`，未见 SHARP/HCOLL 包 |
 
@@ -185,6 +186,7 @@ OUT_DIR=/root/test_gpu_scripts/reports/nccl_deep_diag_plugin_check_$(date +%Y%m%
 | `reports_multinode_nccl_artifact_signal_analysis_20260523.md` | 最新 artifacts 的 IB/GDRDMA/HCA/plugin/SHARP 信号分析 |
 | `reports_multinode_nccl_all_collectives_20260523_120144.md` | 最新多机多卡 2x8 六项 collective 原始报告 |
 | `reports_multinode_nccl_all_collectives_run_20260523.md` | 最新多机多卡 2x8 六项 collective 中文摘要 |
+| `reports_multinode_nccl_all_collectives_artifacts_manifest_20260523_120144.md` | 最新多机多卡 2x8 六项 collective artifacts manifest 和 checksum |
 | `reports_multinode_nccl_deep_diagnose_run_20260523.md` | 本轮深度复跑结果 |
 | `reports_multinode_nccl_environment_gap_20260523.md` | 硬件/软件环境等价性缺口 |
 | `reports_multinode_nccl_counter_probe_20260523.md` | RDMA rail/counter 证据 |
diff --git a/reports_multinode_nccl_latest_index_20260523.md b/reports_multinode_nccl_latest_index_20260523.md
index ebc3481..1e99d08 100644
--- a/reports_multinode_nccl_latest_index_20260523.md
+++ b/reports_multinode_nccl_latest_index_20260523.md
@@ -9,6 +9,7 @@
 - 2026-05-23 `11:38` 已完成带 artifacts 的正式多机多卡 PDF matrix 复跑，原始报告为 `reports_multinode_nccl_pdf_matrix_20260523_113803.md`，中文结论为 `reports_multinode_nccl_pdf_matrix_run_20260523.md`，artifact manifest 为 `reports_multinode_nccl_pdf_matrix_artifacts_manifest_20260523_113803.md`。
 - 已补充 artifacts 信号分析：`reports_multinode_nccl_artifact_signal_analysis_20260523.md`。结论是所有 case 都走 `IB`，都使用 `mlx5_0,mlx5_1,mlx5_6,mlx5_7`，都有 GDRDMA 信号，但没有 SHARP/CollNet/外部 NCCL net plugin 证据。
 - 已补充并实跑多机多卡 2x8 六项 collective：`reports_multinode_nccl_all_collectives_run_20260523.md`。新增 `broadcast/reducescatter/allgather/sendrecv` 均 `returncode=0`、`wrong=0`、走 `IB/GDRDMA`；已知 PDF 阈值项 `allreduce/alltoall` 仍 FAIL。
+- 六项 collective 的完整 artifacts 已归档：`reports_multinode_nccl_all_collectives_artifacts_manifest_20260523_120144.md`，远端 tar 为 `reports/multinode_nccl_all_collectives_20260523_120144_artifacts.tar.gz`。
 - 2 机 1/2/4 GPU per node 档位已接近 PDF 参考值，但严格按阈值仍 FAIL。
 - 2 机 8 GPU 档位仍未达到 PDF 参考值：
   - allreduce 实测 `353.85 GB/s busbw`，PDF 目标 `491.84 GB/s`。
@@ -24,8 +25,9 @@
 | 2 | `reports_multinode_nccl_environment_gap_20260523.md` | 说明当前环境为什么不能证明与 PDF 等价，重点是 4 x 400G rail 和缺少 NCCL net plugin / SHARP |
 | 3 | `reports_multinode_nccl_artifact_signal_analysis_20260523.md` | 最新 artifacts 信号分析，确认 IB/GDRDMA/HCA 使用情况和 plugin/SHARP 缺口 |
 | 4 | `reports_multinode_nccl_all_collectives_run_20260523.md` | 多机多卡 2x8 六项 collective 补测结果，补齐单机 test all 的 NCCL 覆盖面 |
-| 5 | `reports_multinode_nccl_pdf_matrix_run_20260523.md` | 最新正式多机多卡 PDF matrix 结果摘要 |
-| 6 | `reports_multinode_nccl_deep_diagnose_run_20260523.md` | 本轮完整深度诊断复跑结果，包含 counter、GRAPH、PXN sweep |
+| 5 | `reports_multinode_nccl_all_collectives_artifacts_manifest_20260523_120144.md` | 多机多卡 2x8 六项 collective artifacts manifest 和 checksum |
+| 6 | `reports_multinode_nccl_pdf_matrix_run_20260523.md` | 最新正式多机多卡 PDF matrix 结果摘要 |
+| 7 | `reports_multinode_nccl_deep_diagnose_run_20260523.md` | 本轮完整深度诊断复跑结果，包含 counter、GRAPH、PXN sweep |
 
 ## 关键脚本
 
@@ -100,6 +102,7 @@ OUT_DIR=/root/test_gpu_scripts/reports/nccl_deep_diag_plugin_check_$(date +%Y%m%
 /root/test_gpu_scripts/reports_multinode_nccl_environment_gap_20260523.md
 /root/test_gpu_scripts/reports_multinode_nccl_artifact_signal_analysis_20260523.md
 /root/test_gpu_scripts/reports_multinode_nccl_all_collectives_run_20260523.md
+/root/test_gpu_scripts/reports_multinode_nccl_all_collectives_artifacts_manifest_20260523_120144.md
 /root/test_gpu_scripts/reports_multinode_nccl_deep_diagnose_run_20260523.md
 ```
 
@@ -140,8 +143,10 @@ manifest: reports_multinode_nccl_pdf_matrix_artifacts_manifest_20260523_113803.m
 ```text
 aikubeworker0012: /root/test_gpu_scripts/reports/multinode_nccl_all_collectives_20260523_120144.md
 artifacts: /root/test_gpu_scripts/reports/multinode_nccl_all_collectives_20260523_120144_artifacts
+artifacts tar: /root/test_gpu_scripts/reports/multinode_nccl_all_collectives_20260523_120144_artifacts.tar.gz
 local copy: reports_multinode_nccl_all_collectives_20260523_120144.md
 summary: reports_multinode_nccl_all_collectives_run_20260523.md
+manifest: reports_multinode_nccl_all_collectives_artifacts_manifest_20260523_120144.md
 ```
 
 下一次用 `scripts/run_multinode_nccl_pdf_matrix.sh` 复跑时，还会生成：
@@ -231,6 +236,7 @@ PXN disabled sweep 未发现有效参数：
 | `reports_multinode_nccl_artifact_signal_analysis_20260523.md` | 最新 artifacts 的 IB/GDRDMA/HCA/plugin/SHARP 信号分析 |
 | `reports_multinode_nccl_all_collectives_20260523_120144.md` | 最新多机多卡 2x8 六项 collective 原始报告 |
 | `reports_multinode_nccl_all_collectives_run_20260523.md` | 最新多机多卡 2x8 六项 collective 中文摘要 |
+| `reports_multinode_nccl_all_collectives_artifacts_manifest_20260523_120144.md` | 最新多机多卡 2x8 六项 collective artifacts manifest 和 checksum |
 | `reports_multinode_nccl_counter_probe_20260523.md` | RDMA rail 和 counter 证据 |
 | `reports_multinode_nccl_alltoall_tuning_20260523.md` | alltoall PXN 和参数 sweep 结论 |
 | `reports_rdma_single_node_summary.md` | 单节点 RDMA/HCA 速率摘要 |
-- 
2.47.2


From 5b022d5849c53eceb0a0176c9e1eae25f0e68e08 Mon Sep 17 00:00:00 2001
From: cs <shi.chen@robotics.cc>
Date: Sat, 23 May 2026 20:15:01 +0800
Subject: [PATCH 32/41] Summarize current H100 acceptance status

---
 ...h100_acceptance_current_status_20260523.md | 158 ++++++++++++++++++
 ...ts_multinode_nccl_handoff_plan_20260523.md |   3 +
 ...ts_multinode_nccl_latest_index_20260523.md |  29 ++--
 3 files changed, 178 insertions(+), 12 deletions(-)
 create mode 100644 reports_h100_acceptance_current_status_20260523.md

diff --git a/reports_h100_acceptance_current_status_20260523.md b/reports_h100_acceptance_current_status_20260523.md
new file mode 100644
index 0000000..4900f9a
--- /dev/null
+++ b/reports_h100_acceptance_current_status_20260523.md
@@ -0,0 +1,158 @@
+# H100 验收当前状态总览 2026-05-23
+
+## 一句话结论
+
+当前脚本能力和证据链已经基本补齐：单节点 `test all`、多机多卡 PDF matrix、2x8 六项 collective、跨节点 RDMA、NCCL artifacts、环境快照和 checksum 都已经有可复跑入口和原始证据。但按当前 PDF/配置口径，两台 H100 节点仍不能判定生产验收通过，主要阻塞不是脚本没跑，而是多项实测指标低于阈值，以及当前硬件/软件环境无法证明与 PDF 参考环境等价。
+
+## 当前总状态
+
+| 范围 | 当前证据 | 结论 | 主要阻塞 |
+|---|---|---|---|
+| 单节点 `test all` | `reports_test_all_latest_summary_cn_20260523.md` | 两台均 FAIL | Compute、NCCL、Stress、RDMA |
+| 跨节点 RDMA | `reports_rdma_cross_node_mlx5_0_20260523.md` | FAIL | read BW、write/read latency 未达阈值 |
+| 多机多卡 PDF matrix | `reports_multinode_nccl_pdf_matrix_run_20260523.md` | FAIL | 2x8 allreduce/alltoall 差距大；1/2/4 GPU 档位除 2x2 allreduce PASS 外均低于目标（差距约 0.1%–12%） |
+| 多机多卡 2x8 六项 collective | `reports_multinode_nccl_all_collectives_run_20260523.md` | FAIL / evidence complete | 6 项正确性通过；allreduce/alltoall 按 PDF 阈值 FAIL |
+| NCCL artifacts 信号 | `reports_multinode_nccl_artifact_signal_analysis_20260523.md` | 基础链路正常 | IB/GDRDMA/HCA 均正常；无 SHARP/CollNet/外部 net plugin |
+| 环境等价性 | `reports_multinode_nccl_environment_gap_20260523.md` | 未证明等价 | 每节点只有 4 条 400G rail，缺 NCCL net plugin / SHARP |
+
+## 已完成的能力
+
+| 能力 | 当前状态 |
+|---|---|
+| 单节点 H100 all 验收入口 | `scripts/run_h100_single_node_all.sh` 已可用，默认带环境快照 |
+| 多机 PDF matrix 入口 | `scripts/run_multinode_nccl_pdf_matrix.sh` 已可用，自动归档每个 case 的 `cmd/stdout/stderr/json` |
+| 多机 2x8 六项 collective 入口 | `scripts/run_multinode_nccl_all_collectives.sh` 已可用，覆盖 `allreduce/alltoall/broadcast/reducescatter/allgather/sendrecv` |
+| NCCL 深度诊断入口 | `scripts/multinode_nccl_deep_diagnose.sh` 已可用，覆盖 preflight、counter、graph、PXN sweep |
+| 环境等价性快照 | `scripts/nccl_environment_snapshot.sh` 已可用 |
+| 原始证据归档 | PDF matrix 和六项 collective artifacts 均已 tar + checksum |
+| 中文解释文档 | 指标说明、NCCL/RDMA 概念、handoff、environment gap、artifact signal analysis 均已生成 |
+
+## 单节点验收状态
+
+两台机器的单节点 `test all` 当前都是：
+
+```text
+Suite: 6/10 PASS
+PDF acceptance: FAIL
+```
+
+通过项：
+
+- GPU Info
+- Health
+- Memory Bandwidth
+- NVLink/NVSwitch
+- DCGM diag -r 3
+- Training Simulation
+
+失败项：
+
+| 项目 | 当前现象 | 备注 |
+|---|---|---|
+| Compute | 多 dtype 绝对 TFLOPS 阈值未达，部分 GPU 间 spread 超 3% | 需要复核 H100 阈值口径和具体 dtype 路径 |
+| NCCL 单机 | 真实 `nccl-tests` 已可测，但多 op/size 未达阈值 | 主要是 1M 小包，以及 reducescatter/allgather 的 2G |
+| Stress | 30 分钟可跑满，但温差和 `sw_power_cap` throttle 导致 FAIL | 更像散热/功耗策略或阈值口径问题 |
+| RDMA 单机 | read BW 未达标，部分端口速率低于 400G | 单机 local-loopback 不能替代跨节点 RDMA |
+
+## 跨节点 RDMA 状态
+
+跨节点 `mlx5_0` 单 rail perftest 结果：
+
+| Direction | Test | Value | Threshold | Status |
+|---|---|---:|---:|---|
+| 0016 -> 0012 | ib_write_bw | 49.35 GB/s | >= 47 GB/s | PASS |
+| 0016 -> 0012 | ib_read_bw | 44.36 GB/s | >= 47 GB/s | FAIL |
+| 0016 -> 0012 | ib_write_lat avg | 2.17 us | <= 2.0 us | FAIL |
+| 0016 -> 0012 | ib_read_lat avg | 4.05 us | <= 3.5 us | FAIL |
+| 0012 -> 0016 | ib_write_bw | 48.38 GB/s | >= 47 GB/s | PASS |
+| 0012 -> 0016 | ib_read_bw | 44.37 GB/s | >= 47 GB/s | FAIL |
+| 0012 -> 0016 | ib_write_lat avg | 2.13 us | <= 2.0 us | FAIL |
+| 0012 -> 0016 | ib_read_lat avg | 4.08 us | <= 3.5 us | FAIL |
+
+判断：链路连通、ibping 正常、PFC/ECN/CNP/congestion counter 干净；但 read bandwidth 仍低于阈值，write/read latency 仍高于阈值上限，需要网络/OFED/BIOS/firmware 或 perftest 参数侧继续确认。
+
+## 多机多卡 NCCL 状态
+
+### PDF Matrix
+
+| Topology | AllReduce | Target | Status | AllToAll | Target | Status |
+|---|---:|---:|---|---:|---:|---|
+| 2 nodes x 1 GPU | 47.29 | 48.90 | FAIL | 24.85 | 27.25 | FAIL |
+| 2 nodes x 2 GPUs | 137.16 | 136.93 | PASS | 47.76 | 54.41 | FAIL |
+| 2 nodes x 4 GPUs | 335.07 | 335.48 | FAIL | 72.74 | 73.73 | FAIL |
+| 2 nodes x 8 GPUs | 353.85 | 491.84 | FAIL | 36.83 | 76.54 | FAIL |
+
+所有 case 均 `returncode=0`、`wrong=0`，所以 FAIL 来自性能阈值，不是功能错误。
+
+### 2x8 六项 Collective 补测
+
+| Operation | Peak Bus BW | Threshold | Correctness | Network | Status |
+|---|---:|---:|---|---|---|
+| allreduce | 354.27 | >= 491.84 | wrong=0 | IB/GDRDMA | FAIL |
+| alltoall | 37.00 | >= 76.54 | wrong=0 | IB/GDRDMA | FAIL |
+| broadcast | 191.65 | 未配置 | wrong=0 | IB/GDRDMA | PASS evidence |
+| reducescatter | 192.75 | 未配置 | wrong=0 | IB/GDRDMA | PASS evidence |
+| allgather | 192.14 | 未配置 | wrong=0 | IB/GDRDMA | PASS evidence |
+| sendrecv | 26.98 | 未配置 | wrong=0 | IB/GDRDMA | PASS evidence |
+
+这说明多机多卡 collective 覆盖面已经补齐，但生产性能是否达标仍取决于 PDF 是否有对应跨节点阈值，以及当前环境是否与 PDF 等价。
+
+## 当前最关键阻塞
+
+### 1. PDF 参考环境等价性未确认
+
+当前两台节点每节点只有 4 条可用于 NCCL 的 400G IB rail：
+
+```text
+mlx5_0, mlx5_1, mlx5_6, mlx5_7
+```
+
+其他 HCA：
+
+```text
+mlx5_4, mlx5_5: 100G InfiniBand
+mlx5_2, mlx5_8: 25G Ethernet
+mlx5_3, mlx5_9: DOWN
+```
+
+PDF 2x8 allreduce 目标 `491.84 GB/s busbw` 反推 algbw 为 `262.31 GB/s`，高于当前 4 x 400G rail 的理论单向原始带宽 `200 GB/s`。如果 PDF 参考环境有更多 400G rail 或 SHARP/plugin，当前硬件/软件栈不等价。
+
+### 2. 缺少 NCCL net plugin / SHARP
+
+当前没有发现：
+
+```text
+libnccl-net*.so*
+libsharp*.so*
+SHARP / HCOLL package
+```
+
+NCCL 日志中没有 SHARP/CollNet 迹象，当前走 internal IB plugin。
+
+### 3. alltoall 仍是独立问题
+
+`NCCL_PXN_DISABLE=1` 后 alltoall rail 更均衡，但 2x8 仍只有约 `36-37 GB/s`。已有 sweep 没找到稳定正收益，下一步应该交给网络路径、ECMP/adaptive routing、拥塞控制、plugin/SHARP 等方向，而不是继续盲调 NCCL 小参数。
+
+### 4. 单节点 Compute/NCCL/Stress/RDMA 也未过
+
+即使多机 NCCL 后续解决，两台机器按当前 PDF `test all` 仍因单节点 Compute、NCCL、Stress、RDMA 项失败，不能直接判整机生产验收通过。
+
+## 建议下一步
+
+1. **硬件/网络侧先确认 PDF 等价性。** 确认参考环境每节点到底是 4 条还是 8 条 400G rail，是否启用 SHARP/NCCL net plugin，交换网络是否同一策略。
+2. **环境侧补齐或明确排除 SHARP/plugin。** 如果 PDF 环境有，当前必须补齐后重跑 `scripts/run_multinode_nccl_pdf_matrix.sh` 和 `scripts/run_multinode_nccl_all_collectives.sh`。
+3. **网络侧排查 alltoall。** 重点看跨 Leaf ECMP/adaptive routing/拥塞控制/credit wait，而不是只看链路是否 up。
+4. **单节点继续分项收敛。** Compute 阈值、单机 NCCL 小包/2G 阈值、Stress 温差/功耗 cap、RDMA read/latency 需要分别确认是机器问题、配置问题还是阈值口径问题。
+5. **如果硬件不等价，调整验收阈值或换等价节点复测。** 当前证据不支持把 4 rail 环境直接按疑似更高规格 PDF 阈值判定。
+
+## 当前最值得先读的文件
+
+| 顺序 | 文件 | 用途 |
+|---:|---|---|
+| 1 | `reports_h100_acceptance_current_status_20260523.md` | 当前总览和阻塞清单 |
+| 2 | `reports_multinode_nccl_handoff_plan_20260523.md` | 给网络/硬件/环境侧的交接计划 |
+| 3 | `reports_multinode_nccl_environment_gap_20260523.md` | PDF 环境等价性缺口 |
+| 4 | `reports_multinode_nccl_artifact_signal_analysis_20260523.md` | NCCL artifacts 信号分析 |
+| 5 | `reports_multinode_nccl_all_collectives_run_20260523.md` | 多机 2x8 六项 collective 补测摘要 |
+| 6 | `reports_test_all_latest_summary_cn_20260523.md` | 单节点 test all 中文汇总 |
+| 7 | `reports_rdma_cross_node_mlx5_0_20260523.md` | 跨节点 RDMA 单 rail 证据 |
diff --git a/reports_multinode_nccl_handoff_plan_20260523.md b/reports_multinode_nccl_handoff_plan_20260523.md
index 69bae84..d70ea8b 100644
--- a/reports_multinode_nccl_handoff_plan_20260523.md
+++ b/reports_multinode_nccl_handoff_plan_20260523.md
@@ -4,6 +4,8 @@
 
 当前 2 机 8 卡 NCCL 已经排除旧 NCCL、GDR disabled、HCA 选择错误、SSH/mpirun launch、明显链路错误等问题；剩余差距集中在 **硬件 rail 数量是否与 PDF 等价**、**NCCL net plugin / SHARP 是否缺失**、以及 **alltoall 在当前跨 Leaf 网络下的图策略/交换路径效率**。
 
+全局验收状态先看 `reports_h100_acceptance_current_status_20260523.md`；该文件把单节点 `test all`、跨节点 RDMA、多机 NCCL 和阻塞项汇总到一张总表。
+
 ## 已经验证的事实
 
 | 事实 | 当前证据 |
@@ -178,6 +180,7 @@ OUT_DIR=/root/test_gpu_scripts/reports/nccl_deep_diag_plugin_check_$(date +%Y%m%
 
 | 文件 | 用途 |
 |---|---|
+| `reports_h100_acceptance_current_status_20260523.md` | 当前 H100 验收总览，汇总单节点、多机 NCCL、跨节点 RDMA 和阻塞项 |
 | `reports_multinode_nccl_diagnosis_20260523.md` | 总诊断报告 |
 | `reports_multinode_nccl_pdf_matrix_20260523_112247.md` | 上一次多机多卡 PDF matrix 原始报告 |
 | `reports_multinode_nccl_pdf_matrix_20260523_113803.md` | 最新带 artifacts 的多机多卡 PDF matrix 原始报告 |
diff --git a/reports_multinode_nccl_latest_index_20260523.md b/reports_multinode_nccl_latest_index_20260523.md
index 1e99d08..2867b32 100644
--- a/reports_multinode_nccl_latest_index_20260523.md
+++ b/reports_multinode_nccl_latest_index_20260523.md
@@ -10,6 +10,7 @@
 - 已补充 artifacts 信号分析：`reports_multinode_nccl_artifact_signal_analysis_20260523.md`。结论是所有 case 都走 `IB`，都使用 `mlx5_0,mlx5_1,mlx5_6,mlx5_7`，都有 GDRDMA 信号，但没有 SHARP/CollNet/外部 NCCL net plugin 证据。
 - 已补充并实跑多机多卡 2x8 六项 collective：`reports_multinode_nccl_all_collectives_run_20260523.md`。新增 `broadcast/reducescatter/allgather/sendrecv` 均 `returncode=0`、`wrong=0`、走 `IB/GDRDMA`；已知 PDF 阈值项 `allreduce/alltoall` 仍 FAIL。
 - 六项 collective 的完整 artifacts 已归档：`reports_multinode_nccl_all_collectives_artifacts_manifest_20260523_120144.md`，远端 tar 为 `reports/multinode_nccl_all_collectives_20260523_120144_artifacts.tar.gz`。
+- 已补充当前验收状态总览：`reports_h100_acceptance_current_status_20260523.md`，把单节点、多机 NCCL、跨节点 RDMA、环境等价性和阻塞项合并到一份中文总表。
 - 2 机 1/2/4 GPU per node 档位已接近 PDF 参考值，但严格按阈值仍 FAIL。
 - 2 机 8 GPU 档位仍未达到 PDF 参考值：
   - allreduce 实测 `353.85 GB/s busbw`，PDF 目标 `491.84 GB/s`。
@@ -21,13 +22,14 @@
 
 | 顺序 | 文件 | 用途 |
 |---:|---|---|
-| 1 | `reports_multinode_nccl_handoff_plan_20260523.md` | 给网络/硬件/环境侧的交接计划，包含决策树、要问的问题和复跑命令 |
-| 2 | `reports_multinode_nccl_environment_gap_20260523.md` | 说明当前环境为什么不能证明与 PDF 等价，重点是 4 x 400G rail 和缺少 NCCL net plugin / SHARP |
-| 3 | `reports_multinode_nccl_artifact_signal_analysis_20260523.md` | 最新 artifacts 信号分析，确认 IB/GDRDMA/HCA 使用情况和 plugin/SHARP 缺口 |
-| 4 | `reports_multinode_nccl_all_collectives_run_20260523.md` | 多机多卡 2x8 六项 collective 补测结果，补齐单机 test all 的 NCCL 覆盖面 |
-| 5 | `reports_multinode_nccl_all_collectives_artifacts_manifest_20260523_120144.md` | 多机多卡 2x8 六项 collective artifacts manifest 和 checksum |
-| 6 | `reports_multinode_nccl_pdf_matrix_run_20260523.md` | 最新正式多机多卡 PDF matrix 结果摘要 |
-| 7 | `reports_multinode_nccl_deep_diagnose_run_20260523.md` | 本轮完整深度诊断复跑结果，包含 counter、GRAPH、PXN sweep |
+| 1 | `reports_h100_acceptance_current_status_20260523.md` | 当前 H100 验收总览，汇总单节点、多机 NCCL、跨节点 RDMA 和阻塞项 |
+| 2 | `reports_multinode_nccl_handoff_plan_20260523.md` | 给网络/硬件/环境侧的交接计划，包含决策树、要问的问题和复跑命令 |
+| 3 | `reports_multinode_nccl_environment_gap_20260523.md` | 说明当前环境为什么不能证明与 PDF 等价，重点是 4 x 400G rail 和缺少 NCCL net plugin / SHARP |
+| 4 | `reports_multinode_nccl_artifact_signal_analysis_20260523.md` | 最新 artifacts 信号分析，确认 IB/GDRDMA/HCA 使用情况和 plugin/SHARP 缺口 |
+| 5 | `reports_multinode_nccl_all_collectives_run_20260523.md` | 多机多卡 2x8 六项 collective 补测结果，补齐单机 test all 的 NCCL 覆盖面 |
+| 6 | `reports_multinode_nccl_all_collectives_artifacts_manifest_20260523_120144.md` | 多机多卡 2x8 六项 collective artifacts manifest 和 checksum |
+| 7 | `reports_multinode_nccl_pdf_matrix_run_20260523.md` | 最新正式多机多卡 PDF matrix 结果摘要 |
+| 8 | `reports_multinode_nccl_deep_diagnose_run_20260523.md` | 本轮完整深度诊断复跑结果，包含 counter、GRAPH、PXN sweep |
 
 ## 关键脚本
 
@@ -99,6 +101,7 @@ OUT_DIR=/root/test_gpu_scripts/reports/nccl_deep_diag_plugin_check_$(date +%Y%m%
 
 ```text
 /root/test_gpu_scripts/reports_multinode_nccl_handoff_plan_20260523.md
+/root/test_gpu_scripts/reports_h100_acceptance_current_status_20260523.md
 /root/test_gpu_scripts/reports_multinode_nccl_environment_gap_20260523.md
 /root/test_gpu_scripts/reports_multinode_nccl_artifact_signal_analysis_20260523.md
 /root/test_gpu_scripts/reports_multinode_nccl_all_collectives_run_20260523.md
@@ -228,6 +231,7 @@ PXN disabled sweep 未发现有效参数：
 | 文件 | 说明 |
 |---|---|
 | `reports_multinode_nccl_diagnosis_20260523.md` | 长版总诊断，包含从旧 NCCL/GDR disabled 到 PDF 矩阵对齐的全过程 |
+| `reports_h100_acceptance_current_status_20260523.md` | 当前 H100 验收总览，汇总单节点、多机 NCCL、跨节点 RDMA 和阻塞项 |
 | `reports_multinode_nccl_pdf_matrix_nccl227.md` | 按 PDF 矩阵跑出的正式 raw report |
 | `reports_multinode_nccl_pdf_matrix_20260523_112247.md` | 上一次正式 PDF matrix 原始报告 |
 | `reports_multinode_nccl_pdf_matrix_20260523_113803.md` | 最新带 artifacts 的正式 PDF matrix 原始报告 |
@@ -244,8 +248,9 @@ PXN disabled sweep 未发现有效参数：
 
 ## 给下一位接手人的路线
 
-1. 先读 `reports_multinode_nccl_handoff_plan_20260523.md`。
-2. 用 `reports_multinode_nccl_environment_gap_20260523.md` 和硬件/网络侧确认当前节点是否应具备 8 条 400G rail。
-3. 如果硬件不等价，调整验收口径或换等价节点复测。
-4. 如果硬件确认等价，先补齐 NCCL net plugin / SHARP，再跑 `scripts/multinode_nccl_deep_diagnose.sh graph` 对比 plugin 前后。
-5. alltoall 继续排查时优先找网络路径/ECMP/adaptive routing/拥塞策略，不建议继续盲扫 NCCL 小参数。
+1. 先读 `reports_h100_acceptance_current_status_20260523.md`。
+2. 再读 `reports_multinode_nccl_handoff_plan_20260523.md`。
+3. 用 `reports_multinode_nccl_environment_gap_20260523.md` 和硬件/网络侧确认当前节点是否应具备 8 条 400G rail。
+4. 如果硬件不等价，调整验收口径或换等价节点复测。
+5. 如果硬件确认等价，先补齐 NCCL net plugin / SHARP，再跑 `scripts/multinode_nccl_deep_diagnose.sh graph` 对比 plugin 前后。
+6. alltoall 继续排查时优先找网络路径/ECMP/adaptive routing/拥塞策略，不建议继续盲扫 NCCL 小参数。
-- 
2.47.2


From 1203b025a0c8173c9d9119d381aba58c27604979 Mon Sep 17 00:00:00 2001
From: cs <shi.chen@robotics.cc>
Date: Sat, 23 May 2026 20:22:15 +0800
Subject: [PATCH 33/41] Document H100 acceptance entrypoint

---
 README.md | 77 ++++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 62 insertions(+), 15 deletions(-)

diff --git a/README.md b/README.md
index fd890d4..2555326 100644
--- a/README.md
+++ b/README.md
@@ -6,10 +6,49 @@
 > **支持 GPU 架构：** Ampere (A100/A800) · Hopper (H100/H200) · Blackwell (B200/B300)
 > 系统自动检测 GPU 型号并使用对应的规格参数进行基准对比。
 
+## H100 当前验收入口
+
+当前分支 `h100-acceptance-current` 已补齐 H100 单节点、多节点 NCCL、跨节点 RDMA 的主要证据链。按现有 PDF/配置口径，当前结论仍是 **FAIL**：脚本和证据基本可交付，但机器尚未达到生产验收阈值。
+
+| 优先级 | 文件 | 用途 |
+|---|---|---|
+| 1 | [reports_h100_acceptance_current_status_20260523.md](reports_h100_acceptance_current_status_20260523.md) | 当前总状态：已测项、失败项、阻塞项、下一步 |
+| 2 | [reports_multinode_nccl_latest_index_20260523.md](reports_multinode_nccl_latest_index_20260523.md) | 多节点 NCCL 相关报告索引 |
+| 3 | [reports_multinode_nccl_handoff_plan_20260523.md](reports_multinode_nccl_handoff_plan_20260523.md) | 接手人复跑和继续定位计划 |
+| 4 | [reports_test_all_latest_summary_cn_20260523.md](reports_test_all_latest_summary_cn_20260523.md) | 单节点 `test all` 中文原始汇总 |
+| 5 | [reports_rdma_cross_node_mlx5_0_20260523.md](reports_rdma_cross_node_mlx5_0_20260523.md) | 跨节点 RDMA `mlx5_0` 双向结果 |
+
+当前主要阻塞：
+
+- 单节点 `test all`：两台节点均为 `6/10 PASS`，Compute、NCCL、Stress、RDMA 未过。
+- 跨节点 RDMA：`mlx5_0` 写带宽双向均达到阈值（PASS），但读带宽未达阈值，读/写延迟均超过阈值上限。
+- 多节点 NCCL：`2x8 allreduce`、`2x8 alltoall` 按 PDF 阈值未过；NCCL `wrong_count=0`，主要是性能不达标。
+- 环境差异：当前每节点可用于 NCCL 的 400G IB rail 只有 4 条（`mlx5_0,mlx5_1,mlx5_6,mlx5_7`），未发现外部 NCCL net plugin / SHARP / HCOLL。
+
+### H100 复跑入口
+
+远端默认路径为 `/root/test_gpu_scripts`，建议以 `nccl-gpu-1` 作为发起节点执行多节点测试。
+
+```bash
+# 单节点全量验收，分别在每台机器执行
+bash scripts/run_h100_single_node_all.sh
+
+# 多节点 NCCL PDF 矩阵：allreduce/alltoall x 2x1/2x2/2x4/2x8
+bash scripts/run_multinode_nccl_pdf_matrix.sh
+
+# 多节点 NCCL 六类 collective：2 节点 x 8 GPU
+bash scripts/run_multinode_nccl_all_collectives.sh
+
+# 多节点 NCCL 深度诊断和环境证据抓取
+bash scripts/multinode_nccl_deep_diagnose.sh preflight
+bash scripts/multinode_nccl_deep_diagnose.sh all
+```
+
 ---
 
 ## 目录
 
+- [H100 当前验收入口](#h100-当前验收入口)
 - [项目结构](#项目结构)
 - [环境要求](#环境要求)
 - [快速开始](#快速开始)
@@ -26,23 +65,31 @@
 ## 项目结构
 
 ```
-servertest/
-├── gpu_tester.py               # 主入口：CLI + 交互式菜单
-├── install_deps.sh             # 一键安装三方工具
+test_gpu_scripts/
+├── gpu_tester.py                               # 主入口：CLI + 交互式菜单
+├── install_deps.sh                             # 一键安装三方工具
 ├── configs/
-│   └── default.yaml            # 默认配置
+│   ├── default.yaml                            # 默认配置
+│   ├── multinode_nccl_nccl227_pdf_matrix.yaml  # H100 多节点 PDF 矩阵配置
+│   └── multinode_nccl_nccl227_all_collectives_2x8.yaml  # H100 多节点 2x8 六类 collective 配置
 ├── modules/
-│   ├── gpu_specs.py            # GPU 规格数据库 (A100/A800/H100/H200/B200/B300)
-│   ├── gpu_info.py             # GPU 检测 & 信息
-│   ├── health_check.py         # 健康诊断
-│   ├── benchmark.py            # 内存带宽 + 计算吞吐
-│   ├── nccl_test.py            # NCCL 多卡通信
-│   ├── stress_test.py          # GPU 压力/稳定性
-│   ├── rdma_test.py            # RDMA/InfiniBand
-│   ├── training_sim.py         # 训练模拟
-│   └── report.py               # 报告生成
-├── requirements.txt
-└── 调研.md                     # 行业框架调研
+│   ├── gpu_specs.py                            # GPU 规格数据库
+│   ├── gpu_info.py                             # GPU 检测 & 信息
+│   ├── health_check.py                         # 健康诊断
+│   ├── benchmark.py                            # 内存带宽 + 计算吞吐
+│   ├── nccl_test.py                            # NCCL 多卡/多节点通信
+│   ├── stress_test.py                          # GPU 压力/稳定性
+│   ├── rdma_test.py                            # RDMA/InfiniBand
+│   ├── training_sim.py                         # 训练模拟
+│   └── report.py                               # 报告生成
+├── scripts/
+│   ├── run_h100_single_node_all.sh             # H100 单节点全量复跑
+│   ├── run_multinode_nccl_pdf_matrix.sh        # 多节点 NCCL PDF 矩阵复跑
+│   ├── run_multinode_nccl_all_collectives.sh   # 多节点 NCCL 六类 collective 复跑
+│   └── multinode_nccl_deep_diagnose.sh         # 多节点 NCCL 深度诊断
+├── docs/                                       # 指标说明和 runbook
+├── reports_*20260523*.md                       # 当前 H100 验收证据和汇总报告
+└── requirements.txt
 ```
 
 ---
-- 
2.47.2


From edb4612cc6e1986fb60fda780baa2eef268a5157 Mon Sep 17 00:00:00 2001
From: cs <shi.chen@robotics.cc>
Date: Sat, 23 May 2026 20:25:39 +0800
Subject: [PATCH 34/41] Add H100 acceptance closure checklist

---
 README.md                                     |   9 +-
 ...0_acceptance_closure_checklist_20260523.md | 101 ++++++++++++++++++
 ...h100_acceptance_current_status_20260523.md |  14 +--
 ...ts_multinode_nccl_latest_index_20260523.md |  17 +--
 4 files changed, 124 insertions(+), 17 deletions(-)
 create mode 100644 reports_h100_acceptance_closure_checklist_20260523.md

diff --git a/README.md b/README.md
index 2555326..3050464 100644
--- a/README.md
+++ b/README.md
@@ -13,10 +13,11 @@
 | 优先级 | 文件 | 用途 |
 |---|---|---|
 | 1 | [reports_h100_acceptance_current_status_20260523.md](reports_h100_acceptance_current_status_20260523.md) | 当前总状态：已测项、失败项、阻塞项、下一步 |
-| 2 | [reports_multinode_nccl_latest_index_20260523.md](reports_multinode_nccl_latest_index_20260523.md) | 多节点 NCCL 相关报告索引 |
-| 3 | [reports_multinode_nccl_handoff_plan_20260523.md](reports_multinode_nccl_handoff_plan_20260523.md) | 接手人复跑和继续定位计划 |
-| 4 | [reports_test_all_latest_summary_cn_20260523.md](reports_test_all_latest_summary_cn_20260523.md) | 单节点 `test all` 中文原始汇总 |
-| 5 | [reports_rdma_cross_node_mlx5_0_20260523.md](reports_rdma_cross_node_mlx5_0_20260523.md) | 跨节点 RDMA `mlx5_0` 双向结果 |
+| 2 | [reports_h100_acceptance_closure_checklist_20260523.md](reports_h100_acceptance_closure_checklist_20260523.md) | 收尾检查清单：可交付项、未关闭门禁、最短收尾路径 |
+| 3 | [reports_multinode_nccl_latest_index_20260523.md](reports_multinode_nccl_latest_index_20260523.md) | 多节点 NCCL 相关报告索引 |
+| 4 | [reports_multinode_nccl_handoff_plan_20260523.md](reports_multinode_nccl_handoff_plan_20260523.md) | 接手人复跑和继续定位计划 |
+| 5 | [reports_test_all_latest_summary_cn_20260523.md](reports_test_all_latest_summary_cn_20260523.md) | 单节点 `test all` 中文原始汇总 |
+| 6 | [reports_rdma_cross_node_mlx5_0_20260523.md](reports_rdma_cross_node_mlx5_0_20260523.md) | 跨节点 RDMA `mlx5_0` 双向结果 |
 
 当前主要阻塞：
 
diff --git a/reports_h100_acceptance_closure_checklist_20260523.md b/reports_h100_acceptance_closure_checklist_20260523.md
new file mode 100644
index 0000000..6c30aa8
--- /dev/null
+++ b/reports_h100_acceptance_closure_checklist_20260523.md
@@ -0,0 +1,101 @@
+# H100 验收收尾检查清单 2026-05-23
+
+## 结论
+
+当前项目已经可以进入“阶段性交付/问题转交”状态，但不能进入“生产验收通过”状态。
+
+原因不是测试没跑完，而是当前证据明确显示多个验收门禁仍为 `FAIL`。要真正收尾，必须走完下面两条路径之一：
+
+1. **通过路径：** 修复硬件/网络/软件环境后复跑，单节点、跨节点 RDMA、多节点 NCCL 均达到 PDF/配置阈值。
+2. **例外路径：** 硬件/网络/环境侧书面确认当前机器与 PDF 参考环境不等价，并给出新的验收阈值或豁免口径，再按新口径复核。
+
+在其中任一路径完成前，本项目只能交付“已测证据 + 阻塞定位 + 复跑入口”，不能判定 H100 节点生产验收通过。
+
+## 当前可关闭的工作
+
+| 工作项 | 状态 | 证据 |
+|---|---|---|
+| 单节点 `test all` 入口 | 完成 | `scripts/run_h100_single_node_all.sh` |
+| 单节点中文原始汇总 | 完成 | `reports_test_all_latest_summary_cn_20260523.md` |
+| 跨节点 RDMA 单 rail 证据 | 完成 | `reports_rdma_cross_node_mlx5_0_20260523.md` |
+| 多节点 NCCL PDF matrix | 完成 | `scripts/run_multinode_nccl_pdf_matrix.sh`，`reports_multinode_nccl_pdf_matrix_run_20260523.md` |
+| 多节点 2x8 六项 collective | 完成 | `scripts/run_multinode_nccl_all_collectives.sh`，`reports_multinode_nccl_all_collectives_run_20260523.md` |
+| NCCL artifacts / checksum | 完成 | `reports_multinode_nccl_pdf_matrix_artifacts_manifest_20260523_113803.md`，`reports_multinode_nccl_all_collectives_artifacts_manifest_20260523_120144.md` |
+| 环境等价性分析 | 完成 | `reports_multinode_nccl_environment_gap_20260523.md` |
+| 接手 runbook / README 入口 | 完成 | `README.md`，`reports_multinode_nccl_handoff_plan_20260523.md` |
+
+这些工作可以作为当前阶段交付物归档。
+
+## 不能关闭的验收门禁
+
+| 门禁 | 当前结果 | 现有证据 | 关闭条件 |
+|---|---|---|---|
+| 单节点 Compute | FAIL | 两台机器多 dtype 绝对 TFLOPS 未达阈值，部分 GPU spread 超 3% | 复核阈值/测试实现后重跑通过，或更新阈值口径 |
+| 单节点 NCCL | FAIL | 多 op/size 未达阈值，尤其小包和部分 2G case | 按 PDF/config 逐 size 通过，或明确小包/阈值豁免 |
+| 单节点 Stress | FAIL | 30 分钟可跑满，但温差和 `sw_power_cap` throttle 触发 FAIL | 调整散热/功耗策略或阈值后重跑通过 |
+| 单节点 RDMA | FAIL | read BW 未达 47 GB/s，`mlx5_4/5` 只有 100G | perftest read/write/latency 和端口速率满足验收口径 |
+| 跨节点 RDMA | FAIL | `mlx5_0` 写带宽 PASS，但读带宽和读写 latency FAIL | 双向 write/read BW/latency 全部达标 |
+| 多节点 NCCL allreduce | FAIL | 2x8 `353.85 GB/s`，目标 `491.84 GB/s` | 环境等价后达到 PDF 阈值，或按 4 x 400G rail 重定标 |
+| 多节点 NCCL alltoall | FAIL | 2x8 `36.83 GB/s`，目标 `76.54 GB/s` | 网络/plugin/SHARP/路径修复后达到阈值，或明确新口径 |
+| PDF 环境等价性 | 未证明 | 当前每节点只有 4 条 400G rail，缺外部 NCCL net plugin / SHARP | 确认参考环境 rail/plugin/SHARP/交换策略，并补齐或重定标 |
+
+## 最短收尾路径
+
+### 路径 A：按原 PDF 阈值验收
+
+必须先完成环境补齐：
+
+1. 确认每节点是否应有 8 条 400G IB rail；如果是，先确认 `mlx5_4/5`（100G IB）、`mlx5_2/8`（25G Ethernet）、`mlx5_3/9`（DOWN）中哪些应作为 400G rail，再修复其速率/模式/状态。
+2. 如 PDF 参考环境使用 SHARP、HCOLL、UCX plugin 或 NCCL net plugin，则在两台节点补齐同等组件。
+3. 让网络侧确认跨 Leaf ECMP / adaptive routing / congestion control / credit wait 配置。
+4. 复跑：
+
+```bash
+cd /root/test_gpu_scripts
+bash scripts/run_h100_single_node_all.sh
+bash scripts/run_multinode_nccl_pdf_matrix.sh
+bash scripts/run_multinode_nccl_all_collectives.sh
+```
+
+关闭标准：上述脚本复跑并重测跨节点 RDMA perftest 后，`reports_h100_acceptance_current_status_*.md` 中所有必测项不再有 `FAIL`。
+
+### 路径 B：承认当前环境与 PDF 不等价
+
+必须拿到新的验收口径：
+
+1. 硬件/网络侧确认当前机器实际有效 400G IB rail 数量。
+2. 明确是否允许按 4 x 400G rail 的物理上限重定 allreduce 阈值。
+3. 明确 2x8 alltoall 的合理目标，或要求安装 plugin/SHARP 后再判。
+4. 明确单节点 Compute、Stress、RDMA 的阈值是否沿用 PDF 原口径。
+5. 用新口径更新配置后复跑并生成新报告。
+
+关闭标准：新口径必须写进配置或报告，不能只口头说明。
+
+## 下一步优先级
+
+| 优先级 | 动作 | 负责人建议 | 为什么 |
+|---:|---|---|---|
+| P0 | 确认 PDF 参考环境 rail/plugin/SHARP 状态 | 硬件/网络/环境侧 | 不确认等价性，2x8 allreduce 阈值是否合理无法判断 |
+| P0 | 查跨 Leaf alltoall 网络路径 | 网络侧 | alltoall 低于目标过多，且参数 sweep 无稳定收益 |
+| P1 | 复核单节点 Compute 阈值和测试 dtype 路径 | 测试/平台侧 | 两台机器多 dtype 绝对阈值均失败，需要确认是不是口径问题 |
+| P1 | 处理 Stress `sw_power_cap` 和温差 | 机房/硬件侧 | 压测能跑满，但 telemetry 门禁未过 |
+| P1 | 处理 RDMA read BW/latency | 网络/OFED/固件侧 | 单节点和跨节点 RDMA 都有 read/latency 缺口 |
+| P2 | 启用 plugin/SHARP 后复跑 NCCL graph | 平台侧 | 用于验证 `plugin_missing` 是否消失、图策略是否变化 |
+
+## 当前交付物入口
+
+优先读：
+
+1. `reports_h100_acceptance_current_status_20260523.md`
+2. `reports_h100_acceptance_closure_checklist_20260523.md`
+3. `reports_multinode_nccl_handoff_plan_20260523.md`
+4. `reports_multinode_nccl_environment_gap_20260523.md`
+5. `reports_multinode_nccl_latest_index_20260523.md`
+
+当前项目可以向外汇报为：
+
+```text
+测试脚本、复跑入口、原始 artifacts、checksum 和中文报告已经齐备；
+但当前 H100 生产验收未通过，剩余问题集中在单节点 Compute/NCCL/Stress/RDMA、
+跨节点 RDMA read/latency、多节点 NCCL 2x8 allreduce/alltoall 性能，以及 PDF 环境等价性。
+```
diff --git a/reports_h100_acceptance_current_status_20260523.md b/reports_h100_acceptance_current_status_20260523.md
index 4900f9a..f8cbe3c 100644
--- a/reports_h100_acceptance_current_status_20260523.md
+++ b/reports_h100_acceptance_current_status_20260523.md
@@ -14,6 +14,7 @@
 | 多机多卡 2x8 六项 collective | `reports_multinode_nccl_all_collectives_run_20260523.md` | FAIL / evidence complete | 6 项正确性通过；allreduce/alltoall 按 PDF 阈值 FAIL |
 | NCCL artifacts 信号 | `reports_multinode_nccl_artifact_signal_analysis_20260523.md` | 基础链路正常 | IB/GDRDMA/HCA 均正常；无 SHARP/CollNet/外部 net plugin |
 | 环境等价性 | `reports_multinode_nccl_environment_gap_20260523.md` | 未证明等价 | 每节点只有 4 条 400G rail，缺 NCCL net plugin / SHARP |
+| 收尾检查 | `reports_h100_acceptance_closure_checklist_20260523.md` | 可阶段性交付 | 生产验收门禁仍未关闭 |
 
 ## 已完成的能力
 
@@ -150,9 +151,10 @@ NCCL 日志中没有 SHARP/CollNet 迹象，当前走 internal IB plugin。
 | 顺序 | 文件 | 用途 |
 |---:|---|---|
 | 1 | `reports_h100_acceptance_current_status_20260523.md` | 当前总览和阻塞清单 |
-| 2 | `reports_multinode_nccl_handoff_plan_20260523.md` | 给网络/硬件/环境侧的交接计划 |
-| 3 | `reports_multinode_nccl_environment_gap_20260523.md` | PDF 环境等价性缺口 |
-| 4 | `reports_multinode_nccl_artifact_signal_analysis_20260523.md` | NCCL artifacts 信号分析 |
-| 5 | `reports_multinode_nccl_all_collectives_run_20260523.md` | 多机 2x8 六项 collective 补测摘要 |
-| 6 | `reports_test_all_latest_summary_cn_20260523.md` | 单节点 test all 中文汇总 |
-| 7 | `reports_rdma_cross_node_mlx5_0_20260523.md` | 跨节点 RDMA 单 rail 证据 |
+| 2 | `reports_h100_acceptance_closure_checklist_20260523.md` | 收尾检查清单和关闭条件 |
+| 3 | `reports_multinode_nccl_handoff_plan_20260523.md` | 给网络/硬件/环境侧的交接计划 |
+| 4 | `reports_multinode_nccl_environment_gap_20260523.md` | PDF 环境等价性缺口 |
+| 5 | `reports_multinode_nccl_artifact_signal_analysis_20260523.md` | NCCL artifacts 信号分析 |
+| 6 | `reports_multinode_nccl_all_collectives_run_20260523.md` | 多机 2x8 六项 collective 补测摘要 |
+| 7 | `reports_test_all_latest_summary_cn_20260523.md` | 单节点 test all 中文汇总 |
+| 8 | `reports_rdma_cross_node_mlx5_0_20260523.md` | 跨节点 RDMA 单 rail 证据 |
diff --git a/reports_multinode_nccl_latest_index_20260523.md b/reports_multinode_nccl_latest_index_20260523.md
index 2867b32..5bee9fe 100644
--- a/reports_multinode_nccl_latest_index_20260523.md
+++ b/reports_multinode_nccl_latest_index_20260523.md
@@ -11,6 +11,7 @@
 - 已补充并实跑多机多卡 2x8 六项 collective：`reports_multinode_nccl_all_collectives_run_20260523.md`。新增 `broadcast/reducescatter/allgather/sendrecv` 均 `returncode=0`、`wrong=0`、走 `IB/GDRDMA`；已知 PDF 阈值项 `allreduce/alltoall` 仍 FAIL。
 - 六项 collective 的完整 artifacts 已归档：`reports_multinode_nccl_all_collectives_artifacts_manifest_20260523_120144.md`，远端 tar 为 `reports/multinode_nccl_all_collectives_20260523_120144_artifacts.tar.gz`。
 - 已补充当前验收状态总览：`reports_h100_acceptance_current_status_20260523.md`，把单节点、多机 NCCL、跨节点 RDMA、环境等价性和阻塞项合并到一份中文总表。
+- 已补充收尾检查清单：`reports_h100_acceptance_closure_checklist_20260523.md`，明确哪些工作可以阶段性交付、哪些验收门禁仍不能关闭。
 - 2 机 1/2/4 GPU per node 档位已接近 PDF 参考值，但严格按阈值仍 FAIL。
 - 2 机 8 GPU 档位仍未达到 PDF 参考值：
   - allreduce 实测 `353.85 GB/s busbw`，PDF 目标 `491.84 GB/s`。
@@ -23,13 +24,14 @@
 | 顺序 | 文件 | 用途 |
 |---:|---|---|
 | 1 | `reports_h100_acceptance_current_status_20260523.md` | 当前 H100 验收总览，汇总单节点、多机 NCCL、跨节点 RDMA 和阻塞项 |
-| 2 | `reports_multinode_nccl_handoff_plan_20260523.md` | 给网络/硬件/环境侧的交接计划，包含决策树、要问的问题和复跑命令 |
-| 3 | `reports_multinode_nccl_environment_gap_20260523.md` | 说明当前环境为什么不能证明与 PDF 等价，重点是 4 x 400G rail 和缺少 NCCL net plugin / SHARP |
-| 4 | `reports_multinode_nccl_artifact_signal_analysis_20260523.md` | 最新 artifacts 信号分析，确认 IB/GDRDMA/HCA 使用情况和 plugin/SHARP 缺口 |
-| 5 | `reports_multinode_nccl_all_collectives_run_20260523.md` | 多机多卡 2x8 六项 collective 补测结果，补齐单机 test all 的 NCCL 覆盖面 |
-| 6 | `reports_multinode_nccl_all_collectives_artifacts_manifest_20260523_120144.md` | 多机多卡 2x8 六项 collective artifacts manifest 和 checksum |
-| 7 | `reports_multinode_nccl_pdf_matrix_run_20260523.md` | 最新正式多机多卡 PDF matrix 结果摘要 |
-| 8 | `reports_multinode_nccl_deep_diagnose_run_20260523.md` | 本轮完整深度诊断复跑结果，包含 counter、GRAPH、PXN sweep |
+| 2 | `reports_h100_acceptance_closure_checklist_20260523.md` | 收尾检查清单：可交付项、未关闭门禁、最短收尾路径 |
+| 3 | `reports_multinode_nccl_handoff_plan_20260523.md` | 给网络/硬件/环境侧的交接计划，包含决策树、要问的问题和复跑命令 |
+| 4 | `reports_multinode_nccl_environment_gap_20260523.md` | 说明当前环境为什么不能证明与 PDF 等价，重点是 4 x 400G rail 和缺少 NCCL net plugin / SHARP |
+| 5 | `reports_multinode_nccl_artifact_signal_analysis_20260523.md` | 最新 artifacts 信号分析，确认 IB/GDRDMA/HCA 使用情况和 plugin/SHARP 缺口 |
+| 6 | `reports_multinode_nccl_all_collectives_run_20260523.md` | 多机多卡 2x8 六项 collective 补测结果，补齐单机 test all 的 NCCL 覆盖面 |
+| 7 | `reports_multinode_nccl_all_collectives_artifacts_manifest_20260523_120144.md` | 多机多卡 2x8 六项 collective artifacts manifest 和 checksum |
+| 8 | `reports_multinode_nccl_pdf_matrix_run_20260523.md` | 最新正式多机多卡 PDF matrix 结果摘要 |
+| 9 | `reports_multinode_nccl_deep_diagnose_run_20260523.md` | 本轮完整深度诊断复跑结果，包含 counter、GRAPH、PXN sweep |
 
 ## 关键脚本
 
@@ -102,6 +104,7 @@ OUT_DIR=/root/test_gpu_scripts/reports/nccl_deep_diag_plugin_check_$(date +%Y%m%
 ```text
 /root/test_gpu_scripts/reports_multinode_nccl_handoff_plan_20260523.md
 /root/test_gpu_scripts/reports_h100_acceptance_current_status_20260523.md
+/root/test_gpu_scripts/reports_h100_acceptance_closure_checklist_20260523.md
 /root/test_gpu_scripts/reports_multinode_nccl_environment_gap_20260523.md
 /root/test_gpu_scripts/reports_multinode_nccl_artifact_signal_analysis_20260523.md
 /root/test_gpu_scripts/reports_multinode_nccl_all_collectives_run_20260523.md
-- 
2.47.2


From 639651ef242bb81f5c382f4ba4038bd275bf7501 Mon Sep 17 00:00:00 2001
From: cs <shi.chen@robotics.cc>
Date: Sat, 23 May 2026 20:29:19 +0800
Subject: [PATCH 35/41] Add H100 network escalation request

---
 README.md                                     |   9 +-
 ...0_acceptance_closure_checklist_20260523.md |   8 +-
 ...h100_acceptance_current_status_20260523.md |  14 +-
 ...rk_hardware_escalation_request_20260523.md | 193 ++++++++++++++++++
 ...ts_multinode_nccl_latest_index_20260523.md |  17 +-
 5 files changed, 221 insertions(+), 20 deletions(-)
 create mode 100644 reports_h100_network_hardware_escalation_request_20260523.md

diff --git a/README.md b/README.md
index 3050464..ea763a1 100644
--- a/README.md
+++ b/README.md
@@ -14,10 +14,11 @@
 |---|---|---|
 | 1 | [reports_h100_acceptance_current_status_20260523.md](reports_h100_acceptance_current_status_20260523.md) | 当前总状态：已测项、失败项、阻塞项、下一步 |
 | 2 | [reports_h100_acceptance_closure_checklist_20260523.md](reports_h100_acceptance_closure_checklist_20260523.md) | 收尾检查清单：可交付项、未关闭门禁、最短收尾路径 |
-| 3 | [reports_multinode_nccl_latest_index_20260523.md](reports_multinode_nccl_latest_index_20260523.md) | 多节点 NCCL 相关报告索引 |
-| 4 | [reports_multinode_nccl_handoff_plan_20260523.md](reports_multinode_nccl_handoff_plan_20260523.md) | 接手人复跑和继续定位计划 |
-| 5 | [reports_test_all_latest_summary_cn_20260523.md](reports_test_all_latest_summary_cn_20260523.md) | 单节点 `test all` 中文原始汇总 |
-| 6 | [reports_rdma_cross_node_mlx5_0_20260523.md](reports_rdma_cross_node_mlx5_0_20260523.md) | 跨节点 RDMA `mlx5_0` 双向结果 |
+| 3 | [reports_h100_network_hardware_escalation_request_20260523.md](reports_h100_network_hardware_escalation_request_20260523.md) | 给网络/硬件/环境侧的闭环请求和回填表 |
+| 4 | [reports_multinode_nccl_latest_index_20260523.md](reports_multinode_nccl_latest_index_20260523.md) | 多节点 NCCL 相关报告索引 |
+| 5 | [reports_multinode_nccl_handoff_plan_20260523.md](reports_multinode_nccl_handoff_plan_20260523.md) | 接手人复跑和继续定位计划 |
+| 6 | [reports_test_all_latest_summary_cn_20260523.md](reports_test_all_latest_summary_cn_20260523.md) | 单节点 `test all` 中文原始汇总 |
+| 7 | [reports_rdma_cross_node_mlx5_0_20260523.md](reports_rdma_cross_node_mlx5_0_20260523.md) | 跨节点 RDMA `mlx5_0` 双向结果 |
 
 当前主要阻塞：
 
diff --git a/reports_h100_acceptance_closure_checklist_20260523.md b/reports_h100_acceptance_closure_checklist_20260523.md
index 6c30aa8..670c146 100644
--- a/reports_h100_acceptance_closure_checklist_20260523.md
+++ b/reports_h100_acceptance_closure_checklist_20260523.md
@@ -22,6 +22,7 @@
 | 多节点 2x8 六项 collective | 完成 | `scripts/run_multinode_nccl_all_collectives.sh`，`reports_multinode_nccl_all_collectives_run_20260523.md` |
 | NCCL artifacts / checksum | 完成 | `reports_multinode_nccl_pdf_matrix_artifacts_manifest_20260523_113803.md`，`reports_multinode_nccl_all_collectives_artifacts_manifest_20260523_120144.md` |
 | 环境等价性分析 | 完成 | `reports_multinode_nccl_environment_gap_20260523.md` |
+| 网络/硬件/环境闭环请求 | 完成 | `reports_h100_network_hardware_escalation_request_20260523.md` |
 | 接手 runbook / README 入口 | 完成 | `README.md`，`reports_multinode_nccl_handoff_plan_20260523.md` |
 
 这些工作可以作为当前阶段交付物归档。
@@ -88,9 +89,10 @@ bash scripts/run_multinode_nccl_all_collectives.sh
 
 1. `reports_h100_acceptance_current_status_20260523.md`
 2. `reports_h100_acceptance_closure_checklist_20260523.md`
-3. `reports_multinode_nccl_handoff_plan_20260523.md`
-4. `reports_multinode_nccl_environment_gap_20260523.md`
-5. `reports_multinode_nccl_latest_index_20260523.md`
+3. `reports_h100_network_hardware_escalation_request_20260523.md`
+4. `reports_multinode_nccl_handoff_plan_20260523.md`
+5. `reports_multinode_nccl_environment_gap_20260523.md`
+6. `reports_multinode_nccl_latest_index_20260523.md`
 
 当前项目可以向外汇报为：
 
diff --git a/reports_h100_acceptance_current_status_20260523.md b/reports_h100_acceptance_current_status_20260523.md
index f8cbe3c..8b74012 100644
--- a/reports_h100_acceptance_current_status_20260523.md
+++ b/reports_h100_acceptance_current_status_20260523.md
@@ -15,6 +15,7 @@
 | NCCL artifacts 信号 | `reports_multinode_nccl_artifact_signal_analysis_20260523.md` | 基础链路正常 | IB/GDRDMA/HCA 均正常；无 SHARP/CollNet/外部 net plugin |
 | 环境等价性 | `reports_multinode_nccl_environment_gap_20260523.md` | 未证明等价 | 每节点只有 4 条 400G rail，缺 NCCL net plugin / SHARP |
 | 收尾检查 | `reports_h100_acceptance_closure_checklist_20260523.md` | 可阶段性交付 | 生产验收门禁仍未关闭 |
+| 网络/硬件/环境闭环 | `reports_h100_network_hardware_escalation_request_20260523.md` | 已形成请求 | 等待 rail/plugin/SHARP/交换策略/阈值口径回填 |
 
 ## 已完成的能力
 
@@ -152,9 +153,10 @@ NCCL 日志中没有 SHARP/CollNet 迹象，当前走 internal IB plugin。
 |---:|---|---|
 | 1 | `reports_h100_acceptance_current_status_20260523.md` | 当前总览和阻塞清单 |
 | 2 | `reports_h100_acceptance_closure_checklist_20260523.md` | 收尾检查清单和关闭条件 |
-| 3 | `reports_multinode_nccl_handoff_plan_20260523.md` | 给网络/硬件/环境侧的交接计划 |
-| 4 | `reports_multinode_nccl_environment_gap_20260523.md` | PDF 环境等价性缺口 |
-| 5 | `reports_multinode_nccl_artifact_signal_analysis_20260523.md` | NCCL artifacts 信号分析 |
-| 6 | `reports_multinode_nccl_all_collectives_run_20260523.md` | 多机 2x8 六项 collective 补测摘要 |
-| 7 | `reports_test_all_latest_summary_cn_20260523.md` | 单节点 test all 中文汇总 |
-| 8 | `reports_rdma_cross_node_mlx5_0_20260523.md` | 跨节点 RDMA 单 rail 证据 |
+| 3 | `reports_h100_network_hardware_escalation_request_20260523.md` | 给网络/硬件/环境侧的闭环请求 |
+| 4 | `reports_multinode_nccl_handoff_plan_20260523.md` | 给网络/硬件/环境侧的交接计划 |
+| 5 | `reports_multinode_nccl_environment_gap_20260523.md` | PDF 环境等价性缺口 |
+| 6 | `reports_multinode_nccl_artifact_signal_analysis_20260523.md` | NCCL artifacts 信号分析 |
+| 7 | `reports_multinode_nccl_all_collectives_run_20260523.md` | 多机 2x8 六项 collective 补测摘要 |
+| 8 | `reports_test_all_latest_summary_cn_20260523.md` | 单节点 test all 中文汇总 |
+| 9 | `reports_rdma_cross_node_mlx5_0_20260523.md` | 跨节点 RDMA 单 rail 证据 |
diff --git a/reports_h100_network_hardware_escalation_request_20260523.md b/reports_h100_network_hardware_escalation_request_20260523.md
new file mode 100644
index 0000000..f4a82d5
--- /dev/null
+++ b/reports_h100_network_hardware_escalation_request_20260523.md
@@ -0,0 +1,193 @@
+# H100 网络/硬件/环境侧闭环请求 2026-05-23
+
+## 用途
+
+这份文档用于转交给网络、硬件、机房、环境维护同事，目标是把当前 H100 验收剩余 `FAIL` 从“测试侧已复现”推进到“责任侧确认并闭环”。
+
+当前测试侧已经完成单节点 `test all`、跨节点 RDMA、多节点 NCCL PDF matrix、2x8 六项 collective、NCCL artifacts、checksum 和中文报告。当前不能判生产验收通过，剩余问题需要网络/硬件/环境侧确认。
+
+## 需要对方先读的结论
+
+当前两台机器：
+
+| 角色 | 主机名 | 地址 |
+|---|---|---|
+| nccl-gpu-1 | `aikubeworker0012` | `172.72.8.12` |
+| nccl-gpu-2 | `aikubeworker0016` | `172.72.8.16` |
+
+当前主要阻塞：
+
+| 阻塞 | 当前证据 | 需要确认 |
+|---|---|---|
+| 每节点有效 400G IB rail 只有 4 条 | `mlx5_0,mlx5_1,mlx5_6,mlx5_7` | 这是否符合采购/布线/验收预期 |
+| 其他 HCA 不等价 | `mlx5_4/5` 为 100G IB，`mlx5_2/8` 为 25G Ethernet，`mlx5_3/9` DOWN | 是配置问题、线缆/模块问题、交换端口问题，还是设计如此 |
+| 缺外部 NCCL 网络组件 | 未找到 `libnccl-net*.so*`、`libsharp*.so*`，未见 SHARP/HCOLL 包 | PDF 参考环境是否启用这些组件 |
+| 跨节点 RDMA read/latency 未过 | `ib_read_bw` 约 44.36 GB/s，目标 >= 47 GB/s；latency 也未达阈值 | OFED/固件/BIOS/交换网络/perftest 参数是否需要调整 |
+| 2x8 NCCL allreduce 未达 PDF | `353.85 GB/s` vs `491.84 GB/s` | PDF 目标是否要求更多 rail 或 plugin/SHARP |
+| 2x8 NCCL alltoall 未达 PDF | `36.83 GB/s` vs `76.54 GB/s` | 跨 Leaf ECMP/adaptive routing/congestion control 是否影响多点流量 |
+
+## 对方必须回填的问题
+
+### 1. Rail / 端口 / HCA
+
+请逐项回答：
+
+| 问题 | 回答 |
+|---|---|
+| 这两台机器是否设计为每节点 8 条 400G InfiniBand rail？ |  |
+| 如果是，为什么当前只有 `mlx5_0,mlx5_1,mlx5_6,mlx5_7` 是 400G IB ACTIVE？ |  |
+| `mlx5_4`、`mlx5_5` 为什么只有 100G IB？ |  |
+| `mlx5_2`、`mlx5_8` 为什么是 25G Ethernet？ |  |
+| `mlx5_3`、`mlx5_9` 为什么 DOWN？ |  |
+| 当前 HCA 状态是否符合这批机器的采购/交付规格？ |  |
+| 如果不符合，修复动作和预计完成时间是什么？ |  |
+
+建议在两台节点分别执行并回填输出：
+
+```bash
+hostname
+for d in /sys/class/infiniband/mlx5_*; do
+  dev=$(basename "$d")
+  printf "%s state=%s rate=%s link_layer=%s\n" \
+    "$dev" \
+    "$(cat "$d/ports/1/state" 2>/dev/null)" \
+    "$(cat "$d/ports/1/rate" 2>/dev/null)" \
+    "$(cat "$d/ports/1/link_layer" 2>/dev/null)"
+done
+nvidia-smi topo -m
+```
+
+### 2. PDF 参考环境等价性
+
+请确认 PDF 参考环境到底是什么形态：
+
+| 问题 | 回答 |
+|---|---|
+| PDF 参考环境每节点实际参与 NCCL 的 400G rail 数量是多少？ |  |
+| PDF 参考环境的 HCA 列表是否全部为 400G IB ACTIVE？ |  |
+| PDF 是在同一 Leaf、跨 Leaf，还是其他交换路径下测得？ |  |
+| PDF 是否启用了 adaptive routing / ECMP / congestion control 特定策略？ |  |
+| PDF 是否使用了外部 NCCL net plugin / SHARP / HCOLL / UCX plugin？ |  |
+| 如果当前环境与 PDF 不等价，是否仍要求按 PDF 阈值验收？ |  |
+
+测试侧当前判断：如果 PDF 2x8 allreduce 目标 `491.84 GB/s busbw` 是硬阈值，则按 nccl-tests 的 allreduce 换算 `busbw = algbw * 2(n-1)/n`（n=16 个 rank 时系数为 1.875），其反推 algbw 为：
+
+```text
+491.84 / 1.875 = 262.31 GB/s
+```
+
+当前每节点 4 条 400G rail 的理论单向原始带宽约：
+
+```text
+4 * 400Gb/s / 8 = 200 GB/s
+```
+
+反推 algbw `262.31 GB/s` 高于 4 条 rail 的理论单向原始带宽 `200 GB/s`，因此请明确：当前 4 rail 形态是否允许按 PDF 2x8 allreduce 目标验收。
+
+### 3. NCCL net plugin / SHARP / HCOLL
+
+请逐项回答：
+
+| 问题 | 回答 |
+|---|---|
+| 当前生产验收标准是否要求安装 NCCL net plugin？ |  |
+| 当前生产验收标准是否要求启用 SHARP 或 HCOLL？ |  |
+| 如果要求，安装包来源、版本、安装路径是什么？ |  |
+| 安装后是否需要设置 `LD_LIBRARY_PATH`、`NCCL_NET_PLUGIN`、`NCCL_COLLNET_ENABLE` 等变量？ |  |
+| 如果不要求，是否确认 internal IB plugin 即为验收参考环境？ |  |
+
+建议在两台节点分别执行并回填输出：
+
+```bash
+hostname
+find /usr /opt /root /data -name 'libnccl-net*.so*' -o -name 'libsharp*.so*' 2>/dev/null
+dpkg -l | egrep -i 'sharp|hcoll|nccl|ucx|ofed|doca' || true
+ldconfig -p | egrep -i 'nccl-net|sharp|hcoll|ucx' || true
+```
+
+### 4. 跨节点 RDMA read/latency
+
+当前测试侧证据：
+
+| Direction | Test | Value | Threshold | Status |
+|---|---|---:|---:|---|
+| 0016 -> 0012 | `ib_write_bw` | 49.35 GB/s | >= 47 GB/s | PASS |
+| 0016 -> 0012 | `ib_read_bw` | 44.36 GB/s | >= 47 GB/s | FAIL |
+| 0016 -> 0012 | `ib_write_lat` avg | 2.17 us | <= 2.0 us | FAIL |
+| 0016 -> 0012 | `ib_read_lat` avg | 4.05 us | <= 3.5 us | FAIL |
+| 0012 -> 0016 | `ib_write_bw` | 48.38 GB/s | >= 47 GB/s | PASS |
+| 0012 -> 0016 | `ib_read_bw` | 44.37 GB/s | >= 47 GB/s | FAIL |
+| 0012 -> 0016 | `ib_write_lat` avg | 2.13 us | <= 2.0 us | FAIL |
+| 0012 -> 0016 | `ib_read_lat` avg | 4.08 us | <= 3.5 us | FAIL |
+
+请确认：
+
+| 问题 | 回答 |
+|---|---|
+| 当前 OFED / firmware / BIOS 设置是否符合 400G IB perftest 验收推荐？ |  |
+| read BW 明显低于 write BW 是否符合预期？ |  |
+| 当前 latency 阈值是否适用于跨 Leaf 场景？ |  |
+| 是否需要指定 GID index、MTU、SL、traffic class、PCI relaxed ordering 或其他参数？ |  |
+| 是否能提供网络侧 port counter / credit wait / congestion 证据？ |  |
+
+### 5. alltoall 跨 Leaf 路径
+
+当前测试侧已经做过 NCCL 参数 sweep，`NCCL_PXN_DISABLE=1` 后 rail 更均衡，但 2x8 alltoall 仍只有 `36-37 GB/s`。继续盲调 NCCL 小参数没有明显收益。
+
+请网络侧确认：
+
+| 问题 | 回答 |
+|---|---|
+| 两台机器是否跨 Leaf？ |  |
+| 当前跨 Leaf ECMP hash 是否适合 alltoall 多点到多点流量？ |  |
+| adaptive routing 是否开启？ |  |
+| 是否存在 credit wait、PFC pause、拥塞控制、buffer 或 QoS 策略限制？ |  |
+| 是否能提供 alltoall 运行窗口内的交换机端口 counter？ |  |
+
+## 测试侧可配合复跑的命令
+
+如果网络/硬件/环境侧完成调整，请在 `nccl-gpu-1` 上复跑：
+
+```bash
+cd /root/test_gpu_scripts
+bash scripts/multinode_nccl_deep_diagnose.sh preflight
+bash scripts/run_multinode_nccl_pdf_matrix.sh
+bash scripts/run_multinode_nccl_all_collectives.sh
+```
+
+如果调整了 SHARP/plugin，请额外跑：
+
+```bash
+cd /root/test_gpu_scripts
+OUT_DIR=/root/test_gpu_scripts/reports/nccl_deep_diag_plugin_check_$(date +%Y%m%d_%H%M%S) \
+  bash scripts/multinode_nccl_deep_diagnose.sh graph
+```
+
+如果调整了单节点环境，请分别在两台节点跑：
+
+```bash
+cd /root/test_gpu_scripts
+bash scripts/run_h100_single_node_all.sh
+```
+
+## 测试侧当前交付物
+
+| 文件 | 用途 |
+|---|---|
+| `reports_h100_acceptance_current_status_20260523.md` | 当前总览 |
+| `reports_h100_acceptance_closure_checklist_20260523.md` | 收尾检查清单和关闭条件 |
+| `reports_h100_network_hardware_escalation_request_20260523.md` | 本闭环请求 |
+| `reports_multinode_nccl_environment_gap_20260523.md` | PDF 环境等价性缺口 |
+| `reports_multinode_nccl_handoff_plan_20260523.md` | 复跑和接手计划 |
+| `reports_multinode_nccl_pdf_matrix_run_20260523.md` | 多节点 NCCL PDF matrix 摘要 |
+| `reports_multinode_nccl_all_collectives_run_20260523.md` | 多节点 2x8 六项 collective 摘要 |
+| `reports_rdma_cross_node_mlx5_0_20260523.md` | 跨节点 RDMA 单 rail 证据 |
+
+## 闭环判定
+
+网络/硬件/环境侧需要输出以下任一结论，测试侧才能继续往最终验收推进：
+
+1. **环境修复完成：** 当前两台机器已达到 PDF 参考环境等价状态，请测试侧复跑。
+2. **环境不等价但可接受：** 当前机器规格与 PDF 不同，请按新的阈值/豁免口径复跑；新口径需写入配置或报告。
+3. **硬件/网络异常：** 当前机器或网络不满足交付规格，需要先修复硬件/布线/交换配置。
+4. **参考标准有误：** PDF 阈值不适用于当前场景，需要更新验收标准。
diff --git a/reports_multinode_nccl_latest_index_20260523.md b/reports_multinode_nccl_latest_index_20260523.md
index 5bee9fe..5a7e0af 100644
--- a/reports_multinode_nccl_latest_index_20260523.md
+++ b/reports_multinode_nccl_latest_index_20260523.md
@@ -12,6 +12,7 @@
 - 六项 collective 的完整 artifacts 已归档：`reports_multinode_nccl_all_collectives_artifacts_manifest_20260523_120144.md`，远端 tar 为 `reports/multinode_nccl_all_collectives_20260523_120144_artifacts.tar.gz`。
 - 已补充当前验收状态总览：`reports_h100_acceptance_current_status_20260523.md`，把单节点、多机 NCCL、跨节点 RDMA、环境等价性和阻塞项合并到一份中文总表。
 - 已补充收尾检查清单：`reports_h100_acceptance_closure_checklist_20260523.md`，明确哪些工作可以阶段性交付、哪些验收门禁仍不能关闭。
+- 已补充网络/硬件/环境侧闭环请求：`reports_h100_network_hardware_escalation_request_20260523.md`，用于让责任侧回填 rail、plugin/SHARP、跨 Leaf 和新阈值口径。
 - 2 机 1/2/4 GPU per node 档位已接近 PDF 参考值，但严格按阈值仍 FAIL。
 - 2 机 8 GPU 档位仍未达到 PDF 参考值：
   - allreduce 实测 `353.85 GB/s busbw`，PDF 目标 `491.84 GB/s`。
@@ -25,13 +26,14 @@
 |---:|---|---|
 | 1 | `reports_h100_acceptance_current_status_20260523.md` | 当前 H100 验收总览，汇总单节点、多机 NCCL、跨节点 RDMA 和阻塞项 |
 | 2 | `reports_h100_acceptance_closure_checklist_20260523.md` | 收尾检查清单：可交付项、未关闭门禁、最短收尾路径 |
-| 3 | `reports_multinode_nccl_handoff_plan_20260523.md` | 给网络/硬件/环境侧的交接计划，包含决策树、要问的问题和复跑命令 |
-| 4 | `reports_multinode_nccl_environment_gap_20260523.md` | 说明当前环境为什么不能证明与 PDF 等价，重点是 4 x 400G rail 和缺少 NCCL net plugin / SHARP |
-| 5 | `reports_multinode_nccl_artifact_signal_analysis_20260523.md` | 最新 artifacts 信号分析，确认 IB/GDRDMA/HCA 使用情况和 plugin/SHARP 缺口 |
-| 6 | `reports_multinode_nccl_all_collectives_run_20260523.md` | 多机多卡 2x8 六项 collective 补测结果，补齐单机 test all 的 NCCL 覆盖面 |
-| 7 | `reports_multinode_nccl_all_collectives_artifacts_manifest_20260523_120144.md` | 多机多卡 2x8 六项 collective artifacts manifest 和 checksum |
-| 8 | `reports_multinode_nccl_pdf_matrix_run_20260523.md` | 最新正式多机多卡 PDF matrix 结果摘要 |
-| 9 | `reports_multinode_nccl_deep_diagnose_run_20260523.md` | 本轮完整深度诊断复跑结果，包含 counter、GRAPH、PXN sweep |
+| 3 | `reports_h100_network_hardware_escalation_request_20260523.md` | 给网络/硬件/环境侧的闭环请求和回填表 |
+| 4 | `reports_multinode_nccl_handoff_plan_20260523.md` | 给网络/硬件/环境侧的交接计划，包含决策树、要问的问题和复跑命令 |
+| 5 | `reports_multinode_nccl_environment_gap_20260523.md` | 说明当前环境为什么不能证明与 PDF 等价，重点是 4 x 400G rail 和缺少 NCCL net plugin / SHARP |
+| 6 | `reports_multinode_nccl_artifact_signal_analysis_20260523.md` | 最新 artifacts 信号分析，确认 IB/GDRDMA/HCA 使用情况和 plugin/SHARP 缺口 |
+| 7 | `reports_multinode_nccl_all_collectives_run_20260523.md` | 多机多卡 2x8 六项 collective 补测结果，补齐单机 test all 的 NCCL 覆盖面 |
+| 8 | `reports_multinode_nccl_all_collectives_artifacts_manifest_20260523_120144.md` | 多机多卡 2x8 六项 collective artifacts manifest 和 checksum |
+| 9 | `reports_multinode_nccl_pdf_matrix_run_20260523.md` | 最新正式多机多卡 PDF matrix 结果摘要 |
+| 10 | `reports_multinode_nccl_deep_diagnose_run_20260523.md` | 本轮完整深度诊断复跑结果，包含 counter、GRAPH、PXN sweep |
 
 ## 关键脚本
 
@@ -105,6 +107,7 @@ OUT_DIR=/root/test_gpu_scripts/reports/nccl_deep_diag_plugin_check_$(date +%Y%m%
 /root/test_gpu_scripts/reports_multinode_nccl_handoff_plan_20260523.md
 /root/test_gpu_scripts/reports_h100_acceptance_current_status_20260523.md
 /root/test_gpu_scripts/reports_h100_acceptance_closure_checklist_20260523.md
+/root/test_gpu_scripts/reports_h100_network_hardware_escalation_request_20260523.md
 /root/test_gpu_scripts/reports_multinode_nccl_environment_gap_20260523.md
 /root/test_gpu_scripts/reports_multinode_nccl_artifact_signal_analysis_20260523.md
 /root/test_gpu_scripts/reports_multinode_nccl_all_collectives_run_20260523.md
-- 
2.47.2


From f80a3b36369196ee1e1ab3f13577fb329faf15af Mon Sep 17 00:00:00 2001
From: cs <shi.chen@robotics.cc>
Date: Sat, 23 May 2026 20:34:01 +0800
Subject: [PATCH 36/41] Add H100 acceptance delivery manifest

---
 README.md                                     |  11 +-
 ...0_acceptance_closure_checklist_20260523.md |  10 +-
 ...h100_acceptance_current_status_20260523.md |  16 +-
 ...0_acceptance_delivery_manifest_20260523.md | 149 ++++++++++++++++++
 ...ts_multinode_nccl_latest_index_20260523.md |  19 ++-
 5 files changed, 181 insertions(+), 24 deletions(-)
 create mode 100644 reports_h100_acceptance_delivery_manifest_20260523.md

diff --git a/README.md b/README.md
index ea763a1..80e954d 100644
--- a/README.md
+++ b/README.md
@@ -14,11 +14,12 @@
 |---|---|---|
 | 1 | [reports_h100_acceptance_current_status_20260523.md](reports_h100_acceptance_current_status_20260523.md) | 当前总状态：已测项、失败项、阻塞项、下一步 |
 | 2 | [reports_h100_acceptance_closure_checklist_20260523.md](reports_h100_acceptance_closure_checklist_20260523.md) | 收尾检查清单：可交付项、未关闭门禁、最短收尾路径 |
-| 3 | [reports_h100_network_hardware_escalation_request_20260523.md](reports_h100_network_hardware_escalation_request_20260523.md) | 给网络/硬件/环境侧的闭环请求和回填表 |
-| 4 | [reports_multinode_nccl_latest_index_20260523.md](reports_multinode_nccl_latest_index_20260523.md) | 多节点 NCCL 相关报告索引 |
-| 5 | [reports_multinode_nccl_handoff_plan_20260523.md](reports_multinode_nccl_handoff_plan_20260523.md) | 接手人复跑和继续定位计划 |
-| 6 | [reports_test_all_latest_summary_cn_20260523.md](reports_test_all_latest_summary_cn_20260523.md) | 单节点 `test all` 中文原始汇总 |
-| 7 | [reports_rdma_cross_node_mlx5_0_20260523.md](reports_rdma_cross_node_mlx5_0_20260523.md) | 跨节点 RDMA `mlx5_0` 双向结果 |
+| 3 | [reports_h100_acceptance_delivery_manifest_20260523.md](reports_h100_acceptance_delivery_manifest_20260523.md) | 交付包 manifest：入口、脚本、远端 artifacts、checksum |
+| 4 | [reports_h100_network_hardware_escalation_request_20260523.md](reports_h100_network_hardware_escalation_request_20260523.md) | 给网络/硬件/环境侧的闭环请求和回填表 |
+| 5 | [reports_multinode_nccl_latest_index_20260523.md](reports_multinode_nccl_latest_index_20260523.md) | 多节点 NCCL 相关报告索引 |
+| 6 | [reports_multinode_nccl_handoff_plan_20260523.md](reports_multinode_nccl_handoff_plan_20260523.md) | 接手人复跑和继续定位计划 |
+| 7 | [reports_test_all_latest_summary_cn_20260523.md](reports_test_all_latest_summary_cn_20260523.md) | 单节点 `test all` 中文原始汇总 |
+| 8 | [reports_rdma_cross_node_mlx5_0_20260523.md](reports_rdma_cross_node_mlx5_0_20260523.md) | 跨节点 RDMA `mlx5_0` 双向结果 |
 
 当前主要阻塞：
 
diff --git a/reports_h100_acceptance_closure_checklist_20260523.md b/reports_h100_acceptance_closure_checklist_20260523.md
index 670c146..6b0264f 100644
--- a/reports_h100_acceptance_closure_checklist_20260523.md
+++ b/reports_h100_acceptance_closure_checklist_20260523.md
@@ -22,6 +22,7 @@
 | 多节点 2x8 六项 collective | 完成 | `scripts/run_multinode_nccl_all_collectives.sh`，`reports_multinode_nccl_all_collectives_run_20260523.md` |
 | NCCL artifacts / checksum | 完成 | `reports_multinode_nccl_pdf_matrix_artifacts_manifest_20260523_113803.md`，`reports_multinode_nccl_all_collectives_artifacts_manifest_20260523_120144.md` |
 | 环境等价性分析 | 完成 | `reports_multinode_nccl_environment_gap_20260523.md` |
+| 交付包 manifest | 完成 | `reports_h100_acceptance_delivery_manifest_20260523.md` |
 | 网络/硬件/环境闭环请求 | 完成 | `reports_h100_network_hardware_escalation_request_20260523.md` |
 | 接手 runbook / README 入口 | 完成 | `README.md`，`reports_multinode_nccl_handoff_plan_20260523.md` |
 
@@ -89,10 +90,11 @@ bash scripts/run_multinode_nccl_all_collectives.sh
 
 1. `reports_h100_acceptance_current_status_20260523.md`
 2. `reports_h100_acceptance_closure_checklist_20260523.md`
-3. `reports_h100_network_hardware_escalation_request_20260523.md`
-4. `reports_multinode_nccl_handoff_plan_20260523.md`
-5. `reports_multinode_nccl_environment_gap_20260523.md`
-6. `reports_multinode_nccl_latest_index_20260523.md`
+3. `reports_h100_acceptance_delivery_manifest_20260523.md`
+4. `reports_h100_network_hardware_escalation_request_20260523.md`
+5. `reports_multinode_nccl_handoff_plan_20260523.md`
+6. `reports_multinode_nccl_environment_gap_20260523.md`
+7. `reports_multinode_nccl_latest_index_20260523.md`
 
 当前项目可以向外汇报为：
 
diff --git a/reports_h100_acceptance_current_status_20260523.md b/reports_h100_acceptance_current_status_20260523.md
index 8b74012..0686918 100644
--- a/reports_h100_acceptance_current_status_20260523.md
+++ b/reports_h100_acceptance_current_status_20260523.md
@@ -15,6 +15,7 @@
 | NCCL artifacts 信号 | `reports_multinode_nccl_artifact_signal_analysis_20260523.md` | 基础链路正常 | IB/GDRDMA/HCA 均正常；无 SHARP/CollNet/外部 net plugin |
 | 环境等价性 | `reports_multinode_nccl_environment_gap_20260523.md` | 未证明等价 | 每节点只有 4 条 400G rail，缺 NCCL net plugin / SHARP |
 | 收尾检查 | `reports_h100_acceptance_closure_checklist_20260523.md` | 可阶段性交付 | 生产验收门禁仍未关闭 |
+| 交付包 manifest | `reports_h100_acceptance_delivery_manifest_20260523.md` | 已形成 | 入口、脚本、远端 artifacts、checksum 已汇总 |
 | 网络/硬件/环境闭环 | `reports_h100_network_hardware_escalation_request_20260523.md` | 已形成请求 | 等待 rail/plugin/SHARP/交换策略/阈值口径回填 |
 
 ## 已完成的能力
@@ -153,10 +154,11 @@ NCCL 日志中没有 SHARP/CollNet 迹象，当前走 internal IB plugin。
 |---:|---|---|
 | 1 | `reports_h100_acceptance_current_status_20260523.md` | 当前总览和阻塞清单 |
 | 2 | `reports_h100_acceptance_closure_checklist_20260523.md` | 收尾检查清单和关闭条件 |
-| 3 | `reports_h100_network_hardware_escalation_request_20260523.md` | 给网络/硬件/环境侧的闭环请求 |
-| 4 | `reports_multinode_nccl_handoff_plan_20260523.md` | 给网络/硬件/环境侧的交接计划 |
-| 5 | `reports_multinode_nccl_environment_gap_20260523.md` | PDF 环境等价性缺口 |
-| 6 | `reports_multinode_nccl_artifact_signal_analysis_20260523.md` | NCCL artifacts 信号分析 |
-| 7 | `reports_multinode_nccl_all_collectives_run_20260523.md` | 多机 2x8 六项 collective 补测摘要 |
-| 8 | `reports_test_all_latest_summary_cn_20260523.md` | 单节点 test all 中文汇总 |
-| 9 | `reports_rdma_cross_node_mlx5_0_20260523.md` | 跨节点 RDMA 单 rail 证据 |
+| 3 | `reports_h100_acceptance_delivery_manifest_20260523.md` | 交付包 manifest 和 checksum |
+| 4 | `reports_h100_network_hardware_escalation_request_20260523.md` | 给网络/硬件/环境侧的闭环请求 |
+| 5 | `reports_multinode_nccl_handoff_plan_20260523.md` | 给网络/硬件/环境侧的交接计划 |
+| 6 | `reports_multinode_nccl_environment_gap_20260523.md` | PDF 环境等价性缺口 |
+| 7 | `reports_multinode_nccl_artifact_signal_analysis_20260523.md` | NCCL artifacts 信号分析 |
+| 8 | `reports_multinode_nccl_all_collectives_run_20260523.md` | 多机 2x8 六项 collective 补测摘要 |
+| 9 | `reports_test_all_latest_summary_cn_20260523.md` | 单节点 test all 中文汇总 |
+| 10 | `reports_rdma_cross_node_mlx5_0_20260523.md` | 跨节点 RDMA 单 rail 证据 |
diff --git a/reports_h100_acceptance_delivery_manifest_20260523.md b/reports_h100_acceptance_delivery_manifest_20260523.md
new file mode 100644
index 0000000..1de9278
--- /dev/null
+++ b/reports_h100_acceptance_delivery_manifest_20260523.md
@@ -0,0 +1,149 @@
+# H100 验收交付包 Manifest 2026-05-23
+
+## 交付结论
+
+当前分支：`h100-acceptance-current`
+
+最新 commit：以 `git log -1 --oneline` 为准。
+
+当前状态：**测试侧阶段性交付完成，生产验收未通过。**
+
+本交付包已经覆盖单节点 `test all`、跨节点 RDMA、多节点 NCCL PDF matrix、多节点 2x8 六项 collective、环境等价性分析、网络/硬件/环境闭环请求、复跑脚本和 artifacts checksum。剩余工作需要网络/硬件/环境侧确认后才能继续往最终验收推进。
+
+## 主入口
+
+按下面顺序阅读：
+
+| 顺序 | 文件 | 用途 |
+|---:|---|---|
+| 1 | `README.md` | 仓库入口和 H100 当前验收入口 |
+| 2 | `reports_h100_acceptance_current_status_20260523.md` | 当前总状态和阻塞项 |
+| 3 | `reports_h100_acceptance_closure_checklist_20260523.md` | 可交付项、未关闭门禁、收尾路径 |
+| 4 | `reports_h100_network_hardware_escalation_request_20260523.md` | 给网络/硬件/环境侧的回填请求 |
+| 5 | `reports_multinode_nccl_latest_index_20260523.md` | 多节点 NCCL 报告索引 |
+
+## 核心报告
+
+| 分类 | 文件 | 当前结论 |
+|---|---|---|
+| 总览 | `reports_h100_acceptance_current_status_20260523.md` | FAIL，证据链完整但门禁未过 |
+| 收尾 | `reports_h100_acceptance_closure_checklist_20260523.md` | 可阶段性交付，不能判生产通过 |
+| 闭环请求 | `reports_h100_network_hardware_escalation_request_20260523.md` | 等待网络/硬件/环境侧回填 |
+| 单节点 | `reports_test_all_latest_summary_cn_20260523.md` | 两台均 `6/10 PASS`，整体 FAIL |
+| 跨节点 RDMA | `reports_rdma_cross_node_mlx5_0_20260523.md` | write BW PASS，read BW/latency FAIL |
+| 多节点 NCCL PDF matrix | `reports_multinode_nccl_pdf_matrix_run_20260523.md` | 8 个 case 仅 1 个性能 PASS；正确性均 OK |
+| 多节点 NCCL 六项 collective | `reports_multinode_nccl_all_collectives_run_20260523.md` | 6 项正确性 OK；allreduce/alltoall 按 PDF 阈值 FAIL |
+| 环境等价性 | `reports_multinode_nccl_environment_gap_20260523.md` | 当前不能证明与 PDF 等价 |
+| NCCL artifact 信号 | `reports_multinode_nccl_artifact_signal_analysis_20260523.md` | IB/GDRDMA 正常；缺外部 plugin/SHARP |
+| 接手计划 | `reports_multinode_nccl_handoff_plan_20260523.md` | 给继续定位和复跑的人使用 |
+
+## 可复跑入口
+
+| 脚本 | 用途 | 建议执行位置 |
+|---|---|---|
+| `scripts/run_h100_single_node_all.sh` | 单节点 H100 全量验收 | 两台节点分别执行 |
+| `scripts/run_multinode_nccl_pdf_matrix.sh` | 多节点 NCCL PDF matrix | `nccl-gpu-1` |
+| `scripts/run_multinode_nccl_all_collectives.sh` | 多节点 2x8 六项 collective | `nccl-gpu-1` |
+| `scripts/multinode_nccl_deep_diagnose.sh` | 多节点 NCCL 深度诊断 | `nccl-gpu-1` |
+| `scripts/nccl_environment_snapshot.sh` | 单节点 HCA/plugin/topo 快照 | 两台节点分别执行 |
+
+推荐复跑顺序：
+
+```bash
+cd /root/test_gpu_scripts
+bash scripts/multinode_nccl_deep_diagnose.sh preflight
+bash scripts/run_multinode_nccl_pdf_matrix.sh
+bash scripts/run_multinode_nccl_all_collectives.sh
+```
+
+如果网络/硬件/环境侧调整了单节点条件，还需要分别在两台节点执行：
+
+```bash
+cd /root/test_gpu_scripts
+bash scripts/run_h100_single_node_all.sh
+```
+
+## 远端位置
+
+两台远端默认路径：
+
+```text
+nccl-gpu-1: /root/test_gpu_scripts
+nccl-gpu-2: /root/test_gpu_scripts
+```
+
+最新多节点 NCCL 原始 artifacts 位于 `nccl-gpu-1`：
+
+| 类型 | 路径 |
+|---|---|
+| PDF matrix raw report | `/root/test_gpu_scripts/reports/multinode_nccl_pdf_matrix_20260523_113803.md` |
+| PDF matrix artifacts dir | `/root/test_gpu_scripts/reports/multinode_nccl_pdf_matrix_20260523_113803_artifacts` |
+| PDF matrix artifacts tar | `/root/test_gpu_scripts/reports/multinode_nccl_pdf_matrix_20260523_113803_artifacts.tar.gz` |
+| 六项 collective raw report | `/root/test_gpu_scripts/reports/multinode_nccl_all_collectives_20260523_120144.md` |
+| 六项 collective artifacts dir | `/root/test_gpu_scripts/reports/multinode_nccl_all_collectives_20260523_120144_artifacts` |
+| 六项 collective artifacts tar | `/root/test_gpu_scripts/reports/multinode_nccl_all_collectives_20260523_120144_artifacts.tar.gz` |
+
+## Artifact 校验
+
+PDF matrix bundle checksum：
+
+```text
+682ac637460472d464a0d56ccc0f3335ed7f79a270157a403ebec23b8d9feceb  reports/multinode_nccl_pdf_matrix_20260523_113803.md
+7371fcaf7269f92eb1544e5e63573ebf77f4ae38f668b5b22169ca86e6d603ee  reports/multinode_nccl_pdf_matrix_20260523_113803_artifacts.tar.gz
+```
+
+六项 collective bundle checksum：
+
+```text
+06c565281813c4260da9cfee8f0b0289b61b3be95c01dd670c71fa1a441133e3  reports/multinode_nccl_all_collectives_20260523_120144.md
+fa5961d47a5905da6ebc6c726421d73ddc2314a316a8f578683d31fe69c256e5  reports/multinode_nccl_all_collectives_20260523_120144_artifacts.tar.gz
+```
+
+checksum 与 manifest 文件（含 bundle 和逐文件 checksum）：
+
+| 文件 | 用途 |
+|---|---|
+| `reports_multinode_nccl_all_collectives_20260523_120144_bundle.sha256` | 六项 collective raw report + tar checksum |
+| `reports_multinode_nccl_all_collectives_20260523_120144_artifacts.sha256` | 六项 collective artifacts 逐文件 checksum |
+| `reports_multinode_nccl_pdf_matrix_artifacts_manifest_20260523_113803.md` | PDF matrix case summary + bundle checksum |
+| `reports_multinode_nccl_all_collectives_artifacts_manifest_20260523_120144.md` | 六项 collective case summary + bundle/per-file checksum |
+
+## 入口文件 SHA256
+
+以下 hash 用于确认本地与两台远端入口文件一致。本 manifest 本身不做自引用 hash。
+
+```text
+bf3fd8197285dca964b78c584ee6263b0d0f4d47fbf689d121367666d3398231  README.md
+846c3da4ac655a0b3ad072e4c4475d91b55e2bdc9d8aedb9c5f9d800608fb64c  reports_h100_acceptance_current_status_20260523.md
+4a0ee9f456acc1284bf3a42df5bd338affb831471c27ca4b6584201acd72fd52  reports_h100_acceptance_closure_checklist_20260523.md
+45438db9204ceef5f65019a6594c016f3183799ed3b89dcf40f383a34f9e3466  reports_h100_network_hardware_escalation_request_20260523.md
+d982d6f3698e8860b8505d65105f6056c11f1f72758401a4613ae8315b6f92d0  reports_multinode_nccl_latest_index_20260523.md
+8fca70e703961745d5bdacaa3fccb814709c426c0fa7713d0df2d1f2fb26a3f4  reports_multinode_nccl_handoff_plan_20260523.md
+b0d0d1fa9b1aa0d8cbdd2672508df5c7bafffc91b607b35b199e624352147e75  reports_multinode_nccl_environment_gap_20260523.md
+a7bc27c630fb97c0b83a3427ed4017a16a21e1285f4be5a2cc21f653921fab2b  reports_multinode_nccl_pdf_matrix_run_20260523.md
+60bdb85e087e796d59c6f0cb7e79c7e60b4147b5fff8c6b60606f6c1f53b1b58  reports_multinode_nccl_all_collectives_run_20260523.md
+6affec63694d31dc2d7f097210794e7821e931b8c8b9ac8f451c6f7948bf138a  reports_test_all_latest_summary_cn_20260523.md
+3895cdf040220aa13093c3377c301580120f04eb9958dbb7c3df3d7285c2d733  reports_rdma_cross_node_mlx5_0_20260523.md
+```
+
+## 还不能关闭的事项
+
+| 项目 | 当前阻塞 |
+|---|---|
+| 单节点 Compute | 多 dtype 绝对 TFLOPS 阈值未达，部分 GPU spread 超 3% |
+| 单节点 NCCL | 多 op/size 未达阈值，小包和部分 2G case 明显 |
+| 单节点 Stress | 30 分钟可跑满，但温差和 `sw_power_cap` throttle 触发 FAIL |
+| 单节点 RDMA | read BW 未达 47 GB/s，部分端口不是 400G |
+| 跨节点 RDMA | read BW 和 write/read latency 未达阈值 |
+| 多节点 NCCL allreduce | 2x8 `353.85 GB/s`，PDF 目标 `491.84 GB/s` |
+| 多节点 NCCL alltoall | 2x8 `36.83 GB/s`，PDF 目标 `76.54 GB/s` |
+| PDF 环境等价性 | 当前每节点只有 4 条 400G rail，缺 NCCL net plugin / SHARP 证据 |
+
+## 下一步闭环条件
+
+网络/硬件/环境侧需要给出以下任一结论：
+
+1. 当前两台机器已修复到 PDF 参考环境等价状态，测试侧复跑。
+2. 当前机器与 PDF 参考环境不等价，但可以接受新的阈值或豁免口径。
+3. 当前硬件/网络不满足交付规格，需要先修复。
+4. PDF 阈值不适用于当前跨 Leaf/4 rail/plugin 缺失场景，需要更新验收标准。
diff --git a/reports_multinode_nccl_latest_index_20260523.md b/reports_multinode_nccl_latest_index_20260523.md
index 5a7e0af..129b50d 100644
--- a/reports_multinode_nccl_latest_index_20260523.md
+++ b/reports_multinode_nccl_latest_index_20260523.md
@@ -13,6 +13,7 @@
 - 已补充当前验收状态总览：`reports_h100_acceptance_current_status_20260523.md`，把单节点、多机 NCCL、跨节点 RDMA、环境等价性和阻塞项合并到一份中文总表。
 - 已补充收尾检查清单：`reports_h100_acceptance_closure_checklist_20260523.md`，明确哪些工作可以阶段性交付、哪些验收门禁仍不能关闭。
 - 已补充网络/硬件/环境侧闭环请求：`reports_h100_network_hardware_escalation_request_20260523.md`，用于让责任侧回填 rail、plugin/SHARP、跨 Leaf 和新阈值口径。
+- 已补充交付包 manifest：`reports_h100_acceptance_delivery_manifest_20260523.md`，汇总主入口、脚本、远端 artifacts 和 checksum。
 - 2 机 1/2/4 GPU per node 档位已接近 PDF 参考值，但严格按阈值仍 FAIL。
 - 2 机 8 GPU 档位仍未达到 PDF 参考值：
   - allreduce 实测 `353.85 GB/s busbw`，PDF 目标 `491.84 GB/s`。
@@ -26,14 +27,15 @@
 |---:|---|---|
 | 1 | `reports_h100_acceptance_current_status_20260523.md` | 当前 H100 验收总览，汇总单节点、多机 NCCL、跨节点 RDMA 和阻塞项 |
 | 2 | `reports_h100_acceptance_closure_checklist_20260523.md` | 收尾检查清单：可交付项、未关闭门禁、最短收尾路径 |
-| 3 | `reports_h100_network_hardware_escalation_request_20260523.md` | 给网络/硬件/环境侧的闭环请求和回填表 |
-| 4 | `reports_multinode_nccl_handoff_plan_20260523.md` | 给网络/硬件/环境侧的交接计划，包含决策树、要问的问题和复跑命令 |
-| 5 | `reports_multinode_nccl_environment_gap_20260523.md` | 说明当前环境为什么不能证明与 PDF 等价，重点是 4 x 400G rail 和缺少 NCCL net plugin / SHARP |
-| 6 | `reports_multinode_nccl_artifact_signal_analysis_20260523.md` | 最新 artifacts 信号分析，确认 IB/GDRDMA/HCA 使用情况和 plugin/SHARP 缺口 |
-| 7 | `reports_multinode_nccl_all_collectives_run_20260523.md` | 多机多卡 2x8 六项 collective 补测结果，补齐单机 test all 的 NCCL 覆盖面 |
-| 8 | `reports_multinode_nccl_all_collectives_artifacts_manifest_20260523_120144.md` | 多机多卡 2x8 六项 collective artifacts manifest 和 checksum |
-| 9 | `reports_multinode_nccl_pdf_matrix_run_20260523.md` | 最新正式多机多卡 PDF matrix 结果摘要 |
-| 10 | `reports_multinode_nccl_deep_diagnose_run_20260523.md` | 本轮完整深度诊断复跑结果，包含 counter、GRAPH、PXN sweep |
+| 3 | `reports_h100_acceptance_delivery_manifest_20260523.md` | 交付包 manifest：入口、脚本、远端 artifacts、checksum |
+| 4 | `reports_h100_network_hardware_escalation_request_20260523.md` | 给网络/硬件/环境侧的闭环请求和回填表 |
+| 5 | `reports_multinode_nccl_handoff_plan_20260523.md` | 给网络/硬件/环境侧的交接计划，包含决策树、要问的问题和复跑命令 |
+| 6 | `reports_multinode_nccl_environment_gap_20260523.md` | 说明当前环境为什么不能证明与 PDF 等价，重点是 4 x 400G rail 和缺少 NCCL net plugin / SHARP |
+| 7 | `reports_multinode_nccl_artifact_signal_analysis_20260523.md` | 最新 artifacts 信号分析，确认 IB/GDRDMA/HCA 使用情况和 plugin/SHARP 缺口 |
+| 8 | `reports_multinode_nccl_all_collectives_run_20260523.md` | 多机多卡 2x8 六项 collective 补测结果，补齐单机 test all 的 NCCL 覆盖面 |
+| 9 | `reports_multinode_nccl_all_collectives_artifacts_manifest_20260523_120144.md` | 多机多卡 2x8 六项 collective artifacts manifest 和 checksum |
+| 10 | `reports_multinode_nccl_pdf_matrix_run_20260523.md` | 最新正式多机多卡 PDF matrix 结果摘要 |
+| 11 | `reports_multinode_nccl_deep_diagnose_run_20260523.md` | 本轮完整深度诊断复跑结果，包含 counter、GRAPH、PXN sweep |
 
 ## 关键脚本
 
@@ -107,6 +109,7 @@ OUT_DIR=/root/test_gpu_scripts/reports/nccl_deep_diag_plugin_check_$(date +%Y%m%
 /root/test_gpu_scripts/reports_multinode_nccl_handoff_plan_20260523.md
 /root/test_gpu_scripts/reports_h100_acceptance_current_status_20260523.md
 /root/test_gpu_scripts/reports_h100_acceptance_closure_checklist_20260523.md
+/root/test_gpu_scripts/reports_h100_acceptance_delivery_manifest_20260523.md
 /root/test_gpu_scripts/reports_h100_network_hardware_escalation_request_20260523.md
 /root/test_gpu_scripts/reports_multinode_nccl_environment_gap_20260523.md
 /root/test_gpu_scripts/reports_multinode_nccl_artifact_signal_analysis_20260523.md
-- 
2.47.2


From 4484c731b6ceb6a1f1cdebc27cf019740d6674eb Mon Sep 17 00:00:00 2001
From: cs <shi.chen@robotics.cc>
Date: Sat, 23 May 2026 20:37:19 +0800
Subject: [PATCH 37/41] Add H100 acceptance PR summary

---
 README.md                                     |  11 +-
 ...0_acceptance_delivery_manifest_20260523.md |   9 +-
 ...rts_h100_acceptance_pr_summary_20260523.md | 144 ++++++++++++++++++
 3 files changed, 156 insertions(+), 8 deletions(-)
 create mode 100644 reports_h100_acceptance_pr_summary_20260523.md

diff --git a/README.md b/README.md
index 80e954d..21aad0d 100644
--- a/README.md
+++ b/README.md
@@ -15,11 +15,12 @@
 | 1 | [reports_h100_acceptance_current_status_20260523.md](reports_h100_acceptance_current_status_20260523.md) | 当前总状态：已测项、失败项、阻塞项、下一步 |
 | 2 | [reports_h100_acceptance_closure_checklist_20260523.md](reports_h100_acceptance_closure_checklist_20260523.md) | 收尾检查清单：可交付项、未关闭门禁、最短收尾路径 |
 | 3 | [reports_h100_acceptance_delivery_manifest_20260523.md](reports_h100_acceptance_delivery_manifest_20260523.md) | 交付包 manifest：入口、脚本、远端 artifacts、checksum |
-| 4 | [reports_h100_network_hardware_escalation_request_20260523.md](reports_h100_network_hardware_escalation_request_20260523.md) | 给网络/硬件/环境侧的闭环请求和回填表 |
-| 5 | [reports_multinode_nccl_latest_index_20260523.md](reports_multinode_nccl_latest_index_20260523.md) | 多节点 NCCL 相关报告索引 |
-| 6 | [reports_multinode_nccl_handoff_plan_20260523.md](reports_multinode_nccl_handoff_plan_20260523.md) | 接手人复跑和继续定位计划 |
-| 7 | [reports_test_all_latest_summary_cn_20260523.md](reports_test_all_latest_summary_cn_20260523.md) | 单节点 `test all` 中文原始汇总 |
-| 8 | [reports_rdma_cross_node_mlx5_0_20260523.md](reports_rdma_cross_node_mlx5_0_20260523.md) | 跨节点 RDMA `mlx5_0` 双向结果 |
+| 4 | [reports_h100_acceptance_pr_summary_20260523.md](reports_h100_acceptance_pr_summary_20260523.md) | PR/审阅摘要：变更范围、验证、风险、合并说明 |
+| 5 | [reports_h100_network_hardware_escalation_request_20260523.md](reports_h100_network_hardware_escalation_request_20260523.md) | 给网络/硬件/环境侧的闭环请求和回填表 |
+| 6 | [reports_multinode_nccl_latest_index_20260523.md](reports_multinode_nccl_latest_index_20260523.md) | 多节点 NCCL 相关报告索引 |
+| 7 | [reports_multinode_nccl_handoff_plan_20260523.md](reports_multinode_nccl_handoff_plan_20260523.md) | 接手人复跑和继续定位计划 |
+| 8 | [reports_test_all_latest_summary_cn_20260523.md](reports_test_all_latest_summary_cn_20260523.md) | 单节点 `test all` 中文原始汇总 |
+| 9 | [reports_rdma_cross_node_mlx5_0_20260523.md](reports_rdma_cross_node_mlx5_0_20260523.md) | 跨节点 RDMA `mlx5_0` 双向结果 |
 
 当前主要阻塞：
 
diff --git a/reports_h100_acceptance_delivery_manifest_20260523.md b/reports_h100_acceptance_delivery_manifest_20260523.md
index 1de9278..735b5ea 100644
--- a/reports_h100_acceptance_delivery_manifest_20260523.md
+++ b/reports_h100_acceptance_delivery_manifest_20260523.md
@@ -19,8 +19,9 @@
 | 1 | `README.md` | 仓库入口和 H100 当前验收入口 |
 | 2 | `reports_h100_acceptance_current_status_20260523.md` | 当前总状态和阻塞项 |
 | 3 | `reports_h100_acceptance_closure_checklist_20260523.md` | 可交付项、未关闭门禁、收尾路径 |
-| 4 | `reports_h100_network_hardware_escalation_request_20260523.md` | 给网络/硬件/环境侧的回填请求 |
-| 5 | `reports_multinode_nccl_latest_index_20260523.md` | 多节点 NCCL 报告索引 |
+| 4 | `reports_h100_acceptance_pr_summary_20260523.md` | PR/审阅摘要 |
+| 5 | `reports_h100_network_hardware_escalation_request_20260523.md` | 给网络/硬件/环境侧的回填请求 |
+| 6 | `reports_multinode_nccl_latest_index_20260523.md` | 多节点 NCCL 报告索引 |
 
 ## 核心报告
 
@@ -28,6 +29,7 @@
 |---|---|---|
 | 总览 | `reports_h100_acceptance_current_status_20260523.md` | FAIL，证据链完整但门禁未过 |
 | 收尾 | `reports_h100_acceptance_closure_checklist_20260523.md` | 可阶段性交付，不能判生产通过 |
+| PR 摘要 | `reports_h100_acceptance_pr_summary_20260523.md` | 给代码审阅和合并说明使用 |
 | 闭环请求 | `reports_h100_network_hardware_escalation_request_20260523.md` | 等待网络/硬件/环境侧回填 |
 | 单节点 | `reports_test_all_latest_summary_cn_20260523.md` | 两台均 `6/10 PASS`，整体 FAIL |
 | 跨节点 RDMA | `reports_rdma_cross_node_mlx5_0_20260523.md` | write BW PASS，read BW/latency FAIL |
@@ -113,9 +115,10 @@ fa5961d47a5905da6ebc6c726421d73ddc2314a316a8f578683d31fe69c256e5  reports/multin
 以下 hash 用于确认本地与两台远端入口文件一致。本 manifest 本身不做自引用 hash。
 
 ```text
-bf3fd8197285dca964b78c584ee6263b0d0f4d47fbf689d121367666d3398231  README.md
+e2faf6cbd968924727c669827d7e838d5165ee961133c8e55e8993134b5e7b63  README.md
 846c3da4ac655a0b3ad072e4c4475d91b55e2bdc9d8aedb9c5f9d800608fb64c  reports_h100_acceptance_current_status_20260523.md
 4a0ee9f456acc1284bf3a42df5bd338affb831471c27ca4b6584201acd72fd52  reports_h100_acceptance_closure_checklist_20260523.md
+0c71f36b9b1a6c5a73bd32337a56a702d3faa37c02640b93cb5d00b9b80c362f  reports_h100_acceptance_pr_summary_20260523.md
 45438db9204ceef5f65019a6594c016f3183799ed3b89dcf40f383a34f9e3466  reports_h100_network_hardware_escalation_request_20260523.md
 d982d6f3698e8860b8505d65105f6056c11f1f72758401a4613ae8315b6f92d0  reports_multinode_nccl_latest_index_20260523.md
 8fca70e703961745d5bdacaa3fccb814709c426c0fa7713d0df2d1f2fb26a3f4  reports_multinode_nccl_handoff_plan_20260523.md
diff --git a/reports_h100_acceptance_pr_summary_20260523.md b/reports_h100_acceptance_pr_summary_20260523.md
new file mode 100644
index 0000000..27b6436
--- /dev/null
+++ b/reports_h100_acceptance_pr_summary_20260523.md
@@ -0,0 +1,144 @@
+# H100 验收分支 PR 摘要 2026-05-23
+
+## 建议 PR 标题
+
+```text
+Add H100 acceptance evidence, multinode NCCL runs, and handoff reports
+```
+
+## PR 结论
+
+本 PR 完成 H100 验收测试侧的阶段性交付：脚本、单节点报告、多节点 NCCL 报告、RDMA 证据、artifacts、checksum、中文说明和交接文档已经齐备。
+
+但本 PR **不表示生产验收通过**。当前两台 H100 节点按现有 PDF/配置口径仍为 `FAIL`，需要网络/硬件/环境侧完成回填或修复后再复跑。
+
+## 变更范围
+
+### 测试入口
+
+- 新增/完善单节点 H100 `test all` 入口。
+- 新增多节点 NCCL PDF matrix 复跑入口。
+- 新增多节点 2x8 六项 collective 复跑入口。
+- 新增 NCCL 深度诊断和环境快照入口。
+
+### 配置
+
+- 固定 NCCL 2.27.7 / nccl-tests 路径的多节点 PDF matrix 配置。
+- 新增 2x8 六项 collective 配置。
+- `allreduce/alltoall` 保留已知 PDF 2x8 阈值；新增的 `broadcast/reducescatter/allgather/sendrecv` 暂按证据采集处理。
+
+### 报告和证据
+
+- 单节点 `test all` 中文汇总。
+- 跨节点 RDMA `mlx5_0` 双向证据。
+- 多节点 NCCL PDF matrix 中文摘要、原始报告、artifacts manifest。
+- 多节点 2x8 六项 collective 中文摘要、原始报告、artifacts manifest。
+- NCCL artifact 信号分析、环境等价性分析、handoff 计划、收尾清单。
+- 网络/硬件/环境侧闭环请求和交付包 manifest。
+
+## 当前验收状态
+
+| 范围 | 结论 | 说明 |
+|---|---|---|
+| 单节点 `test all` | FAIL | 两台均 `6/10 PASS`；Compute、NCCL、Stress、RDMA 未过 |
+| 跨节点 RDMA | FAIL | write BW PASS；read BW 和 latency 未达阈值 |
+| 多节点 NCCL PDF matrix | FAIL | 8 个 case 仅 2x2 allreduce 性能 PASS；所有 case 正确性 OK |
+| 多节点 2x8 六项 collective | FAIL / evidence complete | 6 项正确性 OK；allreduce/alltoall 按 PDF 阈值 FAIL |
+| 环境等价性 | 未证明 | 当前每节点只有 4 条 400G rail，缺外部 NCCL net plugin / SHARP 证据 |
+
+## 关键结果
+
+### 单节点
+
+```text
+aikubeworker0012: 6/10 PASS, PDF acceptance FAIL
+aikubeworker0016: 6/10 PASS, PDF acceptance FAIL
+```
+
+### 跨节点 RDMA
+
+```text
+ib_write_bw: 48.38-49.35 GB/s, PASS
+ib_read_bw: 44.36-44.37 GB/s, FAIL
+ib_write_lat avg: 2.13-2.17 us, FAIL
+ib_read_lat avg: 4.05-4.08 us, FAIL
+```
+
+### 多节点 NCCL PDF matrix
+
+| Topology | AllReduce | Target | Status | AllToAll | Target | Status |
+|---|---:|---:|---|---:|---:|---|
+| 2 nodes x 1 GPU | 47.29 | 48.90 | FAIL | 24.85 | 27.25 | FAIL |
+| 2 nodes x 2 GPUs | 137.16 | 136.93 | PASS | 47.76 | 54.41 | FAIL |
+| 2 nodes x 4 GPUs | 335.07 | 335.48 | FAIL | 72.74 | 73.73 | FAIL |
+| 2 nodes x 8 GPUs | 353.85 | 491.84 | FAIL | 36.83 | 76.54 | FAIL |
+
+所有 NCCL case 均 `returncode=0`、`wrong=0`，当前失败来自性能阈值，不是功能错误。
+
+## 主要风险
+
+1. **不能把本 PR 合并理解为验收通过。**
+   当前结果明确是 `FAIL`，本 PR 交付的是证据链和复跑能力。
+
+2. **PDF 2x8 allreduce 阈值可能要求比当前环境更强的 rail/plugin 能力。**
+   当前每节点仅 4 条 400G IB rail；PDF 2x8 allreduce 目标 `491.84 GB/s busbw` 反推 algbw `262.31 GB/s`，高于 4 x 400G rail 的理论单向原始带宽 `200 GB/s`。
+
+3. **alltoall 需要网络侧继续定位。**
+   `NCCL_PXN_DISABLE=1` 后 rail 更均衡，但 2x8 alltoall 仍只有 `36-37 GB/s`。
+
+4. **单节点门禁也仍未过。**
+   即使多节点 NCCL 后续解决，Compute、Stress、RDMA 单节点项仍需闭环。
+
+## 验证方式
+
+已完成：
+
+- `git diff --check`
+- 本地与两台远端入口文件 sha256 核对
+- 多节点 NCCL PDF matrix 复跑并归档 artifacts
+- 多节点 2x8 六项 collective 复跑并归档 artifacts
+- 跨节点 RDMA 单 rail 双向测试
+- 单节点 `test all` 汇总
+
+远端同步路径：
+
+```text
+nccl-gpu-1: /root/test_gpu_scripts
+nccl-gpu-2: /root/test_gpu_scripts
+```
+
+## 复跑命令
+
+```bash
+cd /root/test_gpu_scripts
+bash scripts/multinode_nccl_deep_diagnose.sh preflight
+bash scripts/run_multinode_nccl_pdf_matrix.sh
+bash scripts/run_multinode_nccl_all_collectives.sh
+```
+
+单节点复跑：
+
+```bash
+cd /root/test_gpu_scripts
+bash scripts/run_h100_single_node_all.sh
+```
+
+## Reviewer 重点看
+
+| 文件 | 为什么要看 |
+|---|---|
+| `reports_h100_acceptance_current_status_20260523.md` | 当前总览和失败项 |
+| `reports_h100_acceptance_delivery_manifest_20260523.md` | 交付包入口、远端 artifacts、checksum |
+| `reports_h100_network_hardware_escalation_request_20260523.md` | 需要网络/硬件/环境侧回填的问题 |
+| `reports_multinode_nccl_environment_gap_20260523.md` | 为什么当前环境不能证明与 PDF 等价 |
+| `reports_multinode_nccl_pdf_matrix_run_20260523.md` | 多节点 PDF matrix 结果 |
+| `reports_multinode_nccl_all_collectives_run_20260523.md` | 六项 collective 补测结果 |
+
+## 合并建议
+
+可以合并为测试侧交付分支，但合并说明中必须保留：
+
+```text
+当前 H100 生产验收未通过；本分支交付测试证据、复跑脚本和闭环请求。
+最终验收需等待网络/硬件/环境侧确认或修复后复跑。
+```
-- 
2.47.2


From 4dddab27b3ed4abec644f282c6c24fd01589f7ef Mon Sep 17 00:00:00 2001
From: cs <shi.chen@robotics.cc>
Date: Tue, 26 May 2026 00:13:33 +0800
Subject: [PATCH 38/41] Add FP8 GEMM path comparison reports

---
 reports_cublaslt_fp8_crosscheck_20260524.md   |  87 ++++++
 ...gemm_aikubeworker0012_20260524_071148.json |  21 ++
 ...gemm_aikubeworker0016_20260524_071200.json |  21 ++
 reports_fp8_path_comparison_20260525.md       | 169 ++++++++++
 ...ined_aikubeworker0012_20260525_042347.json | 142 +++++++++
 ...ined_aikubeworker0012_20260525_045408.json | 156 ++++++++++
 ...ined_aikubeworker0016_20260525_042402.json | 142 +++++++++
 ...ined_aikubeworker0016_20260525_050048.json | 156 ++++++++++
 reports_gpu_Test_combined_20260524.md         | 152 +++++++++
 reports_gpu_Test_formal_20260524.md           | 123 ++++++++
 reports_gpu_Test_pdf.css                      | 102 ++++++
 scripts/cublaslt_fp8_gemm_bench.cu            | 291 ++++++++++++++++++
 scripts/pytorch_fp8_path_bench.py             | 277 +++++++++++++++++
 scripts/run_cublaslt_fp8_gemm.sh              |  45 +++
 scripts/run_fp8_path_comparison.sh            |  93 ++++++
 15 files changed, 1977 insertions(+)
 create mode 100644 reports_cublaslt_fp8_crosscheck_20260524.md
 create mode 100644 reports_cublaslt_fp8_gemm_aikubeworker0012_20260524_071148.json
 create mode 100644 reports_cublaslt_fp8_gemm_aikubeworker0016_20260524_071200.json
 create mode 100644 reports_fp8_path_comparison_20260525.md
 create mode 100644 reports_fp8_paths_combined_aikubeworker0012_20260525_042347.json
 create mode 100644 reports_fp8_paths_combined_aikubeworker0012_20260525_045408.json
 create mode 100644 reports_fp8_paths_combined_aikubeworker0016_20260525_042402.json
 create mode 100644 reports_fp8_paths_combined_aikubeworker0016_20260525_050048.json
 create mode 100644 reports_gpu_Test_combined_20260524.md
 create mode 100644 reports_gpu_Test_formal_20260524.md
 create mode 100644 reports_gpu_Test_pdf.css
 create mode 100644 scripts/cublaslt_fp8_gemm_bench.cu
 create mode 100755 scripts/pytorch_fp8_path_bench.py
 create mode 100755 scripts/run_cublaslt_fp8_gemm.sh
 create mode 100755 scripts/run_fp8_path_comparison.sh

diff --git a/reports_cublaslt_fp8_crosscheck_20260524.md b/reports_cublaslt_fp8_crosscheck_20260524.md
new file mode 100644
index 0000000..194a562
--- /dev/null
+++ b/reports_cublaslt_fp8_crosscheck_20260524.md
@@ -0,0 +1,87 @@
+# cuBLASLt FP8 GEMM Cross-Check Report
+
+Date: 2026-05-24
+
+Scope: Validate whether the single-node FP8 compute FAIL is caused by hardware/platform limits or by the original PyTorch `_scaled_mm` benchmark path.
+
+## Method
+
+Added a direct cuBLASLt FP8 GEMM micro-benchmark:
+
+- Source: `scripts/cublaslt_fp8_gemm_bench.cu`
+- Wrapper: `scripts/run_cublaslt_fp8_gemm.sh`
+- Input dtype: `CUDA_R_8F_E4M3`
+- Output dtype: `CUDA_R_16BF`
+- Accumulate / compute type: `CUBLAS_COMPUTE_32F`
+- Layout: cuBLASLt FP8-required TN format
+- Matrix size: `8192`
+- Warmup: `50`
+- Iterations: `500`
+- GPUs: single-node 8 GPUs, measured one GPU at a time
+
+NVIDIA cuBLASLt documentation states FP8 kernels require TN format, `CUBLAS_COMPUTE_32F`, and `CUDA_R_32F` scale type. The implemented benchmark follows those constraints.
+
+## Results
+
+### aikubeworker0012 / nccl-gpu-1
+
+Raw report: `reports_cublaslt_fp8_gemm_aikubeworker0012_20260524_071148.json`
+
+| GPU | FP8 TFLOPS |
+|---:|---:|
+| 0 | 1615.6 |
+| 1 | 1611.0 |
+| 2 | 1599.0 |
+| 3 | 1607.1 |
+| 4 | 1614.0 |
+| 5 | 1604.4 |
+| 6 | 1608.4 |
+| 7 | 1609.1 |
+
+Summary:
+
+- Mean: `1608.6 TFLOPS`
+- Min / Max: `1599.0 / 1615.6 TFLOPS`
+- Spread: `1.03%`
+- FP8 absolute threshold: `>= 1400 TFLOPS`
+- Verdict against FP8 absolute threshold: **PASS**
+- Verdict against 8-GPU consistency threshold `<= 3%`: **PASS**
+
+### aikubeworker0016 / nccl-gpu-2
+
+Raw report: `reports_cublaslt_fp8_gemm_aikubeworker0016_20260524_071200.json`
+
+| GPU | FP8 TFLOPS |
+|---:|---:|
+| 0 | 1602.3 |
+| 1 | 1604.0 |
+| 2 | 1616.9 |
+| 3 | 1610.6 |
+| 4 | 1620.5 |
+| 5 | 1630.3 |
+| 6 | 1605.1 |
+| 7 | 1620.2 |
+
+Summary:
+
+- Mean: `1613.7 TFLOPS`
+- Min / Max: `1602.3 / 1630.3 TFLOPS`
+- Spread: `1.74%`
+- FP8 absolute threshold: `>= 1400 TFLOPS`
+- Verdict against FP8 absolute threshold: **PASS**
+- Verdict against 8-GPU consistency threshold `<= 3%`: **PASS**
+
+## Comparison With Existing PyTorch `_scaled_mm` Result
+
+| Host | PyTorch `_scaled_mm` FP8 | cuBLASLt FP8 | Delta |
+|---|---:|---:|---:|
+| aikubeworker0012 | 1170.4 | 1608.6 | +438.2 |
+| aikubeworker0016 | 1179.5 | 1613.7 | +434.2 |
+
+The cuBLASLt path passes the `>= 1400 TFLOPS` FP8 absolute threshold on both machines, while the original PyTorch `_scaled_mm` path remains around `1170-1180 TFLOPS`.
+
+## Conclusion
+
+The FP8 hardware path is capable of exceeding the configured H100 FP8 acceptance threshold on both machines. The earlier FP8 FAIL is therefore most likely a benchmark implementation issue in the current PyTorch `_scaled_mm` path, not a GPU hardware, power, clock, thermal, MIG, ECC, or Fabric Manager issue.
+
+Recommended next action: replace or augment the existing FP8 compute acceptance item with the cuBLASLt FP8 GEMM cross-check, while keeping the PyTorch `_scaled_mm` result as a secondary software-stack signal.
diff --git a/reports_cublaslt_fp8_gemm_aikubeworker0012_20260524_071148.json b/reports_cublaslt_fp8_gemm_aikubeworker0012_20260524_071148.json
new file mode 100644
index 0000000..b61e641
--- /dev/null
+++ b/reports_cublaslt_fp8_gemm_aikubeworker0012_20260524_071148.json
@@ -0,0 +1,21 @@
+{
+  "source": "cuBLASLt",
+  "dtype": "fp8_e4m3_inputs_bf16_output_fp32_accum",
+  "matrix_size": 8192,
+  "warmup": 50,
+  "iterations": 500,
+  "per_gpu": [
+    {"index": 0, "fp8_tflops": 1615.6},
+    {"index": 1, "fp8_tflops": 1611.0},
+    {"index": 2, "fp8_tflops": 1599.0},
+    {"index": 3, "fp8_tflops": 1607.1},
+    {"index": 4, "fp8_tflops": 1614.0},
+    {"index": 5, "fp8_tflops": 1604.4},
+    {"index": 6, "fp8_tflops": 1608.4},
+    {"index": 7, "fp8_tflops": 1609.1}
+  ],
+  "mean_tflops": 1608.6,
+  "min_tflops": 1599.0,
+  "max_tflops": 1615.6,
+  "spread_pct": 1.03
+}
diff --git a/reports_cublaslt_fp8_gemm_aikubeworker0016_20260524_071200.json b/reports_cublaslt_fp8_gemm_aikubeworker0016_20260524_071200.json
new file mode 100644
index 0000000..6808990
--- /dev/null
+++ b/reports_cublaslt_fp8_gemm_aikubeworker0016_20260524_071200.json
@@ -0,0 +1,21 @@
+{
+  "source": "cuBLASLt",
+  "dtype": "fp8_e4m3_inputs_bf16_output_fp32_accum",
+  "matrix_size": 8192,
+  "warmup": 50,
+  "iterations": 500,
+  "per_gpu": [
+    {"index": 0, "fp8_tflops": 1602.3},
+    {"index": 1, "fp8_tflops": 1604.0},
+    {"index": 2, "fp8_tflops": 1616.9},
+    {"index": 3, "fp8_tflops": 1610.6},
+    {"index": 4, "fp8_tflops": 1620.5},
+    {"index": 5, "fp8_tflops": 1630.3},
+    {"index": 6, "fp8_tflops": 1605.1},
+    {"index": 7, "fp8_tflops": 1620.2}
+  ],
+  "mean_tflops": 1613.7,
+  "min_tflops": 1602.3,
+  "max_tflops": 1630.3,
+  "spread_pct": 1.74
+}
diff --git a/reports_fp8_path_comparison_20260525.md b/reports_fp8_path_comparison_20260525.md
new file mode 100644
index 0000000..c245b15
--- /dev/null
+++ b/reports_fp8_path_comparison_20260525.md
@@ -0,0 +1,169 @@
+# FP8 GEMM 路径对比测试报告
+
+测试日期：2026-05-25  
+测试节点：aikubeworker0012、aikubeworker0016  
+测试 GPU：NVIDIA H100 80GB HBM3  
+测试目标：对比同一 FP8 GEMM 规模下 PyTorch eager、CUDA Graph、Transformer Engine 和 direct cuBLASLt 的性能差异。
+
+## 一、测试结论
+
+本次 A-E 五条路径均已完成实测。
+
+核心结论：
+
+1. direct cuBLASLt 是本组测试里最快路径，两台机器分别达到 1626.6 TFLOPS 和 1598.1 TFLOPS。
+2. PyTorch eager `_scaled_mm` 默认路径约为 1161.9-1186.1 TFLOPS。
+3. 打开 `use_fast_accum=True` 后，PyTorch eager 路径有稳定提升，约提升 5.0%-6.7%。
+4. CUDA Graph + `_scaled_mm(use_fast_accum=True)` 进一步提升到 1277.7-1322.2 TFLOPS，但仍低于 direct cuBLASLt。
+5. Transformer Engine 本次使用的是 `te.Linear` + `fp8_autocast` 路径，不是裸 GEMM，因此包含 TE module、cast、FP8 recipe 等额外开销，结果低于 direct cuBLASLt，也低于 CUDA Graph `_scaled_mm`。
+
+这说明：当前 GPU 硬件和 cuBLASLt 裸 GEMM 能力本身没有问题；之前 PyTorch `_scaled_mm` 1170-1180 TFLOPS 左右的结果，主要反映的是 PyTorch eager 路径和当前 benchmark 方式下的端到端路径性能，而不是 GPU 算力极限。
+
+## 二、测试方法
+
+统一参数：
+
+| 参数 | 值 |
+|---|---:|
+| matrix_size | 8192 |
+| M/N/K | 8192/8192/8192 |
+| warmup | 50（路径 C 的 CUDA Graph 在 JSON 中记录为 3） |
+| iterations | 500 |
+| GPU index | 0 |
+| PyTorch | 2.6.0+cu124 |
+| CUDA | 12.4 |
+| 输入 dtype | FP8 E4M3 |
+| 输出 dtype | BF16 |
+| accumulation | FP32（路径 B/C/E 打开 fast accumulation） |
+| scale_a / scale_b | 1.0 / 1.0 |
+
+测试路径定义：
+
+| 路径 | 名称 | 含义 |
+|---|---|---|
+| A | 当前 eager `_scaled_mm` | PyTorch 立即执行模式调用 `torch._scaled_mm`，默认 accumulation 参数 |
+| B | `_scaled_mm(use_fast_accum=True)` | PyTorch eager 路径，但显式打开 fast accumulation |
+| C | CUDA Graph + `_scaled_mm(use_fast_accum=True)` | 捕获并 replay 同一个 `_scaled_mm` 调用，降低 Python/PyTorch launch 间隙 |
+| D | Transformer Engine FP8 GEMM | `te.Linear` 在 `fp8_autocast` 下执行，包含 TE 层封装和 FP8 recipe 开销 |
+| E | direct cuBLASLt | C++/CUDA 直接调用 `cublasLtMatmul`，绕过 PyTorch eager |
+
+复现脚本：
+
+```bash
+MATRIX_SIZE=8192 WARMUP=50 ITERATIONS=500 GPU_INDEX=0 WORKSPACE_MB=256 \
+  /root/test_gpu_scripts/scripts/run_fp8_path_comparison.sh
+```
+
+## 三、实测结果
+
+### aikubeworker0012
+
+原始 JSON（仓库相对路径）：`reports_fp8_paths_combined_aikubeworker0012_20260525_045408.json`
+
+| 路径 | 状态 | TFLOPS | 单轮 CUDA event 时间 |
+|---|---|---:|---:|
+| A eager `_scaled_mm` default | OK | 1186.1 | 927.014 us |
+| B eager `_scaled_mm` fast_accum | OK | 1266.0 | 868.481 us |
+| C CUDA Graph + fast_accum | OK | 1322.2 | 831.573 us |
+| D Transformer Engine FP8 Linear | OK | 1153.2 | 953.478 us |
+| E direct cuBLASLt fast_accum | OK | 1626.6 | 未在 combined JSON 中记录 |
+
+相对 A 的提升：
+
+| 路径 | 相对 A |
+|---|---:|
+| B | +6.7% |
+| C | +11.5% |
+| D | -2.8% |
+| E | +37.1% |
+
+E 路径 cuBLASLt 算法信息：
+
+| 字段 | 值 |
+|---|---:|
+| algo_id | 52 |
+| tile_id | 23 |
+| splitk | 1 |
+| stages_id | 36 |
+| inner_shape_id | 0 |
+| cluster_shape_id | 3 |
+
+### aikubeworker0016
+
+原始 JSON（仓库相对路径）：`reports_fp8_paths_combined_aikubeworker0016_20260525_050048.json`
+
+| 路径 | 状态 | TFLOPS | 单轮 CUDA event 时间 |
+|---|---|---:|---:|
+| A eager `_scaled_mm` default | OK | 1161.9 | 946.313 us |
+| B eager `_scaled_mm` fast_accum | OK | 1220.4 | 900.960 us |
+| C CUDA Graph + fast_accum | OK | 1277.7 | 860.543 us |
+| D Transformer Engine FP8 Linear | OK | 1125.3 | 977.054 us |
+| E direct cuBLASLt fast_accum | OK | 1598.1 | 未在 combined JSON 中记录 |
+
+相对 A 的提升：
+
+| 路径 | 相对 A |
+|---|---:|
+| B | +5.0% |
+| C | +10.0% |
+| D | -3.2% |
+| E | +37.5% |
+
+E 路径 cuBLASLt 算法信息：
+
+| 字段 | 值 |
+|---|---:|
+| algo_id | 52 |
+| tile_id | 23 |
+| splitk | 1 |
+| stages_id | 36 |
+| inner_shape_id | 0 |
+| cluster_shape_id | 3 |
+
+## 四、对 PyTorch FP8 能否“上去”的判断
+
+从本次结果看，PyTorch FP8 路径可以通过两类方式上去：
+
+1. 打开更快的 math/accumulation 参数，例如 `use_fast_accum=True`。
+2. 使用 CUDA Graph replay，减少 eager 模式下每轮调度、enqueue 之间的间隙。
+
+但在当前 `matrix_size=8192`、单个 `_scaled_mm`、PyTorch eager/Graph benchmark 的测试形态下，PyTorch 路径仍没有达到 direct cuBLASLt 的 1598-1626 TFLOPS。也就是说，direct cuBLASLt 证明硬件和底层库有能力跑得更高；PyTorch eager `_scaled_mm` 测到的是 PyTorch 当前封装路径在这个 shape 下的实际表现。
+
+如果把目标定义为“让 PyTorch 代码路径更接近裸 cuBLASLt”，后续可以继续验证：
+
+1. 更大的 GEMM size，例如 16384。
+2. 固定 shape 后用 `torch.compile` 或 Inductor。
+3. CUDA Graph 覆盖更完整的 step，而不是只 replay 单个 op。
+4. 使用 Transformer Engine 的更底层 GEMM API 或官方 microbenchmark，而不是 `te.Linear` module forward。
+5. 对 `_scaled_mm` 做 Nsight Systems / Nsight Compute 抓取，确认实际 kernel、间隙和 cuBLASLt 算法选择。
+
+## 五、术语说明
+
+`eager` 指 PyTorch 立即执行模式。每次 Python 调用 `torch._scaled_mm`，PyTorch 都会经过 dispatcher、参数检查、Tensor 创建、准备 descriptor、调用 cuBLASLt heuristic，然后把 matmul enqueue 到 CUDA stream。
+
+`cuBLAS` 是 NVIDIA 的基础矩阵乘库。`cuBLASLt` 是更灵活的矩阵乘接口，支持更多 layout、FP8、算法 heuristic、workspace、epilogue 等能力。
+
+`direct cuBLASLt` 指我们自己写 C++/CUDA 直接调用 `cublasLtMatmul`，不经过 PyTorch eager，因此更接近裸 GEMM 峰值。
+
+`CUDA Graph` 指把一次 CUDA work 提前捕获成图，后续直接 replay，减少 CPU 侧反复 launch/调度带来的间隙。
+
+`Transformer Engine` 是 NVIDIA 面向 Transformer/FP8 训练优化的库。本次 D 路径使用的是 `te.Linear` module forward，不等同于裸 GEMM microbenchmark。
+
+## 六、文件清单
+
+脚本（仓库相对路径）：
+
+| 文件 | 用途 |
+|---|---|
+| `scripts/pytorch_fp8_path_bench.py` | A/B/C/D PyTorch 与 Transformer Engine 路径 |
+| `scripts/cublaslt_fp8_gemm_bench.cu` | E direct cuBLASLt 路径 |
+| `scripts/run_fp8_path_comparison.sh` | 统一运行并合并 A-E 结果 |
+
+结果文件（仓库相对路径）：
+
+| 文件 | 用途 |
+|---|---|
+| `reports_fp8_paths_combined_aikubeworker0012_20260525_045408.json` | aikubeworker0012 A-E 原始结果 |
+| `reports_fp8_paths_combined_aikubeworker0016_20260525_050048.json` | aikubeworker0016 A-E 原始结果 |
+| `reports_fp8_path_comparison_20260525.md` | 本中文汇总报告 |
+
diff --git a/reports_fp8_paths_combined_aikubeworker0012_20260525_042347.json b/reports_fp8_paths_combined_aikubeworker0012_20260525_042347.json
new file mode 100644
index 0000000..51a1540
--- /dev/null
+++ b/reports_fp8_paths_combined_aikubeworker0012_20260525_042347.json
@@ -0,0 +1,142 @@
+{
+  "source": "fp8_path_comparison",
+  "host": null,
+  "matrix_size": 8192,
+  "gpu_index": 0,
+  "pytorch": {
+    "source": "pytorch_fp8_path_bench",
+    "torch": "2.6.0+cu124",
+    "cuda": "12.4",
+    "gpu_index": 0,
+    "gpu_name": "NVIDIA H100 80GB HBM3",
+    "matrix_size": 8192,
+    "warmup": 50,
+    "iterations": 500,
+    "results": [
+      {
+        "name": "A_eager_scaled_mm_default",
+        "status": "ok",
+        "matrix_size": 8192,
+        "iterations": 500,
+        "warmup": 50,
+        "event_ms_total": 465.145,
+        "event_us_per_iter": 930.29,
+        "wall_ms_total": 465.21,
+        "tflops": 1181.9
+      },
+      {
+        "name": "B_eager_scaled_mm_fast_accum",
+        "status": "ok",
+        "matrix_size": 8192,
+        "iterations": 500,
+        "warmup": 50,
+        "event_ms_total": 440.252,
+        "event_us_per_iter": 880.504,
+        "wall_ms_total": 440.289,
+        "tflops": 1248.7
+      },
+      {
+        "name": "C_cuda_graph_scaled_mm_fast_accum",
+        "status": "ok",
+        "matrix_size": 8192,
+        "iterations": 500,
+        "warmup": 3,
+        "event_ms_total": 415.631,
+        "event_us_per_iter": 831.262,
+        "wall_ms_total": 415.664,
+        "tflops": 1322.7
+      },
+      {
+        "name": "D_transformer_engine_fp8_linear",
+        "status": "unavailable",
+        "reason": "ModuleNotFoundError: No module named 'transformer_engine'"
+      }
+    ],
+    "summary": {
+      "max_tflops": 1322.7,
+      "min_tflops": 1181.9,
+      "mean_tflops": 1251.1
+    }
+  },
+  "cublaslt": {
+    "source": "cuBLASLt",
+    "dtype": "fp8_e4m3_inputs_bf16_output_fp32_accum",
+    "matrix_size": 8192,
+    "warmup": 50,
+    "iterations": 500,
+    "fast_accum": 1,
+    "per_gpu": [
+      {
+        "index": 0,
+        "fp8_tflops": 1615.4,
+        "algo_id": 52,
+        "tile_id": 23,
+        "splitk": 1,
+        "stages_id": 36,
+        "inner_shape_id": 0,
+        "cluster_shape_id": 3
+      }
+    ],
+    "mean_tflops": 1615.4,
+    "min_tflops": 1615.4,
+    "max_tflops": 1615.4,
+    "spread_pct": 0.0
+  },
+  "results": [
+    {
+      "name": "A_eager_scaled_mm_default",
+      "status": "ok",
+      "matrix_size": 8192,
+      "iterations": 500,
+      "warmup": 50,
+      "event_ms_total": 465.145,
+      "event_us_per_iter": 930.29,
+      "wall_ms_total": 465.21,
+      "tflops": 1181.9
+    },
+    {
+      "name": "B_eager_scaled_mm_fast_accum",
+      "status": "ok",
+      "matrix_size": 8192,
+      "iterations": 500,
+      "warmup": 50,
+      "event_ms_total": 440.252,
+      "event_us_per_iter": 880.504,
+      "wall_ms_total": 440.289,
+      "tflops": 1248.7
+    },
+    {
+      "name": "C_cuda_graph_scaled_mm_fast_accum",
+      "status": "ok",
+      "matrix_size": 8192,
+      "iterations": 500,
+      "warmup": 3,
+      "event_ms_total": 415.631,
+      "event_us_per_iter": 831.262,
+      "wall_ms_total": 415.664,
+      "tflops": 1322.7
+    },
+    {
+      "name": "D_transformer_engine_fp8_linear",
+      "status": "unavailable",
+      "reason": "ModuleNotFoundError: No module named 'transformer_engine'"
+    },
+    {
+      "index": 0,
+      "algo_id": 52,
+      "tile_id": 23,
+      "splitk": 1,
+      "stages_id": 36,
+      "inner_shape_id": 0,
+      "cluster_shape_id": 3,
+      "name": "E_direct_cublaslt_fast_accum",
+      "status": "ok",
+      "tflops": 1615.4,
+      "matrix_size": 8192,
+      "iterations": 500,
+      "warmup": 50,
+      "fast_accum": 1,
+      "note": "Direct cuBLASLt FP8 GEMM, bypasses PyTorch eager."
+    }
+  ]
+}
\ No newline at end of file
diff --git a/reports_fp8_paths_combined_aikubeworker0012_20260525_045408.json b/reports_fp8_paths_combined_aikubeworker0012_20260525_045408.json
new file mode 100644
index 0000000..56cbce5
--- /dev/null
+++ b/reports_fp8_paths_combined_aikubeworker0012_20260525_045408.json
@@ -0,0 +1,156 @@
+{
+  "source": "fp8_path_comparison",
+  "host": null,
+  "matrix_size": 8192,
+  "gpu_index": 0,
+  "pytorch": {
+    "source": "pytorch_fp8_path_bench",
+    "torch": "2.6.0+cu124",
+    "cuda": "12.4",
+    "gpu_index": 0,
+    "gpu_name": "NVIDIA H100 80GB HBM3",
+    "matrix_size": 8192,
+    "warmup": 50,
+    "iterations": 500,
+    "results": [
+      {
+        "name": "A_eager_scaled_mm_default",
+        "status": "ok",
+        "matrix_size": 8192,
+        "iterations": 500,
+        "warmup": 50,
+        "event_ms_total": 463.507,
+        "event_us_per_iter": 927.014,
+        "wall_ms_total": 463.573,
+        "tflops": 1186.1
+      },
+      {
+        "name": "B_eager_scaled_mm_fast_accum",
+        "status": "ok",
+        "matrix_size": 8192,
+        "iterations": 500,
+        "warmup": 50,
+        "event_ms_total": 434.241,
+        "event_us_per_iter": 868.481,
+        "wall_ms_total": 434.492,
+        "tflops": 1266.0
+      },
+      {
+        "name": "C_cuda_graph_scaled_mm_fast_accum",
+        "status": "ok",
+        "matrix_size": 8192,
+        "iterations": 500,
+        "warmup": 3,
+        "event_ms_total": 415.786,
+        "event_us_per_iter": 831.573,
+        "wall_ms_total": 415.825,
+        "tflops": 1322.2
+      },
+      {
+        "name": "D_transformer_engine_fp8_linear",
+        "status": "ok",
+        "matrix_size": 8192,
+        "iterations": 500,
+        "warmup": 50,
+        "event_ms_total": 476.739,
+        "event_us_per_iter": 953.478,
+        "wall_ms_total": 476.8,
+        "tflops": 1153.2,
+        "note": "Transformer Engine Linear forward under fp8_autocast; includes TE module/cast overhead."
+      }
+    ],
+    "summary": {
+      "max_tflops": 1322.2,
+      "min_tflops": 1153.2,
+      "mean_tflops": 1231.9
+    }
+  },
+  "cublaslt": {
+    "source": "cuBLASLt",
+    "dtype": "fp8_e4m3_inputs_bf16_output_fp32_accum",
+    "matrix_size": 8192,
+    "warmup": 50,
+    "iterations": 500,
+    "fast_accum": 1,
+    "per_gpu": [
+      {
+        "index": 0,
+        "fp8_tflops": 1626.6,
+        "algo_id": 52,
+        "tile_id": 23,
+        "splitk": 1,
+        "stages_id": 36,
+        "inner_shape_id": 0,
+        "cluster_shape_id": 3
+      }
+    ],
+    "mean_tflops": 1626.6,
+    "min_tflops": 1626.6,
+    "max_tflops": 1626.6,
+    "spread_pct": 0.0
+  },
+  "results": [
+    {
+      "name": "A_eager_scaled_mm_default",
+      "status": "ok",
+      "matrix_size": 8192,
+      "iterations": 500,
+      "warmup": 50,
+      "event_ms_total": 463.507,
+      "event_us_per_iter": 927.014,
+      "wall_ms_total": 463.573,
+      "tflops": 1186.1
+    },
+    {
+      "name": "B_eager_scaled_mm_fast_accum",
+      "status": "ok",
+      "matrix_size": 8192,
+      "iterations": 500,
+      "warmup": 50,
+      "event_ms_total": 434.241,
+      "event_us_per_iter": 868.481,
+      "wall_ms_total": 434.492,
+      "tflops": 1266.0
+    },
+    {
+      "name": "C_cuda_graph_scaled_mm_fast_accum",
+      "status": "ok",
+      "matrix_size": 8192,
+      "iterations": 500,
+      "warmup": 3,
+      "event_ms_total": 415.786,
+      "event_us_per_iter": 831.573,
+      "wall_ms_total": 415.825,
+      "tflops": 1322.2
+    },
+    {
+      "name": "D_transformer_engine_fp8_linear",
+      "status": "ok",
+      "matrix_size": 8192,
+      "iterations": 500,
+      "warmup": 50,
+      "event_ms_total": 476.739,
+      "event_us_per_iter": 953.478,
+      "wall_ms_total": 476.8,
+      "tflops": 1153.2,
+      "note": "Transformer Engine Linear forward under fp8_autocast; includes TE module/cast overhead."
+    },
+    {
+      "index": 0,
+      "algo_id": 52,
+      "tile_id": 23,
+      "splitk": 1,
+      "stages_id": 36,
+      "inner_shape_id": 0,
+      "cluster_shape_id": 3,
+      "name": "E_direct_cublaslt_fast_accum",
+      "status": "ok",
+      "tflops": 1626.6,
+      "matrix_size": 8192,
+      "iterations": 500,
+      "warmup": 50,
+      "fast_accum": 1,
+      "note": "Direct cuBLASLt FP8 GEMM, bypasses PyTorch eager."
+    }
+  ]
+}
\ No newline at end of file
diff --git a/reports_fp8_paths_combined_aikubeworker0016_20260525_042402.json b/reports_fp8_paths_combined_aikubeworker0016_20260525_042402.json
new file mode 100644
index 0000000..6d6a3a2
--- /dev/null
+++ b/reports_fp8_paths_combined_aikubeworker0016_20260525_042402.json
@@ -0,0 +1,142 @@
+{
+  "source": "fp8_path_comparison",
+  "host": null,
+  "matrix_size": 8192,
+  "gpu_index": 0,
+  "pytorch": {
+    "source": "pytorch_fp8_path_bench",
+    "torch": "2.6.0+cu124",
+    "cuda": "12.4",
+    "gpu_index": 0,
+    "gpu_name": "NVIDIA H100 80GB HBM3",
+    "matrix_size": 8192,
+    "warmup": 50,
+    "iterations": 500,
+    "results": [
+      {
+        "name": "A_eager_scaled_mm_default",
+        "status": "ok",
+        "matrix_size": 8192,
+        "iterations": 500,
+        "warmup": 50,
+        "event_ms_total": 470.909,
+        "event_us_per_iter": 941.817,
+        "wall_ms_total": 470.974,
+        "tflops": 1167.4
+      },
+      {
+        "name": "B_eager_scaled_mm_fast_accum",
+        "status": "ok",
+        "matrix_size": 8192,
+        "iterations": 500,
+        "warmup": 50,
+        "event_ms_total": 452.608,
+        "event_us_per_iter": 905.215,
+        "wall_ms_total": 452.647,
+        "tflops": 1214.6
+      },
+      {
+        "name": "C_cuda_graph_scaled_mm_fast_accum",
+        "status": "ok",
+        "matrix_size": 8192,
+        "iterations": 500,
+        "warmup": 3,
+        "event_ms_total": 427.724,
+        "event_us_per_iter": 855.449,
+        "wall_ms_total": 427.768,
+        "tflops": 1285.3
+      },
+      {
+        "name": "D_transformer_engine_fp8_linear",
+        "status": "unavailable",
+        "reason": "ModuleNotFoundError: No module named 'transformer_engine'"
+      }
+    ],
+    "summary": {
+      "max_tflops": 1285.3,
+      "min_tflops": 1167.4,
+      "mean_tflops": 1222.4
+    }
+  },
+  "cublaslt": {
+    "source": "cuBLASLt",
+    "dtype": "fp8_e4m3_inputs_bf16_output_fp32_accum",
+    "matrix_size": 8192,
+    "warmup": 50,
+    "iterations": 500,
+    "fast_accum": 1,
+    "per_gpu": [
+      {
+        "index": 0,
+        "fp8_tflops": 1594.3,
+        "algo_id": 52,
+        "tile_id": 23,
+        "splitk": 1,
+        "stages_id": 36,
+        "inner_shape_id": 0,
+        "cluster_shape_id": 3
+      }
+    ],
+    "mean_tflops": 1594.3,
+    "min_tflops": 1594.3,
+    "max_tflops": 1594.3,
+    "spread_pct": 0.0
+  },
+  "results": [
+    {
+      "name": "A_eager_scaled_mm_default",
+      "status": "ok",
+      "matrix_size": 8192,
+      "iterations": 500,
+      "warmup": 50,
+      "event_ms_total": 470.909,
+      "event_us_per_iter": 941.817,
+      "wall_ms_total": 470.974,
+      "tflops": 1167.4
+    },
+    {
+      "name": "B_eager_scaled_mm_fast_accum",
+      "status": "ok",
+      "matrix_size": 8192,
+      "iterations": 500,
+      "warmup": 50,
+      "event_ms_total": 452.608,
+      "event_us_per_iter": 905.215,
+      "wall_ms_total": 452.647,
+      "tflops": 1214.6
+    },
+    {
+      "name": "C_cuda_graph_scaled_mm_fast_accum",
+      "status": "ok",
+      "matrix_size": 8192,
+      "iterations": 500,
+      "warmup": 3,
+      "event_ms_total": 427.724,
+      "event_us_per_iter": 855.449,
+      "wall_ms_total": 427.768,
+      "tflops": 1285.3
+    },
+    {
+      "name": "D_transformer_engine_fp8_linear",
+      "status": "unavailable",
+      "reason": "ModuleNotFoundError: No module named 'transformer_engine'"
+    },
+    {
+      "index": 0,
+      "algo_id": 52,
+      "tile_id": 23,
+      "splitk": 1,
+      "stages_id": 36,
+      "inner_shape_id": 0,
+      "cluster_shape_id": 3,
+      "name": "E_direct_cublaslt_fast_accum",
+      "status": "ok",
+      "tflops": 1594.3,
+      "matrix_size": 8192,
+      "iterations": 500,
+      "warmup": 50,
+      "fast_accum": 1,
+      "note": "Direct cuBLASLt FP8 GEMM, bypasses PyTorch eager."
+    }
+  ]
+}
\ No newline at end of file
diff --git a/reports_fp8_paths_combined_aikubeworker0016_20260525_050048.json b/reports_fp8_paths_combined_aikubeworker0016_20260525_050048.json
new file mode 100644
index 0000000..7168c05
--- /dev/null
+++ b/reports_fp8_paths_combined_aikubeworker0016_20260525_050048.json
@@ -0,0 +1,156 @@
+{
+  "source": "fp8_path_comparison",
+  "host": null,
+  "matrix_size": 8192,
+  "gpu_index": 0,
+  "pytorch": {
+    "source": "pytorch_fp8_path_bench",
+    "torch": "2.6.0+cu124",
+    "cuda": "12.4",
+    "gpu_index": 0,
+    "gpu_name": "NVIDIA H100 80GB HBM3",
+    "matrix_size": 8192,
+    "warmup": 50,
+    "iterations": 500,
+    "results": [
+      {
+        "name": "A_eager_scaled_mm_default",
+        "status": "ok",
+        "matrix_size": 8192,
+        "iterations": 500,
+        "warmup": 50,
+        "event_ms_total": 473.156,
+        "event_us_per_iter": 946.313,
+        "wall_ms_total": 473.199,
+        "tflops": 1161.9
+      },
+      {
+        "name": "B_eager_scaled_mm_fast_accum",
+        "status": "ok",
+        "matrix_size": 8192,
+        "iterations": 500,
+        "warmup": 50,
+        "event_ms_total": 450.48,
+        "event_us_per_iter": 900.96,
+        "wall_ms_total": 450.505,
+        "tflops": 1220.4
+      },
+      {
+        "name": "C_cuda_graph_scaled_mm_fast_accum",
+        "status": "ok",
+        "matrix_size": 8192,
+        "iterations": 500,
+        "warmup": 3,
+        "event_ms_total": 430.272,
+        "event_us_per_iter": 860.543,
+        "wall_ms_total": 430.304,
+        "tflops": 1277.7
+      },
+      {
+        "name": "D_transformer_engine_fp8_linear",
+        "status": "ok",
+        "matrix_size": 8192,
+        "iterations": 500,
+        "warmup": 50,
+        "event_ms_total": 488.527,
+        "event_us_per_iter": 977.054,
+        "wall_ms_total": 488.576,
+        "tflops": 1125.3,
+        "note": "Transformer Engine Linear forward under fp8_autocast; includes TE module/cast overhead."
+      }
+    ],
+    "summary": {
+      "max_tflops": 1277.7,
+      "min_tflops": 1125.3,
+      "mean_tflops": 1196.3
+    }
+  },
+  "cublaslt": {
+    "source": "cuBLASLt",
+    "dtype": "fp8_e4m3_inputs_bf16_output_fp32_accum",
+    "matrix_size": 8192,
+    "warmup": 50,
+    "iterations": 500,
+    "fast_accum": 1,
+    "per_gpu": [
+      {
+        "index": 0,
+        "fp8_tflops": 1598.1,
+        "algo_id": 52,
+        "tile_id": 23,
+        "splitk": 1,
+        "stages_id": 36,
+        "inner_shape_id": 0,
+        "cluster_shape_id": 3
+      }
+    ],
+    "mean_tflops": 1598.1,
+    "min_tflops": 1598.1,
+    "max_tflops": 1598.1,
+    "spread_pct": 0.0
+  },
+  "results": [
+    {
+      "name": "A_eager_scaled_mm_default",
+      "status": "ok",
+      "matrix_size": 8192,
+      "iterations": 500,
+      "warmup": 50,
+      "event_ms_total": 473.156,
+      "event_us_per_iter": 946.313,
+      "wall_ms_total": 473.199,
+      "tflops": 1161.9
+    },
+    {
+      "name": "B_eager_scaled_mm_fast_accum",
+      "status": "ok",
+      "matrix_size": 8192,
+      "iterations": 500,
+      "warmup": 50,
+      "event_ms_total": 450.48,
+      "event_us_per_iter": 900.96,
+      "wall_ms_total": 450.505,
+      "tflops": 1220.4
+    },
+    {
+      "name": "C_cuda_graph_scaled_mm_fast_accum",
+      "status": "ok",
+      "matrix_size": 8192,
+      "iterations": 500,
+      "warmup": 3,
+      "event_ms_total": 430.272,
+      "event_us_per_iter": 860.543,
+      "wall_ms_total": 430.304,
+      "tflops": 1277.7
+    },
+    {
+      "name": "D_transformer_engine_fp8_linear",
+      "status": "ok",
+      "matrix_size": 8192,
+      "iterations": 500,
+      "warmup": 50,
+      "event_ms_total": 488.527,
+      "event_us_per_iter": 977.054,
+      "wall_ms_total": 488.576,
+      "tflops": 1125.3,
+      "note": "Transformer Engine Linear forward under fp8_autocast; includes TE module/cast overhead."
+    },
+    {
+      "index": 0,
+      "algo_id": 52,
+      "tile_id": 23,
+      "splitk": 1,
+      "stages_id": 36,
+      "inner_shape_id": 0,
+      "cluster_shape_id": 3,
+      "name": "E_direct_cublaslt_fast_accum",
+      "status": "ok",
+      "tflops": 1598.1,
+      "matrix_size": 8192,
+      "iterations": 500,
+      "warmup": 50,
+      "fast_accum": 1,
+      "note": "Direct cuBLASLt FP8 GEMM, bypasses PyTorch eager."
+    }
+  ]
+}
\ No newline at end of file
diff --git a/reports_gpu_Test_combined_20260524.md b/reports_gpu_Test_combined_20260524.md
new file mode 100644
index 0000000..b4fff0a
--- /dev/null
+++ b/reports_gpu_Test_combined_20260524.md
@@ -0,0 +1,152 @@
+# GPU_Test 合并报告
+
+- **日期:** 2026-05-24
+- **节点:** `aikubeworker0012 / 172.72.8.12`，`aikubeworker0016 / 172.72.8.16`
+- **GPU:** NVIDIA H100 80GB HBM3 x8 / node
+- **范围:** 单机单卡算力与多机多卡 NCCL 通信
+- **说明:** 本报告汇总既有原始测试结果，不重新启动额外压力测试。
+
+## 总体结论
+
+| 测试项 | 结论 | 说明 |
+|---|---|---|
+| 单机 GPU 识别 | PASS | 两台机器均识别 8 张 H100 80GB HBM3 |
+| 单机单卡 FP8 硬件算力 | PASS | direct cuBLASLt FP8 GEMM 两台机器均超过 `>= 1400 TFLOPS` |
+| PyTorch `_scaled_mm` FP8 路径 | FAIL / 软件栈信号 | 原始结果为 `1146.2 / 1148.8 TFLOPS`（见下文“原始 PyTorch 单机算力结果”表），复测约 `1170-1180 TFLOPS`，均低于阈值；已定位为 PyTorch eager / `_scaled_mm` benchmark 路径偏低，不作为硬件失败依据 |
+| 多机多卡 NCCL 正确性 | PASS | return code `0`，`Wrong=0` / `Out of bounds values: 0 OK` |
+| 多机多卡 NCCL 性能 | 符合当前 4x400Gbps 网络形态 | 2x8 allreduce / alltoall 低于 PDF 8x400Gbps 阈值，但该阈值不应直接硬套到当前 4x400Gbps 环境 |
+
+## 单机单卡 / 算力测试
+
+### 机器信息
+
+| Host | GPU | Driver | CUDA | GPU 数量 |
+|---|---|---|---|---:|
+| `aikubeworker0012` | NVIDIA H100 80GB HBM3 | 580.159.03 | 13.0 | 8 |
+| `aikubeworker0016` | NVIDIA H100 80GB HBM3 | 580.159.03 | 13.0 | 8 |
+
+来源：
+
+- `reports_single_gpu_aikubeworker0012.md`
+- `reports_single_gpu_aikubeworker0016.md`
+
+### 原始 PyTorch 单机算力结果
+
+| Host | FP32 | TF32 | FP16 | BF16 | FP8 `_scaled_mm` | 原始 Verdict |
+|---|---:|---:|---:|---:|---:|---|
+| `aikubeworker0012` | 52.0 | 362.3 | 691.0 | 713.0 | 1148.8 | FAIL |
+| `aikubeworker0016` | 51.9 | 357.8 | 667.2 | 699.1 | 1146.2 | FAIL |
+
+原始 PyTorch 路径使用 `torch._scaled_mm` 做 FP8 GEMM。后续复查显示，该路径会受到 PyTorch eager dispatch、输出 Tensor 创建、cuBLASLt heuristic 路径、默认 `use_fast_accum=False` 等因素影响，不能直接代表 H100 FP8 Tensor Core 硬件上限。
+
+### direct cuBLASLt FP8 GEMM 交叉验证
+
+测试参数：
+
+| 参数 | 值 |
+|---|---|
+| Benchmark | direct cuBLASLt FP8 GEMM |
+| Source | `scripts/cublaslt_fp8_gemm_bench.cu` |
+| Matrix | `8192 x 8192 x 8192` |
+| A/B dtype | FP8 E4M3 |
+| Output dtype | BF16 |
+| Compute type | `CUBLAS_COMPUTE_32F` |
+| Scale type | `CUDA_R_32F` |
+| Scale A/B | `1.0` |
+| Layout | TN |
+| fast accumulation | enabled |
+| Threshold | `>= 1400 TFLOPS` |
+
+结果：
+
+| Host | Mean FP8 TFLOPS | Min | Max | Spread | Threshold | Verdict |
+|---|---:|---:|---:|---:|---:|---|
+| `aikubeworker0012` | 1608.6 | 1599.0 | 1615.6 | 1.03% | >= 1400 | PASS |
+| `aikubeworker0016` | 1613.7 | 1602.3 | 1630.3 | 1.74% | >= 1400 | PASS |
+
+单卡逐张结果：
+
+| Host | GPU0 | GPU1 | GPU2 | GPU3 | GPU4 | GPU5 | GPU6 | GPU7 |
+|---|---:|---:|---:|---:|---:|---:|---:|---:|
+| `aikubeworker0012` | 1615.6 | 1611.0 | 1599.0 | 1607.1 | 1614.0 | 1604.4 | 1608.4 | 1609.1 |
+| `aikubeworker0016` | 1602.3 | 1604.0 | 1616.9 | 1610.6 | 1620.5 | 1630.3 | 1605.1 | 1620.2 |
+
+结论：direct cuBLASLt FP8 GEMM 已通过 `>= 1400 TFLOPS` 阈值，说明两台机器的 FP8 硬件计算路径具备达标能力。PyTorch `_scaled_mm` 的 FAIL 更适合作为软件栈 benchmark 路径问题记录，而不是 GPU 硬件失败结论。
+
+来源：
+
+- `reports_cublaslt_fp8_crosscheck_20260524.md`
+- `reports_cublaslt_fp8_gemm_aikubeworker0012_20260524_071148.json`
+- `reports_cublaslt_fp8_gemm_aikubeworker0016_20260524_071200.json`
+
+## 多机多卡 NCCL 测试
+
+### 测试环境
+
+| 项目 | 结果 |
+|---|---|
+| Hosts | `nccl-gpu-1(172.72.8.12)`，`nccl-gpu-2(172.72.8.16)` |
+| Topology | 2 nodes x 8 GPUs，合计 16 GPUs |
+| NCCL source | `nccl-tests-mpirun` |
+| NCCL network | IB |
+| GPU Direct RDMA | ENABLED |
+| Active HCA rails | `mlx5_0, mlx5_1, mlx5_6, mlx5_7` |
+| HCA speed | 4 条 `400 Gb/sec (4X NDR)` ACTIVE |
+
+注意：NCCL 表里的 `GB/s` 是大 B，即 Bytes/s。IB 网卡口径 `400 Gb/s` 是小 b，即 bits/s。
+
+### 2x8 全集合通信结果
+
+| Operation | Peak Bus BW | Avg Bus BW | PDF 8x400Gbps Threshold | Correctness | 当前 4x400Gbps 口径 |
+|---|---:|---:|---:|---|---|
+| allreduce | 354.27 GB/s | 354.45 GB/s | >= 491.84 GB/s | PASS | 符合当前硬件形态，低于 PDF 8 rail 阈值 |
+| alltoall | 37.00 GB/s | 37.14 GB/s | >= 76.54 GB/s | PASS | 符合当前硬件形态，低于 PDF 8 rail 阈值 |
+| broadcast | 191.65 GB/s | 190.25 GB/s | 未配置 PDF 阈值 | PASS | PASS / 仅记录 |
+| reducescatter | 192.75 GB/s | 192.74 GB/s | 未配置 PDF 阈值 | PASS | PASS / 仅记录 |
+| allgather | 192.14 GB/s | 192.47 GB/s | 未配置 PDF 阈值 | PASS | PASS / 仅记录 |
+| sendrecv | 26.98 GB/s | 26.97 GB/s | 未配置 PDF 阈值 | PASS | PASS / 仅记录 |
+
+结论：2x8 全集合通信测试中，NCCL 正确性通过。allreduce 和 alltoall 低于 PDF 8x400Gbps 参考阈值，但当前机器确认参与 NCCL 的是 4 条 400Gbps rail，因此该差距不应直接判定为当前 4x400Gbps 环境不合格。
+
+来源：
+
+- `reports_multinode_nccl_all_collectives_20260523_120144.md`
+- `reports_multinode_nccl_all_collectives_artifacts_manifest_20260523_120144.md`
+
+### PDF Matrix allreduce / alltoall 结果
+
+AllReduce（PDF 8x400Gbps 阈值对比，仅作参考）:
+
+| Topology | Peak Bus BW | Avg Bus BW | PDF 8x400Gbps Threshold | Gap | 当前解释 |
+|---|---:|---:|---:|---:|---|
+| 2 nodes x 1 GPU | 47.29 GB/s | 47.26 GB/s | >= 48.90 GB/s | -1.61 GB/s | 接近 PDF 阈值 |
+| 2 nodes x 2 GPUs | 137.16 GB/s | 137.13 GB/s | >= 136.93 GB/s | +0.23 GB/s | 达到 PDF 阈值 |
+| 2 nodes x 4 GPUs | 335.07 GB/s | 335.02 GB/s | >= 335.48 GB/s | -0.41 GB/s | 接近 PDF 阈值 |
+| 2 nodes x 8 GPUs | 353.85 GB/s | 353.85 GB/s | >= 491.84 GB/s | -137.99 GB/s | 低于 PDF 8 rail 阈值；当前为 4 rail 环境，不直接判不合格 |
+
+AllToAll（PDF 8x400Gbps 阈值对比，仅作参考）:
+
+| Topology | Peak Bus BW | Avg Bus BW | PDF 8x400Gbps Threshold | Gap | 当前解释 |
+|---|---:|---:|---:|---:|---|
+| 2 nodes x 1 GPU | 24.85 GB/s | 24.90 GB/s | >= 27.25 GB/s | -2.40 GB/s | 接近 PDF 阈值 |
+| 2 nodes x 2 GPUs | 47.76 GB/s | 47.98 GB/s | >= 54.41 GB/s | -6.65 GB/s | 低于 PDF 8 rail 阈值 |
+| 2 nodes x 4 GPUs | 72.74 GB/s | 72.80 GB/s | >= 73.73 GB/s | -0.99 GB/s | 接近 PDF 阈值 |
+| 2 nodes x 8 GPUs | 36.83 GB/s | 36.85 GB/s | >= 76.54 GB/s | -39.71 GB/s | 低于 PDF 8 rail 阈值；当前为 4 rail 环境，不直接判不合格 |
+
+来源：
+
+- `reports_multinode_nccl_pdf_matrix_run_20260523.md`
+- `reports_multinode_nccl_pdf_matrix_20260523_113803.md`
+
+## 风险与判断
+
+1. 单机 FP8 硬件能力通过 direct cuBLASLt 验证，当前不支持将 PyTorch `_scaled_mm` FAIL 直接判定为 GPU 硬件故障。
+2. 多机 NCCL 正确性通过，性能结果应按当前 4x400Gbps rail 环境解释。
+3. 当前多机环境确认参与 NCCL 的是 4 条 400G IB rail；PDF 参考环境为 8x400G 计算管理网络，因此 2x8 阈值与当前硬件形态不等价。
+4. 2x8 allreduce 和 alltoall 低于 PDF 8 rail 阈值，建议作为“与 PDF 参考环境差异”记录，而不是作为当前 4 rail 环境不合格结论。
+
+## 建议
+
+1. 单机 FP8 验收以 direct cuBLASLt 或 Transformer Engine GEMM benchmark 为主，PyTorch `_scaled_mm` 作为软件栈参考项保留。
+2. 多机 NCCL 后续若要按 PDF 阈值验收，需要先对齐 PDF 参考环境的 8x400Gbps rail 数量、NCCL net plugin / SHARP、跨 Leaf 交换策略、ECMP / 拥塞控制配置。
+3. 对外报告建议明确区分 `GB/s` 与 `Gb/s`：NCCL bus bandwidth 是大 B，IB 端口速率是小 b。
diff --git a/reports_gpu_Test_formal_20260524.md b/reports_gpu_Test_formal_20260524.md
new file mode 100644
index 0000000..65969b2
--- /dev/null
+++ b/reports_gpu_Test_formal_20260524.md
@@ -0,0 +1,123 @@
+# GPU_Test 双节点测试报告
+
+- **测试日期:** 2026-05-24
+- **测试节点:** `aikubeworker0012 / 172.72.8.12`，`aikubeworker0016 / 172.72.8.16`
+- **节点配置:** 每节点 8 张 NVIDIA H100 80GB HBM3 GPU
+- **测试范围:** 单机算力、单机 8 卡通信、多机 2x8 GPU 通信
+- **网络形态:** 当前参与 NCCL 的计算网络为 4 条 400Gbps IB rail
+
+## 结论摘要
+
+| 项目 | 结果摘要 |
+|---|---|
+| GPU 识别 | 两台节点均识别 8 张 H100 80GB HBM3 GPU |
+| 单机 FP8 GEMM | 两台节点 direct cuBLASLt FP8 GEMM 节点均值均超过 1600 TFLOPS（单卡范围 1599.0–1630.3 TFLOPS） |
+| 单机 8 卡 NCCL | 两台节点单机 8 卡 NCCL 集合通信均可正常完成，主要大包通信带宽稳定 |
+| 多机 2x8 NCCL | 两节点 16 GPU NCCL 正确性通过，所有测试 `Wrong=0` / return code `0` |
+| 多机网络口径 | 当前为 4x400Gbps IB rail 环境，结果按该硬件形态解释 |
+
+## 测试环境
+
+| Host | GPU | Driver | CUDA | GPU 数量 |
+|---|---|---|---|---:|
+| `aikubeworker0012` | NVIDIA H100 80GB HBM3 | 580.159.03 | 13.0 | 8 |
+| `aikubeworker0016` | NVIDIA H100 80GB HBM3 | 580.159.03 | 13.0 | 8 |
+
+## 单机算力测试
+
+### FP8 GEMM 硬件路径验证
+
+本项使用 direct cuBLASLt FP8 GEMM benchmark，绕过 PyTorch eager 调度路径，直接验证 GPU FP8 Tensor Core 与 cuBLASLt GEMM 能力。
+
+| 参数 | 配置 |
+|---|---|
+| GEMM shape | `8192 x 8192 x 8192` |
+| 输入类型 | FP8 E4M3 |
+| 输出类型 | BF16 |
+| 累加类型 | FP32 compute |
+| Layout | TN |
+| Scale | `scale_a = 1.0`，`scale_b = 1.0` |
+| fast accumulation | enabled |
+| 测试 GPU | 每节点 8 张 GPU 逐张测试 |
+
+| Host | Mean FP8 TFLOPS | Min | Max | Spread |
+|---|---:|---:|---:|---:|
+| `aikubeworker0012` | 1608.6 | 1599.0 | 1615.6 | 1.03% |
+| `aikubeworker0016` | 1613.7 | 1602.3 | 1630.3 | 1.74% |
+
+| Host | GPU0 | GPU1 | GPU2 | GPU3 | GPU4 | GPU5 | GPU6 | GPU7 |
+|---|---:|---:|---:|---:|---:|---:|---:|---:|
+| `aikubeworker0012` | 1615.6 | 1611.0 | 1599.0 | 1607.1 | 1614.0 | 1604.4 | 1608.4 | 1609.1 |
+| `aikubeworker0016` | 1602.3 | 1604.0 | 1616.9 | 1610.6 | 1620.5 | 1630.3 | 1605.1 | 1620.2 |
+
+**说明:** PyTorch `_scaled_mm` eager benchmark 结果约为 1170-1180 TFLOPS，该结果反映 PyTorch 软件路径与调度开销，不作为本报告的硬件算力结论。
+
+## 单机 8 卡 NCCL 通信测试
+
+本项在单个节点内使用 8 张 GPU 进行 NCCL 集合通信测试，结果单位为 `GB/s`，即 Bytes/s。
+
+| Operation | `aikubeworker0012` Bus BW | `aikubeworker0016` Bus BW |
+|---|---:|---:|
+| allreduce | 472.3 GB/s | 472.4 GB/s |
+| alltoall | 343.3 GB/s | 344.3 GB/s |
+| broadcast | 364.1 GB/s | 363.6 GB/s |
+| reducescatter | 352.8 GB/s | 353.1 GB/s |
+| allgather | 366.4 GB/s | 366.4 GB/s |
+| sendrecv | 369.0 GB/s | 368.9 GB/s |
+
+**说明:** 单机 8 卡通信主要依赖节点内 GPU 互联与 NCCL collective 实现。两台节点的同类 operation 结果接近，节点间差异较小。
+
+## 多机 2x8 NCCL 通信测试
+
+本项使用两台节点，每台 8 张 GPU，共 16 张 GPU 进行跨节点 NCCL 集合通信测试。
+
+### 网络环境
+
+| 项目 | 配置 |
+|---|---|
+| Host A | `aikubeworker0012 / 172.72.8.12` |
+| Host B | `aikubeworker0016 / 172.72.8.16` |
+| 拓扑 | 2 nodes x 8 GPUs |
+| NCCL network | IB |
+| GPU Direct RDMA | ENABLED |
+| Active rails | `mlx5_0, mlx5_1, mlx5_6, mlx5_7` |
+| Rail 速率 | 4 条 `400 Gb/sec (4X NDR)` ACTIVE |
+
+### 跨节点 NCCL 结果
+
+| Operation | Peak Bus BW | Avg Bus BW | Correctness |
+|---|---:|---:|---|
+| allreduce | 354.27 GB/s | 354.45 GB/s | PASS |
+| alltoall | 37.00 GB/s | 37.14 GB/s | PASS |
+| broadcast | 191.65 GB/s | 190.25 GB/s | PASS |
+| reducescatter | 192.75 GB/s | 192.74 GB/s | PASS |
+| allgather | 192.14 GB/s | 192.47 GB/s | PASS |
+| sendrecv | 26.98 GB/s | 26.97 GB/s | PASS |
+
+**正确性:** 本轮多机 NCCL 测试 return code 为 `0`，`Wrong=0`，未发现数据正确性错误。
+
+## 单位说明
+
+| 写法 | 含义 | 说明 |
+|---|---|---|
+| `GB/s` | Gigabytes per second | 大 B，字节每秒，NCCL bus bandwidth 使用此单位 |
+| `Gbps` / `Gb/s` | Gigabits per second | 小 b，比特每秒，IB 端口速率通常使用此单位 |
+
+换算关系：
+
+```text
+1 Byte = 8 bits
+400 Gb/s = 50 GB/s
+4 x 400 Gb/s = 1600 Gb/s = 200 GB/s 物理链路字节带宽
+```
+
+NCCL 的 `busbw` 是 collective 通信的逻辑折算带宽，不等同于单条物理链路的线速。
+
+## 结果说明
+
+1. 两台节点 GPU 识别正常，均为 8 张 H100 80GB HBM3。
+2. direct cuBLASLt FP8 GEMM 显示两台节点单卡 FP8 算力的节点均值均超过 1600 TFLOPS（单卡最低 1599.0 TFLOPS，最高 1630.3 TFLOPS），GPU FP8 硬件计算路径正常。
+3. 单机 8 卡 NCCL 通信在两台节点上结果接近，未观察到明显节点间异常差异。
+4. 多机 2x8 NCCL 正确性通过，跨节点通信功能正常。
+5. 当前多机通信结果应按 4x400Gbps IB rail 环境解释；若后续需要对齐 8x400Gbps 环境，应先确认 rail 数量、NCCL net plugin / SHARP、交换网络策略等配置一致。
+
diff --git a/reports_gpu_Test_pdf.css b/reports_gpu_Test_pdf.css
new file mode 100644
index 0000000..8ef6d39
--- /dev/null
+++ b/reports_gpu_Test_pdf.css
@@ -0,0 +1,102 @@
+/* Paged-media setup: A4 sheets in landscape orientation with a 13mm margin.
+   NOTE(review): presumably applied when rendering the GPU_Test markdown
+   reports to PDF (inferred from the file name) - confirm against the
+   export command. */
+@page {
+  size: A4 landscape;
+  margin: 13mm;
+}
+
+body {
+  color: #111827;
+  font-family: "PingFang SC", "Heiti SC", "Arial Unicode MS", sans-serif;
+  font-size: 11px;
+  line-height: 1.45;
+}
+
+h1 {
+  color: #0f172a;
+  font-size: 24px;
+  margin: 0 0 14px;
+}
+
+h2 {
+  border-bottom: 1px solid #cbd5e1;
+  color: #0f172a;
+  font-size: 17px;
+  margin: 24px 0 10px;
+  padding-bottom: 4px;
+}
+
+h3 {
+  color: #1f2937;
+  font-size: 13px;
+  margin: 16px 0 8px;
+}
+
+p {
+  margin: 7px 0;
+}
+
+code {
+  background: #f1f5f9;
+  border-radius: 3px;
+  color: #0f172a;
+  font-family: Menlo, Consolas, monospace;
+  font-size: 10px;
+  padding: 1px 3px;
+}
+
+pre {
+  background: #f8fafc;
+  border: 1px solid #e2e8f0;
+  border-radius: 4px;
+  padding: 8px;
+  white-space: pre-wrap;
+}
+
+/* Tables span the full content width and are allowed to break across pages. */
+table {
+  border-collapse: collapse;
+  margin: 8px 0 14px;
+  page-break-inside: auto;
+  width: 100%;
+}
+
+/* Keep the header rows as a table header group so paged renderers can repeat
+   them on every page a long table spans. */
+thead {
+  display: table-header-group;
+}
+
+/* Ask the renderer not to split a single row across two pages. */
+tr {
+  page-break-inside: avoid;
+}
+
+th,
+td {
+  border: 1px solid #cbd5e1;
+  padding: 5px 6px;
+  text-align: left;
+  vertical-align: middle;
+  word-break: break-word;
+}
+
+th {
+  background: #e2e8f0;
+  color: #0f172a;
+  font-weight: 700;
+}
+
+tbody tr:nth-child(even) td {
+  background: #f8fafc;
+}
+
+a {
+  color: #2563eb;
+  text-decoration: none;
+}
+
+ul,
+ol {
+  margin: 6px 0 10px 20px;
+  padding: 0;
+}
+
+li {
+  margin: 3px 0;
+}
+
diff --git a/scripts/cublaslt_fp8_gemm_bench.cu b/scripts/cublaslt_fp8_gemm_bench.cu
new file mode 100644
index 0000000..a401f36
--- /dev/null
+++ b/scripts/cublaslt_fp8_gemm_bench.cu
@@ -0,0 +1,291 @@
+#include <cublasLt.h>
+#include <cuda_bf16.h>
+#include <cuda_fp8.h>
+#include <cuda_runtime.h>
+
+#include <algorithm>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <numeric>
+#include <string>
+#include <vector>
+
+// Evaluates a CUDA runtime call once; if it does not return cudaSuccess,
+// prints the file, line and cudaGetErrorString() text to stderr and terminates
+// the process with exit status 1. The do/while(0) wrapper makes the macro
+// behave like a single statement.
+#define CHECK_CUDA(call)                                                       \
+  do {                                                                         \
+    cudaError_t status = (call);                                               \
+    if (status != cudaSuccess) {                                               \
+      std::fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,       \
+                   cudaGetErrorString(status));                                \
+      std::exit(1);                                                            \
+    }                                                                          \
+  } while (0)
+
+// Evaluates a cuBLAS/cuBLASLt call once; if it does not return
+// CUBLAS_STATUS_SUCCESS, prints the file, line and numeric status code to
+// stderr and terminates the process with exit status 1.
+#define CHECK_CUBLAS(call)                                                     \
+  do {                                                                         \
+    cublasStatus_t status = (call);                                            \
+    if (status != CUBLAS_STATUS_SUCCESS) {                                     \
+      std::fprintf(stderr, "cuBLASLt error %s:%d: status=%d\n", __FILE__,      \
+                   __LINE__, static_cast<int>(status));                        \
+      std::exit(1);                                                            \
+    }                                                                          \
+  } while (0)
+
+// Fills `count` FP8 (E4M3) elements starting at `ptr` with `value`.
+//
+// Each thread starts at its global index and advances by the total number of
+// launched threads, so any launch configuration covers the whole buffer.
+__global__ void fill_fp8(__nv_fp8_e4m3 *ptr, size_t count, float value) {
+  // blockIdx/blockDim/gridDim components are 32-bit unsigned; widen before
+  // multiplying so the index and stride cannot wrap for very large launches.
+  const size_t tid =
+      static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+  const size_t stride = static_cast<size_t>(blockDim.x) * gridDim.x;
+  // The float -> FP8 conversion is loop-invariant, so do it once.
+  const __nv_fp8_e4m3 fp8_value(value);
+  for (size_t i = tid; i < count; i += stride) {
+    ptr[i] = fp8_value;
+  }
+}
+
+// Command-line options of the benchmark, with their defaults.
+struct Args {
+  // Used for all three GEMM dimensions (m = n = k).
+  int matrix_size = 8192;
+  // Untimed matmul calls issued before the timed loop.
+  int warmup = 20;
+  // Timed matmul calls; the FLOP count is scaled by this value.
+  int iterations = 200;
+  // Index of the first CUDA device to benchmark.
+  int first_gpu = 0;
+  // Number of consecutive devices to benchmark; a negative value is replaced
+  // in main() by "all devices from first_gpu onwards".
+  int gpu_count = -1;
+  // cuBLASLt workspace size in MiB (multiplied by 1024*1024 to get bytes).
+  size_t workspace_mb = 256;
+  // Non-zero enables CUBLASLT_MATMUL_DESC_FAST_ACCUM.
+  int fast_accum = 1;
+};
+
+// Parses `text` as a base-10 integer value for the option `name`.
+// Prints a diagnostic and exits with status 2 unless the entire string is a
+// number inside [min_value, max_value].
+static long long parse_int_option(const char *name, const char *text,
+                                  long long min_value, long long max_value) {
+  char *end = nullptr;
+  const long long value = std::strtoll(text, &end, 10);
+  // strtoll saturates at LLONG_MIN/LLONG_MAX on overflow; every range used by
+  // parse_args is far narrower, so saturated values are rejected here too.
+  if (end == text || *end != '\0' || value < min_value || value > max_value) {
+    std::fprintf(stderr, "Invalid value for %s: '%s' (expected %lld..%lld)\n",
+                 name, text, min_value, max_value);
+    std::exit(2);
+  }
+  return value;
+}
+
+// Parses the command line into an Args value.
+//
+// Exits with status 2 on an unknown option, a missing value, or a value that
+// is not a number in the accepted range (the previous atoi-based parsing
+// silently turned garbage into 0 and accepted zero/negative sizes). Prints the
+// usage text and exits with status 0 for --help / -h.
+static Args parse_args(int argc, char **argv) {
+  // Bounds of a 32-bit int, which is what the Args fields hold.
+  constexpr long long kIntMax = 2147483647LL;
+  constexpr long long kIntMin = -kIntMax - 1;
+  // 1 TiB cap keeps workspace_mb * 1024 * 1024 far away from size_t overflow.
+  constexpr long long kMaxWorkspaceMb = 1LL << 20;
+
+  Args args;
+  for (int i = 1; i < argc; ++i) {
+    const char *option = argv[i];
+    // Consumes the argument following `option` and validates it as an integer.
+    auto need_int = [&](long long min_value, long long max_value) {
+      if (i + 1 >= argc) {
+        std::fprintf(stderr, "Missing value for %s\n", option);
+        std::exit(2);
+      }
+      return parse_int_option(option, argv[++i], min_value, max_value);
+    };
+    if (!std::strcmp(option, "--matrix-size")) {
+      args.matrix_size = static_cast<int>(need_int(1, kIntMax));
+    } else if (!std::strcmp(option, "--warmup")) {
+      args.warmup = static_cast<int>(need_int(0, kIntMax));
+    } else if (!std::strcmp(option, "--iterations")) {
+      // At least one timed iteration, otherwise TFLOPS is 0 / 0.
+      args.iterations = static_cast<int>(need_int(1, kIntMax));
+    } else if (!std::strcmp(option, "--first-gpu")) {
+      args.first_gpu = static_cast<int>(need_int(0, kIntMax));
+    } else if (!std::strcmp(option, "--gpu-count")) {
+      // Negative values keep their meaning of "all GPUs from --first-gpu".
+      args.gpu_count = static_cast<int>(need_int(kIntMin, kIntMax));
+      if (args.gpu_count == 0) {
+        std::fprintf(stderr, "--gpu-count must not be 0\n");
+        std::exit(2);
+      }
+    } else if (!std::strcmp(option, "--workspace-mb")) {
+      args.workspace_mb = static_cast<size_t>(need_int(0, kMaxWorkspaceMb));
+    } else if (!std::strcmp(option, "--fast-accum")) {
+      // Any non-zero value enables fast accumulation, as before.
+      args.fast_accum = static_cast<int>(need_int(kIntMin, kIntMax));
+    } else if (!std::strcmp(option, "--help") || !std::strcmp(option, "-h")) {
+      std::puts("Usage: cublaslt_fp8_gemm_bench [--matrix-size N] [--warmup N] "
+                "[--iterations N] [--first-gpu N] [--gpu-count N] "
+                "[--workspace-mb N] [--fast-accum 0|1]");
+      std::exit(0);
+    } else {
+      std::fprintf(stderr, "Unknown argument: %s\n", option);
+      std::exit(2);
+    }
+  }
+  return args;
+}
+
+// Benchmarks one FP8 (E4M3 inputs, BF16 output) square GEMM on device `gpu`
+// through cuBLASLt and returns the measured throughput in TFLOPS.
+//
+// Side effects: prints one JSON object line describing the result and the
+// selected algorithm to stdout. Any CUDA or cuBLASLt failure terminates the
+// process via CHECK_CUDA / CHECK_CUBLAS.
+static double run_one_gpu(int gpu, const Args &args) {
+  CHECK_CUDA(cudaSetDevice(gpu));
+
+  // Square problem: all three GEMM dimensions come from --matrix-size.
+  const int64_t m = args.matrix_size;
+  const int64_t n = args.matrix_size;
+  const int64_t k = args.matrix_size;
+  const size_t a_elems = static_cast<size_t>(m) * k;
+  const size_t b_elems = static_cast<size_t>(k) * n;
+  const size_t d_elems = static_cast<size_t>(m) * n;
+
+  __nv_fp8_e4m3 *d_a = nullptr;
+  __nv_fp8_e4m3 *d_b = nullptr;
+  __nv_bfloat16 *d_d = nullptr;
+  void *workspace = nullptr;
+  float *d_scale_a = nullptr;
+  float *d_scale_b = nullptr;
+  const float scale = 1.0f;
+  const size_t workspace_bytes = args.workspace_mb * 1024ULL * 1024ULL;
+
+  CHECK_CUDA(cudaMalloc(&d_a, a_elems * sizeof(__nv_fp8_e4m3)));
+  CHECK_CUDA(cudaMalloc(&d_b, b_elems * sizeof(__nv_fp8_e4m3)));
+  CHECK_CUDA(cudaMalloc(&d_d, d_elems * sizeof(__nv_bfloat16)));
+  CHECK_CUDA(cudaMalloc(&workspace, workspace_bytes));
+  // The A/B scale factors are passed to cuBLASLt as device pointers; both are 1.0.
+  CHECK_CUDA(cudaMalloc(&d_scale_a, sizeof(float)));
+  CHECK_CUDA(cudaMalloc(&d_scale_b, sizeof(float)));
+  CHECK_CUDA(cudaMemcpy(d_scale_a, &scale, sizeof(scale), cudaMemcpyHostToDevice));
+  CHECK_CUDA(cudaMemcpy(d_scale_b, &scale, sizeof(scale), cudaMemcpyHostToDevice));
+
+  // Initialise both inputs with the constant 0.01 and zero the output buffer.
+  const int threads = 256;
+  const int blocks = 4096;
+  fill_fp8<<<blocks, threads>>>(d_a, a_elems, 0.01f);
+  fill_fp8<<<blocks, threads>>>(d_b, b_elems, 0.01f);
+  CHECK_CUDA(cudaMemset(d_d, 0, d_elems * sizeof(__nv_bfloat16)));
+  // Surface kernel launch errors, then wait for initialisation to finish.
+  CHECK_CUDA(cudaGetLastError());
+  CHECK_CUDA(cudaDeviceSynchronize());
+
+  cublasLtHandle_t lt;
+  cublasLtMatmulDesc_t op_desc;
+  cublasLtMatrixLayout_t a_desc, b_desc, d_desc;
+  cublasLtMatmulPreference_t preference;
+  CHECK_CUBLAS(cublasLtCreate(&lt));
+  // FP32 compute type with FP32 alpha/beta scale type.
+  CHECK_CUBLAS(cublasLtMatmulDescCreate(&op_desc, CUBLAS_COMPUTE_32F, CUDA_R_32F));
+
+  // cuBLASLt FP8 kernels require TN format: A is transposed, B is non-transposed.
+  // With square GEMMs this keeps the benchmark FLOP count identical to the PDF
+  // acceptance shape while satisfying the library's FP8 kernel constraints.
+  cublasOperation_t transa = CUBLAS_OP_T;
+  cublasOperation_t transb = CUBLAS_OP_N;
+  CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(
+      op_desc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(transa)));
+  CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(
+      op_desc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transb)));
+  // The attribute value is the device pointer itself, hence &d_scale_a.
+  CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(
+      op_desc, CUBLASLT_MATMUL_DESC_A_SCALE_POINTER, &d_scale_a,
+      sizeof(d_scale_a)));
+  CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(
+      op_desc, CUBLASLT_MATMUL_DESC_B_SCALE_POINTER, &d_scale_b,
+      sizeof(d_scale_b)));
+  // Normalise --fast-accum to 0/1 in the int8_t the attribute is set from.
+  int8_t fast_accum = args.fast_accum ? 1 : 0;
+  CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(
+      op_desc, CUBLASLT_MATMUL_DESC_FAST_ACCUM, &fast_accum,
+      sizeof(fast_accum)));
+
+  // Layout arguments are (type, rows, cols, leading dimension): A is stored as
+  // k x m because it is transposed by the operation, B as k x n, D as m x n.
+  CHECK_CUBLAS(cublasLtMatrixLayoutCreate(&a_desc, CUDA_R_8F_E4M3, k, m, k));
+  CHECK_CUBLAS(cublasLtMatrixLayoutCreate(&b_desc, CUDA_R_8F_E4M3, k, n, k));
+  CHECK_CUBLAS(cublasLtMatrixLayoutCreate(&d_desc, CUDA_R_16BF, m, n, m));
+
+  // Limit algorithm selection to what fits in the allocated workspace.
+  CHECK_CUBLAS(cublasLtMatmulPreferenceCreate(&preference));
+  CHECK_CUBLAS(cublasLtMatmulPreferenceSetAttribute(
+      preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspace_bytes,
+      sizeof(workspace_bytes)));
+
+  // Request a single heuristic candidate; d_desc is used for both C and D.
+  cublasLtMatmulHeuristicResult_t heuristic;
+  int returned = 0;
+  CHECK_CUBLAS(cublasLtMatmulAlgoGetHeuristic(
+      lt, op_desc, a_desc, b_desc, d_desc, d_desc, preference, 1, &heuristic,
+      &returned));
+  if (returned == 0) {
+    std::fprintf(stderr, "No cuBLASLt heuristic returned for GPU %d\n", gpu);
+    std::exit(1);
+  }
+
+  // Helpers that read one configuration attribute of the selected algorithm,
+  // one per storage width, so the choice can be recorded in the report.
+  auto get_algo_attr_i32 = [&](cublasLtMatmulAlgoConfigAttributes_t attr) {
+    int32_t value = -1;
+    size_t written = 0;
+    CHECK_CUBLAS(cublasLtMatmulAlgoConfigGetAttribute(
+        &heuristic.algo, attr, &value, sizeof(value), &written));
+    return static_cast<int>(value);
+  };
+  auto get_algo_attr_u32 = [&](cublasLtMatmulAlgoConfigAttributes_t attr) {
+    uint32_t value = 0;
+    size_t written = 0;
+    CHECK_CUBLAS(cublasLtMatmulAlgoConfigGetAttribute(
+        &heuristic.algo, attr, &value, sizeof(value), &written));
+    return static_cast<int>(value);
+  };
+  auto get_algo_attr_u16 = [&](cublasLtMatmulAlgoConfigAttributes_t attr) {
+    uint16_t value = 0;
+    size_t written = 0;
+    CHECK_CUBLAS(cublasLtMatmulAlgoConfigGetAttribute(
+        &heuristic.algo, attr, &value, sizeof(value), &written));
+    return static_cast<int>(value);
+  };
+  const int algo_id = get_algo_attr_i32(CUBLASLT_ALGO_CONFIG_ID);
+  const int tile_id = get_algo_attr_u32(CUBLASLT_ALGO_CONFIG_TILE_ID);
+  const int splitk = get_algo_attr_i32(CUBLASLT_ALGO_CONFIG_SPLITK_NUM);
+  const int stages = get_algo_attr_u32(CUBLASLT_ALGO_CONFIG_STAGES_ID);
+  const int inner_shape = get_algo_attr_u16(CUBLASLT_ALGO_CONFIG_INNER_SHAPE_ID);
+  const int cluster_shape = get_algo_attr_u16(CUBLASLT_ALGO_CONFIG_CLUSTER_SHAPE_ID);
+
+  // One GEMM launch with alpha = 1 and beta = 0; d_d is passed as both the C
+  // input and the D output, and the work is enqueued on stream 0.
+  const float alpha = 1.0f;
+  const float beta = 0.0f;
+  auto matmul = [&]() {
+    CHECK_CUBLAS(cublasLtMatmul(lt, op_desc, &alpha, d_a, a_desc, d_b, b_desc,
+                                &beta, d_d, d_desc, d_d, d_desc,
+                                &heuristic.algo, workspace, workspace_bytes, 0));
+  };
+
+  // Untimed warm-up launches, fully drained before timing starts.
+  for (int i = 0; i < args.warmup; ++i) {
+    matmul();
+  }
+  CHECK_CUDA(cudaDeviceSynchronize());
+
+  // Time the whole batch of launches with a pair of CUDA events.
+  cudaEvent_t start, stop;
+  CHECK_CUDA(cudaEventCreate(&start));
+  CHECK_CUDA(cudaEventCreate(&stop));
+  CHECK_CUDA(cudaEventRecord(start));
+  for (int i = 0; i < args.iterations; ++i) {
+    matmul();
+  }
+  CHECK_CUDA(cudaEventRecord(stop));
+  CHECK_CUDA(cudaEventSynchronize(stop));
+  float elapsed_ms = 0.0f;
+  CHECK_CUDA(cudaEventElapsedTime(&elapsed_ms, start, stop));
+  // 2*m*n*k floating-point operations per GEMM, times the timed iterations.
+  const double flops =
+      2.0 * static_cast<double>(m) * static_cast<double>(n) *
+      static_cast<double>(k) * static_cast<double>(args.iterations);
+  const double tflops = flops / (static_cast<double>(elapsed_ms) / 1000.0) / 1e12;
+  // Emit one element of the "per_gpu" JSON array. The trailing comma is left
+  // out for the last GPU of the range, which relies on args.gpu_count already
+  // holding the resolved (non-negative) count.
+  std::printf(
+      "    {\"index\": %d, \"fp8_tflops\": %.1f, \"algo_id\": %d, "
+      "\"tile_id\": %d, \"splitk\": %d, \"stages_id\": %d, "
+      "\"inner_shape_id\": %d, \"cluster_shape_id\": %d}%s\n",
+      gpu, tflops, algo_id, tile_id, splitk, stages, inner_shape, cluster_shape,
+      (gpu + 1 == args.first_gpu + args.gpu_count) ? "" : ",");
+  std::fflush(stdout);
+
+  // Release every event, descriptor, handle and device buffer created above.
+  CHECK_CUDA(cudaEventDestroy(start));
+  CHECK_CUDA(cudaEventDestroy(stop));
+  CHECK_CUBLAS(cublasLtMatmulPreferenceDestroy(preference));
+  CHECK_CUBLAS(cublasLtMatrixLayoutDestroy(a_desc));
+  CHECK_CUBLAS(cublasLtMatrixLayoutDestroy(b_desc));
+  CHECK_CUBLAS(cublasLtMatrixLayoutDestroy(d_desc));
+  CHECK_CUBLAS(cublasLtMatmulDescDestroy(op_desc));
+  CHECK_CUBLAS(cublasLtDestroy(lt));
+  CHECK_CUDA(cudaFree(d_a));
+  CHECK_CUDA(cudaFree(d_b));
+  CHECK_CUDA(cudaFree(d_d));
+  CHECK_CUDA(cudaFree(workspace));
+  CHECK_CUDA(cudaFree(d_scale_a));
+  CHECK_CUDA(cudaFree(d_scale_b));
+  CHECK_CUDA(cudaDeviceSynchronize());
+
+  return tflops;
+}
+
+// Runs the FP8 GEMM benchmark on the selected GPU range and prints a JSON
+// report (per-GPU results plus mean/min/max/spread) to stdout.
+//
+// Exit status: 0 when the mean throughput reaches the acceptance threshold,
+// 1 when it does not, 2 when the requested GPU range is invalid.
+int main(int argc, char **argv) {
+  // Acceptance threshold for the mean FP8 throughput, in TFLOPS.
+  constexpr double kPassThresholdTflops = 1400.0;
+
+  Args args = parse_args(argc, argv);
+  int device_count = 0;
+  CHECK_CUDA(cudaGetDeviceCount(&device_count));
+  if (args.gpu_count < 0) {
+    // Negative count means "every device from first_gpu onwards".
+    args.gpu_count = device_count - args.first_gpu;
+  }
+  // An empty range must be rejected as well: with no results the mean would be
+  // 0/0 and the min/max iterators would be dereferenced at end(). The upper
+  // bound is written as a subtraction so it cannot overflow int.
+  if (args.first_gpu < 0 || args.gpu_count <= 0 ||
+      args.gpu_count > device_count - args.first_gpu) {
+    std::fprintf(stderr, "Invalid GPU range first=%d count=%d device_count=%d\n",
+                 args.first_gpu, args.gpu_count, device_count);
+    return 2;
+  }
+
+  std::vector<double> values;
+  values.reserve(static_cast<size_t>(args.gpu_count));
+  std::printf("{\n");
+  std::printf("  \"source\": \"cuBLASLt\",\n");
+  std::printf("  \"dtype\": \"fp8_e4m3_inputs_bf16_output_fp32_accum\",\n");
+  std::printf("  \"matrix_size\": %d,\n", args.matrix_size);
+  std::printf("  \"warmup\": %d,\n", args.warmup);
+  std::printf("  \"iterations\": %d,\n", args.iterations);
+  std::printf("  \"fast_accum\": %d,\n", args.fast_accum ? 1 : 0);
+  std::printf("  \"per_gpu\": [\n");
+  // run_one_gpu prints each array element itself as soon as it is measured.
+  for (int i = 0; i < args.gpu_count; ++i) {
+    const int gpu = args.first_gpu + i;
+    values.push_back(run_one_gpu(gpu, args));
+  }
+  // `values` is non-empty here because gpu_count > 0 was verified above.
+  const double mean =
+      std::accumulate(values.begin(), values.end(), 0.0) / values.size();
+  const auto minmax = std::minmax_element(values.begin(), values.end());
+  // Spread is (max - min) relative to the mean, in percent.
+  const double spread = ((*minmax.second - *minmax.first) / mean) * 100.0;
+  std::printf("  ],\n");
+  std::printf("  \"mean_tflops\": %.1f,\n", mean);
+  std::printf("  \"min_tflops\": %.1f,\n", *minmax.first);
+  std::printf("  \"max_tflops\": %.1f,\n", *minmax.second);
+  std::printf("  \"spread_pct\": %.2f\n", spread);
+  std::printf("}\n");
+  return mean >= kPassThresholdTflops ? 0 : 1;
+}
diff --git a/scripts/pytorch_fp8_path_bench.py b/scripts/pytorch_fp8_path_bench.py
new file mode 100755
index 0000000..ab35af8
--- /dev/null
+++ b/scripts/pytorch_fp8_path_bench.py
@@ -0,0 +1,277 @@
+#!/usr/bin/env python3
+"""Compare FP8 GEMM paths used for H100/H200 acceptance debugging.
+
+Paths:
+  A. torch._scaled_mm eager, default accumulation
+  B. torch._scaled_mm eager, use_fast_accum=True
+  C. CUDA Graph replay of torch._scaled_mm(out=..., use_fast_accum=True)
+  D. Transformer Engine Linear under fp8_autocast, when installed
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import statistics
+import sys
+import time
+from typing import Any, Callable
+
+import torch
+
+
+def tflops_from_ms(matrix_size: int, iterations: int, elapsed_ms: float) -> float:
+    """Convert the total time of `iterations` square GEMMs (2 * N^3 FLOPs each) into TFLOPS."""
+    return 2.0 * matrix_size * matrix_size * matrix_size * iterations / (elapsed_ms / 1000.0) / 1e12
+
+
+def cuda_event_bench(
+    name: str,
+    matrix_size: int,
+    iterations: int,
+    warmup: int,
+    func: Callable[[int], Any],
+) -> dict[str, Any]:
+    """Call `func(i)` `warmup` times untimed, then time `iterations` calls and return one "ok" result row."""
+    for i in range(warmup):
+        func(i)
+    torch.cuda.synchronize()  # drain the warmup work before the timed region starts
+    start = torch.cuda.Event(enable_timing=True)
+    end = torch.cuda.Event(enable_timing=True)
+    wall_start = time.perf_counter()
+    start.record()
+    for i in range(iterations):
+        func(i)
+    end.record()
+    torch.cuda.synchronize()  # wait for the queued work and both events before reading the timings
+    wall_elapsed = time.perf_counter() - wall_start  # host-side seconds, spans the launches and the sync above
+    elapsed_ms = start.elapsed_time(end)  # time between the two recorded events
+    return {
+        "name": name,
+        "status": "ok",
+        "matrix_size": matrix_size,
+        "iterations": iterations,
+        "warmup": warmup,
+        "event_ms_total": round(elapsed_ms, 3),
+        "event_us_per_iter": round(elapsed_ms * 1000.0 / iterations, 3),  # divides by iterations: must be non-zero
+        "wall_ms_total": round(wall_elapsed * 1000.0, 3),
+        "tflops": round(tflops_from_ms(matrix_size, iterations, elapsed_ms), 1),  # from event time, not wall time
+    }
+
+
+def make_fp8_inputs(matrix_size: int, pools: int, device: str) -> tuple[list[torch.Tensor], list[torch.Tensor]]:
+    """Build `pools` random square float8_e4m3fn operand pairs on `device`; returns (a_list, b_list)."""
+
+    def _rand_fp8() -> torch.Tensor:
+        fp32 = torch.randn(matrix_size, matrix_size, device=device, dtype=torch.float32)
+        return fp32.to(torch.float8_e4m3fn)
+
+    lhs = [_rand_fp8() for _ in range(pools)]  # every A operand is drawn before any B operand
+    rhs = [_rand_fp8() for _ in range(pools)]
+    torch.cuda.synchronize()
+    return lhs, rhs
+
+
+def bench_scaled_mm(args: argparse.Namespace) -> list[dict[str, Any]]:
+    """Benchmark paths A and B (eager torch._scaled_mm) and C (CUDA Graph replay); returns one row per path."""
+    device = f"cuda:{args.gpu_index}"
+    torch.cuda.set_device(args.gpu_index)
+    scale_a = torch.tensor(1.0, device=device)
+    scale_b = torch.tensor(1.0, device=device)
+    pools_a, pools_b = make_fp8_inputs(args.matrix_size, args.pools, device)
+    results: list[dict[str, Any]] = []
+
+    def eager_default(i: int) -> torch.Tensor:
+        idx = i % args.pools  # cycle through the operand pools from one iteration to the next
+        return torch._scaled_mm(
+            pools_a[idx],
+            pools_b[idx].T,  # NOTE(review): transposed view, presumably the layout _scaled_mm expects - confirm
+            scale_a=scale_a,
+            scale_b=scale_b,
+            out_dtype=torch.bfloat16,
+        )
+
+    def eager_fast(i: int) -> torch.Tensor:
+        idx = i % args.pools  # same pool rotation as eager_default; only use_fast_accum differs
+        return torch._scaled_mm(
+            pools_a[idx],
+            pools_b[idx].T,
+            scale_a=scale_a,
+            scale_b=scale_b,
+            out_dtype=torch.bfloat16,
+            use_fast_accum=True,
+        )
+
+    results.append(
+        cuda_event_bench(
+            "A_eager_scaled_mm_default",
+            args.matrix_size,
+            args.iterations,
+            args.warmup,
+            eager_default,
+        )
+    )
+    results.append(
+        cuda_event_bench(
+            "B_eager_scaled_mm_fast_accum",
+            args.matrix_size,
+            args.iterations,
+            args.warmup,
+            eager_fast,
+        )
+    )
+
+    graph_out = torch.empty(
+        (args.matrix_size, args.matrix_size),
+        device=device,
+        dtype=torch.bfloat16,
+    )
+    static_a = pools_a[0]
+    static_b_t = pools_b[0].T
+    # Path C: capture one _scaled_mm call with fixed operands (pool 0) into a CUDA graph and time its replay.
+    try:
+        side_stream = torch.cuda.Stream()  # the pre-capture warmup below runs on this side stream
+        side_stream.wait_stream(torch.cuda.current_stream())
+        with torch.cuda.stream(side_stream):
+            for _ in range(max(3, args.warmup // 2)):
+                torch._scaled_mm(
+                    static_a,
+                    static_b_t,
+                    scale_a=scale_a,
+                    scale_b=scale_b,
+                    out_dtype=torch.bfloat16,
+                    use_fast_accum=True,
+                    out=graph_out,
+                )
+        torch.cuda.current_stream().wait_stream(side_stream)
+        torch.cuda.synchronize()
+
+        graph = torch.cuda.CUDAGraph()
+        with torch.cuda.graph(graph):
+            torch._scaled_mm(
+                static_a,
+                static_b_t,
+                scale_a=scale_a,
+                scale_b=scale_b,
+                out_dtype=torch.bfloat16,
+                use_fast_accum=True,
+                out=graph_out,
+            )
+
+        def graph_replay(_: int) -> None:
+            graph.replay()
+
+        results.append(
+            cuda_event_bench(
+                "C_cuda_graph_scaled_mm_fast_accum",
+                args.matrix_size,
+                args.iterations,
+                3,  # fixed warmup of 3 replays; args.warmup is not used for this path
+                graph_replay,
+            )
+        )
+    except Exception as exc:  # noqa: BLE001
+        results.append(
+            {
+                "name": "C_cuda_graph_scaled_mm_fast_accum",
+                "status": "unavailable",  # covers any failure in the warmup, capture or replay above
+                "reason": f"{type(exc).__name__}: {exc}",
+            }
+        )
+    return results
+
+
+def bench_transformer_engine(args: argparse.Namespace) -> dict[str, Any]:
+    """Benchmark path D (Transformer Engine Linear under fp8_autocast); failures come back as a status row."""
+    try:
+        import transformer_engine.pytorch as te  # type: ignore[import-not-found]
+        from transformer_engine.common.recipe import DelayedScaling, Format  # type: ignore[import-not-found]
+    except Exception as exc:  # noqa: BLE001
+        return {
+            "name": "D_transformer_engine_fp8_linear",
+            "status": "unavailable",
+            "reason": f"{type(exc).__name__}: {exc}",
+        }
+
+    def run(_: int) -> torch.Tensor:
+        with te.fp8_autocast(enabled=True, fp8_recipe=recipe):
+            return layer(x)
+
+    try:  # setup sits inside the try so an error while building the layer cannot discard earlier rows
+        device = f"cuda:{args.gpu_index}"
+        x = torch.randn(args.matrix_size, args.matrix_size, device=device, dtype=torch.bfloat16)
+        layer = te.Linear(
+            args.matrix_size,
+            args.matrix_size,
+            bias=False,
+            params_dtype=torch.bfloat16,
+            device=device,
+        )
+        recipe = DelayedScaling(fp8_format=Format.HYBRID)
+        result = cuda_event_bench(
+            "D_transformer_engine_fp8_linear",
+            args.matrix_size,
+            args.iterations,
+            args.warmup,
+            run,
+        )
+    except Exception as exc:  # noqa: BLE001
+        return {
+            "name": "D_transformer_engine_fp8_linear",
+            "status": "error",
+            "reason": f"{type(exc).__name__}: {exc}",
+        }
+    result["note"] = "Transformer Engine Linear forward under fp8_autocast; includes TE module/cast overhead."
+    return result
+
+
+def main() -> int:
+    """Parse CLI options, run FP8 paths A-D on one GPU and print a JSON report; returns the exit code."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--matrix-size", type=int, default=8192)
+    parser.add_argument("--warmup", type=int, default=20)
+    parser.add_argument("--iterations", type=int, default=100)
+    parser.add_argument("--gpu-index", type=int, default=0)
+    parser.add_argument("--pools", type=int, default=4)
+    args = parser.parse_args()
+    if min(args.matrix_size, args.iterations, args.pools) < 1:  # 0 iterations/pools would divide or modulo by zero
+        parser.error("--matrix-size, --iterations and --pools must all be >= 1")
+    if not torch.cuda.is_available():
+        print(json.dumps({"error": "cuda unavailable"}, indent=2))
+        return 1
+    if not hasattr(torch, "_scaled_mm") or not hasattr(torch, "float8_e4m3fn"):
+        print(json.dumps({"error": "torch FP8 _scaled_mm unavailable"}, indent=2))
+        return 1
+
+    torch.cuda.set_device(args.gpu_index)
+    payload = {
+        "source": "pytorch_fp8_path_bench",
+        "torch": torch.__version__,
+        "cuda": torch.version.cuda,
+        "gpu_index": args.gpu_index,
+        "gpu_name": torch.cuda.get_device_properties(args.gpu_index).name,
+        "matrix_size": args.matrix_size,
+        "warmup": args.warmup,
+        "iterations": args.iterations,
+        "results": [],
+    }
+    try:
+        payload["results"].extend(bench_scaled_mm(args))
+        payload["results"].append(bench_transformer_engine(args))
+    except torch.cuda.OutOfMemoryError as exc:
+        payload["error"] = f"CUDA OOM: {exc}"
+        print(json.dumps(payload, indent=2))
+        return 1
+    ok_values = [r["tflops"] for r in payload["results"] if r.get("status") == "ok"]  # skip unavailable/error rows
+    if ok_values:
+        payload["summary"] = {
+            "max_tflops": round(max(ok_values), 1),
+            "min_tflops": round(min(ok_values), 1),
+            "mean_tflops": round(statistics.mean(ok_values), 1),
+        }
+    print(json.dumps(payload, indent=2))
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/scripts/run_cublaslt_fp8_gemm.sh b/scripts/run_cublaslt_fp8_gemm.sh
new file mode 100755
index 0000000..49f4787
--- /dev/null
+++ b/scripts/run_cublaslt_fp8_gemm.sh
@@ -0,0 +1,45 @@
+#!/usr/bin/env bash
+set -uo pipefail
+# Build the cuBLASLt FP8 GEMM benchmark with nvcc, run it, and tee its JSON output into $OUT_DIR.
+SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
+PROJECT_DIR="$(cd -- "$SCRIPT_DIR/.." >/dev/null 2>&1 && pwd)"
+
+CUDA_HOME="${CUDA_HOME:-/usr/local/cuda}"
+NVCC="${NVCC:-$CUDA_HOME/bin/nvcc}"
+OUT_DIR="${OUT_DIR:-$PROJECT_DIR/reports}"
+MATRIX_SIZE="${MATRIX_SIZE:-8192}"
+WARMUP="${WARMUP:-20}"
+ITERATIONS="${ITERATIONS:-200}"
+GPU_COUNT="${GPU_COUNT:-8}"
+FIRST_GPU="${FIRST_GPU:-0}"
+WORKSPACE_MB="${WORKSPACE_MB:-256}"
+
+if [[ ! -x "$NVCC" ]]; then
+  echo "nvcc not found: $NVCC" >&2
+  exit 1
+fi
+
+mkdir -p "$OUT_DIR" "$PROJECT_DIR/build"
+HOST="$(hostname 2>/dev/null || echo unknown)"
+TS="$(date +%Y%m%d_%H%M%S)"
+BIN="$PROJECT_DIR/build/cublaslt_fp8_gemm_bench"
+REPORT="$OUT_DIR/cublaslt_fp8_gemm_${HOST}_${TS}.json"
+# errexit is not enabled here, so a failed build is caught explicitly; otherwise a stale or missing binary would run.
+"$NVCC" -O3 -std=c++17 -arch=sm_90 \
+  "$PROJECT_DIR/scripts/cublaslt_fp8_gemm_bench.cu" \
+  -lcublasLt -lcublas -o "$BIN" || { echo "nvcc failed to build $BIN" >&2; exit 1; }
+
+set +e
+"$BIN" \
+  --matrix-size "$MATRIX_SIZE" \
+  --warmup "$WARMUP" \
+  --iterations "$ITERATIONS" \
+  --first-gpu "$FIRST_GPU" \
+  --gpu-count "$GPU_COUNT" \
+  --workspace-mb "$WORKSPACE_MB" \
+  | tee "$REPORT"
+status=${PIPESTATUS[0]}  # exit status of the benchmark itself, not of tee
+set -e
+
+echo "Report written to: $REPORT"
+exit "$status"
diff --git a/scripts/run_fp8_path_comparison.sh b/scripts/run_fp8_path_comparison.sh
new file mode 100755
index 0000000..46fd0e2
--- /dev/null
+++ b/scripts/run_fp8_path_comparison.sh
@@ -0,0 +1,93 @@
+#!/usr/bin/env bash
+set -euo pipefail
+# Run the PyTorch FP8 path benchmark and the direct cuBLASLt benchmark on one GPU, then merge both JSON reports.
+SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
+PROJECT_DIR="$(cd -- "$SCRIPT_DIR/.." >/dev/null 2>&1 && pwd)"
+
+PYTHON="${PYTHON:-/root/gpu-test-venv/bin/python}"
+CUDA_HOME="${CUDA_HOME:-/usr/local/cuda-12.4}"
+NVCC="${NVCC:-$CUDA_HOME/bin/nvcc}"
+OUT_DIR="${OUT_DIR:-$PROJECT_DIR/reports}"
+MATRIX_SIZE="${MATRIX_SIZE:-8192}"
+WARMUP="${WARMUP:-20}"
+ITERATIONS="${ITERATIONS:-100}"
+GPU_INDEX="${GPU_INDEX:-0}"
+WORKSPACE_MB="${WORKSPACE_MB:-256}"
+VENV_SITE_PACKAGES="$("$PYTHON" - <<'PY'
+import site
+print(site.getsitepackages()[0])
+PY
+)"
+export LD_LIBRARY_PATH="$VENV_SITE_PACKAGES/nvidia/cudnn/lib:$VENV_SITE_PACKAGES/nvidia/nccl/lib:${LD_LIBRARY_PATH:-}"
+
+mkdir -p "$PROJECT_DIR/build" "$OUT_DIR"
+
+HOST="$(hostname 2>/dev/null || echo unknown)"
+TS="$(date +%Y%m%d_%H%M%S)"
+PY_REPORT="$OUT_DIR/fp8_paths_pytorch_${HOST}_${TS}.json"
+CUBLAS_REPORT="$OUT_DIR/fp8_paths_cublaslt_${HOST}_${TS}.json"
+COMBINED_REPORT="$OUT_DIR/fp8_paths_combined_${HOST}_${TS}.json"
+
+"$PYTHON" "$PROJECT_DIR/scripts/pytorch_fp8_path_bench.py" \
+  --matrix-size "$MATRIX_SIZE" \
+  --warmup "$WARMUP" \
+  --iterations "$ITERATIONS" \
+  --gpu-index "$GPU_INDEX" | tee "$PY_REPORT"
+
+"$NVCC" -O3 -std=c++17 -arch=sm_90 \
+  "$PROJECT_DIR/scripts/cublaslt_fp8_gemm_bench.cu" \
+  -lcublasLt -lcublas -o "$PROJECT_DIR/build/cublaslt_fp8_gemm_bench"
+# The benchmark exits 1 when its mean misses its own TFLOPS bar; with -e and pipefail that must not abort the merge.
+"$PROJECT_DIR/build/cublaslt_fp8_gemm_bench" \
+  --matrix-size "$MATRIX_SIZE" \
+  --warmup "$WARMUP" \
+  --iterations "$ITERATIONS" \
+  --first-gpu "$GPU_INDEX" \
+  --gpu-count 1 \
+  --workspace-mb "$WORKSPACE_MB" \
+  --fast-accum 1 | tee "$CUBLAS_REPORT" || echo "WARN: cuBLASLt benchmark exited non-zero; continuing with the comparison" >&2
+
+"$PYTHON" - "$PY_REPORT" "$CUBLAS_REPORT" "$COMBINED_REPORT" "$HOST" <<'PY'
+import json
+import pathlib
+import sys
+
+py_report = pathlib.Path(sys.argv[1])
+cublas_report = pathlib.Path(sys.argv[2])
+combined_report = pathlib.Path(sys.argv[3])
+
+with py_report.open() as f:
+    py_payload = json.load(f)
+with cublas_report.open() as f:
+    cublas_payload = json.load(f)
+
+combined = {
+    "source": "fp8_path_comparison",
+    "host": cublas_payload.get("host") or sys.argv[4],
+    "matrix_size": py_payload.get("matrix_size"),
+    "gpu_index": py_payload.get("gpu_index"),
+    "pytorch": py_payload,
+    "cublaslt": cublas_payload,
+    "results": [],
+}
+combined["results"].extend(py_payload.get("results", []))
+per_gpu = cublas_payload.get("per_gpu", [])
+if per_gpu:
+    row = dict(per_gpu[0])
+    row.update({
+        "name": "E_direct_cublaslt_fast_accum",
+        "status": "ok",
+        "tflops": row.pop("fp8_tflops"),
+        "matrix_size": cublas_payload.get("matrix_size"),
+        "iterations": cublas_payload.get("iterations"),
+        "warmup": cublas_payload.get("warmup"),
+        "fast_accum": cublas_payload.get("fast_accum"),
+        "note": "Direct cuBLASLt FP8 GEMM, bypasses PyTorch eager.",
+    })
+    combined["results"].append(row)
+
+combined_report.write_text(json.dumps(combined, indent=2), encoding="utf-8")
+print(f"Combined report written to: {combined_report}")
+PY
+
+echo "$COMBINED_REPORT"
-- 
2.47.2


From 7ec2da18bc66f5199b97863ea8e4dea01dd801f0 Mon Sep 17 00:00:00 2001
From: cs <shi.chen@robotics.cc>
Date: Tue, 26 May 2026 00:15:48 +0800
Subject: [PATCH 39/41] Clean report whitespace

---
 docs/multinode_nccl_concepts.md                | 1 -
 reports_fp8_path_comparison_20260525.md        | 7 +++----
 reports_gpu_Test_formal_20260524.md            | 1 -
 reports_gpu_Test_pdf.css                       | 1 -
 reports_test_all_latest_summary_cn_20260523.md | 2 +-
 5 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/docs/multinode_nccl_concepts.md b/docs/multinode_nccl_concepts.md
index 1c6039d..52d9b87 100644
--- a/docs/multinode_nccl_concepts.md
+++ b/docs/multinode_nccl_concepts.md
@@ -359,4 +359,3 @@ flowchart TD
 ```
 
 因此，多机多卡测试不是一个命令，而是一条验证链路。
-
diff --git a/reports_fp8_path_comparison_20260525.md b/reports_fp8_path_comparison_20260525.md
index c245b15..6c5d9cf 100644
--- a/reports_fp8_path_comparison_20260525.md
+++ b/reports_fp8_path_comparison_20260525.md
@@ -1,8 +1,8 @@
 # FP8 GEMM 路径对比测试报告
 
-测试日期：2026-05-25  
-测试节点：aikubeworker0012、aikubeworker0016  
-测试 GPU：NVIDIA H100 80GB HBM3  
+测试日期：2026-05-25
+测试节点：aikubeworker0012、aikubeworker0016
+测试 GPU：NVIDIA H100 80GB HBM3
 测试目标：对比同一 FP8 GEMM 规模下 PyTorch eager、CUDA Graph、Transformer Engine 和 direct cuBLASLt 的性能差异。
 
 ## 一、测试结论
@@ -166,4 +166,3 @@ E 路径 cuBLASLt 算法信息：
 | `/Users/d-robotics/lab/test_gpu_scripts/reports_fp8_paths_combined_aikubeworker0012_20260525_045408.json` | aikubeworker0012 A-E 原始结果 |
 | `/Users/d-robotics/lab/test_gpu_scripts/reports_fp8_paths_combined_aikubeworker0016_20260525_050048.json` | aikubeworker0016 A-E 原始结果 |
 | `/Users/d-robotics/lab/test_gpu_scripts/reports_fp8_path_comparison_20260525.md` | 本中文汇总报告 |
-
diff --git a/reports_gpu_Test_formal_20260524.md b/reports_gpu_Test_formal_20260524.md
index 65969b2..49e2695 100644
--- a/reports_gpu_Test_formal_20260524.md
+++ b/reports_gpu_Test_formal_20260524.md
@@ -120,4 +120,3 @@ NCCL 的 `busbw` 是 collective 通信的逻辑折算带宽，不等同于单条
 3. 单机 8 卡 NCCL 通信在两台节点上结果接近，未观察到明显节点间异常差异。
 4. 多机 2x8 NCCL 正确性通过，跨节点通信功能正常。
 5. 当前多机通信结果应按 4x400Gbps IB rail 环境解释；若后续需要对齐 8x400Gbps 环境，应先确认 rail 数量、NCCL net plugin / SHARP、交换网络策略等配置一致。
-
diff --git a/reports_gpu_Test_pdf.css b/reports_gpu_Test_pdf.css
index 8ef6d39..9a44015 100644
--- a/reports_gpu_Test_pdf.css
+++ b/reports_gpu_Test_pdf.css
@@ -99,4 +99,3 @@ ol {
 li {
   margin: 3px 0;
 }
-
diff --git a/reports_test_all_latest_summary_cn_20260523.md b/reports_test_all_latest_summary_cn_20260523.md
index 9ef9449..87f4eab 100644
--- a/reports_test_all_latest_summary_cn_20260523.md
+++ b/reports_test_all_latest_summary_cn_20260523.md
@@ -1,6 +1,6 @@
 # H100 单节点 test all 中文汇总
 
-生成时间：2026-05-23  
+生成时间：2026-05-23
 测试范围：`aikubeworker0012`、`aikubeworker0016` 单节点 `python gpu_tester.py --test all --report --format md`
 
 原始报告：
-- 
2.47.2


From 1c3c811254094720377473cace3edffcb61fd0bc Mon Sep 17 00:00:00 2001
From: cs <shi.chen@robotics.cc>
Date: Tue, 26 May 2026 00:44:39 +0800
Subject: [PATCH 40/41] Remove generated reports from PR

---
 .gitignore                                    |   7 +
 README.md                                     | 229 +----
 docs/h100_test_all_metrics_guide_cn.md        | 255 -----
 docs/multinode_nccl_concepts.md               | 361 -------
 docs/multinode_nccl_deep_diagnose_runbook.md  | 219 -----
 reports_all_aikubeworker0016.json             | 921 ------------------
 reports_all_aikubeworker0016.md               | 157 ---
 reports_cublaslt_fp8_crosscheck_20260524.md   |  87 --
 ...gemm_aikubeworker0012_20260524_071148.json |  21 -
 ...gemm_aikubeworker0016_20260524_071200.json |  21 -
 ...cgm_r3_aikubeworker0012_20260522_200338.md |  65 --
 ...cgm_r3_aikubeworker0016_20260522_200538.md |  65 --
 reports_fp8_path_comparison_20260525.md       | 168 ----
 ...ined_aikubeworker0012_20260525_042347.json | 142 ---
 ...ined_aikubeworker0012_20260525_045408.json | 156 ---
 ...ined_aikubeworker0016_20260525_042402.json | 142 ---
 ...ined_aikubeworker0016_20260525_050048.json | 156 ---
 reports_gpu_Test_combined_20260524.md         | 152 ---
 reports_gpu_Test_formal_20260524.md           | 122 ---
 reports_gpu_Test_pdf.css                      | 101 --
 ...0_acceptance_closure_checklist_20260523.md | 105 --
 ...h100_acceptance_current_status_20260523.md | 164 ----
 ...0_acceptance_delivery_manifest_20260523.md | 152 ---
 ...rts_h100_acceptance_pr_summary_20260523.md | 144 ---
 ...rk_hardware_escalation_request_20260523.md | 193 ----
 reports_multinode_nccl_16g_2x8_nccl227.md     |  66 --
 ...rts_multinode_nccl_16g_2x8_nccl227_auto.md |  66 --
 ...de_nccl_all_collectives_20260523_120144.md |  98 --
 ...llectives_20260523_120144_artifacts.sha256 |  24 -
 ..._collectives_20260523_120144_bundle.sha256 |   2 -
 ...ives_artifacts_manifest_20260523_120144.md |  46 -
 ...inode_nccl_all_collectives_run_20260523.md |  49 -
 ...multinode_nccl_alltoall_tuning_20260523.md | 160 ---
 ..._nccl_artifact_signal_analysis_20260523.md | 141 ---
 ...s_multinode_nccl_counter_probe_20260523.md | 209 ----
 ...ltinode_nccl_deep_diagnose_run_20260523.md | 125 ---
 reports_multinode_nccl_diagnosis_20260523.md  | 500 ----------
 ..._multinode_nccl_diagnostic_2x8_debug_v2.md |  66 --
 ...ultinode_nccl_diagnostic_2x8_nccl227_v2.md |  66 --
 ...ts_multinode_nccl_diagnostic_2x8_sshfix.md |  66 --
 ...multinode_nccl_environment_gap_20260523.md | 168 ----
 ...ts_multinode_nccl_handoff_plan_20260523.md | 213 ----
 ...ts_multinode_nccl_latest_index_20260523.md | 265 -----
 ...ltinode_nccl_pdf_matrix_20260523_112247.md |  75 --
 ...ltinode_nccl_pdf_matrix_20260523_113803.md |  75 --
 ...trix_artifacts_manifest_20260523_113803.md |  33 -
 reports_multinode_nccl_pdf_matrix_nccl227.md  |  84 --
 ..._multinode_nccl_pdf_matrix_run_20260523.md |  67 --
 ...node_nccl_smoke_256m_aikubeworker0012.json | 439 ---------
 ...tinode_nccl_smoke_256m_aikubeworker0012.md |  50 -
 reports_multinode_nccl_sweep_2x8_nccl227.md   |  66 --
 reports_nvbandwidth_aikubeworker0012.json     |  70 --
 reports_nvbandwidth_aikubeworker0012.md       |  38 -
 reports_nvbandwidth_aikubeworker0016.json     |  70 --
 reports_nvbandwidth_aikubeworker0016.md       |  38 -
 reports_rdma_aikubeworker0012.json            | 157 ---
 reports_rdma_aikubeworker0016.json            | 157 ---
 ...ounter_aikubeworker0012_20260522_194808.md |  62 --
 ...ounter_aikubeworker0016_20260522_194828.md |  62 --
 reports_rdma_cross_node_mlx5_0_20260523.md    |  50 -
 reports_rdma_single_node_summary.md           |  73 --
 reports_single_gpu_aikubeworker0012.json      | 292 ------
 reports_single_gpu_aikubeworker0012.md        |  54 -
 reports_single_gpu_aikubeworker0016.json      | 292 ------
 reports_single_gpu_aikubeworker0016.md        |  54 -
 ...stress_smoke_reasons_aikubeworker0012.json | 165 ----
 ...s_stress_smoke_reasons_aikubeworker0012.md |  29 -
 ...stress_smoke_reasons_aikubeworker0016.json | 165 ----
 ...s_stress_smoke_reasons_aikubeworker0016.md |  29 -
 ...latest_aikubeworker0012_20260522_203246.md | 322 ------
 ...latest_aikubeworker0016_20260522_203447.md | 322 ------
 ...rts_test_all_latest_summary_cn_20260523.md | 101 --
 ...ll_pdf_aikubeworker0012_20260522_182656.md | 259 -----
 ...ll_pdf_aikubeworker0016_20260522_182856.md | 259 -----
 ...warmup_aikubeworker0012_20260522_194528.md |  43 -
 ...warmup_aikubeworker0016_20260522_194609.md |  43 -
 76 files changed, 61 insertions(+), 10669 deletions(-)
 delete mode 100644 docs/h100_test_all_metrics_guide_cn.md
 delete mode 100644 docs/multinode_nccl_concepts.md
 delete mode 100644 docs/multinode_nccl_deep_diagnose_runbook.md
 delete mode 100644 reports_all_aikubeworker0016.json
 delete mode 100644 reports_all_aikubeworker0016.md
 delete mode 100644 reports_cublaslt_fp8_crosscheck_20260524.md
 delete mode 100644 reports_cublaslt_fp8_gemm_aikubeworker0012_20260524_071148.json
 delete mode 100644 reports_cublaslt_fp8_gemm_aikubeworker0016_20260524_071200.json
 delete mode 100644 reports_dcgm_r3_aikubeworker0012_20260522_200338.md
 delete mode 100644 reports_dcgm_r3_aikubeworker0016_20260522_200538.md
 delete mode 100644 reports_fp8_path_comparison_20260525.md
 delete mode 100644 reports_fp8_paths_combined_aikubeworker0012_20260525_042347.json
 delete mode 100644 reports_fp8_paths_combined_aikubeworker0012_20260525_045408.json
 delete mode 100644 reports_fp8_paths_combined_aikubeworker0016_20260525_042402.json
 delete mode 100644 reports_fp8_paths_combined_aikubeworker0016_20260525_050048.json
 delete mode 100644 reports_gpu_Test_combined_20260524.md
 delete mode 100644 reports_gpu_Test_formal_20260524.md
 delete mode 100644 reports_gpu_Test_pdf.css
 delete mode 100644 reports_h100_acceptance_closure_checklist_20260523.md
 delete mode 100644 reports_h100_acceptance_current_status_20260523.md
 delete mode 100644 reports_h100_acceptance_delivery_manifest_20260523.md
 delete mode 100644 reports_h100_acceptance_pr_summary_20260523.md
 delete mode 100644 reports_h100_network_hardware_escalation_request_20260523.md
 delete mode 100644 reports_multinode_nccl_16g_2x8_nccl227.md
 delete mode 100644 reports_multinode_nccl_16g_2x8_nccl227_auto.md
 delete mode 100644 reports_multinode_nccl_all_collectives_20260523_120144.md
 delete mode 100644 reports_multinode_nccl_all_collectives_20260523_120144_artifacts.sha256
 delete mode 100644 reports_multinode_nccl_all_collectives_20260523_120144_bundle.sha256
 delete mode 100644 reports_multinode_nccl_all_collectives_artifacts_manifest_20260523_120144.md
 delete mode 100644 reports_multinode_nccl_all_collectives_run_20260523.md
 delete mode 100644 reports_multinode_nccl_alltoall_tuning_20260523.md
 delete mode 100644 reports_multinode_nccl_artifact_signal_analysis_20260523.md
 delete mode 100644 reports_multinode_nccl_counter_probe_20260523.md
 delete mode 100644 reports_multinode_nccl_deep_diagnose_run_20260523.md
 delete mode 100644 reports_multinode_nccl_diagnosis_20260523.md
 delete mode 100644 reports_multinode_nccl_diagnostic_2x8_debug_v2.md
 delete mode 100644 reports_multinode_nccl_diagnostic_2x8_nccl227_v2.md
 delete mode 100644 reports_multinode_nccl_diagnostic_2x8_sshfix.md
 delete mode 100644 reports_multinode_nccl_environment_gap_20260523.md
 delete mode 100644 reports_multinode_nccl_handoff_plan_20260523.md
 delete mode 100644 reports_multinode_nccl_latest_index_20260523.md
 delete mode 100644 reports_multinode_nccl_pdf_matrix_20260523_112247.md
 delete mode 100644 reports_multinode_nccl_pdf_matrix_20260523_113803.md
 delete mode 100644 reports_multinode_nccl_pdf_matrix_artifacts_manifest_20260523_113803.md
 delete mode 100644 reports_multinode_nccl_pdf_matrix_nccl227.md
 delete mode 100644 reports_multinode_nccl_pdf_matrix_run_20260523.md
 delete mode 100644 reports_multinode_nccl_smoke_256m_aikubeworker0012.json
 delete mode 100644 reports_multinode_nccl_smoke_256m_aikubeworker0012.md
 delete mode 100644 reports_multinode_nccl_sweep_2x8_nccl227.md
 delete mode 100644 reports_nvbandwidth_aikubeworker0012.json
 delete mode 100644 reports_nvbandwidth_aikubeworker0012.md
 delete mode 100644 reports_nvbandwidth_aikubeworker0016.json
 delete mode 100644 reports_nvbandwidth_aikubeworker0016.md
 delete mode 100644 reports_rdma_aikubeworker0012.json
 delete mode 100644 reports_rdma_aikubeworker0016.json
 delete mode 100644 reports_rdma_counter_aikubeworker0012_20260522_194808.md
 delete mode 100644 reports_rdma_counter_aikubeworker0016_20260522_194828.md
 delete mode 100644 reports_rdma_cross_node_mlx5_0_20260523.md
 delete mode 100644 reports_rdma_single_node_summary.md
 delete mode 100644 reports_single_gpu_aikubeworker0012.json
 delete mode 100644 reports_single_gpu_aikubeworker0012.md
 delete mode 100644 reports_single_gpu_aikubeworker0016.json
 delete mode 100644 reports_single_gpu_aikubeworker0016.md
 delete mode 100644 reports_stress_smoke_reasons_aikubeworker0012.json
 delete mode 100644 reports_stress_smoke_reasons_aikubeworker0012.md
 delete mode 100644 reports_stress_smoke_reasons_aikubeworker0016.json
 delete mode 100644 reports_stress_smoke_reasons_aikubeworker0016.md
 delete mode 100644 reports_test_all_latest_aikubeworker0012_20260522_203246.md
 delete mode 100644 reports_test_all_latest_aikubeworker0016_20260522_203447.md
 delete mode 100644 reports_test_all_latest_summary_cn_20260523.md
 delete mode 100644 reports_test_all_pdf_aikubeworker0012_20260522_182656.md
 delete mode 100644 reports_test_all_pdf_aikubeworker0016_20260522_182856.md
 delete mode 100644 reports_training_warmup_aikubeworker0012_20260522_194528.md
 delete mode 100644 reports_training_warmup_aikubeworker0016_20260522_194609.md

diff --git a/.gitignore b/.gitignore
index 99f18a6..2347ffb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,6 +6,12 @@ __pycache__/
 dist/
 build/
 reports/
+reports_*
+H100*.md
+test_all*.md
+docs/h100_test_all_metrics_guide_cn.md
+docs/multinode_nccl_concepts.md
+docs/multinode_nccl_deep_diagnose_runbook.md
 *.egg
 .eggs/
 *.log
@@ -14,5 +20,6 @@ reports/
 .venv/
 venv/
 .qoder/*
+.playwright-mcp/
 .claude/settings.local.json
 .omx/
diff --git a/README.md b/README.md
index 21aad0d..ebe1ae6 100644
--- a/README.md
+++ b/README.md
@@ -6,53 +6,10 @@
 > **支持 GPU 架构：** Ampere (A100/A800) · Hopper (H100/H200) · Blackwell (B200/B300)
 > 系统自动检测 GPU 型号并使用对应的规格参数进行基准对比。
 
-## H100 当前验收入口
-
-当前分支 `h100-acceptance-current` 已补齐 H100 单节点、多节点 NCCL、跨节点 RDMA 的主要证据链。按现有 PDF/配置口径，当前结论仍是 **FAIL**：脚本和证据基本可交付，但机器尚未达到生产验收阈值。
-
-| 优先级 | 文件 | 用途 |
-|---|---|---|
-| 1 | [reports_h100_acceptance_current_status_20260523.md](reports_h100_acceptance_current_status_20260523.md) | 当前总状态：已测项、失败项、阻塞项、下一步 |
-| 2 | [reports_h100_acceptance_closure_checklist_20260523.md](reports_h100_acceptance_closure_checklist_20260523.md) | 收尾检查清单：可交付项、未关闭门禁、最短收尾路径 |
-| 3 | [reports_h100_acceptance_delivery_manifest_20260523.md](reports_h100_acceptance_delivery_manifest_20260523.md) | 交付包 manifest：入口、脚本、远端 artifacts、checksum |
-| 4 | [reports_h100_acceptance_pr_summary_20260523.md](reports_h100_acceptance_pr_summary_20260523.md) | PR/审阅摘要：变更范围、验证、风险、合并说明 |
-| 5 | [reports_h100_network_hardware_escalation_request_20260523.md](reports_h100_network_hardware_escalation_request_20260523.md) | 给网络/硬件/环境侧的闭环请求和回填表 |
-| 6 | [reports_multinode_nccl_latest_index_20260523.md](reports_multinode_nccl_latest_index_20260523.md) | 多节点 NCCL 相关报告索引 |
-| 7 | [reports_multinode_nccl_handoff_plan_20260523.md](reports_multinode_nccl_handoff_plan_20260523.md) | 接手人复跑和继续定位计划 |
-| 8 | [reports_test_all_latest_summary_cn_20260523.md](reports_test_all_latest_summary_cn_20260523.md) | 单节点 `test all` 中文原始汇总 |
-| 9 | [reports_rdma_cross_node_mlx5_0_20260523.md](reports_rdma_cross_node_mlx5_0_20260523.md) | 跨节点 RDMA `mlx5_0` 双向结果 |
-
-当前主要阻塞：
-
-- 单节点 `test all`：两台节点均为 `6/10 PASS`，Compute、NCCL、Stress、RDMA 未过。
-- 跨节点 RDMA：`mlx5_0` 写带宽接近/达到阈值，但读带宽和读写延迟未过。
-- 多节点 NCCL：`2x8 allreduce`、`2x8 alltoall` 按 PDF 阈值未过；NCCL `wrong_count=0`，主要是性能不达标。
-- 环境差异：当前可用 400G IB rail 主要是 `mlx5_0,mlx5_1,mlx5_6,mlx5_7`，未发现外部 NCCL net plugin / SHARP / HCOLL。
-
-### H100 复跑入口
-
-远端默认路径为 `/root/test_gpu_scripts`，建议在 `nccl-gpu-1` 作为发起节点执行多节点测试。
-
-```bash
-# 单节点全量验收，分别在每台机器执行
-bash scripts/run_h100_single_node_all.sh
-
-# 多节点 NCCL PDF 矩阵：allreduce/alltoall x 2x1/2x2/2x4/2x8
-bash scripts/run_multinode_nccl_pdf_matrix.sh
-
-# 多节点 NCCL 六类 collective：2 节点 x 8 GPU
-bash scripts/run_multinode_nccl_all_collectives.sh
-
-# 多节点 NCCL 深度诊断和环境证据抓取
-bash scripts/multinode_nccl_deep_diagnose.sh preflight
-bash scripts/multinode_nccl_deep_diagnose.sh all
-```
-
 ---
 
 ## 目录
 
-- [H100 当前验收入口](#h100-当前验收入口)
 - [项目结构](#项目结构)
 - [环境要求](#环境要求)
 - [快速开始](#快速开始)
@@ -69,31 +26,23 @@ bash scripts/multinode_nccl_deep_diagnose.sh all
 ## 项目结构
 
 ```
-test_gpu_scripts/
-├── gpu_tester.py                               # 主入口：CLI + 交互式菜单
-├── install_deps.sh                             # 一键安装三方工具
+servertest/
+├── gpu_tester.py               # 主入口：CLI + 交互式菜单
+├── install_deps.sh             # 一键安装三方工具
 ├── configs/
-│   ├── default.yaml                            # 默认配置
-│   ├── multinode_nccl_nccl227_pdf_matrix.yaml  # H100 多节点 PDF 矩阵配置
-│   └── multinode_nccl_nccl227_all_collectives_2x8.yaml
+│   └── default.yaml            # 默认配置
 ├── modules/
-│   ├── gpu_specs.py                            # GPU 规格数据库
-│   ├── gpu_info.py                             # GPU 检测 & 信息
-│   ├── health_check.py                         # 健康诊断
-│   ├── benchmark.py                            # 内存带宽 + 计算吞吐
-│   ├── nccl_test.py                            # NCCL 多卡/多节点通信
-│   ├── stress_test.py                          # GPU 压力/稳定性
-│   ├── rdma_test.py                            # RDMA/InfiniBand
-│   ├── training_sim.py                         # 训练模拟
-│   └── report.py                               # 报告生成
-├── scripts/
-│   ├── run_h100_single_node_all.sh             # H100 单节点全量复跑
-│   ├── run_multinode_nccl_pdf_matrix.sh        # 多节点 NCCL PDF 矩阵复跑
-│   ├── run_multinode_nccl_all_collectives.sh   # 多节点 NCCL 六类 collective 复跑
-│   └── multinode_nccl_deep_diagnose.sh         # 多节点 NCCL 深度诊断
-├── docs/                                       # 指标说明和 runbook
-├── reports_*20260523*.md                       # 当前 H100 验收证据和汇总报告
-└── requirements.txt
+│   ├── gpu_specs.py            # GPU 规格数据库 (A100/A800/H100/H200/B200/B300)
+│   ├── gpu_info.py             # GPU 检测 & 信息
+│   ├── health_check.py         # 健康诊断
+│   ├── benchmark.py            # 内存带宽 + 计算吞吐
+│   ├── nccl_test.py            # NCCL 多卡通信
+│   ├── stress_test.py          # GPU 压力/稳定性
+│   ├── rdma_test.py            # RDMA/InfiniBand
+│   ├── training_sim.py         # 训练模拟
+│   └── report.py               # 报告生成
+├── requirements.txt
+└── 调研.md                     # 行业框架调研
 ```
 
 ---
@@ -210,7 +159,7 @@ python3 gpu_tester.py
  [3]  Memory Benchmark (nvbandwidth)
  [4]  Compute Benchmark
  [5]  NCCL Multi-GPU Test
- [6]  GPU Stress Test (PyTorch/gpu-burn)
+ [6]  GPU Stress Test (gpu-burn)
  [7]  RDMA/IB Test
  [8]  Training Simulation
  [9]  Full Test Suite (All Tests)
@@ -330,35 +279,33 @@ python3 gpu_tester.py --config /path/to/config.yaml --test all
 | FP16 | 312 TFLOPS | 990 TFLOPS | 2,250 TFLOPS | 3,500 TFLOPS |
 | BF16 | 312 TFLOPS | 990 TFLOPS | 2,250 TFLOPS | 3,500 TFLOPS |
 | FP8 | N/A | 1,979 TFLOPS | 4,500 TFLOPS | 7,000 TFLOPS |
-| FP64 | 9.7 TFLOPS | 67 TFLOPS | TBD | TBD |
-| INT8 | 624 TOPS | 1,979 TOPS | TBD | TBD |
 
-默认配置：8192×8192 矩阵，50 次 warmup，500 次迭代；逐 GPU 跑 FP32/TF32/FP16/BF16/FP8/FP64/INT8，并按同 dtype 的极差/均值判断一致性。
+默认配置：4096×4096 矩阵，10 次 warmup，100 次迭代。
 
 ### 5. NCCL Multi-GPU Test（多卡通信）
 
-优先使用官方 nccl-tests（通过 mpirun 调用）并解析真实 bus BW；如果只能走 torchrun fallback，验收结果会标记 FAIL。
+优先使用官方 nccl-tests（通过 mpirun 调用）；nccl-tests 不可用时回退到 torchrun（fallback）。
 
 | 操作 | 说明 |
 |---|---|
 | AllReduce | 最常用的集合通信 |
 | AllToAll | 模型并行关键操作 |
 | Broadcast | 参数同步 |
-| ReduceScatter | 必测 |
-| AllGather | 必测 |
-| SendRecv | 必测 |
+| ReduceScatter | 可选 |
+| AllGather | 可选 |
+| SendRecv | 可选 |
 
-默认按 PDF 口径测试 1MB、256MB、2GB 三个 size，每个 op 重复 3 次，取 worst bus BW 和标准差；标准差超过 3% 判 FAIL。
+默认测试数据量范围 8B ~ 256MB，5 次 warmup，20 次迭代。
 
 **NVLink 参考带宽：** A100/A800 ≥ 240 GB/s | H100/H200 ≥ 360 GB/s | B200/B300 ≥ 720 GB/s（40% NVLink 峰值）
 
 ### 6. GPU Stress Test（压力测试）
 
-默认使用 PyTorch BF16/FP16 GEMM 进行长时高功耗满载测试；也可在配置中启用 gpu-burn。测试期间采集温度、功耗、throttle、XID，并计算稳态功耗、温差和 TFLOPS 抖动。
+使用 gpu-burn 进行长时满载测试，验证热稳定性和内存正确性。
 
 | 参数 | 默认值 | 说明 |
 |---|---|---|
-| duration_sec | 1800 | 测试时长（秒） |
+| duration_sec | 60 | 测试时长（秒） |
 | use_tensor_cores | true | 使用 Tensor Core |
 | memory_pct | 90 | 内存占用比例 |
 
@@ -373,18 +320,18 @@ python3 gpu_tester.py --config /path/to/config.yaml --test all
 | 写延迟 | ib_write_lat |
 | 读延迟 | ib_read_lat |
 
-**参考阈值：** 端口 ACTIVE 且 ≥400Gbps；4MB 写/读带宽 ≥47GB/s；8B 写延迟 ≤2μs、读延迟 ≤3.5μs；PFC/ECN/CNP/congestion 计数为 0。
+**参考阈值：** 带宽 ≥ 50 GB/s，延迟 ≤ 10 μs
 
 ### 8. Training Simulation（训练模拟）
 
-默认跑 8 卡 DDP synthetic 1.5B Transformer 训练模拟。
+使用真实或合成模型模拟训练负载。
 
 | 模式 | 说明 |
 |---|---|
-| DDP 合成模型 | 约 1.5B 参数，8 卡 torchrun |
-| 单进程 fallback | 仅用于调试；生产验收按 FAIL |
+| 真实模型 | 加载 HuggingFace GPT-2（需安装 transformers） |
+| 合成模型 | 6 层 Transformer（无需额外依赖） |
 
-输出：tokens/sec、步时、warmup 后 step 抖动、峰值显存、最终 loss，并检查 loss 是否 NaN/Inf。
+输出：tokens/sec、步时、峰值显存、最终 loss。
 
 ---
 
@@ -404,14 +351,14 @@ benchmark:
     nvbandwidth_buffer_mb: 512          # nvbandwidth 缓冲区大小
     nvbandwidth_samples: 3              # nvbandwidth 采样次数
   compute:
-    dtypes: [fp32, tf32, fp16, bf16, fp8, fp64, int8]
-    matrix_size: 8192                   # GEMM 矩阵维度
-    warmup: 50
-    iterations: 500
+    dtypes: [fp32, tf32, fp16, bf16, fp8]
+    matrix_size: 4096                   # GEMM 矩阵维度
+    warmup: 10
+    iterations: 100
 
 health:
-  temp_warning: 75                      # 温度警告阈值 °C
-  temp_critical: 85                     # 温度严重阈值 °C
+  temp_warning: 80                      # 温度警告阈值 °C
+  temp_critical: 90                     # 温度严重阈值 °C
   power_limit: null                     # null = 自动匹配 GPU TDP
 
 nccl:
@@ -419,83 +366,26 @@ nccl:
   test_allreduce: true
   test_alltoall: true
   test_broadcast: true
-  test_reduce_scatter: true
-  test_allgather: true
-  test_sendrecv: true
-  message_sizes: [1M, 256M, 2G]
-  repeats: 3
-  max_stddev_pct: 3
-
-multinode_nccl:
-  enabled: false                        # true 时纳入 --test all
-  hosts:
-    - {name: nccl-gpu-1, addr: 172.72.8.12, slots: 8}
-    - {name: nccl-gpu-2, addr: 172.72.8.16, slots: 8}
-  tests: [all_reduce_perf, alltoall_perf]
-  topologies:
-    - {nodes: 2, gpus_per_node: 8}
-  mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun
-  extra_ld_library_path:                # 传给远端 rank 的 MPI/NCCL/CUDA 库路径
-    - /usr/mpi/gcc/openmpi-4.1.9a1/lib
-    - /root/gpu-test-venv/lib/python3.10/site-packages/nvidia/nccl/lib
-    - /usr/local/cuda-12.4/targets/x86_64-linux/lib
-  begin_size: 1k
-  end_size: 16g
-  step_factor: 2
-  warmup_iters: 10
-  socket_ifname: bond0
-  ib_gid_index: 3
-  ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7
 
 stress:
-  duration_sec: 1800                   # 压力测试时长
-  use_gpu_burn: false                  # 默认走 PyTorch GEMM stress
-  dtype: bf16
-  matrix_size: 24576
-  telemetry_interval_sec: 1
-  min_power_watts: 630
-  max_tflops_jitter_pct: 5
-  require_tflops_jitter: true
+  duration_sec: 60                     # 压力测试时长
   use_tensor_cores: true
 
 rdma:
-  min_bandwidth_gbps: 47              # RDMA 最低可接受带宽
-  min_port_rate_gbps: 400             # IB 端口最低速率
-  max_write_latency_us: 2.0
-  max_read_latency_us: 3.5
-  msg_size: 4194304                   # 4MB 带宽测试消息
-  latency_msg_size: 8                 # 8B 延迟测试消息
-  server_addr: null                   # client 模式 perftest 对端 IP
-  ibping_target: null                 # ibping 对端 LID/GID，不是 IP
-  role: auto                          # auto / server / client
-  pfc_ecn_counters: true
-
-nvlink:
-  expected_links_per_gpu: 18
-  expected_link_speed_gbps: 25
-  require_zero_errors: true
-
-dcgm:
-  diag_level: 3
-  timeout_sec: 3600
-  expected_num_gpus: 8
-  json_output: true
-  require_subtests: true
+  min_bandwidth_gbps: 50              # RDMA 最低可接受带宽（按上文参考阈值单位为 GB/s，注意键名后缀为 gbps）
+  max_latency_us: 10                  # RDMA 最大可接受延迟
+  msg_size: 65536                     # 测试消息大小
 
 training:
-  model: synthetic_1.5b                # 8 卡 synthetic Transformer
+  model: gpt2                          # HuggingFace 模型名
   batch_size: 8
   seq_length: 2048
   num_steps: 50
-  warmup_steps: 5
   dtype: bf16
-  mode: ddp
-  min_tokens_per_sec: 45000
-  max_step_jitter_pct: 3
 
 report:
   output_dir: ./reports
-  format: json                         # json / html / md
+  format: json                         # json 或 html
 ```
 
 ---
@@ -603,22 +493,22 @@ report:
 步骤 2: RDMA 网络测试
 ├── python3 gpu_tester.py --test rdma
 ├── 确认: IB 设备被识别
-├── 确认: 端口状态 ACTIVE 且 ≥400Gbps
-├── 确认: 4MB 写/读带宽 ≥47 GB/s
-├── 确认: 8B 写延迟 ≤2 μs、读延迟 ≤3.5 μs
-├── 确认: ibping 双向连通
-├── 确认: PFC/ECN/CNP/congestion 计数为 0
+├── 确认: 端口状态 Active
+├── 确认: 写带宽 ≥ 50 GB/s
+├── 确认: 延迟 ≤ 10 μs
 └── 异常: 检查 IB 线缆、交换机配置、子网管理器
 
 步骤 3: 多节点 NCCL 测试
-├── 在发起节点确认 mpirun、nccl-tests、跨节点 root SSH 可用
-├── 配置 configs/default.yaml 的 multinode_nccl.hosts / IB 参数
-├── 执行 PDF 风格 sweep:
-│   python3 gpu_tester.py --test multinode-nccl --report --format md
-├── 默认命令口径:
-│   mpirun -H <node1>:8,<node2>:8 --map-by ppr:8:node -np 16 \
-│     all_reduce_perf/alltoall_perf -b 1k -e 16g -f 2 -g 1 -w 10
-└── 确认: Peak Bus BW、Peak Size、wrong_count 正常
+├── 在每个节点上配置:
+│   export MASTER_ADDR=<主节点IP>
+│   export MASTER_PORT=29500
+│   export NCCL_SOCKET_IFNAME=ib0    # IB 网卡名
+│   export NCCL_DEBUG=INFO
+├── 运行 nccl-tests 手动测试:
+│   mpirun -np <总GPU数> -hostfile hosts \
+│     /opt/gpu-test-tools/nccl-tests/build/all_reduce_perf \
+│     -b 8 -e 256M -f 2 -g 1 -w 5 -n 20
+└── 确认: 多节点 AllReduce 带宽正常
 
 步骤 4: 训练验证
 ├── python3 gpu_tester.py --test training
@@ -626,17 +516,6 @@ report:
 └── 确认: 训练 loss 正常下降
 ```
 
-#### 多节点 NCCL 深度诊断
-
-当 SOP-3 的多节点 NCCL 结果与验收 PDF 不一致时，可以在发起节点运行深度诊断脚本，复现 counter 抓取、GRAPH/TUNING 日志和 PXN disabled sweep：
-
-```bash
-bash scripts/multinode_nccl_deep_diagnose.sh preflight
-bash scripts/multinode_nccl_deep_diagnose.sh all
-```
-
-详细参数、输出目录和解读方法见 [docs/multinode_nccl_deep_diagnose_runbook.md](/Users/d-robotics/lab/test_gpu_scripts/docs/multinode_nccl_deep_diagnose_runbook.md)。
-
 ---
 
 ### SOP-4: 故障诊断
diff --git a/docs/h100_test_all_metrics_guide_cn.md b/docs/h100_test_all_metrics_guide_cn.md
deleted file mode 100644
index 37abd28..0000000
--- a/docs/h100_test_all_metrics_guide_cn.md
+++ /dev/null
@@ -1,255 +0,0 @@
-# H100 `test all` 指标说明
-
-本文解释 `gpu_tester.py --test all` 报告里每一项指标的意义、它在验收中代表什么，以及异常时通常应该优先排查什么。
-
-适用报告：
-
-- `reports_test_all_latest_aikubeworker0012_20260522_203246.md`
-- `reports_test_all_latest_aikubeworker0016_20260522_203447.md`
-- `reports_test_all_latest_summary_cn_20260523.md`
-
-## 总体判定
-
-| 指标 | 意义 | 怎么看 |
-|---|---|---|
-| `Overall Acceptance Verdict` | 整机验收结论 | 按 PDF 生产验收规则，任一必测子项 FAIL，则整机 FAIL |
-| `Suite complete: x/10 tests passed` | 10 个测试模块里通过了几个 | 用来快速看整体健康度，但最终以 `Overall Acceptance Verdict` 为准 |
-| `PASS` | 达到当前配置阈值 | 表示该指标在当前测试口径下通过 |
-| `FAIL` | 未达到当前配置阈值，或证据不足 | 表示该项不能作为生产验收通过证据 |
-| `WARN` | 旧报告或非强制警告口径 | 当前 PDF 生产验收里，关键性能未达标应按 FAIL 处理 |
-
-## GPU Info
-
-GPU Info 是基础盘点项，用来确认机器硬件、驱动和 CUDA 环境是否符合预期。
-
-| 指标 | 意义 | 异常影响 |
-|---|---|---|
-| GPU count | 当前系统识别到的 GPU 数量 | H100 8 卡机器如果不是 8 张，后续所有多卡测试都不可信 |
-| GPU model | GPU 型号，例如 H100 | 型号不对会导致阈值、峰值、验收口径都不对 |
-| Driver version | NVIDIA 驱动版本 | 版本过旧可能影响 CUDA、NCCL、DCGM、NVLink 工具 |
-| CUDA version | CUDA 运行时或驱动支持版本 | CUDA 不匹配会导致 PyTorch、nccl-tests 或编译工具异常 |
-| GPU UUID / PCI bus id | GPU 唯一标识和 PCIe 拓扑位置 | 用于定位具体故障卡、对应槽位和链路 |
-
-这项通常不直接代表性能好坏，它是确认“测的是不是目标机器、目标 GPU、目标软件栈”。
-
-## Health Check
-
-Health Check 是空闲或轻负载状态下的基础健康检查。
-
-| 指标 | 意义 | 怎么看 |
-|---|---|---|
-| Temperature | 当前 GPU 温度 | 空闲温度过高可能说明散热、风道、环境温度异常 |
-| Power | 当前功耗 | 空闲功耗异常高可能说明有残留进程或功耗状态异常 |
-| ECC errors | 显存纠错错误 | 单比特错误过多或双比特错误通常需要重点关注硬件稳定性 |
-| PCIe | PCIe 代际和宽度，例如 Gen5 x16 | 降速或降宽会影响 CPU-GPU、RDMA、部分数据搬运性能 |
-| Throttle | 当前是否触发限速 | 空闲状态下非 idle throttle 不正常，可能影响后续性能 |
-| XID / NVRM events | 驱动或 GPU 错误事件 | 出现新 XID 通常说明硬件、驱动、供电或内核态异常 |
-
-Health PASS 只能说明基础状态正常，不代表满载性能一定达标。
-
-## Memory Bandwidth
-
-Memory Bandwidth 衡量数据搬运能力，包括 CPU 到 GPU、GPU 到 CPU、GPU 到 GPU。
-
-| 指标 | 意义 | 代表什么 |
-|---|---|---|
-| H2D | Host to Device，CPU 内存到 GPU 显存带宽 | 受 PCIe、NUMA、CPU 内存、驱动影响 |
-| D2H | Device to Host，GPU 显存到 CPU 内存带宽 | 受 PCIe、NUMA、CPU 内存、驱动影响 |
-| D2D | Device to Device，GPU 到 GPU 带宽 | 单节点多卡通常主要受 NVLink/NVSwitch 影响 |
-| Efficiency | 实测值相对理论或配置阈值的比例 | 用于快速判断是否达到预期带宽 |
-
-H2D/D2H 主要看 PCIe 和 CPU 侧链路是否正常。D2D 更接近多卡训练、NCCL 和 P2P 通信的基础能力。
-
-## Compute Throughput
-
-Compute Throughput 衡量 GPU 在不同数值格式下的矩阵计算吞吐，单位通常是 TFLOPS。
-
-| 指标 | 意义 | 常见用途 |
-|---|---|---|
-| FP32 | 32 位浮点性能 | 传统科学计算、部分模型训练和验证 |
-| TF32 | TensorFloat-32 Tensor Core 性能 | NVIDIA Ampere/Hopper 上常见的 FP32 加速路径 |
-| FP16 | 16 位浮点 Tensor Core 性能 | 深度学习训练和推理常用 |
-| BF16 | bfloat16 Tensor Core 性能 | 大模型训练常用，数值范围比 FP16 更稳 |
-| FP8 | 8 位浮点 Tensor Core 性能 | 新一代低精度训练/推理加速 |
-| FP64 | 64 位双精度性能 | HPC、科学计算、仿真 |
-| INT8 | 8 位整数性能 | 推理、量化模型 |
-| Achieved | 实测吞吐 | 越接近峰值越好 |
-| Peak | 理论峰值或规格峰值 | 用来计算效率 |
-| Threshold | 当前验收阈值 | 低于阈值则 FAIL |
-| Efficiency | `Achieved / Peak` | 衡量实测利用率 |
-
-### Compute Consistency
-
-Consistency 是看同一种 dtype 下，不同 GPU 之间性能是否均衡。
-
-| 指标 | 意义 | 异常含义 |
-|---|---|---|
-| Min | 8 张 GPU 里最慢卡的实测值 | 用于发现拖后腿的卡 |
-| Mean | 8 张 GPU 平均值 | 用于看整体水平 |
-| Max | 8 张 GPU 里最快卡的实测值 | 和 Min 一起计算离散度 |
-| Spread | `(Max - Min) / Mean` | 反映卡间性能差异 |
-
-Spread 超过阈值通常说明某些卡受温度、功耗、PCIe、后台负载、时钟策略或硬件状态影响。即使平均性能还可以，卡间差异过大也会拖慢分布式训练。
-
-## NVLink / NVSwitch
-
-NVLink/NVSwitch 测试确认 GPU 间高速互联是否完整、速率是否正确、错误计数是否干净。
-
-| 指标 | 意义 | 怎么看 |
-|---|---|---|
-| Active Links | 每张 GPU 当前活跃 NVLink 数 | H100 8 卡 SXM 常见期望是每卡 18 条 |
-| Expected Links | 配置期望链路数 | 少一条都可能影响拓扑和 NCCL 性能 |
-| Link speed | 单条链路速率 | 速率不对说明链路降级或识别异常 |
-| Error counters | NVLink 错误计数，例如 CRC/replay/recovery | 非零可能说明链路质量或硬件问题 |
-
-NVLink PASS 表示链路状态看起来正常，但 NCCL 仍可能因算法、拓扑、消息大小、NCCL 参数或系统噪声而不达标。
-
-## DCGM Diagnostic
-
-DCGM 是 NVIDIA 官方诊断工具。`dcgmi diag -r 3` 是比较完整的生产诊断级别。
-
-| 子项 | 意义 |
-|---|---|
-| Deployment/software | 驱动、库、系统软件依赖检查 |
-| Hardware/memory | GPU 显存健康检查 |
-| Hardware/diagnostic | GPU 硬件基础诊断 |
-| Hardware/nvbandwidth | GPU/NVLink/NVSwitch 带宽诊断 |
-| Integration/pcie | PCIe 集成和链路相关检查 |
-| Stress/targeted_stress | DCGM 自带目标压力测试 |
-| Stress/targeted_power | DCGM 自带目标功耗压力测试 |
-| summary | 该分类汇总结果 |
-
-DCGM PASS 是强证据，说明官方诊断没有发现明显硬件故障。但它不替代项目里的 NCCL、RDMA、长时间 telemetry 和训练模拟验收。
-
-## NCCL Multi-GPU
-
-NCCL 测试衡量单节点多 GPU 集合通信能力。它直接关系到多卡训练效率。
-
-| 指标 | 意义 | 为什么重要 |
-|---|---|---|
-| source | 测试来源 | 必须是 `nccl-tests` 才有真实 bus BW；`torchrun_fallback` 只能说明功能连通，不是性能验收 |
-| bus BW | NCCL 报告的总线等效带宽 | 用来衡量通信是否吃满 NVLink/NVSwitch |
-| message size | 消息大小，例如 1M、256M、2G | 小消息看延迟和调度，中大消息看带宽 |
-| repeats | 重复次数 | 减少偶然波动，当前按 3 次取样 |
-| worst bus BW | 多次结果里的最差值 | 生产验收更关注最差情况 |
-| mean bus BW | 多次平均值 | 反映稳定水平 |
-| stddev | 标准差或波动 | 波动大说明通信稳定性不足 |
-
-### NCCL op 含义
-
-| Op | 意义 | 常见场景 |
-|---|---|---|
-| allreduce | 每张卡都有一份数据，做规约后每张卡都拿到结果 | 数据并行梯度同步最常见 |
-| allgather | 每张卡收集所有卡的数据分片 | 模型并行、张量并行、参数/激活收集 |
-| reducescatter | 先规约再把结果切分给各卡 | ZeRO、优化器状态切分、分布式训练常用 |
-| broadcast | 一张卡把数据广播给其他卡 | 参数同步、初始化权重分发 |
-| sendrecv | 点对点发送和接收 | pipeline、定制通信、拓扑验证 |
-| alltoall | 每张卡向每张卡交换不同数据 | MoE、专家并行、shuffle 类通信 |
-
-NCCL 小消息失败常见于延迟、调度或阈值口径较严；大消息失败更偏向链路带宽、拓扑、NCCL 参数或 NVSwitch/PCIe/NUMA 配置问题。
-
-## Stress Test
-
-Stress Test 是长时间高负载稳定性测试。它不是只看“能不能跑完”，还要看满载期间的温度、功耗、限速和错误事件。
-
-| 指标 | 意义 | 怎么看 |
-|---|---|---|
-| duration | 实际压力测试时长 | 生产验收通常需要 30/60 分钟 |
-| source | 压力来源，例如 `pytorch` 或 `gpu-burn` | 说明用什么负载压 GPU |
-| dtype | 压力计算的数据类型，例如 BF16 | 影响 Tensor Core、功耗和温度 |
-| matrix_size | GEMM 矩阵边长 | 越大越容易形成持续高占用 |
-| memory_pct | 目标显存占用比例 | 避免只测很小负载 |
-| Avg steady power | 稳态平均功耗 | 判断是否真的把卡压起来 |
-| Max steady temp | 稳态最高温度 | 判断散热上限 |
-| Temp delta | 8 卡之间最高温和最低温的差 | 差异过大说明风道、散热或卡位不均衡 |
-| TFLOPS jitter | 稳态吞吐波动 | 波动大说明性能不稳定 |
-| Throttle events | 限速事件数量 | 非 idle throttle 会影响性能稳定性 |
-| XID events | 压测期间新增 XID 错误 | 出现 XID 通常是严重风险 |
-
-### Throttle 常见含义
-
-| 代码 | 常见含义 | 解释 |
-|---|---|---|
-| `0x1` | idle throttle | 空闲状态限速，通常不算真实问题 |
-| `0x4` | `sw_power_cap` | 达到软件功耗上限，性能可能被功耗墙限制 |
-| `0x8` | hardware slowdown | 硬件触发降速 |
-| `0x10` | thermal slowdown | 温度触发降速 |
-| `0x20` | power brake | 外部供电或硬件功率保护 |
-| `0x40` | software thermal slowdown | 软件温度策略触发降速 |
-
-当前报告里的 `sw_power_cap` 表示负载确实压到了功耗墙附近，但验收口径把非 idle throttle 作为失败原因之一，因为它会影响长时间稳定输出。
-
-## RDMA / InfiniBand
-
-RDMA 测试衡量 IB 网卡和网络链路性能。单节点 loopback 和跨节点 server/client 是两种不同证据，不能混用。
-
-| 指标 | 意义 | 怎么看 |
-|---|---|---|
-| Device | IB 设备名，例如 `mlx5_0` | 对应具体 HCA/端口 |
-| Port | 端口号 | 通常是 port 1 |
-| State | 端口状态，例如 ACTIVE/DOWN | ACTIVE 才能作为可用链路 |
-| Rate | 端口速率，例如 400 Gb/sec | 低于期望说明链路降级或接错网络 |
-| GID/LID | IB 寻址信息 | `ibping` 和跨节点定位会用到 |
-| ib_write_bw | RDMA write 带宽 | 客户端向远端写数据的吞吐 |
-| ib_read_bw | RDMA read 带宽 | 客户端从远端读数据的吞吐 |
-| ib_write_lat | RDMA write 延迟 | 小消息写延迟 |
-| ib_read_lat | RDMA read 延迟 | 小消息读延迟 |
-| ibping | IB 层连通性测试 | 看 LID/GID 层是否可达 |
-| PFC/ECN/CNP counters | 拥塞和流控相关计数 | 非零或增长可能说明网络拥塞/丢包/流控问题 |
-
-### 单节点与跨节点的区别
-
-| 口径 | 意义 | 能证明什么 | 不能证明什么 |
-|---|---|---|---|
-| `local_loopback` | 在同一台机器本地启动 perftest server/client | 工具、设备、单机端口基本可用 | 不能证明两台机器之间 RDMA 网络达标 |
-| server/client 跨节点 | 一台做 server，另一台做 client | 能证明实际跨节点 RDMA 带宽/延迟 | 需要明确 server_addr、ib_device、ib_port、ibping_target |
-
-RDMA read 带宽低于 write 带宽很常见，但生产验收会给 read/write 各自设置阈值。read 不过线时，需要排查 HCA 固件、BIOS、PCIe、NUMA、RoCE/IB 配置、交换机、PFC/ECN、线缆和端口速率。
-
-## Training Simulation
-
-Training Simulation 用一个合成 1.5B Transformer 训练负载验证 8 卡分布式训练是否能稳定运行。
-
-| 指标 | 意义 | 怎么看 |
-|---|---|---|
-| Model | 模型类型 | 当前是 synthetic 1.5B，不依赖真实数据集 |
-| Parameters | 参数量 | 用来确认负载规模是否达到预期 |
-| GPU Count | 参与训练的 GPU 数 | 生产口径要求 8 卡 DDP |
-| DType | 训练数值格式，例如 BF16 | 大模型训练常用 BF16 |
-| Batch Size | 每步 batch 大小 | 影响吞吐和显存 |
-| Seq Length | 序列长度 | 影响计算量和显存 |
-| Steps | 计入统计的训练步数 | 步数太少会导致统计不稳 |
-| Warmup Steps | 预热步数 | 避免把 CUDA 初始化、编译、缓存冷启动计入性能 |
-| Avg Step Time | 平均每步耗时 | 越低越好 |
-| Throughput | tokens/sec | 训练吞吐核心指标 |
-| Samples/sec | 每秒样本数 | 辅助衡量数据处理速度 |
-| Peak Memory | 峰值显存 | 看是否接近 OOM 或显存利用不足 |
-| Final Loss | 最后 loss | 用于确认数值是有限值，没有 NaN/Inf |
-| Step Jitter | step 时间抖动 | 抖动大说明训练不稳定 |
-| Distributed Mode | 分布式模式 | 必须是 `ddp` 才满足 8 卡分布式口径 |
-
-Training PASS 说明 8 卡 DDP 训练路径、NCCL 功能连通、PyTorch CUDA 和基本数值稳定性都没问题。但它不能替代 NCCL 性能测试，因为训练负载可能没有覆盖所有通信模式和消息大小。
-
-## 常见误读
-
-1. `DCGM PASS` 不等于整机验收 PASS。DCGM 是官方诊断的一部分，不覆盖全部业务性能门槛。
-2. `Training PASS` 不等于 NCCL 性能 PASS。训练能跑，只说明功能链路通；NCCL bus BW 仍可能不达标。
-3. `NVLink PASS` 不等于 NCCL PASS。链路数量和错误计数正常，不代表所有 NCCL op/size 都达到阈值。
-4. `ibping PASS` 不等于 RDMA 带宽 PASS。`ibping` 只证明连通性，不证明吞吐和延迟达标。
-5. `local_loopback` 不能当作跨节点 RDMA 证据。跨节点验收必须有 server/client 两端证据。
-6. Stress 跑满 30 分钟不等于 PASS。温差、功耗、throttle、XID、jitter 都要一起看。
-7. 小消息 NCCL 低不一定是链路断了，可能是延迟、算法、启动开销或阈值口径导致；但生产验收仍按阈值判定。
-
-## 排查优先级建议
-
-| 失败项 | 优先看什么 |
-|---|---|
-| Compute FAIL | GPU 时钟、功耗策略、MIG/MPS、后台进程、PyTorch/CUDA 版本、benchmark 算法是否用到目标 Tensor Core 路径 |
-| NCCL FAIL | `NCCL_DEBUG=INFO`、拓扑、NVSwitch/NVLink、NCCL 算法、消息大小、PCIe/NUMA、进程绑核 |
-| Stress FAIL | 机箱风道、风扇、环境温度、功耗上限、`nvidia-smi -q -d POWER,CLOCK,TEMPERATURE` |
-| RDMA FAIL | 端口速率、HCA 固件、线缆、交换机、PFC/ECN、NUMA、BIOS、跨节点 server/client 配置 |
-| Training FAIL | torchrun、NCCL 环境变量、CUDA OOM、loss NaN/Inf、DDP 初始化、网络/共享内存 |
-
-## 一句话版
-
-这套报告不是只看 GPU 能不能亮、训练能不能跑，而是同时验证：硬件识别、基础健康、显存和互联带宽、计算吞吐、多卡通信、长时间满载稳定性、IB/RDMA 网络、官方 DCGM 诊断和 8 卡训练业务路径。任何一个关键项 FAIL，按生产验收都应判整机不通过。
diff --git a/docs/multinode_nccl_concepts.md b/docs/multinode_nccl_concepts.md
deleted file mode 100644
index 52d9b87..0000000
--- a/docs/multinode_nccl_concepts.md
+++ /dev/null
@@ -1,361 +0,0 @@
-# 多机多卡 NCCL 测试概念说明
-
-本文先讲概念，不涉及脚本改造。目标是理解两台 8 卡 H100 服务器做多机多卡通信测试时，应该从哪些层次逐步验证，以及每一层到底在证明什么。
-
-当前示例机器：
-
-| 别名 | 主机名 | 内网 IP | GPU |
-|---|---|---|---|
-| nccl-gpu-1 | aikubeworker0012 | 172.72.8.12 | 8 x H100 |
-| nccl-gpu-2 | aikubeworker0016 | 172.72.8.16 | 8 x H100 |
-
-两台机器合起来就是 16 张 GPU。多机 NCCL 测试的核心问题是：这 16 张 GPU 是否能通过正确的 GPU、NVLink、PCIe、IB/RDMA 网络路径，高效且正确地完成集体通信。
-
-## 1. 总体思路
-
-多机多卡通信测试是一个自底向上的过程。越底层越接近硬件和链路，越上层越接近真实训练业务。
-
-```mermaid
-flowchart TD
-    L0["0. 物理与基础连通<br/>电源 / GPU / 网卡 / 线缆 / 交换机 / SSH"] --> L1["1. 系统识别层<br/>nvidia-smi / lspci / ibstat / ibdev2netdev"]
-    L1 --> L2["2. 单机 GPU 健康层<br/>温度 / 功耗 / ECC / PCIe / Throttling / NVLink Topo"]
-    L2 --> L3["3. 单机 GPU 性能层<br/>HBM 带宽 / H2D-D2H / FP32-TF32-FP16-BF16-FP8 算力"]
-    L3 --> L4["4. 单机多卡通信层<br/>单节点 8 卡 NCCL over NVLink/NVSwitch"]
-    L4 --> L5["5. 跨机网络与 RDMA 层<br/>IP 连通 / IB Active / RDMA 带宽 / RDMA 延迟"]
-    L5 --> L6["6. 跨机 NCCL 层<br/>两机 16 卡 AllReduce / AllGather / ReduceScatter / Broadcast / AllToAll"]
-    L6 --> L7["7. 训练负载层<br/>torchrun / Megatron / DeepSpeed / 业务训练压测"]
-```
-
-最重要的原则：
-
-**上层失败，不一定是上层问题。**
-
-比如两机 `all_reduce_perf` 失败，原因可能在 NCCL，也可能在 SSH、MPI、IB、GID、网卡选择、驱动版本、CUDA 版本、NCCL 版本或 GPU Direct RDMA。
-
-所以排查顺序应该是：
-
-```text
-基础连通 -> 单机健康 -> 单机性能 -> 单机 NCCL -> 跨机 RDMA -> 跨机 NCCL -> 训练业务
-```
-
-## 2. 两机 16 卡通信路径
-
-单机内部主要走 NVLink/NVSwitch；跨机器时，数据必须经过 GPU、PCIe/NVLink、网卡、交换机和对端网卡。
-
-```mermaid
-flowchart LR
-    subgraph A["aikubeworker0012 / 172.72.8.12"]
-        A0["GPU0"] --- ASW["NVSwitch / NVLink"]
-        A1["GPU1"] --- ASW
-        A2["..."] --- ASW
-        A7["GPU7"] --- ASW
-        ASW --> ANIC["IB/RDMA NIC(s)"]
-    end
-
-    subgraph NET["InfiniBand / RoCE Fabric"]
-        SW["IB Switch"]
-    end
-
-    subgraph B["aikubeworker0016 / 172.72.8.16"]
-        BNIC["IB/RDMA NIC(s)"] --> BSW["NVSwitch / NVLink"]
-        B0["GPU0"] --- BSW
-        B1["GPU1"] --- BSW
-        B2["..."] --- BSW
-        B7["GPU7"] --- BSW
-    end
-
-    ANIC <--> SW
-    SW <--> BNIC
-```
-
-这里有两个不同的通信域：
-
-| 通信域 | 典型路径 | 主要测试 |
-|---|---|---|
-| 单机内 8 卡 | GPU -> NVLink/NVSwitch -> GPU | 单机 NCCL、NVLink topo、D2D |
-| 跨机器 16 卡 | GPU -> NIC -> IB/RDMA 网络 -> NIC -> GPU | RDMA、跨机 NCCL |
-
-这两个域的性能阈值不能混用。单机 NVSwitch 很快，跨机 RDMA 一般慢一些，跨机 NCCL 的瓶颈通常在 IB/RDMA 网络。
-
-## 3. 每一层要测什么
-
-### 3.1 基础连通层
-
-这一层只证明机器能访问、身份和地址正确。
-
-要确认：
-
-| 检查项 | 目的 |
-|---|---|
-| SSH 互通 | MPI/NCCL 多机启动依赖远端拉起进程 |
-| hostname 正确 | 避免登录错机器 |
-| IP 正确 | 确认使用的是训练网络或 IB/RDMA 对应网络 |
-| 时间同步 | 长时间训练日志和超时排查更可靠 |
-
-这一层不证明 GPU 或 RDMA 性能，只证明“机器能互相找到”。
-
-### 3.2 系统识别层
-
-这一层证明系统能看见 GPU 和网卡。
-
-常见信息：
-
-| 工具 | 看什么 |
-|---|---|
-| `nvidia-smi` | GPU 数量、型号、驱动、CUDA、温度、功耗 |
-| `nvidia-smi topo -m` | GPU、NIC、CPU NUMA、NVLink/NVSwitch 拓扑 |
-| `ibstat` | IB 设备、端口状态、链路速率 |
-| `ibdev2netdev` | mlx5 设备和网络接口的映射 |
-| `/sys/class/infiniband` | 端口状态、link layer、rate、GID |
-
-这一层很关键，因为 NCCL 经常因为选错网卡而跑到 TCP 或错误的接口上。
-
-### 3.3 单机 GPU 健康层
-
-这一层证明每台机器自己是健康的。
-
-```mermaid
-flowchart LR
-    H["单机健康检查"] --> T["温度"]
-    H --> P["功耗"]
-    H --> E["ECC 错误"]
-    H --> PCIE["PCIe Gen/Width"]
-    H --> C["SM/Mem Clock"]
-    H --> TH["Throttling"]
-    H --> PM["Persistence Mode"]
-```
-
-如果某张卡温度过高、ECC double-bit、PCIe 降级或 throttling，后面的 NCCL 测试即使能跑，结果也不可信。
-
-### 3.4 单机 GPU 性能层
-
-这一层证明每台机器的 GPU 本身性能正常。
-
-| 测试 | 证明什么 |
-|---|---|
-| HBM/D2D 带宽 | GPU 显存和设备间拷贝能力 |
-| H2D/D2H 带宽 | CPU/Host 到 GPU 的 PCIe 路径 |
-| FP32/TF32 | 基础矩阵计算能力 |
-| FP16/BF16/FP8 | 训练常用 Tensor Core 能力 |
-
-这一步是单机验收。它不能证明两台机器之间通信正常，但可以排除“某台机器本身 GPU 算力或带宽异常”。
-
-### 3.5 单机多卡 NCCL 层
-
-这一层验证单台机器 8 卡之间的集体通信。
-
-```mermaid
-flowchart TD
-    S["单机 8 卡 NCCL"] --> AR["AllReduce"]
-    S --> AG["AllGather"]
-    S --> RS["ReduceScatter"]
-    S --> BC["Broadcast"]
-    S --> AT["AllToAll"]
-```
-
-单机 NCCL 主要看 NVLink/NVSwitch 通信路径是否正常。常见指标：
-
-| 指标 | 含义 |
-|---|---|
-| `algbw` | 算法视角的有效带宽 |
-| `busbw` | 总线视角的带宽，更适合比较通信链路利用率 |
-| `#wrong` | 结果错误数量，必须是 0 |
-
-单机测试通过后，只能说明单台服务器内部 8 卡通信正常。
-
-### 3.6 跨机 RDMA 层
-
-这一层验证两台机器之间的网络和 RDMA 能力，不涉及 NCCL。
-
-```mermaid
-sequenceDiagram
-    participant N1 as aikubeworker0012
-    participant FAB as IB/RDMA Fabric
-    participant N2 as aikubeworker0016
-
-    N1->>N2: ping / ssh
-    N1->>FAB: ib_write_bw client
-    FAB->>N2: ib_write_bw server
-    N1->>FAB: ib_read_bw client
-    FAB->>N2: ib_read_bw server
-    N1->>N2: ib_write_lat / ib_read_lat
-```
-
-这一层要回答：
-
-| 问题 | 说明 |
-|---|---|
-| IB 端口是否 Active | 没 Active 就不用跑 NCCL |
-| RDMA 带宽是否达标 | 证明网络数据面能跑起来 |
-| RDMA 延迟是否正常 | 高延迟会影响小消息和训练同步 |
-| 是否是 InfiniBand/RoCE | 两者环境变量和排障点不同 |
-
-如果 RDMA 层失败，跨机 NCCL 大概率也会失败或退化到 TCP。
-
-### 3.7 跨机 NCCL 层
-
-这一层才是真正的多机多卡 NCCL 测试。
-
-两台 8 卡机器通常是：
-
-```text
-2 nodes x 8 GPUs = 16 ranks
-每个 rank 绑定 1 张 GPU
-```
-
-概念上是：
-
-```mermaid
-flowchart LR
-    subgraph N1["Node 1: 172.72.8.12"]
-        R0["rank 0 / GPU0"]
-        R1["rank 1 / GPU1"]
-        R2["..."]
-        R7["rank 7 / GPU7"]
-    end
-
-    subgraph N2["Node 2: 172.72.8.16"]
-        R8["rank 8 / GPU0"]
-        R9["rank 9 / GPU1"]
-        R10["..."]
-        R15["rank 15 / GPU7"]
-    end
-
-    R0 <--> R8
-    R1 <--> R9
-    R7 <--> R15
-    N1 <--> N2
-```
-
-典型测试项：
-
-| NCCL 测试 | 训练里对应什么 |
-|---|---|
-| AllReduce | 数据并行梯度同步 |
-| ReduceScatter | ZeRO/FSDP 梯度切分 |
-| AllGather | ZeRO/FSDP 参数聚合 |
-| Broadcast | 参数广播、初始化 |
-| AllToAll | MoE、专家并行、部分并行策略 |
-| SendRecv | 点对点通信、pipeline parallel |
-
-跨机 NCCL 要看：
-
-| 指标 | 判定 |
-|---|---|
-| 是否成功启动 16 rank | MPI/SSH/路径/环境是否正常 |
-| `#wrong == 0` | 正确性必须过 |
-| `busbw` | 跨节点通信链路利用率 |
-| 是否走 IB/RDMA | 需要从 `NCCL_DEBUG=INFO` 确认 |
-| 是否退化 TCP | 如果退化，性能会明显偏低 |
-
-## 4. NCCL 为什么要分单机和跨机
-
-单机 8 卡通信和跨机 16 卡通信的瓶颈不同。
-
-```mermaid
-flowchart TD
-    A["NCCL 性能结果"] --> B{"测试范围"}
-    B --> C["单机 8 卡"]
-    B --> D["跨机 16 卡"]
-
-    C --> C1["主要瓶颈：NVLink / NVSwitch"]
-    C --> C2["阈值可参考 GPU NVLink 能力"]
-
-    D --> D1["主要瓶颈：IB/RDMA 网络"]
-    D --> D2["阈值应参考网卡数量、速率、拓扑和 rail 数"]
-```
-
-所以不能用单机 NVLink 的阈值直接判断跨机 NCCL。跨机要根据真实网络能力设阈值，例如：
-
-| 网络配置 | 理论上限理解 |
-|---|---|
-| 单张 400G 网卡 | 约 50 GB/s 单向原始带宽 |
-| 8 张 400G 网卡 | 约 400 GB/s 原始聚合带宽 |
-| 实测 NCCL busbw | 会受拓扑、GDR、rail、NUMA、交换机、NCCL 算法影响 |
-
-实际验收时，应该先知道每台机器有几张 IB/RDMA 网卡、每张速率多少、GPU 到 NIC 的拓扑关系，再定跨机 NCCL 阈值。
-
-## 5. 常见失败位置
-
-```mermaid
-flowchart TD
-    F["跨机 NCCL 失败"] --> A["启动失败"]
-    F --> B["能启动但很慢"]
-    F --> C["运行中 timeout"]
-    F --> D["结果 #wrong 非 0"]
-
-    A --> A1["SSH 不通"]
-    A --> A2["远端路径不存在"]
-    A --> A3["MPI 环境不一致"]
-    A --> A4["root 运行未允许"]
-
-    B --> B1["NCCL_SOCKET_IFNAME 选错"]
-    B --> B2["没走 IB/RDMA，退化 TCP"]
-    B --> B3["NCCL_IB_HCA 没选对"]
-    B --> B4["GPU Direct RDMA 没生效"]
-
-    C --> C1["IB 端口不稳定"]
-    C --> C2["交换机/PFC/ECN 问题"]
-    C --> C3["NCCL timeout 配置"]
-    C --> C4["驱动/CUDA/NCCL 版本不兼容"]
-
-    D --> D1["通信正确性失败"]
-    D --> D2["必须 FAIL，不能只看带宽"]
-```
-
-## 6. 推荐验收顺序
-
-下面是面向两台 8 卡机器的推荐顺序：
-
-```mermaid
-flowchart TD
-    A["Step 1: 两台机器基础信息"] --> B["Step 2: 两台机器单机 GPU 健康"]
-    B --> C["Step 3: 两台机器单机 benchmark"]
-    C --> D["Step 4: 两台机器分别跑单机 8 卡 NCCL"]
-    D --> E["Step 5: 两台机器互测 RDMA bandwidth/latency"]
-    E --> F["Step 6: 两机 16 卡 NCCL correctness"]
-    F --> G["Step 7: 两机 16 卡 NCCL performance"]
-    G --> H["Step 8: 两机训练 demo 或业务压测"]
-```
-
-每一步的意义：
-
-| 步骤 | 目的 |
-|---|---|
-| Step 1 | 确认没有登录错机器，基础网络和环境存在 |
-| Step 2 | 排除 GPU 健康问题 |
-| Step 3 | 排除 GPU 单卡/单机性能问题 |
-| Step 4 | 排除单机 NVLink/NVSwitch/NCCL 问题 |
-| Step 5 | 排除跨机 RDMA 问题 |
-| Step 6 | 先证明 NCCL 正确性 |
-| Step 7 | 再证明 NCCL 性能 |
-| Step 8 | 最后用真实训练形态验证稳定性 |
-
-## 7. 对当前脚本的映射
-
-当前脚本已有模块和上面层次的关系：
-
-| 当前模块 | 覆盖层次 | 备注 |
-|---|---|---|
-| `gpu_info` | 系统识别层 | 单机 |
-| `health` | 单机 GPU 健康层 | 单机 |
-| `benchmark` | 单机 GPU 性能层 | 单机 |
-| `nccl` | 单机多卡通信层 | 当前主要是单机 |
-| `rdma` | RDMA 检查 | 当前偏本机检查，不是两机互测 |
-| `stress` | 稳定性 | 单机 |
-| `training` | 训练负载层 | 当前偏单机 |
-| 建议新增 `multi_node_nccl` | 跨机 NCCL 层 | 专门处理 hostfile、mpirun、多节点环境、结果解析 |
-
-如果未来要扩展脚本，比较自然的方向是新增一个多机模块，而不是把所有逻辑塞进现有 `nccl` 模块。
-
-## 8. 最小概念模型
-
-记住这句话即可：
-
-```text
-单机 NCCL 验证 GPU 之间的 NVLink/NVSwitch。
-跨机 RDMA 验证机器之间的网络。
-跨机 NCCL 验证 NCCL 是否能把 GPU 和网络组合起来，为真实训练提供高效通信。
-```
-
-因此，多机多卡测试不是一个命令，而是一条验证链路。
diff --git a/docs/multinode_nccl_deep_diagnose_runbook.md b/docs/multinode_nccl_deep_diagnose_runbook.md
deleted file mode 100644
index 433d1ce..0000000
--- a/docs/multinode_nccl_deep_diagnose_runbook.md
+++ /dev/null
@@ -1,219 +0,0 @@
-# 多机 NCCL 深度诊断 runbook
-
-本文档用于复现 2026-05-23 这轮 2 机 8 卡 NCCL 排查里的关键动作：counter 抓取、GRAPH/TUNING 日志、以及 PXN disabled 基线上的二次参数 sweep。
-
-## 适用场景
-
-当前默认参数面向：
-
-- `aikubeworker0012` / `172.72.8.12`
-- `aikubeworker0016` / `172.72.8.16`
-- 每节点 8 GPU
-- 每节点 4 条 400G HCA：`mlx5_0,mlx5_1,mlx5_6,mlx5_7`
-- NCCL 临时运行库：`/tmp/nccl-2.27.7-cuda12.4`
-- nccl-tests：`/data/nccl-tests-latest/build`
-- OpenMPI：`/usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun`
-
-脚本应在 coordinator 节点上执行，当前即 `aikubeworker0012`。
-
-## 快速运行
-
-```bash
-cd /root/test_gpu_scripts
-bash scripts/multinode_nccl_deep_diagnose.sh preflight
-bash scripts/multinode_nccl_deep_diagnose.sh all
-```
-
-如果要按 PDF 参考矩阵跑正式多机多卡报告，使用：
-
-```bash
-cd /root/test_gpu_scripts
-bash scripts/run_multinode_nccl_pdf_matrix.sh
-```
-
-它会跑 2 机 x 1/2/4/8 GPU per node 的 `all_reduce_perf` 和 `alltoall_perf`，输出到
-`reports/multinode_nccl_pdf_matrix_YYYYMMDD_HHMMSS.md`。
-
-同时会生成：
-
-```text
-reports/multinode_nccl_pdf_matrix_YYYYMMDD_HHMMSS_artifacts/
-```
-
-每个 case 保存完整 `*.cmd.txt`、`*.stdout.txt`、`*.stderr.txt` 和解析后的 `*.json`，用于复核原始 NCCL 输出。
-
-默认输出目录为：
-
-```text
-/tmp/nccl_deep_diagnose_YYYYMMDD_HHMMSS
-```
-
-只跑单项：
-
-```bash
-# 轻量检查 SSH、mpirun、nccl-tests 和 HCA 路径
-bash scripts/multinode_nccl_deep_diagnose.sh preflight
-
-# allreduce counter 对照
-bash scripts/multinode_nccl_deep_diagnose.sh allreduce-counter
-
-# PXN disabled alltoall counter
-bash scripts/multinode_nccl_deep_diagnose.sh alltoall-counter
-
-# NCCL GRAPH/TUNING/COLL 对照
-bash scripts/multinode_nccl_deep_diagnose.sh graph
-
-# PXN disabled 基线上的二次参数 sweep
-bash scripts/multinode_nccl_deep_diagnose.sh pxn-sweep
-```
-
-## 常用参数覆盖
-
-```bash
-OUT_DIR=/tmp/my_nccl_diag \
-HOSTS=172.72.8.12:8,172.72.8.16:8 \
-PEER_HOST=172.72.8.16 \
-HCAS="mlx5_0 mlx5_1 mlx5_6 mlx5_7" \
-HCA_CSV=mlx5_0,mlx5_1,mlx5_6,mlx5_7 \
-bash scripts/multinode_nccl_deep_diagnose.sh all
-```
-
-如果 nccl-tests 或 NCCL 运行库路径变化：
-
-```bash
-NCCL_TESTS_DIR=/data/nccl-tests-latest/build \
-NCCL_LD_LIBRARY_PATH=/usr/mpi/gcc/openmpi-4.1.9a1/lib:/path/to/nccl/lib:/usr/local/cuda/lib64 \
-bash scripts/multinode_nccl_deep_diagnose.sh graph
-```
-
-## 输出解读
-
-### preflight 模式
-
-典型输出文件：
-
-```text
-preflight.txt
-```
-
-该模式不跑 NCCL workload，只检查：
-
-- 本机和对端主机名。
-- OpenMPI `mpirun` 是否存在且可执行。
-- `all_reduce_perf` / `alltoall_perf` 是否存在且可执行。
-- 配置的 HCA 是否能在 `/sys/class/infiniband/<hca>/ports/1` 下读到 state/rate。
-- 发起节点到 `PEER_HOST` 的 root SSH 是否可用。
-
-如果这里出现 `MISSING`，先修环境；否则再跑 `all` 或单项诊断。
-
-### counter 模式
-
-典型输出文件：
-
-```text
-allreduce_counter/
-  allreduce.log
-  before.local
-  before.remote
-  after.local
-  after.remote
-  counter_delta.txt
-
-alltoall_pxn_counter/
-  alltoall_pxn.log
-  before.local
-  before.remote
-  after.local
-  after.remote
-  counter_delta.txt
-```
-
-重点看 `counter_delta.txt`：
-
-- `port_xmit_data` / `port_rcv_data`：端口流量，单位为 4-byte words，脚本同时换算 GiB。
-- `port_xmit_wait`：发送等待或 credit/拥塞等待信号。注意它不是 alltoall 独有根因，因为高吞吐 allreduce 也会出现。
-- `port_xmit_discards`、`port_rcv_errors`、`symbol_error`、`roce_adp_retrans`、`packet_seq_err` 等：错误、丢包、重传、链路异常类信号。
-
-当前已知基线：
-
-- allreduce 可到约 `354 GB/s busbw`，4 条 rail 均衡。
-- PXN disabled alltoall 通常在 `36-37 GB/s busbw` 附近，但有窗口波动。
-- alltoall PXN disabled 后 rail 均衡，且没有明显 error/retrans/slow restart。
-
-### graph 模式
-
-典型输出文件：
-
-```text
-graph/
-  allreduce.log
-  allreduce_summary.txt
-  alltoall_pxn.log
-  alltoall_pxn_summary.txt
-```
-
-重点看：
-
-- `nccl_version`
-- `plugin_missing`
-- `gdr_enabled_lines`
-- `pattern_counts`
-- `channel_summary`
-- `NET/IB/*/GDRDMA`
-- `P2P/CUMEM`
-- `channel_edge_lines`
-
-当前已知对照：
-
-| 观察项 | allreduce | alltoall + `NCCL_PXN_DISABLE=1` |
-|--------|-----------|----------------------------------|
-| HCA / GDR | 4 HCA, GDR enabled | 4 HCA, GDR enabled |
-| channels | `16 coll / 16 nvls / 16 p2p` | `16 coll / 16 nvls / 16 p2p` |
-| `NET/IB/*/GDRDMA` channel edge lines | `256` | `512` |
-| `P2P/CUMEM` channel edge lines | `0` | `224` |
-| total NET/P2P channel edge lines | `256` | `736` |
-
-判断边界：
-
-- 如果 HCA/GDR/channel 基础状态一致，但 alltoall graph 明显更复杂，问题更偏向 NCCL collective graph、P2P/NET 组合方式、internal IB plugin 或交换网络策略。
-- 如果 GDR disabled、HCA 不完整、plugin 路径变化，则不能直接与当前报告结论对比。
-
-### pxn-sweep 模式
-
-典型输出：
-
-```text
-pxn_sweep/
-  baseline.log
-  nvls_off.log
-  qps4_split1.log
-  qps8_split1.log
-  qps4_split0.log
-  channels16.log
-  buff8m.log
-  p2pchunk4m.log
-  netpeer8.log
-  ar0.log
-  summary.txt
-```
-
-当前结论：
-
-- `NCCL_PXN_DISABLE=1` 是已发现的唯一稳定正向项。
-- 在 PXN disabled 基线上继续叠加 NVLS、P2P chunk、buffer、channel、QP/split、AR，没有稳定收益。
-- QP/split 和 `NCCL_NCHANNELS_PER_NET_PEER=8` 在当前环境下明显变差。
-
-## 交接给网络/NCCL 环境侧的重点
-
-1. 当前不是旧 NCCL/GDR disabled 问题：NCCL `2.27.7` 下 4 条 HCA 都是 GDR enabled。
-2. 当前不是 rail 完全打偏问题：`NCCL_PXN_DISABLE=1` 后 alltoall 的 4 条 rail 已均衡。
-3. 当前不是明显坏链路/重传问题：未看到 discard、symbol error、RoCE retrans、slow restart、packet sequence error 等增长。
-4. allreduce 已接近当前 4 x 400G rail 的物理可用带宽；PDF 8 卡 allreduce 目标反推需要超过当前 4 rail 单向理论带宽。
-5. alltoall 剩余差距更像 NCCL internal alltoall graph、P2P/NET 组合方式、缺少 NCCL net plugin/SHARP，或交换网络策略/ECMP/拥塞控制问题。
-
-## 关联报告
-
-- `reports_multinode_nccl_diagnosis_20260523.md`
-- `reports_multinode_nccl_alltoall_tuning_20260523.md`
-- `reports_multinode_nccl_counter_probe_20260523.md`
-- `reports_multinode_nccl_pdf_matrix_nccl227.md`
diff --git a/reports_all_aikubeworker0016.json b/reports_all_aikubeworker0016.json
deleted file mode 100644
index d3db53f..0000000
--- a/reports_all_aikubeworker0016.json
+++ /dev/null
@@ -1,921 +0,0 @@
-{
-  "timestamp": "2026-05-22T15:49:02.368516",
-  "gpu_info": {
-    "driver_version": "580.159.03",
-    "cuda_version": "13.0",
-    "gpu_count": 8,
-    "gpus": [
-      {
-        "index": 0,
-        "name": "NVIDIA H100 80GB HBM3",
-        "uuid": "GPU-dfbc9513-255d-4fe7-2b77-7b1ec3972e75",
-        "pci_bus_id": "00000000:18:00.0",
-        "pcie_link_gen": 5,
-        "pcie_link_width": 16,
-        "vram_total_mb": 81559,
-        "vram_used_mb": 4,
-        "vram_free_mb": 81076,
-        "power_draw": 69.98,
-        "power_limit": 700.0,
-        "clock_sm": 345,
-        "clock_mem": 2619,
-        "temperature": 21,
-        "fan_speed": 0,
-        "persistence_mode": false,
-        "compute_mode": "Default",
-        "serial_number": "1651924016120",
-        "ecc_errors_single": 0,
-        "ecc_errors_double": 0
-      },
-      {
-        "index": 1,
-        "name": "NVIDIA H100 80GB HBM3",
-        "uuid": "GPU-bb845ef7-d7b5-f011-9395-ea74274e2282",
-        "pci_bus_id": "00000000:2A:00.0",
-        "pcie_link_gen": 5,
-        "pcie_link_width": 16,
-        "vram_total_mb": 81559,
-        "vram_used_mb": 4,
-        "vram_free_mb": 81076,
-        "power_draw": 67.54,
-        "power_limit": 700.0,
-        "clock_sm": 345,
-        "clock_mem": 2619,
-        "temperature": 21,
-        "fan_speed": 0,
-        "persistence_mode": false,
-        "compute_mode": "Default",
-        "serial_number": "1651924015483",
-        "ecc_errors_single": 0,
-        "ecc_errors_double": 0
-      },
-      {
-        "index": 2,
-        "name": "NVIDIA H100 80GB HBM3",
-        "uuid": "GPU-3720cf13-2a34-be38-27be-0a7adc4addc4",
-        "pci_bus_id": "00000000:3A:00.0",
-        "pcie_link_gen": 5,
-        "pcie_link_width": 16,
-        "vram_total_mb": 81559,
-        "vram_used_mb": 4,
-        "vram_free_mb": 81076,
-        "power_draw": 66.82,
-        "power_limit": 700.0,
-        "clock_sm": 345,
-        "clock_mem": 2619,
-        "temperature": 22,
-        "fan_speed": 0,
-        "persistence_mode": false,
-        "compute_mode": "Default",
-        "serial_number": "1651924025595",
-        "ecc_errors_single": 0,
-        "ecc_errors_double": 0
-      },
-      {
-        "index": 3,
-        "name": "NVIDIA H100 80GB HBM3",
-        "uuid": "GPU-87080b2d-ac43-be0d-d574-c193078850ae",
-        "pci_bus_id": "00000000:5D:00.0",
-        "pcie_link_gen": 5,
-        "pcie_link_width": 16,
-        "vram_total_mb": 81559,
-        "vram_used_mb": 4,
-        "vram_free_mb": 81076,
-        "power_draw": 67.02,
-        "power_limit": 700.0,
-        "clock_sm": 345,
-        "clock_mem": 2619,
-        "temperature": 21,
-        "fan_speed": 0,
-        "persistence_mode": false,
-        "compute_mode": "Default",
-        "serial_number": "1651924016862",
-        "ecc_errors_single": 0,
-        "ecc_errors_double": 0
-      },
-      {
-        "index": 4,
-        "name": "NVIDIA H100 80GB HBM3",
-        "uuid": "GPU-599bd883-cc5c-a5dd-6c33-c15f7049da48",
-        "pci_bus_id": "00000000:9A:00.0",
-        "pcie_link_gen": 5,
-        "pcie_link_width": 16,
-        "vram_total_mb": 81559,
-        "vram_used_mb": 4,
-        "vram_free_mb": 81076,
-        "power_draw": 67.24,
-        "power_limit": 700.0,
-        "clock_sm": 345,
-        "clock_mem": 2619,
-        "temperature": 21,
-        "fan_speed": 0,
-        "persistence_mode": false,
-        "compute_mode": "Default",
-        "serial_number": "1651924025670",
-        "ecc_errors_single": 0,
-        "ecc_errors_double": 0
-      },
-      {
-        "index": 5,
-        "name": "NVIDIA H100 80GB HBM3",
-        "uuid": "GPU-a1c6bba4-61b0-e623-06c9-9c88635e26fe",
-        "pci_bus_id": "00000000:AB:00.0",
-        "pcie_link_gen": 5,
-        "pcie_link_width": 16,
-        "vram_total_mb": 81559,
-        "vram_used_mb": 4,
-        "vram_free_mb": 81076,
-        "power_draw": 69.31,
-        "power_limit": 700.0,
-        "clock_sm": 345,
-        "clock_mem": 2619,
-        "temperature": 23,
-        "fan_speed": 0,
-        "persistence_mode": false,
-        "compute_mode": "Default",
-        "serial_number": "1651924027166",
-        "ecc_errors_single": 0,
-        "ecc_errors_double": 0
-      },
-      {
-        "index": 6,
-        "name": "NVIDIA H100 80GB HBM3",
-        "uuid": "GPU-98745a0c-39bd-3e56-d6ca-54ba3647ab6d",
-        "pci_bus_id": "00000000:BA:00.0",
-        "pcie_link_gen": 5,
-        "pcie_link_width": 16,
-        "vram_total_mb": 81559,
-        "vram_used_mb": 4,
-        "vram_free_mb": 81076,
-        "power_draw": 67.84,
-        "power_limit": 700.0,
-        "clock_sm": 345,
-        "clock_mem": 2619,
-        "temperature": 21,
-        "fan_speed": 0,
-        "persistence_mode": false,
-        "compute_mode": "Default",
-        "serial_number": "1651924026234",
-        "ecc_errors_single": 0,
-        "ecc_errors_double": 0
-      },
-      {
-        "index": 7,
-        "name": "NVIDIA H100 80GB HBM3",
-        "uuid": "GPU-8c73bd8b-666b-357e-ac5d-c75ac7a759db",
-        "pci_bus_id": "00000000:DB:00.0",
-        "pcie_link_gen": 5,
-        "pcie_link_width": 16,
-        "vram_total_mb": 81559,
-        "vram_used_mb": 4,
-        "vram_free_mb": 81076,
-        "power_draw": 66.21,
-        "power_limit": 700.0,
-        "clock_sm": 345,
-        "clock_mem": 2619,
-        "temperature": 21,
-        "fan_speed": 0,
-        "persistence_mode": false,
-        "compute_mode": "Default",
-        "serial_number": "1651924027255",
-        "ecc_errors_single": 0,
-        "ecc_errors_double": 0
-      }
-    ],
-    "topology": "\t\u001b[4mGPU0\tGPU1\tGPU2\tGPU3\tGPU4\tGPU5\tGPU6\tGPU7\tNIC0\tNIC1\tNIC2\tNIC3\tNIC4\tNIC5\tNIC6\tNIC7\tNIC8\tNIC9\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\u001b[0m\nGPU0\t X \tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\tPIX\tNODE\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t0-55,112-167\t0\t\tN/A\nGPU1\tNV18\t X \tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\tNODE\tPIX\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t0-55,112-167\t0\t\tN/A\nGPU2\tNV18\tNV18\t X \tNV18\tNV18\tNV18\tNV18\tNV18\tNODE\tNODE\tPIX\tPIX\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t0-55,112-167\t0\t\tN/A\nGPU3\tNV18\tNV18\tNV18\t X \tNV18\tNV18\tNV18\tNV18\tNODE\tNODE\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t0-55,112-167\t0\t\tN/A\nGPU4\tNV18\tNV18\tNV18\tNV18\t X \tNV18\tNV18\tNV18\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tPIX\tNODE\tNODE\tNODE\t56-111,168-223\t1\t\tN/A\nGPU5\tNV18\tNV18\tNV18\tNV18\tNV18\t X \tNV18\tNV18\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\tPIX\tNODE\tNODE\t56-111,168-223\t1\t\tN/A\nGPU6\tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\t X \tNV18\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tPIX\tPIX\t56-111,168-223\t1\t\tN/A\nGPU7\tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\t X \tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tNODE\tNODE\t56-111,168-223\t1\t\tN/A\nNIC0\tPIX\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t X \tNODE\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC1\tNODE\tPIX\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tNODE\t X \tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC2\tNODE\tNODE\tPIX\tNODE\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\t X \tPIX\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC3\tNODE\tNODE\tPIX\tNODE\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tPIX\t X \tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC4\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tNODE\tNODE\t X \tPIX\tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC5\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tNODE\tNODE\tPIX\t X \tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC6\tSYS\tSYS\tSYS\tSYS\tPIX\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\t X 
\tNODE\tNODE\tNODE\t\t\t\t\nNIC7\tSYS\tSYS\tSYS\tSYS\tNODE\tPIX\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\t X \tNODE\tNODE\t\t\t\t\nNIC8\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tPIX\tNODE\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\t X \tPIX\t\t\t\t\nNIC9\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tPIX\tNODE\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tPIX\t X \t\t\t\t\n\nLegend:\n\n  X    = Self\n  SYS  = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n  NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n  PHB  = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n  PXB  = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n  PIX  = Connection traversing at most a single PCIe bridge\n  NV#  = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n  NIC0: mlx5_0\n  NIC1: mlx5_1\n  NIC2: mlx5_2\n  NIC3: mlx5_3\n  NIC4: mlx5_4\n  NIC5: mlx5_5\n  NIC6: mlx5_6\n  NIC7: mlx5_7\n  NIC8: mlx5_8\n  NIC9: mlx5_9\n\n",
-    "timestamp": "2026-05-22T15:49:09.197459",
-    "detected_gpu_type": "h100",
-    "gpu_label": "H100 SXM5"
-  },
-  "health": {
-    "passed": true,
-    "gpu_health": [
-      {
-        "index": 0,
-        "status": "WARN",
-        "checks": {
-          "temperature": {
-            "value": 21,
-            "status": "PASS",
-            "threshold": 75
-          },
-          "power": {
-            "value": 69.86,
-            "limit": 700.0,
-            "status": "PASS"
-          },
-          "ecc_errors": {
-            "single": 0,
-            "double": 0,
-            "status": "PASS"
-          },
-          "memory_errors": {
-            "status": "PASS"
-          },
-          "pcie_link": {
-            "gen": 5,
-            "width": 16,
-            "status": "PASS"
-          },
-          "clock_speed": {
-            "sm": 345,
-            "mem": 2619,
-            "status": "PASS"
-          },
-          "throttling": {
-            "status": "PASS",
-            "reasons": []
-          },
-          "persistence_mode": {
-            "enabled": false,
-            "status": "WARN"
-          }
-        }
-      },
-      {
-        "index": 1,
-        "status": "WARN",
-        "checks": {
-          "temperature": {
-            "value": 21,
-            "status": "PASS",
-            "threshold": 75
-          },
-          "power": {
-            "value": 67.48,
-            "limit": 700.0,
-            "status": "PASS"
-          },
-          "ecc_errors": {
-            "single": 0,
-            "double": 0,
-            "status": "PASS"
-          },
-          "memory_errors": {
-            "status": "PASS"
-          },
-          "pcie_link": {
-            "gen": 5,
-            "width": 16,
-            "status": "PASS"
-          },
-          "clock_speed": {
-            "sm": 345,
-            "mem": 2619,
-            "status": "PASS"
-          },
-          "throttling": {
-            "status": "PASS",
-            "reasons": []
-          },
-          "persistence_mode": {
-            "enabled": false,
-            "status": "WARN"
-          }
-        }
-      },
-      {
-        "index": 2,
-        "status": "WARN",
-        "checks": {
-          "temperature": {
-            "value": 22,
-            "status": "PASS",
-            "threshold": 75
-          },
-          "power": {
-            "value": 66.76,
-            "limit": 700.0,
-            "status": "PASS"
-          },
-          "ecc_errors": {
-            "single": 0,
-            "double": 0,
-            "status": "PASS"
-          },
-          "memory_errors": {
-            "status": "PASS"
-          },
-          "pcie_link": {
-            "gen": 5,
-            "width": 16,
-            "status": "PASS"
-          },
-          "clock_speed": {
-            "sm": 345,
-            "mem": 2619,
-            "status": "PASS"
-          },
-          "throttling": {
-            "status": "PASS",
-            "reasons": []
-          },
-          "persistence_mode": {
-            "enabled": false,
-            "status": "WARN"
-          }
-        }
-      },
-      {
-        "index": 3,
-        "status": "WARN",
-        "checks": {
-          "temperature": {
-            "value": 21,
-            "status": "PASS",
-            "threshold": 75
-          },
-          "power": {
-            "value": 67.06,
-            "limit": 700.0,
-            "status": "PASS"
-          },
-          "ecc_errors": {
-            "single": 0,
-            "double": 0,
-            "status": "PASS"
-          },
-          "memory_errors": {
-            "status": "PASS"
-          },
-          "pcie_link": {
-            "gen": 5,
-            "width": 16,
-            "status": "PASS"
-          },
-          "clock_speed": {
-            "sm": 345,
-            "mem": 2619,
-            "status": "PASS"
-          },
-          "throttling": {
-            "status": "PASS",
-            "reasons": []
-          },
-          "persistence_mode": {
-            "enabled": false,
-            "status": "WARN"
-          }
-        }
-      },
-      {
-        "index": 4,
-        "status": "WARN",
-        "checks": {
-          "temperature": {
-            "value": 21,
-            "status": "PASS",
-            "threshold": 75
-          },
-          "power": {
-            "value": 67.23,
-            "limit": 700.0,
-            "status": "PASS"
-          },
-          "ecc_errors": {
-            "single": 0,
-            "double": 0,
-            "status": "PASS"
-          },
-          "memory_errors": {
-            "status": "PASS"
-          },
-          "pcie_link": {
-            "gen": 5,
-            "width": 16,
-            "status": "PASS"
-          },
-          "clock_speed": {
-            "sm": 345,
-            "mem": 2619,
-            "status": "PASS"
-          },
-          "throttling": {
-            "status": "PASS",
-            "reasons": []
-          },
-          "persistence_mode": {
-            "enabled": false,
-            "status": "WARN"
-          }
-        }
-      },
-      {
-        "index": 5,
-        "status": "WARN",
-        "checks": {
-          "temperature": {
-            "value": 23,
-            "status": "PASS",
-            "threshold": 75
-          },
-          "power": {
-            "value": 69.27,
-            "limit": 700.0,
-            "status": "PASS"
-          },
-          "ecc_errors": {
-            "single": 0,
-            "double": 0,
-            "status": "PASS"
-          },
-          "memory_errors": {
-            "status": "PASS"
-          },
-          "pcie_link": {
-            "gen": 5,
-            "width": 16,
-            "status": "PASS"
-          },
-          "clock_speed": {
-            "sm": 345,
-            "mem": 2619,
-            "status": "PASS"
-          },
-          "throttling": {
-            "status": "PASS",
-            "reasons": []
-          },
-          "persistence_mode": {
-            "enabled": false,
-            "status": "WARN"
-          }
-        }
-      },
-      {
-        "index": 6,
-        "status": "WARN",
-        "checks": {
-          "temperature": {
-            "value": 21,
-            "status": "PASS",
-            "threshold": 75
-          },
-          "power": {
-            "value": 67.81,
-            "limit": 700.0,
-            "status": "PASS"
-          },
-          "ecc_errors": {
-            "single": 0,
-            "double": 0,
-            "status": "PASS"
-          },
-          "memory_errors": {
-            "status": "PASS"
-          },
-          "pcie_link": {
-            "gen": 5,
-            "width": 16,
-            "status": "PASS"
-          },
-          "clock_speed": {
-            "sm": 345,
-            "mem": 2619,
-            "status": "PASS"
-          },
-          "throttling": {
-            "status": "PASS",
-            "reasons": []
-          },
-          "persistence_mode": {
-            "enabled": false,
-            "status": "WARN"
-          }
-        }
-      },
-      {
-        "index": 7,
-        "status": "WARN",
-        "checks": {
-          "temperature": {
-            "value": 21,
-            "status": "PASS",
-            "threshold": 75
-          },
-          "power": {
-            "value": 66.3,
-            "limit": 700.0,
-            "status": "PASS"
-          },
-          "ecc_errors": {
-            "single": 0,
-            "double": 0,
-            "status": "PASS"
-          },
-          "memory_errors": {
-            "status": "PASS"
-          },
-          "pcie_link": {
-            "gen": 5,
-            "width": 16,
-            "status": "PASS"
-          },
-          "clock_speed": {
-            "sm": 345,
-            "mem": 2619,
-            "status": "PASS"
-          },
-          "throttling": {
-            "status": "PASS",
-            "reasons": []
-          },
-          "persistence_mode": {
-            "enabled": false,
-            "status": "WARN"
-          }
-        }
-      }
-    ],
-    "system_health": {
-      "nvidia_persistenced": {
-        "installed": true,
-        "running": false
-      },
-      "hugepages": {
-        "configured": false,
-        "count": 0
-      },
-      "swap": {
-        "enabled": true
-      },
-      "transparent_hugepage": "madvise",
-      "file_descriptors": {
-        "soft": 1024,
-        "max": 1048576
-      },
-      "infiniband_devices": [
-        "mlx5_4",
-        "mlx5_2",
-        "mlx5_0",
-        "mlx5_9",
-        "mlx5_7",
-        "mlx5_5",
-        "mlx5_3",
-        "mlx5_1",
-        "mlx5_8",
-        "mlx5_6"
-      ],
-      "rdma_devices": [
-        "abi_version",
-        "uverbs4",
-        "uverbs2",
-        "uverbs0",
-        "uverbs9",
-        "uverbs7",
-        "uverbs5",
-        "uverbs3",
-        "uverbs1",
-        "uverbs8",
-        "uverbs6"
-      ],
-      "nccl_env_vars": {}
-    },
-    "timestamp": "2026-05-22T15:49:11.294816",
-    "detected_gpu_type": "h100"
-  },
-  "memory_bench": {
-    "memory": {
-      "source": "nvbandwidth",
-      "h2d_bandwidth_gbps": 55.5,
-      "d2h_bandwidth_gbps": 55.3,
-      "d2d_bandwidth_gbps": 486.5,
-      "h2d_peak_gbps": 64,
-      "d2h_peak_gbps": 64,
-      "d2d_peak_gbps": 450.0,
-      "h2d_efficiency_pct": 86.7,
-      "d2h_efficiency_pct": 86.4,
-      "d2d_efficiency_pct": 108.1,
-      "peak_bandwidth_gbps": 3400,
-      "efficiency_pct": 108.1,
-      "results_by_test": {
-        "h2d": 55.5,
-        "d2h": 55.3,
-        "d2d_write": 397.4,
-        "d2d_read": 395.1,
-        "d2d_bidir": 486.5
-      },
-      "per_gpu": []
-    }
-  },
-  "compute_bench": {
-    "compute": {
-      "per_dtype_tflops": {
-        "fp32": 51.9,
-        "tf32": 357.0,
-        "fp16": 664.0,
-        "bf16": 700.1,
-        "fp8": 1116.2
-      },
-      "peak_tflops": {
-        "fp32": 67,
-        "tf32": 495,
-        "fp16": 990,
-        "bf16": 990,
-        "fp8": 1979
-      },
-      "efficiency_pct": {
-        "fp32": 77.5,
-        "tf32": 72.1,
-        "fp16": 67.1,
-        "bf16": 70.7,
-        "fp8": 56.4
-      },
-      "pass_thresholds_tflops": {
-        "fp32": 54,
-        "tf32": 444,
-        "fp16": 734,
-        "bf16": 745,
-        "fp8": 1400
-      },
-      "per_gpu": [
-        {
-          "index": 0,
-          "fp32": 51.9,
-          "tf32": 357.0,
-          "fp16": 664.0,
-          "bf16": 700.1,
-          "fp8": 1116.2
-        },
-        {
-          "index": 1,
-          "fp32": 51.9,
-          "tf32": 357.0,
-          "fp16": 664.0,
-          "bf16": 700.1,
-          "fp8": 1116.2
-        },
-        {
-          "index": 2,
-          "fp32": 51.9,
-          "tf32": 357.0,
-          "fp16": 664.0,
-          "bf16": 700.1,
-          "fp8": 1116.2
-        },
-        {
-          "index": 3,
-          "fp32": 51.9,
-          "tf32": 357.0,
-          "fp16": 664.0,
-          "bf16": 700.1,
-          "fp8": 1116.2
-        },
-        {
-          "index": 4,
-          "fp32": 51.9,
-          "tf32": 357.0,
-          "fp16": 664.0,
-          "bf16": 700.1,
-          "fp8": 1116.2
-        },
-        {
-          "index": 5,
-          "fp32": 51.9,
-          "tf32": 357.0,
-          "fp16": 664.0,
-          "bf16": 700.1,
-          "fp8": 1116.2
-        },
-        {
-          "index": 6,
-          "fp32": 51.9,
-          "tf32": 357.0,
-          "fp16": 664.0,
-          "bf16": 700.1,
-          "fp8": 1116.2
-        },
-        {
-          "index": 7,
-          "fp32": 51.9,
-          "tf32": 357.0,
-          "fp16": 664.0,
-          "bf16": 700.1,
-          "fp8": 1116.2
-        }
-      ],
-      "matrix_size": 8192,
-      "warmup": 50,
-      "iterations": 500
-    }
-  },
-  "nccl": {
-    "passed": false,
-    "source": "torchrun_fallback",
-    "tests": {
-      "NCCL version 2.21.5+cuda12.4": {
-        "status": "FAIL",
-        "error": null
-      },
-      "allreduce": {
-        "status": "PASS",
-        "error": null
-      },
-      "broadcast": {
-        "status": "PASS",
-        "error": null
-      },
-      "allgather": {
-        "status": "PASS",
-        "error": null
-      },
-      "reducescatter": {
-        "status": "PASS",
-        "error": null
-      },
-      "alltoall": {
-        "status": "PASS",
-        "error": null
-      }
-    },
-    "gpu_count": 8
-  },
-  "stress": {
-    "source": "pytorch",
-    "passed": true,
-    "duration_sec": 60,
-    "elapsed_sec": 60.0,
-    "gpu_status": {
-      "0": "PASS",
-      "1": "PASS",
-      "2": "PASS",
-      "3": "PASS",
-      "4": "PASS",
-      "5": "PASS",
-      "6": "PASS",
-      "7": "PASS"
-    },
-    "timestamp": "2026-05-22T15:51:56.803540"
-  },
-  "rdma": {
-    "passed": false,
-    "devices": [
-      {
-        "name": "mlx5_0",
-        "ports": [
-          {
-            "port": "1",
-            "rate": "400 Gb/sec (4X NDR)",
-            "state": "4: ACTIVE",
-            "phys_state": "5: LinkUp",
-            "gid": "fe80:0000:0000:0000:58a2:e103:0088:81e0"
-          }
-        ]
-      },
-      {
-        "name": "mlx5_1",
-        "ports": [
-          {
-            "port": "1",
-            "rate": "400 Gb/sec (4X NDR)",
-            "state": "4: ACTIVE",
-            "phys_state": "5: LinkUp",
-            "gid": "fe80:0000:0000:0000:9c63:c003:0054:e00a"
-          }
-        ]
-      },
-      {
-        "name": "mlx5_2",
-        "ports": [
-          {
-            "port": "1",
-            "rate": "25 Gb/sec (1X EDR)",
-            "state": "4: ACTIVE",
-            "phys_state": "5: LinkUp",
-            "gid": "fe80:0000:0000:0000:a02d:75ff:feae:2bcf"
-          }
-        ]
-      },
-      {
-        "name": "mlx5_3",
-        "ports": [
-          {
-            "port": "1",
-            "rate": "25 Gb/sec (1X EDR)",
-            "state": "1: DOWN",
-            "phys_state": "3: Disabled",
-            "gid": "fe80:0000:0000:0000:c670:bdff:fefd:5bd9"
-          }
-        ]
-      },
-      {
-        "name": "mlx5_4",
-        "ports": [
-          {
-            "port": "1",
-            "rate": "100 Gb/sec (2X HDR)",
-            "state": "4: ACTIVE",
-            "phys_state": "5: LinkUp",
-            "gid": "fe80:0000:0000:0000:9c63:c003:005f:58ec"
-          }
-        ]
-      },
-      {
-        "name": "mlx5_5",
-        "ports": [
-          {
-            "port": "1",
-            "rate": "100 Gb/sec (2X HDR)",
-            "state": "4: ACTIVE",
-            "phys_state": "5: LinkUp",
-            "gid": "fe80:0000:0000:0000:9c63:c003:005f:58ed"
-          }
-        ]
-      },
-      {
-        "name": "mlx5_6",
-        "ports": [
-          {
-            "port": "1",
-            "rate": "400 Gb/sec (4X NDR)",
-            "state": "4: ACTIVE",
-            "phys_state": "5: LinkUp",
-            "gid": "fe80:0000:0000:0000:9c63:c003:0055:0e56"
-          }
-        ]
-      },
-      {
-        "name": "mlx5_7",
-        "ports": [
-          {
-            "port": "1",
-            "rate": "400 Gb/sec (4X NDR)",
-            "state": "4: ACTIVE",
-            "phys_state": "5: LinkUp",
-            "gid": "fe80:0000:0000:0000:a088:c203:00f0:286c"
-          }
-        ]
-      },
-      {
-        "name": "mlx5_8",
-        "ports": [
-          {
-            "port": "1",
-            "rate": "25 Gb/sec (1X EDR)",
-            "state": "4: ACTIVE",
-            "phys_state": "5: LinkUp",
-            "gid": "fe80:0000:0000:0000:a02d:75ff:feae:2bcf"
-          }
-        ]
-      },
-      {
-        "name": "mlx5_9",
-        "ports": [
-          {
-            "port": "1",
-            "rate": "25 Gb/sec (1X EDR)",
-            "state": "1: DOWN",
-            "phys_state": "3: Disabled",
-            "gid": "fe80:0000:0000:0000:c670:bdff:fefd:569d"
-          }
-        ]
-      }
-    ],
-    "bandwidth_tests": [
-      {
-        "test": "ib_write_bw",
-        "status": "WARN",
-        "bandwidth_gbps": 0.13,
-        "min_required_gbps": 50
-      },
-      {
-        "test": "ib_read_bw",
-        "status": "WARN",
-        "bandwidth_gbps": 0.13,
-        "min_required_gbps": 50
-      }
-    ],
-    "latency_tests": [
-      {
-        "test": "ib_write_lat",
-        "status": "PASS",
-        "latency_us": 4.1,
-        "max_allowed_us": 10
-      },
-      {
-        "test": "ib_read_lat",
-        "status": "WARN",
-        "latency_us": 16.0,
-        "max_allowed_us": 10
-      }
-    ],
-    "timestamp": "2026-05-22T15:52:03.507540"
-  },
-  "training": {
-    "model": "synthetic_transformer",
-    "total_params_m": 1470.5,
-    "num_layers": 6,
-    "hidden_size": 4096,
-    "gpu_count": 8,
-    "dtype": "bfloat16",
-    "batch_size": 8,
-    "seq_length": 2048,
-    "num_steps": 50,
-    "avg_step_time_ms": 312.3,
-    "throughput_tokens_per_sec": 52471.0,
-    "throughput_samples_per_sec": 25.62,
-    "peak_memory_gb": 27.31,
-    "final_loss": 0.0041,
-    "timestamp": "2026-05-22T15:52:32.650522"
-  }
-}
\ No newline at end of file
diff --git a/reports_all_aikubeworker0016.md b/reports_all_aikubeworker0016.md
deleted file mode 100644
index 80dda75..0000000
--- a/reports_all_aikubeworker0016.md
+++ /dev/null
@@ -1,157 +0,0 @@
-# GPU Test Report
-
-- **Date:** 2026-05-22T15:49:02.368516
-- **Host:** aikubeworker0016
-- **GPU:** NVIDIA H100 80GB HBM3 x8
-- **Driver:** 580.159.03 | **CUDA:** 13.0
-
-## Overall Acceptance Verdict
-
-**Result: FAIL**
-
-Failed or unverified items:
-- Compute Throughput: FAIL (worst FP32 52 vs >= 54)
-- NCCL: FAIL (no nccl-tests bus BW)
-- RDMA: FAIL
-- Training: UNVERIFIED (52471 tokens/sec; legacy result lacks explicit acceptance verdict)
-
-Missing required evidence:
-- NVLink/NVSwitch
-- DCGM
-
-## Summary
-
-| Test | Result |
-|------|--------|
-| GPU Info | PASS (8 GPUs detected) |
-| Health Check | PASS |
-| Memory Bandwidth | PASS (108.1%) |
-| Compute Throughput | FAIL (worst FP32 52 vs >= 54) |
-| NCCL | FAIL (no nccl-tests bus BW) |
-| Stress Test | PASS |
-| RDMA | FAIL |
-| Training | UNVERIFIED (52471 tokens/sec; legacy result lacks explicit acceptance verdict) |
-
-## GPU Information
-
-| GPU | Model | VRAM | Temp | Power | SM Clock |
-|-----|-------|------|------|-------|----------|
-| 0 | NVIDIA H100 80GB HBM3 | 81559 MB | 21C | 70/700W | 345 MHz |
-| 1 | NVIDIA H100 80GB HBM3 | 81559 MB | 21C | 68/700W | 345 MHz |
-| 2 | NVIDIA H100 80GB HBM3 | 81559 MB | 22C | 67/700W | 345 MHz |
-| 3 | NVIDIA H100 80GB HBM3 | 81559 MB | 21C | 67/700W | 345 MHz |
-| 4 | NVIDIA H100 80GB HBM3 | 81559 MB | 21C | 67/700W | 345 MHz |
-| 5 | NVIDIA H100 80GB HBM3 | 81559 MB | 23C | 69/700W | 345 MHz |
-| 6 | NVIDIA H100 80GB HBM3 | 81559 MB | 21C | 68/700W | 345 MHz |
-| 7 | NVIDIA H100 80GB HBM3 | 81559 MB | 21C | 66/700W | 345 MHz |
-
-## Health Check
-
-**Overall: PASS**
-
-| GPU | Temp | Power | ECC | PCIe | Throttle | Status |
-|-----|------|-------|-----|------|----------|--------|
-| 0 | 21C PASS | 70W PASS | S:0 D:0 | Gen5x16 | PASS | **WARN** |
-| 1 | 21C PASS | 67W PASS | S:0 D:0 | Gen5x16 | PASS | **WARN** |
-| 2 | 22C PASS | 67W PASS | S:0 D:0 | Gen5x16 | PASS | **WARN** |
-| 3 | 21C PASS | 67W PASS | S:0 D:0 | Gen5x16 | PASS | **WARN** |
-| 4 | 21C PASS | 67W PASS | S:0 D:0 | Gen5x16 | PASS | **WARN** |
-| 5 | 23C PASS | 69W PASS | S:0 D:0 | Gen5x16 | PASS | **WARN** |
-| 6 | 21C PASS | 68W PASS | S:0 D:0 | Gen5x16 | PASS | **WARN** |
-| 7 | 21C PASS | 66W PASS | S:0 D:0 | Gen5x16 | PASS | **WARN** |
-
-## Memory Bandwidth
-
-Source: nvbandwidth
-
-| Metric | Value | Peak | Efficiency |
-|--------|-------|------|------------|
-| H2D (PCIe) | 55.5 GB/s | 64 GB/s | 86.7% |
-| D2H (PCIe) | 55.3 GB/s | 64 GB/s | 86.4% |
-| D2D (NVLink) | 486.5 GB/s | 450 GB/s | 108.1% |
-
-**Verdict: PASS** (D2D efficiency 108.1%)
-
-## Compute Throughput
-
-| DType | Achieved (TFLOPS) | Peak | Threshold | Status |
-|-------|-------------------|------|------------|--------|
-| FP32 | 51.9 | 67 | >= 54 | FAIL |
-| TF32 | 357.0 | 495 | >= 444 | FAIL |
-| FP16 | 664.0 | 990 | >= 734 | FAIL |
-| BF16 | 700.1 | 990 | >= 745 | FAIL |
-| FP8 | 1116.2 | 1979 | >= 1400 | FAIL |
-
-**Verdict: FAIL** (absolute TFLOPS thresholds; worst efficiency 56.4%)
-
-### Compute Per-GPU TFLOPS
-
-| GPU | FP32 | TF32 | FP16 | BF16 | FP8 |
-|---|---|---|---|---|---|
-| 0 | 51.9 | 357.0 | 664.0 | 700.1 | 1116.2 |
-| 1 | 51.9 | 357.0 | 664.0 | 700.1 | 1116.2 |
-| 2 | 51.9 | 357.0 | 664.0 | 700.1 | 1116.2 |
-| 3 | 51.9 | 357.0 | 664.0 | 700.1 | 1116.2 |
-| 4 | 51.9 | 357.0 | 664.0 | 700.1 | 1116.2 |
-| 5 | 51.9 | 357.0 | 664.0 | 700.1 | 1116.2 |
-| 6 | 51.9 | 357.0 | 664.0 | 700.1 | 1116.2 |
-| 7 | 51.9 | 357.0 | 664.0 | 700.1 | 1116.2 |
-
-## NCCL Multi-GPU
-
-Source: torchrun_fallback | GPUs: 8
-
-> Functional NCCL smoke only: nccl-tests bus bandwidth was not measured, so this does not satisfy production acceptance.
-
-| Operation | Bus BW (GB/s) | Threshold | Status |
-|-----------|---------------|-----------|--------|
-| NCCL version 2.21.5+cuda12.4 | 0.0 | >= 0 | FAIL |
-| allreduce | 0.0 | >= 0 | PASS |
-| broadcast | 0.0 | >= 0 | PASS |
-| allgather | 0.0 | >= 0 | PASS |
-| reducescatter | 0.0 | >= 0 | PASS |
-| alltoall | 0.0 | >= 0 | PASS |
-
-**Overall: FAIL**
-
-## Stress Test
-
-- **Source:** pytorch
-- **Duration:** 60s (requested 60s)
-- **Result: PASS**
-
-## RDMA/InfiniBand
-
-> Legacy RDMA result re-evaluated with current PDF acceptance thresholds; old WARN statuses and old 50GB/s/10us limits are not used for verdict.
-
-| Test | Value | Threshold | Status |
-|------|-------|-----------|--------|
-| ib_write_bw | 0.1 GB/s | >= 47 GB/s | FAIL |
-| ib_read_bw | 0.1 GB/s | >= 47 GB/s | FAIL |
-| ib_write_lat | 4.10 us | <= 2 us | FAIL |
-| ib_read_lat | 16.00 us | <= 3.5 us | FAIL |
-
-- **Failure reasons:**
-  - ib_write_bw bandwidth 0.13GB/s < 47GB/s
-  - ib_read_bw bandwidth 0.13GB/s < 47GB/s
-  - ib_write_lat latency 4.1us > 2us
-  - ib_read_lat latency 16.0us > 3.5us
-**Overall: FAIL**
-
-## Training Simulation
-
-| Metric | Value |
-|--------|-------|
-| Model | synthetic_transformer |
-| Params | 1470.5M |
-| Throughput | 52471 tokens/sec |
-| Avg Step Time | 312.3 ms |
-| Peak Memory | 27.3 GB |
-| Final Loss | 0.0041 |
-| Step Jitter | N/A% |
-| Distributed Mode | N/A |
-| Acceptance Gaps | missing passed, step_jitter_pct, distributed_mode, loss_finite |
-| Verdict | UNVERIFIED (52471 tokens/sec; legacy result lacks explicit acceptance verdict) |
-
----
-*Generated by GPU Test Suite v0.2.0*
\ No newline at end of file
diff --git a/reports_cublaslt_fp8_crosscheck_20260524.md b/reports_cublaslt_fp8_crosscheck_20260524.md
deleted file mode 100644
index 194a562..0000000
--- a/reports_cublaslt_fp8_crosscheck_20260524.md
+++ /dev/null
@@ -1,87 +0,0 @@
-# cuBLASLt FP8 GEMM Cross-Check Report
-
-Date: 2026-05-24
-
-Scope: Validate whether the single-node FP8 compute FAIL is caused by hardware/platform limits or by the original PyTorch `_scaled_mm` benchmark path.
-
-## Method
-
-Added a direct cuBLASLt FP8 GEMM micro-benchmark:
-
-- Source: `scripts/cublaslt_fp8_gemm_bench.cu`
-- Wrapper: `scripts/run_cublaslt_fp8_gemm.sh`
-- Input dtype: `CUDA_R_8F_E4M3`
-- Output dtype: `CUDA_R_16BF`
-- Accumulate / compute type: `CUBLAS_COMPUTE_32F`
-- Layout: cuBLASLt FP8-required TN format
-- Matrix size: `8192`
-- Warmup: `50`
-- Iterations: `500`
-- GPUs: single-node 8 GPUs, measured one GPU at a time
-
-NVIDIA cuBLASLt documentation states FP8 kernels require TN format, `CUBLAS_COMPUTE_32F`, and `CUDA_R_32F` scale type. The implemented benchmark follows those constraints.
-
-## Results
-
-### aikubeworker0012 / nccl-gpu-1
-
-Raw report: `reports_cublaslt_fp8_gemm_aikubeworker0012_20260524_071148.json`
-
-| GPU | FP8 TFLOPS |
-|---:|---:|
-| 0 | 1615.6 |
-| 1 | 1611.0 |
-| 2 | 1599.0 |
-| 3 | 1607.1 |
-| 4 | 1614.0 |
-| 5 | 1604.4 |
-| 6 | 1608.4 |
-| 7 | 1609.1 |
-
-Summary:
-
-- Mean: `1608.6 TFLOPS`
-- Min / Max: `1599.0 / 1615.6 TFLOPS`
-- Spread: `1.03%`
-- FP8 absolute threshold: `>= 1400 TFLOPS`
-- Verdict against FP8 absolute threshold: **PASS**
-- Verdict against 8-GPU consistency threshold `<= 3%`: **PASS**
-
-### aikubeworker0016 / nccl-gpu-2
-
-Raw report: `reports_cublaslt_fp8_gemm_aikubeworker0016_20260524_071200.json`
-
-| GPU | FP8 TFLOPS |
-|---:|---:|
-| 0 | 1602.3 |
-| 1 | 1604.0 |
-| 2 | 1616.9 |
-| 3 | 1610.6 |
-| 4 | 1620.5 |
-| 5 | 1630.3 |
-| 6 | 1605.1 |
-| 7 | 1620.2 |
-
-Summary:
-
-- Mean: `1613.7 TFLOPS`
-- Min / Max: `1602.3 / 1630.3 TFLOPS`
-- Spread: `1.74%`
-- FP8 absolute threshold: `>= 1400 TFLOPS`
-- Verdict against FP8 absolute threshold: **PASS**
-- Verdict against 8-GPU consistency threshold `<= 3%`: **PASS**
-
-## Comparison With Existing PyTorch `_scaled_mm` Result
-
-| Host | PyTorch `_scaled_mm` FP8 | cuBLASLt FP8 | Delta |
-|---|---:|---:|---:|
-| aikubeworker0012 | 1170.4 | 1608.6 | +438.2 |
-| aikubeworker0016 | 1179.5 | 1613.7 | +434.2 |
-
-The cuBLASLt path passes the `>= 1400 TFLOPS` FP8 absolute threshold on both machines, while the original PyTorch `_scaled_mm` path remains around `1170-1180 TFLOPS`.
-
-## Conclusion
-
-The FP8 hardware path is capable of exceeding the configured H100 FP8 acceptance threshold on both machines. The earlier FP8 FAIL is therefore most likely a benchmark implementation issue in the current PyTorch `_scaled_mm` path, not a GPU hardware, power, clock, thermal, MIG, ECC, or Fabric Manager issue.
-
-Recommended next action: replace or augment the existing FP8 compute acceptance item with the cuBLASLt FP8 GEMM cross-check, while keeping the PyTorch `_scaled_mm` result as a secondary software-stack signal.
diff --git a/reports_cublaslt_fp8_gemm_aikubeworker0012_20260524_071148.json b/reports_cublaslt_fp8_gemm_aikubeworker0012_20260524_071148.json
deleted file mode 100644
index b61e641..0000000
--- a/reports_cublaslt_fp8_gemm_aikubeworker0012_20260524_071148.json
+++ /dev/null
@@ -1,21 +0,0 @@
-{
-  "source": "cuBLASLt",
-  "dtype": "fp8_e4m3_inputs_bf16_output_fp32_accum",
-  "matrix_size": 8192,
-  "warmup": 50,
-  "iterations": 500,
-  "per_gpu": [
-    {"index": 0, "fp8_tflops": 1615.6},
-    {"index": 1, "fp8_tflops": 1611.0},
-    {"index": 2, "fp8_tflops": 1599.0},
-    {"index": 3, "fp8_tflops": 1607.1},
-    {"index": 4, "fp8_tflops": 1614.0},
-    {"index": 5, "fp8_tflops": 1604.4},
-    {"index": 6, "fp8_tflops": 1608.4},
-    {"index": 7, "fp8_tflops": 1609.1}
-  ],
-  "mean_tflops": 1608.6,
-  "min_tflops": 1599.0,
-  "max_tflops": 1615.6,
-  "spread_pct": 1.03
-}
diff --git a/reports_cublaslt_fp8_gemm_aikubeworker0016_20260524_071200.json b/reports_cublaslt_fp8_gemm_aikubeworker0016_20260524_071200.json
deleted file mode 100644
index 6808990..0000000
--- a/reports_cublaslt_fp8_gemm_aikubeworker0016_20260524_071200.json
+++ /dev/null
@@ -1,21 +0,0 @@
-{
-  "source": "cuBLASLt",
-  "dtype": "fp8_e4m3_inputs_bf16_output_fp32_accum",
-  "matrix_size": 8192,
-  "warmup": 50,
-  "iterations": 500,
-  "per_gpu": [
-    {"index": 0, "fp8_tflops": 1602.3},
-    {"index": 1, "fp8_tflops": 1604.0},
-    {"index": 2, "fp8_tflops": 1616.9},
-    {"index": 3, "fp8_tflops": 1610.6},
-    {"index": 4, "fp8_tflops": 1620.5},
-    {"index": 5, "fp8_tflops": 1630.3},
-    {"index": 6, "fp8_tflops": 1605.1},
-    {"index": 7, "fp8_tflops": 1620.2}
-  ],
-  "mean_tflops": 1613.7,
-  "min_tflops": 1602.3,
-  "max_tflops": 1630.3,
-  "spread_pct": 1.74
-}
diff --git a/reports_dcgm_r3_aikubeworker0012_20260522_200338.md b/reports_dcgm_r3_aikubeworker0012_20260522_200338.md
deleted file mode 100644
index 1663b83..0000000
--- a/reports_dcgm_r3_aikubeworker0012_20260522_200338.md
+++ /dev/null
@@ -1,65 +0,0 @@
-# GPU Test Report
-
-- **Date:** 2026-05-22T20:26:56.947796
-- **Host:** aikubeworker0012
-
-## Overall Acceptance Verdict
-
-**Result: FAIL**
-
-Missing required evidence:
-- GPU Info
-- Health Check
-- Memory Bandwidth
-- Compute Throughput
-- NVLink/NVSwitch
-- NCCL
-- Stress Test
-- RDMA
-- Training
-
-## Summary
-
-| Test | Result |
-|------|--------|
-| DCGM | PASS |
-
-## DCGM Diagnostic
-
-**Overall: PASS**
-
-| Subtest | Status |
-|---------|--------|
-| Hardware/nvbandwidth/GPU6 | PASS |
-| Hardware/nvbandwidth/GPU7 | PASS |
-| Hardware/nvbandwidth/summary | PASS |
-| Integration/pcie/GPU0 | PASS |
-| Integration/pcie/GPU1 | PASS |
-| Integration/pcie/GPU2 | PASS |
-| Integration/pcie/GPU3 | PASS |
-| Integration/pcie/GPU4 | PASS |
-| Integration/pcie/GPU5 | PASS |
-| Integration/pcie/GPU6 | PASS |
-| Integration/pcie/GPU7 | PASS |
-| Integration/pcie/summary | PASS |
-| Stress/targeted_stress/GPU0 | PASS |
-| Stress/targeted_stress/GPU1 | PASS |
-| Stress/targeted_stress/GPU2 | PASS |
-| Stress/targeted_stress/GPU3 | PASS |
-| Stress/targeted_stress/GPU4 | PASS |
-| Stress/targeted_stress/GPU5 | PASS |
-| Stress/targeted_stress/GPU6 | PASS |
-| Stress/targeted_stress/GPU7 | PASS |
-| Stress/targeted_stress/summary | PASS |
-| Stress/targeted_power/GPU0 | PASS |
-| Stress/targeted_power/GPU1 | PASS |
-| Stress/targeted_power/GPU2 | PASS |
-| Stress/targeted_power/GPU3 | PASS |
-| Stress/targeted_power/GPU4 | PASS |
-| Stress/targeted_power/GPU5 | PASS |
-| Stress/targeted_power/GPU6 | PASS |
-| Stress/targeted_power/GPU7 | PASS |
-| Stress/targeted_power/summary | PASS |
-
----
-*Generated by GPU Test Suite v0.2.0*
\ No newline at end of file
diff --git a/reports_dcgm_r3_aikubeworker0016_20260522_200538.md b/reports_dcgm_r3_aikubeworker0016_20260522_200538.md
deleted file mode 100644
index f51b5bf..0000000
--- a/reports_dcgm_r3_aikubeworker0016_20260522_200538.md
+++ /dev/null
@@ -1,65 +0,0 @@
-# GPU Test Report
-
-- **Date:** 2026-05-22T20:28:58.716266
-- **Host:** aikubeworker0016
-
-## Overall Acceptance Verdict
-
-**Result: FAIL**
-
-Missing required evidence:
-- GPU Info
-- Health Check
-- Memory Bandwidth
-- Compute Throughput
-- NVLink/NVSwitch
-- NCCL
-- Stress Test
-- RDMA
-- Training
-
-## Summary
-
-| Test | Result |
-|------|--------|
-| DCGM | PASS |
-
-## DCGM Diagnostic
-
-**Overall: PASS**
-
-| Subtest | Status |
-|---------|--------|
-| Hardware/nvbandwidth/GPU6 | PASS |
-| Hardware/nvbandwidth/GPU7 | PASS |
-| Hardware/nvbandwidth/summary | PASS |
-| Integration/pcie/GPU0 | PASS |
-| Integration/pcie/GPU1 | PASS |
-| Integration/pcie/GPU2 | PASS |
-| Integration/pcie/GPU3 | PASS |
-| Integration/pcie/GPU4 | PASS |
-| Integration/pcie/GPU5 | PASS |
-| Integration/pcie/GPU6 | PASS |
-| Integration/pcie/GPU7 | PASS |
-| Integration/pcie/summary | PASS |
-| Stress/targeted_stress/GPU0 | PASS |
-| Stress/targeted_stress/GPU1 | PASS |
-| Stress/targeted_stress/GPU2 | PASS |
-| Stress/targeted_stress/GPU3 | PASS |
-| Stress/targeted_stress/GPU4 | PASS |
-| Stress/targeted_stress/GPU5 | PASS |
-| Stress/targeted_stress/GPU6 | PASS |
-| Stress/targeted_stress/GPU7 | PASS |
-| Stress/targeted_stress/summary | PASS |
-| Stress/targeted_power/GPU0 | PASS |
-| Stress/targeted_power/GPU1 | PASS |
-| Stress/targeted_power/GPU2 | PASS |
-| Stress/targeted_power/GPU3 | PASS |
-| Stress/targeted_power/GPU4 | PASS |
-| Stress/targeted_power/GPU5 | PASS |
-| Stress/targeted_power/GPU6 | PASS |
-| Stress/targeted_power/GPU7 | PASS |
-| Stress/targeted_power/summary | PASS |
-
----
-*Generated by GPU Test Suite v0.2.0*
\ No newline at end of file
diff --git a/reports_fp8_path_comparison_20260525.md b/reports_fp8_path_comparison_20260525.md
deleted file mode 100644
index 6c5d9cf..0000000
--- a/reports_fp8_path_comparison_20260525.md
+++ /dev/null
@@ -1,168 +0,0 @@
-# FP8 GEMM 路径对比测试报告
-
-测试日期：2026-05-25
-测试节点：aikubeworker0012、aikubeworker0016
-测试 GPU：NVIDIA H100 80GB HBM3
-测试目标：对比同一 FP8 GEMM 规模下 PyTorch eager、CUDA Graph、Transformer Engine 和 direct cuBLASLt 的性能差异。
-
-## 一、测试结论
-
-本次 A-E 五条路径均已完成实测。
-
-核心结论：
-
-1. direct cuBLASLt 是本组测试里最快路径，两台机器分别达到 1626.6 TFLOPS 和 1598.1 TFLOPS。
-2. PyTorch eager `_scaled_mm` 默认路径约为 1161.9-1186.1 TFLOPS。
-3. 打开 `use_fast_accum=True` 后，PyTorch eager 路径有稳定提升，约提升 5.0%-6.7%。
-4. CUDA Graph + `_scaled_mm(use_fast_accum=True)` 进一步提升到 1277.7-1322.2 TFLOPS，但仍低于 direct cuBLASLt。
-5. Transformer Engine 本次使用的是 `te.Linear` + `fp8_autocast` 路径，不是裸 GEMM，因此包含 TE module、cast、FP8 recipe 等额外开销，结果低于 direct cuBLASLt，也低于 CUDA Graph `_scaled_mm`。
-
-这说明：当前 GPU 硬件和 cuBLASLt 裸 GEMM 能力本身没有问题；之前 PyTorch `_scaled_mm` 1170-1180 TFLOPS 左右的结果，主要反映的是 PyTorch eager 路径和当前 benchmark 方式下的端到端路径性能，而不是 GPU 算力极限。
-
-## 二、测试方法
-
-统一参数：
-
-| 参数 | 值 |
-|---|---:|
-| matrix_size | 8192 |
-| M/N/K | 8192/8192/8192 |
-| warmup | 50 |
-| iterations | 500 |
-| GPU index | 0 |
-| PyTorch | 2.6.0+cu124 |
-| CUDA | 12.4 |
-| 输入 dtype | FP8 E4M3 |
-| 输出 dtype | BF16 |
-| accumulation | FP32 |
-| scale_a / scale_b | 1.0 / 1.0 |
-
-测试路径定义：
-
-| 路径 | 名称 | 含义 |
-|---|---|---|
-| A | 当前 eager `_scaled_mm` | PyTorch 立即执行模式调用 `torch._scaled_mm`，默认 accumulation 参数 |
-| B | `_scaled_mm(use_fast_accum=True)` | PyTorch eager 路径，但显式打开 fast accumulation |
-| C | CUDA Graph + `_scaled_mm(use_fast_accum=True)` | 捕获并 replay 同一个 `_scaled_mm` 调用，降低 Python/PyTorch launch 间隙 |
-| D | Transformer Engine FP8 GEMM | `te.Linear` 在 `fp8_autocast` 下执行，包含 TE 层封装和 FP8 recipe 开销 |
-| E | direct cuBLASLt | C++/CUDA 直接调用 `cublasLtMatmul`，绕过 PyTorch eager |
-
-复现脚本：
-
-```bash
-MATRIX_SIZE=8192 WARMUP=50 ITERATIONS=500 GPU_INDEX=0 WORKSPACE_MB=256 \
-  /root/test_gpu_scripts/scripts/run_fp8_path_comparison.sh
-```
-
-## 三、实测结果
-
-### aikubeworker0012
-
-原始 JSON：`/Users/d-robotics/lab/test_gpu_scripts/reports_fp8_paths_combined_aikubeworker0012_20260525_045408.json`
-
-| 路径 | 状态 | TFLOPS | 单轮 CUDA event 时间 |
-|---|---|---:|---:|
-| A eager `_scaled_mm` default | OK | 1186.1 | 927.014 us |
-| B eager `_scaled_mm` fast_accum | OK | 1266.0 | 868.481 us |
-| C CUDA Graph + fast_accum | OK | 1322.2 | 831.573 us |
-| D Transformer Engine FP8 Linear | OK | 1153.2 | 953.478 us |
-| E direct cuBLASLt fast_accum | OK | 1626.6 | 未在 combined JSON 中记录 |
-
-相对 A 的提升：
-
-| 路径 | 相对 A |
-|---|---:|
-| B | +6.7% |
-| C | +11.5% |
-| D | -2.8% |
-| E | +37.1% |
-
-E 路径 cuBLASLt 算法信息：
-
-| 字段 | 值 |
-|---|---:|
-| algo_id | 52 |
-| tile_id | 23 |
-| splitk | 1 |
-| stages_id | 36 |
-| inner_shape_id | 0 |
-| cluster_shape_id | 3 |
-
-### aikubeworker0016
-
-原始 JSON：`/Users/d-robotics/lab/test_gpu_scripts/reports_fp8_paths_combined_aikubeworker0016_20260525_050048.json`
-
-| 路径 | 状态 | TFLOPS | 单轮 CUDA event 时间 |
-|---|---|---:|---:|
-| A eager `_scaled_mm` default | OK | 1161.9 | 946.313 us |
-| B eager `_scaled_mm` fast_accum | OK | 1220.4 | 900.960 us |
-| C CUDA Graph + fast_accum | OK | 1277.7 | 860.543 us |
-| D Transformer Engine FP8 Linear | OK | 1125.3 | 977.054 us |
-| E direct cuBLASLt fast_accum | OK | 1598.1 | 未在 combined JSON 中记录 |
-
-相对 A 的提升：
-
-| 路径 | 相对 A |
-|---|---:|
-| B | +5.0% |
-| C | +10.0% |
-| D | -3.2% |
-| E | +37.5% |
-
-E 路径 cuBLASLt 算法信息：
-
-| 字段 | 值 |
-|---|---:|
-| algo_id | 52 |
-| tile_id | 23 |
-| splitk | 1 |
-| stages_id | 36 |
-| inner_shape_id | 0 |
-| cluster_shape_id | 3 |
-
-## 四、对 PyTorch FP8 能否“上去”的判断
-
-从本次结果看，PyTorch FP8 路径可以通过两类方式上去：
-
-1. 打开更快的 math/accumulation 参数，例如 `use_fast_accum=True`。
-2. 使用 CUDA Graph replay，减少 eager 模式下每轮调度、enqueue 之间的间隙。
-
-但在当前 `matrix_size=8192`、单个 `_scaled_mm`、PyTorch eager/Graph benchmark 的测试形态下，PyTorch 路径仍没有达到 direct cuBLASLt 的 1598-1626 TFLOPS。也就是说，direct cuBLASLt 证明硬件和底层库有能力跑得更高；PyTorch eager `_scaled_mm` 测到的是 PyTorch 当前封装路径在这个 shape 下的实际表现。
-
-如果把目标定义为“让 PyTorch 代码路径更接近裸 cuBLASLt”，后续可以继续验证：
-
-1. 更大的 GEMM size，例如 16384。
-2. 固定 shape 后用 `torch.compile` 或 Inductor。
-3. CUDA Graph 覆盖更完整的 step，而不是只 replay 单个 op。
-4. 使用 Transformer Engine 的更底层 GEMM API 或官方 microbenchmark，而不是 `te.Linear` module forward。
-5. 对 `_scaled_mm` 做 Nsight Systems / Nsight Compute 抓取，确认实际 kernel、间隙和 cuBLASLt 算法选择。
-
-## 五、术语说明
-
-`eager` 指 PyTorch 立即执行模式。每次 Python 调用 `torch._scaled_mm`，PyTorch 都会经过 dispatcher、参数检查、Tensor 创建、准备 descriptor、调用 cuBLASLt heuristic，然后把 matmul enqueue 到 CUDA stream。
-
-`cuBLAS` 是 NVIDIA 的基础矩阵乘库。`cuBLASLt` 是更灵活的矩阵乘接口，支持更多 layout、FP8、算法 heuristic、workspace、epilogue 等能力。
-
-`direct cuBLASLt` 指我们自己写 C++/CUDA 直接调用 `cublasLtMatmul`，不经过 PyTorch eager，因此更接近裸 GEMM 峰值。
-
-`CUDA Graph` 指把一次 CUDA work 提前捕获成图，后续直接 replay，减少 CPU 侧反复 launch/调度带来的间隙。
-
-`Transformer Engine` 是 NVIDIA 面向 Transformer/FP8 训练优化的库。本次 D 路径使用的是 `te.Linear` module forward，不等同于裸 GEMM microbenchmark。
-
-## 六、文件清单
-
-本地脚本：
-
-| 文件 | 用途 |
-|---|---|
-| `/Users/d-robotics/lab/test_gpu_scripts/scripts/pytorch_fp8_path_bench.py` | A/B/C/D PyTorch 与 Transformer Engine 路径 |
-| `/Users/d-robotics/lab/test_gpu_scripts/scripts/cublaslt_fp8_gemm_bench.cu` | E direct cuBLASLt 路径 |
-| `/Users/d-robotics/lab/test_gpu_scripts/scripts/run_fp8_path_comparison.sh` | 统一运行并合并 A-E 结果 |
-
-本地结果：
-
-| 文件 | 用途 |
-|---|---|
-| `/Users/d-robotics/lab/test_gpu_scripts/reports_fp8_paths_combined_aikubeworker0012_20260525_045408.json` | aikubeworker0012 A-E 原始结果 |
-| `/Users/d-robotics/lab/test_gpu_scripts/reports_fp8_paths_combined_aikubeworker0016_20260525_050048.json` | aikubeworker0016 A-E 原始结果 |
-| `/Users/d-robotics/lab/test_gpu_scripts/reports_fp8_path_comparison_20260525.md` | 本中文汇总报告 |
diff --git a/reports_fp8_paths_combined_aikubeworker0012_20260525_042347.json b/reports_fp8_paths_combined_aikubeworker0012_20260525_042347.json
deleted file mode 100644
index 51a1540..0000000
--- a/reports_fp8_paths_combined_aikubeworker0012_20260525_042347.json
+++ /dev/null
@@ -1,142 +0,0 @@
-{
-  "source": "fp8_path_comparison",
-  "host": null,
-  "matrix_size": 8192,
-  "gpu_index": 0,
-  "pytorch": {
-    "source": "pytorch_fp8_path_bench",
-    "torch": "2.6.0+cu124",
-    "cuda": "12.4",
-    "gpu_index": 0,
-    "gpu_name": "NVIDIA H100 80GB HBM3",
-    "matrix_size": 8192,
-    "warmup": 50,
-    "iterations": 500,
-    "results": [
-      {
-        "name": "A_eager_scaled_mm_default",
-        "status": "ok",
-        "matrix_size": 8192,
-        "iterations": 500,
-        "warmup": 50,
-        "event_ms_total": 465.145,
-        "event_us_per_iter": 930.29,
-        "wall_ms_total": 465.21,
-        "tflops": 1181.9
-      },
-      {
-        "name": "B_eager_scaled_mm_fast_accum",
-        "status": "ok",
-        "matrix_size": 8192,
-        "iterations": 500,
-        "warmup": 50,
-        "event_ms_total": 440.252,
-        "event_us_per_iter": 880.504,
-        "wall_ms_total": 440.289,
-        "tflops": 1248.7
-      },
-      {
-        "name": "C_cuda_graph_scaled_mm_fast_accum",
-        "status": "ok",
-        "matrix_size": 8192,
-        "iterations": 500,
-        "warmup": 3,
-        "event_ms_total": 415.631,
-        "event_us_per_iter": 831.262,
-        "wall_ms_total": 415.664,
-        "tflops": 1322.7
-      },
-      {
-        "name": "D_transformer_engine_fp8_linear",
-        "status": "unavailable",
-        "reason": "ModuleNotFoundError: No module named 'transformer_engine'"
-      }
-    ],
-    "summary": {
-      "max_tflops": 1322.7,
-      "min_tflops": 1181.9,
-      "mean_tflops": 1251.1
-    }
-  },
-  "cublaslt": {
-    "source": "cuBLASLt",
-    "dtype": "fp8_e4m3_inputs_bf16_output_fp32_accum",
-    "matrix_size": 8192,
-    "warmup": 50,
-    "iterations": 500,
-    "fast_accum": 1,
-    "per_gpu": [
-      {
-        "index": 0,
-        "fp8_tflops": 1615.4,
-        "algo_id": 52,
-        "tile_id": 23,
-        "splitk": 1,
-        "stages_id": 36,
-        "inner_shape_id": 0,
-        "cluster_shape_id": 3
-      }
-    ],
-    "mean_tflops": 1615.4,
-    "min_tflops": 1615.4,
-    "max_tflops": 1615.4,
-    "spread_pct": 0.0
-  },
-  "results": [
-    {
-      "name": "A_eager_scaled_mm_default",
-      "status": "ok",
-      "matrix_size": 8192,
-      "iterations": 500,
-      "warmup": 50,
-      "event_ms_total": 465.145,
-      "event_us_per_iter": 930.29,
-      "wall_ms_total": 465.21,
-      "tflops": 1181.9
-    },
-    {
-      "name": "B_eager_scaled_mm_fast_accum",
-      "status": "ok",
-      "matrix_size": 8192,
-      "iterations": 500,
-      "warmup": 50,
-      "event_ms_total": 440.252,
-      "event_us_per_iter": 880.504,
-      "wall_ms_total": 440.289,
-      "tflops": 1248.7
-    },
-    {
-      "name": "C_cuda_graph_scaled_mm_fast_accum",
-      "status": "ok",
-      "matrix_size": 8192,
-      "iterations": 500,
-      "warmup": 3,
-      "event_ms_total": 415.631,
-      "event_us_per_iter": 831.262,
-      "wall_ms_total": 415.664,
-      "tflops": 1322.7
-    },
-    {
-      "name": "D_transformer_engine_fp8_linear",
-      "status": "unavailable",
-      "reason": "ModuleNotFoundError: No module named 'transformer_engine'"
-    },
-    {
-      "index": 0,
-      "algo_id": 52,
-      "tile_id": 23,
-      "splitk": 1,
-      "stages_id": 36,
-      "inner_shape_id": 0,
-      "cluster_shape_id": 3,
-      "name": "E_direct_cublaslt_fast_accum",
-      "status": "ok",
-      "tflops": 1615.4,
-      "matrix_size": 8192,
-      "iterations": 500,
-      "warmup": 50,
-      "fast_accum": 1,
-      "note": "Direct cuBLASLt FP8 GEMM, bypasses PyTorch eager."
-    }
-  ]
-}
\ No newline at end of file
diff --git a/reports_fp8_paths_combined_aikubeworker0012_20260525_045408.json b/reports_fp8_paths_combined_aikubeworker0012_20260525_045408.json
deleted file mode 100644
index 56cbce5..0000000
--- a/reports_fp8_paths_combined_aikubeworker0012_20260525_045408.json
+++ /dev/null
@@ -1,156 +0,0 @@
-{
-  "source": "fp8_path_comparison",
-  "host": null,
-  "matrix_size": 8192,
-  "gpu_index": 0,
-  "pytorch": {
-    "source": "pytorch_fp8_path_bench",
-    "torch": "2.6.0+cu124",
-    "cuda": "12.4",
-    "gpu_index": 0,
-    "gpu_name": "NVIDIA H100 80GB HBM3",
-    "matrix_size": 8192,
-    "warmup": 50,
-    "iterations": 500,
-    "results": [
-      {
-        "name": "A_eager_scaled_mm_default",
-        "status": "ok",
-        "matrix_size": 8192,
-        "iterations": 500,
-        "warmup": 50,
-        "event_ms_total": 463.507,
-        "event_us_per_iter": 927.014,
-        "wall_ms_total": 463.573,
-        "tflops": 1186.1
-      },
-      {
-        "name": "B_eager_scaled_mm_fast_accum",
-        "status": "ok",
-        "matrix_size": 8192,
-        "iterations": 500,
-        "warmup": 50,
-        "event_ms_total": 434.241,
-        "event_us_per_iter": 868.481,
-        "wall_ms_total": 434.492,
-        "tflops": 1266.0
-      },
-      {
-        "name": "C_cuda_graph_scaled_mm_fast_accum",
-        "status": "ok",
-        "matrix_size": 8192,
-        "iterations": 500,
-        "warmup": 3,
-        "event_ms_total": 415.786,
-        "event_us_per_iter": 831.573,
-        "wall_ms_total": 415.825,
-        "tflops": 1322.2
-      },
-      {
-        "name": "D_transformer_engine_fp8_linear",
-        "status": "ok",
-        "matrix_size": 8192,
-        "iterations": 500,
-        "warmup": 50,
-        "event_ms_total": 476.739,
-        "event_us_per_iter": 953.478,
-        "wall_ms_total": 476.8,
-        "tflops": 1153.2,
-        "note": "Transformer Engine Linear forward under fp8_autocast; includes TE module/cast overhead."
-      }
-    ],
-    "summary": {
-      "max_tflops": 1322.2,
-      "min_tflops": 1153.2,
-      "mean_tflops": 1231.9
-    }
-  },
-  "cublaslt": {
-    "source": "cuBLASLt",
-    "dtype": "fp8_e4m3_inputs_bf16_output_fp32_accum",
-    "matrix_size": 8192,
-    "warmup": 50,
-    "iterations": 500,
-    "fast_accum": 1,
-    "per_gpu": [
-      {
-        "index": 0,
-        "fp8_tflops": 1626.6,
-        "algo_id": 52,
-        "tile_id": 23,
-        "splitk": 1,
-        "stages_id": 36,
-        "inner_shape_id": 0,
-        "cluster_shape_id": 3
-      }
-    ],
-    "mean_tflops": 1626.6,
-    "min_tflops": 1626.6,
-    "max_tflops": 1626.6,
-    "spread_pct": 0.0
-  },
-  "results": [
-    {
-      "name": "A_eager_scaled_mm_default",
-      "status": "ok",
-      "matrix_size": 8192,
-      "iterations": 500,
-      "warmup": 50,
-      "event_ms_total": 463.507,
-      "event_us_per_iter": 927.014,
-      "wall_ms_total": 463.573,
-      "tflops": 1186.1
-    },
-    {
-      "name": "B_eager_scaled_mm_fast_accum",
-      "status": "ok",
-      "matrix_size": 8192,
-      "iterations": 500,
-      "warmup": 50,
-      "event_ms_total": 434.241,
-      "event_us_per_iter": 868.481,
-      "wall_ms_total": 434.492,
-      "tflops": 1266.0
-    },
-    {
-      "name": "C_cuda_graph_scaled_mm_fast_accum",
-      "status": "ok",
-      "matrix_size": 8192,
-      "iterations": 500,
-      "warmup": 3,
-      "event_ms_total": 415.786,
-      "event_us_per_iter": 831.573,
-      "wall_ms_total": 415.825,
-      "tflops": 1322.2
-    },
-    {
-      "name": "D_transformer_engine_fp8_linear",
-      "status": "ok",
-      "matrix_size": 8192,
-      "iterations": 500,
-      "warmup": 50,
-      "event_ms_total": 476.739,
-      "event_us_per_iter": 953.478,
-      "wall_ms_total": 476.8,
-      "tflops": 1153.2,
-      "note": "Transformer Engine Linear forward under fp8_autocast; includes TE module/cast overhead."
-    },
-    {
-      "index": 0,
-      "algo_id": 52,
-      "tile_id": 23,
-      "splitk": 1,
-      "stages_id": 36,
-      "inner_shape_id": 0,
-      "cluster_shape_id": 3,
-      "name": "E_direct_cublaslt_fast_accum",
-      "status": "ok",
-      "tflops": 1626.6,
-      "matrix_size": 8192,
-      "iterations": 500,
-      "warmup": 50,
-      "fast_accum": 1,
-      "note": "Direct cuBLASLt FP8 GEMM, bypasses PyTorch eager."
-    }
-  ]
-}
\ No newline at end of file
diff --git a/reports_fp8_paths_combined_aikubeworker0016_20260525_042402.json b/reports_fp8_paths_combined_aikubeworker0016_20260525_042402.json
deleted file mode 100644
index 6d6a3a2..0000000
--- a/reports_fp8_paths_combined_aikubeworker0016_20260525_042402.json
+++ /dev/null
@@ -1,142 +0,0 @@
-{
-  "source": "fp8_path_comparison",
-  "host": null,
-  "matrix_size": 8192,
-  "gpu_index": 0,
-  "pytorch": {
-    "source": "pytorch_fp8_path_bench",
-    "torch": "2.6.0+cu124",
-    "cuda": "12.4",
-    "gpu_index": 0,
-    "gpu_name": "NVIDIA H100 80GB HBM3",
-    "matrix_size": 8192,
-    "warmup": 50,
-    "iterations": 500,
-    "results": [
-      {
-        "name": "A_eager_scaled_mm_default",
-        "status": "ok",
-        "matrix_size": 8192,
-        "iterations": 500,
-        "warmup": 50,
-        "event_ms_total": 470.909,
-        "event_us_per_iter": 941.817,
-        "wall_ms_total": 470.974,
-        "tflops": 1167.4
-      },
-      {
-        "name": "B_eager_scaled_mm_fast_accum",
-        "status": "ok",
-        "matrix_size": 8192,
-        "iterations": 500,
-        "warmup": 50,
-        "event_ms_total": 452.608,
-        "event_us_per_iter": 905.215,
-        "wall_ms_total": 452.647,
-        "tflops": 1214.6
-      },
-      {
-        "name": "C_cuda_graph_scaled_mm_fast_accum",
-        "status": "ok",
-        "matrix_size": 8192,
-        "iterations": 500,
-        "warmup": 3,
-        "event_ms_total": 427.724,
-        "event_us_per_iter": 855.449,
-        "wall_ms_total": 427.768,
-        "tflops": 1285.3
-      },
-      {
-        "name": "D_transformer_engine_fp8_linear",
-        "status": "unavailable",
-        "reason": "ModuleNotFoundError: No module named 'transformer_engine'"
-      }
-    ],
-    "summary": {
-      "max_tflops": 1285.3,
-      "min_tflops": 1167.4,
-      "mean_tflops": 1222.4
-    }
-  },
-  "cublaslt": {
-    "source": "cuBLASLt",
-    "dtype": "fp8_e4m3_inputs_bf16_output_fp32_accum",
-    "matrix_size": 8192,
-    "warmup": 50,
-    "iterations": 500,
-    "fast_accum": 1,
-    "per_gpu": [
-      {
-        "index": 0,
-        "fp8_tflops": 1594.3,
-        "algo_id": 52,
-        "tile_id": 23,
-        "splitk": 1,
-        "stages_id": 36,
-        "inner_shape_id": 0,
-        "cluster_shape_id": 3
-      }
-    ],
-    "mean_tflops": 1594.3,
-    "min_tflops": 1594.3,
-    "max_tflops": 1594.3,
-    "spread_pct": 0.0
-  },
-  "results": [
-    {
-      "name": "A_eager_scaled_mm_default",
-      "status": "ok",
-      "matrix_size": 8192,
-      "iterations": 500,
-      "warmup": 50,
-      "event_ms_total": 470.909,
-      "event_us_per_iter": 941.817,
-      "wall_ms_total": 470.974,
-      "tflops": 1167.4
-    },
-    {
-      "name": "B_eager_scaled_mm_fast_accum",
-      "status": "ok",
-      "matrix_size": 8192,
-      "iterations": 500,
-      "warmup": 50,
-      "event_ms_total": 452.608,
-      "event_us_per_iter": 905.215,
-      "wall_ms_total": 452.647,
-      "tflops": 1214.6
-    },
-    {
-      "name": "C_cuda_graph_scaled_mm_fast_accum",
-      "status": "ok",
-      "matrix_size": 8192,
-      "iterations": 500,
-      "warmup": 3,
-      "event_ms_total": 427.724,
-      "event_us_per_iter": 855.449,
-      "wall_ms_total": 427.768,
-      "tflops": 1285.3
-    },
-    {
-      "name": "D_transformer_engine_fp8_linear",
-      "status": "unavailable",
-      "reason": "ModuleNotFoundError: No module named 'transformer_engine'"
-    },
-    {
-      "index": 0,
-      "algo_id": 52,
-      "tile_id": 23,
-      "splitk": 1,
-      "stages_id": 36,
-      "inner_shape_id": 0,
-      "cluster_shape_id": 3,
-      "name": "E_direct_cublaslt_fast_accum",
-      "status": "ok",
-      "tflops": 1594.3,
-      "matrix_size": 8192,
-      "iterations": 500,
-      "warmup": 50,
-      "fast_accum": 1,
-      "note": "Direct cuBLASLt FP8 GEMM, bypasses PyTorch eager."
-    }
-  ]
-}
\ No newline at end of file
diff --git a/reports_fp8_paths_combined_aikubeworker0016_20260525_050048.json b/reports_fp8_paths_combined_aikubeworker0016_20260525_050048.json
deleted file mode 100644
index 7168c05..0000000
--- a/reports_fp8_paths_combined_aikubeworker0016_20260525_050048.json
+++ /dev/null
@@ -1,156 +0,0 @@
-{
-  "source": "fp8_path_comparison",
-  "host": null,
-  "matrix_size": 8192,
-  "gpu_index": 0,
-  "pytorch": {
-    "source": "pytorch_fp8_path_bench",
-    "torch": "2.6.0+cu124",
-    "cuda": "12.4",
-    "gpu_index": 0,
-    "gpu_name": "NVIDIA H100 80GB HBM3",
-    "matrix_size": 8192,
-    "warmup": 50,
-    "iterations": 500,
-    "results": [
-      {
-        "name": "A_eager_scaled_mm_default",
-        "status": "ok",
-        "matrix_size": 8192,
-        "iterations": 500,
-        "warmup": 50,
-        "event_ms_total": 473.156,
-        "event_us_per_iter": 946.313,
-        "wall_ms_total": 473.199,
-        "tflops": 1161.9
-      },
-      {
-        "name": "B_eager_scaled_mm_fast_accum",
-        "status": "ok",
-        "matrix_size": 8192,
-        "iterations": 500,
-        "warmup": 50,
-        "event_ms_total": 450.48,
-        "event_us_per_iter": 900.96,
-        "wall_ms_total": 450.505,
-        "tflops": 1220.4
-      },
-      {
-        "name": "C_cuda_graph_scaled_mm_fast_accum",
-        "status": "ok",
-        "matrix_size": 8192,
-        "iterations": 500,
-        "warmup": 3,
-        "event_ms_total": 430.272,
-        "event_us_per_iter": 860.543,
-        "wall_ms_total": 430.304,
-        "tflops": 1277.7
-      },
-      {
-        "name": "D_transformer_engine_fp8_linear",
-        "status": "ok",
-        "matrix_size": 8192,
-        "iterations": 500,
-        "warmup": 50,
-        "event_ms_total": 488.527,
-        "event_us_per_iter": 977.054,
-        "wall_ms_total": 488.576,
-        "tflops": 1125.3,
-        "note": "Transformer Engine Linear forward under fp8_autocast; includes TE module/cast overhead."
-      }
-    ],
-    "summary": {
-      "max_tflops": 1277.7,
-      "min_tflops": 1125.3,
-      "mean_tflops": 1196.3
-    }
-  },
-  "cublaslt": {
-    "source": "cuBLASLt",
-    "dtype": "fp8_e4m3_inputs_bf16_output_fp32_accum",
-    "matrix_size": 8192,
-    "warmup": 50,
-    "iterations": 500,
-    "fast_accum": 1,
-    "per_gpu": [
-      {
-        "index": 0,
-        "fp8_tflops": 1598.1,
-        "algo_id": 52,
-        "tile_id": 23,
-        "splitk": 1,
-        "stages_id": 36,
-        "inner_shape_id": 0,
-        "cluster_shape_id": 3
-      }
-    ],
-    "mean_tflops": 1598.1,
-    "min_tflops": 1598.1,
-    "max_tflops": 1598.1,
-    "spread_pct": 0.0
-  },
-  "results": [
-    {
-      "name": "A_eager_scaled_mm_default",
-      "status": "ok",
-      "matrix_size": 8192,
-      "iterations": 500,
-      "warmup": 50,
-      "event_ms_total": 473.156,
-      "event_us_per_iter": 946.313,
-      "wall_ms_total": 473.199,
-      "tflops": 1161.9
-    },
-    {
-      "name": "B_eager_scaled_mm_fast_accum",
-      "status": "ok",
-      "matrix_size": 8192,
-      "iterations": 500,
-      "warmup": 50,
-      "event_ms_total": 450.48,
-      "event_us_per_iter": 900.96,
-      "wall_ms_total": 450.505,
-      "tflops": 1220.4
-    },
-    {
-      "name": "C_cuda_graph_scaled_mm_fast_accum",
-      "status": "ok",
-      "matrix_size": 8192,
-      "iterations": 500,
-      "warmup": 3,
-      "event_ms_total": 430.272,
-      "event_us_per_iter": 860.543,
-      "wall_ms_total": 430.304,
-      "tflops": 1277.7
-    },
-    {
-      "name": "D_transformer_engine_fp8_linear",
-      "status": "ok",
-      "matrix_size": 8192,
-      "iterations": 500,
-      "warmup": 50,
-      "event_ms_total": 488.527,
-      "event_us_per_iter": 977.054,
-      "wall_ms_total": 488.576,
-      "tflops": 1125.3,
-      "note": "Transformer Engine Linear forward under fp8_autocast; includes TE module/cast overhead."
-    },
-    {
-      "index": 0,
-      "algo_id": 52,
-      "tile_id": 23,
-      "splitk": 1,
-      "stages_id": 36,
-      "inner_shape_id": 0,
-      "cluster_shape_id": 3,
-      "name": "E_direct_cublaslt_fast_accum",
-      "status": "ok",
-      "tflops": 1598.1,
-      "matrix_size": 8192,
-      "iterations": 500,
-      "warmup": 50,
-      "fast_accum": 1,
-      "note": "Direct cuBLASLt FP8 GEMM, bypasses PyTorch eager."
-    }
-  ]
-}
\ No newline at end of file
diff --git a/reports_gpu_Test_combined_20260524.md b/reports_gpu_Test_combined_20260524.md
deleted file mode 100644
index b4fff0a..0000000
--- a/reports_gpu_Test_combined_20260524.md
+++ /dev/null
@@ -1,152 +0,0 @@
-# GPU_Test 合并报告
-
-- **日期:** 2026-05-24
-- **节点:** `aikubeworker0012 / 172.72.8.12`，`aikubeworker0016 / 172.72.8.16`
-- **GPU:** NVIDIA H100 80GB HBM3 x8 / node
-- **范围:** 单机单卡算力与多机多卡 NCCL 通信
-- **说明:** 本报告汇总既有原始测试结果，不重新启动额外压力测试。
-
-## 总体结论
-
-| 测试项 | 结论 | 说明 |
-|---|---|---|
-| 单机 GPU 识别 | PASS | 两台机器均识别 8 张 H100 80GB HBM3 |
-| 单机单卡 FP8 硬件算力 | PASS | direct cuBLASLt FP8 GEMM 两台机器均超过 `>= 1400 TFLOPS` |
-| PyTorch `_scaled_mm` FP8 路径 | FAIL / 软件栈信号 | 约 `1170-1180 TFLOPS`，低于阈值；已定位为 PyTorch eager / `_scaled_mm` benchmark 路径偏低，不作为硬件失败依据 |
-| 多机多卡 NCCL 正确性 | PASS | return code `0`，`Wrong=0` / `Out of bounds values: 0 OK` |
-| 多机多卡 NCCL 性能 | 符合当前 4x400Gbps 网络形态 | 2x8 allreduce / alltoall 低于 PDF 8x400Gbps 阈值，但该阈值不应直接硬套到当前 4x400Gbps 环境 |
-
-## 单机单卡 / 算力测试
-
-### 机器信息
-
-| Host | GPU | Driver | CUDA | GPU 数量 |
-|---|---|---|---|---:|
-| `aikubeworker0012` | NVIDIA H100 80GB HBM3 | 580.159.03 | 13.0 | 8 |
-| `aikubeworker0016` | NVIDIA H100 80GB HBM3 | 580.159.03 | 13.0 | 8 |
-
-来源：
-
-- `reports_single_gpu_aikubeworker0012.md`
-- `reports_single_gpu_aikubeworker0016.md`
-
-### 原始 PyTorch 单机算力结果
-
-| Host | FP32 | TF32 | FP16 | BF16 | FP8 `_scaled_mm` | 原始 Verdict |
-|---|---:|---:|---:|---:|---:|---|
-| `aikubeworker0012` | 52.0 | 362.3 | 691.0 | 713.0 | 1148.8 | FAIL |
-| `aikubeworker0016` | 51.9 | 357.8 | 667.2 | 699.1 | 1146.2 | FAIL |
-
-原始 PyTorch 路径使用 `torch._scaled_mm` 做 FP8 GEMM。后续复查显示，该路径会受到 PyTorch eager dispatch、输出 Tensor 创建、cuBLASLt heuristic 路径、默认 `use_fast_accum=False` 等因素影响，不能直接代表 H100 FP8 Tensor Core 硬件上限。
-
-### direct cuBLASLt FP8 GEMM 交叉验证
-
-测试参数：
-
-| 参数 | 值 |
-|---|---|
-| Benchmark | direct cuBLASLt FP8 GEMM |
-| Source | `scripts/cublaslt_fp8_gemm_bench.cu` |
-| Matrix | `8192 x 8192 x 8192` |
-| A/B dtype | FP8 E4M3 |
-| Output dtype | BF16 |
-| Compute type | `CUBLAS_COMPUTE_32F` |
-| Scale type | `CUDA_R_32F` |
-| Scale A/B | `1.0` |
-| Layout | TN |
-| fast accumulation | enabled |
-| Threshold | `>= 1400 TFLOPS` |
-
-结果：
-
-| Host | Mean FP8 TFLOPS | Min | Max | Spread | Threshold | Verdict |
-|---|---:|---:|---:|---:|---:|---|
-| `aikubeworker0012` | 1608.6 | 1599.0 | 1615.6 | 1.03% | >= 1400 | PASS |
-| `aikubeworker0016` | 1613.7 | 1602.3 | 1630.3 | 1.74% | >= 1400 | PASS |
-
-单卡逐张结果：
-
-| Host | GPU0 | GPU1 | GPU2 | GPU3 | GPU4 | GPU5 | GPU6 | GPU7 |
-|---|---:|---:|---:|---:|---:|---:|---:|---:|
-| `aikubeworker0012` | 1615.6 | 1611.0 | 1599.0 | 1607.1 | 1614.0 | 1604.4 | 1608.4 | 1609.1 |
-| `aikubeworker0016` | 1602.3 | 1604.0 | 1616.9 | 1610.6 | 1620.5 | 1630.3 | 1605.1 | 1620.2 |
-
-结论：direct cuBLASLt FP8 GEMM 已通过 `>= 1400 TFLOPS` 阈值，说明两台机器的 FP8 硬件计算路径具备达标能力。PyTorch `_scaled_mm` 的 FAIL 更适合作为软件栈 benchmark 路径问题记录，而不是 GPU 硬件失败结论。
-
-来源：
-
-- `reports_cublaslt_fp8_crosscheck_20260524.md`
-- `reports_cublaslt_fp8_gemm_aikubeworker0012_20260524_071148.json`
-- `reports_cublaslt_fp8_gemm_aikubeworker0016_20260524_071200.json`
-
-## 多机多卡 NCCL 测试
-
-### 测试环境
-
-| 项目 | 结果 |
-|---|---|
-| Hosts | `nccl-gpu-1(172.72.8.12)`，`nccl-gpu-2(172.72.8.16)` |
-| Topology | 2 nodes x 8 GPUs，合计 16 GPUs |
-| NCCL source | `nccl-tests-mpirun` |
-| NCCL network | IB |
-| GPU Direct RDMA | ENABLED |
-| Active HCA rails | `mlx5_0, mlx5_1, mlx5_6, mlx5_7` |
-| HCA speed | 4 条 `400 Gb/sec (4X NDR)` ACTIVE |
-
-注意：NCCL 表里的 `GB/s` 是大 B，即 Bytes/s。IB 网卡口径 `400 Gb/s` 是小 b，即 bits/s。
-
-### 2x8 全集合通信结果
-
-| Operation | Peak Bus BW | Avg Bus BW | PDF 8x400Gbps Threshold | Correctness | 当前 4x400Gbps 口径 |
-|---|---:|---:|---:|---|---|
-| allreduce | 354.27 GB/s | 354.45 GB/s | >= 491.84 GB/s | PASS | 符合当前硬件形态，低于 PDF 8 rail 阈值 |
-| alltoall | 37.00 GB/s | 37.14 GB/s | >= 76.54 GB/s | PASS | 符合当前硬件形态，低于 PDF 8 rail 阈值 |
-| broadcast | 191.65 GB/s | 190.25 GB/s | 未配置 PDF 阈值 | PASS | PASS / 仅记录 |
-| reducescatter | 192.75 GB/s | 192.74 GB/s | 未配置 PDF 阈值 | PASS | PASS / 仅记录 |
-| allgather | 192.14 GB/s | 192.47 GB/s | 未配置 PDF 阈值 | PASS | PASS / 仅记录 |
-| sendrecv | 26.98 GB/s | 26.97 GB/s | 未配置 PDF 阈值 | PASS | PASS / 仅记录 |
-
-结论：2x8 全集合通信测试中，NCCL 正确性通过。allreduce 和 alltoall 低于 PDF 8x400Gbps 参考阈值，但当前机器确认参与 NCCL 的是 4 条 400Gbps rail，因此该差距不应直接判定为当前 4x400Gbps 环境不合格。
-
-来源：
-
-- `reports_multinode_nccl_all_collectives_20260523_120144.md`
-- `reports_multinode_nccl_all_collectives_artifacts_manifest_20260523_120144.md`
-
-### PDF Matrix allreduce / alltoall 结果
-
-AllReduce（PDF 8x400Gbps 阈值对比，仅作参考）:
-
-| Topology | Peak Bus BW | Avg Bus BW | PDF 8x400Gbps Threshold | Gap | 当前解释 |
-|---|---:|---:|---:|---:|---|
-| 2 nodes x 1 GPU | 47.29 GB/s | 47.26 GB/s | >= 48.90 GB/s | -1.61 GB/s | 接近 PDF 阈值 |
-| 2 nodes x 2 GPUs | 137.16 GB/s | 137.13 GB/s | >= 136.93 GB/s | +0.23 GB/s | 达到 PDF 阈值 |
-| 2 nodes x 4 GPUs | 335.07 GB/s | 335.02 GB/s | >= 335.48 GB/s | -0.41 GB/s | 接近 PDF 阈值 |
-| 2 nodes x 8 GPUs | 353.85 GB/s | 353.85 GB/s | >= 491.84 GB/s | -137.99 GB/s | 低于 PDF 8 rail 阈值；当前为 4 rail 环境，不直接判不合格 |
-
-AllToAll（PDF 8x400Gbps 阈值对比，仅作参考）:
-
-| Topology | Peak Bus BW | Avg Bus BW | PDF 8x400Gbps Threshold | Gap | 当前解释 |
-|---|---:|---:|---:|---:|---|
-| 2 nodes x 1 GPU | 24.85 GB/s | 24.90 GB/s | >= 27.25 GB/s | -2.40 GB/s | 接近 PDF 阈值 |
-| 2 nodes x 2 GPUs | 47.76 GB/s | 47.98 GB/s | >= 54.41 GB/s | -6.65 GB/s | 低于 PDF 8 rail 阈值 |
-| 2 nodes x 4 GPUs | 72.74 GB/s | 72.80 GB/s | >= 73.73 GB/s | -0.99 GB/s | 接近 PDF 阈值 |
-| 2 nodes x 8 GPUs | 36.83 GB/s | 36.85 GB/s | >= 76.54 GB/s | -39.71 GB/s | 低于 PDF 8 rail 阈值；当前为 4 rail 环境，不直接判不合格 |
-
-来源：
-
-- `reports_multinode_nccl_pdf_matrix_run_20260523.md`
-- `reports_multinode_nccl_pdf_matrix_20260523_113803.md`
-
-## 风险与判断
-
-1. 单机 FP8 硬件能力通过 direct cuBLASLt 验证，当前不支持将 PyTorch `_scaled_mm` FAIL 直接判定为 GPU 硬件故障。
-2. 多机 NCCL 正确性通过，性能结果应按当前 4x400Gbps rail 环境解释。
-3. 当前多机环境确认参与 NCCL 的是 4 条 400G IB rail；PDF 参考环境为 8x400G 计算管理网络，因此 2x8 阈值与当前硬件形态不等价。
-4. 2x8 allreduce 和 alltoall 低于 PDF 8 rail 阈值，建议作为“与 PDF 参考环境差异”记录，而不是作为当前 4 rail 环境不合格结论。
-
-## 建议
-
-1. 单机 FP8 验收以 direct cuBLASLt 或 Transformer Engine GEMM benchmark 为主，PyTorch `_scaled_mm` 作为软件栈参考项保留。
-2. 多机 NCCL 后续若要按 PDF 阈值验收，需要先对齐 PDF 参考环境的 8x400Gbps rail 数量、NCCL net plugin / SHARP、跨 Leaf 交换策略、ECMP / 拥塞控制配置。
-3. 对外报告建议明确区分 `GB/s` 与 `Gb/s`：NCCL bus bandwidth 是大 B，IB 端口速率是小 b。
diff --git a/reports_gpu_Test_formal_20260524.md b/reports_gpu_Test_formal_20260524.md
deleted file mode 100644
index 49e2695..0000000
--- a/reports_gpu_Test_formal_20260524.md
+++ /dev/null
@@ -1,122 +0,0 @@
-# GPU_Test 双节点测试报告
-
-- **测试日期:** 2026-05-24
-- **测试节点:** `aikubeworker0012 / 172.72.8.12`，`aikubeworker0016 / 172.72.8.16`
-- **节点配置:** 每节点 8 张 NVIDIA H100 80GB HBM3 GPU
-- **测试范围:** 单机算力、单机 8 卡通信、多机 2x8 GPU 通信
-- **网络形态:** 当前参与 NCCL 的计算网络为 4 条 400Gbps IB rail
-
-## 结论摘要
-
-| 项目 | 结果摘要 |
-|---|---|
-| GPU 识别 | 两台节点均识别 8 张 H100 80GB HBM3 GPU |
-| 单机 FP8 GEMM | 两台节点 direct cuBLASLt FP8 GEMM 均超过 1600 TFLOPS |
-| 单机 8 卡 NCCL | 两台节点单机 8 卡 NCCL 集合通信均可正常完成，主要大包通信带宽稳定 |
-| 多机 2x8 NCCL | 两节点 16 GPU NCCL 正确性通过，所有测试 `Wrong=0` / return code `0` |
-| 多机网络口径 | 当前为 4x400Gbps IB rail 环境，结果按该硬件形态解释 |
-
-## 测试环境
-
-| Host | GPU | Driver | CUDA | GPU 数量 |
-|---|---|---|---|---:|
-| `aikubeworker0012` | NVIDIA H100 80GB HBM3 | 580.159.03 | 13.0 | 8 |
-| `aikubeworker0016` | NVIDIA H100 80GB HBM3 | 580.159.03 | 13.0 | 8 |
-
-## 单机算力测试
-
-### FP8 GEMM 硬件路径验证
-
-本项使用 direct cuBLASLt FP8 GEMM benchmark，绕过 PyTorch eager 调度路径，直接验证 GPU FP8 Tensor Core 与 cuBLASLt GEMM 能力。
-
-| 参数 | 配置 |
-|---|---|
-| GEMM shape | `8192 x 8192 x 8192` |
-| 输入类型 | FP8 E4M3 |
-| 输出类型 | BF16 |
-| 累加类型 | FP32 compute |
-| Layout | TN |
-| Scale | `scale_a = 1.0`，`scale_b = 1.0` |
-| fast accumulation | enabled |
-| 测试 GPU | 每节点 8 张 GPU 逐张测试 |
-
-| Host | Mean FP8 TFLOPS | Min | Max | Spread |
-|---|---:|---:|---:|---:|
-| `aikubeworker0012` | 1608.6 | 1599.0 | 1615.6 | 1.03% |
-| `aikubeworker0016` | 1613.7 | 1602.3 | 1630.3 | 1.74% |
-
-| Host | GPU0 | GPU1 | GPU2 | GPU3 | GPU4 | GPU5 | GPU6 | GPU7 |
-|---|---:|---:|---:|---:|---:|---:|---:|---:|
-| `aikubeworker0012` | 1615.6 | 1611.0 | 1599.0 | 1607.1 | 1614.0 | 1604.4 | 1608.4 | 1609.1 |
-| `aikubeworker0016` | 1602.3 | 1604.0 | 1616.9 | 1610.6 | 1620.5 | 1630.3 | 1605.1 | 1620.2 |
-
-**说明:** PyTorch `_scaled_mm` eager benchmark 结果约为 1170-1180 TFLOPS，该结果反映 PyTorch 软件路径与调度开销，不作为本报告的硬件算力结论。
-
-## 单机 8 卡 NCCL 通信测试
-
-本项在单个节点内使用 8 张 GPU 进行 NCCL 集合通信测试，结果单位为 `GB/s`，即 Bytes/s。
-
-| Operation | `aikubeworker0012` Bus BW | `aikubeworker0016` Bus BW |
-|---|---:|---:|
-| allreduce | 472.3 GB/s | 472.4 GB/s |
-| alltoall | 343.3 GB/s | 344.3 GB/s |
-| broadcast | 364.1 GB/s | 363.6 GB/s |
-| reducescatter | 352.8 GB/s | 353.1 GB/s |
-| allgather | 366.4 GB/s | 366.4 GB/s |
-| sendrecv | 369.0 GB/s | 368.9 GB/s |
-
-**说明:** 单机 8 卡通信主要依赖节点内 GPU 互联与 NCCL collective 实现。两台节点的同类 operation 结果接近，节点间差异较小。
-
-## 多机 2x8 NCCL 通信测试
-
-本项使用两台节点，每台 8 张 GPU，共 16 张 GPU 进行跨节点 NCCL 集合通信测试。
-
-### 网络环境
-
-| 项目 | 配置 |
-|---|---|
-| Host A | `aikubeworker0012 / 172.72.8.12` |
-| Host B | `aikubeworker0016 / 172.72.8.16` |
-| 拓扑 | 2 nodes x 8 GPUs |
-| NCCL network | IB |
-| GPU Direct RDMA | ENABLED |
-| Active rails | `mlx5_0, mlx5_1, mlx5_6, mlx5_7` |
-| Rail 速率 | 4 条 `400 Gb/sec (4X NDR)` ACTIVE |
-
-### 跨节点 NCCL 结果
-
-| Operation | Peak Bus BW | Avg Bus BW | Correctness |
-|---|---:|---:|---|
-| allreduce | 354.27 GB/s | 354.45 GB/s | PASS |
-| alltoall | 37.00 GB/s | 37.14 GB/s | PASS |
-| broadcast | 191.65 GB/s | 190.25 GB/s | PASS |
-| reducescatter | 192.75 GB/s | 192.74 GB/s | PASS |
-| allgather | 192.14 GB/s | 192.47 GB/s | PASS |
-| sendrecv | 26.98 GB/s | 26.97 GB/s | PASS |
-
-**正确性:** 本轮多机 NCCL 测试 return code 为 `0`，`Wrong=0`，未发现数据正确性错误。
-
-## 单位说明
-
-| 写法 | 含义 | 说明 |
-|---|---|---|
-| `GB/s` | Gigabytes per second | 大 B，字节每秒，NCCL bus bandwidth 使用此单位 |
-| `Gbps` / `Gb/s` | Gigabits per second | 小 b，比特每秒，IB 端口速率通常使用此单位 |
-
-换算关系：
-
-```text
-1 Byte = 8 bits
-400 Gb/s = 50 GB/s
-4 x 400 Gb/s = 1600 Gb/s = 200 GB/s 物理链路字节带宽
-```
-
-NCCL 的 `busbw` 是 collective 通信的逻辑折算带宽，不等同于单条物理链路的线速。
-
-## 结果说明
-
-1. 两台节点 GPU 识别正常，均为 8 张 H100 80GB HBM3。
-2. direct cuBLASLt FP8 GEMM 显示两台节点单卡 FP8 算力均超过 1600 TFLOPS，GPU FP8 硬件计算路径正常。
-3. 单机 8 卡 NCCL 通信在两台节点上结果接近，未观察到明显节点间异常差异。
-4. 多机 2x8 NCCL 正确性通过，跨节点通信功能正常。
-5. 当前多机通信结果应按 4x400Gbps IB rail 环境解释；若后续需要对齐 8x400Gbps 环境，应先确认 rail 数量、NCCL net plugin / SHARP、交换网络策略等配置一致。
diff --git a/reports_gpu_Test_pdf.css b/reports_gpu_Test_pdf.css
deleted file mode 100644
index 9a44015..0000000
--- a/reports_gpu_Test_pdf.css
+++ /dev/null
@@ -1,101 +0,0 @@
-@page {
-  size: A4 landscape;
-  margin: 13mm;
-}
-
-body {
-  color: #111827;
-  font-family: "PingFang SC", "Heiti SC", "Arial Unicode MS", sans-serif;
-  font-size: 11px;
-  line-height: 1.45;
-}
-
-h1 {
-  color: #0f172a;
-  font-size: 24px;
-  margin: 0 0 14px;
-}
-
-h2 {
-  border-bottom: 1px solid #cbd5e1;
-  color: #0f172a;
-  font-size: 17px;
-  margin: 24px 0 10px;
-  padding-bottom: 4px;
-}
-
-h3 {
-  color: #1f2937;
-  font-size: 13px;
-  margin: 16px 0 8px;
-}
-
-p {
-  margin: 7px 0;
-}
-
-code {
-  background: #f1f5f9;
-  border-radius: 3px;
-  color: #0f172a;
-  font-family: Menlo, Consolas, monospace;
-  font-size: 10px;
-  padding: 1px 3px;
-}
-
-pre {
-  background: #f8fafc;
-  border: 1px solid #e2e8f0;
-  border-radius: 4px;
-  padding: 8px;
-  white-space: pre-wrap;
-}
-
-table {
-  border-collapse: collapse;
-  margin: 8px 0 14px;
-  page-break-inside: auto;
-  width: 100%;
-}
-
-thead {
-  display: table-header-group;
-}
-
-tr {
-  page-break-inside: avoid;
-}
-
-th,
-td {
-  border: 1px solid #cbd5e1;
-  padding: 5px 6px;
-  text-align: left;
-  vertical-align: middle;
-  word-break: break-word;
-}
-
-th {
-  background: #e2e8f0;
-  color: #0f172a;
-  font-weight: 700;
-}
-
-tbody tr:nth-child(even) td {
-  background: #f8fafc;
-}
-
-a {
-  color: #2563eb;
-  text-decoration: none;
-}
-
-ul,
-ol {
-  margin: 6px 0 10px 20px;
-  padding: 0;
-}
-
-li {
-  margin: 3px 0;
-}
diff --git a/reports_h100_acceptance_closure_checklist_20260523.md b/reports_h100_acceptance_closure_checklist_20260523.md
deleted file mode 100644
index 6b0264f..0000000
--- a/reports_h100_acceptance_closure_checklist_20260523.md
+++ /dev/null
@@ -1,105 +0,0 @@
-# H100 验收收尾检查清单 2026-05-23
-
-## 结论
-
-当前项目已经可以进入“阶段性交付/问题转交”状态，但不能进入“生产验收通过”状态。
-
-原因不是测试没跑完，而是当前证据明确显示多个验收门禁仍为 `FAIL`。要真正收尾，必须满足下面两种路径之一：
-
-1. **通过路径：** 修复硬件/网络/软件环境后复跑，单节点、跨节点 RDMA、多节点 NCCL 均达到 PDF/配置阈值。
-2. **例外路径：** 硬件/网络/环境侧书面确认当前机器与 PDF 参考环境不等价，并给出新的验收阈值或豁免口径，再按新口径复核。
-
-在这两条路径完成前，本项目只能交付“已测证据 + 阻塞定位 + 复跑入口”，不能判定 H100 节点生产验收通过。
-
-## 当前可关闭的工作
-
-| 工作项 | 状态 | 证据 |
-|---|---|---|
-| 单节点 `test all` 入口 | 完成 | `scripts/run_h100_single_node_all.sh` |
-| 单节点中文原始汇总 | 完成 | `reports_test_all_latest_summary_cn_20260523.md` |
-| 跨节点 RDMA 单 rail 证据 | 完成 | `reports_rdma_cross_node_mlx5_0_20260523.md` |
-| 多节点 NCCL PDF matrix | 完成 | `scripts/run_multinode_nccl_pdf_matrix.sh`，`reports_multinode_nccl_pdf_matrix_run_20260523.md` |
-| 多节点 2x8 六项 collective | 完成 | `scripts/run_multinode_nccl_all_collectives.sh`，`reports_multinode_nccl_all_collectives_run_20260523.md` |
-| NCCL artifacts / checksum | 完成 | `reports_multinode_nccl_pdf_matrix_artifacts_manifest_20260523_113803.md`，`reports_multinode_nccl_all_collectives_artifacts_manifest_20260523_120144.md` |
-| 环境等价性分析 | 完成 | `reports_multinode_nccl_environment_gap_20260523.md` |
-| 交付包 manifest | 完成 | `reports_h100_acceptance_delivery_manifest_20260523.md` |
-| 网络/硬件/环境闭环请求 | 完成 | `reports_h100_network_hardware_escalation_request_20260523.md` |
-| 接手 runbook / README 入口 | 完成 | `README.md`，`reports_multinode_nccl_handoff_plan_20260523.md` |
-
-这些工作可以作为当前阶段交付物归档。
-
-## 不能关闭的验收门禁
-
-| 门禁 | 当前结果 | 现有证据 | 关闭条件 |
-|---|---|---|---|
-| 单节点 Compute | FAIL | 两台机器多 dtype 绝对 TFLOPS 未达阈值，部分 GPU spread 超 3% | 复核阈值/测试实现后重跑通过，或更新阈值口径 |
-| 单节点 NCCL | FAIL | 多 op/size 未达阈值，尤其小包和部分 2G case | 按 PDF/config 逐 size 通过，或明确小包/阈值豁免 |
-| 单节点 Stress | FAIL | 30 分钟可跑满，但温差和 `sw_power_cap` throttle 触发 FAIL | 调整散热/功耗策略或阈值后重跑通过 |
-| 单节点 RDMA | FAIL | read BW 未达 47 GB/s，`mlx5_4/5` 只有 100G | perftest read/write/latency 和端口速率满足验收口径 |
-| 跨节点 RDMA | FAIL | `mlx5_0` 写带宽 PASS，但读带宽和读写 latency FAIL | 双向 write/read BW/latency 全部达标 |
-| 多节点 NCCL allreduce | FAIL | 2x8 `353.85 GB/s`，目标 `491.84 GB/s` | 环境等价后达到 PDF 阈值，或按 4 x 400G rail 重定标 |
-| 多节点 NCCL alltoall | FAIL | 2x8 `36.83 GB/s`，目标 `76.54 GB/s` | 网络/plugin/SHARP/路径修复后达到阈值，或明确新口径 |
-| PDF 环境等价性 | 未证明 | 当前每节点只有 4 条 400G rail，缺外部 NCCL net plugin / SHARP | 确认参考环境 rail/plugin/SHARP/交换策略，并补齐或重定标 |
-
-## 最短收尾路径
-
-### 路径 A：按原 PDF 阈值验收
-
-必须先完成环境补齐：
-
-1. 确认每节点是否应有 8 条 400G IB rail；如果是，修复 `mlx5_4/5`、`mlx5_2/8`、`mlx5_3/9` 的速率/模式/状态。
-2. 如 PDF 参考环境使用 SHARP、HCOLL、UCX plugin 或 NCCL net plugin，则在两台节点补齐同等组件。
-3. 让网络侧确认跨 Leaf ECMP / adaptive routing / congestion control / credit wait 配置。
-4. 复跑：
-
-```bash
-cd /root/test_gpu_scripts
-bash scripts/run_h100_single_node_all.sh
-bash scripts/run_multinode_nccl_pdf_matrix.sh
-bash scripts/run_multinode_nccl_all_collectives.sh
-```
-
-关闭标准：`reports_h100_acceptance_current_status_*.md` 中所有必测项不再有 `FAIL`。
-
-### 路径 B：承认当前环境与 PDF 不等价
-
-必须拿到新的验收口径：
-
-1. 硬件/网络侧确认当前机器实际有效 400G IB rail 数量。
-2. 明确是否允许按 4 x 400G rail 的物理上限重定 allreduce 阈值。
-3. 明确 2x8 alltoall 的合理目标，或要求安装 plugin/SHARP 后再判。
-4. 明确单节点 Compute、Stress、RDMA 的阈值是否沿用 PDF 原口径。
-5. 用新口径更新配置后复跑并生成新报告。
-
-关闭标准：新口径必须写进配置或报告，不能只口头说明。
-
-## 下一步优先级
-
-| 优先级 | 动作 | 负责人建议 | 为什么 |
-|---:|---|---|---|
-| P0 | 确认 PDF 参考环境 rail/plugin/SHARP 状态 | 硬件/网络/环境侧 | 不确认等价性，2x8 allreduce 阈值是否合理无法判断 |
-| P0 | 查跨 Leaf alltoall 网络路径 | 网络侧 | alltoall 低于目标过多，且参数 sweep 无稳定收益 |
-| P1 | 复核单节点 Compute 阈值和测试 dtype 路径 | 测试/平台侧 | 两台机器多 dtype 绝对阈值均失败，需要确认是不是口径问题 |
-| P1 | 处理 Stress `sw_power_cap` 和温差 | 机房/硬件侧 | 压测能跑满，但 telemetry 门禁未过 |
-| P1 | 处理 RDMA read BW/latency | 网络/OFED/固件侧 | 单节点和跨节点 RDMA 都有 read/latency 缺口 |
-| P2 | 启用 plugin/SHARP 后复跑 NCCL graph | 平台侧 | 用于验证 `plugin_missing` 是否消失、图策略是否变化 |
-
-## 当前交付物入口
-
-优先读：
-
-1. `reports_h100_acceptance_current_status_20260523.md`
-2. `reports_h100_acceptance_closure_checklist_20260523.md`
-3. `reports_h100_acceptance_delivery_manifest_20260523.md`
-4. `reports_h100_network_hardware_escalation_request_20260523.md`
-5. `reports_multinode_nccl_handoff_plan_20260523.md`
-6. `reports_multinode_nccl_environment_gap_20260523.md`
-7. `reports_multinode_nccl_latest_index_20260523.md`
-
-当前项目可以向外汇报为：
-
-```text
-测试脚本、复跑入口、原始 artifacts、checksum 和中文报告已经齐备；
-但当前 H100 生产验收未通过，剩余问题集中在单节点 Compute/NCCL/Stress/RDMA、
-跨节点 RDMA read/latency、多节点 NCCL 2x8 allreduce/alltoall 性能，以及 PDF 环境等价性。
-```
diff --git a/reports_h100_acceptance_current_status_20260523.md b/reports_h100_acceptance_current_status_20260523.md
deleted file mode 100644
index 0686918..0000000
--- a/reports_h100_acceptance_current_status_20260523.md
+++ /dev/null
@@ -1,164 +0,0 @@
-# H100 验收当前状态总览 2026-05-23
-
-## 一句话结论
-
-当前脚本能力和证据链已经基本补齐：单节点 `test all`、多机多卡 PDF matrix、2x8 六项 collective、跨节点 RDMA、NCCL artifacts、环境快照和 checksum 都已经有可复跑入口和原始证据。但按当前 PDF/配置口径，两台 H100 节点仍不能判定生产验收通过，主要阻塞不是脚本没跑，而是多项实测指标低于阈值，以及当前硬件/软件环境无法证明与 PDF 参考环境等价。
-
-## 当前总状态
-
-| 范围 | 当前证据 | 结论 | 主要阻塞 |
-|---|---|---|---|
-| 单节点 `test all` | `reports_test_all_latest_summary_cn_20260523.md` | 两台均 FAIL | Compute、NCCL、Stress、RDMA |
-| 跨节点 RDMA | `reports_rdma_cross_node_mlx5_0_20260523.md` | FAIL | read BW、write/read latency 未达阈值 |
-| 多机多卡 PDF matrix | `reports_multinode_nccl_pdf_matrix_run_20260523.md` | FAIL | 2x8 allreduce/alltoall 差距大，1/4 GPU 档位部分小差距 |
-| 多机多卡 2x8 六项 collective | `reports_multinode_nccl_all_collectives_run_20260523.md` | FAIL / evidence complete | 6 项正确性通过；allreduce/alltoall 按 PDF 阈值 FAIL |
-| NCCL artifacts 信号 | `reports_multinode_nccl_artifact_signal_analysis_20260523.md` | 基础链路正常 | IB/GDRDMA/HCA 均正常；无 SHARP/CollNet/外部 net plugin |
-| 环境等价性 | `reports_multinode_nccl_environment_gap_20260523.md` | 未证明等价 | 每节点只有 4 条 400G rail，缺 NCCL net plugin / SHARP |
-| 收尾检查 | `reports_h100_acceptance_closure_checklist_20260523.md` | 可阶段性交付 | 生产验收门禁仍未关闭 |
-| 交付包 manifest | `reports_h100_acceptance_delivery_manifest_20260523.md` | 已形成 | 入口、脚本、远端 artifacts、checksum 已汇总 |
-| 网络/硬件/环境闭环 | `reports_h100_network_hardware_escalation_request_20260523.md` | 已形成请求 | 等待 rail/plugin/SHARP/交换策略/阈值口径回填 |
-
-## 已完成的能力
-
-| 能力 | 当前状态 |
-|---|---|
-| 单节点 H100 all 验收入口 | `scripts/run_h100_single_node_all.sh` 已可用，默认带环境快照 |
-| 多机 PDF matrix 入口 | `scripts/run_multinode_nccl_pdf_matrix.sh` 已可用，自动归档每个 case 的 `cmd/stdout/stderr/json` |
-| 多机 2x8 六项 collective 入口 | `scripts/run_multinode_nccl_all_collectives.sh` 已可用，覆盖 `allreduce/alltoall/broadcast/reducescatter/allgather/sendrecv` |
-| NCCL 深度诊断入口 | `scripts/multinode_nccl_deep_diagnose.sh` 已可用，覆盖 preflight、counter、graph、PXN sweep |
-| 环境等价性快照 | `scripts/nccl_environment_snapshot.sh` 已可用 |
-| 原始证据归档 | PDF matrix 和六项 collective artifacts 均已 tar + checksum |
-| 中文解释文档 | 指标说明、NCCL/RDMA 概念、handoff、environment gap、artifact signal analysis 均已生成 |
-
-## 单节点验收状态
-
-两台机器的单节点 `test all` 当前都是：
-
-```text
-Suite: 6/10 PASS
-PDF acceptance: FAIL
-```
-
-通过项：
-
-- GPU Info
-- Health
-- Memory Bandwidth
-- NVLink/NVSwitch
-- DCGM diag -r 3
-- Training Simulation
-
-失败项：
-
-| 项目 | 当前现象 | 备注 |
-|---|---|---|
-| Compute | 多 dtype 绝对 TFLOPS 阈值未达，部分 GPU 间 spread 超 3% | 需要复核 H100 阈值口径和具体 dtype 路径 |
-| NCCL 单机 | 真实 `nccl-tests` 已可测，但多 op/size 未达阈值 | 主要是 1M 小包，以及 reducescatter/allgather 的 2G |
-| Stress | 30 分钟可跑满，但温差和 `sw_power_cap` throttle 导致 FAIL | 更像散热/功耗策略或阈值口径问题 |
-| RDMA 单机 | read BW 未达标，部分端口速率低于 400G | 单机 local-loopback 不能替代跨节点 RDMA |
-
-## 跨节点 RDMA 状态
-
-跨节点 `mlx5_0` 单 rail perftest 结果：
-
-| Direction | Test | Value | Threshold | Status |
-|---|---|---:|---:|---|
-| 0016 -> 0012 | ib_write_bw | 49.35 GB/s | >= 47 GB/s | PASS |
-| 0016 -> 0012 | ib_read_bw | 44.36 GB/s | >= 47 GB/s | FAIL |
-| 0016 -> 0012 | ib_write_lat avg | 2.17 us | <= 2.0 us | FAIL |
-| 0016 -> 0012 | ib_read_lat avg | 4.05 us | <= 3.5 us | FAIL |
-| 0012 -> 0016 | ib_write_bw | 48.38 GB/s | >= 47 GB/s | PASS |
-| 0012 -> 0016 | ib_read_bw | 44.37 GB/s | >= 47 GB/s | FAIL |
-| 0012 -> 0016 | ib_write_lat avg | 2.13 us | <= 2.0 us | FAIL |
-| 0012 -> 0016 | ib_read_lat avg | 4.08 us | <= 3.5 us | FAIL |
-
-判断：链路连通、ibping 正常、PFC/ECN/CNP/congestion counter 干净；但 read bandwidth 和 latency 仍低于阈值，需要网络/OFED/BIOS/firmware 或 perftest 参数侧继续确认。
-
-## 多机多卡 NCCL 状态
-
-### PDF Matrix
-
-| Topology | AllReduce | Target | Status | AllToAll | Target | Status |
-|---|---:|---:|---|---:|---:|---|
-| 2 nodes x 1 GPU | 47.29 | 48.90 | FAIL | 24.85 | 27.25 | FAIL |
-| 2 nodes x 2 GPUs | 137.16 | 136.93 | PASS | 47.76 | 54.41 | FAIL |
-| 2 nodes x 4 GPUs | 335.07 | 335.48 | FAIL | 72.74 | 73.73 | FAIL |
-| 2 nodes x 8 GPUs | 353.85 | 491.84 | FAIL | 36.83 | 76.54 | FAIL |
-
-所有 case 均 `returncode=0`、`wrong=0`，所以 FAIL 来自性能阈值，不是功能错误。
-
-### 2x8 六项 Collective 补测
-
-| Operation | Peak Bus BW | Threshold | Correctness | Network | Status |
-|---|---:|---:|---|---|---|
-| allreduce | 354.27 | >= 491.84 | wrong=0 | IB/GDRDMA | FAIL |
-| alltoall | 37.00 | >= 76.54 | wrong=0 | IB/GDRDMA | FAIL |
-| broadcast | 191.65 | 未配置 | wrong=0 | IB/GDRDMA | PASS evidence |
-| reducescatter | 192.75 | 未配置 | wrong=0 | IB/GDRDMA | PASS evidence |
-| allgather | 192.14 | 未配置 | wrong=0 | IB/GDRDMA | PASS evidence |
-| sendrecv | 26.98 | 未配置 | wrong=0 | IB/GDRDMA | PASS evidence |
-
-这说明多机多卡 collective 覆盖面已经补齐，但生产性能是否达标仍取决于 PDF 是否有对应跨节点阈值，以及当前环境是否与 PDF 等价。
-
-## 当前最关键阻塞
-
-### 1. PDF 参考环境等价性未确认
-
-当前两台节点每节点只有 4 条可用于 NCCL 的 400G IB rail：
-
-```text
-mlx5_0, mlx5_1, mlx5_6, mlx5_7
-```
-
-其他 HCA：
-
-```text
-mlx5_4, mlx5_5: 100G InfiniBand
-mlx5_2, mlx5_8: 25G Ethernet
-mlx5_3, mlx5_9: DOWN
-```
-
-PDF 2x8 allreduce 目标 `491.84 GB/s busbw` 反推 algbw 为 `262.31 GB/s`，高于当前 4 x 400G rail 的理论单向原始带宽 `200 GB/s`。如果 PDF 参考环境有更多 400G rail 或 SHARP/plugin，当前硬件/软件栈不等价。
-
-### 2. 缺少 NCCL net plugin / SHARP
-
-当前没有发现：
-
-```text
-libnccl-net*.so*
-libsharp*.so*
-SHARP / HCOLL package
-```
-
-NCCL 日志中没有 SHARP/CollNet 迹象，当前走 internal IB plugin。
-
-### 3. alltoall 仍是独立问题
-
-`NCCL_PXN_DISABLE=1` 后 alltoall rail 更均衡，但 2x8 仍只有约 `36-37 GB/s`。已有 sweep 没找到稳定正收益，下一步应该交给网络路径、ECMP/adaptive routing、拥塞控制、plugin/SHARP 等方向，而不是继续盲调 NCCL 小参数。
-
-### 4. 单节点 Compute/Stress/RDMA 也未过
-
-即使多机 NCCL 后续解决，两台机器按当前 PDF `test all` 仍因 Compute、Stress、RDMA 项失败，不能直接判整机生产验收通过。
-
-## 建议下一步
-
-1. **硬件/网络侧先确认 PDF 等价性。** 确认参考环境每节点到底是 4 条还是 8 条 400G rail，是否启用 SHARP/NCCL net plugin，交换网络是否同一策略。
-2. **环境侧补齐或明确排除 SHARP/plugin。** 如果 PDF 环境有，当前必须补齐后重跑 `scripts/run_multinode_nccl_pdf_matrix.sh` 和 `scripts/run_multinode_nccl_all_collectives.sh`。
-3. **网络侧排查 alltoall。** 重点看跨 Leaf ECMP/adaptive routing/拥塞控制/credit wait，而不是只看链路是否 up。
-4. **单节点继续分项收敛。** Compute 阈值、Stress 温差/功耗 cap、RDMA read/latency 需要分别确认是机器问题、配置问题还是阈值口径问题。
-5. **如果硬件不等价，调整验收阈值或换等价节点复测。** 当前证据不支持把 4 rail 环境直接按疑似更高规格 PDF 阈值判定。
-
-## 当前最值得先读的文件
-
-| 顺序 | 文件 | 用途 |
-|---:|---|---|
-| 1 | `reports_h100_acceptance_current_status_20260523.md` | 当前总览和阻塞清单 |
-| 2 | `reports_h100_acceptance_closure_checklist_20260523.md` | 收尾检查清单和关闭条件 |
-| 3 | `reports_h100_acceptance_delivery_manifest_20260523.md` | 交付包 manifest 和 checksum |
-| 4 | `reports_h100_network_hardware_escalation_request_20260523.md` | 给网络/硬件/环境侧的闭环请求 |
-| 5 | `reports_multinode_nccl_handoff_plan_20260523.md` | 给网络/硬件/环境侧的交接计划 |
-| 6 | `reports_multinode_nccl_environment_gap_20260523.md` | PDF 环境等价性缺口 |
-| 7 | `reports_multinode_nccl_artifact_signal_analysis_20260523.md` | NCCL artifacts 信号分析 |
-| 8 | `reports_multinode_nccl_all_collectives_run_20260523.md` | 多机 2x8 六项 collective 补测摘要 |
-| 9 | `reports_test_all_latest_summary_cn_20260523.md` | 单节点 test all 中文汇总 |
-| 10 | `reports_rdma_cross_node_mlx5_0_20260523.md` | 跨节点 RDMA 单 rail 证据 |
diff --git a/reports_h100_acceptance_delivery_manifest_20260523.md b/reports_h100_acceptance_delivery_manifest_20260523.md
deleted file mode 100644
index 735b5ea..0000000
--- a/reports_h100_acceptance_delivery_manifest_20260523.md
+++ /dev/null
@@ -1,152 +0,0 @@
-# H100 验收交付包 Manifest 2026-05-23
-
-## 交付结论
-
-当前分支：`h100-acceptance-current`
-
-最新 commit：以 `git log -1 --oneline` 为准。
-
-当前状态：**测试侧阶段性交付完成，生产验收未通过。**
-
-本交付包已经覆盖单节点 `test all`、跨节点 RDMA、多节点 NCCL PDF matrix、多节点 2x8 六项 collective、环境等价性分析、网络/硬件/环境闭环请求、复跑脚本和 artifacts checksum。剩余工作需要网络/硬件/环境侧确认后才能继续往最终验收推进。
-
-## 主入口
-
-按下面顺序阅读：
-
-| 顺序 | 文件 | 用途 |
-|---:|---|---|
-| 1 | `README.md` | 仓库入口和 H100 当前验收入口 |
-| 2 | `reports_h100_acceptance_current_status_20260523.md` | 当前总状态和阻塞项 |
-| 3 | `reports_h100_acceptance_closure_checklist_20260523.md` | 可交付项、未关闭门禁、收尾路径 |
-| 4 | `reports_h100_acceptance_pr_summary_20260523.md` | PR/审阅摘要 |
-| 5 | `reports_h100_network_hardware_escalation_request_20260523.md` | 给网络/硬件/环境侧的回填请求 |
-| 6 | `reports_multinode_nccl_latest_index_20260523.md` | 多节点 NCCL 报告索引 |
-
-## 核心报告
-
-| 分类 | 文件 | 当前结论 |
-|---|---|---|
-| 总览 | `reports_h100_acceptance_current_status_20260523.md` | FAIL，证据链完整但门禁未过 |
-| 收尾 | `reports_h100_acceptance_closure_checklist_20260523.md` | 可阶段性交付，不能判生产通过 |
-| PR 摘要 | `reports_h100_acceptance_pr_summary_20260523.md` | 给代码审阅和合并说明使用 |
-| 闭环请求 | `reports_h100_network_hardware_escalation_request_20260523.md` | 等待网络/硬件/环境侧回填 |
-| 单节点 | `reports_test_all_latest_summary_cn_20260523.md` | 两台均 `6/10 PASS`，整体 FAIL |
-| 跨节点 RDMA | `reports_rdma_cross_node_mlx5_0_20260523.md` | write BW PASS，read BW/latency FAIL |
-| 多节点 NCCL PDF matrix | `reports_multinode_nccl_pdf_matrix_run_20260523.md` | 8 个 case 仅 1 个性能 PASS；正确性均 OK |
-| 多节点 NCCL 六项 collective | `reports_multinode_nccl_all_collectives_run_20260523.md` | 6 项正确性 OK；allreduce/alltoall 按 PDF 阈值 FAIL |
-| 环境等价性 | `reports_multinode_nccl_environment_gap_20260523.md` | 当前不能证明与 PDF 等价 |
-| NCCL artifact 信号 | `reports_multinode_nccl_artifact_signal_analysis_20260523.md` | IB/GDRDMA 正常；缺外部 plugin/SHARP |
-| 接手计划 | `reports_multinode_nccl_handoff_plan_20260523.md` | 给继续定位和复跑的人使用 |
-
-## 可复跑入口
-
-| 脚本 | 用途 | 建议执行位置 |
-|---|---|---|
-| `scripts/run_h100_single_node_all.sh` | 单节点 H100 全量验收 | 两台节点分别执行 |
-| `scripts/run_multinode_nccl_pdf_matrix.sh` | 多节点 NCCL PDF matrix | `nccl-gpu-1` |
-| `scripts/run_multinode_nccl_all_collectives.sh` | 多节点 2x8 六项 collective | `nccl-gpu-1` |
-| `scripts/multinode_nccl_deep_diagnose.sh` | 多节点 NCCL 深度诊断 | `nccl-gpu-1` |
-| `scripts/nccl_environment_snapshot.sh` | 单节点 HCA/plugin/topo 快照 | 两台节点分别执行 |
-
-推荐复跑顺序：
-
-```bash
-cd /root/test_gpu_scripts
-bash scripts/multinode_nccl_deep_diagnose.sh preflight
-bash scripts/run_multinode_nccl_pdf_matrix.sh
-bash scripts/run_multinode_nccl_all_collectives.sh
-```
-
-如果网络/硬件/环境侧调整了单节点条件，还需要分别在两台节点执行：
-
-```bash
-cd /root/test_gpu_scripts
-bash scripts/run_h100_single_node_all.sh
-```
-
-## 远端位置
-
-两台远端默认路径：
-
-```text
-nccl-gpu-1: /root/test_gpu_scripts
-nccl-gpu-2: /root/test_gpu_scripts
-```
-
-最新多节点 NCCL 原始 artifacts 位于 `nccl-gpu-1`：
-
-| 类型 | 路径 |
-|---|---|
-| PDF matrix raw report | `/root/test_gpu_scripts/reports/multinode_nccl_pdf_matrix_20260523_113803.md` |
-| PDF matrix artifacts dir | `/root/test_gpu_scripts/reports/multinode_nccl_pdf_matrix_20260523_113803_artifacts` |
-| PDF matrix artifacts tar | `/root/test_gpu_scripts/reports/multinode_nccl_pdf_matrix_20260523_113803_artifacts.tar.gz` |
-| 六项 collective raw report | `/root/test_gpu_scripts/reports/multinode_nccl_all_collectives_20260523_120144.md` |
-| 六项 collective artifacts dir | `/root/test_gpu_scripts/reports/multinode_nccl_all_collectives_20260523_120144_artifacts` |
-| 六项 collective artifacts tar | `/root/test_gpu_scripts/reports/multinode_nccl_all_collectives_20260523_120144_artifacts.tar.gz` |
-
-## Artifact 校验
-
-PDF matrix bundle checksum：
-
-```text
-682ac637460472d464a0d56ccc0f3335ed7f79a270157a403ebec23b8d9feceb  reports/multinode_nccl_pdf_matrix_20260523_113803.md
-7371fcaf7269f92eb1544e5e63573ebf77f4ae38f668b5b22169ca86e6d603ee  reports/multinode_nccl_pdf_matrix_20260523_113803_artifacts.tar.gz
-```
-
-六项 collective bundle checksum：
-
-```text
-06c565281813c4260da9cfee8f0b0289b61b3be95c01dd670c71fa1a441133e3  reports/multinode_nccl_all_collectives_20260523_120144.md
-fa5961d47a5905da6ebc6c726421d73ddc2314a316a8f578683d31fe69c256e5  reports/multinode_nccl_all_collectives_20260523_120144_artifacts.tar.gz
-```
-
-逐文件 checksum：
-
-| 文件 | 用途 |
-|---|---|
-| `reports_multinode_nccl_all_collectives_20260523_120144_bundle.sha256` | 六项 collective raw report + tar checksum |
-| `reports_multinode_nccl_all_collectives_20260523_120144_artifacts.sha256` | 六项 collective artifacts 逐文件 checksum |
-| `reports_multinode_nccl_pdf_matrix_artifacts_manifest_20260523_113803.md` | PDF matrix case summary + bundle checksum |
-| `reports_multinode_nccl_all_collectives_artifacts_manifest_20260523_120144.md` | 六项 collective case summary + bundle/per-file checksum |
-
-## 入口文件 SHA256
-
-以下 hash 用于确认本地与两台远端入口文件一致。本 manifest 本身不做自引用 hash。
-
-```text
-e2faf6cbd968924727c669827d7e838d5165ee961133c8e55e8993134b5e7b63  README.md
-846c3da4ac655a0b3ad072e4c4475d91b55e2bdc9d8aedb9c5f9d800608fb64c  reports_h100_acceptance_current_status_20260523.md
-4a0ee9f456acc1284bf3a42df5bd338affb831471c27ca4b6584201acd72fd52  reports_h100_acceptance_closure_checklist_20260523.md
-0c71f36b9b1a6c5a73bd32337a56a702d3faa37c02640b93cb5d00b9b80c362f  reports_h100_acceptance_pr_summary_20260523.md
-45438db9204ceef5f65019a6594c016f3183799ed3b89dcf40f383a34f9e3466  reports_h100_network_hardware_escalation_request_20260523.md
-d982d6f3698e8860b8505d65105f6056c11f1f72758401a4613ae8315b6f92d0  reports_multinode_nccl_latest_index_20260523.md
-8fca70e703961745d5bdacaa3fccb814709c426c0fa7713d0df2d1f2fb26a3f4  reports_multinode_nccl_handoff_plan_20260523.md
-b0d0d1fa9b1aa0d8cbdd2672508df5c7bafffc91b607b35b199e624352147e75  reports_multinode_nccl_environment_gap_20260523.md
-a7bc27c630fb97c0b83a3427ed4017a16a21e1285f4be5a2cc21f653921fab2b  reports_multinode_nccl_pdf_matrix_run_20260523.md
-60bdb85e087e796d59c6f0cb7e79c7e60b4147b5fff8c6b60606f6c1f53b1b58  reports_multinode_nccl_all_collectives_run_20260523.md
-6affec63694d31dc2d7f097210794e7821e931b8c8b9ac8f451c6f7948bf138a  reports_test_all_latest_summary_cn_20260523.md
-3895cdf040220aa13093c3377c301580120f04eb9958dbb7c3df3d7285c2d733  reports_rdma_cross_node_mlx5_0_20260523.md
-```
-
-## 还不能关闭的事项
-
-| 项目 | 当前阻塞 |
-|---|---|
-| 单节点 Compute | 多 dtype 绝对 TFLOPS 阈值未达，部分 GPU spread 超 3% |
-| 单节点 NCCL | 多 op/size 未达阈值，小包和部分 2G case 明显 |
-| 单节点 Stress | 30 分钟可跑满，但温差和 `sw_power_cap` throttle 触发 FAIL |
-| 单节点 RDMA | read BW 未达 47 GB/s，部分端口不是 400G |
-| 跨节点 RDMA | read BW 和 write/read latency 未达阈值 |
-| 多节点 NCCL allreduce | 2x8 `353.85 GB/s`，PDF 目标 `491.84 GB/s` |
-| 多节点 NCCL alltoall | 2x8 `36.83 GB/s`，PDF 目标 `76.54 GB/s` |
-| PDF 环境等价性 | 当前只有 4 条 400G rail，缺 NCCL net plugin / SHARP 证据 |
-
-## 下一步闭环条件
-
-网络/硬件/环境侧需要给出以下任一结论：
-
-1. 当前两台机器已修复到 PDF 参考环境等价状态，测试侧复跑。
-2. 当前机器与 PDF 参考环境不等价，但可以接受新的阈值或豁免口径。
-3. 当前硬件/网络不满足交付规格，需要先修复。
-4. PDF 阈值不适用于当前跨 Leaf/4 rail/plugin 缺失场景，需要更新验收标准。
diff --git a/reports_h100_acceptance_pr_summary_20260523.md b/reports_h100_acceptance_pr_summary_20260523.md
deleted file mode 100644
index 27b6436..0000000
--- a/reports_h100_acceptance_pr_summary_20260523.md
+++ /dev/null
@@ -1,144 +0,0 @@
-# H100 验收分支 PR 摘要 2026-05-23
-
-## 建议 PR 标题
-
-```text
-Add H100 acceptance evidence, multinode NCCL runs, and handoff reports
-```
-
-## PR 结论
-
-本 PR 完成 H100 验收测试侧的阶段性交付：脚本、单节点报告、多节点 NCCL 报告、RDMA 证据、artifacts、checksum、中文说明和交接文档已经齐备。
-
-但本 PR **不表示生产验收通过**。当前两台 H100 节点按现有 PDF/配置口径仍为 `FAIL`，需要网络/硬件/环境侧完成回填或修复后再复跑。
-
-## 变更范围
-
-### 测试入口
-
-- 新增/完善单节点 H100 `test all` 入口。
-- 新增多节点 NCCL PDF matrix 复跑入口。
-- 新增多节点 2x8 六项 collective 复跑入口。
-- 新增 NCCL 深度诊断和环境快照入口。
-
-### 配置
-
-- 固定 NCCL 2.27.7 / nccl-tests 路径的多节点 PDF matrix 配置。
-- 新增 2x8 六项 collective 配置。
-- `allreduce/alltoall` 保留已知 PDF 2x8 阈值；新增的 `broadcast/reducescatter/allgather/sendrecv` 暂按证据采集处理。
-
-### 报告和证据
-
-- 单节点 `test all` 中文汇总。
-- 跨节点 RDMA `mlx5_0` 双向证据。
-- 多节点 NCCL PDF matrix 中文摘要、原始报告、artifacts manifest。
-- 多节点 2x8 六项 collective 中文摘要、原始报告、artifacts manifest。
-- NCCL artifact 信号分析、环境等价性分析、handoff 计划、收尾清单。
-- 网络/硬件/环境侧闭环请求和交付包 manifest。
-
-## 当前验收状态
-
-| 范围 | 结论 | 说明 |
-|---|---|---|
-| 单节点 `test all` | FAIL | 两台均 `6/10 PASS`；Compute、NCCL、Stress、RDMA 未过 |
-| 跨节点 RDMA | FAIL | write BW PASS；read BW 和 latency 未达阈值 |
-| 多节点 NCCL PDF matrix | FAIL | 8 个 case 仅 2x2 allreduce 性能 PASS；所有 case 正确性 OK |
-| 多节点 2x8 六项 collective | FAIL / evidence complete | 6 项正确性 OK；allreduce/alltoall 按 PDF 阈值 FAIL |
-| 环境等价性 | 未证明 | 当前每节点只有 4 条 400G rail，缺外部 NCCL net plugin / SHARP 证据 |
-
-## 关键结果
-
-### 单节点
-
-```text
-aikubeworker0012: 6/10 PASS, PDF acceptance FAIL
-aikubeworker0016: 6/10 PASS, PDF acceptance FAIL
-```
-
-### 跨节点 RDMA
-
-```text
-ib_write_bw: 48.38-49.35 GB/s, PASS
-ib_read_bw: 44.36-44.37 GB/s, FAIL
-ib_write_lat avg: 2.13-2.17 us, FAIL
-ib_read_lat avg: 4.05-4.08 us, FAIL
-```
-
-### 多节点 NCCL PDF matrix
-
-| Topology | AllReduce | Target | Status | AllToAll | Target | Status |
-|---|---:|---:|---|---:|---:|---|
-| 2 nodes x 1 GPU | 47.29 | 48.90 | FAIL | 24.85 | 27.25 | FAIL |
-| 2 nodes x 2 GPUs | 137.16 | 136.93 | PASS | 47.76 | 54.41 | FAIL |
-| 2 nodes x 4 GPUs | 335.07 | 335.48 | FAIL | 72.74 | 73.73 | FAIL |
-| 2 nodes x 8 GPUs | 353.85 | 491.84 | FAIL | 36.83 | 76.54 | FAIL |
-
-所有 NCCL case 均 `returncode=0`、`wrong=0`，当前失败来自性能阈值，不是功能错误。
-
-## 主要风险
-
-1. **不能把本 PR 合并理解为验收通过。**
-   当前结果明确是 `FAIL`，本 PR 交付的是证据链和复跑能力。
-
-2. **PDF 2x8 allreduce 阈值可能要求比当前环境更强的 rail/plugin 能力。**
-   当前每节点仅 4 条 400G IB rail；PDF 2x8 allreduce 目标 `491.84 GB/s busbw` 反推 algbw `262.31 GB/s`，高于 4 x 400G rail 的理论单向原始带宽 `200 GB/s`。
-
-3. **alltoall 需要网络侧继续定位。**
-   `NCCL_PXN_DISABLE=1` 后 rail 更均衡，但 2x8 alltoall 仍只有 `36-37 GB/s`。
-
-4. **单节点门禁也仍未过。**
-   即使多节点 NCCL 后续解决，单节点 Compute、NCCL、Stress、RDMA 四项仍需闭环。
-
-## 验证方式
-
-已完成：
-
-- `git diff --check`
-- 本地与两台远端入口文件 sha256 核对
-- 多节点 NCCL PDF matrix 复跑并归档 artifacts
-- 多节点 2x8 六项 collective 复跑并归档 artifacts
-- 跨节点 RDMA 单 rail 双向测试
-- 单节点 `test all` 汇总
-
-远端同步路径：
-
-```text
-nccl-gpu-1: /root/test_gpu_scripts
-nccl-gpu-2: /root/test_gpu_scripts
-```
-
-## 复跑命令
-
-```bash
-cd /root/test_gpu_scripts
-bash scripts/multinode_nccl_deep_diagnose.sh preflight
-bash scripts/run_multinode_nccl_pdf_matrix.sh
-bash scripts/run_multinode_nccl_all_collectives.sh
-```
-
-单节点复跑：
-
-```bash
-cd /root/test_gpu_scripts
-bash scripts/run_h100_single_node_all.sh
-```
-
-## Reviewer 重点看
-
-| 文件 | 为什么要看 |
-|---|---|
-| `reports_h100_acceptance_current_status_20260523.md` | 当前总览和失败项 |
-| `reports_h100_acceptance_delivery_manifest_20260523.md` | 交付包入口、远端 artifacts、checksum |
-| `reports_h100_network_hardware_escalation_request_20260523.md` | 需要网络/硬件/环境侧回填的问题 |
-| `reports_multinode_nccl_environment_gap_20260523.md` | 为什么当前环境不能证明与 PDF 等价 |
-| `reports_multinode_nccl_pdf_matrix_run_20260523.md` | 多节点 PDF matrix 结果 |
-| `reports_multinode_nccl_all_collectives_run_20260523.md` | 六项 collective 补测结果 |
-
-## 合并建议
-
-可以合并为测试侧交付分支，但合并说明中必须保留：
-
-```text
-当前 H100 生产验收未通过；本分支交付测试证据、复跑脚本和闭环请求。
-最终验收需等待网络/硬件/环境侧确认或修复后复跑。
-```
diff --git a/reports_h100_network_hardware_escalation_request_20260523.md b/reports_h100_network_hardware_escalation_request_20260523.md
deleted file mode 100644
index f4a82d5..0000000
--- a/reports_h100_network_hardware_escalation_request_20260523.md
+++ /dev/null
@@ -1,193 +0,0 @@
-# H100 网络/硬件/环境侧闭环请求 2026-05-23
-
-## 用途
-
-这份文档用于转交给网络、硬件、机房、环境维护同事，目标是把当前 H100 验收剩余 `FAIL` 从“测试侧已复现”推进到“责任侧确认并闭环”。
-
-当前测试侧已经完成单节点 `test all`、跨节点 RDMA、多节点 NCCL PDF matrix、2x8 六项 collective、NCCL artifacts、checksum 和中文报告。当前不能判生产验收通过，剩余问题需要网络/硬件/环境侧确认。
-
-## 需要对方先读的结论
-
-当前两台机器：
-
-| 角色 | 主机名 | 地址 |
-|---|---|---|
-| nccl-gpu-1 | `aikubeworker0012` | `172.72.8.12` |
-| nccl-gpu-2 | `aikubeworker0016` | `172.72.8.16` |
-
-当前主要阻塞：
-
-| 阻塞 | 当前证据 | 需要确认 |
-|---|---|---|
-| 每节点有效 400G IB rail 只有 4 条 | `mlx5_0,mlx5_1,mlx5_6,mlx5_7` | 这是否符合采购/布线/验收预期 |
-| 其他 HCA 不等价 | `mlx5_4/5` 为 100G IB，`mlx5_2/8` 为 25G Ethernet，`mlx5_3/9` DOWN | 是配置问题、线缆/模块问题、交换端口问题，还是设计如此 |
-| 缺外部 NCCL 网络组件 | 未找到 `libnccl-net*.so*`、`libsharp*.so*`，未见 SHARP/HCOLL 包 | PDF 参考环境是否启用这些组件 |
-| 跨节点 RDMA read/latency 未过 | `ib_read_bw` 约 44.36 GB/s，目标 >= 47 GB/s；latency 也未达阈值 | OFED/固件/BIOS/交换网络/perftest 参数是否需要调整 |
-| 2x8 NCCL allreduce 未达 PDF | `353.85 GB/s` vs `491.84 GB/s` | PDF 目标是否要求更多 rail 或 plugin/SHARP |
-| 2x8 NCCL alltoall 未达 PDF | `36.83 GB/s` vs `76.54 GB/s` | 跨 Leaf ECMP/adaptive routing/congestion control 是否影响多点流量 |
-
-## 需要对方回填的问题
-
-### 1. Rail / 端口 / HCA
-
-请逐项回答：
-
-| 问题 | 回答 |
-|---|---|
-| 这两台机器是否设计为每节点 8 条 400G InfiniBand rail？ |  |
-| 如果是，为什么当前只有 `mlx5_0,mlx5_1,mlx5_6,mlx5_7` 是 400G IB ACTIVE？ |  |
-| `mlx5_4`、`mlx5_5` 为什么只有 100G IB？ |  |
-| `mlx5_2`、`mlx5_8` 为什么是 25G Ethernet？ |  |
-| `mlx5_3`、`mlx5_9` 为什么 DOWN？ |  |
-| 当前 HCA 状态是否符合这批机器的采购/交付规格？ |  |
-| 如果不符合，修复动作和预计完成时间是什么？ |  |
-
-建议在两台节点分别执行并回填输出：
-
-```bash
-hostname
-for d in /sys/class/infiniband/mlx5_*; do
-  dev=$(basename "$d")
-  printf "%s state=%s rate=%s link_layer=%s\n" \
-    "$dev" \
-    "$(cat "$d/ports/1/state" 2>/dev/null)" \
-    "$(cat "$d/ports/1/rate" 2>/dev/null)" \
-    "$(cat "$d/ports/1/link_layer" 2>/dev/null)"
-done
-nvidia-smi topo -m
-```
-
-### 2. PDF 参考环境等价性
-
-请确认 PDF 参考环境到底是什么形态：
-
-| 问题 | 回答 |
-|---|---|
-| PDF 参考环境每节点实际参与 NCCL 的 400G rail 数量是多少？ |  |
-| PDF 参考环境的 HCA 列表是否全部为 400G IB ACTIVE？ |  |
-| PDF 结果是在同一 Leaf 内、跨 Leaf，还是其他交换路径下测得？ |  |
-| PDF 是否启用了 adaptive routing / ECMP / congestion control 特定策略？ |  |
-| PDF 是否使用了外部 NCCL net plugin / SHARP / HCOLL / UCX plugin？ |  |
-| 如果当前环境与 PDF 不等价，是否仍要求按 PDF 阈值验收？ |  |
-
-测试侧当前判断：如果 PDF 2x8 allreduce 目标 `491.84 GB/s busbw` 是硬阈值，则按 nccl-tests 的 allreduce 换算关系 `busbw = algbw * 2(n-1)/n`（n=16 时系数为 `1.875`），其反推 algbw 为：
-
-```text
-491.84 / 1.875 = 262.31 GB/s
-```
-
-当前每节点 4 条 400G rail 的理论单向原始带宽约：
-
-```text
-4 * 400Gb/s / 8 = 200 GB/s
-```
-
-因此请明确：当前 4 rail 形态是否允许按 PDF 2x8 allreduce 目标验收。
-
-### 3. NCCL net plugin / SHARP / HCOLL
-
-请逐项回答：
-
-| 问题 | 回答 |
-|---|---|
-| 当前生产验收标准是否要求安装 NCCL net plugin？ |  |
-| 当前生产验收标准是否要求启用 SHARP 或 HCOLL？ |  |
-| 如果要求，安装包来源、版本、安装路径是什么？ |  |
-| 安装后是否需要设置 `LD_LIBRARY_PATH`、`NCCL_NET_PLUGIN`、`NCCL_COLLNET_ENABLE` 等变量？ |  |
-| 如果不要求，是否确认 internal IB plugin 即为验收参考环境？ |  |
-
-建议在两台节点分别执行并回填输出：
-
-```bash
-hostname
-find /usr /opt /root /data -name 'libnccl-net*.so*' -o -name 'libsharp*.so*' 2>/dev/null
-dpkg -l | egrep -i 'sharp|hcoll|nccl|ucx|ofed|doca' || true
-ldconfig -p | egrep -i 'nccl-net|sharp|hcoll|ucx' || true
-```
-
-### 4. 跨节点 RDMA read/latency
-
-当前测试侧证据：
-
-| Direction | Test | Value | Threshold | Status |
-|---|---|---:|---:|---|
-| 0016 -> 0012 | `ib_write_bw` | 49.35 GB/s | >= 47 GB/s | PASS |
-| 0016 -> 0012 | `ib_read_bw` | 44.36 GB/s | >= 47 GB/s | FAIL |
-| 0016 -> 0012 | `ib_write_lat` avg | 2.17 us | <= 2.0 us | FAIL |
-| 0016 -> 0012 | `ib_read_lat` avg | 4.05 us | <= 3.5 us | FAIL |
-| 0012 -> 0016 | `ib_write_bw` | 48.38 GB/s | >= 47 GB/s | PASS |
-| 0012 -> 0016 | `ib_read_bw` | 44.37 GB/s | >= 47 GB/s | FAIL |
-| 0012 -> 0016 | `ib_write_lat` avg | 2.13 us | <= 2.0 us | FAIL |
-| 0012 -> 0016 | `ib_read_lat` avg | 4.08 us | <= 3.5 us | FAIL |
-
-请确认：
-
-| 问题 | 回答 |
-|---|---|
-| 当前 OFED / firmware / BIOS 设置是否符合 400G IB perftest 验收推荐？ |  |
-| read BW 明显低于 write BW 是否符合预期？ |  |
-| 当前 latency 阈值是否适用于跨 Leaf 场景？ |  |
-| 是否需要指定 GID index、MTU、SL、traffic class、PCI relaxed ordering 或其他参数？ |  |
-| 是否能提供网络侧 port counter / credit wait / congestion 证据？ |  |
-
-### 5. alltoall 跨 Leaf 路径
-
-当前测试侧已经做过 NCCL 参数 sweep，`NCCL_PXN_DISABLE=1` 后 rail 更均衡，但 2x8 alltoall 仍只有 `36-37 GB/s`。继续盲调 NCCL 小参数没有明显收益。
-
-请网络侧确认：
-
-| 问题 | 回答 |
-|---|---|
-| 两台机器是否跨 Leaf？ |  |
-| 当前跨 Leaf ECMP hash 是否适合 alltoall 多点到多点流量？ |  |
-| adaptive routing 是否开启？ |  |
-| 是否存在 credit wait、PFC pause、拥塞控制、buffer 或 QoS 策略限制？ |  |
-| 是否能提供 alltoall 运行窗口内的交换机端口 counter？ |  |
-
-## 测试侧可配合复跑的命令
-
-如果网络/硬件/环境侧完成调整，请在 `nccl-gpu-1` 上复跑：
-
-```bash
-cd /root/test_gpu_scripts
-bash scripts/multinode_nccl_deep_diagnose.sh preflight
-bash scripts/run_multinode_nccl_pdf_matrix.sh
-bash scripts/run_multinode_nccl_all_collectives.sh
-```
-
-如果调整了 SHARP/plugin，请额外跑：
-
-```bash
-cd /root/test_gpu_scripts
-OUT_DIR=/root/test_gpu_scripts/reports/nccl_deep_diag_plugin_check_$(date +%Y%m%d_%H%M%S) \
-  bash scripts/multinode_nccl_deep_diagnose.sh graph
-```
-
-如果调整了单节点环境，请分别在两台节点跑：
-
-```bash
-cd /root/test_gpu_scripts
-bash scripts/run_h100_single_node_all.sh
-```
-
-## 测试侧当前交付物
-
-| 文件 | 用途 |
-|---|---|
-| `reports_h100_acceptance_current_status_20260523.md` | 当前总览 |
-| `reports_h100_acceptance_closure_checklist_20260523.md` | 收尾检查清单和关闭条件 |
-| `reports_h100_network_hardware_escalation_request_20260523.md` | 本闭环请求 |
-| `reports_multinode_nccl_environment_gap_20260523.md` | PDF 环境等价性缺口 |
-| `reports_multinode_nccl_handoff_plan_20260523.md` | 复跑和接手计划 |
-| `reports_multinode_nccl_pdf_matrix_run_20260523.md` | 多节点 NCCL PDF matrix 摘要 |
-| `reports_multinode_nccl_all_collectives_run_20260523.md` | 多节点 2x8 六项 collective 摘要 |
-| `reports_rdma_cross_node_mlx5_0_20260523.md` | 跨节点 RDMA 单 rail 证据 |
-
-## 闭环判定
-
-网络/硬件/环境侧需要输出以下任一结论，测试侧才能继续往最终验收推进：
-
-1. **环境修复完成：** 当前两台机器已达到 PDF 参考环境等价状态，请测试侧复跑。
-2. **环境不等价但可接受：** 当前机器规格与 PDF 不同，请按新的阈值/豁免口径复跑；新口径需写入配置或报告。
-3. **硬件/网络异常：** 当前机器或网络不满足交付规格，需要先修复硬件/布线/交换配置。
-4. **参考标准有误：** PDF 阈值不适用于当前场景，需要更新验收标准。
diff --git a/reports_multinode_nccl_16g_2x8_nccl227.md b/reports_multinode_nccl_16g_2x8_nccl227.md
deleted file mode 100644
index 394f191..0000000
--- a/reports_multinode_nccl_16g_2x8_nccl227.md
+++ /dev/null
@@ -1,66 +0,0 @@
-# GPU Test Report
-
-- **Date:** 2026-05-23T07:56:26.791384
-- **Host:** aikubeworker0012
-
-## Overall Acceptance Verdict
-
-**Result: FAIL**
-
-Missing required evidence:
-- GPU Info
-- Health Check
-- Memory Bandwidth
-- Compute Throughput
-- NVLink/NVSwitch
-- NCCL
-- Stress Test
-- RDMA
-- DCGM
-- Training
-
-## Summary
-
-| Test | Result |
-|------|--------|
-| Multi-node NCCL | FAIL |
-
-## Multi-node NCCL / Cross Leaf
-
-Source: nccl-tests-mpirun | Mode: large-message-nccl-2.27.7
-
-- **Hosts:** nccl-gpu-1(172.72.8.12), nccl-gpu-2(172.72.8.16)
-- **Preflight:** PASS
-
-### Multi-node NCCL allreduce
-
-| Topology | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
-|----------|-------------|-----------|------------|-----------|--------|
-| 2 nodes x 8 GPUs NCCL 2.27.7 16G | 237.86 GB/s | 16G | 238.56 GB/s | >= 480 GB/s | FAIL |
-
-| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
-|----------|--------------|-----------------|------------------|-------------------|
-| 2 nodes x 8 GPUs NCCL 2.27.7 16G | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
-
-| Topology | Return Code | Error / Output Tail |
-|----------|-------------|---------------------|
-| 2 nodes x 8 GPUs NCCL 2.27.7 16G | 0 | aikubeworker0016:1019342:1020412 [4] NCCL INFO comm 0x559f14871c30 rank 12 nranks 16 cudaDev 4 busId 9a000 - Destroy COMPLETE # Out of bounds values : 0 OK # Avg bus bandwidth    : 238.555  # # Collective test concluded: all_reduce_perf #   |
-
-### Multi-node NCCL alltoall
-
-| Topology | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
-|----------|-------------|-----------|------------|-----------|--------|
-| 2 nodes x 8 GPUs NCCL 2.27.7 16G | 28.62 GB/s | 16G | 28.62 GB/s | >= 75 GB/s | FAIL |
-
-| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
-|----------|--------------|-----------------|------------------|-------------------|
-| 2 nodes x 8 GPUs NCCL 2.27.7 16G | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
-
-| Topology | Return Code | Error / Output Tail |
-|----------|-------------|---------------------|
-| 2 nodes x 8 GPUs NCCL 2.27.7 16G | 0 | E aikubeworker0016:1020609:1021756 [5] NCCL INFO comm 0x55f920e55d90 rank 13 nranks 16 cudaDev 5 busId ab000 - Destroy COMPLETE # Out of bounds values : 0 OK # Avg bus bandwidth    : 28.6222  # # Collective test concluded: alltoall_perf #   |
-
-**Overall: FAIL**
-
----
-*Generated by GPU Test Suite v0.2.0*
\ No newline at end of file
diff --git a/reports_multinode_nccl_16g_2x8_nccl227_auto.md b/reports_multinode_nccl_16g_2x8_nccl227_auto.md
deleted file mode 100644
index 0481813..0000000
--- a/reports_multinode_nccl_16g_2x8_nccl227_auto.md
+++ /dev/null
@@ -1,66 +0,0 @@
-# GPU Test Report
-
-- **Date:** 2026-05-23T08:09:56.340954
-- **Host:** aikubeworker0012
-
-## Overall Acceptance Verdict
-
-**Result: FAIL**
-
-Missing required evidence:
-- GPU Info
-- Health Check
-- Memory Bandwidth
-- Compute Throughput
-- NVLink/NVSwitch
-- NCCL
-- Stress Test
-- RDMA
-- DCGM
-- Training
-
-## Summary
-
-| Test | Result |
-|------|--------|
-| Multi-node NCCL | FAIL |
-
-## Multi-node NCCL / Cross Leaf
-
-Source: nccl-tests-mpirun | Mode: large-message-nccl-2.27.7-auto
-
-- **Hosts:** nccl-gpu-1(172.72.8.12), nccl-gpu-2(172.72.8.16)
-- **Preflight:** PASS
-
-### Multi-node NCCL allreduce
-
-| Topology | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
-|----------|-------------|-----------|------------|-----------|--------|
-| 2 nodes x 8 GPUs NCCL 2.27.7 auto 16G | 354.60 GB/s | 16G | 354.57 GB/s | >= 480 GB/s | FAIL |
-
-| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
-|----------|--------------|-----------------|------------------|-------------------|
-| 2 nodes x 8 GPUs NCCL 2.27.7 auto 16G | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
-
-| Topology | Return Code | Error / Output Tail |
-|----------|-------------|---------------------|
-| 2 nodes x 8 GPUs NCCL 2.27.7 auto 16G | 0 | 0012:2149404:2149572 [7] NCCL INFO comm 0x560bd3541a30 rank 7 nranks 16 cudaDev 7 busId db000 - Destroy COMPLETE aikubeworker0016:1066162:1066981 [5] NCCL INFO comm 0x55e73208e200 rank 13 nranks 16 cudaDev 5 busId ab000 - Destroy COMPLETE   |
-
-### Multi-node NCCL alltoall
-
-| Topology | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
-|----------|-------------|-----------|------------|-----------|--------|
-| 2 nodes x 8 GPUs NCCL 2.27.7 auto 16G | 30.01 GB/s | 16G | 30.02 GB/s | >= 75 GB/s | FAIL |
-
-| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
-|----------|--------------|-----------------|------------------|-------------------|
-| 2 nodes x 8 GPUs NCCL 2.27.7 auto 16G | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
-
-| Topology | Return Code | Error / Output Tail |
-|----------|-------------|---------------------|
-| 2 nodes x 8 GPUs NCCL 2.27.7 auto 16G | 0 | r0012:2149589:2149764 [7] NCCL INFO comm 0x55fef234b7c0 rank 7 nranks 16 cudaDev 7 busId db000 - Destroy COMPLETE aikubeworker0012:2149588:2149765 [6] NCCL INFO comm 0x5637718f1dd0 rank 6 nranks 16 cudaDev 6 busId ba000 - Destroy COMPLETE   |
-
-**Overall: FAIL**
-
----
-*Generated by GPU Test Suite v0.2.0*
\ No newline at end of file
diff --git a/reports_multinode_nccl_all_collectives_20260523_120144.md b/reports_multinode_nccl_all_collectives_20260523_120144.md
deleted file mode 100644
index 2b1d604..0000000
--- a/reports_multinode_nccl_all_collectives_20260523_120144.md
+++ /dev/null
@@ -1,98 +0,0 @@
-# GPU Test Report
-
-- **Date:** 2026-05-23T12:04:48.257734
-- **Host:** aikubeworker0012
-
-## Overall Acceptance Verdict
-
-**Result: FAIL**
-
-Failed or unverified items:
-- Multi-node NCCL: FAIL
-
-## Summary
-
-| Test | Result |
-|------|--------|
-| Multi-node NCCL | FAIL |
-
-## Multi-node NCCL / Cross Leaf
-
-Source: nccl-tests-mpirun | Mode: cross-leaf-all-collectives-nccl-2.27.7
-
-- **Artifacts:** `/root/test_gpu_scripts/reports/multinode_nccl_all_collectives_20260523_120144_artifacts`
-- **Hosts:** nccl-gpu-1(172.72.8.12), nccl-gpu-2(172.72.8.16)
-- **Preflight:** PASS
-
-### Multi-node NCCL allreduce
-
-| Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
-|----------|----------------------|-------------|-----------|------------|-----------|--------|
-| 2 nodes x 8 GPUs (all collectives evidence run) | - | 354.27 GB/s | 16G | 354.45 GB/s | >= 491.84 GB/s | FAIL |
-
-| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
-|----------|--------------|-----------------|------------------|-------------------|
-| 2 nodes x 8 GPUs (all collectives evidence run) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
-
-| Topology | Return Code | Error / Output Tail |
-|----------|-------------|---------------------|
-| 2 nodes x 8 GPUs (all collectives evidence run) | 0 | nks 16 cudaDev 0 busId 18000 - Destroy COMPLETE aikubeworker0012:2208791:2208941 [0] NCCL INFO comm 0x557970d9f5f0 rank 0 nranks 16 cudaDev 0 busId 18000 - Destroy COMPLETE # Out of bounds values : 0 OK # Avg bus bandwidth    : 354.452  #   |
-
-### Multi-node NCCL alltoall
-
-| Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
-|----------|----------------------|-------------|-----------|------------|-----------|--------|
-| 2 nodes x 8 GPUs (all collectives evidence run) | - | 37.00 GB/s | 16G | 37.14 GB/s | >= 76.54 GB/s | FAIL |
-
-| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
-|----------|--------------|-----------------|------------------|-------------------|
-| 2 nodes x 8 GPUs (all collectives evidence run) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
-
-| Topology | Return Code | Error / Output Tail |
-|----------|-------------|---------------------|
-| 2 nodes x 8 GPUs (all collectives evidence run) | 0 | r0012:2208962:2209141 [5] NCCL INFO comm 0x564c4f9c4a30 rank 5 nranks 16 cudaDev 5 busId ab000 - Destroy COMPLETE aikubeworker0012:2208963:2209143 [6] NCCL INFO comm 0x56328e52f270 rank 6 nranks 16 cudaDev 6 busId ba000 - Destroy COMPLETE   |
-
-### Multi-node NCCL broadcast
-
-| Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
-|----------|----------------------|-------------|-----------|------------|-----------|--------|
-| 2 nodes x 8 GPUs (all collectives evidence run) | - | 191.65 GB/s | 16G | 190.25 GB/s | - | PASS |
-
-| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
-|----------|--------------|-----------------|------------------|-------------------|
-| 2 nodes x 8 GPUs (all collectives evidence run) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
-
-### Multi-node NCCL reducescatter
-
-| Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
-|----------|----------------------|-------------|-----------|------------|-----------|--------|
-| 2 nodes x 8 GPUs (all collectives evidence run) | - | 192.75 GB/s | 16G | 192.74 GB/s | - | PASS |
-
-| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
-|----------|--------------|-----------------|------------------|-------------------|
-| 2 nodes x 8 GPUs (all collectives evidence run) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
-
-### Multi-node NCCL allgather
-
-| Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
-|----------|----------------------|-------------|-----------|------------|-----------|--------|
-| 2 nodes x 8 GPUs (all collectives evidence run) | - | 192.14 GB/s | 16G | 192.47 GB/s | - | PASS |
-
-| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
-|----------|--------------|-----------------|------------------|-------------------|
-| 2 nodes x 8 GPUs (all collectives evidence run) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
-
-### Multi-node NCCL sendrecv
-
-| Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
-|----------|----------------------|-------------|-----------|------------|-----------|--------|
-| 2 nodes x 8 GPUs (all collectives evidence run) | - | 26.98 GB/s | 16G | 26.97 GB/s | - | PASS |
-
-| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
-|----------|--------------|-----------------|------------------|-------------------|
-| 2 nodes x 8 GPUs (all collectives evidence run) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
-
-**Overall: FAIL**
-
----
-*Generated by GPU Test Suite v0.2.0*
\ No newline at end of file
diff --git a/reports_multinode_nccl_all_collectives_20260523_120144_artifacts.sha256 b/reports_multinode_nccl_all_collectives_20260523_120144_artifacts.sha256
deleted file mode 100644
index 0264ba3..0000000
--- a/reports_multinode_nccl_all_collectives_20260523_120144_artifacts.sha256
+++ /dev/null
@@ -1,24 +0,0 @@
-efa4a915bdf4943aef5d88c402c24eb2c60848e5f440f58058a1e99217b07e0d  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/allgather_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.cmd.txt
-020eb35ddc5933da78b5c00c1b6fc25b11b23c4505300276d9736fbe8a35519b  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/allgather_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json
-e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/allgather_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.stderr.txt
-903772b675d9a9f7b04e061a25a90f97bf7844dddb5f3809bc9c501f4d6c783d  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/allgather_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.stdout.txt
-b7ea7350b3703d4b31389d92b375562bd04a50b40fe16a6c8d037b134a51dbd5  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/allreduce_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.cmd.txt
-47f68b7510df3b472e7ac0ec2fb53dcefbe687bb4de0c889f8947cc652d09e61  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/allreduce_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json
-e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/allreduce_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.stderr.txt
-6889180431d639e414e188e1dbc586157565e8506255731b7b38d221d0f72919  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/allreduce_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.stdout.txt
-6ecbd8473d987d2a7839135029902bd629403eb407a7873502a49be26fa1c947  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/alltoall_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.cmd.txt
-fa2828cdfcb86e6715a17c8bf45de10ce421c12f0877efff9bafb218b2f00df3  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/alltoall_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json
-e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/alltoall_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.stderr.txt
-2eae24183754f8d084945d9857b84033ebccf1a2e606931b4f4fc19c5e2e876f  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/alltoall_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.stdout.txt
-277e900dc1efa8f036616226dbc30cb616ba97337e929ad8b1a14c12484867b3  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/broadcast_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.cmd.txt
-077fec1bf498fd202e2866f1cf6fb4502ac8d1bafba156f213453b21f6a6df2b  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/broadcast_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json
-e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/broadcast_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.stderr.txt
-727c69ad6111b891c25360bd9e97ce15f2e7a36d5ff61ae88a7577ecb61c895f  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/broadcast_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.stdout.txt
-8bec99a952eeb26fa3c6d89cbf2331393923fd4f0fae153b8efe3da239c0a09f  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/reducescatter_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.cmd.txt
-be24943eb4b63e304cee41831adeb23ffbbc0e890ff19b067e06d6a4b48b2d90  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/reducescatter_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json
-e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/reducescatter_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.stderr.txt
-a8220b6a4fe3ae037837919a181452e0fc735f58f27fafff07ea431b09b905de  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/reducescatter_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.stdout.txt
-ead794f19e1d2d780cf1840c124b6e0955c70c8b157feb47c4826599d5643b39  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/sendrecv_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.cmd.txt
-4560364922a85d21827357b906491aae8283c6148ff1c0e0f0dc379a68307fdd  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/sendrecv_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json
-e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/sendrecv_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.stderr.txt
-ade548ee5fdbe2d1fce461237b5b713cc2af24e6c2857bbbd73837f28551af27  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/sendrecv_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.stdout.txt
diff --git a/reports_multinode_nccl_all_collectives_20260523_120144_bundle.sha256 b/reports_multinode_nccl_all_collectives_20260523_120144_bundle.sha256
deleted file mode 100644
index 3097f81..0000000
--- a/reports_multinode_nccl_all_collectives_20260523_120144_bundle.sha256
+++ /dev/null
@@ -1,2 +0,0 @@
-06c565281813c4260da9cfee8f0b0289b61b3be95c01dd670c71fa1a441133e3  reports/multinode_nccl_all_collectives_20260523_120144.md
-fa5961d47a5905da6ebc6c726421d73ddc2314a316a8f578683d31fe69c256e5  reports/multinode_nccl_all_collectives_20260523_120144_artifacts.tar.gz
diff --git a/reports_multinode_nccl_all_collectives_artifacts_manifest_20260523_120144.md b/reports_multinode_nccl_all_collectives_artifacts_manifest_20260523_120144.md
deleted file mode 100644
index b1fc9b5..0000000
--- a/reports_multinode_nccl_all_collectives_artifacts_manifest_20260523_120144.md
+++ /dev/null
@@ -1,46 +0,0 @@
-# 多机多卡 NCCL 六项 Collective Artifacts Manifest 2026-05-23
-
-- Remote report: `reports/multinode_nccl_all_collectives_20260523_120144.md`
-- Remote artifact dir: `reports/multinode_nccl_all_collectives_20260523_120144_artifacts`
-- Remote artifact tar: `reports/multinode_nccl_all_collectives_20260523_120144_artifacts.tar.gz`
-- Remote bundle checksum: `reports/multinode_nccl_all_collectives_20260523_120144_bundle.sha256`
-- Remote per-file checksum: `reports/multinode_nccl_all_collectives_20260523_120144_artifacts.sha256`
-- Local report copy: `reports_multinode_nccl_all_collectives_20260523_120144.md`
-- Local artifact tar copy: `/private/tmp/multinode_nccl_all_collectives_20260523_120144_artifacts.tar.gz`
-- Case count: `6`
-- Artifact files: `24`
-
-## Case Summary
-
-| Case | Peak Bus BW | Avg Bus BW | Threshold | Wrong | Return Code | Status |
-|---|---:|---:|---:|---:|---:|---|
-| `allreduce_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run` | 354.27 | 354.45 | 491.84 | 0 | 0 | FAIL |
-| `alltoall_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run` | 37.00 | 37.14 | 76.54 | 0 | 0 | FAIL |
-| `broadcast_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run` | 191.65 | 190.25 | 0.00 | 0 | 0 | PASS |
-| `reducescatter_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run` | 192.75 | 192.74 | 0.00 | 0 | 0 | PASS |
-| `allgather_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run` | 192.14 | 192.47 | 0.00 | 0 | 0 | PASS |
-| `sendrecv_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run` | 26.98 | 26.97 | 0.00 | 0 | 0 | PASS |
-
-## Bundle Checksums
-
-```text
-06c565281813c4260da9cfee8f0b0289b61b3be95c01dd670c71fa1a441133e3  reports/multinode_nccl_all_collectives_20260523_120144.md
-fa5961d47a5905da6ebc6c726421d73ddc2314a316a8f578683d31fe69c256e5  reports/multinode_nccl_all_collectives_20260523_120144_artifacts.tar.gz
-```
-
-## Per-file Checksums
-
-```text
-020eb35ddc5933da78b5c00c1b6fc25b11b23c4505300276d9736fbe8a35519b  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/allgather_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json
-47f68b7510df3b472e7ac0ec2fb53dcefbe687bb4de0c889f8947cc652d09e61  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/allreduce_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json
-fa2828cdfcb86e6715a17c8bf45de10ce421c12f0877efff9bafb218b2f00df3  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/alltoall_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json
-077fec1bf498fd202e2866f1cf6fb4502ac8d1bafba156f213453b21f6a6df2b  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/broadcast_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json
-be24943eb4b63e304cee41831adeb23ffbbc0e890ff19b067e06d6a4b48b2d90  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/reducescatter_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json
-4560364922a85d21827357b906491aae8283c6148ff1c0e0f0dc379a68307fdd  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/sendrecv_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json
-```
-
-完整逐文件 checksum 已保存为：
-
-```text
-reports_multinode_nccl_all_collectives_20260523_120144_artifacts.sha256
-```
diff --git a/reports_multinode_nccl_all_collectives_run_20260523.md b/reports_multinode_nccl_all_collectives_run_20260523.md
deleted file mode 100644
index 9468190..0000000
--- a/reports_multinode_nccl_all_collectives_run_20260523.md
+++ /dev/null
@@ -1,49 +0,0 @@
-# 多机多卡 NCCL 六项 Collective 补测结果 2026-05-23
-
-## 测试对象
-
-- 节点：`nccl-gpu-1(172.72.8.12)` + `nccl-gpu-2(172.72.8.16)`
-- 拓扑：`2 nodes x 8 GPUs`
-- NCCL：`2.27.7`
-- nccl-tests：`/data/nccl-tests-latest/build`
-- 配置：`configs/multinode_nccl_nccl227_all_collectives_2x8.yaml`
-- 入口：`scripts/run_multinode_nccl_all_collectives.sh`
-- 远端报告：`/root/test_gpu_scripts/reports/multinode_nccl_all_collectives_20260523_120144.md`
-- 远端 artifacts：`/root/test_gpu_scripts/reports/multinode_nccl_all_collectives_20260523_120144_artifacts`
-- 本地报告：`reports_multinode_nccl_all_collectives_20260523_120144.md`
-
-## 一句话结论
-
-这次补测已经把单机 `test all` 中的 6 个 NCCL collective 扩展到了多机 2x8 场景：`allreduce/alltoall/broadcast/reducescatter/allgather/sendrecv` 都能跑通，`returncode=0`、`wrong_count=0`，并且都走 `IB + GDRDMA`。按已知 PDF 2x8 阈值，`allreduce` 和 `alltoall` 仍 FAIL；新增的 4 项目前没有 PDF 跨节点阈值，因此只作为证据采集项，不判生产验收性能。
-
-## 结果表
-
-| Operation | Peak Bus BW | Threshold | Correctness | Network | Status |
-|---|---:|---:|---|---|---|
-| allreduce | `354.27 GB/s` | `>= 491.84 GB/s` | `wrong=0` | `IB/GDRDMA` | FAIL |
-| alltoall | `37.00 GB/s` | `>= 76.54 GB/s` | `wrong=0` | `IB/GDRDMA` | FAIL |
-| broadcast | `191.65 GB/s` | 未配置 | `wrong=0` | `IB/GDRDMA` | PASS evidence |
-| reducescatter | `192.75 GB/s` | 未配置 | `wrong=0` | `IB/GDRDMA` | PASS evidence |
-| allgather | `192.14 GB/s` | 未配置 | `wrong=0` | `IB/GDRDMA` | PASS evidence |
-| sendrecv | `26.98 GB/s` | 未配置 | `wrong=0` | `IB/GDRDMA` | PASS evidence |
-
-## 怎么解读
-
-1. 这次不是替代 PDF matrix，而是补齐多机多卡 collective 覆盖面。
-2. `allreduce/alltoall` 继续沿用已知 PDF 2x8 阈值，所以报告整体是 `FAIL`。
-3. `broadcast/reducescatter/allgather/sendrecv` 当前只能证明“多机 2x8 能跑、正确性为 0 wrong、走 IB/GDRDMA”，还不能证明生产性能达标，因为手头 PDF matrix 没给这 4 项跨节点阈值。
-4. 新增 4 项的带宽大致呈现两个层次：
-   - `broadcast/reducescatter/allgather` 在 `191-193 GB/s`，接近当前 4 x 400G rail 的单向原始上限。
-   - `sendrecv` 只有 `26.98 GB/s`，需要结合 sendrecv 的 traffic pattern 单独解读，不能直接和 allreduce busbw 混比。
-
-## 校验信息
-
-```text
-06c565281813c4260da9cfee8f0b0289b61b3be95c01dd670c71fa1a441133e3  reports/multinode_nccl_all_collectives_20260523_120144.md
-020eb35ddc5933da78b5c00c1b6fc25b11b23c4505300276d9736fbe8a35519b  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/allgather_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json
-47f68b7510df3b472e7ac0ec2fb53dcefbe687bb4de0c889f8947cc652d09e61  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/allreduce_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json
-fa2828cdfcb86e6715a17c8bf45de10ce421c12f0877efff9bafb218b2f00df3  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/alltoall_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json
-077fec1bf498fd202e2866f1cf6fb4502ac8d1bafba156f213453b21f6a6df2b  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/broadcast_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json
-be24943eb4b63e304cee41831adeb23ffbbc0e890ff19b067e06d6a4b48b2d90  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/reducescatter_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json
-4560364922a85d21827357b906491aae8283c6148ff1c0e0f0dc379a68307fdd  reports/multinode_nccl_all_collectives_20260523_120144_artifacts/sendrecv_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json
-```
diff --git a/reports_multinode_nccl_alltoall_tuning_20260523.md b/reports_multinode_nccl_alltoall_tuning_20260523.md
deleted file mode 100644
index dcf75c4..0000000
--- a/reports_multinode_nccl_alltoall_tuning_20260523.md
+++ /dev/null
@@ -1,160 +0,0 @@
-# 多机 NCCL 8 卡 alltoall 网络参数 sweep
-
-- 日期：2026-05-23
-- 主机：`aikubeworker0012` / `172.72.8.12`，`aikubeworker0016` / `172.72.8.16`
-- NCCL：临时 `2.27.7+cuda12.4`
-- 测试：2 nodes x 8 GPUs，`alltoall_perf -b 16G -e 16G`
-- HCA：`mlx5_0,mlx5_1,mlx5_6,mlx5_7`
-
-## 结论
-
-`NCCL_PXN_DISABLE=1` 是本轮唯一有效正向参数，可以把 8 卡 alltoall 从约 `30.06 GB/s` 提升到约 `37.24 GB/s`。纳入正式 PDF 矩阵配置后，8 卡 alltoall 原始报告结果为 `36.70 GB/s peak` / `36.74 GB/s avg`。
-
-补充计数器探测显示，`NCCL_PXN_DISABLE=1` 的实际作用是把 alltoall 流量重新均匀分配到 4 条 400G rail 上。baseline 下 `mlx5_0/6` 与 `mlx5_1/7` 的流量约为 3:1；禁用 PXN 后四条 HCA 均衡。但每条 rail 的实际吞吐仍只有约 `19-20 GB/s`，没有打满 400G rail。
-
-复测错误/拥塞 counter 后，没有看到 discard、链路错误、RoCE 重传、slow restart 或 packet sequence error 增长；主要非零异常是部分端口 `port_xmit_wait`。不过 allreduce 对照在 `354 GB/s busbw` 时也会出现同类 `port_xmit_wait`，所以当前不支持“链路坏包/重传导致慢”的判断，也不能只用 `port_xmit_wait` 解释 alltoall 低吞吐。更可能的方向是 NCCL internal alltoall 通信模式效率、交换侧调度/拥塞控制，或缺少 NCCL net plugin/SHARP。
-
-这个提升有实际价值，但仍远低于 PDF 参考 `76.54 GB/s`。在 `NCCL_PXN_DISABLE=1` 之前做过一轮参数 sweep，其他参数没有改善，部分明显变差：
-
-| Case | Avg Bus BW | 结论 |
-|------|------------|------|
-| baseline | `30.0633 GB/s` | 基线 |
-| `NCCL_PXN_DISABLE=1` | `37.2421 GB/s` | 有效提升 |
-| `NCCL_P2P_PXN_LEVEL=0` | `20.1205 GB/s` | 明显变差 |
-| `NCCL_P2P_PXN_LEVEL=1` | `30.0588 GB/s` | 无改善 |
-| `NCCL_P2P_PXN_LEVEL=2` | `30.0437 GB/s` | 无改善 |
-| `NCCL_NET_SHARED_COMMS=0` | `27.3889 GB/s` | 变差 |
-| `NCCL_NET_SHARED_BUFFERS=0` | `28.2389 GB/s` | 变差 |
-| `NCCL_NET_SHARED_COMMS=0 NCCL_NET_SHARED_BUFFERS=0` | `28.2279 GB/s` | 变差 |
-| `NCCL_NCHANNELS_PER_NET_PEER=2` | `30.0281 GB/s` | 无改善 |
-| `NCCL_NCHANNELS_PER_NET_PEER=4` | `29.9802 GB/s` | 无改善 |
-| `NCCL_IB_ADAPTIVE_ROUTING=1 NCCL_IB_AR_THRESHOLD=0` | `30.0526 GB/s` | 无改善 |
-| `NCCL_IB_ADAPTIVE_ROUTING=0` | `30.0535 GB/s` | 无改善 |
-| `NCCL_IB_PCI_RELAXED_ORDERING=0` | 未完成 | 明显异常，不建议 |
-
-在 `NCCL_PXN_DISABLE=1` 作为基线后又补跑了一轮叠加参数 sweep。短测窗口里 `NCCL_NVLS_ENABLE=0`、`NCCL_P2P_NET_CHUNKSIZE=4194304`（4M）有小幅波动式提升，但更长 `-w 10 -n 10` 复测没有复现，不能作为稳定优化项。
-
-| Case | Avg Bus BW | 结论 |
-|------|------------|------|
-| `NCCL_PXN_DISABLE=1` | `37.0069 GB/s` | 短测基线 |
-| `+ NCCL_NVLS_ENABLE=0` | `37.2217 GB/s` | 小幅波动，不稳定 |
-| `+ NCCL_P2P_NET_CHUNKSIZE=4194304` | `37.2522 GB/s` | 小幅波动，不稳定 |
-| `+ NCCL_BUFFSIZE=8388608` | `37.0911 GB/s` | 无实质改善 |
-| `+ NCCL_MIN_NCHANNELS=16 NCCL_MAX_NCHANNELS=16` | `37.0189 GB/s` | 无实质改善 |
-| `+ NCCL_IB_AR_THRESHOLD=0` | `37.0843 GB/s` | 无实质改善 |
-| `+ NCCL_IB_QPS_PER_CONNECTION=4 NCCL_IB_SPLIT_DATA_ON_QPS=0` | `35.9847 GB/s` | 变差 |
-| `+ NCCL_IB_QPS_PER_CONNECTION=4 NCCL_IB_SPLIT_DATA_ON_QPS=1` | `29.8406 GB/s` | 明显变差 |
-| `+ NCCL_IB_QPS_PER_CONNECTION=8 NCCL_IB_SPLIT_DATA_ON_QPS=1` | `24.1183 GB/s` | 明显变差 |
-| `+ NCCL_NCHANNELS_PER_NET_PEER=8` | `29.8904 GB/s` | 明显变差 |
-
-长测复核：
-
-| Case | Avg Bus BW | 结论 |
-|------|------------|------|
-| `NCCL_PXN_DISABLE=1` | `32.7280 GB/s` | 当前窗口基线下滑 |
-| `+ NCCL_P2P_NET_CHUNKSIZE=4194304` | `31.9340 GB/s` | 未复现短测提升 |
-| `+ NCCL_NVLS_ENABLE=0 NCCL_P2P_NET_CHUNKSIZE=4194304` | `27.6585 GB/s` | 明显变差 |
-
-补充 ENV/INIT/NET 日志确认，性能波动时仍是 NCCL `2.27.7+cuda12.4`、4 条 400G HCA、GDR enabled、internal IB plugin；不是退回旧 NCCL、HCA 选择错误或 GDR 失效。
-
-## NCCL GRAPH/TUNING 对照
-
-为避免只看带宽结果，补抓了 allreduce 与 PXN disabled alltoall 的 `NCCL_DEBUG_SUBSYS=INIT,NET,GRAPH,TUNING,COLL` 日志。该日志采样使用短迭代，只用于看 NCCL 图和通道选择，不作为性能结论。
-
-共同点：
-
-| 观察项 | allreduce | alltoall + `NCCL_PXN_DISABLE=1` |
-|--------|-----------|----------------------------------|
-| NCCL version | `2.27.7+cuda12.4` | `2.27.7+cuda12.4` |
-| HCA | `mlx5_0,mlx5_1,mlx5_6,mlx5_7` | `mlx5_0,mlx5_1,mlx5_6,mlx5_7` |
-| GDR | enabled | enabled |
-| external net plugin | missing, internal IB | missing, internal IB |
-| channels | `16 coll / 16 nvls / 16 p2p` | `16 coll / 16 nvls / 16 p2p` |
-| p2p channels per peer | `2` | `2` |
-| P2P chunk | `131072` | `131072` |
-
-差异：
-
-| 观察项 | allreduce | alltoall + `NCCL_PXN_DISABLE=1` |
-|--------|-----------|----------------------------------|
-| Pattern 4 | `crossNic 0`, `type NVL/PXN`, `nChannels 8` | `crossNic 2`, `type NVL/PIX`, `nChannels 8` |
-| `NET/IB/*/GDRDMA` channel edge lines | `256` | `512` |
-| `P2P/CUMEM` channel edge lines | `0` | `224` |
-| total NET/P2P channel edge lines | `256` | `736` |
-
-判断：PXN disabled 后 4 条 IB/GDRDMA rail 都仍被使用，且通道数没有少；但 alltoall 的 NCCL graph 明显更复杂，并混入大量本机 `P2P/CUMEM` 路径。这个结果进一步支持：剩余差距不是 HCA/GDR 基础环境没有生效，而是 alltoall collective graph、P2P/NET 组合方式、internal IB plugin 能力或交换网络策略的问题。
-
-## PXN disabled 端口计数器
-
-`NCCL_PXN_DISABLE=1` 后，8 卡 alltoall 输出：
-
-| Metric | Value |
-|--------|-------|
-| `algbw` | `39.37 / 39.46 GB/s` |
-| `busbw` | `36.91 / 37.00 GB/s` |
-| `Avg bus bandwidth` | `36.9518 GB/s` |
-
-端口计数器：
-
-| Host | HCA | Xmit GB | Recv GB | Xmit GB/s | Recv GB/s |
-|------|-----|---------|---------|-----------|-----------|
-| 172.72.8.12 | `mlx5_0` | `590.98` | `590.91` | `19.82` | `19.82` |
-| 172.72.8.12 | `mlx5_1` | `590.98` | `590.98` | `19.82` | `19.82` |
-| 172.72.8.12 | `mlx5_6` | `590.98` | `590.90` | `19.82` | `19.82` |
-| 172.72.8.12 | `mlx5_7` | `590.98` | `590.98` | `19.82` | `19.82` |
-| 172.72.8.16 | `mlx5_0` | `590.94` | `590.98` | `19.82` | `19.82` |
-| 172.72.8.16 | `mlx5_1` | `590.94` | `590.98` | `19.82` | `19.82` |
-| 172.72.8.16 | `mlx5_6` | `590.94` | `590.98` | `19.82` | `19.82` |
-| 172.72.8.16 | `mlx5_7` | `590.94` | `590.98` | `19.82` | `19.82` |
-
-对比 baseline：
-
-| Case | Rail 分布 | Avg Bus BW |
-|------|-----------|------------|
-| baseline | `mlx5_0/6` 约 `885 GB`，`mlx5_1/7` 约 `295 GB` | `30.04 GB/s` |
-| `NCCL_PXN_DISABLE=1` | 四条 HCA 均约 `591 GB` | `36.95 GB/s` |
-
-### 错误/等待 counter 复测
-
-PXN disabled 复测结果：
-
-| 观察项 | 结果 |
-|--------|------|
-| `Avg bus bandwidth` | `36.4512 GB/s` |
-| 每条 HCA 流量 | 约 `712.18-712.28 GiB`，四条 rail 均衡 |
-| discard / rcv error / symbol error / link down / link recovery | `0` 增量 |
-| RoCE retrans / slow restart / packet sequence error / out of sequence | `0` 增量 |
-| `port_xmit_wait` | `mlx5_1`、`mlx5_7` 有增长，约 `15.65M-23.49M` |
-
-allreduce 对照：
-
-| 观察项 | 结果 |
-|--------|------|
-| `Avg bus bandwidth` | `354.366 GB/s` |
-| 每条 HCA 流量 | 约 `178.03-178.07 GiB`，四条 rail 均衡 |
-| 错误/重传类 counter | `0` 增量 |
-| `port_xmit_wait` | `mlx5_1`、`mlx5_7` 有增长，约 `6.11M-6.59M` |
-
-## 正式配置更新
-
-`configs/multinode_nccl_nccl227_pdf_matrix.yaml` 已对 2 nodes x 8 GPUs 的 alltoall 增加：
-
-```yaml
-op_env:
-  alltoall:
-    NCCL_PXN_DISABLE: 1
-```
-
-正式矩阵报告：`reports_multinode_nccl_pdf_matrix_nccl227.md`
-
-| Topology | alltoall Peak Bus BW | alltoall Avg Bus BW | PDF Reference | Status |
-|----------|----------------------|---------------------|---------------|--------|
-| 2 nodes x 8 GPUs | `36.70 GB/s` | `36.74 GB/s` | `76.54 GB/s` | FAIL |
-
-## 判断
-
-1. PXN 在当前拓扑下对 8 卡 alltoall 有负面影响，禁用后有约 `22-24%` 提升。
-2. 禁用 PXN 可以修复 rail 分布不均衡，但无法打满每条 400G rail。
-3. PXN disabled 基线上继续叠加 NVLS、P2P chunk、buffer、channel、QP/split、AR 等参数，没有稳定收益；QP/split 和 `NCCL_NCHANNELS_PER_NET_PEER=8` 反而明显变差。
-4. 禁用 PXN 后仍只有 PDF 目标的一半左右，剩余差距不是单一 NCCL 环境变量可以补齐。
-5. 后续重点仍应放在 NCCL net plugin/SHARP、交换网络策略和 NCCL internal alltoall 实现效率；`port_xmit_wait` 需要结合 allreduce 对照解读，不能单独作为 alltoall 根因。
diff --git a/reports_multinode_nccl_artifact_signal_analysis_20260523.md b/reports_multinode_nccl_artifact_signal_analysis_20260523.md
deleted file mode 100644
index 1d8bc64..0000000
--- a/reports_multinode_nccl_artifact_signal_analysis_20260523.md
+++ /dev/null
@@ -1,141 +0,0 @@
-# 多机多卡 NCCL Artifacts 信号分析 2026-05-23
-
-## 分析对象
-
-- 本地 artifacts 解包目录：`/private/tmp/nccl_artifacts_113803/multinode_nccl_pdf_matrix_20260523_113803_artifacts`
-- 远端原始报告：`/root/test_gpu_scripts/reports/multinode_nccl_pdf_matrix_20260523_113803.md`
-- 远端 artifacts：`/root/test_gpu_scripts/reports/multinode_nccl_pdf_matrix_20260523_113803_artifacts`
-- 远端 artifacts tar：`/root/test_gpu_scripts/reports/multinode_nccl_pdf_matrix_20260523_113803_artifacts.tar.gz`
-- 本地 manifest：`reports_multinode_nccl_pdf_matrix_artifacts_manifest_20260523_113803.md`
-
-这份文档只看最新正式 PDF matrix 复跑产生的原始 `cmd/stdout/stderr/json`，目的是回答：当前多机多卡 NCCL 是否真的走了 IB/GDRDMA，是否用到了正确 HCA，是否有 SHARP/外部 NCCL net plugin 信号，以及 2x8 失败更像卡在哪一层。
-
-## 一句话结论
-
-最新 artifacts 证明本轮多机多卡测试不是 launch 失败、不是回退 TCP、不是 GDRDMA 没开，也不是 HCA 名字选错；所有 case 都走 `IB`，都识别并启用了 `mlx5_0,mlx5_1,mlx5_6,mlx5_7` 这 4 条 400G rail，NCCL 正确性 `wrong=0`。当前主要缺口仍然是：环境没有外部 NCCL net plugin / SHARP 证据，且 2x8 档位的 PDF 阈值明显高于当前 4 rail 环境可解释能力，alltoall 还存在独立的跨 Leaf 多点通信效率问题。
-
-## Artifacts 信号表
-
-| Case | Peak | Threshold | Status | Plugin missing | NET/IB using | Using network IB | HCA set | GDR HCA set | GDRDMA edges | P2P/CUMEM | SHARP/CollNet | stdout KB |
-|---|---:|---:|---|---:|---:|---:|---|---|---:|---:|---:|---:|
-| allreduce_2x1 1_GPU | 47.29 | 48.90 | FAIL | 2 | 2 | 2 | mlx5_0,mlx5_1,mlx5_6,mlx5_7 | mlx5_0,mlx5_1,mlx5_6,mlx5_7 | 16 | 0 | 0 | 24 |
-| allreduce_2x2 2_GPUs | 137.16 | 136.93 | PASS | 4 | 4 | 4 | mlx5_0,mlx5_1,mlx5_6,mlx5_7 | mlx5_0,mlx5_1,mlx5_6,mlx5_7 | 32 | 32 | 0 | 68 |
-| allreduce_2x4 4_GPUs | 335.07 | 335.48 | FAIL | 8 | 8 | 8 | mlx5_0,mlx5_1,mlx5_6,mlx5_7 | mlx5_0,mlx5_1,mlx5_6,mlx5_7 | 256 | 0 | 0 | 259 |
-| allreduce_2x8 8_GPUs | 353.85 | 491.84 | FAIL | 16 | 16 | 16 | mlx5_0,mlx5_1,mlx5_6,mlx5_7 | mlx5_0,mlx5_1,mlx5_6,mlx5_7 | 256 | 0 | 0 | 410 |
-| alltoall_2x1 1_GPU | 24.85 | 27.25 | FAIL | 2 | 2 | 2 | mlx5_0,mlx5_1,mlx5_6,mlx5_7 | mlx5_0,mlx5_1,mlx5_6,mlx5_7 | 8 | 0 | 0 | 19 |
-| alltoall_2x2 2_GPUs | 47.76 | 54.41 | FAIL | 4 | 4 | 4 | mlx5_0,mlx5_1,mlx5_6,mlx5_7 | mlx5_0,mlx5_1,mlx5_6,mlx5_7 | 24 | 8 | 0 | 52 |
-| alltoall_2x4 4_GPUs | 72.74 | 73.73 | FAIL | 8 | 8 | 8 | mlx5_0,mlx5_1,mlx5_6,mlx5_7 | mlx5_0,mlx5_1,mlx5_6,mlx5_7 | 80 | 48 | 0 | 200 |
-| alltoall_2x8 8_GPUs | 36.83 | 76.54 | FAIL | 16 | 16 | 16 | mlx5_0,mlx5_1,mlx5_6,mlx5_7 | mlx5_0,mlx5_1,mlx5_6,mlx5_7 | 512 | 224 | 0 | 603 |
-
-字段解释：
-
-- `Plugin missing`：日志里的 `NET/Plugin: Could not find: none libnccl-net-none.so.` 次数。当前命令显式设置了 `NCCL_NET_PLUGIN=none`，所以这个信号表示没有使用外部 NCCL net plugin，而不是 NCCL 没有网络。
-- `NET/IB using`：日志里的 `NET/IB : Using ...` 次数，说明每个 rank 初始化时看到的 IB HCA 列表。
-- `Using network IB`：NCCL 最终选择了 `IB` 网络。
-- `GDR HCA set`：出现 `GPU Direct RDMA Enabled for HCA ...` 的 HCA 集合。
-- `GDRDMA edges`：NCCL graph/connection 中经由 `NET/IB/*/GDRDMA` 的跨节点边数量。
-- `P2P/CUMEM`：节点内 GPU 间路径信号，不是跨节点 IB。
-- `SHARP/CollNet`：日志中 `SHARP`、`CollNet`、`HCOLL` 相关信号计数。当前为 0。
-
-## 已排除的问题
-
-### 1. 不是 TCP 回退
-
-所有 8 个 case 都有 `Using network IB`，且每个 rank 均有 `NET/IB : Using ...`。这说明 NCCL 通信路径不是 socket/TCP 回退。
-
-### 2. 不是 HCA 名字选错
-
-所有 case 的 HCA 集合都一致：
-
-```text
-mlx5_0, mlx5_1, mlx5_6, mlx5_7
-```
-
-这与当前配置里的 `NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_6,mlx5_7` 一致，也与前面环境快照中确认的 4 条 400G IB rail 一致。
-
-### 3. 不是 GDRDMA 没开
-
-所有 case 都出现 `GPU Direct RDMA Enabled for HCA ...`，并且跨节点连接里有 `NET/IB/*/GDRDMA` 边。2x8 alltoall 甚至有 512 条 `GDRDMA/Shared` 边，所以不能简单判断为 GDRDMA 被关掉。
-
-### 4. 不是 NCCL 正确性失败
-
-最新 manifest 中 8 个 case 全部：
-
-```text
-returncode = 0
-wrong_count = 0
-```
-
-因此当前 FAIL 是严格 PDF 性能阈值失败，不是结果错误。
-
-## 仍然成立的缺口
-
-### 1. 外部 NCCL net plugin / SHARP 仍缺证据
-
-当前命令中显式设置：
-
-```text
-NCCL_NET_PLUGIN=none
-```
-
-所有 case 均出现 `NET/Plugin: Could not find: none libnccl-net-none.so.`，同时 `SHARP/CollNet` 信号计数为 0。结合前面的环境检查没有找到 `libnccl-net*.so*` / `libsharp*.so*`，当前环境不能证明与 PDF 参考环境的软件栈等价。
-
-### 2. 2x8 allreduce 更像被 4 rail 物理能力卡住
-
-2x8 allreduce：
-
-```text
-当前 busbw = 353.85 GB/s
-PDF 阈值 = 491.84 GB/s
-```
-
-16 rank allreduce 的换算关系是：
-
-```text
-busbw = algbw * 1.875
-```
-
-当前实测反推：
-
-```text
-353.85 / 1.875 = 188.72 GB/s algbw
-```
-
-当前每节点 4 条 400G rail 的理论单向原始带宽约：
-
-```text
-4 * 400 Gb/s / 8 = 200 GB/s
-```
-
-所以 allreduce 已经接近 4 rail 的可解释上限；如果 PDF 阈值来自更多 400G rail 或带 SHARP/plugin 的环境，当前节点不应直接按该阈值判死。
-
-### 3. 2x8 alltoall 是独立重点问题
-
-2x8 alltoall：
-
-```text
-当前 busbw = 36.83 GB/s
-PDF 阈值 = 76.54 GB/s
-```
-
-alltoall 和 allreduce 使用同一组 HCA，同样走 IB/GDRDMA，但 2x8 alltoall 下降明显。这个现象更像多点到多点流量在当前跨 Leaf 网络、ECMP/adaptive routing、拥塞控制或 NCCL graph 策略下效率不够，而不是单纯 HCA 没起来。
-
-## 下一步建议
-
-1. 先不要继续盲扫 NCCL 小参数。已有 artifacts 说明基础链路已经起来，继续微调环境变量的收益大概率很低。
-2. 向硬件/网络侧确认 PDF 参考环境每节点是否有 8 条 400G rail，以及是否启用了 SHARP、HCOLL 或外部 NCCL net plugin。
-3. 如果验收坚持 PDF 原阈值，应先补齐 plugin/SHARP 或换等价 8 rail 节点复测。
-4. 如果当前硬件形态就是 4 条 400G rail，则 allreduce 阈值应重新定标；alltoall 单独作为跨 Leaf 多点通信效率问题继续排查。
-5. 补齐 plugin/SHARP 后，优先复跑：
-
-```bash
-cd /root/test_gpu_scripts
-bash scripts/run_multinode_nccl_pdf_matrix.sh
-```
-
-并对比新旧 artifacts 中：
-
-- `Plugin missing` 是否消失。
-- 是否出现外部 net plugin、SHARP 或 CollNet 信号。
-- 2x8 allreduce 是否突破当前 `353-354 GB/s` 平台。
-- 2x8 alltoall 是否突破当前 `36-37 GB/s` 平台。
diff --git a/reports_multinode_nccl_counter_probe_20260523.md b/reports_multinode_nccl_counter_probe_20260523.md
deleted file mode 100644
index 9e42251..0000000
--- a/reports_multinode_nccl_counter_probe_20260523.md
+++ /dev/null
@@ -1,209 +0,0 @@
-# 多机 NCCL 8 卡链路计数器探测
-
-- 日期：2026-05-23
-- 主机：`aikubeworker0012` / `172.72.8.12`，`aikubeworker0016` / `172.72.8.16`
-- NCCL：临时 `2.27.7+cuda12.4`
-- HCA：`mlx5_0,mlx5_1,mlx5_6,mlx5_7`
-- HCA 速率：每节点 4 x 400Gb/s NDR，理论单向合计约 `200 GB/s`
-
-## 结论
-
-8 卡 allreduce 的 NCCL `algbw` 已经到 `189 GB/s` 左右，接近当前每节点 4 条 400G rail 的理论单向合计 `200 GB/s`。因此 PDF 参考的 `491.84 GB/s busbw` 对应 `262 GB/s algbw`，在当前 4 x 400G rail 形态下不太可能达到，除非实际可用跨节点 rail 数量或网络能力高于当前节点暴露的 4 条 400G。
-
-裸 RDMA 并发 perftest 也验证了这 4 条 400G rail 本身可以同时工作：4 个 HCA 并发 `ib_write_bw` 合计 `1476.95 Gb/s`，即 `184.62 GB/s`。这与 NCCL 8 卡 allreduce 换算出的 `189 GB/s algbw` 一致，说明 allreduce 已经接近裸网络可用带宽。
-
-8 卡 alltoall 仍只有 `30 GB/s busbw`，不是 HCA 顺序导致。HCA 顺序 sweep 都稳定在 `30.02-30.07 GB/s`。计数器显示 alltoall 流量主要压在 `mlx5_0` 和 `mlx5_6` 上（各约 `885 GB`），`mlx5_1` 和 `mlx5_7` 的流量只有前两者的约三分之一（各约 `295 GB`），说明剩余问题更像 NCCL alltoall rail 分布、路由、拥塞、NCCL net plugin/SHARP 或网络侧策略问题。
-
-补充测试显示，`NCCL_PXN_DISABLE=1` 可以把 alltoall 流量均匀分配到四条 HCA，并将 busbw 提升到约 `36.5-37.0 GB/s`。不过每条 400G rail 仍只有约 `19-20 GB/s`，没有达到裸 RDMA 单 rail 能力。
-
-进一步抓 `counters`/`hw_counters` 后，未看到 discard、CRC/符号错误、packet sequence error、RoCE retrans、slow restart 等错误类计数增长；只看到部分端口 `port_xmit_wait` 增长。对照 allreduce 后发现，allreduce 在 `354 GB/s busbw` 时也会出现同类 `port_xmit_wait`，因此 `port_xmit_wait` 不是 alltoall 低吞吐的充分解释，只能说明发送侧存在等待。剩余问题更像 NCCL internal alltoall 通信模式、交换网络调度/拥塞控制、或缺少 NCCL net plugin/SHARP 能力。
-
-## 裸 RDMA 4 rail 并发
-
-命令类型：
-
-```bash
-ib_write_bw -d <mlx5_X> -i 1 -p <port> -s 4194304 -n 5000 -F --report_gbits
-```
-
-结果：
-
-| HCA | BW average |
-|-----|------------|
-| `mlx5_0` | `387.16 Gb/s` |
-| `mlx5_1` | `387.07 Gb/s` |
-| `mlx5_6` | `355.02 Gb/s` |
-| `mlx5_7` | `347.70 Gb/s` |
-| Total | `1476.95 Gb/s` / `184.62 GB/s` |
-
-## 8 卡 allreduce
-
-NCCL 输出：
-
-| Metric | Value |
-|--------|-------|
-| `algbw` | `189.16 / 189.07 GB/s` |
-| `busbw` | `354.68 / 354.52 GB/s` |
-| `Avg bus bandwidth` | `354.597 GB/s` |
-
-allreduce busbw 换算关系约为：
-
-```text
-busbw = algbw * 2 * (nranks - 1) / nranks
-      = algbw * 1.875  # nranks=16
-```
-
-因此：
-
-| 项 | busbw | 换算 algbw |
-|----|-------|------------|
-| 当前测试 | `354.60 GB/s` | `189.12 GB/s` |
-| PDF 参考 | `491.84 GB/s` | `262.31 GB/s` |
-
-当前 `189.12 GB/s algbw` 已接近 4 条 400G rail 的理论单向总带宽（`4 x 400 Gb/s = 1600 Gb/s`，即 `200 GB/s`）。
-
-### allreduce counter 对照
-
-对同样 2 nodes x 8 GPUs、同样 4 条 HCA 的 16G allreduce 复测 counter：
-
-| Metric | Value |
-|--------|-------|
-| `algbw` | `189.22 / 188.77 GB/s` |
-| `busbw` | `354.79 / 353.94 GB/s` |
-| `Avg bus bandwidth` | `354.366 GB/s` |
-
-流量分布：
-
-| Host | HCA | Xmit GiB | Recv GiB |
-|------|-----|----------|----------|
-| aikubeworker0012 | `mlx5_0` | `178.07` | `178.03` |
-| aikubeworker0012 | `mlx5_1` | `178.07` | `178.07` |
-| aikubeworker0012 | `mlx5_6` | `178.07` | `178.03` |
-| aikubeworker0012 | `mlx5_7` | `178.07` | `178.07` |
-| aikubeworker0016 | `mlx5_0` | `178.03` | `178.07` |
-| aikubeworker0016 | `mlx5_1` | `178.07` | `178.07` |
-| aikubeworker0016 | `mlx5_6` | `178.03` | `178.07` |
-| aikubeworker0016 | `mlx5_7` | `178.07` | `178.07` |
-
-错误类 counter 增量同样为 `0`，非零等待类 counter 为：
-
-| Host | HCA | `port_xmit_wait` delta |
-|------|-----|------------------------|
-| aikubeworker0012 | `mlx5_1` | `6,555,518` |
-| aikubeworker0012 | `mlx5_7` | `6,325,059` |
-| aikubeworker0016 | `mlx5_1` | `6,585,965` |
-| aikubeworker0016 | `mlx5_7` | `6,112,874` |
-
-判断：allreduce 在达到当前 4 x 400G rail 物理上限附近时也会出现 `port_xmit_wait`，所以这个 counter 不能单独解释 alltoall 只有 `36-37 GB/s`。alltoall 的问题更偏向通信模式效率或网络调度策略，而不是简单链路错误。
-
-## 8 卡 alltoall
-
-NCCL 输出：
-
-| Metric | Value |
-|--------|-------|
-| `algbw` | `32.04 / 32.05 GB/s` |
-| `busbw` | `30.03 / 30.04 GB/s` |
-| `Avg bus bandwidth` | `30.0389 GB/s` |
-
-同一测试窗口内，端口计数器增量显示流量不均衡：
-
-| Host | HCA | Xmit GB | Recv GB |
-|------|-----|---------|---------|
-| 172.72.8.12 | `mlx5_0` | `885.54` | `885.51` |
-| 172.72.8.12 | `mlx5_1` | `295.19` | `295.19` |
-| 172.72.8.12 | `mlx5_6` | `885.53` | `885.51` |
-| 172.72.8.12 | `mlx5_7` | `295.19` | `295.19` |
-| 172.72.8.16 | `mlx5_0` | `885.51` | `885.54` |
-| 172.72.8.16 | `mlx5_1` | `295.19` | `295.19` |
-| 172.72.8.16 | `mlx5_6` | `885.51` | `885.53` |
-| 172.72.8.16 | `mlx5_7` | `295.19` | `295.19` |
-
-## HCA 顺序 sweep
-
-8 卡 alltoall 对 HCA 顺序不敏感：
-
-| `NCCL_IB_HCA` | Avg Bus BW |
-|---------------|------------|
-| `mlx5_0,mlx5_1,mlx5_6,mlx5_7` | `30.0367 GB/s` |
-| `mlx5_0,mlx5_6,mlx5_1,mlx5_7` | `30.0696 GB/s` |
-| `mlx5_0,mlx5_7,mlx5_1,mlx5_6` | `30.0397 GB/s` |
-| `mlx5_1,mlx5_0,mlx5_7,mlx5_6` | `30.0413 GB/s` |
-| `mlx5_6,mlx5_7,mlx5_0,mlx5_1` | `30.0230 GB/s` |
-
-## PXN disabled alltoall 计数器
-
-`NCCL_PXN_DISABLE=1` 后：
-
-| Metric | Value |
-|--------|-------|
-| `Avg bus bandwidth` | `36.9518 GB/s` |
-| 每条 HCA 流量 | 约 `590.94-590.98 GB` |
-| 每条 HCA 吞吐 | 约 `19.82 GB/s` |
-| 每节点 4 HCA 合计吞吐 | 约 `79.29 GB/s` |
-
-判断：禁用 PXN 可以修复 rail 分布不均衡，但不能让 alltoall 打满当前 4 条 400G rail。
-
-### PXN disabled 错误/拥塞 counter 复测
-
-复测命令仍为 2 nodes x 8 GPUs，`alltoall_perf -b 16G -e 16G -w 10 -n 10`，并使用：
-
-```bash
-NCCL_PXN_DISABLE=1
-NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_6,mlx5_7
-NCCL_NET_PLUGIN=none
-NCCL_NET_GDR_LEVEL=5
-NCCL_NET_GDR_READ=1
-NCCL_DMABUF_ENABLE=0
-```
-
-NCCL 输出：
-
-| Metric | Value |
-|--------|-------|
-| `algbw` | `39.04 / 38.72 GB/s` |
-| `busbw` | `36.60 / 36.30 GB/s` |
-| `Avg bus bandwidth` | `36.4512 GB/s` |
-
-流量分布保持均衡：
-
-| Host | HCA | Xmit GiB | Recv GiB |
-|------|-----|----------|----------|
-| aikubeworker0012 | `mlx5_0` | `712.28` | `712.19` |
-| aikubeworker0012 | `mlx5_1` | `712.27` | `712.27` |
-| aikubeworker0012 | `mlx5_6` | `712.28` | `712.18` |
-| aikubeworker0012 | `mlx5_7` | `712.27` | `712.27` |
-| aikubeworker0016 | `mlx5_0` | `712.23` | `712.27` |
-| aikubeworker0016 | `mlx5_1` | `712.23` | `712.27` |
-| aikubeworker0016 | `mlx5_6` | `712.23` | `712.27` |
-| aikubeworker0016 | `mlx5_7` | `712.23` | `712.27` |
-
-错误类 counter 增量：
-
-| Counter group | Result |
-|---------------|--------|
-| `port_xmit_discards`, `port_rcv_errors`, `port_rcv_remote_physical_errors`, `port_rcv_switch_relay_errors` | `0` |
-| `symbol_error`, `link_error_recovery`, `link_downed`, `local_link_integrity_errors`, `excessive_buffer_overrun_errors` | `0` |
-| `roce_adp_retrans`, `roce_adp_retrans_to`, `roce_slow_restart*` | `0` |
-| `packet_seq_err`, `out_of_sequence`, `out_of_buffer`, `duplicate_request`, `implied_nak_seq_err` | `0` |
-| `local_ack_timeout_err`, `req_transport_retries_exceeded`, `rnr_nak_retry_err` | `0` |
-
-非零等待类 counter：
-
-| Host | HCA | `port_xmit_wait` delta |
-|------|-----|------------------------|
-| aikubeworker0012 | `mlx5_1` | `23,492,853` |
-| aikubeworker0012 | `mlx5_7` | `17,420,720` |
-| aikubeworker0016 | `mlx5_1` | `20,428,901` |
-| aikubeworker0016 | `mlx5_7` | `15,650,027` |
-
-判断：PXN disabled 后 alltoall 没有明显链路错误、重传或丢包证据。结合 allreduce 对照，`port_xmit_wait` 只能作为发送等待信号，不能单独解释 alltoall 低吞吐；剩余性能缺口更偏向 NCCL internal alltoall 在当前拓扑下的通信模式效率、交换网络调度/拥塞控制，或外部 NCCL net plugin/SHARP 缺失。
-
-## 判断
-
-1. 裸 RDMA 4 rail 可以并发跑到约 `184.62 GB/s`，网络基础带宽不是单 rail 瓶颈。
-2. 8 卡 allreduce 当前不是软件参数小调能解决的问题，性能已经贴近当前 4 条 400G rail 的物理带宽上限。
-3. 8 卡 alltoall 仍明显异常，且不是 HCA 顺序问题；PXN disabled 后 rail 已均衡，`port_xmit_wait` 不是 alltoall 独有，需要继续从 NCCL alltoall 模式、交换机侧策略、NCCL net plugin/SHARP 排查。
-4. `NCCL_PXN_DISABLE=1` 可改善 8 卡 alltoall 的 rail 均衡性和性能，但无法补齐到 PDF 目标。
-5. 如果验收必须达到 PDF 的 2 机 16 卡 `491.84/76.54 GB/s`，需要确认当前两台机器是否具备与 PDF 参考环境同等的有效跨节点 rail 数量和交换网络能力。
-6. 两台机器当前均未发现 `libnccl-net.so` 或 SHARP/HCOLL 包，NCCL 使用 internal IB plugin；如果目标值依赖 NCCL net plugin/SHARP，需要先补齐对应运行环境。
diff --git a/reports_multinode_nccl_deep_diagnose_run_20260523.md b/reports_multinode_nccl_deep_diagnose_run_20260523.md
deleted file mode 100644
index a96c20d..0000000
--- a/reports_multinode_nccl_deep_diagnose_run_20260523.md
+++ /dev/null
@@ -1,125 +0,0 @@
-# 多节点 NCCL 深度诊断复跑报告 2026-05-23
-
-## 执行信息
-
-- 发起节点：`aikubeworker0012`
-- 对端节点：`aikubeworker0016`
-- 测试规模：2 节点 x 8 GPU
-- NCCL：`2.27.7+cuda12.4`
-- nccl-tests：`/data/nccl-tests-latest/build`
-- OpenMPI：`/usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun`
-- 远端产物目录：`/root/test_gpu_scripts/reports/nccl_deep_diag_20260523_103932`
-- 诊断脚本：`scripts/multinode_nccl_deep_diagnose.sh all`
-
-## Preflight
-
-两台机器均通过轻量环境检查：
-
-| 项目 | aikubeworker0012 | aikubeworker0016 |
-|---|---:|---:|
-| OpenMPI | `4.1.9a1` | `4.1.9a1` |
-| `all_reduce_perf` | OK | OK |
-| `alltoall_perf` | OK | OK |
-| `mlx5_0` | 400 Gb/sec ACTIVE | 400 Gb/sec ACTIVE |
-| `mlx5_1` | 400 Gb/sec ACTIVE | 400 Gb/sec ACTIVE |
-| `mlx5_6` | 400 Gb/sec ACTIVE | 400 Gb/sec ACTIVE |
-| `mlx5_7` | 400 Gb/sec ACTIVE | 400 Gb/sec ACTIVE |
-
-## 16G 核心结果
-
-| 测试 | 配置 | Avg Bus BW | 结论 |
-|---|---|---:|---|
-| allreduce | 自动参数 | `354.025 GB/s` | 稳定复现当前高位基线 |
-| alltoall | `NCCL_PXN_DISABLE=1` | `36.9377 GB/s` | 稳定复现当前瓶颈基线 |
-| graph allreduce | `NCCL_DEBUG=INFO` | `354.224 GB/s` | 与 counter run 一致 |
-| graph alltoall | `NCCL_PXN_DISABLE=1`, `NCCL_DEBUG=INFO` | `37.14 GB/s` | 与 counter run 一致 |
-
-对 PDF 目标的含义：
-
-- 2x8 allreduce 仍明显低于 PDF 2 机 16 GPU 目标 `491.84 GB/s`。
-- 2x8 alltoall 仍明显低于 PDF 2 机 16 GPU 目标 `76.54 GB/s`。
-- 本轮没有发现能把 8 卡 alltoall 推出 `36-37 GB/s` 平台的参数。
-
-## Counter 观察
-
-### Rail 流量
-
-allreduce 每条 rail 发送流量约 `178.03-178.07 GiB`，alltoall + PXN disabled 每条 rail 发送流量约 `712.23-712.28 GiB`。四条 400G rail 在两类测试中都均衡。
-
-### 错误/拥塞类计数
-
-本轮未看到 discard、symbol error、RoCE retrans、slow restart、packet sequence error 等硬错误增长。
-
-有增长的是 `port_xmit_wait`：
-
-| 测试 | 计数增长 |
-|---|---|
-| allreduce | `aikubeworker0016 mlx5_1 +6725565`, `mlx5_7 +6103180` |
-| alltoall + PXN disabled | `aikubeworker0016 mlx5_1 +20988680`, `mlx5_7 +16271960` |
-
-这说明 `port_xmit_wait` 不是 alltoall 独有现象；高吞吐 allreduce 也会出现。它可以作为交换网络/credit 等待的信号继续给网络侧看，但不能单独解释 alltoall 低带宽。
-
-## GRAPH/TUNING 对照
-
-| 观察项 | allreduce | alltoall + `NCCL_PXN_DISABLE=1` |
-|---|---:|---:|
-| `avg_busbw` | `354.224` | `37.14` |
-| `plugin_missing` | `16` | `16` |
-| GDR enabled lines | `1344` | `704` |
-| channel summary | `16 coll / 16 nvls / 16 p2p` | `16 coll / 16 nvls / 16 p2p` |
-| Pattern 4 | `crossNic 0`, `NVL/PXN` | `crossNic 2`, `NVL/PIX` |
-| `NET/IB/*/GDRDMA` lines | `256` | `512` |
-| `P2P/CUMEM` lines | `0` | `224` |
-| total NET/P2P edge lines | `256` | `736` |
-
-解释：
-
-- HCA、GDR、NCCL 版本和基础 channel 数量不是差异根因。
-- alltoall 的通信图明显更复杂，引入更多 NET/P2P 边，且 Pattern 4 从 allreduce 的 `NVL/PXN` 变成 `NVL/PIX`。
-- 这继续支持问题偏向 NCCL alltoall 图策略、internal IB plugin、缺少外部 `libnccl-net.so`/SHARP，或交换网络策略，而不是单纯链路坏、HCA 不通、GDR 没开。
-
-## PXN Disabled Sweep
-
-基线均为 `NCCL_PXN_DISABLE=1`，16G，2x8 GPU。
-
-| Case | 额外参数 | Avg Bus BW |
-|---|---|---:|
-| baseline | 无 | `36.8024` |
-| nvls_off | `NCCL_NVLS_ENABLE=0` | `36.8095` |
-| qps4_split1 | `NCCL_IB_QPS_PER_CONNECTION=4 NCCL_IB_SPLIT_DATA_ON_QPS=1` | `30.5464` |
-| qps8_split1 | `NCCL_IB_QPS_PER_CONNECTION=8 NCCL_IB_SPLIT_DATA_ON_QPS=1` | `23.9345` |
-| qps4_split0 | `NCCL_IB_QPS_PER_CONNECTION=4 NCCL_IB_SPLIT_DATA_ON_QPS=0` | `35.8679` |
-| channels16 | `NCCL_MIN_NCHANNELS=16 NCCL_MAX_NCHANNELS=16` | `37.1776` |
-| buff8m | `NCCL_BUFFSIZE=8388608` | `37.0265` |
-| p2pchunk4m | `NCCL_P2P_NET_CHUNKSIZE=4194304` | `37.0188` |
-| netpeer8 | `NCCL_NCHANNELS_PER_NET_PEER=8` | `31.103` |
-| ar0 | `NCCL_IB_AR_THRESHOLD=0` | `36.9965` |
-
-结论：
-
-- `channels16`、`buff8m`、`p2pchunk4m`、`ar0` 相对 baseline（`36.8024`）只有约 0.5-1.0% 的波动，不能视为有效优化。
-- `qps4_split1`、`qps8_split1`、`netpeer8` 明显负向。
-- 当前 8 卡 alltoall 不建议套用 PDF 固定 QP/split 参数。
-
-## 脚本修正验证
-
-复跑后发现脚本在 GRAPH 模式后会把 `NCCL_DEBUG=INFO` 继承到 sweep，导致 sweep 日志过大；同时 OpenMPI 会对未设置的 `-x` 变量打印 warning。
-
-已修正：
-
-- `set_common_env` 每个 case 重置到默认 `NCCL_DEBUG=WARN`。
-- `mpi_xargs` 只导出已经设置的环境变量。
-
-验证方式：
-
-- 本地 `bash -n scripts/multinode_nccl_deep_diagnose.sh` 通过。
-- 远端 1M tiny `all` 冒烟测试通过。
-- tiny 产物中 `could not find environment variable` 计数为 `0`。
-
-## 当前判断
-
-1. allreduce 的高位基线稳定，2x8 仍在 `354 GB/s` 左右。
-2. alltoall 即使 PXN disabled 并且 rail 均衡，也只能稳定在 `36-37 GB/s`。
-3. 未发现明显坏链路、重传、丢包、HCA 不通或 GDR disabled。
-4. 当前 4 条 400G rail 的硬件形态与 PDF 目标疑似不等价；PDF 2x8 allreduce 目标 `491.84 GB/s` busbw 反推需要约 `262 GB/s` algbw（`491.84 / 1.875`），超过当前 4 rail 约 `200 GB/s` 的单向理论上限。
-5. alltoall 还需要从 NCCL net plugin/SHARP、交换机路径/ECMP/拥塞控制、以及 NCCL alltoall 图策略侧继续排。
diff --git a/reports_multinode_nccl_diagnosis_20260523.md b/reports_multinode_nccl_diagnosis_20260523.md
deleted file mode 100644
index 6e769b5..0000000
--- a/reports_multinode_nccl_diagnosis_20260523.md
+++ /dev/null
@@ -1,500 +0,0 @@
-# 多机多卡 NCCL 诊断报告
-
-- 日期：2026-05-23
-- 测试入口：`nccl-gpu-1` / `aikubeworker0012` / `172.72.8.12`
-- 对端节点：`nccl-gpu-2` / `aikubeworker0016` / `172.72.8.16`
-- 诊断配置：`configs/multinode_nccl_nccl227_auto_16g.yaml`
-- 当前最佳原始脚本报告：`reports_multinode_nccl_16g_2x8_nccl227_auto.md`
-
-## 当前结论
-
-这不是单纯 “IB 不通” 的问题。底层 CUDA RDMA perftest 可以跑到接近单端口 400Gb/s 的水平；最初使用 pip 包里的 NCCL 2.21.5 时，NCCL 在实际 2 节点通信中把 GPU Direct RDMA 禁用了，导致带宽显著偏低。
-
-后续临时切换到 apt 包解压出的 NCCL 2.27.7+cuda12.4 后，NCCL GDR 已经恢复启用，2 节点 x 8 GPU allreduce 从 `67.42 GB/s` 提升到 `237.86 GB/s`，alltoall 从 `9.56 GB/s` 提升到 `28.62 GB/s`。
-
-继续 tuning 后发现，配置里固定的 `NCCL_MIN_NCHANNELS=4`、`NCCL_IB_QPS_PER_CONNECTION=4`、`NCCL_IB_SPLIT_DATA_ON_QPS=1` 会明显压低 16G allreduce。去掉这些固定参数、让 NCCL 2.27.7 自动选择后，正式脚本报告中 2 节点 x 8 GPU allreduce 提升到 `354.60 GB/s`，alltoall 小幅提升到 `30.01 GB/s`。当前剩余问题不再是 GDR disabled，而是 GDR enabled 且 NCCL 自动调参后，仍低于当前配置里的验收阈值。
-
-按 `sx算力节点跨Leaf NCCL测试报告.pdf` 的矩阵继续对齐后，发现 2 机 4 卡档位的核心问题是默认 GPU 选择不符合 GPU-NIC 亲和性。显式选择 `CUDA_VISIBLE_DEVICES=0,1,4,5` 后，2 机 4 卡 allreduce 可以恢复到 `333-335 GB/s` 区间，接近 PDF 的 `335.48 GB/s`；alltoall 配合 PDF 固定 NCCL 参数可到 `72.93 GB/s`，接近 PDF 的 `73.73 GB/s`。但 2 机 8 卡档位仍只有 allreduce `354.02 GB/s`、alltoall `30.04 GB/s`，与 PDF 的 `491.84/76.54 GB/s` 差距明显。
-
-进一步 sweep 8 卡 alltoall 网络参数后，`NCCL_PXN_DISABLE=1` 是唯一有效正向项。正式矩阵配置已对 2 机 8 GPU 的 alltoall 单独加入该变量，8 卡 alltoall 从约 `30.04 GB/s` 提升到 `36.70 GB/s` peak / `36.74 GB/s` avg，但仍低于 PDF 参考 `76.54 GB/s`。复测端口 counter 后，PXN disabled 下 4 条 rail 的流量已均衡，且没有明显链路错误、丢包、RoCE 重传或 slow restart；同类 `port_xmit_wait` 在高吞吐 allreduce 中也会出现，因此它不是 alltoall 低吞吐的充分解释。继续在 PXN disabled 基线上叠加 NVLS、P2P chunk、buffer、channel、QP/split、AR 等参数，没有稳定收益。NCCL GRAPH/TUNING 日志显示 alltoall 的 channel graph 比 allreduce 复杂很多，且混入大量本机 `P2P/CUMEM` 路径，但 HCA/GDR/channel 基础状态一致。剩余差距更像 NCCL internal alltoall 通信模式效率、交换网络策略，或缺少 NCCL net plugin/SHARP 能力。
-
-同时，`nccl-gpu-2` 的 SSH 入口曾因未认证连接过多触发 `MaxStartups` 随机拒绝，导致 `mpirun` 拉起远端 rank 失败。已经做了临时 SSHD 缓解并拿到有效的 2 节点 x 8 GPU allreduce/alltoall 报告。
-
-## 已完成的修正
-
-1. 修正 `mpirun` 使用路径，避开系统 `/usr/bin/mpirun` 与 DOCA OpenMPI 动态库混用导致的崩溃。
-2. 补充 `LD_LIBRARY_PATH`，确保 `mpirun`、CUDA、pip 安装的 NCCL 动态库可同时解析。
-3. 将 NCCL HCA 限定到 400Gb/s 活跃端口：`mlx5_0,mlx5_1,mlx5_6,mlx5_7`。
-4. 在脚本中加入 multi-node NCCL 网络诊断解析，报告会展示 `NCCL Network`、`GPU Direct RDMA`、`GDR Disabled HCAs`。
-5. 增加 `multinode_nccl.extra_env`，可以在配置里快速试 NCCL 环境变量，不需要改代码。
-6. 增加诊断配置 `configs/multinode_nccl_diagnostic.yaml`，固定跑 2 节点 x 8 GPU、256M、`NCCL_DEBUG=INFO` 和 `NCCL_DEBUG_SUBSYS=INIT,NET`。
-7. 在 `nccl-gpu-2` 上临时提高 SSHD `MaxStartups` 并缩短 `LoginGraceTime`，缓解未认证连接过多导致的 SSH 随机拒绝。
-8. 将 OpenMPI OOB TCP 控制通道固定到 `bond0`，并加入 `plm_rsh_args`，减少 `mpirun` 远端启动受 SSH/host key/接口选择影响的概率。
-9. 从 NVIDIA apt 源下载但不安装 `libnccl2=2.27.7-1+cuda12.4`，解压到两台机器 `/tmp/nccl-2.27.7-cuda12.4`，用 `LD_LIBRARY_PATH` 临时覆盖 NCCL 运行库验证。
-10. 增强报告解析，能够区分 `GPU Direct RDMA ENABLED` 和 `DISABLED`，并列出 enabled/disabled HCA。
-11. 将 multi-node NCCL 配置中的 `qps_per_connection`、`min_nchannels`、`split_data_on_qps` 改为 `null`，避免默认导出会压低大包 allreduce 的固定 NCCL 参数。
-12. 增加 topology 级 `cuda_visible_devices`、`env`、`op_env` 配置能力，支持按 GPU/NIC 亲和性和不同 NCCL op 分别设置环境变量。
-13. 生成 PDF 矩阵式原始报告 `reports_multinode_nccl_pdf_matrix_nccl227.md`，覆盖 2 机 1/2/4/8 GPU per node。
-14. 对 8 卡 alltoall 做 NCCL 网络参数 sweep，并将有效项 `NCCL_PXN_DISABLE=1` 固化到 PDF 矩阵配置。
-15. 对 PXN disabled 后的 8 卡 alltoall 抓取 `counters`/`hw_counters` 增量，确认 rail 已均衡且无明显错误/重传。
-16. 对同样 2x8 allreduce 抓 counter 对照，确认高吞吐 allreduce 也会出现 `port_xmit_wait`，因此该 counter 不是 alltoall 低吞吐的唯一根因。
-17. 在 PXN disabled 基线上继续 sweep NVLS、P2P chunk、buffer、channel、QP/split、AR 等参数，确认没有稳定收益，部分参数明显变差。
-18. 抓取 allreduce 与 PXN disabled alltoall 的 `GRAPH/TUNING/COLL` 日志，确认两者 HCA/GDR/channel 基础状态一致，但 alltoall graph 明显更复杂。
-
-## 关键证据
-
-### 1. CUDA RDMA perftest 通过
-
-命令类型：
-
-```bash
-CUDA_VISIBLE_DEVICES=0 ib_write_bw -d mlx5_0 -i 1 --use_cuda=0 -s 4194304 -F --report_gbits 172.72.8.16
-```
-
-结果：
-
-| 测试 | 设备 | GPU | 平均带宽 | 结论 |
-|------|------|-----|----------|------|
-| `ib_write_bw --use_cuda` | `mlx5_0` | GPU0 | `387.16 Gb/s` | PASS |
-
-解释：GPU 内存参与 RDMA 写带宽测试可以接近 400Gb/s，说明 `nvidia_peermem`/经典 GPUDirect RDMA 路径并非完全不可用。
-
-### 2. CUDA DMA-BUF 路径不可用
-
-命令类型：
-
-```bash
-CUDA_VISIBLE_DEVICES=0 ib_write_bw -d mlx5_0 -i 1 --use_cuda=0 --use_cuda_dmabuf -s 4194304 -F --report_gbits 172.72.8.16
-```
-
-结果：
-
-| 测试 | 输出 | 结论 |
-|------|------|------|
-| `ib_write_bw --use_cuda_dmabuf` | `DMA-BUF is not supported on this GPU` | FAIL |
-
-解释：当前环境不能走 CUDA DMA-BUF RDMA。后续 NCCL 应优先确认是否能稳定走经典 `nvidia_peermem` 路径。
-
-### 3. NCCL 单卡跨节点仍禁用 GDR
-
-使用 pip NCCL 2.21.5 时，单卡跨节点（2 节点 x 1 GPU）的 NCCL 测试中 GDR 仍被禁用。
-
-已经尝试：
-
-- `NCCL_NET_GDR_LEVEL=SYS`
-- `NCCL_NET_GDR_LEVEL=5`
-- `NCCL_NET_GDR_READ=1`
-- `NCCL_DMABUF_ENABLE=0`
-- `NCCL_IB_CUDA_SUPPORT=1`
-- `NCCL_IB_HCA=mlx5_0`
-
-结果仍显示：
-
-```text
-NCCL INFO Using network IB
-NCCL INFO NET/IB : GPU Direct RDMA Disabled for HCA 0 'mlx5_0'
-```
-
-256M allreduce 约 `13.4 GB/s`，明显低于 400Gb/s IB 端口能力。
-
-### 3.1 NCCL 2.27.7 恢复 GDR
-
-临时使用：
-
-```bash
-LD_LIBRARY_PATH=/usr/mpi/gcc/openmpi-4.1.9a1/lib:/tmp/nccl-2.27.7-cuda12.4/usr/lib/x86_64-linux-gnu:/usr/local/cuda-12.4/targets/x86_64-linux/lib
-```
-
-2 节点 x 1 GPU 日志显示：
-
-```text
-NCCL version 2.27.7+cuda12.4
-NET/IB : GPU Direct RDMA Enabled for HCA 0 'mlx5_0'
-Channel ... via NET/IB/0/GDRDMA
-```
-
-256M allreduce 从 NCCL 2.21.5 的约 `13.4 GB/s` 提升到 `45.2 GB/s`。判断：NCCL 2.21.5 与当前 driver/OFED/H100 组合存在 GDR 判定或注册路径兼容问题；升级 NCCL 是有效修复方向。
-
-### 4. 脚本 2 节点 x 8 GPU 诊断结果
-
-原始报告：`reports_multinode_nccl_diagnostic_2x8_sshfix.md`，使用 pip NCCL 2.21.5。
-
-| Operation | Topology | Peak Bus BW | Threshold | Status | NCCL Network | GPU Direct RDMA |
-|-----------|----------|-------------|-----------|--------|--------------|-----------------|
-| allreduce | 2 nodes x 8 GPUs | `67.42 GB/s` | `>= 480 GB/s` | FAIL | IB | DISABLED |
-| alltoall | 2 nodes x 8 GPUs | `9.56 GB/s` | `>= 75 GB/s` | FAIL | IB | DISABLED |
-
-allreduce 失败原因是带宽不达标，且报告捕获到 GDR 被 NCCL 禁用：
-
-| GDR Disabled HCAs |
-|-------------------|
-| `mlx5_0, mlx5_1, mlx5_6, mlx5_7` |
-
-allreduce 和 alltoall 本轮均正常完成，`returncode=0`、`wrong=0`，失败原因是带宽低于阈值，不是正确性失败。
-
-### 4.1 NCCL 2.27.7 诊断结果
-
-256M 诊断报告：`reports_multinode_nccl_diagnostic_2x8_nccl227_v2.md`
-
-| Operation | Topology | Peak Bus BW | Threshold | Status | NCCL Network | GPU Direct RDMA |
-|-----------|----------|-------------|-----------|--------|--------------|-----------------|
-| allreduce | 2 nodes x 8 GPUs | `212.19 GB/s` | `>= 480 GB/s` | FAIL | IB | ENABLED |
-| alltoall | 2 nodes x 8 GPUs | `28.37 GB/s` | `>= 75 GB/s` | FAIL | IB | ENABLED |
-
-1M 到 4G sweep 报告：`reports_multinode_nccl_sweep_2x8_nccl227.md`
-
-| Operation | Peak Bus BW | Peak Size | Threshold | Status | GPU Direct RDMA |
-|-----------|-------------|-----------|-----------|--------|-----------------|
-| allreduce | `237.26 GB/s` | `4G` | `>= 480 GB/s` | FAIL | ENABLED |
-| alltoall | `28.78 GB/s` | `1G` | `>= 75 GB/s` | FAIL | ENABLED |
-
-16G 大包报告：`reports_multinode_nccl_16g_2x8_nccl227.md`
-
-| Operation | Peak Bus BW | Peak Size | Threshold | Status | GPU Direct RDMA |
-|-----------|-------------|-----------|-----------|--------|-----------------|
-| allreduce | `237.86 GB/s` | `16G` | `>= 480 GB/s` | FAIL | ENABLED |
-| alltoall | `28.62 GB/s` | `16G` | `>= 75 GB/s` | FAIL | ENABLED |
-
-解释：NCCL 2.27.7 已经修复 GDR 禁用问题，且性能提升明显；但在固定 `min_nchannels=4/qps=4/split=1` 的配置下仍不达标。allreduce 约稳定在 `238 GB/s`，alltoall 约稳定在 `28-29 GB/s`。
-
-### 4.2 NCCL 2.27.7 自动通道/QP 参数结果
-
-进一步对 16G 大包做 tuning，发现默认配置里锁定的参数会压低 allreduce：
-
-| 配置 | allreduce Avg Bus BW | alltoall Avg Bus BW | 结论 |
-|------|----------------------|---------------------|------|
-| NCCL 2.27.7 + 固定 `min_nchannels=4/qps=4/split=1` | `238.56 GB/s` | `28.62 GB/s` | GDR 已启用，但 allreduce 被压低 |
-| NCCL 2.27.7 + NCCL 自动选择 channel/QP | `354.57 GB/s` | `30.02 GB/s` | 当前最佳脚本结果 |
-
-正式脚本报告：`reports_multinode_nccl_16g_2x8_nccl227_auto.md`
-
-| Operation | Peak Bus BW | Avg Bus BW | Peak Size | Threshold | Status | GPU Direct RDMA |
-|-----------|-------------|------------|-----------|-----------|--------|-----------------|
-| allreduce | `354.60 GB/s` | `354.57 GB/s` | `16G` | `>= 480 GB/s` | FAIL | ENABLED |
-| alltoall | `30.01 GB/s` | `30.02 GB/s` | `16G` | `>= 75 GB/s` | FAIL | ENABLED |
-
-对比临时 tuning 命令：
-
-| 变量组合 | allreduce Avg Bus BW | alltoall Avg Bus BW |
-|----------|----------------------|---------------------|
-| baseline auto | `353.63 GB/s` | `30.05 GB/s` |
-| `NCCL_IB_MERGE_NICS=1` | `352.73 GB/s` | `30.07 GB/s` |
-| `NCCL_CROSS_NIC=1` | `354.68 GB/s` | `30.05 GB/s` |
-| `NCCL_IB_QPS_PER_CONNECTION=8` + `NCCL_IB_SPLIT_DATA_ON_QPS=0` | `350.91 GB/s` | `29.41 GB/s` |
-| `NCCL_MIN_NCHANNELS=16` + `NCCL_MAX_NCHANNELS=16` | `354.32 GB/s` | `30.06 GB/s` |
-
-解释：allreduce 的主要提升来自取消不合适的固定参数，而不是 `MERGE_NICS` 或 `CROSS_NIC`。alltoall 对这些参数不敏感，当前基本稳定在 `30 GB/s` 左右。
-
-### 5. SSHD MaxStartups 阻塞已临时缓解
-
-`nccl-gpu-2` 曾显示：
-
-```text
-sshd: /usr/sbin/sshd -D [listener] 52 of 10-100 startups
-maxstartups 10:30:100
-```
-
-同时存在大量 `sshd: unknown [priv]` / `sshd: unknown [net]` 未认证连接，来源主要是 `172.239.10.85`。这会触发 OpenSSH `MaxStartups` 随机拒绝，直接表现为：
-
-```text
-kex_exchange_identification: Connection closed by remote host
-```
-
-先临时改为：
-
-```text
-MaxStartups 120:30:240
-LoginGraceTime 20
-```
-
-后续外部未认证连接继续上涨到 `110 of 120-240 startups`，测试窗口进一步临时改为：
-
-```text
-MaxStartups 500:30:1000
-LoginGraceTime 5
-```
-
-改完后从 0012 连续 SSH 0016 5 次成功，2 节点 `mpirun hostname` 成功，2 节点 x 8 GPU allreduce/alltoall 也都能跑出有效结果。
-
-### 6. `nvidia_peermem` legacy 模式实验无效
-
-两台机器默认参数一致：
-
-| 参数 | 值 |
-|------|----|
-| `nvidia_peermem` version | `580.159.03` |
-| `peerdirect_support` | `0` |
-| `persistent_api_support` | `1` |
-| OFED | `OFED-internal-26.01-1.0.0` |
-
-临时切换两台机器到 `peerdirect_support=1` 后，2 节点 x 1 GPU NCCL 仍显示：
-
-```text
-NET/IB : GPU Direct RDMA Disabled for HCA 0 'mlx5_0'
-```
-
-带宽仍约 `13.4 GB/s`。测试后已经恢复默认 `peerdirect_support=0,persistent_api_support=1`。
-
-### 7. PDF 矩阵对齐与 GPU-NIC 亲和性
-
-参考 PDF 的跨 Leaf 命令覆盖 2 机共 2/4/8/16 卡（即每节点 1/2/4/8 卡）的矩阵，并使用：
-
-- `NCCL_IB_GID_INDEX=3`
-- `NCCL_IB_SL=5`
-- `NCCL_IB_TC=136`
-- `NCCL_SOCKET_IFNAME=bond0`
-- `NCCL_IB_TIMEOUT=22`
-- `NCCL_NET_PLUGIN=none`
-- `NCCL_NVLS_ENABLE=1`
-
-本环境与 PDF 参考机器有一个关键硬件差异：当前两台机器只有 `mlx5_0,mlx5_1,mlx5_6,mlx5_7` 是 400Gb/s NDR；`mlx5_4,mlx5_5` 是 100Gb/s HDR；`mlx5_2,mlx5_8` 是 25Gb/s；`mlx5_3,mlx5_9` 为 DOWN。参考 PDF 的命令列出了更多 HCA，但当前节点不能等价使用为 8 条 400G rail。
-
-`nvidia-smi topo -m` 显示：
-
-| GPU | 最近的 400G HCA |
-|-----|-----------------|
-| GPU0 | `mlx5_0` |
-| GPU1 | `mlx5_1` |
-| GPU4 | `mlx5_6` |
-| GPU5 | `mlx5_7` |
-
-默认 2 机 4 卡会选择 GPU0/1/2/3，其中 GPU2 最近的是 25G/down 端口，GPU3 没有直接对应 400G rail。因此 2 机 4 卡默认 allreduce 只有约 `168 GB/s`。显式设置 `CUDA_VISIBLE_DEVICES=0,1,4,5` 后：
-
-| 场景 | allreduce | alltoall | 说明 |
-|------|-----------|----------|------|
-| 默认 GPU0/1/2/3 | `167.89 GB/s` | `39.68 GB/s` | GPU/NIC 亲和性错误 |
-| `CUDA_VISIBLE_DEVICES=0,1,4,5` + auto NCCL | `335.34 GB/s` | `63.90 GB/s` | allreduce 接近 PDF |
-| `CUDA_VISIBLE_DEVICES=0,1,4,5` + PDF 固定参数 | `225.29 GB/s` | `73.10 GB/s` | alltoall 接近 PDF，但 allreduce 被压低 |
-
-因此当前脚本支持按 op 配环境变量：4 卡 allreduce 用 auto，4 卡 alltoall 用 PDF 固定参数。
-
-矩阵式正式报告：`reports_multinode_nccl_pdf_matrix_nccl227.md`
-
-| Topology | allreduce | PDF Reference | Status | alltoall | PDF Reference | Status |
-|----------|-----------|---------------|--------|----------|---------------|--------|
-| 2 nodes x 1 GPU | `47.26 GB/s` | `48.90 GB/s` | FAIL | `24.87 GB/s` | `27.25 GB/s` | FAIL |
-| 2 nodes x 2 GPUs | `136.36 GB/s` | `136.93 GB/s` | FAIL | `47.69 GB/s` | `54.41 GB/s` | FAIL |
-| 2 nodes x 4 GPUs | `333.23 GB/s` | `335.48 GB/s` | FAIL | `72.82 GB/s` | `73.73 GB/s` | FAIL |
-| 2 nodes x 8 GPUs | `353.47 GB/s` | `491.84 GB/s` | FAIL | `36.70 GB/s` | `76.54 GB/s` | FAIL |
-
-解释：2 机 4 卡档位已经基本定位并修复到接近 PDF；2 机 8 卡档位不是简单 GPU 顺序问题。尝试调整 8 卡 `CUDA_VISIBLE_DEVICES` 顺序、加入 100G/25G active HCA、以及套 PDF 固定参数都没有改善；固定参数反而会把 8 卡 allreduce 从约 `354 GB/s` 压到约 `239 GB/s`。
-
-8 卡 alltoall 目前的最佳软件侧改动是 `NCCL_PXN_DISABLE=1`：
-
-| Case | 8 卡 alltoall Avg Bus BW |
-|------|--------------------------|
-| baseline | `30.06 GB/s` |
-| `NCCL_PXN_DISABLE=1` | `37.24 GB/s` |
-| 正式矩阵报告 | `36.74 GB/s` |
-
-其他变量如 `NCCL_P2P_PXN_LEVEL`、`NCCL_NET_SHARED_COMMS`、`NCCL_NET_SHARED_BUFFERS`、`NCCL_NCHANNELS_PER_NET_PEER`、`NCCL_IB_ADAPTIVE_ROUTING` 均无改善或变差。
-
-PXN disabled 计数器显示该参数确实修复了 rail 分布：
-
-| Case | Rail 分布 | Avg Bus BW |
-|------|-----------|------------|
-| baseline | `mlx5_0/6` 约 `885 GB`，`mlx5_1/7` 约 `295 GB` | `30.04 GB/s` |
-| `NCCL_PXN_DISABLE=1` | 四条 HCA 均约 `591 GB` | `36.95 GB/s` |
-
-但禁用 PXN 后每条 400G rail 仍只有约 `19-20 GB/s`（约 `152-160 Gb/s`），没有接近裸 RDMA 单 rail 的 `347-387 Gb/s`。因此它只解决了 rail 分布不均衡这一部分问题，并没有解决全部 alltoall 性能问题。
-
-复测 PXN disabled alltoall 时继续抓 `counters`/`hw_counters`：
-
-| 观察项 | 结果 |
-|--------|------|
-| alltoall `Avg bus bandwidth` | `36.4512 GB/s` |
-| 每条 HCA 流量 | 约 `712.18-712.28 GiB`，四条 rail 均衡 |
-| discard / rcv error / symbol error / link down / link recovery | `0` 增量 |
-| RoCE retrans / slow restart / packet sequence error / out of sequence | `0` 增量 |
-| `port_xmit_wait` | `mlx5_1`、`mlx5_7` 有增长，约 `15.65M-23.49M` |
-
-判断：当前没有明显坏链路、丢包或重传证据；`port_xmit_wait` 更像发送侧等待 credit/拥塞控制/交换侧调度，或者 NCCL internal alltoall 在当前拓扑下没有把 rail 吞吐打起来。
-
-同样 2 nodes x 8 GPUs、同样 4 条 HCA 的 16G allreduce 对照：
-
-| 观察项 | 结果 |
-|--------|------|
-| allreduce `Avg bus bandwidth` | `354.366 GB/s` |
-| 每条 HCA 流量 | 约 `178.03-178.07 GiB`，四条 rail 均衡 |
-| 错误/重传类 counter | `0` 增量 |
-| `port_xmit_wait` | `mlx5_1`、`mlx5_7` 有增长，约 `6.11M-6.59M` |
-
-判断：allreduce 在接近物理上限时也会出现 `port_xmit_wait`，所以 alltoall 的核心问题不能只归因于该 counter。现在更应关注 NCCL alltoall 通信模式、交换网络策略、以及 NCCL net plugin/SHARP 能力差异。
-
-PXN disabled 基线上的二次参数 sweep：
-
-| Case | Avg Bus BW | 结论 |
-|------|------------|------|
-| `NCCL_PXN_DISABLE=1` | `37.0069 GB/s` | 短测基线 |
-| `+ NCCL_NVLS_ENABLE=0` | `37.2217 GB/s` | 小幅波动，不稳定 |
-| `+ NCCL_P2P_NET_CHUNKSIZE=4194304` | `37.2522 GB/s` | 小幅波动，不稳定 |
-| `+ NCCL_BUFFSIZE=8388608` | `37.0911 GB/s` | 无实质改善 |
-| `+ NCCL_MIN_NCHANNELS=16 NCCL_MAX_NCHANNELS=16` | `37.0189 GB/s` | 无实质改善 |
-| `+ NCCL_IB_AR_THRESHOLD=0` | `37.0843 GB/s` | 无实质改善 |
-| `+ NCCL_IB_QPS_PER_CONNECTION=4 NCCL_IB_SPLIT_DATA_ON_QPS=0` | `35.9847 GB/s` | 变差 |
-| `+ NCCL_IB_QPS_PER_CONNECTION=4 NCCL_IB_SPLIT_DATA_ON_QPS=1` | `29.8406 GB/s` | 明显变差 |
-| `+ NCCL_IB_QPS_PER_CONNECTION=8 NCCL_IB_SPLIT_DATA_ON_QPS=1` | `24.1183 GB/s` | 明显变差 |
-| `+ NCCL_NCHANNELS_PER_NET_PEER=8` | `29.8904 GB/s` | 明显变差 |
-
-长测复核没有复现 `NVLS/P2P chunk` 的短测小涨：同一环境确认仍为 NCCL `2.27.7+cuda12.4`、4 条 400G HCA、GDR enabled、internal IB plugin，但 baseline 窗口下滑到 `32.7280 GB/s`，`P2P_NET_CHUNKSIZE=4M` 为 `31.9340 GB/s`，`NVLS_ENABLE=0 + P2P_NET_CHUNKSIZE=4M` 为 `27.6585 GB/s`。因此这些参数不应固化到正式配置。
-
-`GRAPH/TUNING/COLL` 日志对照：
-
-| 观察项 | allreduce | alltoall + `NCCL_PXN_DISABLE=1` |
-|--------|-----------|----------------------------------|
-| NCCL version | `2.27.7+cuda12.4` | `2.27.7+cuda12.4` |
-| HCA / GDR | 4 HCA, GDR enabled | 4 HCA, GDR enabled |
-| external net plugin | missing, internal IB | missing, internal IB |
-| channels | `16 coll / 16 nvls / 16 p2p` | `16 coll / 16 nvls / 16 p2p` |
-| Pattern 4 | `crossNic 0`, `type NVL/PXN`, `nChannels 8` | `crossNic 2`, `type NVL/PIX`, `nChannels 8` |
-| `NET/IB/*/GDRDMA` channel edge lines | `256` | `512` |
-| `P2P/CUMEM` channel edge lines | `0` | `224` |
-| total NET/P2P channel edge lines | `256` | `736` |
-
-判断：PXN disabled 后 4 条 IB/GDRDMA rail 和 16 个 p2p/coll/nvls channels 都仍在；但 alltoall graph 明显比 allreduce 复杂，并包含大量本机 P2P/CUMEM 边。这进一步说明问题不在 HCA/GDR 没生效，而在 alltoall collective graph、P2P/NET 组合方式、internal IB plugin 或交换网络策略。
-
-### 8. 8 卡链路计数器与物理上限判断
-
-计数器探测报告：`reports_multinode_nccl_counter_probe_20260523.md`
-
-当前 2 机 8 GPU allreduce 输出：
-
-| Metric | Value |
-|--------|-------|
-| `algbw` | `189.16 / 189.07 GB/s` |
-| `busbw` | `354.68 / 354.52 GB/s` |
-| `Avg bus bandwidth` | `354.597 GB/s` |
-
-allreduce 在 16 ranks 下的换算关系约为：
-
-```text
-busbw = algbw * 2 * (nranks - 1) / nranks = algbw * 1.875
-```
-
-因此 PDF 参考 `491.84 GB/s busbw` 对应约 `262.31 GB/s algbw`。但当前节点可用的 400G HCA 是 `mlx5_0,mlx5_1,mlx5_6,mlx5_7`，每节点 4 条 400Gb/s，理论单向合计约 `200 GB/s`。当前 allreduce `189 GB/s algbw` 已经接近这个物理上限，所以 8 卡 allreduce 剩余差距基本不能靠 NCCL 参数小调解决。
-
-裸 RDMA 4 rail 并发 `ib_write_bw` 也验证了底层 4 条 400G rail 可以同时工作：
-
-| HCA | BW average |
-|-----|------------|
-| `mlx5_0` | `387.16 Gb/s` |
-| `mlx5_1` | `387.07 Gb/s` |
-| `mlx5_6` | `355.02 Gb/s` |
-| `mlx5_7` | `347.70 Gb/s` |
-| Total | `1476.95 Gb/s` / `184.62 GB/s` |
-
-这个裸 RDMA 总带宽与 NCCL 8 卡 allreduce 的 `189 GB/s algbw` 接近，进一步说明 allreduce 已经贴近当前网络形态可提供的实际带宽。
-
-8 卡 alltoall 当前仍只有：
-
-| Metric | Value |
-|--------|-------|
-| `algbw` | `32.04 / 32.05 GB/s` |
-| `busbw` | `30.03 / 30.04 GB/s` |
-| `Avg bus bandwidth` | `30.0389 GB/s` |
-
-同一测试窗口内端口计数器显示 alltoall 流量分布不均衡：`mlx5_0` 和 `mlx5_6` 的流量约 `885 GB`，`mlx5_1` 和 `mlx5_7` 约 `295 GB`，约为三倍差距。继续调换 `NCCL_IB_HCA` 顺序后，8 卡 alltoall 仍稳定在 `30.02-30.07 GB/s`，说明不是简单 HCA 列表顺序问题。
-
-`NCCL_PXN_DISABLE=1` 后，端口流量变为四条 HCA 均约 `591 GB`，alltoall `Avg bus bandwidth` 提升到 `36.9518 GB/s`，但每条 rail 吞吐仍只有约 `19.82 GB/s`。
-
-### 9. NCCL net plugin / SHARP 状态
-
-两台机器上均未找到：
-
-- `libnccl-net.so`
-- `libsharp*`
-- SHARP/HCOLL 相关 deb 包
-
-当前仅看到 UCX 包：
-
-```text
-ucx 1.20.0-1.20260211.d9a4f352d.2601100
-```
-
-apt 源里与 NCCL 直接相关的包只有：
-
-```text
-libnccl2
-libnccl-dev
-```
-
-因此当前 NCCL 日志里的 `Could not find: libnccl-net.so` 是真实环境缺失，不是脚本漏配路径。当前运行走的是 NCCL internal IB plugin；如果要继续追 8 卡 alltoall 或 PDF 2 机共 16 卡（每节点 8 卡）的参考值，需要补齐匹配当前 OFED/driver/CUDA/NCCL 的 NCCL net plugin/SHARP 环境，或由网络侧确认该集群不依赖这些组件也能达到目标值。
-
-## 当前阻塞
-
-### 阻塞 1：当前生产 NCCL 版本过旧，GDR 被禁用
-
-现象：
-
-- pip NCCL 2.21.5：`GPU Direct RDMA Disabled`，2x8 allreduce `67.42 GB/s`
-- 临时 NCCL 2.27.7：`GPU Direct RDMA Enabled`，2x8 allreduce `237.86 GB/s`
-- 因此，生产测试环境应避免继续使用 pip NCCL 2.21.5 作为多机 NCCL 验收运行库
-
-判断：底层 RDMA 能力存在，GDR 禁用主要由旧 NCCL 版本触发。建议正式安装并固定 NCCL 2.27.7+cuda12.4 或更新的已验证版本。
-
-### 阻塞 2：2 机 8 GPU 档位仍低于 PDF 参考值
-
-现象：
-
-- 2x8 16G allreduce：`354.02 GB/s`，PDF 参考 `491.84 GB/s`
-- 2x8 16G alltoall：`30.04 GB/s`，PDF 参考 `76.54 GB/s`
-- 已使用 4 个 400Gb/s HCA：`mlx5_0, mlx5_1, mlx5_6, mlx5_7`
-- 加入 `mlx5_4,mlx5_5` 100G HCA 或 `mlx5_2,mlx5_8` 25G HCA 基本无收益
-- 调整 8 卡 `CUDA_VISIBLE_DEVICES` 顺序基本无收益
-- 套 PDF 固定参数会让 8 卡 allreduce 明显变差
-
-判断：2 机 8 GPU 档位的剩余差距更像硬件 rail 数量/交换网络/路由/拥塞/NCCL net plugin 能力问题，不再是旧 NCCL GDR disabled 或 4 卡 GPU 选择问题。
-
-补充证据：
-
-- 8 卡 allreduce `algbw ~= 189 GB/s`，接近当前 4 x 400G HCA 的理论单向合计 `200 GB/s`
-- 裸 RDMA 4 rail 并发 `ib_write_bw` 合计 `1476.95 Gb/s` / `184.62 GB/s`
-- PDF 8 卡 allreduce `491.84 GB/s busbw` 反推需要约 `262 GB/s algbw`，超过当前 4 x 400G 的物理单向总带宽
-- 8 卡 alltoall baseline 端口计数器显示 rail 分布不均，且 HCA 顺序 sweep 无改善
-- 当前环境缺失 NCCL net plugin/SHARP，NCCL 只能使用 internal IB plugin
-- `NCCL_PXN_DISABLE=1` 可将 8 卡 alltoall 提升到约 `36.7 GB/s`，并修复 rail 分布不均，但仍不到 PDF 参考值的一半
-- PXN disabled 复测没有看到 discard、链路错误、RoCE 重传、slow restart、packet sequence error 等错误类 counter 增长
-- allreduce 对照同样出现 `port_xmit_wait` 但能跑到 `354.366 GB/s`，说明 `port_xmit_wait` 不是 alltoall 低吞吐的唯一根因
-- PXN disabled 基线上继续叠加 NVLS、P2P chunk、buffer、channel、QP/split、AR 等参数没有稳定收益；QP/split 和 `NCCL_NCHANNELS_PER_NET_PEER=8` 明显变差
-- NCCL GRAPH/TUNING 对照显示 alltoall 与 allreduce 的 HCA/GDR/channel 基础状态一致，但 alltoall channel edge 更多，并混入大量 `P2P/CUMEM` 本地路径
-
-### 阻塞 3：`nccl-gpu-2` SSH 存在外部连接压力
-
-现象：
-
-- 多次出现过：`kex_exchange_identification: Connection closed by remote host`
-- 根因是未认证连接过多触发 `MaxStartups`
-- 当前已经通过临时 SSHD 配置缓解，并拿到了有效 2x8 报告
-- 但如果外部连接压力持续，仍建议从网络侧或安全策略侧处理来源连接
-
-判断：这不再阻塞当前报告产出，但属于环境稳定性风险。
-
-## 建议下一步
-
-1. 从网络/安全侧处理 `172.239.10.85` 等来源的 SSH 未认证连接压力，或者保留更高的 `MaxStartups` 配置作为测试窗口临时策略。
-2. 正式安装并固定已验证的 NCCL 2.27.7+cuda12.4 或更新版本，不要依赖 pip NCCL 2.21.5；当前 `/tmp/nccl-2.27.7-cuda12.4` 只是临时解压验证。
-3. 4 卡 per node 测试应显式使用 `CUDA_VISIBLE_DEVICES=0,1,4,5`，避免默认 GPU0/1/2/3 落到错误 GPU/NIC 亲和性。
-4. 4 卡 allreduce 建议继续让 NCCL 自动选择 channel/QP；4 卡 alltoall 如果要贴近 PDF，可单独套 `NCCL_IB_QPS_PER_CONNECTION=4`、`NCCL_MIN_NCHANNELS=4`、`NCCL_IB_SPLIT_DATA_ON_QPS=1`。
-5. 8 卡 per node 不建议套上述固定参数，会降低 allreduce；继续用 auto。
-6. 尝试安装或启用匹配当前 OFED/driver 的 NCCL net plugin/SHARP；当前日志显示 `Could not find: libnccl-net.so`，NCCL 使用的是 internal IB plugin。
-7. 核对跨 Leaf 链路的 rail mapping、交换机端口速率、路由、credit/拥塞等待与交换机侧队列计数；同时用 allreduce 对照避免把 `port_xmit_wait` 误判为 alltoall 独有根因。
-8. 确认当前 PDF 的 `491.84/76.54 GB/s` 是否要求当前这两台节点在只有 4 条 400G rail 的形态下也达到；如果要求一致，需要网络/硬件侧继续介入。
-9. 8 卡 alltoall 当前不建议继续盲调 NCCL 环境变量；重点查 SHARP/NCCL net plugin、NCCL internal alltoall 行为、交换机 ECMP/自适应路由和拥塞/credit 等待；`NCCL_IB_HCA` 顺序与 rail 分布本身已经不是当前主问题。
-
-## 当前可交付物
-
-- `configs/multinode_nccl_diagnostic.yaml`：多机多卡诊断配置
-- `configs/multinode_nccl_nccl227_diagnostic.yaml`：NCCL 2.27.7 256M 诊断配置
-- `configs/multinode_nccl_nccl227_sweep.yaml`：NCCL 2.27.7 1M 到 4G sweep 配置
-- `configs/multinode_nccl_nccl227_16g.yaml`：NCCL 2.27.7 16G 大包配置
-- `configs/multinode_nccl_nccl227_auto_16g.yaml`：NCCL 2.27.7 16G 自动 channel/QP 配置
-- `configs/multinode_nccl_nccl227_pdf_matrix.yaml`：按 PDF 矩阵和 GPU 亲和性优化后的跨 Leaf 配置
-- `reports_multinode_nccl_diagnostic_2x8_sshfix.md`：脚本生成的原始 2x8 诊断报告
-- `reports_multinode_nccl_diagnostic_2x8_nccl227_v2.md`：NCCL 2.27.7 256M 诊断报告
-- `reports_multinode_nccl_sweep_2x8_nccl227.md`：NCCL 2.27.7 1M 到 4G sweep 报告
-- `reports_multinode_nccl_16g_2x8_nccl227.md`：NCCL 2.27.7 16G 大包报告
-- `reports_multinode_nccl_16g_2x8_nccl227_auto.md`：NCCL 2.27.7 16G 自动 channel/QP 原始报告
-- `reports_multinode_nccl_pdf_matrix_nccl227.md`：NCCL 2.27.7 PDF 矩阵式原始报告
-- `reports_multinode_nccl_counter_probe_20260523.md`：8 卡链路计数器与 HCA 顺序 sweep 报告
-- `reports_multinode_nccl_alltoall_tuning_20260523.md`：8 卡 alltoall NCCL 网络参数 sweep 报告
-- `reports_multinode_nccl_diagnosis_20260523.md`：本中文诊断总结
diff --git a/reports_multinode_nccl_diagnostic_2x8_debug_v2.md b/reports_multinode_nccl_diagnostic_2x8_debug_v2.md
deleted file mode 100644
index 2076245..0000000
--- a/reports_multinode_nccl_diagnostic_2x8_debug_v2.md
+++ /dev/null
@@ -1,66 +0,0 @@
-# GPU Test Report
-
-- **Date:** 2026-05-23T07:37:41.426792
-- **Host:** aikubeworker0012
-
-## Overall Acceptance Verdict
-
-**Result: FAIL**
-
-Missing required evidence:
-- GPU Info
-- Health Check
-- Memory Bandwidth
-- Compute Throughput
-- NVLink/NVSwitch
-- NCCL
-- Stress Test
-- RDMA
-- DCGM
-- Training
-
-## Summary
-
-| Test | Result |
-|------|--------|
-| Multi-node NCCL | FAIL |
-
-## Multi-node NCCL / Cross Leaf
-
-Source: nccl-tests-mpirun | Mode: diagnostic
-
-- **Hosts:** nccl-gpu-1(172.72.8.12), nccl-gpu-2(172.72.8.16)
-- **Preflight:** PASS (1 warnings)
-
-### Multi-node NCCL allreduce
-
-| Topology | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
-|----------|-------------|-----------|------------|-----------|--------|
-| 2 nodes x 8 GPUs diagnostic | 68.69 GB/s | 256M | 68.21 GB/s | >= 480 GB/s | FAIL |
-
-| Topology | NCCL Network | GPU Direct RDMA | GDR Disabled HCAs |
-|----------|--------------|-----------------|-------------------|
-| 2 nodes x 8 GPUs diagnostic | IB | DISABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 |
-
-| Topology | Return Code | Error / Output Tail |
-|----------|-------------|---------------------|
-| 2 nodes x 8 GPUs diagnostic | 0 |  aikubeworker0012:2139504:2139504 [0] NCCL INFO comm 0x55646d15f590 rank 0 nranks 16 cudaDev 0 busId 18000 - Destroy COMPLETE # Out of bounds values : 0 OK # Avg bus bandwidth    : 68.2135  # # Collective test concluded: all_reduce_perf #   |
-
-### Multi-node NCCL alltoall
-
-| Topology | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
-|----------|-------------|-----------|------------|-----------|--------|
-| 2 nodes x 8 GPUs diagnostic | 0.00 GB/s |  | 0.00 GB/s | >= 75 GB/s | FAIL |
-
-| Topology | NCCL Network | GPU Direct RDMA | GDR Disabled HCAs |
-|----------|--------------|-----------------|-------------------|
-| 2 nodes x 8 GPUs diagnostic | unknown | UNKNOWN | - |
-
-| Topology | Return Code | Error / Output Tail |
-|----------|-------------|---------------------|
-| 2 nodes x 8 GPUs diagnostic | 255 |  lack of common network interfaces and/or no route found between   them. Please check network connectivity (including firewalls   and network routing requirements). --------------------------------------------------------------------------  |
-
-**Overall: FAIL**
-
----
-*Generated by GPU Test Suite v0.2.0*
\ No newline at end of file
diff --git a/reports_multinode_nccl_diagnostic_2x8_nccl227_v2.md b/reports_multinode_nccl_diagnostic_2x8_nccl227_v2.md
deleted file mode 100644
index 1b188d5..0000000
--- a/reports_multinode_nccl_diagnostic_2x8_nccl227_v2.md
+++ /dev/null
@@ -1,66 +0,0 @@
-# GPU Test Report
-
-- **Date:** 2026-05-23T07:53:24.460277
-- **Host:** aikubeworker0012
-
-## Overall Acceptance Verdict
-
-**Result: FAIL**
-
-Missing required evidence:
-- GPU Info
-- Health Check
-- Memory Bandwidth
-- Compute Throughput
-- NVLink/NVSwitch
-- NCCL
-- Stress Test
-- RDMA
-- DCGM
-- Training
-
-## Summary
-
-| Test | Result |
-|------|--------|
-| Multi-node NCCL | FAIL |
-
-## Multi-node NCCL / Cross Leaf
-
-Source: nccl-tests-mpirun | Mode: diagnostic-nccl-2.27.7
-
-- **Hosts:** nccl-gpu-1(172.72.8.12), nccl-gpu-2(172.72.8.16)
-- **Preflight:** PASS
-
-### Multi-node NCCL allreduce
-
-| Topology | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
-|----------|-------------|-----------|------------|-----------|--------|
-| 2 nodes x 8 GPUs NCCL 2.27.7 | 212.19 GB/s | 256M | 211.75 GB/s | >= 480 GB/s | FAIL |
-
-| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
-|----------|--------------|-----------------|------------------|-------------------|
-| 2 nodes x 8 GPUs NCCL 2.27.7 | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
-
-| Topology | Return Code | Error / Output Tail |
-|----------|-------------|---------------------|
-| 2 nodes x 8 GPUs NCCL 2.27.7 | 0 | 0016:1009332:1009965 [2] NCCL INFO comm 0x56388eec2e40 rank 10 nranks 16 cudaDev 2 busId 3a000 - Destroy COMPLETE aikubeworker0012:2144366:2144531 [5] NCCL INFO comm 0x556e4fcf5280 rank 5 nranks 16 cudaDev 5 busId ab000 - Destroy COMPLETE   |
-
-### Multi-node NCCL alltoall
-
-| Topology | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
-|----------|-------------|-----------|------------|-----------|--------|
-| 2 nodes x 8 GPUs NCCL 2.27.7 | 28.37 GB/s | 256M | 28.32 GB/s | >= 75 GB/s | FAIL |
-
-| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
-|----------|--------------|-----------------|------------------|-------------------|
-| 2 nodes x 8 GPUs NCCL 2.27.7 | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
-
-| Topology | Return Code | Error / Output Tail |
-|----------|-------------|---------------------|
-| 2 nodes x 8 GPUs NCCL 2.27.7 | 0 | 0012:2144547:2144713 [4] NCCL INFO comm 0x55896a1dae20 rank 4 nranks 16 cudaDev 4 busId 9a000 - Destroy COMPLETE aikubeworker0016:1010164:1010881 [2] NCCL INFO comm 0x565344db7790 rank 10 nranks 16 cudaDev 2 busId 3a000 - Destroy COMPLETE   |
-
-**Overall: FAIL**
-
----
-*Generated by GPU Test Suite v0.2.0*
\ No newline at end of file
diff --git a/reports_multinode_nccl_diagnostic_2x8_sshfix.md b/reports_multinode_nccl_diagnostic_2x8_sshfix.md
deleted file mode 100644
index 1872c50..0000000
--- a/reports_multinode_nccl_diagnostic_2x8_sshfix.md
+++ /dev/null
@@ -1,66 +0,0 @@
-# GPU Test Report
-
-- **Date:** 2026-05-23T07:46:11.464439
-- **Host:** aikubeworker0012
-
-## Overall Acceptance Verdict
-
-**Result: FAIL**
-
-Missing required evidence:
-- GPU Info
-- Health Check
-- Memory Bandwidth
-- Compute Throughput
-- NVLink/NVSwitch
-- NCCL
-- Stress Test
-- RDMA
-- DCGM
-- Training
-
-## Summary
-
-| Test | Result |
-|------|--------|
-| Multi-node NCCL | FAIL |
-
-## Multi-node NCCL / Cross Leaf
-
-Source: nccl-tests-mpirun | Mode: diagnostic
-
-- **Hosts:** nccl-gpu-1(172.72.8.12), nccl-gpu-2(172.72.8.16)
-- **Preflight:** PASS
-
-### Multi-node NCCL allreduce
-
-| Topology | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
-|----------|-------------|-----------|------------|-----------|--------|
-| 2 nodes x 8 GPUs diagnostic | 67.42 GB/s | 256M | 67.50 GB/s | >= 480 GB/s | FAIL |
-
-| Topology | NCCL Network | GPU Direct RDMA | GDR Disabled HCAs |
-|----------|--------------|-----------------|-------------------|
-| 2 nodes x 8 GPUs diagnostic | IB | DISABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 |
-
-| Topology | Return Code | Error / Output Tail |
-|----------|-------------|---------------------|
-| 2 nodes x 8 GPUs diagnostic | 0 | orker0016:986293:986293 [1] NCCL INFO comm 0x563abe94c350 rank 9 nranks 16 cudaDev 1 busId 2a000 - Destroy COMPLETE aikubeworker0016:986292:986292 [0] NCCL INFO comm 0x560ffac51160 rank 8 nranks 16 cudaDev 0 busId 18000 - Destroy COMPLETE   |
-
-### Multi-node NCCL alltoall
-
-| Topology | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
-|----------|-------------|-----------|------------|-----------|--------|
-| 2 nodes x 8 GPUs diagnostic | 9.56 GB/s | 256M | 9.55 GB/s | >= 75 GB/s | FAIL |
-
-| Topology | NCCL Network | GPU Direct RDMA | GDR Disabled HCAs |
-|----------|--------------|-----------------|-------------------|
-| 2 nodes x 8 GPUs diagnostic | IB | DISABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 |
-
-| Topology | Return Code | Error / Output Tail |
-|----------|-------------|---------------------|
-| 2 nodes x 8 GPUs diagnostic | 0 | TE aikubeworker0012:2141982:2141982 [4] NCCL INFO comm 0x55d0bf9c6a00 rank 4 nranks 16 cudaDev 4 busId 9a000 - Destroy COMPLETE # Out of bounds values : 0 OK # Avg bus bandwidth    : 9.55234  # # Collective test concluded: alltoall_perf #   |
-
-**Overall: FAIL**
-
----
-*Generated by GPU Test Suite v0.2.0*
\ No newline at end of file
diff --git a/reports_multinode_nccl_environment_gap_20260523.md b/reports_multinode_nccl_environment_gap_20260523.md
deleted file mode 100644
index c4a65a5..0000000
--- a/reports_multinode_nccl_environment_gap_20260523.md
+++ /dev/null
@@ -1,168 +0,0 @@
-# 多节点 NCCL 环境等价性缺口说明 2026-05-23
-
-## 目的
-
-这份文档用于回答一个核心问题：当前 `aikubeworker0012` / `aikubeworker0016` 是否具备与参考 PDF 的 2 机 16 GPU NCCL 目标相同的硬件和 NCCL 网络软件环境。
-
-结论先行：**当前环境不能证明与 PDF 参考环境等价**。主要差异有两类：
-
-1. 当前每节点只有 4 条可用于 NCCL 的 400G InfiniBand rail。
-2. 当前没有外部 NCCL net plugin / SHARP / HCOLL 组件，NCCL 使用 internal IB plugin。
-
-## 采集时间和节点
-
-采集时间：`2026-05-23T10:53:18+00:00` 至 `2026-05-23T10:53:21+00:00`
-
-| 节点 | SSH alias | 内网地址 | kernel |
-|---|---|---|---|
-| `aikubeworker0012` | `nccl-gpu-1` | `172.72.8.12` | `5.15.0-119-generic` |
-| `aikubeworker0016` | `nccl-gpu-2` | `172.72.8.16` | `5.15.0-119-generic` |
-
-## HCA / Rail 现状
-
-两台机器的 `/sys/class/infiniband/mlx5_*/ports/1` 结果一致：
-
-| HCA | State | Rate | Link layer | 对 NCCL 跨节点验收的含义 |
-|---|---|---:|---|---|
-| `mlx5_0` | ACTIVE | `400 Gb/sec (4X NDR)` | InfiniBand | 可作为 400G rail |
-| `mlx5_1` | ACTIVE | `400 Gb/sec (4X NDR)` | InfiniBand | 可作为 400G rail |
-| `mlx5_2` | ACTIVE | `25 Gb/sec (1X EDR)` | Ethernet | 不是 400G IB rail |
-| `mlx5_3` | DOWN | `25 Gb/sec (1X EDR)` | Ethernet | 不可用 |
-| `mlx5_4` | ACTIVE | `100 Gb/sec (2X HDR)` | InfiniBand | 不是 400G rail |
-| `mlx5_5` | ACTIVE | `100 Gb/sec (2X HDR)` | InfiniBand | 不是 400G rail |
-| `mlx5_6` | ACTIVE | `400 Gb/sec (4X NDR)` | InfiniBand | 可作为 400G rail |
-| `mlx5_7` | ACTIVE | `400 Gb/sec (4X NDR)` | InfiniBand | 可作为 400G rail |
-| `mlx5_8` | ACTIVE | `25 Gb/sec (1X EDR)` | Ethernet | 不是 400G IB rail |
-| `mlx5_9` | DOWN | `25 Gb/sec (1X EDR)` | Ethernet | 不可用 |
-
-因此当前推荐并实际使用的 HCA 列表是：
-
-```text
-NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_6,mlx5_7
-```
-
-这代表每节点 `4 x 400Gb/s`，理论单向原始带宽约：
-
-```text
-4 * 400Gb/s / 8 = 200 GB/s
-```
-
-## 与 PDF 目标的物理带宽关系
-
-参考 PDF 的 2 机 16 GPU 目标：
-
-| Operation | PDF Bus BW |
-|---|---:|
-| AllReduce | `491.84 GB/s` |
-| AllToAll | `76.54 GB/s` |
-
-NCCL allreduce 在 16 ranks 下，`busbw = algbw * 2 * (n - 1) / n = algbw * 1.875`。
-
-因此 PDF 的 allreduce `491.84 GB/s busbw` 反推：
-
-```text
-491.84 / 1.875 = 262.31 GB/s algbw
-```
-
-但当前 4 条 400G rail 的理论单向原始带宽约 `200 GB/s`。本项目实测 2x8 allreduce：
-
-| 测试 | Bus BW | 反推 Alg BW |
-|---|---:|---:|
-| 本轮深度诊断 allreduce | `354.025 GB/s` | `188.81 GB/s` |
-| 本轮 GRAPH allreduce | `354.224 GB/s` | `188.92 GB/s` |
-
-这已经接近当前 4 x 400G rail 的物理单向上限。除非 PDF 参考环境具备更多有效 400G rail、更高交换网络能力，或使用了当前缺失的网络加速组件，否则当前 2x8 allreduce 很难靠 NCCL 环境变量小调达到 `491.84 GB/s`。
-
-## GPU-NIC 亲和性影响
-
-`nvidia-smi topo -m` 显示的 NIC legend 两台一致：
-
-| NIC | HCA |
-|---|---|
-| NIC0 | `mlx5_0` |
-| NIC1 | `mlx5_1` |
-| NIC2 | `mlx5_2` |
-| NIC3 | `mlx5_3` |
-| NIC4 | `mlx5_4` |
-| NIC5 | `mlx5_5` |
-| NIC6 | `mlx5_6` |
-| NIC7 | `mlx5_7` |
-| NIC8 | `mlx5_8` |
-| NIC9 | `mlx5_9` |
-
-关键亲和关系：
-
-| GPU | 最近的有效 400G HCA |
-|---|---|
-| GPU0 | `mlx5_0` |
-| GPU1 | `mlx5_1` |
-| GPU4 | `mlx5_6` |
-| GPU5 | `mlx5_7` |
-
-这解释了为什么 2 机 4 GPU 档位需要使用：
-
-```text
-CUDA_VISIBLE_DEVICES=0,1,4,5
-```
-
-默认 GPU0/1/2/3 会把 GPU2/GPU3 放到非理想 NIC 亲和路径上，其中 GPU2 最近的 `mlx5_2/3` 不是可用 400G IB rail。
-
-## NCCL Net Plugin / SHARP 状态
-
-在两台节点上搜索：
-
-```text
-find /usr /opt /tmp /root -name 'libnccl-net*.so*' -o -name 'libsharp*.so*'
-```
-
-结果为空。
-
-两台节点包列表中能看到：
-
-| 包 | 版本/说明 |
-|---|---|
-| `doca-ofed` | `3.3.0-088000` |
-| `mlnx-ofed-kernel-dkms` | `26.01.OFED.26.01.1.0.0.1-1` |
-| `ucx` | `1.20.0-1.20260211...` |
-
-未看到：
-
-- `libnccl-net.so`
-- `libsharp*.so`
-- SHARP packages
-- HCOLL packages
-
-本轮 NCCL GRAPH 日志也显示 `plugin_missing=16`，说明 NCCL 只能走 internal IB plugin。
-
-## 当前 2x8 结果归因边界
-
-已经基本排除：
-
-- 不是 SSH / mpirun launch 问题：preflight 已通过。
-- 不是 HCA 完全不可用：4 条 400G rail 都 ACTIVE，allreduce 能跑到约 `354 GB/s busbw`。
-- 不是 GDR disabled：NCCL `2.27.7` 日志中 GDR enabled。
-- 不是 rail 完全打偏：`NCCL_PXN_DISABLE=1` 后 alltoall 四条 rail 流量均衡。
-- 不是明显坏链路/重传：counter 未见 discard、RoCE retrans、slow restart、packet sequence error 等增长。
-
-仍然成立的缺口：
-
-1. **2x8 allreduce 的 PDF 目标疑似超过当前 4 x 400G rail 物理能力。**
-2. **2x8 alltoall 即使 rail 均衡仍只有 `36-37 GB/s`，更像 NCCL alltoall 图策略、internal IB plugin 能力、缺少 SHARP/NCCL net plugin 或交换网络策略问题。**
-
-## 给网络/环境侧的确认清单
-
-请网络/环境侧确认以下问题：
-
-1. PDF 参考环境每节点实际参与 NCCL 的 400G rail 数量是多少？是否为 8 条 400G，而不是当前的 4 条 400G？
-2. PDF 命令中列出的 HCA 列表是否在参考环境中全部为 400G InfiniBand ACTIVE？
-3. PDF 参考环境是否启用了 NCCL net plugin、SHARP、HCOLL、UCX plugin 或交换机侧 SHARP aggregation？
-4. 当前交换网络是否开启 adaptive routing / ECMP / congestion control，是否存在跨 Leaf 场景下对 alltoall pattern 不友好的 hash 或路径限制？
-5. 当前 `mlx5_4/5` 为什么只有 100G，`mlx5_2/8` 为什么是 Ethernet 25G，`mlx5_3/9` 为什么 DOWN；这些是否符合机器采购和验收预期？
-6. 如果验收必须按 PDF 的 `491.84/76.54 GB/s`，是否需要更换到与 PDF 等价的 rail 数量/交换网络/软件栈再测。
-
-## 建议下一步
-
-1. 暂停继续盲调 NCCL 小参数；已有 sweep 显示收益不稳定或负向。
-2. 先让硬件/网络侧确认 rail 数量和速率是否与 PDF 等价。
-3. 如果确认硬件等价，再补齐 NCCL net plugin / SHARP 环境，并用 `scripts/multinode_nccl_deep_diagnose.sh graph` 复查 plugin 和 graph 变化。
-4. 如果硬件不等价，应调整验收阈值或改用与 PDF 等价的节点组合复测。
diff --git a/reports_multinode_nccl_handoff_plan_20260523.md b/reports_multinode_nccl_handoff_plan_20260523.md
deleted file mode 100644
index d70ea8b..0000000
--- a/reports_multinode_nccl_handoff_plan_20260523.md
+++ /dev/null
@@ -1,213 +0,0 @@
-# 多节点 NCCL 交接计划 2026-05-23
-
-## 当前一句话结论
-
-当前 2 机 x 8 GPU（共 16 GPU）NCCL 已经排除旧 NCCL、GDR disabled、HCA 选择错误、SSH/mpirun launch、明显链路错误等问题；剩余差距集中在 **硬件 rail 数量是否与 PDF 等价**、**NCCL net plugin / SHARP 是否缺失**、以及 **alltoall 在当前跨 Leaf 网络下的图策略/交换路径效率**。
-
-全局验收状态先看 `reports_h100_acceptance_current_status_20260523.md`；该文件把单节点 `test all`、跨节点 RDMA、多机 NCCL 和阻塞项汇总到一张总表。
-
-## 已经验证的事实
-
-| 事实 | 当前证据 |
-|---|---|
-| 两台机器可用于 NCCL 的 400G IB rail 是 4 条 | `mlx5_0,mlx5_1,mlx5_6,mlx5_7` 均为 `400 Gb/sec (4X NDR)` |
-| 其他 HCA 不等价 | `mlx5_4/5` 为 100G IB，`mlx5_2/8` 为 25G Ethernet，`mlx5_3/9` DOWN |
-| NCCL 2.27.7 GDR 可用 | GRAPH/NET 日志中 GDR enabled |
-| allreduce 已接近当前 4 rail 物理上限 | 最新 PDF matrix 2x8 为 `353.85 GB/s busbw`，反推 `188.72 GB/s algbw`，接近 4 x 400G 的 `200 GB/s` 单向原始带宽 |
-| alltoall PXN disabled 后 rail 均衡但仍低 | 最新 PDF matrix 2x8 为 `36.83 GB/s busbw`，每条 rail 约 `19-20 GB/s` |
-| 正式 PDF matrix 已复跑 | `reports_multinode_nccl_pdf_matrix_20260523_113803.md`，所有 case 正确性通过；除 2x2 allreduce 外，性能阈值仍 FAIL |
-| 原始 artifacts 已归档 | `/root/test_gpu_scripts/reports/multinode_nccl_pdf_matrix_20260523_113803_artifacts`，每个 case 有完整 `cmd/stdout/stderr/json` |
-| artifacts 信号已分析 | `reports_multinode_nccl_artifact_signal_analysis_20260523.md`，确认所有 case 都走 IB/GDRDMA 和 4 条 400G HCA，未见 SHARP/CollNet |
-| 多机六项 collective 已补测 | `reports_multinode_nccl_all_collectives_run_20260523.md`，2x8 下 6 项均正确性通过，allreduce/alltoall 按 PDF 阈值仍 FAIL |
-| 六项 collective artifacts 已归档 | `reports_multinode_nccl_all_collectives_artifacts_manifest_20260523_120144.md`，远端 tar 为 `reports/multinode_nccl_all_collectives_20260523_120144_artifacts.tar.gz` |
-| 没看到硬错误 | 未见 discard、RoCE retrans、slow restart、packet sequence error 等增长 |
-| 当前缺外部 NCCL 网络组件 | 未找到 `libnccl-net*.so*` / `libsharp*.so*`，未见 SHARP/HCOLL 包 |
-
-## PDF 目标与当前物理能力的冲突
-
-PDF 2 机 16 GPU allreduce 目标是：
-
-```text
-491.84 GB/s busbw
-```
-
-16 ranks allreduce 换算关系：
-
-```text
-busbw = algbw * 1.875
-```
-
-因此 PDF 目标反推：
-
-```text
-491.84 / 1.875 = 262.31 GB/s algbw
-```
-
-当前每节点 4 条 400G rail 的理论单向原始带宽：
-
-```text
-4 * 400Gb/s / 8 = 200 GB/s
-```
-
-所以如果 PDF 环境有更多有效 400G rail，或启用了 SHARP/NCCL net plugin，而当前环境没有，则当前节点不应直接按 PDF 2x8 目标判定。
-
-## 决策树
-
-### A. 如果验收坚持 PDF 原始阈值
-
-必须先证明当前环境与 PDF 等价：
-
-1. 每节点是否有 8 条 400G IB rail 可用？
-2. PDF 命令中的 HCA 在参考环境里是否全部是 400G IB ACTIVE？
-3. PDF 环境是否启用了 SHARP / NCCL net plugin / HCOLL / UCX plugin？
-4. 当前跨 Leaf 交换网络策略是否与 PDF 环境一致？
-
-如果任一答案是否定或未知，应先补齐硬件/软件/网络环境再复测，不应继续靠 NCCL 小参数追 `491.84/76.54 GB/s`。
-
-### B. 如果验收按当前硬件形态重新定标
-
-建议把当前 2x8 allreduce 的可解释目标按 4 x 400G rail 物理能力重新评估：
-
-- allreduce 当前 `353.85 GB/s busbw`，反推 `188.72 GB/s algbw`，接近 `200 GB/s` 单向原始上限。
-- alltoall 当前 `36.83 GB/s` 仍偏低，需要作为独立问题继续排查。
-
-## 最新 PDF matrix 结果
-
-| Topology | AllReduce | AllReduce Target | AllToAll | AllToAll Target |
-|---|---:|---:|---:|---:|
-| 2 nodes x 1 GPU | `47.29` | `48.90` | `24.85` | `27.25` |
-| 2 nodes x 2 GPUs | `137.16` | `136.93` | `47.76` | `54.41` |
-| 2 nodes x 4 GPUs | `335.07` | `335.48` | `72.74` | `73.73` |
-| 2 nodes x 8 GPUs | `353.85` | `491.84` | `36.83` | `76.54` |
-
-所有 case 的 return code 为 `0`，NCCL `Out of bounds values` 为 `0 OK`。因此本轮 FAIL 是性能阈值失败，不是 NCCL 正确性或启动链路失败。
-
-### C. 如果要继续优化 alltoall
-
-不要继续盲扫以下参数：
-
-- `NCCL_IB_QPS_PER_CONNECTION`
-- `NCCL_IB_SPLIT_DATA_ON_QPS`
-- `NCCL_NCHANNELS_PER_NET_PEER`
-- `NCCL_BUFFSIZE`
-- `NCCL_P2P_NET_CHUNKSIZE`
-- `NCCL_IB_AR_THRESHOLD`
-
-已有 sweep 表明它们没有稳定正收益，部分明显负向。
-
-优先做：
-
-1. 补齐并验证 `libnccl-net.so` / SHARP 环境。
-2. 让网络侧查跨 Leaf ECMP / adaptive routing / congestion control / credit wait。
-3. 用 `scripts/multinode_nccl_deep_diagnose.sh graph` 对比启用 plugin 前后的 NCCL graph。
-4. 如有等价 8 rail 节点，迁移同一脚本复测，确认 allreduce 物理上限是否抬升。
-
-## 给网络/硬件/环境侧的问题
-
-请直接确认下面这些问题：
-
-1. 这两台机器是否本来应该有 8 条 400G IB rail？如果是，为什么当前只有 4 条？
-2. `mlx5_4/5` 当前只有 100G，是配置、线缆、模块、交换机端口还是硬件限制？
-3. `mlx5_2/8` 为什么是 Ethernet 25G？是否预期不参与 IB NCCL？
-4. `mlx5_3/9` DOWN 是否符合预期？
-5. PDF 参考环境是否安装了 SHARP、HCOLL 或 NCCL net plugin？
-6. 当前交换机是否开启 adaptive routing，并且对 alltoall 这种多点到多点流量友好？
-7. 当前跨 Leaf 路径是否存在 ECMP hash 不均、PFC/credit wait、拥塞控制参数差异？
-
-## 后续复跑命令
-
-### 轻量检查
-
-```bash
-cd /root/test_gpu_scripts
-bash scripts/multinode_nccl_deep_diagnose.sh preflight
-```
-
-### 单节点环境等价性快照
-
-```bash
-cd /root/test_gpu_scripts
-bash scripts/nccl_environment_snapshot.sh reports/nccl_environment_snapshot_$(hostname)_$(date +%Y%m%d_%H%M%S).md
-```
-
-### 单节点 H100 原始 all 报告
-
-```bash
-cd /root/test_gpu_scripts
-bash scripts/run_h100_single_node_all.sh
-```
-
-### 多机多卡 PDF 矩阵
-
-```bash
-cd /root/test_gpu_scripts
-bash scripts/run_multinode_nccl_pdf_matrix.sh
-```
-
-### 多机多卡 2x8 六项 collective 补测
-
-```bash
-cd /root/test_gpu_scripts
-bash scripts/run_multinode_nccl_all_collectives.sh
-```
-
-说明：这个入口用于补齐单机 `test all` 中已有、但多机 PDF matrix 还没覆盖的 NCCL collective。已知 PDF 2x8 阈值仍用于 `allreduce/alltoall`；新增的 `broadcast/reducescatter/allgather/sendrecv` 暂作为证据采集项，不强行套 PDF allreduce/alltoall 阈值。
-
-### 完整深度诊断
-
-```bash
-cd /root/test_gpu_scripts
-OUT_DIR=/root/test_gpu_scripts/reports/nccl_deep_diag_$(date +%Y%m%d_%H%M%S) \
-  bash scripts/multinode_nccl_deep_diagnose.sh all
-```
-
-### 启用新 NCCL plugin / SHARP 后的最小复核
-
-```bash
-cd /root/test_gpu_scripts
-OUT_DIR=/root/test_gpu_scripts/reports/nccl_deep_diag_plugin_check_$(date +%Y%m%d_%H%M%S) \
-  bash scripts/multinode_nccl_deep_diagnose.sh graph
-```
-
-复核重点：
-
-- `plugin_missing` 是否消失或明显减少。
-- NCCL 日志是否出现外部 net plugin。
-- alltoall graph 中 `P2P/CUMEM`、`NET/IB/*/GDRDMA`、`channel_edge_lines` 是否变化。
-- alltoall busbw 是否突破 `36-37 GB/s` 平台。
-
-## 关键文件
-
-| 文件 | 用途 |
-|---|---|
-| `reports_h100_acceptance_current_status_20260523.md` | 当前 H100 验收总览，汇总单节点、多机 NCCL、跨节点 RDMA 和阻塞项 |
-| `reports_multinode_nccl_diagnosis_20260523.md` | 总诊断报告 |
-| `reports_multinode_nccl_pdf_matrix_20260523_112247.md` | 上一次多机多卡 PDF matrix 原始报告 |
-| `reports_multinode_nccl_pdf_matrix_20260523_113803.md` | 最新带 artifacts 的多机多卡 PDF matrix 原始报告 |
-| `reports_multinode_nccl_pdf_matrix_run_20260523.md` | 最新多机多卡 PDF matrix 中文摘要 |
-| `reports_multinode_nccl_pdf_matrix_artifacts_manifest_20260523_113803.md` | 最新 artifacts manifest 和 checksum |
-| `reports_multinode_nccl_artifact_signal_analysis_20260523.md` | 最新 artifacts 的 IB/GDRDMA/HCA/plugin/SHARP 信号分析 |
-| `reports_multinode_nccl_all_collectives_20260523_120144.md` | 最新多机多卡 2x8 六项 collective 原始报告 |
-| `reports_multinode_nccl_all_collectives_run_20260523.md` | 最新多机多卡 2x8 六项 collective 中文摘要 |
-| `reports_multinode_nccl_all_collectives_artifacts_manifest_20260523_120144.md` | 最新多机多卡 2x8 六项 collective artifacts manifest 和 checksum |
-| `reports_multinode_nccl_deep_diagnose_run_20260523.md` | 本轮深度复跑结果 |
-| `reports_multinode_nccl_environment_gap_20260523.md` | 硬件/软件环境等价性缺口 |
-| `reports_multinode_nccl_counter_probe_20260523.md` | RDMA rail/counter 证据 |
-| `reports_multinode_nccl_alltoall_tuning_20260523.md` | alltoall 参数 sweep 和结论 |
-| `docs/multinode_nccl_deep_diagnose_runbook.md` | 诊断脚本 runbook |
-| `scripts/multinode_nccl_deep_diagnose.sh` | 可复跑诊断脚本 |
-| `scripts/nccl_environment_snapshot.sh` | 单节点 HCA/plugin/topo 快照脚本 |
-| `scripts/run_h100_single_node_all.sh` | 单节点原始 `test all` 报告入口 |
-| `scripts/run_multinode_nccl_pdf_matrix.sh` | 多机多卡 PDF 矩阵报告入口；复跑时额外归档每个 case 的完整 `cmd/stdout/stderr/json` |
-| `scripts/run_multinode_nccl_all_collectives.sh` | 多机多卡 2x8 六项 collective 补测入口；复跑时额外归档每个 case 的完整 `cmd/stdout/stderr/json` |
-| `configs/multinode_nccl_nccl227_pdf_matrix.yaml` | 多机多卡 PDF 矩阵配置 |
-| `configs/multinode_nccl_nccl227_all_collectives_2x8.yaml` | 多机多卡 2x8 六项 collective 补测配置 |
-
-## 当前建议
-
-当前不建议继续把精力放在 NCCL 环境变量微调上。更高价值的动作是：
-
-1. 确认 PDF 参考环境的 rail 数量、速率和 SHARP/plugin 状态。
-2. 补齐或明确排除 NCCL net plugin / SHARP。
-3. 让网络侧针对 alltoall 多点通信模式查跨 Leaf 路径和拥塞策略。
-4. 如果硬件不等价，调整验收阈值或换等价节点重测。
diff --git a/reports_multinode_nccl_latest_index_20260523.md b/reports_multinode_nccl_latest_index_20260523.md
deleted file mode 100644
index 129b50d..0000000
--- a/reports_multinode_nccl_latest_index_20260523.md
+++ /dev/null
@@ -1,265 +0,0 @@
-# 多节点 NCCL 最新索引 2026-05-23
-
-## 当前状态
-
-当前工作分支：`h100-acceptance-current`
-
-当前结论：
-
-- 2026-05-23 `11:38` 已完成带 artifacts 的正式多机多卡 PDF matrix 复跑，原始报告为 `reports_multinode_nccl_pdf_matrix_20260523_113803.md`，中文结论为 `reports_multinode_nccl_pdf_matrix_run_20260523.md`，artifact manifest 为 `reports_multinode_nccl_pdf_matrix_artifacts_manifest_20260523_113803.md`。
-- 已补充 artifacts 信号分析：`reports_multinode_nccl_artifact_signal_analysis_20260523.md`。结论是所有 case 都走 `IB`，都使用 `mlx5_0,mlx5_1,mlx5_6,mlx5_7`，都有 GDRDMA 信号，但没有 SHARP/CollNet/外部 NCCL net plugin 证据。
-- 已补充并实跑多机多卡 2x8 六项 collective：`reports_multinode_nccl_all_collectives_run_20260523.md`。新增 `broadcast/reducescatter/allgather/sendrecv` 均 `returncode=0`、`wrong=0`、走 `IB/GDRDMA`；已知 PDF 阈值项 `allreduce/alltoall` 仍 FAIL。
-- 六项 collective 的完整 artifacts 已归档：`reports_multinode_nccl_all_collectives_artifacts_manifest_20260523_120144.md`，远端 tar 为 `reports/multinode_nccl_all_collectives_20260523_120144_artifacts.tar.gz`。
-- 已补充当前验收状态总览：`reports_h100_acceptance_current_status_20260523.md`，把单节点、多机 NCCL、跨节点 RDMA、环境等价性和阻塞项合并到一份中文总表。
-- 已补充收尾检查清单：`reports_h100_acceptance_closure_checklist_20260523.md`，明确哪些工作可以阶段性交付、哪些验收门禁仍不能关闭。
-- 已补充网络/硬件/环境侧闭环请求：`reports_h100_network_hardware_escalation_request_20260523.md`，用于让责任侧回填 rail、plugin/SHARP、跨 Leaf 和新阈值口径。
-- 已补充交付包 manifest：`reports_h100_acceptance_delivery_manifest_20260523.md`，汇总主入口、脚本、远端 artifacts 和 checksum。
-- 2 机 1/2/4 GPU per node 档位已接近 PDF 参考值，但严格按阈值，除 2x2 allreduce（实测 `137.16 GB/s`，阈值 `136.93 GB/s`）PASS 外，其余仍 FAIL。
-- 2 机 8 GPU 档位仍未达到 PDF 参考值：
-  - allreduce 实测 `353.85 GB/s busbw`，PDF 目标 `491.84 GB/s`。
-  - alltoall 实测 `36.83 GB/s busbw`，PDF 目标 `76.54 GB/s`。
-- 当前 2 机 8 GPU 剩余差距不再像是旧 NCCL、GDR disabled、HCA 顺序、SSH/mpirun 或明显坏链路问题。
-- 当前更像是硬件 rail 数量与 PDF 不等价、NCCL net plugin / SHARP 缺失、或跨 Leaf alltoall 网络/图策略问题。
-
-## 优先阅读的文件（按顺序）
-
-| 顺序 | 文件 | 用途 |
-|---:|---|---|
-| 1 | `reports_h100_acceptance_current_status_20260523.md` | 当前 H100 验收总览，汇总单节点、多机 NCCL、跨节点 RDMA 和阻塞项 |
-| 2 | `reports_h100_acceptance_closure_checklist_20260523.md` | 收尾检查清单：可交付项、未关闭门禁、最短收尾路径 |
-| 3 | `reports_h100_acceptance_delivery_manifest_20260523.md` | 交付包 manifest：入口、脚本、远端 artifacts、checksum |
-| 4 | `reports_h100_network_hardware_escalation_request_20260523.md` | 给网络/硬件/环境侧的闭环请求和回填表 |
-| 5 | `reports_multinode_nccl_handoff_plan_20260523.md` | 给网络/硬件/环境侧的交接计划，包含决策树、要问的问题和复跑命令 |
-| 6 | `reports_multinode_nccl_environment_gap_20260523.md` | 说明当前环境为什么不能证明与 PDF 等价，重点是 4 x 400G rail 和缺少 NCCL net plugin / SHARP |
-| 7 | `reports_multinode_nccl_artifact_signal_analysis_20260523.md` | 最新 artifacts 信号分析，确认 IB/GDRDMA/HCA 使用情况和 plugin/SHARP 缺口 |
-| 8 | `reports_multinode_nccl_all_collectives_run_20260523.md` | 多机多卡 2x8 六项 collective 补测结果，补齐单机 test all 的 NCCL 覆盖面 |
-| 9 | `reports_multinode_nccl_all_collectives_artifacts_manifest_20260523_120144.md` | 多机多卡 2x8 六项 collective artifacts manifest 和 checksum |
-| 10 | `reports_multinode_nccl_pdf_matrix_run_20260523.md` | 最新正式多机多卡 PDF matrix 结果摘要 |
-| 11 | `reports_multinode_nccl_deep_diagnose_run_20260523.md` | 本轮完整深度诊断复跑结果，包含 counter、GRAPH、PXN sweep |
-
-## 关键脚本
-
-| 文件 | 用途 |
-|---|---|
-| `scripts/multinode_nccl_deep_diagnose.sh` | 可复跑的多节点 NCCL 深度诊断脚本 |
-| `scripts/nccl_environment_snapshot.sh` | 单节点 NCCL/RDMA 环境等价性快照脚本，不启动 NCCL workload |
-| `scripts/run_h100_single_node_all.sh` | 单节点 H100 `test all` 原始报告入口，默认同时采环境快照 |
-| `scripts/run_multinode_nccl_pdf_matrix.sh` | 多机多卡 PDF 矩阵入口，跑 2 机 x 1/2/4/8 GPU per node 的 allreduce/alltoall，并归档每个 case 的 command/stdout/stderr/parsed JSON |
-| `scripts/run_multinode_nccl_all_collectives.sh` | 多机多卡 2x8 六项 collective 补测入口，跑 allreduce/alltoall/broadcast/reducescatter/allgather/sendrecv，并归档每个 case |
-| `configs/multinode_nccl_nccl227_pdf_matrix.yaml` | 多机多卡 PDF 矩阵配置，固定 NCCL 2.27.7 和 `/data/nccl-tests-latest/build` |
-| `configs/multinode_nccl_nccl227_all_collectives_2x8.yaml` | 多机多卡 2x8 六项 collective 补测配置，allreduce/alltoall 保留 PDF 阈值，新增 4 项暂按证据采集 |
-| `docs/multinode_nccl_deep_diagnose_runbook.md` | 诊断脚本中文 runbook |
-
-多机多卡 PDF 矩阵：
-
-```bash
-cd /root/test_gpu_scripts
-bash scripts/run_multinode_nccl_pdf_matrix.sh
-```
-
-多机多卡 2x8 六项 collective 补测：
-
-```bash
-cd /root/test_gpu_scripts
-bash scripts/run_multinode_nccl_all_collectives.sh
-```
-
-单节点 H100 原始 all 报告：
-
-```bash
-cd /root/test_gpu_scripts
-bash scripts/run_h100_single_node_all.sh
-```
-
-推荐先跑轻量检查：
-
-```bash
-cd /root/test_gpu_scripts
-bash scripts/multinode_nccl_deep_diagnose.sh preflight
-```
-
-采集单节点环境快照：
-
-```bash
-cd /root/test_gpu_scripts
-bash scripts/nccl_environment_snapshot.sh reports/nccl_environment_snapshot_$(hostname)_$(date +%Y%m%d_%H%M%S).md
-```
-
-完整复跑：
-
-```bash
-cd /root/test_gpu_scripts
-OUT_DIR=/root/test_gpu_scripts/reports/nccl_deep_diag_$(date +%Y%m%d_%H%M%S) \
-  bash scripts/multinode_nccl_deep_diagnose.sh all
-```
-
-启用 NCCL plugin / SHARP 后的最小复核：
-
-```bash
-cd /root/test_gpu_scripts
-OUT_DIR=/root/test_gpu_scripts/reports/nccl_deep_diag_plugin_check_$(date +%Y%m%d_%H%M%S) \
-  bash scripts/multinode_nccl_deep_diagnose.sh graph
-```
-
-## 远端机器上的最新同步文件
-
-以下 10 份关键报告已经同步到两台节点：
-
-```text
-/root/test_gpu_scripts/reports_multinode_nccl_handoff_plan_20260523.md
-/root/test_gpu_scripts/reports_h100_acceptance_current_status_20260523.md
-/root/test_gpu_scripts/reports_h100_acceptance_closure_checklist_20260523.md
-/root/test_gpu_scripts/reports_h100_acceptance_delivery_manifest_20260523.md
-/root/test_gpu_scripts/reports_h100_network_hardware_escalation_request_20260523.md
-/root/test_gpu_scripts/reports_multinode_nccl_environment_gap_20260523.md
-/root/test_gpu_scripts/reports_multinode_nccl_artifact_signal_analysis_20260523.md
-/root/test_gpu_scripts/reports_multinode_nccl_all_collectives_run_20260523.md
-/root/test_gpu_scripts/reports_multinode_nccl_all_collectives_artifacts_manifest_20260523_120144.md
-/root/test_gpu_scripts/reports_multinode_nccl_deep_diagnose_run_20260523.md
-```
-
-最新完整诊断产物目录在 `aikubeworker0012`：
-
-```text
-/root/test_gpu_scripts/reports/nccl_deep_diag_20260523_103932
-```
-
-该目录包含：
-
-- `preflight.txt`
-- `allreduce_counter/`
-- `alltoall_pxn_counter/`
-- `graph/`
-- `pxn_sweep/`
-
-最新单节点环境快照：
-
-```text
-aikubeworker0012: /root/test_gpu_scripts/reports/nccl_environment_snapshot_aikubeworker0012_20260523_111142.md
-aikubeworker0016: /root/test_gpu_scripts/reports/nccl_environment_snapshot_aikubeworker0016_20260523_111143.md
-```
-
-最新多机多卡 PDF matrix：
-
-```text
-aikubeworker0012: /root/test_gpu_scripts/reports/multinode_nccl_pdf_matrix_20260523_113803.md
-artifacts: /root/test_gpu_scripts/reports/multinode_nccl_pdf_matrix_20260523_113803_artifacts
-artifacts tar: /root/test_gpu_scripts/reports/multinode_nccl_pdf_matrix_20260523_113803_artifacts.tar.gz
-local copy: reports_multinode_nccl_pdf_matrix_20260523_113803.md
-summary: reports_multinode_nccl_pdf_matrix_run_20260523.md
-manifest: reports_multinode_nccl_pdf_matrix_artifacts_manifest_20260523_113803.md
-```
-
-最新多机多卡 2x8 六项 collective 补测：
-
-```text
-aikubeworker0012: /root/test_gpu_scripts/reports/multinode_nccl_all_collectives_20260523_120144.md
-artifacts: /root/test_gpu_scripts/reports/multinode_nccl_all_collectives_20260523_120144_artifacts
-artifacts tar: /root/test_gpu_scripts/reports/multinode_nccl_all_collectives_20260523_120144_artifacts.tar.gz
-local copy: reports_multinode_nccl_all_collectives_20260523_120144.md
-summary: reports_multinode_nccl_all_collectives_run_20260523.md
-manifest: reports_multinode_nccl_all_collectives_artifacts_manifest_20260523_120144.md
-```
-
-下一次用 `scripts/run_multinode_nccl_pdf_matrix.sh` 复跑时，还会生成：
-
-```text
-/root/test_gpu_scripts/reports/multinode_nccl_pdf_matrix_YYYYMMDD_HHMMSS_artifacts/
-```
-
-目录内按 case 保存完整 `cmd/stdout/stderr/json`，用于给网络/硬件侧复核原始 NCCL 输出。
-
-下一次用 `scripts/run_multinode_nccl_all_collectives.sh` 补测时，还会生成：
-
-```text
-/root/test_gpu_scripts/reports/multinode_nccl_all_collectives_YYYYMMDD_HHMMSS_artifacts/
-```
-
-目录内按 6 个 collective 保存完整 `cmd/stdout/stderr/json`。该入口用于补齐单节点 `test all` 中已有、但多机 PDF matrix 未覆盖的 `broadcast/reducescatter/allgather/sendrecv` 证据；已知 PDF 2x8 阈值仍用于 `allreduce/alltoall`。
-
-## 当前证据摘要
-
-### HCA / rail
-
-两台节点当前有效 400G IB rail 一致：
-
-```text
-mlx5_0, mlx5_1, mlx5_6, mlx5_7
-```
-
-非等价 HCA：
-
-```text
-mlx5_4, mlx5_5: 100G InfiniBand
-mlx5_2, mlx5_8: 25G Ethernet
-mlx5_3, mlx5_9: DOWN
-```
-
-因此当前每节点可用于 NCCL 的 400G rail 是 4 条，理论单向原始带宽约 `200 GB/s`。
-
-PDF allreduce 目标 `491.84 GB/s busbw` 反推 `262.31 GB/s algbw`，超过当前 4 x 400G rail 的理论单向带宽。
-
-### NCCL / plugin
-
-当前两台节点没有找到：
-
-```text
-libnccl-net*.so*
-libsharp*.so*
-```
-
-也没有看到 SHARP/HCOLL 包。NCCL GRAPH 日志显示 `plugin_missing=16`，当前走 internal IB plugin。
-
-### 深度诊断
-
-正式 PDF matrix 复跑：
-
-| Topology | AllReduce | AllReduce Target | AllToAll | AllToAll Target |
-|---|---:|---:|---:|---:|
-| 2 nodes x 1 GPU | `47.29` | `48.90` | `24.85` | `27.25` |
-| 2 nodes x 2 GPUs | `137.16` | `136.93` | `47.76` | `54.41` |
-| 2 nodes x 4 GPUs | `335.07` | `335.48` | `72.74` | `73.73` |
-| 2 nodes x 8 GPUs | `353.85` | `491.84` | `36.83` | `76.54` |
-
-本轮完整复跑：
-
-| 项目 | 结果 |
-|---|---:|
-| allreduce 16G | `354.025 GB/s` |
-| graph allreduce 16G | `354.224 GB/s` |
-| alltoall + PXN disabled 16G | `36.9377 GB/s` |
-| graph alltoall + PXN disabled 16G | `37.14 GB/s` |
-
-PXN disabled sweep 未发现有效参数：
-
-- `channels16`、`buff8m`、`p2pchunk4m`、`ar0` 只有小幅噪声级波动。
-- `qps4_split1`、`qps8_split1`、`netpeer8` 明显负向。
-
-## 历史/支撑报告
-
-| 文件 | 说明 |
-|---|---|
-| `reports_multinode_nccl_diagnosis_20260523.md` | 长版总诊断，包含从旧 NCCL/GDR disabled 到 PDF 矩阵对齐的全过程 |
-| `reports_h100_acceptance_current_status_20260523.md` | 当前 H100 验收总览，汇总单节点、多机 NCCL、跨节点 RDMA 和阻塞项 |
-| `reports_multinode_nccl_pdf_matrix_nccl227.md` | 按 PDF 矩阵跑出的正式 raw report |
-| `reports_multinode_nccl_pdf_matrix_20260523_112247.md` | 上一次正式 PDF matrix 原始报告 |
-| `reports_multinode_nccl_pdf_matrix_20260523_113803.md` | 最新带 artifacts 的正式 PDF matrix 原始报告 |
-| `reports_multinode_nccl_pdf_matrix_run_20260523.md` | 最新正式 PDF matrix 中文摘要 |
-| `reports_multinode_nccl_pdf_matrix_artifacts_manifest_20260523_113803.md` | 最新 artifacts manifest 和 checksum |
-| `reports_multinode_nccl_artifact_signal_analysis_20260523.md` | 最新 artifacts 的 IB/GDRDMA/HCA/plugin/SHARP 信号分析 |
-| `reports_multinode_nccl_all_collectives_20260523_120144.md` | 最新多机多卡 2x8 六项 collective 原始报告 |
-| `reports_multinode_nccl_all_collectives_run_20260523.md` | 最新多机多卡 2x8 六项 collective 中文摘要 |
-| `reports_multinode_nccl_all_collectives_artifacts_manifest_20260523_120144.md` | 最新多机多卡 2x8 六项 collective artifacts manifest 和 checksum |
-| `reports_multinode_nccl_counter_probe_20260523.md` | RDMA rail 和 counter 证据 |
-| `reports_multinode_nccl_alltoall_tuning_20260523.md` | alltoall PXN 和参数 sweep 结论 |
-| `reports_rdma_single_node_summary.md` | 单节点 RDMA/HCA 速率摘要 |
-| `docs/multinode_nccl_concepts.md` | NCCL/RDMA 概念解释 |
-
-## 给下一位接手人的路线
-
-1. 先读 `reports_h100_acceptance_current_status_20260523.md`。
-2. 再读 `reports_multinode_nccl_handoff_plan_20260523.md`。
-3. 用 `reports_multinode_nccl_environment_gap_20260523.md` 和硬件/网络侧确认当前节点是否应具备 8 条 400G rail。
-4. 如果硬件不等价，调整验收口径或换等价节点复测。
-5. 如果硬件确认等价，先补齐 NCCL net plugin / SHARP，再跑 `scripts/multinode_nccl_deep_diagnose.sh graph` 对比 plugin 前后。
-6. alltoall 继续排查时优先找网络路径/ECMP/adaptive routing/拥塞策略，不建议继续盲扫 NCCL 小参数。
diff --git a/reports_multinode_nccl_pdf_matrix_20260523_112247.md b/reports_multinode_nccl_pdf_matrix_20260523_112247.md
deleted file mode 100644
index 8d07aef..0000000
--- a/reports_multinode_nccl_pdf_matrix_20260523_112247.md
+++ /dev/null
@@ -1,75 +0,0 @@
-# GPU Test Report
-
-- **Date:** 2026-05-23T11:26:21.306224
-- **Host:** aikubeworker0012
-
-## Overall Acceptance Verdict
-
-**Result: FAIL**
-
-Failed or unverified items:
-- Multi-node NCCL: FAIL
-
-## Summary
-
-| Test | Result |
-|------|--------|
-| Multi-node NCCL | FAIL |
-
-## Multi-node NCCL / Cross Leaf
-
-Source: nccl-tests-mpirun | Mode: cross-leaf-pdf-matrix-nccl-2.27.7
-
-- **Hosts:** nccl-gpu-1(172.72.8.12), nccl-gpu-2(172.72.8.16)
-- **Preflight:** PASS
-
-### Multi-node NCCL allreduce
-
-| Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
-|----------|----------------------|-------------|-----------|------------|-----------|--------|
-| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | - | 47.15 GB/s | 16G | 47.18 GB/s | >= 48.90 GB/s | FAIL |
-| 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs) | - | 136.62 GB/s | 16G | 136.67 GB/s | >= 136.93 GB/s | FAIL |
-| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | 0,1,4,5 | 335.19 GB/s | 16G | 334.85 GB/s | >= 335.48 GB/s | FAIL |
-| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | - | 354.56 GB/s | 16G | 354.21 GB/s | >= 491.84 GB/s | FAIL |
-
-| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
-|----------|--------------|-----------------|------------------|-------------------|
-| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
-| 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
-| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
-| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
-
-| Topology | Return Code | Error / Output Tail |
-|----------|-------------|---------------------|
-| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | 0 | ranks 2 cudaDev 0 busId 18000 - Destroy COMPLETE aikubeworker0016:1321368:1321509 [0] NCCL INFO comm 0x56428b645570 rank 1 nranks 2 cudaDev 0 busId 18000 - Destroy COMPLETE # Out of bounds values : 0 OK # Avg bus bandwidth    : 47.1841  #   |
-| 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs) | 0 | ranks 4 cudaDev 1 busId 2a000 - Destroy COMPLETE aikubeworker0012:2199872:2199936 [0] NCCL INFO comm 0x561da4512280 rank 0 nranks 4 cudaDev 0 busId 18000 - Destroy COMPLETE # Out of bounds values : 0 OK # Avg bus bandwidth    : 136.668  #   |
-| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | 0 | ranks 8 cudaDev 0 busId 18000 - Destroy COMPLETE aikubeworker0016:1321707:1321805 [0] NCCL INFO comm 0x562bad8777a0 rank 4 nranks 8 cudaDev 0 busId 18000 - Destroy COMPLETE # Out of bounds values : 0 OK # Avg bus bandwidth    : 334.846  #   |
-| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | 0 | nks 16 cudaDev 0 busId 18000 - Destroy COMPLETE aikubeworker0016:1321873:1322056 [0] NCCL INFO comm 0x55ba6708f500 rank 8 nranks 16 cudaDev 0 busId 18000 - Destroy COMPLETE # Out of bounds values : 0 OK # Avg bus bandwidth    : 354.211  #   |
-
-### Multi-node NCCL alltoall
-
-| Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
-|----------|----------------------|-------------|-----------|------------|-----------|--------|
-| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | - | 24.85 GB/s | 16G | 24.92 GB/s | >= 27.25 GB/s | FAIL |
-| 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs) | - | 47.71 GB/s | 16G | 47.93 GB/s | >= 54.41 GB/s | FAIL |
-| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | 0,1,4,5 | 72.63 GB/s | 16G | 72.67 GB/s | >= 73.73 GB/s | FAIL |
-| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | - | 36.82 GB/s | 16G | 36.86 GB/s | >= 76.54 GB/s | FAIL |
-
-| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
-|----------|--------------|-----------------|------------------|-------------------|
-| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
-| 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
-| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
-| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
-
-| Topology | Return Code | Error / Output Tail |
-|----------|-------------|---------------------|
-| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | 0 | nranks 2 cudaDev 0 busId 18000 - Destroy COMPLETE aikubeworker0016:1322113:1322193 [0] NCCL INFO comm 0x55b760411150 rank 1 nranks 2 cudaDev 0 busId 18000 - Destroy COMPLETE # Out of bounds values : 0 OK # Avg bus bandwidth    : 24.917  #   |
-| 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs) | 0 | ker0012:2200344:2200469 [1] NCCL INFO comm 0x55efef439da0 rank 1 nranks 4 cudaDev 1 busId 2a000 - Destroy COMPLETE aikubeworker0016:1322250:1322338 [1] NCCL INFO comm 0x558ecf546380 rank 3 nranks 4 cudaDev 1 busId 2a000 - Destroy COMPLETE   |
-| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | 0 | ranks 8 cudaDev 0 busId 18000 - Destroy COMPLETE aikubeworker0012:2200479:2200573 [0] NCCL INFO comm 0x55db60daef30 rank 0 nranks 8 cudaDev 0 busId 18000 - Destroy COMPLETE # Out of bounds values : 0 OK # Avg bus bandwidth    : 72.6664  #   |
-| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | 0 | r0012:2200587:2200767 [5] NCCL INFO comm 0x5556a6f71620 rank 5 nranks 16 cudaDev 5 busId ab000 - Destroy COMPLETE aikubeworker0012:2200588:2200772 [6] NCCL INFO comm 0x5585a1623170 rank 6 nranks 16 cudaDev 6 busId ba000 - Destroy COMPLETE   |
-
-**Overall: FAIL**
-
----
-*Generated by GPU Test Suite v0.2.0*
diff --git a/reports_multinode_nccl_pdf_matrix_20260523_113803.md b/reports_multinode_nccl_pdf_matrix_20260523_113803.md
deleted file mode 100644
index 06b509e..0000000
--- a/reports_multinode_nccl_pdf_matrix_20260523_113803.md
+++ /dev/null
@@ -1,75 +0,0 @@
-# GPU Test Report
-
-- **Date:** 2026-05-23T11:41:35.567886
-- **Host:** aikubeworker0012
-
-## Overall Acceptance Verdict
-
-**Result: FAIL**
-
-Failed or unverified items:
-- Multi-node NCCL: FAIL
-
-## Summary
-
-| Test | Result |
-|------|--------|
-| Multi-node NCCL | FAIL |
-
-## Multi-node NCCL / Cross Leaf
-
-Source: nccl-tests-mpirun | Mode: cross-leaf-pdf-matrix-nccl-2.27.7
-
-- **Artifacts:** `/root/test_gpu_scripts/reports/multinode_nccl_pdf_matrix_20260523_113803_artifacts`
-- **Hosts:** nccl-gpu-1(172.72.8.12), nccl-gpu-2(172.72.8.16)
-- **Preflight:** PASS
-
-### Multi-node NCCL allreduce
-
-| Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
-|----------|----------------------|-------------|-----------|------------|-----------|--------|
-| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | - | 47.29 GB/s | 16G | 47.26 GB/s | >= 48.90 GB/s | FAIL |
-| 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs) | - | 137.16 GB/s | 16G | 137.13 GB/s | >= 136.93 GB/s | PASS |
-| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | 0,1,4,5 | 335.07 GB/s | 16G | 335.02 GB/s | >= 335.48 GB/s | FAIL |
-| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | - | 353.85 GB/s | 16G | 353.85 GB/s | >= 491.84 GB/s | FAIL |
-
-| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
-|----------|--------------|-----------------|------------------|-------------------|
-| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
-| 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
-| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
-| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
-
-| Topology | Return Code | Error / Output Tail |
-|----------|-------------|---------------------|
-| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | 0 | ranks 2 cudaDev 0 busId 18000 - Destroy COMPLETE aikubeworker0012:2203142:2203200 [0] NCCL INFO comm 0x55e463572510 rank 0 nranks 2 cudaDev 0 busId 18000 - Destroy COMPLETE # Out of bounds values : 0 OK # Avg bus bandwidth    : 47.2628  #   |
-| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | 0 | ranks 8 cudaDev 0 busId 18000 - Destroy COMPLETE aikubeworker0012:2203280:2203363 [0] NCCL INFO comm 0x55e2f3808c60 rank 0 nranks 8 cudaDev 0 busId 18000 - Destroy COMPLETE # Out of bounds values : 0 OK # Avg bus bandwidth    : 335.021  #   |
-| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | 0 | nks 16 cudaDev 0 busId 18000 - Destroy COMPLETE aikubeworker0012:2203376:2203528 [0] NCCL INFO comm 0x55a5166a30c0 rank 0 nranks 16 cudaDev 0 busId 18000 - Destroy COMPLETE # Out of bounds values : 0 OK # Avg bus bandwidth    : 353.854  #   |
-
-### Multi-node NCCL alltoall
-
-| Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
-|----------|----------------------|-------------|-----------|------------|-----------|--------|
-| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | - | 24.85 GB/s | 16G | 24.90 GB/s | >= 27.25 GB/s | FAIL |
-| 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs) | - | 47.76 GB/s | 16G | 47.98 GB/s | >= 54.41 GB/s | FAIL |
-| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | 0,1,4,5 | 72.74 GB/s | 16G | 72.80 GB/s | >= 73.73 GB/s | FAIL |
-| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | - | 36.83 GB/s | 16G | 36.85 GB/s | >= 76.54 GB/s | FAIL |
-
-| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
-|----------|--------------|-----------------|------------------|-------------------|
-| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
-| 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
-| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
-| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
-
-| Topology | Return Code | Error / Output Tail |
-|----------|-------------|---------------------|
-| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | 0 | ranks 2 cudaDev 0 busId 18000 - Destroy COMPLETE aikubeworker0012:2203543:2203602 [0] NCCL INFO comm 0x55af2a804ba0 rank 0 nranks 2 cudaDev 0 busId 18000 - Destroy COMPLETE # Out of bounds values : 0 OK # Avg bus bandwidth    : 24.9006  #   |
-| 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs) | 0 | ker0012:2203610:2203792 [1] NCCL INFO comm 0x55e99a564500 rank 1 nranks 4 cudaDev 1 busId 2a000 - Destroy COMPLETE aikubeworker0016:1325607:1325696 [0] NCCL INFO comm 0x55eaaa7389c0 rank 2 nranks 4 cudaDev 0 busId 18000 - Destroy COMPLETE   |
-| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | 0 | ranks 8 cudaDev 0 busId 18000 - Destroy COMPLETE aikubeworker0016:1325765:1325869 [3] NCCL INFO comm 0x55cb0f1c9c10 rank 7 nranks 8 cudaDev 3 busId ab000 - Destroy COMPLETE # Out of bounds values : 0 OK # Avg bus bandwidth    : 72.7968  #   |
-| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | 0 | 0016:1325927:1326140 [2] NCCL INFO comm 0x5627d2adee20 rank 10 nranks 16 cudaDev 2 busId 3a000 - Destroy COMPLETE aikubeworker0016:1325926:1326135 [1] NCCL INFO comm 0x55c00c344ea0 rank 9 nranks 16 cudaDev 1 busId 2a000 - Destroy COMPLETE   |
-
-**Overall: FAIL**
-
----
-*Generated by GPU Test Suite v0.2.0*
\ No newline at end of file
diff --git a/reports_multinode_nccl_pdf_matrix_artifacts_manifest_20260523_113803.md b/reports_multinode_nccl_pdf_matrix_artifacts_manifest_20260523_113803.md
deleted file mode 100644
index a398123..0000000
--- a/reports_multinode_nccl_pdf_matrix_artifacts_manifest_20260523_113803.md
+++ /dev/null
@@ -1,33 +0,0 @@
-# 多机多卡 NCCL PDF Matrix Artifacts Manifest 2026-05-23
-
-- Remote report: `reports/multinode_nccl_pdf_matrix_20260523_113803.md`
-- Remote artifact dir: `reports/multinode_nccl_pdf_matrix_20260523_113803_artifacts`
-- Remote artifact tar: `reports/multinode_nccl_pdf_matrix_20260523_113803_artifacts.tar.gz`
-- Case count: `8`
-- Artifact files: `32`
-
-## Case Summary
-
-| Case | Peak Bus BW | Avg Bus BW | Threshold | Wrong | Return Code | Status |
-|---|---:|---:|---:|---:|---:|---|
-| `allreduce_2x1_2_nodes_x_1_GPU_PDF_2_machines_2_GPUs` | 47.29 | 47.26 | 48.90 | 0 | 0 | FAIL |
-| `allreduce_2x2_2_nodes_x_2_GPUs_PDF_2_machines_4_GPUs` | 137.16 | 137.13 | 136.93 | 0 | 0 | PASS |
-| `allreduce_2x4_2_nodes_x_4_GPUs_PDF_2_machines_8_GPUs` | 335.07 | 335.02 | 335.48 | 0 | 0 | FAIL |
-| `allreduce_2x8_2_nodes_x_8_GPUs_PDF_2_machines_16_GPUs` | 353.85 | 353.85 | 491.84 | 0 | 0 | FAIL |
-| `alltoall_2x1_2_nodes_x_1_GPU_PDF_2_machines_2_GPUs` | 24.85 | 24.90 | 27.25 | 0 | 0 | FAIL |
-| `alltoall_2x2_2_nodes_x_2_GPUs_PDF_2_machines_4_GPUs` | 47.76 | 47.98 | 54.41 | 0 | 0 | FAIL |
-| `alltoall_2x4_2_nodes_x_4_GPUs_PDF_2_machines_8_GPUs` | 72.74 | 72.80 | 73.73 | 0 | 0 | FAIL |
-| `alltoall_2x8_2_nodes_x_8_GPUs_PDF_2_machines_16_GPUs` | 36.83 | 36.85 | 76.54 | 0 | 0 | FAIL |
-
-## Checksums
-
-```text
-682ac637460472d464a0d56ccc0f3335ed7f79a270157a403ebec23b8d9feceb  reports/multinode_nccl_pdf_matrix_20260523_113803.md
-7371fcaf7269f92eb1544e5e63573ebf77f4ae38f668b5b22169ca86e6d603ee  reports/multinode_nccl_pdf_matrix_20260523_113803_artifacts.tar.gz
-```
-
-Per-file artifact checksums are on the remote node at:
-
-```text
-reports/multinode_nccl_pdf_matrix_20260523_113803_artifacts.sha256
-```
diff --git a/reports_multinode_nccl_pdf_matrix_nccl227.md b/reports_multinode_nccl_pdf_matrix_nccl227.md
deleted file mode 100644
index c04d023..0000000
--- a/reports_multinode_nccl_pdf_matrix_nccl227.md
+++ /dev/null
@@ -1,84 +0,0 @@
-# GPU Test Report
-
-- **Date:** 2026-05-23T08:58:19.911230
-- **Host:** aikubeworker0012
-
-## Overall Acceptance Verdict
-
-**Result: FAIL**
-
-Missing required evidence:
-- GPU Info
-- Health Check
-- Memory Bandwidth
-- Compute Throughput
-- NVLink/NVSwitch
-- NCCL
-- Stress Test
-- RDMA
-- DCGM
-- Training
-
-## Summary
-
-| Test | Result |
-|------|--------|
-| Multi-node NCCL | FAIL |
-
-## Multi-node NCCL / Cross Leaf
-
-Source: nccl-tests-mpirun | Mode: cross-leaf-pdf-matrix-nccl-2.27.7
-
-- **Hosts:** nccl-gpu-1(172.72.8.12), nccl-gpu-2(172.72.8.16)
-- **Preflight:** PASS
-
-### Multi-node NCCL allreduce
-
-| Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
-|----------|----------------------|-------------|-----------|------------|-----------|--------|
-| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | - | 47.26 GB/s | 16G | 47.19 GB/s | >= 49 GB/s | FAIL |
-| 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs) | - | 136.36 GB/s | 16G | 136.69 GB/s | >= 137 GB/s | FAIL |
-| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | 0,1,4,5 | 333.23 GB/s | 16G | 333.45 GB/s | >= 335 GB/s | FAIL |
-| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | - | 353.47 GB/s | 16G | 353.86 GB/s | >= 492 GB/s | FAIL |
-
-| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
-|----------|--------------|-----------------|------------------|-------------------|
-| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
-| 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
-| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
-| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
-
-| Topology | Return Code | Error / Output Tail |
-|----------|-------------|---------------------|
-| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | 0 | TE aikubeworker0012:2165982:2166060 [0] NCCL INFO comm 0x55d452f2df80 rank 0 nranks 2 cudaDev 0 busId 18000 - Destroy COMPLETE # Out of bounds values : 0 OK # Avg bus bandwidth    : 47.189  # # Collective test concluded: all_reduce_perf #   |
-| 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs) | 0 | ker0016:1221425:1222411 [0] NCCL INFO comm 0x56437384f040 rank 2 nranks 4 cudaDev 0 busId 18000 - Destroy COMPLETE aikubeworker0016:1221427:1222412 [1] NCCL INFO comm 0x55ab9313f950 rank 3 nranks 4 cudaDev 1 busId 2a000 - Destroy COMPLETE   |
-| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | 0 | E aikubeworker0012:2166160:2166257 [0] NCCL INFO comm 0x557243829d50 rank 0 nranks 8 cudaDev 0 busId 18000 - Destroy COMPLETE # Out of bounds values : 0 OK # Avg bus bandwidth    : 333.449  # # Collective test concluded: all_reduce_perf #   |
-| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | 0 | r0012:2166272:2166442 [5] NCCL INFO comm 0x55721e270960 rank 5 nranks 16 cudaDev 5 busId ab000 - Destroy COMPLETE aikubeworker0012:2166268:2166447 [1] NCCL INFO comm 0x5644fafd24e0 rank 1 nranks 16 cudaDev 1 busId 2a000 - Destroy COMPLETE   |
-
-### Multi-node NCCL alltoall
-
-| Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
-|----------|----------------------|-------------|-----------|------------|-----------|--------|
-| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | - | 24.87 GB/s | 16G | 24.93 GB/s | >= 27 GB/s | FAIL |
-| 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs) | - | 47.69 GB/s | 16G | 47.93 GB/s | >= 54 GB/s | FAIL |
-| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | 0,1,4,5 | 72.82 GB/s | 16G | 72.87 GB/s | >= 74 GB/s | FAIL |
-| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | - | 36.70 GB/s | 16G | 36.74 GB/s | >= 77 GB/s | FAIL |
-
-| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
-|----------|--------------|-----------------|------------------|-------------------|
-| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
-| 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
-| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
-| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
-
-| Topology | Return Code | Error / Output Tail |
-|----------|-------------|---------------------|
-| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | 0 | ETE aikubeworker0012:2166458:2166534 [0] NCCL INFO comm 0x5603baefb150 rank 0 nranks 2 cudaDev 0 busId 18000 - Destroy COMPLETE # Out of bounds values : 0 OK # Avg bus bandwidth    : 24.9304  # # Collective test concluded: alltoall_perf #   |
-| 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs) | 0 | ETE aikubeworker0012:2166543:2166743 [0] NCCL INFO comm 0x5569d31d4f50 rank 0 nranks 4 cudaDev 0 busId 18000 - Destroy COMPLETE # Out of bounds values : 0 OK # Avg bus bandwidth    : 47.9258  # # Collective test concluded: alltoall_perf #   |
-| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | 0 | ker0016:1227342:1228382 [1] NCCL INFO comm 0x55cdec231780 rank 5 nranks 8 cudaDev 1 busId 2a000 - Destroy COMPLETE aikubeworker0016:1227344:1228381 [3] NCCL INFO comm 0x563c7ed39680 rank 7 nranks 8 cudaDev 3 busId ab000 - Destroy COMPLETE   |
-| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | 0 | TE aikubeworker0012:2166925:2167127 [7] NCCL INFO comm 0x560553b91250 rank 7 nranks 16 cudaDev 7 busId db000 - Destroy COMPLETE # Out of bounds values : 0 OK # Avg bus bandwidth    : 36.7382  # # Collective test concluded: alltoall_perf #   |
-
-**Overall: FAIL**
-
----
-*Generated by GPU Test Suite v0.2.0*
\ No newline at end of file
diff --git a/reports_multinode_nccl_pdf_matrix_run_20260523.md b/reports_multinode_nccl_pdf_matrix_run_20260523.md
deleted file mode 100644
index 0006ea7..0000000
--- a/reports_multinode_nccl_pdf_matrix_run_20260523.md
+++ /dev/null
@@ -1,67 +0,0 @@
-# 多机多卡 NCCL PDF 矩阵实测 2026-05-23
-
-执行节点：`aikubeworker0012`
-
-对端节点：`aikubeworker0016`
-
-原始报告：`reports_multinode_nccl_pdf_matrix_20260523_113803.md`
-
-远端报告：`/root/test_gpu_scripts/reports/multinode_nccl_pdf_matrix_20260523_113803.md`
-
-远端 artifacts：`/root/test_gpu_scripts/reports/multinode_nccl_pdf_matrix_20260523_113803_artifacts`
-
-远端 artifacts tar：`/root/test_gpu_scripts/reports/multinode_nccl_pdf_matrix_20260523_113803_artifacts.tar.gz`
-
-Artifacts manifest：`reports_multinode_nccl_pdf_matrix_artifacts_manifest_20260523_113803.md`
-
-执行命令：
-
-```bash
-cd /root/test_gpu_scripts
-bash scripts/run_multinode_nccl_pdf_matrix.sh
-```
-
-## 结论
-
-本轮正式矩阵已跑通，`mpirun`、SSH、`nccl-tests`、GDRDMA、4 条 400G HCA 都可用；失败不是启动失败或功能错误，而是 bus bandwidth 未达到 PDF 阈值。
-
-所有 case 的 return code 都是 `0`，`Out of bounds values` 为 `0 OK`，说明 NCCL 正确性没有报错。FAIL 来自性能阈值。
-
-## Preflight
-
-| 项目 | 结果 |
-|---|---|
-| OpenMPI | PASS，`/usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun` |
-| all_reduce_perf | PASS，`/data/nccl-tests-latest/build/all_reduce_perf` |
-| alltoall_perf | PASS，`/data/nccl-tests-latest/build/alltoall_perf` |
-| SSH 172.72.8.12 | PASS |
-| SSH 172.72.8.16 | PASS |
-| HCA | 两端 `mlx5_0,mlx5_1,mlx5_6,mlx5_7` 均为 `400 Gb/sec (4X NDR)` ACTIVE |
-| NCCL network | IB |
-| GPU Direct RDMA | ENABLED |
-
-## AllReduce
-
-| Topology | Peak Bus BW | Avg Bus BW | PDF Threshold | Gap | Status |
-|---|---:|---:|---:|---:|---|
-| 2 nodes x 1 GPU | 47.29 GB/s | 47.26 GB/s | >= 48.90 GB/s | -1.61 GB/s | FAIL |
-| 2 nodes x 2 GPUs | 137.16 GB/s | 137.13 GB/s | >= 136.93 GB/s | +0.23 GB/s | PASS |
-| 2 nodes x 4 GPUs | 335.07 GB/s | 335.02 GB/s | >= 335.48 GB/s | -0.41 GB/s | FAIL |
-| 2 nodes x 8 GPUs | 353.85 GB/s | 353.85 GB/s | >= 491.84 GB/s | -137.99 GB/s | FAIL |
-
-## AllToAll
-
-| Topology | Peak Bus BW | Avg Bus BW | PDF Threshold | Gap | Status |
-|---|---:|---:|---:|---:|---|
-| 2 nodes x 1 GPU | 24.85 GB/s | 24.90 GB/s | >= 27.25 GB/s | -2.40 GB/s | FAIL |
-| 2 nodes x 2 GPUs | 47.76 GB/s | 47.98 GB/s | >= 54.41 GB/s | -6.65 GB/s | FAIL |
-| 2 nodes x 4 GPUs | 72.74 GB/s | 72.80 GB/s | >= 73.73 GB/s | -0.99 GB/s | FAIL |
-| 2 nodes x 8 GPUs | 36.83 GB/s | 36.85 GB/s | >= 76.54 GB/s | -39.71 GB/s | FAIL |
-
-## 判断
-
-1. 2x2 的 AllReduce 本次过线，2x4 的 AllReduce 非常接近 PDF 阈值，差 `0.41 GB/s`。
-2. 2x4 的 AllToAll 也接近阈值，差 `0.99 GB/s`。
-3. 2x8 是主要问题：AllReduce 只有 `353.85 / 491.84`，AllToAll 只有 `36.83 / 76.54`。
-4. 当前环境已经确认只有 4 条 400G IB rail 参与 NCCL，且没有发现外部 NCCL net plugin / SHARP；这仍是解释 2x8 目标不可达或严重掉速的最强证据。
-5. 本轮没有看到 GDR disabled 或 HCA 不可用，所以下一步不应继续纠结 SSH/mpirun/nccl-tests 启动链路，而应对齐 PDF 参考环境的 rail 数量、net plugin/SHARP、交换机跨 Leaf 策略。
diff --git a/reports_multinode_nccl_smoke_256m_aikubeworker0012.json b/reports_multinode_nccl_smoke_256m_aikubeworker0012.json
deleted file mode 100644
index 72c30ce..0000000
--- a/reports_multinode_nccl_smoke_256m_aikubeworker0012.json
+++ /dev/null
@@ -1,439 +0,0 @@
-{
-  "multinode_nccl": {
-    "passed": false,
-    "source": "nccl-tests-mpirun",
-    "mode": "sweep",
-    "hosts": [
-      {
-        "name": "nccl-gpu-1",
-        "addr": "172.72.8.12",
-        "slots": 8
-      },
-      {
-        "name": "nccl-gpu-2",
-        "addr": "172.72.8.16",
-        "slots": 8
-      }
-    ],
-    "preflight": {
-      "checks": [
-        {
-          "name": "mpirun",
-          "status": "PASS",
-          "detail": "/usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun"
-        },
-        {
-          "name": "hosts",
-          "status": "PASS",
-          "detail": "2 configured"
-        },
-        {
-          "name": "all_reduce_perf",
-          "status": "PASS",
-          "detail": "/opt/gpu-test-tools/nccl-tests/build/all_reduce_perf"
-        },
-        {
-          "name": "alltoall_perf",
-          "status": "PASS",
-          "detail": "/opt/gpu-test-tools/nccl-tests/build/alltoall_perf"
-        },
-        {
-          "name": "ssh 172.72.8.12",
-          "status": "WARN",
-          "detail": "Host key verification failed."
-        },
-        {
-          "name": "ssh 172.72.8.16",
-          "status": "PASS",
-          "detail": "aikubeworker0016"
-        }
-      ],
-      "passed": true
-    },
-    "tests": {
-      "allreduce": {
-        "binary": "/opt/gpu-test-tools/nccl-tests/build/all_reduce_perf",
-        "topologies": [
-          {
-            "label": "2 nodes x 8 GPUs",
-            "nodes": 2,
-            "gpus_per_node": 8,
-            "ranks": 16,
-            "hosts": [
-              {
-                "name": "nccl-gpu-1",
-                "addr": "172.72.8.12",
-                "slots": 8
-              },
-              {
-                "name": "nccl-gpu-2",
-                "addr": "172.72.8.16",
-                "slots": 8
-              }
-            ],
-            "command": "/usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun --allow-run-as-root --mca btl_openib_warn_no_device_params_found 0 --mca btl_tcp_if_include bond0 -H 172.72.8.12:8,172.72.8.16:8 --map-by ppr:8:node -np 16 -x NCCL_DEBUG=WARN -x NCCL_SOCKET_IFNAME=bond0 -x NCCL_IB_GID_INDEX=3 -x NCCL_IB_SL=5 -x NCCL_IB_TC=136 -x NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7 -x NCCL_IB_TIMEOUT=22 -x NCCL_IB_QPS_PER_CONNECTION=4 -x NCCL_MIN_NCHANNELS=4 -x NCCL_NET_PLUGIN=none -x NCCL_NVLS_ENABLE=1 -x NCCL_IB_SPLIT_DATA_ON_QPS=1 -x LD_LIBRARY_PATH=/usr/mpi/gcc/openmpi-4.1.9a1/lib:/root/gpu-test-venv/lib/python3.10/site-packages/nvidia/nccl/lib:/usr/local/cuda-12.4/targets/x86_64-linux/lib /opt/gpu-test-tools/nccl-tests/build/all_reduce_perf -b 1k -e 256M -g 1 -f 2 -w 2",
-            "returncode": 0,
-            "status": "FAIL",
-            "peak_busbw_gbps": 39.32,
-            "peak_algbw_gbps": 20.97,
-            "peak_size": "4M",
-            "avg_busbw_gbps": 9.1,
-            "min_required_gbps": 100.0,
-            "wrong_count": 0,
-            "by_size": [
-              {
-                "size_bytes": 1024,
-                "size": "1K",
-                "time_us": 80.32,
-                "algbw_gbps": 0.01,
-                "busbw_gbps": 0.02,
-                "wrong": 0
-              },
-              {
-                "size_bytes": 2048,
-                "size": "2K",
-                "time_us": 35.79,
-                "algbw_gbps": 0.06,
-                "busbw_gbps": 0.11,
-                "wrong": 0
-              },
-              {
-                "size_bytes": 4096,
-                "size": "4K",
-                "time_us": 37.49,
-                "algbw_gbps": 0.11,
-                "busbw_gbps": 0.2,
-                "wrong": 0
-              },
-              {
-                "size_bytes": 8192,
-                "size": "8K",
-                "time_us": 40.32,
-                "algbw_gbps": 0.2,
-                "busbw_gbps": 0.38,
-                "wrong": 0
-              },
-              {
-                "size_bytes": 16384,
-                "size": "16K",
-                "time_us": 43.04,
-                "algbw_gbps": 0.38,
-                "busbw_gbps": 0.71,
-                "wrong": 0
-              },
-              {
-                "size_bytes": 32768,
-                "size": "32K",
-                "time_us": 43.32,
-                "algbw_gbps": 0.76,
-                "busbw_gbps": 1.42,
-                "wrong": 0
-              },
-              {
-                "size_bytes": 65536,
-                "size": "64K",
-                "time_us": 47.45,
-                "algbw_gbps": 1.38,
-                "busbw_gbps": 2.59,
-                "wrong": 0
-              },
-              {
-                "size_bytes": 131072,
-                "size": "128K",
-                "time_us": 89.3,
-                "algbw_gbps": 1.47,
-                "busbw_gbps": 2.75,
-                "wrong": 0
-              },
-              {
-                "size_bytes": 262144,
-                "size": "256K",
-                "time_us": 165.38,
-                "algbw_gbps": 1.59,
-                "busbw_gbps": 2.97,
-                "wrong": 0
-              },
-              {
-                "size_bytes": 524288,
-                "size": "512K",
-                "time_us": 4292.69,
-                "algbw_gbps": 0.12,
-                "busbw_gbps": 0.23,
-                "wrong": 0
-              },
-              {
-                "size_bytes": 1048576,
-                "size": "1M",
-                "time_us": 139.29,
-                "algbw_gbps": 7.53,
-                "busbw_gbps": 14.12,
-                "wrong": 0
-              },
-              {
-                "size_bytes": 2097152,
-                "size": "2M",
-                "time_us": 4195.12,
-                "algbw_gbps": 0.5,
-                "busbw_gbps": 0.94,
-                "wrong": 0
-              },
-              {
-                "size_bytes": 4194304,
-                "size": "4M",
-                "time_us": 199.99,
-                "algbw_gbps": 20.97,
-                "busbw_gbps": 39.32,
-                "wrong": 0
-              },
-              {
-                "size_bytes": 8388608,
-                "size": "8M",
-                "time_us": 6159.0,
-                "algbw_gbps": 1.36,
-                "busbw_gbps": 2.55,
-                "wrong": 0
-              },
-              {
-                "size_bytes": 16777216,
-                "size": "16M",
-                "time_us": 6336.73,
-                "algbw_gbps": 2.65,
-                "busbw_gbps": 4.96,
-                "wrong": 0
-              },
-              {
-                "size_bytes": 33554432,
-                "size": "32M",
-                "time_us": 12623.3,
-                "algbw_gbps": 2.66,
-                "busbw_gbps": 4.98,
-                "wrong": 0
-              },
-              {
-                "size_bytes": 67108864,
-                "size": "64M",
-                "time_us": 17005.6,
-                "algbw_gbps": 3.95,
-                "busbw_gbps": 7.4,
-                "wrong": 0
-              },
-              {
-                "size_bytes": 134217728,
-                "size": "128M",
-                "time_us": 23826.7,
-                "algbw_gbps": 5.63,
-                "busbw_gbps": 10.56,
-                "wrong": 0
-              },
-              {
-                "size_bytes": 268435456,
-                "size": "256M",
-                "time_us": 47356.5,
-                "algbw_gbps": 5.67,
-                "busbw_gbps": 10.63,
-                "wrong": 0
-              }
-            ],
-            "stderr_tail": "",
-            "stdout_tail": "   6.25       0\n     1048576        262144     float     sum      -1   139.29    7.53   14.12       0  3552.34    0.30    0.55       0\n     2097152        524288     float     sum      -1  4195.12    0.50    0.94       0   158.81   13.21   24.76       0\n     4194304       1048576     float     sum      -1   199.99   20.97   39.32       0  3623.39    1.16    2.17       0\n     8388608       2097152     float     sum      -1  6159.00    1.36    2.55       0   324.45   25.85   48.48       0\n    16777216       4194304     float     sum      -1  6336.73    2.65    4.96       0   600.96   27.92   52.35       0\n    33554432       8388608     float     sum      -1  12623.3    2.66    4.98       0   949.39   35.34   66.27       0\n    67108864      16777216     float     sum      -1  17005.6    3.95    7.40       0  17175.5    3.91    7.33       0\n   134217728      33554432     float     sum      -1  23826.7    5.63   10.56       0  25793.0    5.20    9.76       0\n   268435456      67108864     float     sum      -1  47356.5    5.67   10.63       0  43195.8    6.21   11.65       0\n# Out of bounds values : 0 OK\n# Avg bus bandwidth    : 9.0956 \n#\n# Collective test concluded: all_reduce_perf\n#\n\n",
-            "started_at": "2026-05-23T04:59:28.584786",
-            "finished_at": "2026-05-23T04:59:54.886123"
-          }
-        ]
-      },
-      "alltoall": {
-        "binary": "/opt/gpu-test-tools/nccl-tests/build/alltoall_perf",
-        "topologies": [
-          {
-            "label": "2 nodes x 8 GPUs",
-            "nodes": 2,
-            "gpus_per_node": 8,
-            "ranks": 16,
-            "hosts": [
-              {
-                "name": "nccl-gpu-1",
-                "addr": "172.72.8.12",
-                "slots": 8
-              },
-              {
-                "name": "nccl-gpu-2",
-                "addr": "172.72.8.16",
-                "slots": 8
-              }
-            ],
-            "command": "/usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun --allow-run-as-root --mca btl_openib_warn_no_device_params_found 0 --mca btl_tcp_if_include bond0 -H 172.72.8.12:8,172.72.8.16:8 --map-by ppr:8:node -np 16 -x NCCL_DEBUG=WARN -x NCCL_SOCKET_IFNAME=bond0 -x NCCL_IB_GID_INDEX=3 -x NCCL_IB_SL=5 -x NCCL_IB_TC=136 -x NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7 -x NCCL_IB_TIMEOUT=22 -x NCCL_IB_QPS_PER_CONNECTION=4 -x NCCL_MIN_NCHANNELS=4 -x NCCL_NET_PLUGIN=none -x NCCL_NVLS_ENABLE=1 -x NCCL_IB_SPLIT_DATA_ON_QPS=1 -x LD_LIBRARY_PATH=/usr/mpi/gcc/openmpi-4.1.9a1/lib:/root/gpu-test-venv/lib/python3.10/site-packages/nvidia/nccl/lib:/usr/local/cuda-12.4/targets/x86_64-linux/lib /opt/gpu-test-tools/nccl-tests/build/alltoall_perf -b 1k -e 256M -g 1 -f 2 -w 2",
-            "returncode": 0,
-            "status": "FAIL",
-            "peak_busbw_gbps": 8.64,
-            "peak_algbw_gbps": 9.21,
-            "peak_size": "2M",
-            "avg_busbw_gbps": 2.19,
-            "min_required_gbps": 20.0,
-            "wrong_count": 0,
-            "by_size": [
-              {
-                "size_bytes": 1024,
-                "size": "1K",
-                "time_us": 58.44,
-                "algbw_gbps": 0.02,
-                "busbw_gbps": 0.02,
-                "wrong": 0
-              },
-              {
-                "size_bytes": 2048,
-                "size": "2K",
-                "time_us": 47.2,
-                "algbw_gbps": 0.04,
-                "busbw_gbps": 0.04,
-                "wrong": 0
-              },
-              {
-                "size_bytes": 4096,
-                "size": "4K",
-                "time_us": 47.68,
-                "algbw_gbps": 0.09,
-                "busbw_gbps": 0.08,
-                "wrong": 0
-              },
-              {
-                "size_bytes": 8192,
-                "size": "8K",
-                "time_us": 48.78,
-                "algbw_gbps": 0.17,
-                "busbw_gbps": 0.16,
-                "wrong": 0
-              },
-              {
-                "size_bytes": 16384,
-                "size": "16K",
-                "time_us": 79.34,
-                "algbw_gbps": 0.21,
-                "busbw_gbps": 0.19,
-                "wrong": 0
-              },
-              {
-                "size_bytes": 32768,
-                "size": "32K",
-                "time_us": 68.8,
-                "algbw_gbps": 0.48,
-                "busbw_gbps": 0.45,
-                "wrong": 0
-              },
-              {
-                "size_bytes": 65536,
-                "size": "64K",
-                "time_us": 49.86,
-                "algbw_gbps": 1.31,
-                "busbw_gbps": 1.23,
-                "wrong": 0
-              },
-              {
-                "size_bytes": 131072,
-                "size": "128K",
-                "time_us": 52.89,
-                "algbw_gbps": 2.48,
-                "busbw_gbps": 2.32,
-                "wrong": 0
-              },
-              {
-                "size_bytes": 262144,
-                "size": "256K",
-                "time_us": 3861.98,
-                "algbw_gbps": 0.07,
-                "busbw_gbps": 0.06,
-                "wrong": 0
-              },
-              {
-                "size_bytes": 524288,
-                "size": "512K",
-                "time_us": 83.38,
-                "algbw_gbps": 6.29,
-                "busbw_gbps": 5.89,
-                "wrong": 0
-              },
-              {
-                "size_bytes": 1048576,
-                "size": "1M",
-                "time_us": 182.32,
-                "algbw_gbps": 5.75,
-                "busbw_gbps": 5.39,
-                "wrong": 0
-              },
-              {
-                "size_bytes": 2097152,
-                "size": "2M",
-                "time_us": 227.67,
-                "algbw_gbps": 9.21,
-                "busbw_gbps": 8.64,
-                "wrong": 0
-              },
-              {
-                "size_bytes": 4194304,
-                "size": "4M",
-                "time_us": 6482.39,
-                "algbw_gbps": 0.65,
-                "busbw_gbps": 0.61,
-                "wrong": 0
-              },
-              {
-                "size_bytes": 8388608,
-                "size": "8M",
-                "time_us": 10348.9,
-                "algbw_gbps": 0.81,
-                "busbw_gbps": 0.76,
-                "wrong": 0
-              },
-              {
-                "size_bytes": 16777216,
-                "size": "16M",
-                "time_us": 18616.5,
-                "algbw_gbps": 0.9,
-                "busbw_gbps": 0.84,
-                "wrong": 0
-              },
-              {
-                "size_bytes": 33554432,
-                "size": "32M",
-                "time_us": 17170.7,
-                "algbw_gbps": 1.95,
-                "busbw_gbps": 1.83,
-                "wrong": 0
-              },
-              {
-                "size_bytes": 67108864,
-                "size": "64M",
-                "time_us": 35735.6,
-                "algbw_gbps": 1.88,
-                "busbw_gbps": 1.76,
-                "wrong": 0
-              },
-              {
-                "size_bytes": 134217728,
-                "size": "128M",
-                "time_us": 69388.5,
-                "algbw_gbps": 1.93,
-                "busbw_gbps": 1.81,
-                "wrong": 0
-              },
-              {
-                "size_bytes": 268435456,
-                "size": "256M",
-                "time_us": 96873.9,
-                "algbw_gbps": 2.77,
-                "busbw_gbps": 2.6,
-                "wrong": 0
-              }
-            ],
-            "stderr_tail": "",
-            "stdout_tail": "56    6.85    6.42    N/A\n     1048576         16384     float    none      -1   182.32    5.75    5.39       0   169.19    6.20    5.81    N/A\n     2097152         32768     float    none      -1   227.67    9.21    8.64       0  3664.15    0.57    0.54    N/A\n     4194304         65536     float    none      -1  6482.39    0.65    0.61       0   553.24    7.58    7.11    N/A\n     8388608        131072     float    none      -1  10348.9    0.81    0.76       0   803.01   10.45    9.79    N/A\n    16777216        262144     float    none      -1  18616.5    0.90    0.84       0  4237.22    3.96    3.71    N/A\n    33554432        524288     float    none      -1  17170.7    1.95    1.83       0  20849.4    1.61    1.51    N/A\n    67108864       1048576     float    none      -1  35735.6    1.88    1.76       0  34524.7    1.94    1.82    N/A\n   134217728       2097152     float    none      -1  69388.5    1.93    1.81       0  63535.3    2.11    1.98    N/A\n   268435456       4194304     float    none      -1  96873.9    2.77    2.60       0   100742    2.66    2.50    N/A\n# Out of bounds values : 0 OK\n# Avg bus bandwidth    : 2.19061 \n#\n# Collective test concluded: alltoall_perf\n#\n\n",
-            "started_at": "2026-05-23T04:59:54.886310",
-            "finished_at": "2026-05-23T05:00:28.796555"
-          }
-        ]
-      }
-    },
-    "timestamp": "2026-05-23T05:00:28.796580"
-  },
-  "timestamp": "2026-05-23T05:00:28.807561",
-  "hostname": "aikubeworker0012"
-}
\ No newline at end of file
diff --git a/reports_multinode_nccl_smoke_256m_aikubeworker0012.md b/reports_multinode_nccl_smoke_256m_aikubeworker0012.md
deleted file mode 100644
index 57fea2a..0000000
--- a/reports_multinode_nccl_smoke_256m_aikubeworker0012.md
+++ /dev/null
@@ -1,50 +0,0 @@
-# GPU Test Report
-
-- **Date:** 2026-05-23T05:00:28.807561
-- **Host:** aikubeworker0012
-
-## Overall Acceptance Verdict
-
-**Result: FAIL**
-
-Missing required evidence:
-- GPU Info
-- Health Check
-- Memory Bandwidth
-- Compute Throughput
-- NVLink/NVSwitch
-- NCCL
-- Stress Test
-- RDMA
-- DCGM
-- Training
-
-## Summary
-
-| Test | Result |
-|------|--------|
-| Multi-node NCCL | FAIL |
-
-## Multi-node NCCL / Cross Leaf
-
-Source: nccl-tests-mpirun | Mode: sweep
-
-- **Hosts:** nccl-gpu-1(172.72.8.12), nccl-gpu-2(172.72.8.16)
-- **Preflight:** PASS (1 warnings)
-
-### Multi-node NCCL allreduce
-
-| Topology | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
-|----------|-------------|-----------|------------|-----------|--------|
-| 2 nodes x 8 GPUs | 39.32 GB/s | 4M | 9.10 GB/s | >= 100 GB/s | FAIL |
-
-### Multi-node NCCL alltoall
-
-| Topology | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
-|----------|-------------|-----------|------------|-----------|--------|
-| 2 nodes x 8 GPUs | 8.64 GB/s | 2M | 2.19 GB/s | >= 20 GB/s | FAIL |
-
-**Overall: FAIL**
-
----
-*Generated by GPU Test Suite v0.2.0*
\ No newline at end of file
diff --git a/reports_multinode_nccl_sweep_2x8_nccl227.md b/reports_multinode_nccl_sweep_2x8_nccl227.md
deleted file mode 100644
index 701492b..0000000
--- a/reports_multinode_nccl_sweep_2x8_nccl227.md
+++ /dev/null
@@ -1,66 +0,0 @@
-# GPU Test Report
-
-- **Date:** 2026-05-23T07:54:48.990378
-- **Host:** aikubeworker0012
-
-## Overall Acceptance Verdict
-
-**Result: FAIL**
-
-Missing required evidence:
-- GPU Info
-- Health Check
-- Memory Bandwidth
-- Compute Throughput
-- NVLink/NVSwitch
-- NCCL
-- Stress Test
-- RDMA
-- DCGM
-- Training
-
-## Summary
-
-| Test | Result |
-|------|--------|
-| Multi-node NCCL | FAIL |
-
-## Multi-node NCCL / Cross Leaf
-
-Source: nccl-tests-mpirun | Mode: sweep-nccl-2.27.7
-
-- **Hosts:** nccl-gpu-1(172.72.8.12), nccl-gpu-2(172.72.8.16)
-- **Preflight:** PASS
-
-### Multi-node NCCL allreduce
-
-| Topology | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
-|----------|-------------|-----------|------------|-----------|--------|
-| 2 nodes x 8 GPUs NCCL 2.27.7 sweep | 237.26 GB/s | 4G | 150.62 GB/s | >= 480 GB/s | FAIL |
-
-| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
-|----------|--------------|-----------------|------------------|-------------------|
-| 2 nodes x 8 GPUs NCCL 2.27.7 sweep | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
-
-| Topology | Return Code | Error / Output Tail |
-|----------|-------------|---------------------|
-| 2 nodes x 8 GPUs NCCL 2.27.7 sweep | 0 |  aikubeworker0012:2145024:2145189 [0] NCCL INFO comm 0x561f7dc1f780 rank 0 nranks 16 cudaDev 0 busId 18000 - Destroy COMPLETE # Out of bounds values : 0 OK # Avg bus bandwidth    : 150.624  # # Collective test concluded: all_reduce_perf #   |
-
-### Multi-node NCCL alltoall
-
-| Topology | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
-|----------|-------------|-----------|------------|-----------|--------|
-| 2 nodes x 8 GPUs NCCL 2.27.7 sweep | 28.78 GB/s | 1G | 23.57 GB/s | >= 75 GB/s | FAIL |
-
-| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
-|----------|--------------|-----------------|------------------|-------------------|
-| 2 nodes x 8 GPUs NCCL 2.27.7 sweep | IB | ENABLED | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | - |
-
-| Topology | Return Code | Error / Output Tail |
-|----------|-------------|---------------------|
-| 2 nodes x 8 GPUs NCCL 2.27.7 sweep | 0 | r0012:2145213:2145384 [7] NCCL INFO comm 0x558d54228110 rank 7 nranks 16 cudaDev 7 busId db000 - Destroy COMPLETE aikubeworker0016:1014703:1015544 [0] NCCL INFO comm 0x55ed6d99d8e0 rank 8 nranks 16 cudaDev 0 busId 18000 - Destroy COMPLETE   |
-
-**Overall: FAIL**
-
----
-*Generated by GPU Test Suite v0.2.0*
\ No newline at end of file
diff --git a/reports_nvbandwidth_aikubeworker0012.json b/reports_nvbandwidth_aikubeworker0012.json
deleted file mode 100644
index 05a0587..0000000
--- a/reports_nvbandwidth_aikubeworker0012.json
+++ /dev/null
@@ -1,70 +0,0 @@
-{
-  "benchmark": {
-    "memory": {
-      "source": "nvbandwidth",
-      "h2d_bandwidth_gbps": 55.5,
-      "d2h_bandwidth_gbps": 54.8,
-      "d2d_bandwidth_gbps": 0.0,
-      "h2d_peak_gbps": 64,
-      "d2h_peak_gbps": 64,
-      "d2d_peak_gbps": 450.0,
-      "h2d_efficiency_pct": 86.7,
-      "d2h_efficiency_pct": 85.6,
-      "d2d_efficiency_pct": null,
-      "peak_bandwidth_gbps": 3400,
-      "efficiency_pct": null,
-      "results_by_test": {
-        "h2d": 55.5,
-        "d2h": 54.8,
-        "d2d_write": 0.0,
-        "d2d_read": 0.0,
-        "d2d_bidir": 0.0
-      },
-      "per_gpu": []
-    },
-    "compute": {
-      "per_dtype_tflops": {
-        "fp32": 52.2,
-        "tf32": 360.7,
-        "fp16": 680.0,
-        "bf16": 707.6,
-        "fp8": 1142.4
-      },
-      "peak_tflops": {
-        "fp32": 67,
-        "tf32": 495,
-        "fp16": 990,
-        "bf16": 990,
-        "fp8": 1979
-      },
-      "efficiency_pct": {
-        "fp32": 77.9,
-        "tf32": 72.9,
-        "fp16": 68.7,
-        "bf16": 71.5,
-        "fp8": 57.7
-      },
-      "pass_thresholds_tflops": {
-        "fp32": 54,
-        "tf32": 444,
-        "fp16": 734,
-        "bf16": 745,
-        "fp8": 1400
-      },
-      "per_gpu": [
-        {
-          "index": 0,
-          "fp32": 52.2,
-          "tf32": 360.7,
-          "fp16": 680.0,
-          "bf16": 707.6,
-          "fp8": 1142.4
-        }
-      ],
-      "matrix_size": 8192,
-      "warmup": 50,
-      "iterations": 500
-    }
-  },
-  "timestamp": "2026-05-22T15:35:16.675924"
-}
\ No newline at end of file
diff --git a/reports_nvbandwidth_aikubeworker0012.md b/reports_nvbandwidth_aikubeworker0012.md
deleted file mode 100644
index bf571ab..0000000
--- a/reports_nvbandwidth_aikubeworker0012.md
+++ /dev/null
@@ -1,38 +0,0 @@
-# GPU Test Report
-
-- **Date:** 2026-05-22 15:37:12
-- **Host:** aikubeworker0012
-
-## Summary
-
-| Test | Result |
-|------|--------|
-| Memory Bandwidth | FAIL (0.0%) |
-| Compute Throughput | FAIL (worst TF32 361 vs >= 444) |
-
-## Memory Bandwidth
-
-Source: nvbandwidth
-
-| Metric | Value | Peak | Efficiency |
-|--------|-------|------|------------|
-| H2D (PCIe) | 55.5 GB/s | 64 GB/s | 86.7% |
-| D2H (PCIe) | 54.8 GB/s | 64 GB/s | 85.6% |
-| D2D (NVLink) | 0.0 GB/s | 450 GB/s | 0.0% |
-
-**Verdict: FAIL** (D2D efficiency 0.0%)
-
-## Compute Throughput
-
-| DType | Achieved (TFLOPS) | Peak | Threshold | Status |
-|-------|-------------------|------|------------|--------|
-| FP32 | 52.2 | 67 | >= 54 | WARN |
-| TF32 | 360.7 | 495 | >= 444 | FAIL |
-| FP16 | 680.0 | 990 | >= 734 | WARN |
-| BF16 | 707.6 | 990 | >= 745 | WARN |
-| FP8 | 1142.4 | 1979 | >= 1400 | FAIL |
-
-**Verdict: FAIL** (absolute TFLOPS thresholds; worst efficiency 57.7%)
-
----
-*Generated by GPU Test Suite v0.2.0*
\ No newline at end of file
diff --git a/reports_nvbandwidth_aikubeworker0016.json b/reports_nvbandwidth_aikubeworker0016.json
deleted file mode 100644
index 34ac61c..0000000
--- a/reports_nvbandwidth_aikubeworker0016.json
+++ /dev/null
@@ -1,70 +0,0 @@
-{
-  "benchmark": {
-    "memory": {
-      "source": "nvbandwidth",
-      "h2d_bandwidth_gbps": 55.5,
-      "d2h_bandwidth_gbps": 55.0,
-      "d2d_bandwidth_gbps": 0.0,
-      "h2d_peak_gbps": 64,
-      "d2h_peak_gbps": 64,
-      "d2d_peak_gbps": 450.0,
-      "h2d_efficiency_pct": 86.7,
-      "d2h_efficiency_pct": 85.9,
-      "d2d_efficiency_pct": null,
-      "peak_bandwidth_gbps": 3400,
-      "efficiency_pct": null,
-      "results_by_test": {
-        "h2d": 55.5,
-        "d2h": 55.0,
-        "d2d_write": 0.0,
-        "d2d_read": 0.0,
-        "d2d_bidir": 0.0
-      },
-      "per_gpu": []
-    },
-    "compute": {
-      "per_dtype_tflops": {
-        "fp32": 52.2,
-        "tf32": 357.5,
-        "fp16": 665.3,
-        "bf16": 697.1,
-        "fp8": 1138.8
-      },
-      "peak_tflops": {
-        "fp32": 67,
-        "tf32": 495,
-        "fp16": 990,
-        "bf16": 990,
-        "fp8": 1979
-      },
-      "efficiency_pct": {
-        "fp32": 77.9,
-        "tf32": 72.2,
-        "fp16": 67.2,
-        "bf16": 70.4,
-        "fp8": 57.5
-      },
-      "pass_thresholds_tflops": {
-        "fp32": 54,
-        "tf32": 444,
-        "fp16": 734,
-        "bf16": 745,
-        "fp8": 1400
-      },
-      "per_gpu": [
-        {
-          "index": 0,
-          "fp32": 52.2,
-          "tf32": 357.5,
-          "fp16": 665.3,
-          "bf16": 697.1,
-          "fp8": 1138.8
-        }
-      ],
-      "matrix_size": 8192,
-      "warmup": 50,
-      "iterations": 500
-    }
-  },
-  "timestamp": "2026-05-22T15:35:19.219299"
-}
\ No newline at end of file
diff --git a/reports_nvbandwidth_aikubeworker0016.md b/reports_nvbandwidth_aikubeworker0016.md
deleted file mode 100644
index 01320cf..0000000
--- a/reports_nvbandwidth_aikubeworker0016.md
+++ /dev/null
@@ -1,38 +0,0 @@
-# GPU Test Report
-
-- **Date:** 2026-05-22 15:37:18
-- **Host:** aikubeworker0016
-
-## Summary
-
-| Test | Result |
-|------|--------|
-| Memory Bandwidth | FAIL (0.0%) |
-| Compute Throughput | FAIL (worst TF32 358 vs >= 444) |
-
-## Memory Bandwidth
-
-Source: nvbandwidth
-
-| Metric | Value | Peak | Efficiency |
-|--------|-------|------|------------|
-| H2D (PCIe) | 55.5 GB/s | 64 GB/s | 86.7% |
-| D2H (PCIe) | 55.0 GB/s | 64 GB/s | 85.9% |
-| D2D (NVLink) | 0.0 GB/s | 450 GB/s | 0.0% |
-
-**Verdict: FAIL** (D2D efficiency 0.0%)
-
-## Compute Throughput
-
-| DType | Achieved (TFLOPS) | Peak | Threshold | Status |
-|-------|-------------------|------|------------|--------|
-| FP32 | 52.2 | 67 | >= 54 | WARN |
-| TF32 | 357.5 | 495 | >= 444 | FAIL |
-| FP16 | 665.3 | 990 | >= 734 | WARN |
-| BF16 | 697.1 | 990 | >= 745 | WARN |
-| FP8 | 1138.8 | 1979 | >= 1400 | FAIL |
-
-**Verdict: FAIL** (absolute TFLOPS thresholds; worst efficiency 57.5%)
-
----
-*Generated by GPU Test Suite v0.2.0*
\ No newline at end of file
diff --git a/reports_rdma_aikubeworker0012.json b/reports_rdma_aikubeworker0012.json
deleted file mode 100644
index 93d7644..0000000
--- a/reports_rdma_aikubeworker0012.json
+++ /dev/null
@@ -1,157 +0,0 @@
-{
-  "rdma": {
-    "passed": false,
-    "devices": [
-      {
-        "name": "mlx5_0",
-        "ports": [
-          {
-            "port": "1",
-            "rate": "400 Gb/sec (4X NDR)",
-            "state": "4: ACTIVE",
-            "phys_state": "5: LinkUp",
-            "gid": "fe80:0000:0000:0000:58a2:e103:0093:3898"
-          }
-        ]
-      },
-      {
-        "name": "mlx5_1",
-        "ports": [
-          {
-            "port": "1",
-            "rate": "400 Gb/sec (4X NDR)",
-            "state": "4: ACTIVE",
-            "phys_state": "5: LinkUp",
-            "gid": "fe80:0000:0000:0000:58a2:e103:0093:3db0"
-          }
-        ]
-      },
-      {
-        "name": "mlx5_2",
-        "ports": [
-          {
-            "port": "1",
-            "rate": "25 Gb/sec (1X EDR)",
-            "state": "4: ACTIVE",
-            "phys_state": "5: LinkUp",
-            "gid": "fe80:0000:0000:0000:5c3f:b8ff:fe5e:7832"
-          }
-        ]
-      },
-      {
-        "name": "mlx5_3",
-        "ports": [
-          {
-            "port": "1",
-            "rate": "25 Gb/sec (1X EDR)",
-            "state": "1: DOWN",
-            "phys_state": "3: Disabled",
-            "gid": "fe80:0000:0000:0000:5e25:73ff:fe4e:eac1"
-          }
-        ]
-      },
-      {
-        "name": "mlx5_4",
-        "ports": [
-          {
-            "port": "1",
-            "rate": "100 Gb/sec (2X HDR)",
-            "state": "4: ACTIVE",
-            "phys_state": "5: LinkUp",
-            "gid": "fe80:0000:0000:0000:9c63:c003:005f:63cc"
-          }
-        ]
-      },
-      {
-        "name": "mlx5_5",
-        "ports": [
-          {
-            "port": "1",
-            "rate": "100 Gb/sec (2X HDR)",
-            "state": "4: ACTIVE",
-            "phys_state": "5: LinkUp",
-            "gid": "fe80:0000:0000:0000:9c63:c003:005f:63cd"
-          }
-        ]
-      },
-      {
-        "name": "mlx5_6",
-        "ports": [
-          {
-            "port": "1",
-            "rate": "400 Gb/sec (4X NDR)",
-            "state": "4: ACTIVE",
-            "phys_state": "5: LinkUp",
-            "gid": "fe80:0000:0000:0000:58a2:e103:0093:3bf4"
-          }
-        ]
-      },
-      {
-        "name": "mlx5_7",
-        "ports": [
-          {
-            "port": "1",
-            "rate": "400 Gb/sec (4X NDR)",
-            "state": "4: ACTIVE",
-            "phys_state": "5: LinkUp",
-            "gid": "fe80:0000:0000:0000:58a2:e103:0093:3e28"
-          }
-        ]
-      },
-      {
-        "name": "mlx5_8",
-        "ports": [
-          {
-            "port": "1",
-            "rate": "25 Gb/sec (1X EDR)",
-            "state": "4: ACTIVE",
-            "phys_state": "5: LinkUp",
-            "gid": "fe80:0000:0000:0000:5c3f:b8ff:fe5e:7832"
-          }
-        ]
-      },
-      {
-        "name": "mlx5_9",
-        "ports": [
-          {
-            "port": "1",
-            "rate": "25 Gb/sec (1X EDR)",
-            "state": "1: DOWN",
-            "phys_state": "3: Disabled",
-            "gid": "fe80:0000:0000:0000:5e25:73ff:fe63:1717"
-          }
-        ]
-      }
-    ],
-    "bandwidth_tests": [
-      {
-        "test": "ib_write_bw",
-        "status": "WARN",
-        "bandwidth_gbps": 0.13,
-        "min_required_gbps": 50
-      },
-      {
-        "test": "ib_read_bw",
-        "status": "WARN",
-        "bandwidth_gbps": 0.13,
-        "min_required_gbps": 50
-      }
-    ],
-    "latency_tests": [
-      {
-        "test": "ib_write_lat",
-        "status": "PASS",
-        "latency_us": 4.53,
-        "max_allowed_us": 10
-      },
-      {
-        "test": "ib_read_lat",
-        "status": "WARN",
-        "latency_us": 16.0,
-        "max_allowed_us": 10
-      }
-    ],
-    "timestamp": "2026-05-22T15:41:20.534115"
-  },
-  "timestamp": "2026-05-22T15:41:20.544589"
-}
\ No newline at end of file
diff --git a/reports_rdma_aikubeworker0016.json b/reports_rdma_aikubeworker0016.json
deleted file mode 100644
index 5e98f8a..0000000
--- a/reports_rdma_aikubeworker0016.json
+++ /dev/null
@@ -1,157 +0,0 @@
-{
-  "rdma": {
-    "passed": false,
-    "devices": [
-      {
-        "name": "mlx5_0",
-        "ports": [
-          {
-            "port": "1",
-            "rate": "400 Gb/sec (4X NDR)",
-            "state": "4: ACTIVE",
-            "phys_state": "5: LinkUp",
-            "gid": "fe80:0000:0000:0000:58a2:e103:0088:81e0"
-          }
-        ]
-      },
-      {
-        "name": "mlx5_1",
-        "ports": [
-          {
-            "port": "1",
-            "rate": "400 Gb/sec (4X NDR)",
-            "state": "4: ACTIVE",
-            "phys_state": "5: LinkUp",
-            "gid": "fe80:0000:0000:0000:9c63:c003:0054:e00a"
-          }
-        ]
-      },
-      {
-        "name": "mlx5_2",
-        "ports": [
-          {
-            "port": "1",
-            "rate": "25 Gb/sec (1X EDR)",
-            "state": "4: ACTIVE",
-            "phys_state": "5: LinkUp",
-            "gid": "fe80:0000:0000:0000:a02d:75ff:feae:2bcf"
-          }
-        ]
-      },
-      {
-        "name": "mlx5_3",
-        "ports": [
-          {
-            "port": "1",
-            "rate": "25 Gb/sec (1X EDR)",
-            "state": "1: DOWN",
-            "phys_state": "3: Disabled",
-            "gid": "fe80:0000:0000:0000:c670:bdff:fefd:5bd9"
-          }
-        ]
-      },
-      {
-        "name": "mlx5_4",
-        "ports": [
-          {
-            "port": "1",
-            "rate": "100 Gb/sec (2X HDR)",
-            "state": "4: ACTIVE",
-            "phys_state": "5: LinkUp",
-            "gid": "fe80:0000:0000:0000:9c63:c003:005f:58ec"
-          }
-        ]
-      },
-      {
-        "name": "mlx5_5",
-        "ports": [
-          {
-            "port": "1",
-            "rate": "100 Gb/sec (2X HDR)",
-            "state": "4: ACTIVE",
-            "phys_state": "5: LinkUp",
-            "gid": "fe80:0000:0000:0000:9c63:c003:005f:58ed"
-          }
-        ]
-      },
-      {
-        "name": "mlx5_6",
-        "ports": [
-          {
-            "port": "1",
-            "rate": "400 Gb/sec (4X NDR)",
-            "state": "4: ACTIVE",
-            "phys_state": "5: LinkUp",
-            "gid": "fe80:0000:0000:0000:9c63:c003:0055:0e56"
-          }
-        ]
-      },
-      {
-        "name": "mlx5_7",
-        "ports": [
-          {
-            "port": "1",
-            "rate": "400 Gb/sec (4X NDR)",
-            "state": "4: ACTIVE",
-            "phys_state": "5: LinkUp",
-            "gid": "fe80:0000:0000:0000:a088:c203:00f0:286c"
-          }
-        ]
-      },
-      {
-        "name": "mlx5_8",
-        "ports": [
-          {
-            "port": "1",
-            "rate": "25 Gb/sec (1X EDR)",
-            "state": "4: ACTIVE",
-            "phys_state": "5: LinkUp",
-            "gid": "fe80:0000:0000:0000:a02d:75ff:feae:2bcf"
-          }
-        ]
-      },
-      {
-        "name": "mlx5_9",
-        "ports": [
-          {
-            "port": "1",
-            "rate": "25 Gb/sec (1X EDR)",
-            "state": "1: DOWN",
-            "phys_state": "3: Disabled",
-            "gid": "fe80:0000:0000:0000:c670:bdff:fefd:569d"
-          }
-        ]
-      }
-    ],
-    "bandwidth_tests": [
-      {
-        "test": "ib_write_bw",
-        "status": "WARN",
-        "bandwidth_gbps": 0.13,
-        "min_required_gbps": 50
-      },
-      {
-        "test": "ib_read_bw",
-        "status": "WARN",
-        "bandwidth_gbps": 0.13,
-        "min_required_gbps": 50
-      }
-    ],
-    "latency_tests": [
-      {
-        "test": "ib_write_lat",
-        "status": "PASS",
-        "latency_us": 4.22,
-        "max_allowed_us": 10
-      },
-      {
-        "test": "ib_read_lat",
-        "status": "WARN",
-        "latency_us": 16.0,
-        "max_allowed_us": 10
-      }
-    ],
-    "timestamp": "2026-05-22T15:41:07.851101"
-  },
-  "timestamp": "2026-05-22T15:41:07.861558"
-}
\ No newline at end of file
diff --git a/reports_rdma_counter_aikubeworker0012_20260522_194808.md b/reports_rdma_counter_aikubeworker0012_20260522_194808.md
deleted file mode 100644
index f254bef..0000000
--- a/reports_rdma_counter_aikubeworker0012_20260522_194808.md
+++ /dev/null
@@ -1,62 +0,0 @@
-# GPU Test Report
-
-- **Date:** 2026-05-22T19:48:26.622179
-- **Host:** aikubeworker0012
-
-## Overall Acceptance Verdict
-
-**Result: FAIL**
-
-Failed or unverified items:
-- RDMA: FAIL
-
-Missing required evidence:
-- GPU Info
-- Health Check
-- Memory Bandwidth
-- Compute Throughput
-- NVLink/NVSwitch
-- NCCL
-- Stress Test
-- DCGM
-- Training
-
-## Summary
-
-| Test | Result |
-|------|--------|
-| RDMA | FAIL |
-
-## RDMA/InfiniBand
-
-### RDMA Port Checks
-
-| Device | Port | State | Rate | Required | Status |
-|--------|------|-------|------|----------|--------|
-| mlx5_0 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS |
-| mlx5_1 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS |
-| mlx5_4 | 1 | 4: ACTIVE | 100 Gb/sec (2X HDR) | >= 400Gbps ACTIVE | FAIL |
-| mlx5_5 | 1 | 4: ACTIVE | 100 Gb/sec (2X HDR) | >= 400Gbps ACTIVE | FAIL |
-| mlx5_6 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS |
-| mlx5_7 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS |
-
-| Test | Value | Threshold | Status |
-|------|-------|-----------|--------|
-| ib_write_bw | 49.3 GB/s | >= 47 GB/s | PASS |
-| ib_read_bw | 39.2 GB/s | >= 47 GB/s | FAIL |
-| ib_write_lat | 4.49 us | <= 2 us | FAIL |
-| ib_read_lat | 16.00 us | <= 3.5 us | FAIL |
-| ibping | target=0x58 count=5 | 0% packet loss | PASS |
-
-- **PFC/ECN/CNP/congestion counters checked:** 146
-- **PFC/ECN/CNP/congestion non-zero:** no
-- **Failure reasons:**
-  - mlx5_4 port 1 state/rate failed (4: ACTIVE, 100 Gb/sec (2X HDR); required >= 400.0Gbps ACTIVE)
-  - mlx5_5 port 1 state/rate failed (4: ACTIVE, 100 Gb/sec (2X HDR); required >= 400.0Gbps ACTIVE)
-  - ib_read_bw bandwidth 39.21GB/s < 47GB/s
-  - ib_write_lat latency 4.49us > 2.0us
-  - ib_read_lat latency 16.0us > 3.5us
-**Overall: FAIL**
-
----
-*Generated by GPU Test Suite v0.2.0*
\ No newline at end of file
diff --git a/reports_rdma_counter_aikubeworker0016_20260522_194828.md b/reports_rdma_counter_aikubeworker0016_20260522_194828.md
deleted file mode 100644
index a72f917..0000000
--- a/reports_rdma_counter_aikubeworker0016_20260522_194828.md
+++ /dev/null
@@ -1,62 +0,0 @@
-# GPU Test Report
-
-- **Date:** 2026-05-22T19:48:45.899570
-- **Host:** aikubeworker0016
-
-## Overall Acceptance Verdict
-
-**Result: FAIL**
-
-Failed or unverified items:
-- RDMA: FAIL
-
-Missing required evidence:
-- GPU Info
-- Health Check
-- Memory Bandwidth
-- Compute Throughput
-- NVLink/NVSwitch
-- NCCL
-- Stress Test
-- DCGM
-- Training
-
-## Summary
-
-| Test | Result |
-|------|--------|
-| RDMA | FAIL |
-
-## RDMA/InfiniBand
-
-### RDMA Port Checks
-
-| Device | Port | State | Rate | Required | Status |
-|--------|------|-------|------|----------|--------|
-| mlx5_0 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS |
-| mlx5_1 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS |
-| mlx5_4 | 1 | 4: ACTIVE | 100 Gb/sec (2X HDR) | >= 400Gbps ACTIVE | FAIL |
-| mlx5_5 | 1 | 4: ACTIVE | 100 Gb/sec (2X HDR) | >= 400Gbps ACTIVE | FAIL |
-| mlx5_6 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS |
-| mlx5_7 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS |
-
-| Test | Value | Threshold | Status |
-|------|-------|-----------|--------|
-| ib_write_bw | 48.1 GB/s | >= 47 GB/s | PASS |
-| ib_read_bw | 40.3 GB/s | >= 47 GB/s | FAIL |
-| ib_write_lat | 4.28 us | <= 2 us | FAIL |
-| ib_read_lat | 16.00 us | <= 3.5 us | FAIL |
-| ibping | target=0x4b count=5 | 0% packet loss | PASS |
-
-- **PFC/ECN/CNP/congestion counters checked:** 146
-- **PFC/ECN/CNP/congestion non-zero:** no
-- **Failure reasons:**
-  - mlx5_4 port 1 state/rate failed (4: ACTIVE, 100 Gb/sec (2X HDR); required >= 400.0Gbps ACTIVE)
-  - mlx5_5 port 1 state/rate failed (4: ACTIVE, 100 Gb/sec (2X HDR); required >= 400.0Gbps ACTIVE)
-  - ib_read_bw bandwidth 40.3GB/s < 47GB/s
-  - ib_write_lat latency 4.28us > 2.0us
-  - ib_read_lat latency 16.0us > 3.5us
-**Overall: FAIL**
-
----
-*Generated by GPU Test Suite v0.2.0*
\ No newline at end of file
diff --git a/reports_rdma_cross_node_mlx5_0_20260523.md b/reports_rdma_cross_node_mlx5_0_20260523.md
deleted file mode 100644
index dfdfb8a..0000000
--- a/reports_rdma_cross_node_mlx5_0_20260523.md
+++ /dev/null
@@ -1,50 +0,0 @@
-# RDMA Cross-node Evidence Report
-
-- **Date:** 2026-05-23 Asia/Shanghai
-- **Scope:** `aikubeworker0012` <-> `aikubeworker0016`, single rail `mlx5_0`, port 1
-- **Client/server bootstrap IPs:** `172.72.8.12` and `172.72.8.16`
-- **Bandwidth message size:** 4MB
-- **Latency message size:** 8B
-- **Iterations:** 1000
-
-## Port Evidence
-
-| Host | Device | State | Rate | Link | LID |
-|---|---|---|---|---|---|
-| aikubeworker0012 | mlx5_0/1 | ACTIVE | 400 Gb/sec (4X NDR) | InfiniBand | 0x58 |
-| aikubeworker0016 | mlx5_0/1 | ACTIVE | 400 Gb/sec (4X NDR) | InfiniBand | 0x4b |
-
-## Cross-node Perftest Results
-
-| Direction | Test | Value | PDF Threshold | Status |
-|---|---|---:|---:|---|
-| 0016 -> 0012 | ib_write_bw | 49.35 GB/s | >= 47 GB/s | PASS |
-| 0016 -> 0012 | ib_read_bw | 44.36 GB/s | >= 47 GB/s | FAIL |
-| 0016 -> 0012 | ib_write_lat avg | 2.17 us | <= 2.0 us | FAIL |
-| 0016 -> 0012 | ib_read_lat avg | 4.05 us | <= 3.5 us | FAIL |
-| 0012 -> 0016 | ib_write_bw | 48.38 GB/s | >= 47 GB/s | PASS |
-| 0012 -> 0016 | ib_read_bw | 44.37 GB/s | >= 47 GB/s | FAIL |
-| 0012 -> 0016 | ib_write_lat avg | 2.13 us | <= 2.0 us | FAIL |
-| 0012 -> 0016 | ib_read_lat avg | 4.08 us | <= 3.5 us | FAIL |
-
-## Bidirectional ibping
-
-| Direction | Target LID | Result |
-|---|---|---|
-| 0016 -> 0012 | 0x58 | 5 transmitted, 5 received, 0% packet loss; avg 0.005 ms |
-| 0012 -> 0016 | 0x4b | 5 transmitted, 5 received, 0% packet loss; avg 0.005 ms |
-
-## Fabric Counters
-
-| Host | PFC/ECN/CNP/congestion Counters Checked | Non-zero Counters | Status |
-|---|---:|---:|---|
-| aikubeworker0012 | 146 | 0 | PASS |
-| aikubeworker0016 | 146 | 0 | PASS |
-
-## Verdict
-
-**RDMA cross-node verdict: FAIL**
-
-Reason: bidirectional connectivity is good, PFC/ECN/CNP/congestion counters are clean, and write bandwidth passes. However read bandwidth is below 47 GB/s in both directions, write latency is slightly above 2.0 us in both directions, and read latency is above 3.5 us in both directions.
-
-Note: `modules/rdma_test.py` was corrected on 2026-05-23 to parse `ib_write_lat` / `ib_read_lat` `t_avg[usec]` rather than the 99.9 percentile column. Older reports that show `read_lat` around 16 us are therefore not the current parser output.
diff --git a/reports_rdma_single_node_summary.md b/reports_rdma_single_node_summary.md
deleted file mode 100644
index c1c95de..0000000
--- a/reports_rdma_single_node_summary.md
+++ /dev/null
@@ -1,73 +0,0 @@
-# Single-node RDMA/IB Report
-
-Generated: 2026-05-22 23:41 Asia/Shanghai
-
-Scope: project CLI `gpu_tester.py --test rdma --report --format json`, run separately on each host.
-
-Important note: the current repository RDMA test is single-node only. In `modules/rdma_test.py`, the perftest client connects to `localhost`, so this report validates local IB device discovery and local perftest behavior. It does not validate cross-node RDMA bandwidth between `aikubeworker0012` and `aikubeworker0016`.
-
-## Summary
-
-| Host | Devices Found | Active 400G Ports | Active 100G Ports | Down Ports | Overall |
-| --- | ---: | --- | --- | --- | --- |
-| aikubeworker0012 / 172.72.8.12 | 10 | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | mlx5_4, mlx5_5 | mlx5_3, mlx5_9 | WARN |
-| aikubeworker0016 / 172.72.8.16 | 10 | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | mlx5_4, mlx5_5 | mlx5_3, mlx5_9 | WARN |
-
-## Bandwidth
-
-The bandwidth numbers below are from the repo's local `localhost` RDMA perftest path.
-
-| Host | ib_write_bw | Threshold | Status | ib_read_bw | Threshold | Status |
-| --- | ---: | ---: | --- | ---: | ---: | --- |
-| aikubeworker0012 | 0.13 GB/s | 50 GB/s | WARN | 0.13 GB/s | 50 GB/s | WARN |
-| aikubeworker0016 | 0.13 GB/s | 50 GB/s | WARN | 0.13 GB/s | 50 GB/s | WARN |
-
-## Latency
-
-| Host | ib_write_lat | Limit | Status | ib_read_lat | Limit | Status |
-| --- | ---: | ---: | --- | ---: | ---: | --- |
-| aikubeworker0012 | 4.53 us | 10 us | PASS | 16.00 us | 10 us | WARN |
-| aikubeworker0016 | 4.22 us | 10 us | PASS | 16.00 us | 10 us | WARN |
-
-## Device Inventory
-
-### aikubeworker0012
-
-| Device | Port | State | Physical State | Rate |
-| --- | --- | --- | --- | --- |
-| mlx5_0 | 1 | ACTIVE | LinkUp | 400 Gb/sec (4X NDR) |
-| mlx5_1 | 1 | ACTIVE | LinkUp | 400 Gb/sec (4X NDR) |
-| mlx5_2 | 1 | ACTIVE | LinkUp | 25 Gb/sec (1X EDR) |
-| mlx5_3 | 1 | DOWN | Disabled | 25 Gb/sec (1X EDR) |
-| mlx5_4 | 1 | ACTIVE | LinkUp | 100 Gb/sec (2X HDR) |
-| mlx5_5 | 1 | ACTIVE | LinkUp | 100 Gb/sec (2X HDR) |
-| mlx5_6 | 1 | ACTIVE | LinkUp | 400 Gb/sec (4X NDR) |
-| mlx5_7 | 1 | ACTIVE | LinkUp | 400 Gb/sec (4X NDR) |
-| mlx5_8 | 1 | ACTIVE | LinkUp | 25 Gb/sec (1X EDR) |
-| mlx5_9 | 1 | DOWN | Disabled | 25 Gb/sec (1X EDR) |
-
-### aikubeworker0016
-
-| Device | Port | State | Physical State | Rate |
-| --- | --- | --- | --- | --- |
-| mlx5_0 | 1 | ACTIVE | LinkUp | 400 Gb/sec (4X NDR) |
-| mlx5_1 | 1 | ACTIVE | LinkUp | 400 Gb/sec (4X NDR) |
-| mlx5_2 | 1 | ACTIVE | LinkUp | 25 Gb/sec (1X EDR) |
-| mlx5_3 | 1 | DOWN | Disabled | 25 Gb/sec (1X EDR) |
-| mlx5_4 | 1 | ACTIVE | LinkUp | 100 Gb/sec (2X HDR) |
-| mlx5_5 | 1 | ACTIVE | LinkUp | 100 Gb/sec (2X HDR) |
-| mlx5_6 | 1 | ACTIVE | LinkUp | 400 Gb/sec (4X NDR) |
-| mlx5_7 | 1 | ACTIVE | LinkUp | 400 Gb/sec (4X NDR) |
-| mlx5_8 | 1 | ACTIVE | LinkUp | 25 Gb/sec (1X EDR) |
-| mlx5_9 | 1 | DOWN | Disabled | 25 Gb/sec (1X EDR) |
-
-## Files
-
-Raw JSON:
-
-- `reports_rdma_aikubeworker0012.json`
-- `reports_rdma_aikubeworker0016.json`
-
-Markdown summary:
-
-- `reports_rdma_single_node_summary.md`
diff --git a/reports_single_gpu_aikubeworker0012.json b/reports_single_gpu_aikubeworker0012.json
deleted file mode 100644
index 6cc5a37..0000000
--- a/reports_single_gpu_aikubeworker0012.json
+++ /dev/null
@@ -1,292 +0,0 @@
-{
-  "timestamp": "2026-05-22T15:26:26.973586",
-  "gpu_info": {
-    "driver_version": "580.159.03",
-    "cuda_version": "13.0",
-    "gpu_count": 8,
-    "gpus": [
-      {
-        "index": 0,
-        "name": "NVIDIA H100 80GB HBM3",
-        "uuid": "GPU-7658c03c-7659-9886-041e-545c21d53e12",
-        "pci_bus_id": "00000000:18:00.0",
-        "pcie_link_gen": 5,
-        "pcie_link_width": 16,
-        "vram_total_mb": 81559,
-        "vram_used_mb": 4,
-        "vram_free_mb": 81076,
-        "power_draw": 69.72,
-        "power_limit": 700.0,
-        "clock_sm": 345,
-        "clock_mem": 2619,
-        "temperature": 25,
-        "fan_speed": 0,
-        "persistence_mode": false,
-        "compute_mode": "Default",
-        "serial_number": "1654923030411",
-        "ecc_errors_single": 0,
-        "ecc_errors_double": 0
-      },
-      {
-        "index": 1,
-        "name": "NVIDIA H100 80GB HBM3",
-        "uuid": "GPU-6392d40b-893b-9fc2-4284-a3f1d8c4d7f1",
-        "pci_bus_id": "00000000:2A:00.0",
-        "pcie_link_gen": 5,
-        "pcie_link_width": 16,
-        "vram_total_mb": 81559,
-        "vram_used_mb": 0,
-        "vram_free_mb": 81079,
-        "power_draw": 73.17,
-        "power_limit": 700.0,
-        "clock_sm": 345,
-        "clock_mem": 2619,
-        "temperature": 25,
-        "fan_speed": 0,
-        "persistence_mode": false,
-        "compute_mode": "Default",
-        "serial_number": "1654724063165",
-        "ecc_errors_single": 0,
-        "ecc_errors_double": 0
-      },
-      {
-        "index": 2,
-        "name": "NVIDIA H100 80GB HBM3",
-        "uuid": "GPU-2ae38735-10de-fb0b-fb20-9d1b5b434558",
-        "pci_bus_id": "00000000:3A:00.0",
-        "pcie_link_gen": 5,
-        "pcie_link_width": 16,
-        "vram_total_mb": 81559,
-        "vram_used_mb": 0,
-        "vram_free_mb": 81079,
-        "power_draw": 68.71,
-        "power_limit": 700.0,
-        "clock_sm": 345,
-        "clock_mem": 2619,
-        "temperature": 26,
-        "fan_speed": 0,
-        "persistence_mode": false,
-        "compute_mode": "Default",
-        "serial_number": "1654823036530",
-        "ecc_errors_single": 0,
-        "ecc_errors_double": 0
-      },
-      {
-        "index": 3,
-        "name": "NVIDIA H100 80GB HBM3",
-        "uuid": "GPU-ec62123f-0c48-6dbd-49e4-8b231b3fed0e",
-        "pci_bus_id": "00000000:5D:00.0",
-        "pcie_link_gen": 5,
-        "pcie_link_width": 16,
-        "vram_total_mb": 81559,
-        "vram_used_mb": 0,
-        "vram_free_mb": 81079,
-        "power_draw": 69.73,
-        "power_limit": 700.0,
-        "clock_sm": 345,
-        "clock_mem": 2619,
-        "temperature": 25,
-        "fan_speed": 0,
-        "persistence_mode": false,
-        "compute_mode": "Default",
-        "serial_number": "1654923021638",
-        "ecc_errors_single": 0,
-        "ecc_errors_double": 0
-      },
-      {
-        "index": 4,
-        "name": "NVIDIA H100 80GB HBM3",
-        "uuid": "GPU-b64fc270-109e-1543-fb0c-be7feecf14f1",
-        "pci_bus_id": "00000000:9A:00.0",
-        "pcie_link_gen": 5,
-        "pcie_link_width": 16,
-        "vram_total_mb": 81559,
-        "vram_used_mb": 0,
-        "vram_free_mb": 81079,
-        "power_draw": 68.84,
-        "power_limit": 700.0,
-        "clock_sm": 345,
-        "clock_mem": 2619,
-        "temperature": 24,
-        "fan_speed": 0,
-        "persistence_mode": false,
-        "compute_mode": "Default",
-        "serial_number": "1655023033179",
-        "ecc_errors_single": 0,
-        "ecc_errors_double": 0
-      },
-      {
-        "index": 5,
-        "name": "NVIDIA H100 80GB HBM3",
-        "uuid": "GPU-15ab7baf-9010-7cf3-5462-eeb09f8dbe65",
-        "pci_bus_id": "00000000:AB:00.0",
-        "pcie_link_gen": 5,
-        "pcie_link_width": 16,
-        "vram_total_mb": 81559,
-        "vram_used_mb": 0,
-        "vram_free_mb": 81079,
-        "power_draw": 69.94,
-        "power_limit": 700.0,
-        "clock_sm": 345,
-        "clock_mem": 2619,
-        "temperature": 27,
-        "fan_speed": 0,
-        "persistence_mode": false,
-        "compute_mode": "Default",
-        "serial_number": "1655023034225",
-        "ecc_errors_single": 0,
-        "ecc_errors_double": 0
-      },
-      {
-        "index": 6,
-        "name": "NVIDIA H100 80GB HBM3",
-        "uuid": "GPU-225f6f3c-6fef-d1e2-5428-d90f665fb3d3",
-        "pci_bus_id": "00000000:BA:00.0",
-        "pcie_link_gen": 5,
-        "pcie_link_width": 16,
-        "vram_total_mb": 81559,
-        "vram_used_mb": 0,
-        "vram_free_mb": 81079,
-        "power_draw": 70.46,
-        "power_limit": 700.0,
-        "clock_sm": 345,
-        "clock_mem": 2619,
-        "temperature": 25,
-        "fan_speed": 0,
-        "persistence_mode": false,
-        "compute_mode": "Default",
-        "serial_number": "1654923078278",
-        "ecc_errors_single": 0,
-        "ecc_errors_double": 0
-      },
-      {
-        "index": 7,
-        "name": "NVIDIA H100 80GB HBM3",
-        "uuid": "GPU-79aeb6a8-c00c-6edb-956f-779ef56950a3",
-        "pci_bus_id": "00000000:DB:00.0",
-        "pcie_link_gen": 5,
-        "pcie_link_width": 16,
-        "vram_total_mb": 81559,
-        "vram_used_mb": 0,
-        "vram_free_mb": 81079,
-        "power_draw": 71.76,
-        "power_limit": 700.0,
-        "clock_sm": 345,
-        "clock_mem": 2619,
-        "temperature": 24,
-        "fan_speed": 0,
-        "persistence_mode": false,
-        "compute_mode": "Default",
-        "serial_number": "1654024031464",
-        "ecc_errors_single": 0,
-        "ecc_errors_double": 0
-      }
-    ],
-    "topology": "\t\u001b[4mGPU0\tGPU1\tGPU2\tGPU3\tGPU4\tGPU5\tGPU6\tGPU7\tNIC0\tNIC1\tNIC2\tNIC3\tNIC4\tNIC5\tNIC6\tNIC7\tNIC8\tNIC9\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\u001b[0m\nGPU0\t X \tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\tPIX\tNODE\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t0-55,112-167\t0\t\tN/A\nGPU1\tNV18\t X \tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\tNODE\tPIX\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t0-55,112-167\t0\t\tN/A\nGPU2\tNV18\tNV18\t X \tNV18\tNV18\tNV18\tNV18\tNV18\tNODE\tNODE\tPIX\tPIX\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t0-55,112-167\t0\t\tN/A\nGPU3\tNV18\tNV18\tNV18\t X \tNV18\tNV18\tNV18\tNV18\tNODE\tNODE\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t0-55,112-167\t0\t\tN/A\nGPU4\tNV18\tNV18\tNV18\tNV18\t X \tNV18\tNV18\tNV18\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tPIX\tNODE\tNODE\tNODE\t56-111,168-223\t1\t\tN/A\nGPU5\tNV18\tNV18\tNV18\tNV18\tNV18\t X \tNV18\tNV18\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\tPIX\tNODE\tNODE\t56-111,168-223\t1\t\tN/A\nGPU6\tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\t X \tNV18\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tPIX\tPIX\t56-111,168-223\t1\t\tN/A\nGPU7\tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\t X \tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tNODE\tNODE\t56-111,168-223\t1\t\tN/A\nNIC0\tPIX\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t X \tNODE\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC1\tNODE\tPIX\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tNODE\t X \tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC2\tNODE\tNODE\tPIX\tNODE\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\t X \tPIX\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC3\tNODE\tNODE\tPIX\tNODE\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tPIX\t X \tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC4\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tNODE\tNODE\t X \tPIX\tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC5\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tNODE\tNODE\tPIX\t X \tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC6\tSYS\tSYS\tSYS\tSYS\tPIX\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\t X 
\tNODE\tNODE\tNODE\t\t\t\t\nNIC7\tSYS\tSYS\tSYS\tSYS\tNODE\tPIX\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\t X \tNODE\tNODE\t\t\t\t\nNIC8\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tPIX\tNODE\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\t X \tPIX\t\t\t\t\nNIC9\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tPIX\tNODE\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tPIX\t X \t\t\t\t\n\nLegend:\n\n  X    = Self\n  SYS  = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n  NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n  PHB  = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n  PXB  = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n  PIX  = Connection traversing at most a single PCIe bridge\n  NV#  = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n  NIC0: mlx5_0\n  NIC1: mlx5_1\n  NIC2: mlx5_2\n  NIC3: mlx5_3\n  NIC4: mlx5_4\n  NIC5: mlx5_5\n  NIC6: mlx5_6\n  NIC7: mlx5_7\n  NIC8: mlx5_8\n  NIC9: mlx5_9\n\n",
-    "timestamp": "2026-05-22T15:26:34.187409",
-    "detected_gpu_type": "h100",
-    "gpu_label": "H100 SXM5"
-  },
-  "memory_bench": {
-    "memory": {
-      "source": "pytorch",
-      "h2d_bandwidth_gbps": 11.8,
-      "d2h_bandwidth_gbps": 9.9,
-      "d2d_bandwidth_gbps": 829.1,
-      "peak_bandwidth_gbps": 3400,
-      "efficiency_pct": 24.4,
-      "test_sizes_mb": [
-        1,
-        4,
-        16,
-        64,
-        256,
-        1024,
-        4096
-      ],
-      "bandwidth_by_size": {
-        "1": {
-          "h2d_gbps": 3.8,
-          "d2h_gbps": 1.4,
-          "d2d_gbps": 40.6
-        },
-        "4": {
-          "h2d_gbps": 7.6,
-          "d2h_gbps": 9.9,
-          "d2d_gbps": 141.5
-        },
-        "16": {
-          "h2d_gbps": 11.0,
-          "d2h_gbps": 1.9,
-          "d2d_gbps": 450.3
-        },
-        "64": {
-          "h2d_gbps": 11.8,
-          "d2h_gbps": 1.4,
-          "d2d_gbps": 726.5
-        },
-        "256": {
-          "h2d_gbps": 9.0,
-          "d2h_gbps": 1.4,
-          "d2d_gbps": 793.8
-        },
-        "1024": {
-          "h2d_gbps": 5.5,
-          "d2h_gbps": 1.4,
-          "d2d_gbps": 821.2
-        },
-        "4096": {
-          "h2d_gbps": 5.9,
-          "d2h_gbps": 1.4,
-          "d2d_gbps": 829.1
-        }
-      },
-      "per_gpu": []
-    }
-  },
-  "compute_bench": {
-    "compute": {
-      "per_dtype_tflops": {
-        "fp32": 52.0,
-        "tf32": 362.3,
-        "fp16": 691.0,
-        "bf16": 713.0,
-        "fp8": 1148.8
-      },
-      "peak_tflops": {
-        "fp32": 67,
-        "tf32": 495,
-        "fp16": 990,
-        "bf16": 990,
-        "fp8": 1979
-      },
-      "efficiency_pct": {
-        "fp32": 77.6,
-        "tf32": 73.2,
-        "fp16": 69.8,
-        "bf16": 72.0,
-        "fp8": 58.0
-      },
-      "pass_thresholds_tflops": {
-        "fp32": 54,
-        "tf32": 444,
-        "fp16": 734,
-        "bf16": 745,
-        "fp8": 1400
-      },
-      "per_gpu": [
-        {
-          "index": 0,
-          "fp32": 52.0,
-          "tf32": 362.3,
-          "fp16": 691.0,
-          "bf16": 713.0,
-          "fp8": 1148.8
-        }
-      ],
-      "matrix_size": 8192,
-      "warmup": 50,
-      "iterations": 500
-    }
-  }
-}
\ No newline at end of file
diff --git a/reports_single_gpu_aikubeworker0012.md b/reports_single_gpu_aikubeworker0012.md
deleted file mode 100644
index 3a6c3c9..0000000
--- a/reports_single_gpu_aikubeworker0012.md
+++ /dev/null
@@ -1,54 +0,0 @@
-# GPU Test Report
-
-- **Date:** 2026-05-22 15:27:51
-- **Host:** aikubeworker0012
-- **GPU:** NVIDIA H100 80GB HBM3 x8
-- **Driver:** 580.159.03 | **CUDA:** 13.0
-
-## Summary
-
-| Test | Result |
-|------|--------|
-| GPU Info | PASS (8 GPUs detected) |
-| Memory Bandwidth | WARN (829 GB/s via PyTorch fallback) |
-| Compute Throughput | FAIL (worst TF32 362 vs >= 444) |
-
-## GPU Information
-
-| GPU | Model | VRAM | Temp | Power | SM Clock |
-|-----|-------|------|------|-------|----------|
-| 0 | NVIDIA H100 80GB HBM3 | 81559 MB | 25C | 70/700W | 345 MHz |
-| 1 | NVIDIA H100 80GB HBM3 | 81559 MB | 25C | 73/700W | 345 MHz |
-| 2 | NVIDIA H100 80GB HBM3 | 81559 MB | 26C | 69/700W | 345 MHz |
-| 3 | NVIDIA H100 80GB HBM3 | 81559 MB | 25C | 70/700W | 345 MHz |
-| 4 | NVIDIA H100 80GB HBM3 | 81559 MB | 24C | 69/700W | 345 MHz |
-| 5 | NVIDIA H100 80GB HBM3 | 81559 MB | 27C | 70/700W | 345 MHz |
-| 6 | NVIDIA H100 80GB HBM3 | 81559 MB | 25C | 70/700W | 345 MHz |
-| 7 | NVIDIA H100 80GB HBM3 | 81559 MB | 24C | 72/700W | 345 MHz |
-
-## Memory Bandwidth
-
-Source: pytorch
-
-| Metric | Value | Peak | Efficiency |
-|--------|-------|------|------------|
-| H2D (PCIe) | 11.8 GB/s | 0 GB/s | 0.0% |
-| D2H (PCIe) | 9.9 GB/s | 0 GB/s | 0.0% |
-| D2D (NVLink) | 829.1 GB/s | 3400 GB/s | 24.4% |
-
-**Verdict: WARN** (D2D 829.1 GB/s via PyTorch fallback; nvbandwidth unavailable — figure is indicative only, not a true HBM peak)
-
-## Compute Throughput
-
-| DType | Achieved (TFLOPS) | Peak | Threshold | Status |
-|-------|-------------------|------|------------|--------|
-| FP32 | 52.0 | 67 | >= 54 | WARN |
-| TF32 | 362.3 | 495 | >= 444 | FAIL |
-| FP16 | 691.0 | 990 | >= 734 | WARN |
-| BF16 | 713.0 | 990 | >= 745 | WARN |
-| FP8 | 1148.8 | 1979 | >= 1400 | FAIL |
-
-**Verdict: FAIL** (absolute TFLOPS thresholds; worst efficiency 58.0%)
-
----
-*Generated by GPU Test Suite v0.2.0*
\ No newline at end of file
diff --git a/reports_single_gpu_aikubeworker0016.json b/reports_single_gpu_aikubeworker0016.json
deleted file mode 100644
index 4b3c442..0000000
--- a/reports_single_gpu_aikubeworker0016.json
+++ /dev/null
@@ -1,292 +0,0 @@
-{
-  "timestamp": "2026-05-22T15:26:29.511252",
-  "gpu_info": {
-    "driver_version": "580.159.03",
-    "cuda_version": "13.0",
-    "gpu_count": 8,
-    "gpus": [
-      {
-        "index": 0,
-        "name": "NVIDIA H100 80GB HBM3",
-        "uuid": "GPU-dfbc9513-255d-4fe7-2b77-7b1ec3972e75",
-        "pci_bus_id": "00000000:18:00.0",
-        "pcie_link_gen": 5,
-        "pcie_link_width": 16,
-        "vram_total_mb": 81559,
-        "vram_used_mb": 4,
-        "vram_free_mb": 81076,
-        "power_draw": 69.81,
-        "power_limit": 700.0,
-        "clock_sm": 345,
-        "clock_mem": 2619,
-        "temperature": 20,
-        "fan_speed": 0,
-        "persistence_mode": false,
-        "compute_mode": "Default",
-        "serial_number": "1651924016120",
-        "ecc_errors_single": 0,
-        "ecc_errors_double": 0
-      },
-      {
-        "index": 1,
-        "name": "NVIDIA H100 80GB HBM3",
-        "uuid": "GPU-bb845ef7-d7b5-f011-9395-ea74274e2282",
-        "pci_bus_id": "00000000:2A:00.0",
-        "pcie_link_gen": 5,
-        "pcie_link_width": 16,
-        "vram_total_mb": 81559,
-        "vram_used_mb": 0,
-        "vram_free_mb": 81079,
-        "power_draw": 67.45,
-        "power_limit": 700.0,
-        "clock_sm": 345,
-        "clock_mem": 2619,
-        "temperature": 20,
-        "fan_speed": 0,
-        "persistence_mode": false,
-        "compute_mode": "Default",
-        "serial_number": "1651924015483",
-        "ecc_errors_single": 0,
-        "ecc_errors_double": 0
-      },
-      {
-        "index": 2,
-        "name": "NVIDIA H100 80GB HBM3",
-        "uuid": "GPU-3720cf13-2a34-be38-27be-0a7adc4addc4",
-        "pci_bus_id": "00000000:3A:00.0",
-        "pcie_link_gen": 5,
-        "pcie_link_width": 16,
-        "vram_total_mb": 81559,
-        "vram_used_mb": 0,
-        "vram_free_mb": 81079,
-        "power_draw": 66.69,
-        "power_limit": 700.0,
-        "clock_sm": 345,
-        "clock_mem": 2619,
-        "temperature": 21,
-        "fan_speed": 0,
-        "persistence_mode": false,
-        "compute_mode": "Default",
-        "serial_number": "1651924025595",
-        "ecc_errors_single": 0,
-        "ecc_errors_double": 0
-      },
-      {
-        "index": 3,
-        "name": "NVIDIA H100 80GB HBM3",
-        "uuid": "GPU-87080b2d-ac43-be0d-d574-c193078850ae",
-        "pci_bus_id": "00000000:5D:00.0",
-        "pcie_link_gen": 5,
-        "pcie_link_width": 16,
-        "vram_total_mb": 81559,
-        "vram_used_mb": 0,
-        "vram_free_mb": 81079,
-        "power_draw": 66.86,
-        "power_limit": 700.0,
-        "clock_sm": 345,
-        "clock_mem": 2619,
-        "temperature": 20,
-        "fan_speed": 0,
-        "persistence_mode": false,
-        "compute_mode": "Default",
-        "serial_number": "1651924016862",
-        "ecc_errors_single": 0,
-        "ecc_errors_double": 0
-      },
-      {
-        "index": 4,
-        "name": "NVIDIA H100 80GB HBM3",
-        "uuid": "GPU-599bd883-cc5c-a5dd-6c33-c15f7049da48",
-        "pci_bus_id": "00000000:9A:00.0",
-        "pcie_link_gen": 5,
-        "pcie_link_width": 16,
-        "vram_total_mb": 81559,
-        "vram_used_mb": 0,
-        "vram_free_mb": 81079,
-        "power_draw": 67.07,
-        "power_limit": 700.0,
-        "clock_sm": 345,
-        "clock_mem": 2619,
-        "temperature": 20,
-        "fan_speed": 0,
-        "persistence_mode": false,
-        "compute_mode": "Default",
-        "serial_number": "1651924025670",
-        "ecc_errors_single": 0,
-        "ecc_errors_double": 0
-      },
-      {
-        "index": 5,
-        "name": "NVIDIA H100 80GB HBM3",
-        "uuid": "GPU-a1c6bba4-61b0-e623-06c9-9c88635e26fe",
-        "pci_bus_id": "00000000:AB:00.0",
-        "pcie_link_gen": 5,
-        "pcie_link_width": 16,
-        "vram_total_mb": 81559,
-        "vram_used_mb": 0,
-        "vram_free_mb": 81079,
-        "power_draw": 69.12,
-        "power_limit": 700.0,
-        "clock_sm": 345,
-        "clock_mem": 2619,
-        "temperature": 22,
-        "fan_speed": 0,
-        "persistence_mode": false,
-        "compute_mode": "Default",
-        "serial_number": "1651924027166",
-        "ecc_errors_single": 0,
-        "ecc_errors_double": 0
-      },
-      {
-        "index": 6,
-        "name": "NVIDIA H100 80GB HBM3",
-        "uuid": "GPU-98745a0c-39bd-3e56-d6ca-54ba3647ab6d",
-        "pci_bus_id": "00000000:BA:00.0",
-        "pcie_link_gen": 5,
-        "pcie_link_width": 16,
-        "vram_total_mb": 81559,
-        "vram_used_mb": 0,
-        "vram_free_mb": 81079,
-        "power_draw": 67.61,
-        "power_limit": 700.0,
-        "clock_sm": 345,
-        "clock_mem": 2619,
-        "temperature": 20,
-        "fan_speed": 0,
-        "persistence_mode": false,
-        "compute_mode": "Default",
-        "serial_number": "1651924026234",
-        "ecc_errors_single": 0,
-        "ecc_errors_double": 0
-      },
-      {
-        "index": 7,
-        "name": "NVIDIA H100 80GB HBM3",
-        "uuid": "GPU-8c73bd8b-666b-357e-ac5d-c75ac7a759db",
-        "pci_bus_id": "00000000:DB:00.0",
-        "pcie_link_gen": 5,
-        "pcie_link_width": 16,
-        "vram_total_mb": 81559,
-        "vram_used_mb": 0,
-        "vram_free_mb": 81079,
-        "power_draw": 66.19,
-        "power_limit": 700.0,
-        "clock_sm": 345,
-        "clock_mem": 2619,
-        "temperature": 20,
-        "fan_speed": 0,
-        "persistence_mode": false,
-        "compute_mode": "Default",
-        "serial_number": "1651924027255",
-        "ecc_errors_single": 0,
-        "ecc_errors_double": 0
-      }
-    ],
-    "topology": "\t\u001b[4mGPU0\tGPU1\tGPU2\tGPU3\tGPU4\tGPU5\tGPU6\tGPU7\tNIC0\tNIC1\tNIC2\tNIC3\tNIC4\tNIC5\tNIC6\tNIC7\tNIC8\tNIC9\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\u001b[0m\nGPU0\t X \tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\tPIX\tNODE\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t0-55,112-167\t0\t\tN/A\nGPU1\tNV18\t X \tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\tNODE\tPIX\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t0-55,112-167\t0\t\tN/A\nGPU2\tNV18\tNV18\t X \tNV18\tNV18\tNV18\tNV18\tNV18\tNODE\tNODE\tPIX\tPIX\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t0-55,112-167\t0\t\tN/A\nGPU3\tNV18\tNV18\tNV18\t X \tNV18\tNV18\tNV18\tNV18\tNODE\tNODE\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t0-55,112-167\t0\t\tN/A\nGPU4\tNV18\tNV18\tNV18\tNV18\t X \tNV18\tNV18\tNV18\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tPIX\tNODE\tNODE\tNODE\t56-111,168-223\t1\t\tN/A\nGPU5\tNV18\tNV18\tNV18\tNV18\tNV18\t X \tNV18\tNV18\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\tPIX\tNODE\tNODE\t56-111,168-223\t1\t\tN/A\nGPU6\tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\t X \tNV18\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tPIX\tPIX\t56-111,168-223\t1\t\tN/A\nGPU7\tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\t X \tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tNODE\tNODE\t56-111,168-223\t1\t\tN/A\nNIC0\tPIX\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t X \tNODE\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC1\tNODE\tPIX\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tNODE\t X \tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC2\tNODE\tNODE\tPIX\tNODE\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\t X \tPIX\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC3\tNODE\tNODE\tPIX\tNODE\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tPIX\t X \tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC4\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tNODE\tNODE\t X \tPIX\tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC5\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tNODE\tNODE\tPIX\t X \tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC6\tSYS\tSYS\tSYS\tSYS\tPIX\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\t X 
\tNODE\tNODE\tNODE\t\t\t\t\nNIC7\tSYS\tSYS\tSYS\tSYS\tNODE\tPIX\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\t X \tNODE\tNODE\t\t\t\t\nNIC8\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tPIX\tNODE\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\t X \tPIX\t\t\t\t\nNIC9\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tPIX\tNODE\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tPIX\t X \t\t\t\t\n\nLegend:\n\n  X    = Self\n  SYS  = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n  NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n  PHB  = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n  PXB  = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n  PIX  = Connection traversing at most a single PCIe bridge\n  NV#  = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n  NIC0: mlx5_0\n  NIC1: mlx5_1\n  NIC2: mlx5_2\n  NIC3: mlx5_3\n  NIC4: mlx5_4\n  NIC5: mlx5_5\n  NIC6: mlx5_6\n  NIC7: mlx5_7\n  NIC8: mlx5_8\n  NIC9: mlx5_9\n\n",
-    "timestamp": "2026-05-22T15:26:36.627805",
-    "detected_gpu_type": "h100",
-    "gpu_label": "H100 SXM5"
-  },
-  "memory_bench": {
-    "memory": {
-      "source": "pytorch",
-      "h2d_bandwidth_gbps": 11.8,
-      "d2h_bandwidth_gbps": 10.1,
-      "d2d_bandwidth_gbps": 829.0,
-      "peak_bandwidth_gbps": 3400,
-      "efficiency_pct": 24.4,
-      "test_sizes_mb": [
-        1,
-        4,
-        16,
-        64,
-        256,
-        1024,
-        4096
-      ],
-      "bandwidth_by_size": {
-        "1": {
-          "h2d_gbps": 3.6,
-          "d2h_gbps": 1.4,
-          "d2d_gbps": 40.3
-        },
-        "4": {
-          "h2d_gbps": 7.7,
-          "d2h_gbps": 10.1,
-          "d2d_gbps": 159.5
-        },
-        "16": {
-          "h2d_gbps": 10.9,
-          "d2h_gbps": 1.9,
-          "d2d_gbps": 439.5
-        },
-        "64": {
-          "h2d_gbps": 11.8,
-          "d2h_gbps": 1.4,
-          "d2d_gbps": 740.5
-        },
-        "256": {
-          "h2d_gbps": 9.0,
-          "d2h_gbps": 1.4,
-          "d2d_gbps": 792.1
-        },
-        "1024": {
-          "h2d_gbps": 8.4,
-          "d2h_gbps": 1.4,
-          "d2d_gbps": 818.9
-        },
-        "4096": {
-          "h2d_gbps": 6.1,
-          "d2h_gbps": 1.4,
-          "d2d_gbps": 829.0
-        }
-      },
-      "per_gpu": []
-    }
-  },
-  "compute_bench": {
-    "compute": {
-      "per_dtype_tflops": {
-        "fp32": 51.9,
-        "tf32": 357.8,
-        "fp16": 667.2,
-        "bf16": 699.1,
-        "fp8": 1146.2
-      },
-      "peak_tflops": {
-        "fp32": 67,
-        "tf32": 495,
-        "fp16": 990,
-        "bf16": 990,
-        "fp8": 1979
-      },
-      "efficiency_pct": {
-        "fp32": 77.5,
-        "tf32": 72.3,
-        "fp16": 67.4,
-        "bf16": 70.6,
-        "fp8": 57.9
-      },
-      "pass_thresholds_tflops": {
-        "fp32": 54,
-        "tf32": 444,
-        "fp16": 734,
-        "bf16": 745,
-        "fp8": 1400
-      },
-      "per_gpu": [
-        {
-          "index": 0,
-          "fp32": 51.9,
-          "tf32": 357.8,
-          "fp16": 667.2,
-          "bf16": 699.1,
-          "fp8": 1146.2
-        }
-      ],
-      "matrix_size": 8192,
-      "warmup": 50,
-      "iterations": 500
-    }
-  }
-}
\ No newline at end of file
diff --git a/reports_single_gpu_aikubeworker0016.md b/reports_single_gpu_aikubeworker0016.md
deleted file mode 100644
index 49f9f45..0000000
--- a/reports_single_gpu_aikubeworker0016.md
+++ /dev/null
@@ -1,54 +0,0 @@
-# GPU Test Report
-
-- **Date:** 2026-05-22 15:27:53
-- **Host:** aikubeworker0016
-- **GPU:** NVIDIA H100 80GB HBM3 x8
-- **Driver:** 580.159.03 | **CUDA:** 13.0
-
-## Summary
-
-| Test | Result |
-|------|--------|
-| GPU Info | PASS (8 GPUs detected) |
-| Memory Bandwidth | WARN (829 GB/s via PyTorch fallback) |
-| Compute Throughput | FAIL (worst TF32 358 vs >= 444) |
-
-## GPU Information
-
-| GPU | Model | VRAM | Temp | Power | SM Clock |
-|-----|-------|------|------|-------|----------|
-| 0 | NVIDIA H100 80GB HBM3 | 81559 MB | 20C | 70/700W | 345 MHz |
-| 1 | NVIDIA H100 80GB HBM3 | 81559 MB | 20C | 67/700W | 345 MHz |
-| 2 | NVIDIA H100 80GB HBM3 | 81559 MB | 21C | 67/700W | 345 MHz |
-| 3 | NVIDIA H100 80GB HBM3 | 81559 MB | 20C | 67/700W | 345 MHz |
-| 4 | NVIDIA H100 80GB HBM3 | 81559 MB | 20C | 67/700W | 345 MHz |
-| 5 | NVIDIA H100 80GB HBM3 | 81559 MB | 22C | 69/700W | 345 MHz |
-| 6 | NVIDIA H100 80GB HBM3 | 81559 MB | 20C | 68/700W | 345 MHz |
-| 7 | NVIDIA H100 80GB HBM3 | 81559 MB | 20C | 66/700W | 345 MHz |
-
-## Memory Bandwidth
-
-Source: pytorch
-
-| Metric | Value | Peak | Efficiency |
-|--------|-------|------|------------|
-| H2D (PCIe) | 11.8 GB/s | 0 GB/s | 0.0% |
-| D2H (PCIe) | 10.1 GB/s | 0 GB/s | 0.0% |
-| D2D (NVLink) | 829.0 GB/s | 3400 GB/s | 24.4% |
-
-**Verdict: WARN** (D2D 829.0 GB/s via PyTorch fallback; nvbandwidth unavailable — figure is indicative only, not a true HBM peak)
-
-## Compute Throughput
-
-| DType | Achieved (TFLOPS) | Peak | Threshold | Status |
-|-------|-------------------|------|------------|--------|
-| FP32 | 51.9 | 67 | >= 54 | WARN |
-| TF32 | 357.8 | 495 | >= 444 | FAIL |
-| FP16 | 667.2 | 990 | >= 734 | WARN |
-| BF16 | 699.1 | 990 | >= 745 | WARN |
-| FP8 | 1146.2 | 1979 | >= 1400 | FAIL |
-
-**Verdict: FAIL** (absolute TFLOPS thresholds; worst efficiency 57.9%)
-
----
-*Generated by GPU Test Suite v0.2.0*
\ No newline at end of file
diff --git a/reports_stress_smoke_reasons_aikubeworker0012.json b/reports_stress_smoke_reasons_aikubeworker0012.json
deleted file mode 100644
index 2722c96..0000000
--- a/reports_stress_smoke_reasons_aikubeworker0012.json
+++ /dev/null
@@ -1,165 +0,0 @@
-{
-  "stress": {
-    "source": "pytorch",
-    "passed": false,
-    "duration_sec": 45,
-    "elapsed_sec": 45.4,
-    "gpu_status": {
-      "0": "PASS",
-      "1": "PASS",
-      "2": "PASS",
-      "3": "PASS",
-      "4": "PASS",
-      "5": "PASS",
-      "6": "PASS",
-      "7": "PASS"
-    },
-    "telemetry": {
-      "passed": false,
-      "samples": 39,
-      "steady_samples": 31,
-      "warmup_sec": 9.0,
-      "max_temp_c": {
-        "0": 59.0,
-        "1": 58.0,
-        "2": 65.0,
-        "3": 54.0,
-        "4": 59.0,
-        "5": 66.0,
-        "6": 62.0,
-        "7": 55.0
-      },
-      "avg_power_w": {
-        "0": 697.0,
-        "1": 697.4,
-        "2": 697.9,
-        "3": 698.0,
-        "4": 697.8,
-        "5": 697.6,
-        "6": 697.9,
-        "7": 698.2
-      },
-      "temp_delta_c": 12.0,
-      "throttle_events": [
-        {
-          "gpu": 0,
-          "throttle": "0x0000000000000004",
-          "real_throttle": "0x4"
-        },
-        {
-          "gpu": 1,
-          "throttle": "0x0000000000000004",
-          "real_throttle": "0x4"
-        },
-        {
-          "gpu": 2,
-          "throttle": "0x0000000000000004",
-          "real_throttle": "0x4"
-        },
-        {
-          "gpu": 3,
-          "throttle": "0x0000000000000004",
-          "real_throttle": "0x4"
-        },
-        {
-          "gpu": 4,
-          "throttle": "0x0000000000000004",
-          "real_throttle": "0x4"
-        },
-        {
-          "gpu": 5,
-          "throttle": "0x0000000000000004",
-          "real_throttle": "0x4"
-        },
-        {
-          "gpu": 6,
-          "throttle": "0x0000000000000004",
-          "real_throttle": "0x4"
-        },
-        {
-          "gpu": 7,
-          "throttle": "0x0000000000000004",
-          "real_throttle": "0x4"
-        },
-        {
-          "gpu": 0,
-          "throttle": "0x0000000000000004",
-          "real_throttle": "0x4"
-        },
-        {
-          "gpu": 1,
-          "throttle": "0x0000000000000004",
-          "real_throttle": "0x4"
-        },
-        {
-          "gpu": 2,
-          "throttle": "0x0000000000000004",
-          "real_throttle": "0x4"
-        },
-        {
-          "gpu": 3,
-          "throttle": "0x0000000000000004",
-          "real_throttle": "0x4"
-        },
-        {
-          "gpu": 4,
-          "throttle": "0x0000000000000004",
-          "real_throttle": "0x4"
-        },
-        {
-          "gpu": 5,
-          "throttle": "0x0000000000000004",
-          "real_throttle": "0x4"
-        },
-        {
-          "gpu": 6,
-          "throttle": "0x0000000000000004",
-          "real_throttle": "0x4"
-        },
-        {
-          "gpu": 7,
-          "throttle": "0x0000000000000004",
-          "real_throttle": "0x4"
-        },
-        {
-          "gpu": 0,
-          "throttle": "0x0000000000000004",
-          "real_throttle": "0x4"
-        },
-        {
-          "gpu": 1,
-          "throttle": "0x0000000000000004",
-          "real_throttle": "0x4"
-        },
-        {
-          "gpu": 2,
-          "throttle": "0x0000000000000004",
-          "real_throttle": "0x4"
-        },
-        {
-          "gpu": 3,
-          "throttle": "0x0000000000000004",
-          "real_throttle": "0x4"
-        }
-      ],
-      "throttle_event_count": 248,
-      "xid_events": [],
-      "tflops_jitter_pct": 4.07,
-      "steady_tflops_samples": 781,
-      "failures": [
-        "GPU temperature delta 12.0C exceeds 5.0C",
-        "non-idle throttle reasons observed in 248 samples (first: GPU 0 0x4)"
-      ],
-      "thresholds": {
-        "max_temp_c": 80.0,
-        "max_temp_delta_c": 5.0,
-        "min_power_w": 630.0,
-        "max_tflops_jitter_pct": 5.0,
-        "warmup_sec": 10.0,
-        "min_steady_samples": 10
-      }
-    },
-    "timestamp": "2026-05-22T17:52:09.074859"
-  },
-  "timestamp": "2026-05-22T17:52:09.082873"
-}
\ No newline at end of file
diff --git a/reports_stress_smoke_reasons_aikubeworker0012.md b/reports_stress_smoke_reasons_aikubeworker0012.md
deleted file mode 100644
index cea30e2..0000000
--- a/reports_stress_smoke_reasons_aikubeworker0012.md
+++ /dev/null
@@ -1,29 +0,0 @@
-# GPU Test Report
-
-- **Date:** 2026-05-22T17:52:09.082873
-- **Host:** aikubeworker0012
-
-## Summary
-
-| Test | Result |
-|------|--------|
-| Stress Test | FAIL |
-
-## Stress Test
-
-- **Source:** pytorch
-- **Duration:** 45s (requested 45s)
-- **Telemetry samples:** 39
-- **Max temp:** {'0': 59.0, '1': 58.0, '2': 65.0, '3': 54.0, '4': 59.0, '5': 66.0, '6': 62.0, '7': 55.0}
-- **Avg power:** {'0': 697.0, '1': 697.4, '2': 697.9, '3': 698.0, '4': 697.8, '5': 697.6, '6': 697.9, '7': 698.2}
-- **Temp delta:** 12.0 C
-- **TFLOPS jitter:** 4.07%
-- **Throttle events:** 248
-- **XID events:** 0
-- **Failure reasons:**
-  - GPU temperature delta 12.0C exceeds 5.0C
-  - non-idle throttle reasons observed in 248 samples (first: GPU 0 0x4)
-- **Result: FAIL**
-
----
-*Generated by GPU Test Suite v0.2.0*
\ No newline at end of file
diff --git a/reports_stress_smoke_reasons_aikubeworker0016.json b/reports_stress_smoke_reasons_aikubeworker0016.json
deleted file mode 100644
index 8d39f58..0000000
--- a/reports_stress_smoke_reasons_aikubeworker0016.json
+++ /dev/null
@@ -1,165 +0,0 @@
-{
-  "stress": {
-    "source": "pytorch",
-    "passed": false,
-    "duration_sec": 45,
-    "elapsed_sec": 45.4,
-    "gpu_status": {
-      "0": "PASS",
-      "1": "PASS",
-      "2": "PASS",
-      "3": "PASS",
-      "4": "PASS",
-      "5": "PASS",
-      "6": "PASS",
-      "7": "PASS"
-    },
-    "telemetry": {
-      "passed": false,
-      "samples": 39,
-      "steady_samples": 31,
-      "warmup_sec": 9.0,
-      "max_temp_c": {
-        "0": 50.0,
-        "1": 56.0,
-        "2": 57.0,
-        "3": 52.0,
-        "4": 51.0,
-        "5": 58.0,
-        "6": 53.0,
-        "7": 51.0
-      },
-      "avg_power_w": {
-        "0": 698.3,
-        "1": 698.5,
-        "2": 697.6,
-        "3": 697.9,
-        "4": 697.8,
-        "5": 698.0,
-        "6": 697.5,
-        "7": 698.0
-      },
-      "temp_delta_c": 8.0,
-      "throttle_events": [
-        {
-          "gpu": 0,
-          "throttle": "0x0000000000000004",
-          "real_throttle": "0x4"
-        },
-        {
-          "gpu": 1,
-          "throttle": "0x0000000000000004",
-          "real_throttle": "0x4"
-        },
-        {
-          "gpu": 2,
-          "throttle": "0x0000000000000004",
-          "real_throttle": "0x4"
-        },
-        {
-          "gpu": 3,
-          "throttle": "0x0000000000000004",
-          "real_throttle": "0x4"
-        },
-        {
-          "gpu": 4,
-          "throttle": "0x0000000000000004",
-          "real_throttle": "0x4"
-        },
-        {
-          "gpu": 5,
-          "throttle": "0x0000000000000004",
-          "real_throttle": "0x4"
-        },
-        {
-          "gpu": 6,
-          "throttle": "0x0000000000000004",
-          "real_throttle": "0x4"
-        },
-        {
-          "gpu": 7,
-          "throttle": "0x0000000000000004",
-          "real_throttle": "0x4"
-        },
-        {
-          "gpu": 0,
-          "throttle": "0x0000000000000004",
-          "real_throttle": "0x4"
-        },
-        {
-          "gpu": 1,
-          "throttle": "0x0000000000000004",
-          "real_throttle": "0x4"
-        },
-        {
-          "gpu": 2,
-          "throttle": "0x0000000000000004",
-          "real_throttle": "0x4"
-        },
-        {
-          "gpu": 3,
-          "throttle": "0x0000000000000004",
-          "real_throttle": "0x4"
-        },
-        {
-          "gpu": 4,
-          "throttle": "0x0000000000000004",
-          "real_throttle": "0x4"
-        },
-        {
-          "gpu": 5,
-          "throttle": "0x0000000000000004",
-          "real_throttle": "0x4"
-        },
-        {
-          "gpu": 6,
-          "throttle": "0x0000000000000004",
-          "real_throttle": "0x4"
-        },
-        {
-          "gpu": 7,
-          "throttle": "0x0000000000000004",
-          "real_throttle": "0x4"
-        },
-        {
-          "gpu": 0,
-          "throttle": "0x0000000000000004",
-          "real_throttle": "0x4"
-        },
-        {
-          "gpu": 1,
-          "throttle": "0x0000000000000004",
-          "real_throttle": "0x4"
-        },
-        {
-          "gpu": 2,
-          "throttle": "0x0000000000000004",
-          "real_throttle": "0x4"
-        },
-        {
-          "gpu": 3,
-          "throttle": "0x0000000000000004",
-          "real_throttle": "0x4"
-        }
-      ],
-      "throttle_event_count": 248,
-      "xid_events": [],
-      "tflops_jitter_pct": 3.77,
-      "steady_tflops_samples": 787,
-      "failures": [
-        "GPU temperature delta 8.0C exceeds 5.0C",
-        "non-idle throttle reasons observed in 248 samples (first: GPU 0 0x4)"
-      ],
-      "thresholds": {
-        "max_temp_c": 80.0,
-        "max_temp_delta_c": 5.0,
-        "min_power_w": 630.0,
-        "max_tflops_jitter_pct": 5.0,
-        "warmup_sec": 10.0,
-        "min_steady_samples": 10
-      }
-    },
-    "timestamp": "2026-05-22T17:53:02.058687"
-  },
-  "timestamp": "2026-05-22T17:53:02.066792"
-}
\ No newline at end of file
diff --git a/reports_stress_smoke_reasons_aikubeworker0016.md b/reports_stress_smoke_reasons_aikubeworker0016.md
deleted file mode 100644
index 9f9c3ab..0000000
--- a/reports_stress_smoke_reasons_aikubeworker0016.md
+++ /dev/null
@@ -1,29 +0,0 @@
-# GPU Test Report
-
-- **Date:** 2026-05-22T17:53:02.066792
-- **Host:** aikubeworker0016
-
-## Summary
-
-| Test | Result |
-|------|--------|
-| Stress Test | FAIL |
-
-## Stress Test
-
-- **Source:** pytorch
-- **Duration:** 45s (requested 45s)
-- **Telemetry samples:** 39
-- **Max temp:** {'0': 50.0, '1': 56.0, '2': 57.0, '3': 52.0, '4': 51.0, '5': 58.0, '6': 53.0, '7': 51.0}
-- **Avg power:** {'0': 698.3, '1': 698.5, '2': 697.6, '3': 697.9, '4': 697.8, '5': 698.0, '6': 697.5, '7': 698.0}
-- **Temp delta:** 8.0 C
-- **TFLOPS jitter:** 3.77%
-- **Throttle events:** 248
-- **XID events:** 0
-- **Failure reasons:**
-  - GPU temperature delta 8.0C exceeds 5.0C
-  - non-idle throttle reasons observed in 248 samples (first: GPU 0 0x4)
-- **Result: FAIL**
-
----
-*Generated by GPU Test Suite v0.2.0*
\ No newline at end of file
diff --git a/reports_test_all_latest_aikubeworker0012_20260522_203246.md b/reports_test_all_latest_aikubeworker0012_20260522_203246.md
deleted file mode 100644
index 8853d18..0000000
--- a/reports_test_all_latest_aikubeworker0012_20260522_203246.md
+++ /dev/null
@@ -1,322 +0,0 @@
-# GPU Test Report
-
-- **Date:** 2026-05-22T20:32:51.687830
-- **Host:** aikubeworker0012
-- **GPU:** NVIDIA H100 80GB HBM3 x8
-- **Driver:** 580.159.03 | **CUDA:** 13.0
-
-## Overall Acceptance Verdict
-
-**Result: FAIL**
-
-Failed or unverified items:
-- Compute Throughput: FAIL (FP16 spread 3.04% > 3%)
-- NCCL: FAIL
-- Stress Test: FAIL
-- RDMA: FAIL
-
-## Summary
-
-| Test | Result |
-|------|--------|
-| GPU Info | PASS (8 GPUs detected) |
-| Health Check | PASS |
-| Memory Bandwidth | PASS (108.1%) |
-| Compute Throughput | FAIL (FP16 spread 3.04% > 3%) |
-| NVLink/NVSwitch | PASS |
-| DCGM | PASS |
-| NCCL | FAIL |
-| Stress Test | FAIL |
-| RDMA | FAIL |
-| Training | PASS (216498 tokens/sec) |
-
-## GPU Information
-
-| GPU | Model | VRAM | Temp | Power | SM Clock |
-|-----|-------|------|------|-------|----------|
-| 0 | NVIDIA H100 80GB HBM3 | 81559 MB | 25C | 69/700W | 345 MHz |
-| 1 | NVIDIA H100 80GB HBM3 | 81559 MB | 25C | 73/700W | 345 MHz |
-| 2 | NVIDIA H100 80GB HBM3 | 81559 MB | 26C | 69/700W | 345 MHz |
-| 3 | NVIDIA H100 80GB HBM3 | 81559 MB | 24C | 69/700W | 345 MHz |
-| 4 | NVIDIA H100 80GB HBM3 | 81559 MB | 24C | 69/700W | 345 MHz |
-| 5 | NVIDIA H100 80GB HBM3 | 81559 MB | 27C | 70/700W | 345 MHz |
-| 6 | NVIDIA H100 80GB HBM3 | 81559 MB | 25C | 70/700W | 345 MHz |
-| 7 | NVIDIA H100 80GB HBM3 | 81559 MB | 24C | 71/700W | 345 MHz |
-
-## Health Check
-
-**Overall: PASS**
-
-| GPU | Temp | Power | ECC | PCIe | Throttle | Status |
-|-----|------|-------|-----|------|----------|--------|
-| 0 | 25C PASS | 69W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** |
-| 1 | 25C PASS | 73W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** |
-| 2 | 26C PASS | 69W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** |
-| 3 | 24C PASS | 70W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** |
-| 4 | 24C PASS | 69W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** |
-| 5 | 27C PASS | 70W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** |
-| 6 | 25C PASS | 70W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** |
-| 7 | 24C PASS | 71W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** |
-
-## Memory Bandwidth
-
-Source: nvbandwidth
-
-| Metric | Value | Peak | Efficiency |
-|--------|-------|------|------------|
-| H2D (PCIe) | 55.4 GB/s | 64 GB/s | 86.6% |
-| D2H (PCIe) | 54.0 GB/s | 64 GB/s | 84.4% |
-| D2D (NVLink) | 486.5 GB/s | 450 GB/s | 108.1% |
-
-**Verdict: PASS** (D2D efficiency 108.1%)
-
-## Compute Throughput
-
-| DType | Achieved (TFLOPS) | Peak | Threshold | Status |
-|-------|-------------------|------|------------|--------|
-| FP32 | 51.9 | 67 | >= 54 | FAIL |
-| TF32 | 364.9 | 495 | >= 444 | FAIL |
-| FP16 | 680.0 | 990 | >= 734 | FAIL |
-| BF16 | 713.2 | 990 | >= 745 | FAIL |
-| FP8 | 1170.4 | 1979 | >= 1400 | FAIL |
-| FP64 | 46.9 | 67 | >= 63 | FAIL |
-| INT8 | 100.4 | 1979 | >= 1536 | FAIL |
-
-**Verdict: FAIL** (absolute TFLOPS thresholds; worst efficiency 5.1%)
-
-### Compute Consistency
-
-| DType | Min | Mean | Max | Spread | Limit | Status |
-|-------|-----|------|-----|--------|-------|--------|
-| FP32 | 51.9 | 52.0 | 52.1 | 0.38% | <= 3% | PASS |
-| TF32 | 361.0 | 364.9 | 369.0 | 2.19% | <= 3% | PASS |
-| FP16 | 667.3 | 680.0 | 688.0 | 3.04% | <= 3% | FAIL |
-| BF16 | 703.0 | 713.3 | 735.7 | 4.58% | <= 3% | FAIL |
-| FP8 | 1156.9 | 1170.5 | 1186.1 | 2.49% | <= 3% | PASS |
-| FP64 | 45.9 | 46.9 | 47.5 | 3.41% | <= 3% | FAIL |
-| INT8 | 100.4 | 100.4 | 100.4 | 0.00% | <= 3% | PASS |
-
-### Compute Per-GPU TFLOPS
-
-| GPU | FP32 | TF32 | FP16 | BF16 | FP8 | FP64 | INT8 |
-|---|---|---|---|---|---|---|---|
-| 0 | 52.0 | 369.0 | 688.0 | 735.7 | 1186.1 | 47.5 | 100.4 |
-| 1 | 51.9 | 365.6 | 675.3 | 711.6 | 1171.0 | 47.0 | 100.4 |
-| 2 | 51.9 | 364.9 | 685.7 | 715.3 | 1175.3 | 47.1 | 100.4 |
-| 3 | 51.9 | 364.0 | 679.9 | 704.0 | 1167.6 | 47.4 | 100.4 |
-| 4 | 51.9 | 367.7 | 681.2 | 719.0 | 1178.0 | 46.6 | 100.4 |
-| 5 | 52.0 | 364.3 | 680.8 | 712.3 | 1165.5 | 46.8 | 100.4 |
-| 6 | 52.1 | 362.9 | 681.8 | 703.0 | 1156.9 | 46.9 | 100.4 |
-| 7 | 51.9 | 361.0 | 667.3 | 705.3 | 1163.2 | 45.9 | 100.4 |
-
-## NVLink/NVSwitch
-
-**Overall: PASS**
-
-| GPU | Active Links | Issues |
-|-----|--------------|--------|
-| 0 | 18/18 | OK |
-| 1 | 18/18 | OK |
-| 2 | 18/18 | OK |
-| 3 | 18/18 | OK |
-| 4 | 18/18 | OK |
-| 5 | 18/18 | OK |
-| 6 | 18/18 | OK |
-| 7 | 18/18 | OK |
-
-## DCGM Diagnostic
-
-**Overall: PASS**
-
-| Subtest | Status |
-|---------|--------|
-| Deployment/software/GPU0 | PASS |
-| Deployment/software/GPU1 | PASS |
-| Deployment/software/GPU2 | PASS |
-| Deployment/software/GPU3 | PASS |
-| Deployment/software/GPU4 | PASS |
-| Deployment/software/GPU5 | PASS |
-| Deployment/software/GPU6 | PASS |
-| Deployment/software/GPU7 | PASS |
-| Deployment/software/summary | PASS |
-| Hardware/memory/GPU0 | PASS |
-| Hardware/memory/GPU1 | PASS |
-| Hardware/memory/GPU2 | PASS |
-| Hardware/memory/GPU3 | PASS |
-| Hardware/memory/GPU4 | PASS |
-| Hardware/memory/GPU5 | PASS |
-| Hardware/memory/GPU6 | PASS |
-| Hardware/memory/GPU7 | PASS |
-| Hardware/memory/summary | PASS |
-| Hardware/diagnostic/GPU0 | PASS |
-| Hardware/diagnostic/GPU1 | PASS |
-| Hardware/diagnostic/GPU2 | PASS |
-| Hardware/diagnostic/GPU3 | PASS |
-| Hardware/diagnostic/GPU4 | PASS |
-| Hardware/diagnostic/GPU5 | PASS |
-| Hardware/diagnostic/GPU6 | PASS |
-| Hardware/diagnostic/GPU7 | PASS |
-| Hardware/diagnostic/summary | PASS |
-| Hardware/nvbandwidth/GPU0 | PASS |
-| Hardware/nvbandwidth/GPU1 | PASS |
-| Hardware/nvbandwidth/GPU2 | PASS |
-| Hardware/nvbandwidth/GPU3 | PASS |
-| Hardware/nvbandwidth/GPU4 | PASS |
-| Hardware/nvbandwidth/GPU5 | PASS |
-| Hardware/nvbandwidth/GPU6 | PASS |
-| Hardware/nvbandwidth/GPU7 | PASS |
-| Hardware/nvbandwidth/summary | PASS |
-| Integration/pcie/GPU0 | PASS |
-| Integration/pcie/GPU1 | PASS |
-| Integration/pcie/GPU2 | PASS |
-| Integration/pcie/GPU3 | PASS |
-| Integration/pcie/GPU4 | PASS |
-| Integration/pcie/GPU5 | PASS |
-| Integration/pcie/GPU6 | PASS |
-| Integration/pcie/GPU7 | PASS |
-| Integration/pcie/summary | PASS |
-| Stress/targeted_stress/GPU0 | PASS |
-| Stress/targeted_stress/GPU1 | PASS |
-| Stress/targeted_stress/GPU2 | PASS |
-| Stress/targeted_stress/GPU3 | PASS |
-| Stress/targeted_stress/GPU4 | PASS |
-| Stress/targeted_stress/GPU5 | PASS |
-| Stress/targeted_stress/GPU6 | PASS |
-| Stress/targeted_stress/GPU7 | PASS |
-| Stress/targeted_stress/summary | PASS |
-| Stress/targeted_power/GPU0 | PASS |
-| Stress/targeted_power/GPU1 | PASS |
-| Stress/targeted_power/GPU2 | PASS |
-| Stress/targeted_power/GPU3 | PASS |
-| Stress/targeted_power/GPU4 | PASS |
-| Stress/targeted_power/GPU5 | PASS |
-| Stress/targeted_power/GPU6 | PASS |
-| Stress/targeted_power/GPU7 | PASS |
-| Stress/targeted_power/summary | PASS |
-
-## NCCL Multi-GPU
-
-Source: nccl-tests | GPUs: 8
-
-| Operation | Bus BW (GB/s) | Threshold | Status |
-|-----------|---------------|-----------|--------|
-| allreduce | 472.3 | >= 405 | FAIL |
-| alltoall | 343.3 | >= 315 | FAIL |
-| broadcast | 364.1 | >= 360 | FAIL |
-| reducescatter | 352.8 | >= 405 | FAIL |
-| allgather | 366.4 | >= 405 | FAIL |
-| sendrecv | 369.0 | >= 360 | FAIL |
-
-### NCCL allreduce by size
-
-| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status |
-|------|---------------------|-------|------|--------|-----------|--------|
-| 1M | 24.9, 25.0, 24.7 | 24.7 | 24.9 | 0.50% | >= 405 | FAIL |
-| 256M | 421.6, 421.8, 421.6 | 421.6 | 421.7 | 0.02% | >= 405 | PASS |
-| 2G | 472.8, 472.7, 471.5 | 471.5 | 472.3 | 0.13% | >= 405 | PASS |
-
-### NCCL alltoall by size
-
-| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status |
-|------|---------------------|-------|------|--------|-----------|--------|
-| 1M | 8.1, 8.0, 8.0 | 8.0 | 8.0 | 0.59% | >= 315 | FAIL |
-| 256M | 305.3, 314.9, 313.1 | 305.3 | 311.1 | 1.34% | >= 315 | FAIL |
-| 2G | 342.1, 342.5, 345.4 | 342.1 | 343.3 | 0.43% | >= 315 | PASS |
-
-### NCCL broadcast by size
-
-| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status |
-|------|---------------------|-------|------|--------|-----------|--------|
-| 1M | 14.5, 14.6, 14.2 | 14.2 | 14.4 | 1.18% | >= 360 | FAIL |
-| 256M | 344.2, 345.9, 344.6 | 344.2 | 344.9 | 0.21% | >= 360 | FAIL |
-| 2G | 364.2, 364.0, 364.1 | 364.0 | 364.1 | 0.02% | >= 360 | PASS |
-
-### NCCL reducescatter by size
-
-| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status |
-|------|---------------------|-------|------|--------|-----------|--------|
-| 1M | 14.1, 13.8, 14.2 | 13.8 | 14.0 | 1.21% | >= 405 | FAIL |
-| 256M | 328.6, 328.3, 328.2 | 328.2 | 328.4 | 0.05% | >= 405 | FAIL |
-| 2G | 352.6, 352.4, 353.3 | 352.4 | 352.8 | 0.11% | >= 405 | FAIL |
-
-### NCCL allgather by size
-
-| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status |
-|------|---------------------|-------|------|--------|-----------|--------|
-| 1M | 14.6, 14.3, 14.4 | 14.3 | 14.4 | 0.86% | >= 405 | FAIL |
-| 256M | 350.5, 350.4, 349.9 | 349.9 | 350.3 | 0.07% | >= 405 | FAIL |
-| 2G | 366.3, 366.6, 366.2 | 366.2 | 366.4 | 0.05% | >= 405 | FAIL |
-
-### NCCL sendrecv by size
-
-| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status |
-|------|---------------------|-------|------|--------|-----------|--------|
-| 1M | 18.4, 18.4, 18.4 | 18.4 | 18.4 | 0.00% | >= 360 | FAIL |
-| 256M | 350.9, 351.6, 351.4 | 350.9 | 351.3 | 0.08% | >= 360 | FAIL |
-| 2G | 368.9, 369.1, 368.9 | 368.9 | 369.0 | 0.03% | >= 360 | PASS |
-
-**Overall: FAIL**
-
-## Stress Test
-
-- **Source:** pytorch
-- **Duration:** 1800s (requested 1800s)
-- **Telemetry samples:** 1266
-- **Max temp:** {0: 60.0, 1: 60.0, 2: 68.0, 3: 56.0, 4: 60.0, 5: 68.0, 6: 64.0, 7: 56.0}
-- **Avg power:** {0: 697.7, 1: 697.5, 2: 697.1, 3: 697.8, 4: 697.8, 5: 697.9, 6: 697.7, 7: 698.3}
-- **Temp delta:** 12.0 C
-- **TFLOPS jitter:** 4.37%
-- **Steady TFLOPS samples:** 37672
-- **Throttle events:** 9712
-- **XID events:** 0
-- **Failure reasons:**
-  - GPU temperature delta 12.0C exceeds 5.0C
-  - non-idle throttle reasons observed in 9712 samples (first: GPU 0 0x4)
-- **Result: FAIL**
-
-## RDMA/InfiniBand
-
-### RDMA Port Checks
-
-| Device | Port | State | Rate | Required | Status |
-|--------|------|-------|------|----------|--------|
-| mlx5_0 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS |
-| mlx5_1 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS |
-| mlx5_4 | 1 | 4: ACTIVE | 100 Gb/sec (2X HDR) | >= 400Gbps ACTIVE | FAIL |
-| mlx5_5 | 1 | 4: ACTIVE | 100 Gb/sec (2X HDR) | >= 400Gbps ACTIVE | FAIL |
-| mlx5_6 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS |
-| mlx5_7 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS |
-
-| Test | Value | Threshold | Status |
-|------|-------|-----------|--------|
-| ib_write_bw | 49.5 GB/s | >= 47 GB/s | PASS |
-| ib_read_bw | 39.1 GB/s | >= 47 GB/s | FAIL |
-| ib_write_lat | 1.25 us | <= 2 us | PASS |
-| ib_read_lat | 2.60 us | <= 3.5 us | PASS |
-| ibping | local_loopback target=0x58 count=5 | 0% packet loss | PASS |
-
-- **PFC/ECN/CNP/congestion counters checked:** 146
-- **PFC/ECN/CNP/congestion non-zero:** no
-- **Failure reasons:**
-  - mlx5_4 port 1 state/rate failed (4: ACTIVE, 100 Gb/sec (2X HDR); required >= 400.0Gbps ACTIVE)
-  - mlx5_5 port 1 state/rate failed (4: ACTIVE, 100 Gb/sec (2X HDR); required >= 400.0Gbps ACTIVE)
-  - ib_read_bw bandwidth 39.12GB/s < 47GB/s
-**Overall: FAIL**
-
-## Training Simulation
-
-| Metric | Value |
-|--------|-------|
-| Model | synthetic_transformer_1.5b |
-| Params | 1470.5M |
-| Throughput | 216498 tokens/sec |
-| Avg Step Time | 75.7 ms |
-| Warmup Steps | 5 |
-| Peak Memory | 18.1 GB |
-| Final Loss | 0.0039 |
-| Step Jitter | 1.89% |
-| Distributed Mode | ddp |
-| Verdict | PASS (216498 tokens/sec) |
-
----
-*Generated by GPU Test Suite v0.2.0*
\ No newline at end of file
diff --git a/reports_test_all_latest_aikubeworker0016_20260522_203447.md b/reports_test_all_latest_aikubeworker0016_20260522_203447.md
deleted file mode 100644
index 3a4077f..0000000
--- a/reports_test_all_latest_aikubeworker0016_20260522_203447.md
+++ /dev/null
@@ -1,322 +0,0 @@
-# GPU Test Report
-
-- **Date:** 2026-05-22T20:34:52.129246
-- **Host:** aikubeworker0016
-- **GPU:** NVIDIA H100 80GB HBM3 x8
-- **Driver:** 580.159.03 | **CUDA:** 13.0
-
-## Overall Acceptance Verdict
-
-**Result: FAIL**
-
-Failed or unverified items:
-- Compute Throughput: FAIL (BF16 spread 3.44% > 3%)
-- NCCL: FAIL
-- Stress Test: FAIL
-- RDMA: FAIL
-
-## Summary
-
-| Test | Result |
-|------|--------|
-| GPU Info | PASS (8 GPUs detected) |
-| Health Check | PASS |
-| Memory Bandwidth | PASS (108.1%) |
-| Compute Throughput | FAIL (BF16 spread 3.44% > 3%) |
-| NVLink/NVSwitch | PASS |
-| DCGM | PASS |
-| NCCL | FAIL |
-| Stress Test | FAIL |
-| RDMA | FAIL |
-| Training | PASS (216683 tokens/sec) |
-
-## GPU Information
-
-| GPU | Model | VRAM | Temp | Power | SM Clock |
-|-----|-------|------|------|-------|----------|
-| 0 | NVIDIA H100 80GB HBM3 | 81559 MB | 20C | 70/700W | 345 MHz |
-| 1 | NVIDIA H100 80GB HBM3 | 81559 MB | 21C | 68/700W | 345 MHz |
-| 2 | NVIDIA H100 80GB HBM3 | 81559 MB | 21C | 67/700W | 345 MHz |
-| 3 | NVIDIA H100 80GB HBM3 | 81559 MB | 20C | 67/700W | 345 MHz |
-| 4 | NVIDIA H100 80GB HBM3 | 81559 MB | 20C | 68/700W | 345 MHz |
-| 5 | NVIDIA H100 80GB HBM3 | 81559 MB | 22C | 69/700W | 345 MHz |
-| 6 | NVIDIA H100 80GB HBM3 | 81559 MB | 20C | 68/700W | 345 MHz |
-| 7 | NVIDIA H100 80GB HBM3 | 81559 MB | 20C | 66/700W | 345 MHz |
-
-## Health Check
-
-**Overall: PASS**
-
-| GPU | Temp | Power | ECC | PCIe | Throttle | Status |
-|-----|------|-------|-----|------|----------|--------|
-| 0 | 20C PASS | 70W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** |
-| 1 | 21C PASS | 68W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** |
-| 2 | 21C PASS | 67W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** |
-| 3 | 20C PASS | 67W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** |
-| 4 | 20C PASS | 68W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** |
-| 5 | 22C PASS | 69W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** |
-| 6 | 20C PASS | 68W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** |
-| 7 | 20C PASS | 66W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** |
-
-## Memory Bandwidth
-
-Source: nvbandwidth
-
-| Metric | Value | Peak | Efficiency |
-|--------|-------|------|------------|
-| H2D (PCIe) | 55.4 GB/s | 64 GB/s | 86.6% |
-| D2H (PCIe) | 54.4 GB/s | 64 GB/s | 85.0% |
-| D2D (NVLink) | 486.6 GB/s | 450 GB/s | 108.1% |
-
-**Verdict: PASS** (D2D efficiency 108.1%)
-
-## Compute Throughput
-
-| DType | Achieved (TFLOPS) | Peak | Threshold | Status |
-|-------|-------------------|------|------------|--------|
-| FP32 | 52.1 | 67 | >= 54 | FAIL |
-| TF32 | 366.7 | 495 | >= 444 | FAIL |
-| FP16 | 682.7 | 990 | >= 734 | FAIL |
-| BF16 | 717.3 | 990 | >= 745 | FAIL |
-| FP8 | 1173.5 | 1979 | >= 1400 | FAIL |
-| FP64 | 47.4 | 67 | >= 63 | FAIL |
-| INT8 | 100.4 | 1979 | >= 1536 | FAIL |
-
-**Verdict: FAIL** (absolute TFLOPS thresholds; worst efficiency 5.1%)
-
-### Compute Consistency
-
-| DType | Min | Mean | Max | Spread | Limit | Status |
-|-------|-----|------|-----|--------|-------|--------|
-| FP32 | 51.9 | 52.1 | 52.2 | 0.58% | <= 3% | PASS |
-| TF32 | 362.3 | 366.7 | 369.2 | 1.88% | <= 3% | PASS |
-| FP16 | 674.4 | 682.7 | 693.1 | 2.74% | <= 3% | PASS |
-| BF16 | 705.3 | 717.2 | 730.0 | 3.44% | <= 3% | FAIL |
-| FP8 | 1155.2 | 1173.5 | 1186.2 | 2.64% | <= 3% | PASS |
-| FP64 | 46.3 | 47.4 | 48.5 | 4.64% | <= 3% | FAIL |
-| INT8 | 100.4 | 100.4 | 100.4 | 0.00% | <= 3% | PASS |
-
-### Compute Per-GPU TFLOPS
-
-| GPU | FP32 | TF32 | FP16 | BF16 | FP8 | FP64 | INT8 |
-|---|---|---|---|---|---|---|---|
-| 0 | 52.2 | 362.3 | 674.4 | 714.3 | 1159.0 | 46.3 | 100.4 |
-| 1 | 51.9 | 366.5 | 674.7 | 721.4 | 1185.4 | 47.7 | 100.4 |
-| 2 | 52.2 | 367.4 | 693.1 | 730.0 | 1185.7 | 48.5 | 100.4 |
-| 3 | 52.2 | 367.8 | 682.2 | 708.2 | 1163.4 | 47.4 | 100.4 |
-| 4 | 52.0 | 366.4 | 686.9 | 714.1 | 1186.2 | 47.3 | 100.4 |
-| 5 | 52.0 | 369.2 | 679.9 | 721.1 | 1155.2 | 47.3 | 100.4 |
-| 6 | 51.9 | 365.1 | 677.7 | 705.3 | 1169.0 | 47.0 | 100.4 |
-| 7 | 52.2 | 369.0 | 692.8 | 723.5 | 1184.3 | 47.6 | 100.4 |
-
-## NVLink/NVSwitch
-
-**Overall: PASS**
-
-| GPU | Active Links | Issues |
-|-----|--------------|--------|
-| 0 | 18/18 | OK |
-| 1 | 18/18 | OK |
-| 2 | 18/18 | OK |
-| 3 | 18/18 | OK |
-| 4 | 18/18 | OK |
-| 5 | 18/18 | OK |
-| 6 | 18/18 | OK |
-| 7 | 18/18 | OK |
-
-## DCGM Diagnostic
-
-**Overall: PASS**
-
-| Subtest | Status |
-|---------|--------|
-| Deployment/software/GPU0 | PASS |
-| Deployment/software/GPU1 | PASS |
-| Deployment/software/GPU2 | PASS |
-| Deployment/software/GPU3 | PASS |
-| Deployment/software/GPU4 | PASS |
-| Deployment/software/GPU5 | PASS |
-| Deployment/software/GPU6 | PASS |
-| Deployment/software/GPU7 | PASS |
-| Deployment/software/summary | PASS |
-| Hardware/memory/GPU0 | PASS |
-| Hardware/memory/GPU1 | PASS |
-| Hardware/memory/GPU2 | PASS |
-| Hardware/memory/GPU3 | PASS |
-| Hardware/memory/GPU4 | PASS |
-| Hardware/memory/GPU5 | PASS |
-| Hardware/memory/GPU6 | PASS |
-| Hardware/memory/GPU7 | PASS |
-| Hardware/memory/summary | PASS |
-| Hardware/diagnostic/GPU0 | PASS |
-| Hardware/diagnostic/GPU1 | PASS |
-| Hardware/diagnostic/GPU2 | PASS |
-| Hardware/diagnostic/GPU3 | PASS |
-| Hardware/diagnostic/GPU4 | PASS |
-| Hardware/diagnostic/GPU5 | PASS |
-| Hardware/diagnostic/GPU6 | PASS |
-| Hardware/diagnostic/GPU7 | PASS |
-| Hardware/diagnostic/summary | PASS |
-| Hardware/nvbandwidth/GPU0 | PASS |
-| Hardware/nvbandwidth/GPU1 | PASS |
-| Hardware/nvbandwidth/GPU2 | PASS |
-| Hardware/nvbandwidth/GPU3 | PASS |
-| Hardware/nvbandwidth/GPU4 | PASS |
-| Hardware/nvbandwidth/GPU5 | PASS |
-| Hardware/nvbandwidth/GPU6 | PASS |
-| Hardware/nvbandwidth/GPU7 | PASS |
-| Hardware/nvbandwidth/summary | PASS |
-| Integration/pcie/GPU0 | PASS |
-| Integration/pcie/GPU1 | PASS |
-| Integration/pcie/GPU2 | PASS |
-| Integration/pcie/GPU3 | PASS |
-| Integration/pcie/GPU4 | PASS |
-| Integration/pcie/GPU5 | PASS |
-| Integration/pcie/GPU6 | PASS |
-| Integration/pcie/GPU7 | PASS |
-| Integration/pcie/summary | PASS |
-| Stress/targeted_stress/GPU0 | PASS |
-| Stress/targeted_stress/GPU1 | PASS |
-| Stress/targeted_stress/GPU2 | PASS |
-| Stress/targeted_stress/GPU3 | PASS |
-| Stress/targeted_stress/GPU4 | PASS |
-| Stress/targeted_stress/GPU5 | PASS |
-| Stress/targeted_stress/GPU6 | PASS |
-| Stress/targeted_stress/GPU7 | PASS |
-| Stress/targeted_stress/summary | PASS |
-| Stress/targeted_power/GPU0 | PASS |
-| Stress/targeted_power/GPU1 | PASS |
-| Stress/targeted_power/GPU2 | PASS |
-| Stress/targeted_power/GPU3 | PASS |
-| Stress/targeted_power/GPU4 | PASS |
-| Stress/targeted_power/GPU5 | PASS |
-| Stress/targeted_power/GPU6 | PASS |
-| Stress/targeted_power/GPU7 | PASS |
-| Stress/targeted_power/summary | PASS |
-
-## NCCL Multi-GPU
-
-Source: nccl-tests | GPUs: 8
-
-| Operation | Bus BW (GB/s) | Threshold | Status |
-|-----------|---------------|-----------|--------|
-| allreduce | 472.4 | >= 405 | FAIL |
-| alltoall | 344.3 | >= 315 | FAIL |
-| broadcast | 363.6 | >= 360 | FAIL |
-| reducescatter | 353.1 | >= 405 | FAIL |
-| allgather | 366.4 | >= 405 | FAIL |
-| sendrecv | 368.9 | >= 360 | FAIL |
-
-### NCCL allreduce by size
-
-| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status |
-|------|---------------------|-------|------|--------|-----------|--------|
-| 1M | 24.9, 24.4, 24.9 | 24.4 | 24.7 | 0.95% | >= 405 | FAIL |
-| 256M | 421.9, 421.1, 421.9 | 421.1 | 421.6 | 0.09% | >= 405 | PASS |
-| 2G | 472.6, 472.0, 472.5 | 472.0 | 472.4 | 0.06% | >= 405 | PASS |
-
-### NCCL alltoall by size
-
-| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status |
-|------|---------------------|-------|------|--------|-----------|--------|
-| 1M | 7.9, 7.8, 8.1 | 7.8 | 7.9 | 1.57% | >= 315 | FAIL |
-| 256M | 298.7, 312.7, 303.2 | 298.7 | 304.9 | 1.91% | >= 315 | FAIL |
-| 2G | 342.2, 345.4, 345.2 | 342.2 | 344.3 | 0.43% | >= 315 | PASS |
-
-### NCCL broadcast by size
-
-| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status |
-|------|---------------------|-------|------|--------|-----------|--------|
-| 1M | 14.5, 14.3, 14.4 | 14.3 | 14.4 | 0.57% | >= 360 | FAIL |
-| 256M | 344.1, 344.3, 344.8 | 344.1 | 344.4 | 0.09% | >= 360 | FAIL |
-| 2G | 364.0, 363.6, 363.3 | 363.3 | 363.6 | 0.08% | >= 360 | PASS |
-
-### NCCL reducescatter by size
-
-| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status |
-|------|---------------------|-------|------|--------|-----------|--------|
-| 1M | 14.0, 14.2, 14.3 | 14.0 | 14.2 | 0.88% | >= 405 | FAIL |
-| 256M | 328.8, 328.7, 328.4 | 328.4 | 328.6 | 0.05% | >= 405 | FAIL |
-| 2G | 351.9, 353.8, 353.6 | 351.9 | 353.1 | 0.24% | >= 405 | FAIL |
-
-### NCCL allgather by size
-
-| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status |
-|------|---------------------|-------|------|--------|-----------|--------|
-| 1M | 14.4, 13.9, 14.0 | 13.9 | 14.1 | 1.53% | >= 405 | FAIL |
-| 256M | 350.2, 350.4, 350.7 | 350.2 | 350.4 | 0.06% | >= 405 | FAIL |
-| 2G | 366.9, 366.4, 366.0 | 366.0 | 366.4 | 0.10% | >= 405 | FAIL |
-
-### NCCL sendrecv by size
-
-| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status |
-|------|---------------------|-------|------|--------|-----------|--------|
-| 1M | 18.4, 18.3, 18.5 | 18.3 | 18.4 | 0.44% | >= 360 | FAIL |
-| 256M | 351.1, 351.4, 351.3 | 351.1 | 351.3 | 0.04% | >= 360 | FAIL |
-| 2G | 368.9, 368.8, 368.9 | 368.8 | 368.9 | 0.01% | >= 360 | PASS |
-
-**Overall: FAIL**
-
-## Stress Test
-
-- **Source:** pytorch
-- **Duration:** 1800s (requested 1800s)
-- **Telemetry samples:** 1295
-- **Max temp:** {0: 51.0, 1: 59.0, 2: 61.0, 3: 53.0, 4: 53.0, 5: 62.0, 6: 56.0, 7: 52.0}
-- **Avg power:** {0: 698.8, 1: 697.8, 2: 698.1, 3: 697.9, 4: 697.9, 5: 698.2, 6: 698.0, 7: 697.8}
-- **Temp delta:** 11.0 C
-- **TFLOPS jitter:** 3.4%
-- **Steady TFLOPS samples:** 37874
-- **Throttle events:** 9944
-- **XID events:** 0
-- **Failure reasons:**
-  - GPU temperature delta 11.0C exceeds 5.0C
-  - non-idle throttle reasons observed in 9944 samples (first: GPU 0 0x4)
-- **Result: FAIL**
-
-## RDMA/InfiniBand
-
-### RDMA Port Checks
-
-| Device | Port | State | Rate | Required | Status |
-|--------|------|-------|------|----------|--------|
-| mlx5_0 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS |
-| mlx5_1 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS |
-| mlx5_4 | 1 | 4: ACTIVE | 100 Gb/sec (2X HDR) | >= 400Gbps ACTIVE | FAIL |
-| mlx5_5 | 1 | 4: ACTIVE | 100 Gb/sec (2X HDR) | >= 400Gbps ACTIVE | FAIL |
-| mlx5_6 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS |
-| mlx5_7 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS |
-
-| Test | Value | Threshold | Status |
-|------|-------|-----------|--------|
-| ib_write_bw | 48.6 GB/s | >= 47 GB/s | PASS |
-| ib_read_bw | 40.3 GB/s | >= 47 GB/s | FAIL |
-| ib_write_lat | 1.29 us | <= 2 us | PASS |
-| ib_read_lat | 2.59 us | <= 3.5 us | PASS |
-| ibping | local_loopback target=0x4b count=5 | 0% packet loss | PASS |
-
-- **PFC/ECN/CNP/congestion counters checked:** 146
-- **PFC/ECN/CNP/congestion non-zero:** no
-- **Failure reasons:**
-  - mlx5_4 port 1 state/rate failed (4: ACTIVE, 100 Gb/sec (2X HDR); required >= 400.0Gbps ACTIVE)
-  - mlx5_5 port 1 state/rate failed (4: ACTIVE, 100 Gb/sec (2X HDR); required >= 400.0Gbps ACTIVE)
-  - ib_read_bw bandwidth 40.29GB/s < 47GB/s
-**Overall: FAIL**
-
-## Training Simulation
-
-| Metric | Value |
-|--------|-------|
-| Model | synthetic_transformer_1.5b |
-| Params | 1470.5M |
-| Throughput | 216683 tokens/sec |
-| Avg Step Time | 75.6 ms |
-| Warmup Steps | 5 |
-| Peak Memory | 18.1 GB |
-| Final Loss | 0.0039 |
-| Step Jitter | 1.2% |
-| Distributed Mode | ddp |
-| Verdict | PASS (216683 tokens/sec) |
-
----
-*Generated by GPU Test Suite v0.2.0*
\ No newline at end of file
diff --git a/reports_test_all_latest_summary_cn_20260523.md b/reports_test_all_latest_summary_cn_20260523.md
deleted file mode 100644
index 87f4eab..0000000
--- a/reports_test_all_latest_summary_cn_20260523.md
+++ /dev/null
@@ -1,101 +0,0 @@
-# H100 单节点 test all 中文汇总
-
-生成时间：2026-05-23
-测试范围：`aikubeworker0012`、`aikubeworker0016` 单节点 `python gpu_tester.py --test all --report --format md`
-
-原始报告：
-
-- `reports_test_all_latest_aikubeworker0012_20260522_203246.md`
-- `reports_test_all_latest_aikubeworker0016_20260522_203447.md`
-
-## 总结论
-
-| 机器 | Suite | PDF 验收结论 | 主要失败项 |
-|---|---:|---|---|
-| aikubeworker0012 | 6/10 PASS | FAIL | Compute、NCCL、Stress、RDMA |
-| aikubeworker0016 | 6/10 PASS | FAIL | Compute、NCCL、Stress、RDMA |
-
-按 PDF 口径，任一必测子项 FAIL，则整机 FAIL。因此两台机器当前都不通过生产验收。
-
-## 通过项
-
-| 项目 | aikubeworker0012 | aikubeworker0016 | 说明 |
-|---|---|---|---|
-| GPU Info | PASS | PASS | 8 张 H100 |
-| Health | PASS | PASS | 温度、空闲功耗、ECC、PCIe、空闲 throttle 正常 |
-| Memory Bandwidth | PASS | PASS | D2D 效率均约 108.1% |
-| NVLink/NVSwitch | PASS | PASS | 8 卡均 18/18 links |
-| DCGM diag -r 3 | PASS | PASS | software、memory、diagnostic、nvbandwidth、pcie、targeted stress/power 全 PASS |
-| Training Simulation | PASS | PASS | 8 卡 DDP synthetic 1.5B，loss finite |
-
-Training 结果：
-
-| 机器 | Throughput | Step jitter | Peak memory | Verdict |
-|---|---:|---:|---:|---|
-| aikubeworker0012 | 216498 tokens/s | 1.89% | 18.08 GB | PASS |
-| aikubeworker0016 | 216683 tokens/s | 1.20% | 18.08 GB | PASS |
-
-## 失败项
-
-### Compute
-
-两台机器都未达到当前 H100 绝对 TFLOPS 阈值，且部分 dtype 的跨 GPU spread 超过 3%。
-
-| 机器 | 代表性失败 |
-|---|---|
-| aikubeworker0012 | FP16 spread 3.04%，BF16 spread 4.58%，FP64 spread 3.41%；FP32/TF32/FP16/BF16/FP8/FP64/INT8 绝对阈值均 FAIL |
-| aikubeworker0016 | BF16 spread 3.44%，FP64 spread 4.64%；FP32/TF32/FP16/BF16/FP8/FP64/INT8 绝对阈值均 FAIL |
-
-### NCCL
-
-NCCL 已经使用真实 `nccl-tests` bus BW，不是 torchrun fallback。失败主要来自小 size 以及部分 256M/2G op 未达阈值。
-
-| 机器 | allreduce best | alltoall best | broadcast best | reducescatter best | allgather best | sendrecv best | Verdict |
-|---|---:|---:|---:|---:|---:|---:|---|
-| aikubeworker0012 | 472.3 | 343.3 | 364.1 | 352.8 | 366.4 | 369.0 | FAIL |
-| aikubeworker0016 | 472.4 | 344.3 | 363.6 | 353.1 | 366.4 | 368.9 | FAIL |
-
-关键原因：
-
-- `1M` size 在所有 op 上都明显低于阈值。
-- `reducescatter`、`allgather` 的 2G 也低于 405 GB/s 阈值。
-- `broadcast/sendrecv` 的 256M 低于 360 GB/s 阈值。
-
-### Stress
-
-两台机器的 1800 秒 PyTorch BF16 GEMM 压力测试均跑满，但 telemetry 判定 FAIL。
-
-| 机器 | 平均稳态功耗 | 最高温度范围 | 温差 | TFLOPS jitter | throttle events | XID | Verdict |
-|---|---|---|---:|---:|---:|---:|---|
-| aikubeworker0012 | 约 697-698W/GPU | 56-68C | 12C | 4.37% | 9712 | 0 | FAIL |
-| aikubeworker0016 | 约 698W/GPU | 51-62C | 11C | 3.40% | 9944 | 0 | FAIL |
-
-失败原因：
-
-- GPU 间温差超过 5C 阈值。
-- 观测到大量非 idle throttle，首个原因是 `0x4`，即 `sw_power_cap`。
-
-### RDMA/InfiniBand
-
-本轮 `test all` 是单节点 RDMA 路径，`ibping` 显示为 `local_loopback`。这份结果不能替代跨节点 RDMA 验收，但仍反映单节点 perftest read bandwidth 未达标。
-
-| 机器 | ib_write_bw | ib_read_bw | ib_write_lat | ib_read_lat | Verdict |
-|---|---:|---:|---:|---:|---|
-| aikubeworker0012 | 49.5 GB/s PASS | 39.1 GB/s FAIL | 1.25 us PASS | 2.60 us PASS | FAIL |
-| aikubeworker0016 | 48.6 GB/s PASS | 40.3 GB/s FAIL | 1.29 us PASS | 2.59 us PASS | FAIL |
-
-另外，两台机器都有 `mlx5_4`、`mlx5_5` 处于 ACTIVE 但速率为 100 Gb/sec，低于当前 400G 端口阈值，因此 RDMA port check 也有 FAIL。
-
-## 当前阻塞
-
-1. Compute 阈值口径较严，当前实测绝对 TFLOPS 全 dtype 未达配置阈值，尤其 INT8 路径仅约 100 TFLOPS。
-2. NCCL 真实 bus BW 已可测，但多 op/size 未达 PDF 阈值。
-3. Stress 负载可跑满 30 分钟，但温差和 `sw_power_cap` throttle 导致 FAIL。
-4. 单节点 RDMA read bandwidth 未达 47 GB/s，且部分 IB 端口速率低于 400G。
-5. 跨节点 RDMA 需要继续使用单独 server/client 报告；不能把本轮 `local_loopback` 当作跨节点验收。
-
-## 状态判断
-
-脚本能力已经基本补齐到 PDF 验收口径：真实 nccl-tests、30 分钟 stress telemetry、NVLink、DCGM r3、RDMA perftest/ibping/counter、逐 GPU compute、8 卡 DDP training、最终任一 FAIL 即整机 FAIL 都已经跑通。
-
-当前剩余问题主要不是脚本缺项，而是两台机器的实际验收数据有多项未达标。
diff --git a/reports_test_all_pdf_aikubeworker0012_20260522_182656.md b/reports_test_all_pdf_aikubeworker0012_20260522_182656.md
deleted file mode 100644
index 283d875..0000000
--- a/reports_test_all_pdf_aikubeworker0012_20260522_182656.md
+++ /dev/null
@@ -1,259 +0,0 @@
-# GPU Test Report
-
-- **Date:** 2026-05-22T18:27:01.103760
-- **Host:** aikubeworker0012
-- **GPU:** NVIDIA H100 80GB HBM3 x8
-- **Driver:** 580.159.03 | **CUDA:** 13.0
-
-## Overall Acceptance Verdict
-
-**Result: FAIL**
-
-Failed or unverified items:
-- Compute Throughput: FAIL (worst FP32 52 vs >= 54)
-- DCGM: ERROR: dcgmi diag -r 3 timeout after 1200s
-- NCCL: FAIL
-- Stress Test: FAIL
-- RDMA: FAIL
-- Training: FAIL (188741 tokens/sec)
-
-## Summary
-
-| Test | Result |
-|------|--------|
-| GPU Info | PASS (8 GPUs detected) |
-| Health Check | PASS |
-| Memory Bandwidth | PASS (108.1%) |
-| Compute Throughput | FAIL (worst FP32 52 vs >= 54) |
-| NVLink/NVSwitch | PASS |
-| DCGM | ERROR: dcgmi diag -r 3 timeout after 1200s |
-| NCCL | FAIL |
-| Stress Test | FAIL |
-| RDMA | FAIL |
-| Training | FAIL (188741 tokens/sec) |
-
-## GPU Information
-
-| GPU | Model | VRAM | Temp | Power | SM Clock |
-|-----|-------|------|------|-------|----------|
-| 0 | NVIDIA H100 80GB HBM3 | 81559 MB | 25C | 70/700W | 345 MHz |
-| 1 | NVIDIA H100 80GB HBM3 | 81559 MB | 25C | 73/700W | 345 MHz |
-| 2 | NVIDIA H100 80GB HBM3 | 81559 MB | 26C | 69/700W | 345 MHz |
-| 3 | NVIDIA H100 80GB HBM3 | 81559 MB | 24C | 70/700W | 345 MHz |
-| 4 | NVIDIA H100 80GB HBM3 | 81559 MB | 24C | 69/700W | 345 MHz |
-| 5 | NVIDIA H100 80GB HBM3 | 81559 MB | 27C | 70/700W | 345 MHz |
-| 6 | NVIDIA H100 80GB HBM3 | 81559 MB | 25C | 71/700W | 345 MHz |
-| 7 | NVIDIA H100 80GB HBM3 | 81559 MB | 24C | 72/700W | 345 MHz |
-
-## Health Check
-
-**Overall: PASS**
-
-| GPU | Temp | Power | ECC | PCIe | Throttle | Status |
-|-----|------|-------|-----|------|----------|--------|
-| 0 | 25C PASS | 70W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** |
-| 1 | 25C PASS | 73W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** |
-| 2 | 26C PASS | 69W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** |
-| 3 | 24C PASS | 70W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** |
-| 4 | 24C PASS | 69W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** |
-| 5 | 27C PASS | 70W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** |
-| 6 | 25C PASS | 71W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** |
-| 7 | 24C PASS | 72W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** |
-
-## Memory Bandwidth
-
-Source: nvbandwidth
-
-| Metric | Value | Peak | Efficiency |
-|--------|-------|------|------------|
-| H2D (PCIe) | 55.5 GB/s | 64 GB/s | 86.7% |
-| D2H (PCIe) | 54.3 GB/s | 64 GB/s | 84.8% |
-| D2D (NVLink) | 486.6 GB/s | 450 GB/s | 108.1% |
-
-**Verdict: PASS** (D2D efficiency 108.1%)
-
-## Compute Throughput
-
-| DType | Achieved (TFLOPS) | Peak | Threshold | Status |
-|-------|-------------------|------|------------|--------|
-| FP32 | 52.0 | 67 | >= 54 | FAIL |
-| TF32 | 364.8 | 495 | >= 444 | FAIL |
-| FP16 | 685.0 | 990 | >= 734 | FAIL |
-| BF16 | 715.9 | 990 | >= 745 | FAIL |
-| FP8 | 1166.6 | 1979 | >= 1400 | FAIL |
-| FP64 | 46.9 | 0 | >= 63 | FAIL |
-| INT8 | 100.4 | 0 | >= 1536 | FAIL |
-
-**Verdict: FAIL** (absolute TFLOPS thresholds; worst efficiency 58.9%)
-
-### Compute Consistency
-
-| DType | Min | Mean | Max | Spread | Limit | Status |
-|-------|-----|------|-----|--------|-------|--------|
-| FP32 | 51.9 | 52.0 | 52.2 | 0.58% | <= 3% | PASS |
-| TF32 | 360.9 | 364.9 | 368.2 | 2.00% | <= 3% | PASS |
-| FP16 | 676.0 | 685.0 | 689.9 | 2.03% | <= 3% | PASS |
-| BF16 | 697.3 | 715.9 | 730.2 | 4.60% | <= 3% | FAIL |
-| FP8 | 1141.8 | 1166.6 | 1180.3 | 3.30% | <= 3% | FAIL |
-| FP64 | 45.8 | 46.9 | 47.7 | 4.05% | <= 3% | FAIL |
-| INT8 | 100.4 | 100.4 | 100.4 | 0.00% | <= 3% | PASS |
-
-### Compute Per-GPU TFLOPS
-
-| GPU | FP32 | TF32 | FP16 | BF16 | FP8 | FP64 | INT8 |
-|---|---|---|---|---|---|---|---|
-| 0 | 51.9 | 368.2 | 689.5 | 730.2 | 1180.3 | 47.1 | 100.4 |
-| 1 | 51.9 | 366.8 | 688.7 | 721.6 | 1170.1 | 47.7 | 100.4 |
-| 2 | 51.9 | 366.3 | 689.9 | 711.3 | 1167.8 | 47.2 | 100.4 |
-| 3 | 51.9 | 363.0 | 677.6 | 699.2 | 1176.3 | 46.6 | 100.4 |
-| 4 | 52.2 | 365.3 | 685.0 | 725.4 | 1163.0 | 46.8 | 100.4 |
-| 5 | 52.1 | 363.9 | 684.2 | 725.0 | 1172.1 | 46.9 | 100.4 |
-| 6 | 51.9 | 364.4 | 688.8 | 717.3 | 1161.2 | 46.9 | 100.4 |
-| 7 | 51.9 | 360.9 | 676.0 | 697.3 | 1141.8 | 45.8 | 100.4 |
-
-## NVLink/NVSwitch
-
-**Overall: PASS**
-
-| GPU | Active Links | Issues |
-|-----|--------------|--------|
-| 0 | 18/18 | OK |
-| 1 | 18/18 | OK |
-| 2 | 18/18 | OK |
-| 3 | 18/18 | OK |
-| 4 | 18/18 | OK |
-| 5 | 18/18 | OK |
-| 6 | 18/18 | OK |
-| 7 | 18/18 | OK |
-
-## DCGM Diagnostic
-
-**Overall: FAIL** (dcgmi diag -r 3 timeout after 1200s)
-
-## NCCL Multi-GPU
-
-Source: nccl-tests | GPUs: 8
-
-| Operation | Bus BW (GB/s) | Threshold | Status |
-|-----------|---------------|-----------|--------|
-| allreduce | 472.4 | >= 405 | FAIL |
-| alltoall | 344.4 | >= 315 | FAIL |
-| broadcast | 363.8 | >= 360 | FAIL |
-| reducescatter | 353.0 | >= 405 | FAIL |
-| allgather | 366.4 | >= 405 | FAIL |
-| sendrecv | 368.9 | >= 360 | FAIL |
-
-### NCCL allreduce by size
-
-| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status |
-|------|---------------------|-------|------|--------|-----------|--------|
-| 1M | 24.0, 24.9, 24.7 | 24.0 | 24.5 | 1.57% | >= 405 | FAIL |
-| 256M | 421.4, 421.7, 421.4 | 421.4 | 421.5 | 0.03% | >= 405 | PASS |
-| 2G | 471.8, 473.0, 472.3 | 471.8 | 472.4 | 0.10% | >= 405 | PASS |
-
-### NCCL alltoall by size
-
-| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status |
-|------|---------------------|-------|------|--------|-----------|--------|
-| 1M | 8.1, 8.0, 8.0 | 8.0 | 8.0 | 0.59% | >= 315 | FAIL |
-| 256M | 312.3, 310.9, 319.2 | 310.9 | 314.1 | 1.15% | >= 315 | FAIL |
-| 2G | 343.1, 346.2, 344.0 | 343.1 | 344.4 | 0.38% | >= 315 | PASS |
-
-### NCCL broadcast by size
-
-| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status |
-|------|---------------------|-------|------|--------|-----------|--------|
-| 1M | 14.6, 13.6, 14.5 | 13.6 | 14.2 | 3.16% | >= 360 | FAIL |
-| 256M | 343.8, 344.2, 344.5 | 343.8 | 344.2 | 0.08% | >= 360 | FAIL |
-| 2G | 363.5, 363.3, 364.7 | 363.3 | 363.8 | 0.17% | >= 360 | PASS |
-
-### NCCL reducescatter by size
-
-| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status |
-|------|---------------------|-------|------|--------|-----------|--------|
-| 1M | 14.1, 14.3, 14.3 | 14.1 | 14.2 | 0.66% | >= 405 | FAIL |
-| 256M | 328.1, 328.3, 328.3 | 328.1 | 328.2 | 0.03% | >= 405 | FAIL |
-| 2G | 354.0, 352.6, 352.3 | 352.3 | 353.0 | 0.21% | >= 405 | FAIL |
-
-### NCCL allgather by size
-
-| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status |
-|------|---------------------|-------|------|--------|-----------|--------|
-| 1M | 14.5, 14.5, 14.3 | 14.3 | 14.4 | 0.65% | >= 405 | FAIL |
-| 256M | 350.7, 350.7, 350.5 | 350.5 | 350.6 | 0.03% | >= 405 | FAIL |
-| 2G | 366.6, 366.3, 366.3 | 366.3 | 366.4 | 0.04% | >= 405 | FAIL |
-
-### NCCL sendrecv by size
-
-| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status |
-|------|---------------------|-------|------|--------|-----------|--------|
-| 1M | 18.5, 18.4, 18.1 | 18.1 | 18.3 | 0.93% | >= 360 | FAIL |
-| 256M | 352.3, 350.6, 350.5 | 350.5 | 351.1 | 0.24% | >= 360 | FAIL |
-| 2G | 368.8, 369.0, 368.8 | 368.8 | 368.9 | 0.03% | >= 360 | PASS |
-
-**Overall: FAIL**
-
-## Stress Test
-
-- **Source:** pytorch
-- **Duration:** 1800s (requested 1800s)
-- **Telemetry samples:** 1541
-- **Max temp:** {0: 60.0, 1: 60.0, 2: 68.0, 3: 56.0, 4: 60.0, 5: 68.0, 6: 65.0, 7: 56.0}
-- **Avg power:** {0: 697.7, 1: 697.4, 2: 697.2, 3: 697.7, 4: 697.5, 5: 698.0, 6: 697.8, 7: 698.4}
-- **Temp delta:** 12.0 C
-- **TFLOPS jitter:** 3.16%
-- **Steady TFLOPS samples:** 37676
-- **Throttle events:** 11912
-- **XID events:** 0
-- **Failure reasons:**
-  - GPU temperature delta 12.0C exceeds 5.0C
-  - non-idle throttle reasons observed in 11912 samples (first: GPU 0 0x4)
-- **Result: FAIL**
-
-## RDMA/InfiniBand
-
-### RDMA Port Checks
-
-| Device | Port | State | Rate | Required | Status |
-|--------|------|-------|------|----------|--------|
-| mlx5_0 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS |
-| mlx5_1 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS |
-| mlx5_4 | 1 | 4: ACTIVE | 100 Gb/sec (2X HDR) | >= 400Gbps ACTIVE | FAIL |
-| mlx5_5 | 1 | 4: ACTIVE | 100 Gb/sec (2X HDR) | >= 400Gbps ACTIVE | FAIL |
-| mlx5_6 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS |
-| mlx5_7 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS |
-
-| Test | Value | Threshold | Status |
-|------|-------|-----------|--------|
-| ib_write_bw | 49.2 GB/s | >= 47 GB/s | PASS |
-| ib_read_bw | 39.1 GB/s | >= 47 GB/s | FAIL |
-| ib_write_lat | 5.68 us | <= 2 us | FAIL |
-| ib_read_lat | 16.00 us | <= 3.5 us | FAIL |
-| ibping | target=0x58 count=5 | 0% packet loss | PASS |
-
-- **PFC/ECN/CNP/congestion counters checked:** 0
-- **PFC/ECN/CNP/congestion non-zero:** no
-- **Failure reasons:**
-  - mlx5_4 port 1 state/rate failed (4: ACTIVE, 100 Gb/sec (2X HDR); required >= 400.0Gbps ACTIVE)
-  - mlx5_5 port 1 state/rate failed (4: ACTIVE, 100 Gb/sec (2X HDR); required >= 400.0Gbps ACTIVE)
-  - ib_read_bw bandwidth 39.11GB/s < 47GB/s
-  - ib_write_lat latency 5.68us > 2.0us
-  - ib_read_lat latency 16.0us > 3.5us
-**Overall: FAIL**
-
-## Training Simulation
-
-| Metric | Value |
-|--------|-------|
-| Model | synthetic_transformer_1.5b |
-| Params | 1470.5M |
-| Throughput | 188741 tokens/sec |
-| Avg Step Time | 86.8 ms |
-| Peak Memory | 18.1 GB |
-| Final Loss | 0.0041 |
-| Step Jitter | 626.74% |
-| Distributed Mode | ddp |
-| Verdict | FAIL (188741 tokens/sec) |
-
----
-*Generated by GPU Test Suite v0.2.0*
\ No newline at end of file
diff --git a/reports_test_all_pdf_aikubeworker0016_20260522_182856.md b/reports_test_all_pdf_aikubeworker0016_20260522_182856.md
deleted file mode 100644
index dbee788..0000000
--- a/reports_test_all_pdf_aikubeworker0016_20260522_182856.md
+++ /dev/null
@@ -1,259 +0,0 @@
-# GPU Test Report
-
-- **Date:** 2026-05-22T18:29:01.245683
-- **Host:** aikubeworker0016
-- **GPU:** NVIDIA H100 80GB HBM3 x8
-- **Driver:** 580.159.03 | **CUDA:** 13.0
-
-## Overall Acceptance Verdict
-
-**Result: FAIL**
-
-Failed or unverified items:
-- Compute Throughput: FAIL (worst FP32 52 vs >= 54)
-- DCGM: ERROR: dcgmi diag -r 3 timeout after 1200s
-- NCCL: FAIL
-- Stress Test: FAIL
-- RDMA: FAIL
-- Training: FAIL (193836 tokens/sec)
-
-## Summary
-
-| Test | Result |
-|------|--------|
-| GPU Info | PASS (8 GPUs detected) |
-| Health Check | PASS |
-| Memory Bandwidth | PASS (108.1%) |
-| Compute Throughput | FAIL (worst FP32 52 vs >= 54) |
-| NVLink/NVSwitch | PASS |
-| DCGM | ERROR: dcgmi diag -r 3 timeout after 1200s |
-| NCCL | FAIL |
-| Stress Test | FAIL |
-| RDMA | FAIL |
-| Training | FAIL (193836 tokens/sec) |
-
-## GPU Information
-
-| GPU | Model | VRAM | Temp | Power | SM Clock |
-|-----|-------|------|------|-------|----------|
-| 0 | NVIDIA H100 80GB HBM3 | 81559 MB | 19C | 70/700W | 345 MHz |
-| 1 | NVIDIA H100 80GB HBM3 | 81559 MB | 20C | 67/700W | 345 MHz |
-| 2 | NVIDIA H100 80GB HBM3 | 81559 MB | 20C | 67/700W | 345 MHz |
-| 3 | NVIDIA H100 80GB HBM3 | 81559 MB | 19C | 67/700W | 345 MHz |
-| 4 | NVIDIA H100 80GB HBM3 | 81559 MB | 19C | 67/700W | 345 MHz |
-| 5 | NVIDIA H100 80GB HBM3 | 81559 MB | 21C | 69/700W | 345 MHz |
-| 6 | NVIDIA H100 80GB HBM3 | 81559 MB | 19C | 68/700W | 345 MHz |
-| 7 | NVIDIA H100 80GB HBM3 | 81559 MB | 19C | 66/700W | 345 MHz |
-
-## Health Check
-
-**Overall: PASS**
-
-| GPU | Temp | Power | ECC | PCIe | Throttle | Status |
-|-----|------|-------|-----|------|----------|--------|
-| 0 | 19C PASS | 70W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** |
-| 1 | 20C PASS | 67W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** |
-| 2 | 20C PASS | 67W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** |
-| 3 | 19C PASS | 67W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** |
-| 4 | 19C PASS | 67W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** |
-| 5 | 21C PASS | 69W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** |
-| 6 | 19C PASS | 68W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** |
-| 7 | 19C PASS | 66W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** |
-
-## Memory Bandwidth
-
-Source: nvbandwidth
-
-| Metric | Value | Peak | Efficiency |
-|--------|-------|------|------------|
-| H2D (PCIe) | 55.5 GB/s | 64 GB/s | 86.7% |
-| D2H (PCIe) | 54.7 GB/s | 64 GB/s | 85.5% |
-| D2D (NVLink) | 486.6 GB/s | 450 GB/s | 108.1% |
-
-**Verdict: PASS** (D2D efficiency 108.1%)
-
-## Compute Throughput
-
-| DType | Achieved (TFLOPS) | Peak | Threshold | Status |
-|-------|-------------------|------|------------|--------|
-| FP32 | 52.0 | 67 | >= 54 | FAIL |
-| TF32 | 366.2 | 495 | >= 444 | FAIL |
-| FP16 | 684.8 | 990 | >= 734 | FAIL |
-| BF16 | 720.7 | 990 | >= 745 | FAIL |
-| FP8 | 1180.3 | 1979 | >= 1400 | FAIL |
-| FP64 | 47.3 | 0 | >= 63 | FAIL |
-| INT8 | 100.5 | 0 | >= 1536 | FAIL |
-
-**Verdict: FAIL** (absolute TFLOPS thresholds; worst efficiency 59.6%)
-
-### Compute Consistency
-
-| DType | Min | Mean | Max | Spread | Limit | Status |
-|-------|-----|------|-----|--------|-------|--------|
-| FP32 | 51.9 | 52.0 | 52.2 | 0.58% | <= 3% | PASS |
-| TF32 | 361.1 | 366.2 | 368.9 | 2.13% | <= 3% | PASS |
-| FP16 | 672.6 | 684.8 | 695.0 | 3.27% | <= 3% | FAIL |
-| BF16 | 703.6 | 720.7 | 734.2 | 4.25% | <= 3% | FAIL |
-| FP8 | 1158.6 | 1180.3 | 1241.8 | 7.05% | <= 3% | FAIL |
-| FP64 | 46.7 | 47.3 | 48.0 | 2.75% | <= 3% | PASS |
-| INT8 | 100.4 | 100.5 | 101.1 | 0.70% | <= 3% | PASS |
-
-### Compute Per-GPU TFLOPS
-
-| GPU | FP32 | TF32 | FP16 | BF16 | FP8 | FP64 | INT8 |
-|---|---|---|---|---|---|---|---|
-| 0 | 51.9 | 361.1 | 673.3 | 703.6 | 1158.6 | 46.7 | 100.4 |
-| 1 | 52.0 | 367.0 | 684.0 | 725.7 | 1184.3 | 47.3 | 100.4 |
-| 2 | 52.2 | 368.7 | 695.0 | 734.2 | 1197.7 | 48.0 | 100.4 |
-| 3 | 51.9 | 367.8 | 688.0 | 708.1 | 1174.8 | 47.3 | 100.4 |
-| 4 | 52.0 | 365.2 | 688.4 | 718.2 | 1160.5 | 47.0 | 101.1 |
-| 5 | 52.1 | 368.9 | 684.2 | 733.7 | 1160.5 | 47.3 | 100.4 |
-| 6 | 51.9 | 364.0 | 672.6 | 715.6 | 1164.4 | 47.1 | 100.4 |
-| 7 | 51.9 | 367.0 | 692.5 | 726.5 | 1241.8 | 47.6 | 100.4 |
-
-## NVLink/NVSwitch
-
-**Overall: PASS**
-
-| GPU | Active Links | Issues |
-|-----|--------------|--------|
-| 0 | 18/18 | OK |
-| 1 | 18/18 | OK |
-| 2 | 18/18 | OK |
-| 3 | 18/18 | OK |
-| 4 | 18/18 | OK |
-| 5 | 18/18 | OK |
-| 6 | 18/18 | OK |
-| 7 | 18/18 | OK |
-
-## DCGM Diagnostic
-
-**Overall: FAIL** (dcgmi diag -r 3 timeout after 1200s)
-
-## NCCL Multi-GPU
-
-Source: nccl-tests | GPUs: 8
-
-| Operation | Bus BW (GB/s) | Threshold | Status |
-|-----------|---------------|-----------|--------|
-| allreduce | 472.5 | >= 405 | FAIL |
-| alltoall | 344.2 | >= 315 | FAIL |
-| broadcast | 363.8 | >= 360 | FAIL |
-| reducescatter | 352.5 | >= 405 | FAIL |
-| allgather | 366.8 | >= 405 | FAIL |
-| sendrecv | 369.0 | >= 360 | FAIL |
-
-### NCCL allreduce by size
-
-| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status |
-|------|---------------------|-------|------|--------|-----------|--------|
-| 1M | 24.7, 24.1, 24.5 | 24.1 | 24.4 | 1.02% | >= 405 | FAIL |
-| 256M | 421.8, 422.1, 421.4 | 421.4 | 421.8 | 0.07% | >= 405 | PASS |
-| 2G | 472.8, 472.2, 472.6 | 472.2 | 472.5 | 0.05% | >= 405 | PASS |
-
-### NCCL alltoall by size
-
-| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status |
-|------|---------------------|-------|------|--------|-----------|--------|
-| 1M | 8.0, 8.0, 7.9 | 7.9 | 8.0 | 0.59% | >= 315 | FAIL |
-| 256M | 326.8, 315.4, 315.8 | 315.4 | 319.3 | 1.65% | >= 315 | PASS |
-| 2G | 344.2, 343.8, 344.6 | 343.8 | 344.2 | 0.09% | >= 315 | PASS |
-
-### NCCL broadcast by size
-
-| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status |
-|------|---------------------|-------|------|--------|-----------|--------|
-| 1M | 14.4, 14.2, 14.1 | 14.1 | 14.2 | 0.88% | >= 360 | FAIL |
-| 256M | 345.3, 344.9, 344.4 | 344.4 | 344.9 | 0.11% | >= 360 | FAIL |
-| 2G | 363.6, 363.9, 363.8 | 363.6 | 363.8 | 0.03% | >= 360 | PASS |
-
-### NCCL reducescatter by size
-
-| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status |
-|------|---------------------|-------|------|--------|-----------|--------|
-| 1M | 14.3, 14.1, 14.1 | 14.1 | 14.2 | 0.67% | >= 405 | FAIL |
-| 256M | 328.2, 328.3, 328.4 | 328.2 | 328.3 | 0.02% | >= 405 | FAIL |
-| 2G | 352.2, 352.7, 352.6 | 352.2 | 352.5 | 0.06% | >= 405 | FAIL |
-
-### NCCL allgather by size
-
-| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status |
-|------|---------------------|-------|------|--------|-----------|--------|
-| 1M | 14.2, 14.5, 14.3 | 14.2 | 14.3 | 0.87% | >= 405 | FAIL |
-| 256M | 350.6, 350.6, 350.5 | 350.5 | 350.6 | 0.01% | >= 405 | FAIL |
-| 2G | 367.0, 366.8, 366.5 | 366.5 | 366.8 | 0.06% | >= 405 | FAIL |
-
-### NCCL sendrecv by size
-
-| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status |
-|------|---------------------|-------|------|--------|-----------|--------|
-| 1M | 18.4, 18.2, 18.6 | 18.2 | 18.4 | 0.89% | >= 360 | FAIL |
-| 256M | 350.7, 350.8, 351.1 | 350.7 | 350.9 | 0.05% | >= 360 | FAIL |
-| 2G | 369.0, 369.0, 368.9 | 368.9 | 369.0 | 0.01% | >= 360 | PASS |
-
-**Overall: FAIL**
-
-## Stress Test
-
-- **Source:** pytorch
-- **Duration:** 1800s (requested 1800s)
-- **Telemetry samples:** 1541
-- **Max temp:** {0: 51.0, 1: 59.0, 2: 62.0, 3: 53.0, 4: 53.0, 5: 62.0, 6: 57.0, 7: 53.0}
-- **Avg power:** {0: 698.7, 1: 698.0, 2: 698.1, 3: 697.9, 4: 697.7, 5: 698.2, 6: 698.0, 7: 697.7}
-- **Temp delta:** 11.0 C
-- **TFLOPS jitter:** 3.05%
-- **Steady TFLOPS samples:** 37841
-- **Throttle events:** 11912
-- **XID events:** 0
-- **Failure reasons:**
-  - GPU temperature delta 11.0C exceeds 5.0C
-  - non-idle throttle reasons observed in 11912 samples (first: GPU 0 0x4)
-- **Result: FAIL**
-
-## RDMA/InfiniBand
-
-### RDMA Port Checks
-
-| Device | Port | State | Rate | Required | Status |
-|--------|------|-------|------|----------|--------|
-| mlx5_0 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS |
-| mlx5_1 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS |
-| mlx5_4 | 1 | 4: ACTIVE | 100 Gb/sec (2X HDR) | >= 400Gbps ACTIVE | FAIL |
-| mlx5_5 | 1 | 4: ACTIVE | 100 Gb/sec (2X HDR) | >= 400Gbps ACTIVE | FAIL |
-| mlx5_6 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS |
-| mlx5_7 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS |
-
-| Test | Value | Threshold | Status |
-|------|-------|-----------|--------|
-| ib_write_bw | 48.4 GB/s | >= 47 GB/s | PASS |
-| ib_read_bw | 40.3 GB/s | >= 47 GB/s | FAIL |
-| ib_write_lat | 2.44 us | <= 2 us | FAIL |
-| ib_read_lat | 16.00 us | <= 3.5 us | FAIL |
-| ibping | target=0x4b count=5 | 0% packet loss | PASS |
-
-- **PFC/ECN/CNP/congestion counters checked:** 0
-- **PFC/ECN/CNP/congestion non-zero:** no
-- **Failure reasons:**
-  - mlx5_4 port 1 state/rate failed (4: ACTIVE, 100 Gb/sec (2X HDR); required >= 400.0Gbps ACTIVE)
-  - mlx5_5 port 1 state/rate failed (4: ACTIVE, 100 Gb/sec (2X HDR); required >= 400.0Gbps ACTIVE)
-  - ib_read_bw bandwidth 40.29GB/s < 47GB/s
-  - ib_write_lat latency 2.44us > 2.0us
-  - ib_read_lat latency 16.0us > 3.5us
-**Overall: FAIL**
-
-## Training Simulation
-
-| Metric | Value |
-|--------|-------|
-| Model | synthetic_transformer_1.5b |
-| Params | 1470.5M |
-| Throughput | 193836 tokens/sec |
-| Avg Step Time | 84.5 ms |
-| Peak Memory | 18.1 GB |
-| Final Loss | 0.004 |
-| Step Jitter | 521.24% |
-| Distributed Mode | ddp |
-| Verdict | FAIL (193836 tokens/sec) |
-
----
-*Generated by GPU Test Suite v0.2.0*
\ No newline at end of file
diff --git a/reports_training_warmup_aikubeworker0012_20260522_194528.md b/reports_training_warmup_aikubeworker0012_20260522_194528.md
deleted file mode 100644
index 948e866..0000000
--- a/reports_training_warmup_aikubeworker0012_20260522_194528.md
+++ /dev/null
@@ -1,43 +0,0 @@
-# GPU Test Report
-
-- **Date:** 2026-05-22T19:46:07.450315
-- **Host:** aikubeworker0012
-
-## Overall Acceptance Verdict
-
-**Result: FAIL**
-
-Missing required evidence:
-- GPU Info
-- Health Check
-- Memory Bandwidth
-- Compute Throughput
-- NVLink/NVSwitch
-- NCCL
-- Stress Test
-- RDMA
-- DCGM
-
-## Summary
-
-| Test | Result |
-|------|--------|
-| Training | PASS (216654 tokens/sec) |
-
-## Training Simulation
-
-| Metric | Value |
-|--------|-------|
-| Model | synthetic_transformer_1.5b |
-| Params | 1470.5M |
-| Throughput | 216654 tokens/sec |
-| Avg Step Time | 75.6 ms |
-| Warmup Steps | 5 |
-| Peak Memory | 18.1 GB |
-| Final Loss | 0.0039 |
-| Step Jitter | 0.87% |
-| Distributed Mode | ddp |
-| Verdict | PASS (216654 tokens/sec) |
-
----
-*Generated by GPU Test Suite v0.2.0*
\ No newline at end of file
diff --git a/reports_training_warmup_aikubeworker0016_20260522_194609.md b/reports_training_warmup_aikubeworker0016_20260522_194609.md
deleted file mode 100644
index 61570ca..0000000
--- a/reports_training_warmup_aikubeworker0016_20260522_194609.md
+++ /dev/null
@@ -1,43 +0,0 @@
-# GPU Test Report
-
-- **Date:** 2026-05-22T19:46:48.023650
-- **Host:** aikubeworker0016
-
-## Overall Acceptance Verdict
-
-**Result: FAIL**
-
-Missing required evidence:
-- GPU Info
-- Health Check
-- Memory Bandwidth
-- Compute Throughput
-- NVLink/NVSwitch
-- NCCL
-- Stress Test
-- RDMA
-- DCGM
-
-## Summary
-
-| Test | Result |
-|------|--------|
-| Training | PASS (217236 tokens/sec) |
-
-## Training Simulation
-
-| Metric | Value |
-|--------|-------|
-| Model | synthetic_transformer_1.5b |
-| Params | 1470.5M |
-| Throughput | 217236 tokens/sec |
-| Avg Step Time | 75.4 ms |
-| Warmup Steps | 5 |
-| Peak Memory | 18.1 GB |
-| Final Loss | 0.0039 |
-| Step Jitter | 1.23% |
-| Distributed Mode | ddp |
-| Verdict | PASS (217236 tokens/sec) |
-
----
-*Generated by GPU Test Suite v0.2.0*
\ No newline at end of file
-- 
2.47.2


From 017c981062a7d034706d8fb93b6f9ca9eaa04795 Mon Sep 17 00:00:00 2001
From: cs <shi.chen@robotics.cc>
Date: Tue, 26 May 2026 00:44:56 +0800
Subject: [PATCH 41/41] Remove remaining report docs from PR

---
 H100_test_all_vs_PDF_覆盖对比.md              |  85 ---------------
 H100验收_vs_test_all_差距分析.md              | 100 ------------------
 ...all_aikubeworker0016_中文结果与验收差距.md |  73 -------------
 3 files changed, 258 deletions(-)
 delete mode 100644 H100_test_all_vs_PDF_覆盖对比.md
 delete mode 100644 H100验收_vs_test_all_差距分析.md
 delete mode 100644 test_all_aikubeworker0016_中文结果与验收差距.md

diff --git a/H100_test_all_vs_PDF_覆盖对比.md b/H100_test_all_vs_PDF_覆盖对比.md
deleted file mode 100644
index f6d112a..0000000
--- a/H100_test_all_vs_PDF_覆盖对比.md
+++ /dev/null
@@ -1,85 +0,0 @@
-# H100 PDF 验收项 vs 当前 `test all` 覆盖对比
-
-对比对象：
-
-- PDF：`/Users/d-robotics/Downloads/H100_production_acceptance.pdf`
-- 当前脚本：`python gpu_tester.py --config configs/default.yaml --test all --report --format md`
-- 范围：单节点 8 卡 H100。跨节点 NCCL/RDMA 暂不纳入本轮。
-
-## 结论
-
-当前 `test all` 已经从“功能巡检”扩成了“接近生产验收”的单节点套件：GPU 健康、NVLink/NVSwitch、HBM/PCIe/NVLink 带宽、计算、NCCL、压力、RDMA 本机端口、DCGM、训练模拟都会进入同一个 all。
-
-最新 stress smoke 已确认 PyTorch BF16 GEMM 压力能把两台机器压到 PDF 要求的功耗区间：
-
-- `aikubeworker0012`：45 秒 smoke，稳态平均功耗约 `697-698W/卡`，TFLOPS jitter `4.07%`，XID `0`，但温差 `12C`、`clocks_throttle_reasons.active=0x4`，按 PDF 严格 FAIL。
-- `aikubeworker0016`：45 秒 smoke，稳态平均功耗约 `697-699W/卡`，TFLOPS jitter `3.77%`，XID `0`，但温差 `8C`、`clocks_throttle_reasons.active=0x4`，按 PDF 严格 FAIL。
-
-也就是说，当前卡点已经不是“脚本压不满 H100”，而是机器在满功耗压力下没有满足 PDF 的 `温差 <=5C` 和 `Throttle Reasons 全程 0x0` 两个严格门槛。
-
-但如果严格按 PDF 做最终验收，现在还差这些：
-
-1. 24 小时类指标未覆盖：PDF 要求 SBE 24h 增长率、长稳态观察；当前 `all` 是单次快照 + 30 分钟压力，不等于 24 小时老化。
-2. 跨节点项目本轮故意不测：PDF 的 IB/RDMA 生产验收通常要双端 `ib_write_bw/read_bw/lat`、`ibping`；当前按你的要求先做单节点，跨节点未纳入。
-3. PFC/ECN/AER 的覆盖依赖机器暴露的系统计数器：脚本会读能找到的 sysfs 计数器和 dmesg，但如果交换机侧 PFC/ECN 不在主机暴露，仍需要网络侧补证据。
-4. NCCL 1MB 档会被严格阈值打失败：实测 1MB AllReduce bus BW 约 23 GB/s，而 256MB AllReduce 已通过 `nccl-tests` 验证，约 421 GB/s；如果 PDF 要求 1MB 也达到 405 GB/s，这项不是“没测”，而是会被判 FAIL。
-5. Stress 已能达到功耗和 jitter 要求，但短测已经暴露温差和 throttle strict FAIL；完整 1800 秒只会给出更正式的证据，不会自动改变这个判据。
-
-## 覆盖表
-
-| PDF 验收项 | 当前 `test all` 状态 | 还少什么 |
-|---|---:|---|
-| GPU 基本信息、Driver/CUDA | 已覆盖 | 无；会记录 driver、CUDA、GPU 型号 |
-| 温度阈值：稳态 ≤75C、峰值 ≤85C | 已覆盖健康快照；压力项覆盖 ≤80C | 24h 稳态曲线不在一次 all 内 |
-| idle power ≤100W/card | 部分覆盖 | 当前 health 会采功耗，但 idle 判据还不是独立验收项 |
-| stress power ≥630W/card | 已覆盖；短测两台约 697-699W/卡 | 完整 1800 秒仍待跑 |
-| throttle reasons active=0x0 | 已覆盖；短测两台出现 0x4 | 按 PDF 严格判 FAIL；不是脚本跳过项 |
-| DBE/SBE/retired pages | 部分覆盖 | retired pages 和内核错误已查；SBE 24h 增长率未覆盖 |
-| PCIe Gen5 x16 | 部分覆盖 | GPU 信息/拓扑可见；Replay/AER 依赖 dmesg/sysfs，可能还需额外主板侧证据 |
-| Fabric Manager active 且无 ERROR | 已覆盖 | 无；health 会查 systemd 和 journal |
-| NVLink：18 links/GPU、25GB/s/link、错误为 0 | 已覆盖 | 无；新增 `nvlink` 项 |
-| D2D/H2D/D2H 带宽 | 已覆盖 | 依赖 `nvbandwidth`，两台已具备 |
-| 8x8 P2P matrix off-diagonal mean/min/deviation | 已覆盖 | 无；由 nvbandwidth JSON 解析 |
-| Compute FP32/TF32/FP16/BF16/FP8/FP64/INT8 | 已覆盖 | INT8 为 PyTorch `_int_mm` 路径，若要供应商标准 INT8 kernel 需再换实现 |
-| NCCL AllReduce/AllGather/ReduceScatter/Broadcast/SendRecv/AllToAll | 已覆盖 | 无；`nccl-tests` 已在两台编好 |
-| NCCL 1MB/256MB/2GB，repeat 3，stddev ≤3% | 已覆盖 | 严格按 PDF 阈值时 1MB 档大概率 FAIL；256MB AllReduce 两台 `nccl-tests` 实测约 421GB/s |
-| Stress ≥30min，BF16/FP16 GEMM 8192，1s telemetry | 已覆盖；默认 BF16 GEMM `24576`，1s telemetry，warmup 后稳态判定 | 完整 1800 秒待执行；短测已暴露温差/throttle FAIL |
-| DCGM `dcgmi diag -r 3` | 已覆盖；DCGM 4.5.3 已安装，服务已启用 | 两台完整 `-r 3` 已 PASS；日志见 `/root/test_gpu_scripts/reports/dcgm_r3_*_20260522_17010*.log` |
-| RDMA 端口 ACTIVE、400Gbps | 部分覆盖 | 单节点可查端口；严格双端吞吐/时延本轮不跑 |
-| RDMA write/read bw ≥47GB/s、latency ≤2/3.5us | 部分覆盖 | 单机 localhost/perftest 不等价跨节点线速验收 |
-| PFC/ECN errors=0、ibping 双向 OK | 部分覆盖 | 主机能读到的计数器会查；交换机侧/跨节点 ibping 未覆盖 |
-| 1.5B synthetic Transformer BF16，8 卡，≥45k tokens/s | 已覆盖 DDP 路径 | 8 进程 DDP smoke 已通过；完整 50 step 长跑待执行 |
-| 任一子项 FAIL 则总体验收 FAIL | 已覆盖 | `all` 现在会按 strict verdict 退出非 0 |
-
-## 如果现在直接跑 `all`
-
-推荐命令：
-
-```bash
-cd /root/test_gpu_scripts
-/root/gpu-test-venv/bin/python gpu_tester.py --config configs/default.yaml --test all --report --format json --output reports/h100_all_$(hostname)_$(date +%Y%m%d_%H%M%S).json
-```
-
-如果要直接生成中文 Markdown 报告，用这个：
-
-```bash
-cd /root/test_gpu_scripts
-/root/gpu-test-venv/bin/python gpu_tester.py --config configs/default.yaml --test all --report --format md --output reports/h100_all_$(hostname)_$(date +%Y%m%d_%H%M%S).md
-```
-
-预计行为：
-
-- 会跑完整单节点项目，压力默认 1800 秒，默认使用 PyTorch BF16 GEMM 压力并采 1 秒 telemetry/XID。
-- stress 默认矩阵为 `24576`，用于把 H100 压到 ≥630W/卡；PDF 只要求 `matrix_size >=8192`，这里是为了满足功耗门槛。
-- NCCL 会跑 6 个 op × 3 个 message size × 3 次 repeat。
-- DCGM 会跑 `dcgmi diag -r 3 -n gpu:8 -j`；DCGM 工具链已安装并启动，`diag -r 1` 与两台独立 `r3` 长跑均已 PASS。
-- NCCL 1MB 档按 405GB/s 阈值也会失败；256MB AllReduce 已验证走 `nccl-tests`，两台约 421GB/s。
-- stress 按 PDF 严格口径预计会 FAIL：当前短测证据显示温差超过 5C，且 throttle active 出现 `0x4`。
-- 跨节点 RDMA/NCCL 不在这次单节点 all 里。
-
-## 当前最小补齐清单
-
-1. 如果要严格 RDMA 生产验收，下一轮用两台机器做 server/client 双端测试。
-2. 执行完整 1.5B DDP 50 step 训练验收并归档 tokens/s、jitter、显存和 loss。
-3. 执行完整 1800 秒 stress 并归档 1 秒 telemetry、XID、throttle、功耗和温度；当前预期会因温差/throttle FAIL。
-4. 如果要 24 小时验收，增加一个 24h monitor 模式，记录 SBE 增长率、XID、温度、功耗、降频曲线。
diff --git a/H100验收_vs_test_all_差距分析.md b/H100验收_vs_test_all_差距分析.md
deleted file mode 100644
index 5599d0c..0000000
--- a/H100验收_vs_test_all_差距分析.md
+++ /dev/null
@@ -1,100 +0,0 @@
-# H100 生产验收标准 vs 当前 `gpu_tester.py --test all` 覆盖差距
-
-对比文件：`/Users/d-robotics/Downloads/H100_production_acceptance.pdf`
-
-对比对象：当前仓库执行 `python gpu_tester.py --test all --report --format md/json`
-
-## 结论
-
-当前仓库的 `test all` 能覆盖验收文档里的大类框架，但还不是完整的 H100 生产验收。
-
-它会跑 8 个模块：
-
-1. GPU Information
-2. Health Check
-3. Memory Benchmark
-4. Compute Benchmark
-5. NCCL Test
-6. GPU Stress Test
-7. RDMA/IB Test
-8. Training Simulation
-
-但是按照 PDF 的生产验收标准，仍缺少这些关键项：
-
-- NVLink 每卡 18 条链路的 active/速率/错误计数逐项验收
-- DCGM `dcgmi diag -r 3`
-- 30-60 分钟 burn-in 和 1 秒级温度/功耗/throttle/XID 采样
-- NCCL 官方 `nccl-tests` 的性能验收，包括 1MB/256MB/2GB 三个消息大小、重复 3 次取最差值、标准差
-- RDMA 生产口径：4MB 带宽、8B 延迟、PFC/ECN 错误、ibping 双向
-- 8 卡逐卡 compute 一致性，要求同 dtype 极差/均值 <= 3%
-- FP64、INT8 计算项
-- 训练项应为 8 卡 1.5B synthetic Transformer，并按 45k tokens/s、step 抖动、显存、loss 健康度验收
-
-## 覆盖矩阵
-
-| PDF 验收项 | `test all` 是否覆盖 | 当前覆盖程度 | 主要缺口 |
-| --- | --- | --- | --- |
-| 1. 健康检查 | 部分覆盖 | 温度、功耗、ECC、PCIe、时钟、throttle、persistence、IB 设备 | idle 功耗 <=100W 未单独判定；stress 功耗 >=630W 未判定；retired pages 未查；24h SBE 增长率未查；AER/Replay errors 未查；fabricmanager 服务和 ERROR 日志未查 |
-| 2. NVLink 拓扑与链路 | 部分覆盖 | GPU info 会保存 `nvidia-smi topo -m` | 未跑 `nvidia-smi nvlink -s/-c/-e`；未验证每卡 18 条 NVLink；未验证每条 25GB/s；未验证 CRC/Replay/Recovery error = 0 |
-| 3. Memory Bandwidth | 部分覆盖 | 会用 nvbandwidth 测 H2D、D2H、D2D write/read/bidir | 未输出完整 8x8 P2P 矩阵；未验非对角均值 >=360GB/s、最小值 >=320GB/s、相对均值偏差 <=±5%；D2D 口径和 PDF 的单卡/P2P 验收口径还没完全对齐 |
-| 4. Compute Throughput | 大部分覆盖 | 默认配置已是 matrix_size=8192、warmup=50、iterations=500、use_compile=true；H100 绝对 TFLOPS 阈值在 `gpu_specs.py` 里有 | 目前测试结果是整体/单进程口径，未真正逐 GPU 分别测出 8 卡极差/均值；未测 FP64、INT8 |
-| 5. NCCL Multi-GPU | 部分覆盖，依赖工具 | 代码支持 nccl-tests；若缺 binary 会 fallback torchrun 功能连通性 | 当前远端没装好 nccl-tests，实际会退化成功能测试且失败/无性能数据；默认只启 allreduce/alltoall/broadcast，未启 allgather/reducescatter/sendrecv；消息大小不是 1MB/256MB/2GB 三点；未重复 3 次取 worst；未统计标准差 |
-| 6. Stress/Burn-in | 部分覆盖 | 会跑 stress，默认 60 秒；无 gpu-burn 时用 PyTorch fallback | PDF 要 >=30min，推荐 60min；要 FP16/BF16 大 GEMM matrix >=8192；要每分钟 TFLOPS 抖动、温度 <=80、卡间温差 <=5、功耗 >=630W、throttle=0、XID=0；当前 PyTorch fallback 只分配约 64MB/卡，压力不够 |
-| 7. DCGM 诊断 | 未覆盖 | 无 | 没有执行 `dcgmi diag -r 3`，也没有解析 Software/Deployment/Hardware/Integration/Stress/Power 子项 |
-| 8. RDMA/IB | 部分覆盖 | 会发现 IB 设备，跑 ib_write_bw/read_bw/write_lat/read_lat | 当前脚本用 `localhost`，不是跨节点；msg_size 是 64KB，不是 4MB；latency 没指定 8B；阈值是 50GB/s 和 10us，不是 PDF 的 write/read >=47GB/s、write_lat <=2us、read_lat <=3.5us；未查 PFC/ECN、ibping 双向 |
-| 9. Training Simulation | 部分覆盖 | 会跑 GPT-2 或 synthetic transformer，输出 tokens/s、step time、显存、loss | 当前 synthetic 是约 1.47B 参数但实际单进程 `.cuda()`，不是 8 卡分布式训练；未按 45k tokens/s、step 抖动 <=±3%、peak <=70GB/卡、NaN/Inf 做硬判定 |
-| 10. 总体 Verdict | 部分覆盖 | report 有 summary | 当前 `all` 的 pass/fail 逻辑偏“模块是否报错”，不是 PDF 的任一子项 FAIL 即整机禁上生产 |
-
-## 如果现在直接执行 `test all`，能得到什么
-
-会得到一份“单节点综合体检/基准测试报告”，包含：
-
-- 8 张 H100 的基础信息、驱动/CUDA、PCIe、显存、温度、功耗
-- 健康检查结果
-- nvbandwidth 的 H2D/D2H/D2D 汇总带宽
-- FP32/TF32/FP16/BF16/FP8 计算吞吐
-- NCCL 测试结果，如果 nccl-tests 缺失会退化到 torchrun fallback
-- 60 秒 stress 结果
-- 本机 localhost RDMA/IB 结果
-- 训练模拟结果
-
-这份报告能作为“快速冒烟 + 单机初筛”，不能直接作为 PDF 标准下的“生产验收合格报告”。
-
-## 当前两台机器执行前置状态
-
-已经确认：
-
-- `nvbandwidth` 已装好并能被项目脚本调用
-- PyTorch CUDA 环境已装好
-- RDMA perftest 工具已存在
-- `nccl-tests` 和 `gpu-burn` 目前没有按 PDF 生产验收口径准备好
-
-另外，我刚才误触发的 `test all`：
-
-- `aikubeworker0016` 已经在跑单节点 `test all`，当前到 Training Simulation
-- `aikubeworker0012` 没有成功启动
-
-## 要补齐到 PDF 验收口径，需要加的最小清单
-
-1. 安装/修复 `nccl-tests`，确保真正输出 bus BW，而不是 torchrun fallback。
-2. 安装/修复 `gpu-burn`，或把 PyTorch stress 改成真正高占用 FP16/BF16 GEMM，并支持 30/60 分钟。
-3. 增加 NVLink 专项：`nvidia-smi nvlink -s/-c/-e`，按 18 条/卡、25GB/s、error=0 判定。
-4. 增加 DCGM 专项：`dcgmi diag -r 3`，解析子项 PASS/FAIL。
-5. 增加 telemetry 采样：stress 期间每 1 秒采温度、功耗、throttle、XID；计算稳态功耗、温差、抖动。
-6. 修改 RDMA：支持指定 server/client、4MB 带宽、8B 延迟、双向 ibping、PFC/ECN 计数。
-7. 修改 NCCL 配置：全 op 开启，按 1MB/256MB/2GB 三个 size，重复 3 次取最差值和标准差。
-8. 修改 Compute：逐 GPU 分别跑，计算同 dtype 极差/均值；增加 FP64、INT8。
-9. 修改 Training Simulation：明确 8 卡 1.5B synthetic 分布式训练，加入 tokens/s、step 抖动、显存、loss NaN/Inf 的 PASS/FAIL。
-10. 修改最终 verdict：按 PDF 规则，任一子项 FAIL 就整机不通过。
-
-## 建议执行策略
-
-现在直接跑：
-
-```bash
-/root/gpu-test-venv/bin/python gpu_tester.py --test all --report --format md --output reports_all/test_all.md
-```
-
-得到的是“当前仓库 all 覆盖范围报告”。
-
-要拿来做生产验收，需要先补齐上面的缺口，尤其是 `nccl-tests`、`gpu-burn`、NVLink、DCGM、长时间 burn-in、跨节点 RDMA。
diff --git a/test_all_aikubeworker0016_中文结果与验收差距.md b/test_all_aikubeworker0016_中文结果与验收差距.md
deleted file mode 100644
index d05e25a..0000000
--- a/test_all_aikubeworker0016_中文结果与验收差距.md
+++ /dev/null
@@ -1,73 +0,0 @@
-# aikubeworker0016 `test all` 中文结果与 H100 验收差距
-
-测试命令：
-
-```bash
-/root/gpu-test-venv/bin/python gpu_tester.py --test all --report --format json --output reports_all/test_all.json
-```
-
-测试机器：`aikubeworker0016 / 172.72.8.16`
-
-原始结果：`reports_all_aikubeworker0016.json`
-
-## 先说结论
-
-项目输出里最后显示 `Suite complete: 8/8 tests passed`，但这个结论不能直接当成生产验收 PASS。
-
-原因是当前 `all` 的汇总逻辑主要看模块有没有抛 `error`，没有把 `nccl.passed=false` 和 `rdma.passed=false` 当成整套失败。因此按 PDF 的生产验收口径，这台机器目前不能算完整验收通过。
-
-## 本次 `test all` 实际结果
-
-| 模块 | 当前结果 | 关键数据 | 按 PDF 验收看 |
-| --- | --- | --- | --- |
-| GPU 信息 | 已覆盖 | 8 张 H100，Driver 580.159.03，CUDA 13.0 | 基础信息 OK，但 NVLink 链路专项不足 |
-| 健康检查 | PASS | health.passed=true | 基础健康 OK，但缺 retired pages、AER/Replay、fabricmanager 日志、stress 期间采样 |
-| Memory | 有结果 | H2D 55.5 GB/s，D2H 55.3 GB/s，D2D 486.5 GB/s | 单项看起来不错，但缺 8x8 P2P 矩阵验收 |
-| Compute | 有结果 | FP32 51.9，TF32 357.0，FP16 664.0，BF16 700.1，FP8 1116.2 TFLOPS | 对 PDF 绝对门槛不全通过 |
-| NCCL | 实际不合格 | source=torchrun_fallback，`nccl.passed=false`，无 bus BW 性能数据 | 不满足 PDF NCCL 性能验收 |
-| Stress | PASS | PyTorch fallback，60 秒，8 GPU 状态 PASS | 不满足 PDF 的 30/60 分钟 burn-in；负载只有约 64MB/卡，压力明显不够 |
-| RDMA/IB | 实际不合格 | ib_write_bw/read_bw 0.13 GB/s WARN；write_lat 4.10us PASS；read_lat 16us WARN | 当前是 localhost 单节点口径，不满足 PDF RDMA 生产验收 |
-| Training | 有结果 | synthetic 1.47B，52471 tokens/s，peak 27.31GB，loss 0.0041 | tokens/s 过线，但代码实际不是 8 卡分布式训练验收 |
-
-## Compute 对 PDF 门槛的判断
-
-PDF H100 PASS 门槛：
-
-| DType | 本次结果 | PDF PASS 门槛 | 判断 |
-| --- | ---: | ---: | --- |
-| FP32 | 51.9 TFLOPS | >= 54 | WARN |
-| TF32 | 357.0 TFLOPS | >= 444 | FAIL |
-| FP16 | 664.0 TFLOPS | >= 734 | WARN |
-| BF16 | 700.1 TFLOPS | >= 745 | WARN |
-| FP8 | 1116.2 TFLOPS | >= 1400 | FAIL |
-| FP64 | 未测 | >= 63 | 缺失 |
-| INT8 | 未测 | >= 1536 | 缺失 |
-
-说明：PDF 里 WARN 区间是 PASS 门槛的 90%-100%。TF32 和 FP8 低于 90% 门槛，所以按 PDF 是 FAIL。
-
-## 如果只执行当前仓库 `test all`，少了什么
-
-1. 少 NVLink 专项验收：没有逐卡检查 18 条链路、25GB/s 速率、CRC/Replay/Recovery error = 0。
-2. 少 DCGM 诊断：没有 `dcgmi diag -r 3`。
-3. 少长时间 burn-in：当前是 60 秒，不是 30/60 分钟。
-4. 少 stress 期间 1 秒级采样：温度、功耗、throttle、XID、TFLOPS 抖动都没按 PDF 统计。
-5. 少真正 NCCL 性能：当前退化到 torchrun fallback，没有 `nccl-tests` bus BW。
-6. 少 NCCL 全操作和三档消息：PDF 要 AllReduce/AllGather/ReduceScatter/Broadcast/SendRecv/AllToAll，且 1MB/256MB/2GB 都过线。
-7. 少 NCCL 重复 3 次取最差值和标准差 <=3%。
-8. 少完整 P2P 8x8 矩阵：没有非对角均值、最小值、偏差判断。
-9. 少逐 GPU compute 一致性：没有真正分别测 8 卡同 dtype 极差/均值 <=3%。
-10. 少 FP64 和 INT8。
-11. 少 RDMA 生产口径：当前 `localhost`，64KB message，阈值 10us；PDF 要 4MB BW、8B latency、write/read >=47GB/s、write_lat <=2us、read_lat <=3.5us。
-12. 少 PFC/ECN 错误计数和 ibping 双向。
-13. 少真正 8 卡分布式 Training Simulation 验收。
-14. 少严格最终 verdict：当前代码会把 `passed=false` 的模块也计入“通过”，这是验收逻辑漏洞。
-
-## 建议
-
-`test all` 可以继续作为快速初筛跑，但如果目标是对齐 `H100_production_acceptance.pdf`，需要把它升级成“生产验收模式”。优先级如下：
-
-1. 先修汇总 verdict：任何子模块 `passed=false` 必须导致整机 FAIL。
-2. 先装好 `nccl-tests` 和 `gpu-burn`，否则 NCCL/Stress 都不是生产口径。
-3. 增加 NVLink、DCGM、长时间 telemetry、P2P 矩阵。
-4. 改 RDMA 为生产参数，且支持跨节点。
-5. 改 compute/training 为逐 GPU/8 卡分布式验收。
-- 
2.47.2