From 0a772103f0037ea0f4f3cc5923ff3945db2e4056 Mon Sep 17 00:00:00 2001 From: cs Date: Sat, 23 May 2026 20:11:22 +0800 Subject: [PATCH] Archive all-collectives NCCL artifacts --- ...llectives_20260523_120144_artifacts.sha256 | 24 ++++++++++ ..._collectives_20260523_120144_bundle.sha256 | 2 + ...ives_artifacts_manifest_20260523_120144.md | 46 +++++++++++++++++++ ...ts_multinode_nccl_handoff_plan_20260523.md | 2 + ...ts_multinode_nccl_latest_index_20260523.md | 10 +++- 5 files changed, 82 insertions(+), 2 deletions(-) create mode 100644 reports_multinode_nccl_all_collectives_20260523_120144_artifacts.sha256 create mode 100644 reports_multinode_nccl_all_collectives_20260523_120144_bundle.sha256 create mode 100644 reports_multinode_nccl_all_collectives_artifacts_manifest_20260523_120144.md diff --git a/reports_multinode_nccl_all_collectives_20260523_120144_artifacts.sha256 b/reports_multinode_nccl_all_collectives_20260523_120144_artifacts.sha256 new file mode 100644 index 0000000..0264ba3 --- /dev/null +++ b/reports_multinode_nccl_all_collectives_20260523_120144_artifacts.sha256 @@ -0,0 +1,24 @@ +efa4a915bdf4943aef5d88c402c24eb2c60848e5f440f58058a1e99217b07e0d reports/multinode_nccl_all_collectives_20260523_120144_artifacts/allgather_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.cmd.txt +020eb35ddc5933da78b5c00c1b6fc25b11b23c4505300276d9736fbe8a35519b reports/multinode_nccl_all_collectives_20260523_120144_artifacts/allgather_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json +e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 reports/multinode_nccl_all_collectives_20260523_120144_artifacts/allgather_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.stderr.txt +903772b675d9a9f7b04e061a25a90f97bf7844dddb5f3809bc9c501f4d6c783d reports/multinode_nccl_all_collectives_20260523_120144_artifacts/allgather_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.stdout.txt +b7ea7350b3703d4b31389d92b375562bd04a50b40fe16a6c8d037b134a51dbd5 reports/multinode_nccl_all_collectives_20260523_120144_artifacts/allreduce_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.cmd.txt +47f68b7510df3b472e7ac0ec2fb53dcefbe687bb4de0c889f8947cc652d09e61 reports/multinode_nccl_all_collectives_20260523_120144_artifacts/allreduce_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json +e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 reports/multinode_nccl_all_collectives_20260523_120144_artifacts/allreduce_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.stderr.txt +6889180431d639e414e188e1dbc586157565e8506255731b7b38d221d0f72919 reports/multinode_nccl_all_collectives_20260523_120144_artifacts/allreduce_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.stdout.txt +6ecbd8473d987d2a7839135029902bd629403eb407a7873502a49be26fa1c947 reports/multinode_nccl_all_collectives_20260523_120144_artifacts/alltoall_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.cmd.txt +fa2828cdfcb86e6715a17c8bf45de10ce421c12f0877efff9bafb218b2f00df3 reports/multinode_nccl_all_collectives_20260523_120144_artifacts/alltoall_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json +e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 reports/multinode_nccl_all_collectives_20260523_120144_artifacts/alltoall_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.stderr.txt +2eae24183754f8d084945d9857b84033ebccf1a2e606931b4f4fc19c5e2e876f reports/multinode_nccl_all_collectives_20260523_120144_artifacts/alltoall_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.stdout.txt +277e900dc1efa8f036616226dbc30cb616ba97337e929ad8b1a14c12484867b3 reports/multinode_nccl_all_collectives_20260523_120144_artifacts/broadcast_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.cmd.txt +077fec1bf498fd202e2866f1cf6fb4502ac8d1bafba156f213453b21f6a6df2b reports/multinode_nccl_all_collectives_20260523_120144_artifacts/broadcast_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json +e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 reports/multinode_nccl_all_collectives_20260523_120144_artifacts/broadcast_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.stderr.txt +727c69ad6111b891c25360bd9e97ce15f2e7a36d5ff61ae88a7577ecb61c895f reports/multinode_nccl_all_collectives_20260523_120144_artifacts/broadcast_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.stdout.txt +8bec99a952eeb26fa3c6d89cbf2331393923fd4f0fae153b8efe3da239c0a09f reports/multinode_nccl_all_collectives_20260523_120144_artifacts/reducescatter_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.cmd.txt +be24943eb4b63e304cee41831adeb23ffbbc0e890ff19b067e06d6a4b48b2d90 reports/multinode_nccl_all_collectives_20260523_120144_artifacts/reducescatter_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json +e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 reports/multinode_nccl_all_collectives_20260523_120144_artifacts/reducescatter_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.stderr.txt +a8220b6a4fe3ae037837919a181452e0fc735f58f27fafff07ea431b09b905de reports/multinode_nccl_all_collectives_20260523_120144_artifacts/reducescatter_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.stdout.txt +ead794f19e1d2d780cf1840c124b6e0955c70c8b157feb47c4826599d5643b39 reports/multinode_nccl_all_collectives_20260523_120144_artifacts/sendrecv_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.cmd.txt +4560364922a85d21827357b906491aae8283c6148ff1c0e0f0dc379a68307fdd reports/multinode_nccl_all_collectives_20260523_120144_artifacts/sendrecv_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json +e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 reports/multinode_nccl_all_collectives_20260523_120144_artifacts/sendrecv_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.stderr.txt +ade548ee5fdbe2d1fce461237b5b713cc2af24e6c2857bbbd73837f28551af27 reports/multinode_nccl_all_collectives_20260523_120144_artifacts/sendrecv_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.stdout.txt diff --git a/reports_multinode_nccl_all_collectives_20260523_120144_bundle.sha256 b/reports_multinode_nccl_all_collectives_20260523_120144_bundle.sha256 new file mode 100644 index 0000000..3097f81 --- /dev/null +++ b/reports_multinode_nccl_all_collectives_20260523_120144_bundle.sha256 @@ -0,0 +1,2 @@ +06c565281813c4260da9cfee8f0b0289b61b3be95c01dd670c71fa1a441133e3 reports/multinode_nccl_all_collectives_20260523_120144.md +fa5961d47a5905da6ebc6c726421d73ddc2314a316a8f578683d31fe69c256e5 reports/multinode_nccl_all_collectives_20260523_120144_artifacts.tar.gz diff --git a/reports_multinode_nccl_all_collectives_artifacts_manifest_20260523_120144.md b/reports_multinode_nccl_all_collectives_artifacts_manifest_20260523_120144.md new file mode 100644 index 0000000..b1fc9b5 --- /dev/null +++ b/reports_multinode_nccl_all_collectives_artifacts_manifest_20260523_120144.md @@ -0,0 +1,46 @@ +# 多机多卡 NCCL 六项 Collective Artifacts Manifest 2026-05-23 + +- Remote report: `reports/multinode_nccl_all_collectives_20260523_120144.md` +- Remote artifact dir: `reports/multinode_nccl_all_collectives_20260523_120144_artifacts` +- Remote artifact tar: `reports/multinode_nccl_all_collectives_20260523_120144_artifacts.tar.gz` +- Remote bundle checksum: `reports/multinode_nccl_all_collectives_20260523_120144_bundle.sha256` +- Remote per-file checksum: `reports/multinode_nccl_all_collectives_20260523_120144_artifacts.sha256` +- Local report copy: `reports_multinode_nccl_all_collectives_20260523_120144.md` +- Local artifact tar copy: `/private/tmp/multinode_nccl_all_collectives_20260523_120144_artifacts.tar.gz` +- Case count: `6` +- Artifact files: `24` + +## Case Summary + +| Case | Peak Bus BW | Avg Bus BW | Threshold | Wrong | Return Code | Status | +|---|---:|---:|---:|---:|---:|---| +| `allreduce_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run` | 354.27 | 354.45 | 491.84 | 0 | 0 | FAIL | +| `alltoall_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run` | 37.00 | 37.14 | 76.54 | 0 | 0 | FAIL | +| `broadcast_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run` | 191.65 | 190.25 | 0.00 | 0 | 0 | PASS | +| `reducescatter_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run` | 192.75 | 192.74 | 0.00 | 0 | 0 | PASS | +| `allgather_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run` | 192.14 | 192.47 | 0.00 | 0 | 0 | PASS | +| `sendrecv_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run` | 26.98 | 26.97 | 0.00 | 0 | 0 | PASS | + +## Bundle Checksums + +```text +06c565281813c4260da9cfee8f0b0289b61b3be95c01dd670c71fa1a441133e3 reports/multinode_nccl_all_collectives_20260523_120144.md +fa5961d47a5905da6ebc6c726421d73ddc2314a316a8f578683d31fe69c256e5 reports/multinode_nccl_all_collectives_20260523_120144_artifacts.tar.gz +``` + +## Per-file Checksums + +```text +020eb35ddc5933da78b5c00c1b6fc25b11b23c4505300276d9736fbe8a35519b reports/multinode_nccl_all_collectives_20260523_120144_artifacts/allgather_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json +47f68b7510df3b472e7ac0ec2fb53dcefbe687bb4de0c889f8947cc652d09e61 reports/multinode_nccl_all_collectives_20260523_120144_artifacts/allreduce_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json +fa2828cdfcb86e6715a17c8bf45de10ce421c12f0877efff9bafb218b2f00df3 reports/multinode_nccl_all_collectives_20260523_120144_artifacts/alltoall_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json +077fec1bf498fd202e2866f1cf6fb4502ac8d1bafba156f213453b21f6a6df2b reports/multinode_nccl_all_collectives_20260523_120144_artifacts/broadcast_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json +be24943eb4b63e304cee41831adeb23ffbbc0e890ff19b067e06d6a4b48b2d90 reports/multinode_nccl_all_collectives_20260523_120144_artifacts/reducescatter_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json +4560364922a85d21827357b906491aae8283c6148ff1c0e0f0dc379a68307fdd reports/multinode_nccl_all_collectives_20260523_120144_artifacts/sendrecv_2x8_2_nodes_x_8_GPUs_all_collectives_evidence_run.json +``` + +完整逐文件 checksum 已保存为: + +```text +reports_multinode_nccl_all_collectives_20260523_120144_artifacts.sha256 +``` diff --git a/reports_multinode_nccl_handoff_plan_20260523.md b/reports_multinode_nccl_handoff_plan_20260523.md index 80b27c5..69bae84 100644 --- a/reports_multinode_nccl_handoff_plan_20260523.md +++ b/reports_multinode_nccl_handoff_plan_20260523.md @@ -17,6 +17,7 @@ | 原始 artifacts 已归档 | `/root/test_gpu_scripts/reports/multinode_nccl_pdf_matrix_20260523_113803_artifacts`,每个 case 有完整 `cmd/stdout/stderr/json` | | artifacts 信号已分析 | `reports_multinode_nccl_artifact_signal_analysis_20260523.md`,确认所有 case 都走 IB/GDRDMA 和 4 条 400G HCA,未见 SHARP/CollNet | | 多机六项 collective 已补测 | `reports_multinode_nccl_all_collectives_run_20260523.md`,2x8 下 6 项均正确性通过,allreduce/alltoall 按 PDF 阈值仍 FAIL | +| 六项 collective artifacts 已归档 | `reports_multinode_nccl_all_collectives_artifacts_manifest_20260523_120144.md`,远端 tar 为 `reports/multinode_nccl_all_collectives_20260523_120144_artifacts.tar.gz` | | 没看到硬错误 | 未见 discard、RoCE retrans、slow restart、packet sequence error 等增长 | | 当前缺外部 NCCL 网络组件 | 未找到 `libnccl-net*.so*` / `libsharp*.so*`,未见 SHARP/HCOLL 包 | @@ -185,6 +186,7 @@ OUT_DIR=/root/test_gpu_scripts/reports/nccl_deep_diag_plugin_check_$(date +%Y%m% | `reports_multinode_nccl_artifact_signal_analysis_20260523.md` | 最新 artifacts 的 IB/GDRDMA/HCA/plugin/SHARP 信号分析 | | `reports_multinode_nccl_all_collectives_20260523_120144.md` | 最新多机多卡 2x8 六项 collective 原始报告 | | `reports_multinode_nccl_all_collectives_run_20260523.md` | 最新多机多卡 2x8 六项 collective 中文摘要 | +| `reports_multinode_nccl_all_collectives_artifacts_manifest_20260523_120144.md` | 最新多机多卡 2x8 六项 collective artifacts manifest 和 checksum | | `reports_multinode_nccl_deep_diagnose_run_20260523.md` | 本轮深度复跑结果 | | `reports_multinode_nccl_environment_gap_20260523.md` | 硬件/软件环境等价性缺口 | | `reports_multinode_nccl_counter_probe_20260523.md` | RDMA rail/counter 证据 | diff --git a/reports_multinode_nccl_latest_index_20260523.md b/reports_multinode_nccl_latest_index_20260523.md index ebc3481..1e99d08 100644 --- a/reports_multinode_nccl_latest_index_20260523.md +++ b/reports_multinode_nccl_latest_index_20260523.md @@ -9,6 +9,7 @@ - 2026-05-23 `11:38` 已完成带 artifacts 的正式多机多卡 PDF matrix 复跑,原始报告为 `reports_multinode_nccl_pdf_matrix_20260523_113803.md`,中文结论为 `reports_multinode_nccl_pdf_matrix_run_20260523.md`,artifact manifest 为 `reports_multinode_nccl_pdf_matrix_artifacts_manifest_20260523_113803.md`。 - 已补充 artifacts 信号分析:`reports_multinode_nccl_artifact_signal_analysis_20260523.md`。结论是所有 case 都走 `IB`,都使用 `mlx5_0,mlx5_1,mlx5_6,mlx5_7`,都有 GDRDMA 信号,但没有 SHARP/CollNet/外部 NCCL net plugin 证据。 - 已补充并实跑多机多卡 2x8 六项 collective:`reports_multinode_nccl_all_collectives_run_20260523.md`。新增 `broadcast/reducescatter/allgather/sendrecv` 均 `returncode=0`、`wrong=0`、走 `IB/GDRDMA`;已知 PDF 阈值项 `allreduce/alltoall` 仍 FAIL。 +- 六项 collective 的完整 artifacts 已归档:`reports_multinode_nccl_all_collectives_artifacts_manifest_20260523_120144.md`,远端 tar 为 `reports/multinode_nccl_all_collectives_20260523_120144_artifacts.tar.gz`。 - 2 机 1/2/4 GPU per node 档位已接近 PDF 参考值,但严格按阈值仍 FAIL。 - 2 机 8 GPU 档位仍未达到 PDF 参考值: - allreduce 实测 `353.85 GB/s busbw`,PDF 目标 `491.84 GB/s`。 @@ -24,8 +25,9 @@ | 2 | `reports_multinode_nccl_environment_gap_20260523.md` | 说明当前环境为什么不能证明与 PDF 等价,重点是 4 x 400G rail 和缺少 NCCL net plugin / SHARP | | 3 | `reports_multinode_nccl_artifact_signal_analysis_20260523.md` | 最新 artifacts 信号分析,确认 IB/GDRDMA/HCA 使用情况和 plugin/SHARP 缺口 | | 4 | `reports_multinode_nccl_all_collectives_run_20260523.md` | 多机多卡 2x8 六项 collective 补测结果,补齐单机 test all 的 NCCL 覆盖面 | -| 5 | `reports_multinode_nccl_pdf_matrix_run_20260523.md` | 最新正式多机多卡 PDF matrix 结果摘要 | -| 6 | `reports_multinode_nccl_deep_diagnose_run_20260523.md` | 本轮完整深度诊断复跑结果,包含 counter、GRAPH、PXN sweep | +| 5 | `reports_multinode_nccl_all_collectives_artifacts_manifest_20260523_120144.md` | 多机多卡 2x8 六项 collective artifacts manifest 和 checksum | +| 6 | `reports_multinode_nccl_pdf_matrix_run_20260523.md` | 最新正式多机多卡 PDF matrix 结果摘要 | +| 7 | `reports_multinode_nccl_deep_diagnose_run_20260523.md` | 本轮完整深度诊断复跑结果,包含 counter、GRAPH、PXN sweep | ## 关键脚本 @@ -100,6 +102,7 @@ OUT_DIR=/root/test_gpu_scripts/reports/nccl_deep_diag_plugin_check_$(date +%Y%m% /root/test_gpu_scripts/reports_multinode_nccl_environment_gap_20260523.md /root/test_gpu_scripts/reports_multinode_nccl_artifact_signal_analysis_20260523.md /root/test_gpu_scripts/reports_multinode_nccl_all_collectives_run_20260523.md +/root/test_gpu_scripts/reports_multinode_nccl_all_collectives_artifacts_manifest_20260523_120144.md /root/test_gpu_scripts/reports_multinode_nccl_deep_diagnose_run_20260523.md ``` @@ -140,8 +143,10 @@ manifest: reports_multinode_nccl_pdf_matrix_artifacts_manifest_20260523_113803.m ```text aikubeworker0012: /root/test_gpu_scripts/reports/multinode_nccl_all_collectives_20260523_120144.md artifacts: /root/test_gpu_scripts/reports/multinode_nccl_all_collectives_20260523_120144_artifacts +artifacts tar: /root/test_gpu_scripts/reports/multinode_nccl_all_collectives_20260523_120144_artifacts.tar.gz local copy: reports_multinode_nccl_all_collectives_20260523_120144.md summary: reports_multinode_nccl_all_collectives_run_20260523.md +manifest: reports_multinode_nccl_all_collectives_artifacts_manifest_20260523_120144.md ``` 下一次用 `scripts/run_multinode_nccl_pdf_matrix.sh` 复跑时,还会生成: @@ -231,6 +236,7 @@ PXN disabled sweep 未发现有效参数: | `reports_multinode_nccl_artifact_signal_analysis_20260523.md` | 最新 artifacts 的 IB/GDRDMA/HCA/plugin/SHARP 信号分析 | | `reports_multinode_nccl_all_collectives_20260523_120144.md` | 最新多机多卡 2x8 六项 collective 原始报告 | | `reports_multinode_nccl_all_collectives_run_20260523.md` | 最新多机多卡 2x8 六项 collective 中文摘要 | +| `reports_multinode_nccl_all_collectives_artifacts_manifest_20260523_120144.md` | 最新多机多卡 2x8 六项 collective artifacts manifest 和 checksum | | `reports_multinode_nccl_counter_probe_20260523.md` | RDMA rail 和 counter 证据 | | `reports_multinode_nccl_alltoall_tuning_20260523.md` | alltoall PXN 和参数 sweep 结论 | | `reports_rdma_single_node_summary.md` | 单节点 RDMA/HCA 速率摘要 |