From 098d1715f2a4d78153aab14f8087dc47509143c4 Mon Sep 17 00:00:00 2001 From: cs Date: Sat, 23 May 2026 19:36:53 +0800 Subject: [PATCH] Archive multinode NCCL raw artifacts --- docs/multinode_nccl_deep_diagnose_runbook.md | 8 ++++++++ modules/report.py | 2 ++ reports_multinode_nccl_handoff_plan_20260523.md | 2 +- reports_multinode_nccl_latest_index_20260523.md | 10 +++++++++- scripts/run_multinode_nccl_pdf_matrix.sh | 7 ++++++- 5 files changed, 26 insertions(+), 3 deletions(-) diff --git a/docs/multinode_nccl_deep_diagnose_runbook.md b/docs/multinode_nccl_deep_diagnose_runbook.md index 8bd082e..433d1ce 100644 --- a/docs/multinode_nccl_deep_diagnose_runbook.md +++ b/docs/multinode_nccl_deep_diagnose_runbook.md @@ -34,6 +34,14 @@ bash scripts/run_multinode_nccl_pdf_matrix.sh 它会跑 2 机 x 1/2/4/8 GPU per node 的 `all_reduce_perf` 和 `alltoall_perf`,输出到 `reports/multinode_nccl_pdf_matrix_YYYYMMDD_HHMMSS.md`。 +同时会生成: + +```text +reports/multinode_nccl_pdf_matrix_YYYYMMDD_HHMMSS_artifacts/ +``` + +每个 case 保存完整 `*.cmd.txt`、`*.stdout.txt`、`*.stderr.txt` 和解析后的 `*.json`,用于复核原始 NCCL 输出。 + 默认输出目录为: ```text diff --git a/modules/report.py b/modules/report.py index 79640c7..8411521 100644 --- a/modules/report.py +++ b/modules/report.py @@ -468,6 +468,8 @@ class ReportGenerator: if multinode and not multinode.get("error"): lines.append("## Multi-node NCCL / Cross Leaf\n") lines.append(f"Source: {multinode.get('source', 'unknown')} | Mode: {multinode.get('mode', 'unknown')}\n") + if multinode.get("artifact_dir"): + lines.append(f"- **Artifacts:** `{multinode.get('artifact_dir')}`") hosts = multinode.get("hosts", []) if hosts: host_text = ", ".join(f"{h.get('name') or h.get('addr')}({h.get('addr')})" for h in hosts) diff --git a/reports_multinode_nccl_handoff_plan_20260523.md b/reports_multinode_nccl_handoff_plan_20260523.md index 25b78cf..05df781 100644 --- a/reports_multinode_nccl_handoff_plan_20260523.md +++ b/reports_multinode_nccl_handoff_plan_20260523.md @@ -176,7 +176,7 @@ OUT_DIR=/root/test_gpu_scripts/reports/nccl_deep_diag_plugin_check_$(date +%Y%m% | `scripts/multinode_nccl_deep_diagnose.sh` | 可复跑诊断脚本 | | `scripts/nccl_environment_snapshot.sh` | 单节点 HCA/plugin/topo 快照脚本 | | `scripts/run_h100_single_node_all.sh` | 单节点原始 `test all` 报告入口 | -| `scripts/run_multinode_nccl_pdf_matrix.sh` | 多机多卡 PDF 矩阵报告入口 | +| `scripts/run_multinode_nccl_pdf_matrix.sh` | 多机多卡 PDF 矩阵报告入口;复跑时额外归档每个 case 的完整 `cmd/stdout/stderr/json` | | `configs/multinode_nccl_nccl227_pdf_matrix.yaml` | 多机多卡 PDF 矩阵配置 | ## 当前建议 diff --git a/reports_multinode_nccl_latest_index_20260523.md b/reports_multinode_nccl_latest_index_20260523.md index ef9bf8c..1aa52ef 100644 --- a/reports_multinode_nccl_latest_index_20260523.md +++ b/reports_multinode_nccl_latest_index_20260523.md @@ -30,7 +30,7 @@ | `scripts/multinode_nccl_deep_diagnose.sh` | 可复跑的多节点 NCCL 深度诊断脚本 | | `scripts/nccl_environment_snapshot.sh` | 单节点 NCCL/RDMA 环境等价性快照脚本,不启动 NCCL workload | | `scripts/run_h100_single_node_all.sh` | 单节点 H100 `test all` 原始报告入口,默认同时采环境快照 | -| `scripts/run_multinode_nccl_pdf_matrix.sh` | 多机多卡 PDF 矩阵入口,跑 2 机 x 1/2/4/8 GPU per node 的 allreduce/alltoall | +| `scripts/run_multinode_nccl_pdf_matrix.sh` | 多机多卡 PDF 矩阵入口,跑 2 机 x 1/2/4/8 GPU per node 的 allreduce/alltoall,并归档每个 case 的 command/stdout/stderr/parsed JSON | | `configs/multinode_nccl_nccl227_pdf_matrix.yaml` | 多机多卡 PDF 矩阵配置,固定 NCCL 2.27.7 和 `/data/nccl-tests-latest/build` | | `docs/multinode_nccl_deep_diagnose_runbook.md` | 诊断脚本中文 runbook | @@ -117,6 +117,14 @@ local copy: reports_multinode_nccl_pdf_matrix_20260523_112247.md summary: reports_multinode_nccl_pdf_matrix_run_20260523.md ``` +下一次用 `scripts/run_multinode_nccl_pdf_matrix.sh` 复跑时,还会生成: + +```text +/root/test_gpu_scripts/reports/multinode_nccl_pdf_matrix_YYYYMMDD_HHMMSS_artifacts/ +``` + +目录内按 case 保存完整 `cmd/stdout/stderr/json`,用于给网络/硬件侧复核原始 NCCL 输出。 + ## 当前证据摘要 ### HCA / rail diff --git a/scripts/run_multinode_nccl_pdf_matrix.sh b/scripts/run_multinode_nccl_pdf_matrix.sh index c61dcab..572ce04 100755 --- a/scripts/run_multinode_nccl_pdf_matrix.sh +++ b/scripts/run_multinode_nccl_pdf_matrix.sh @@ -92,6 +92,7 @@ TS="$(date +%Y%m%d_%H%M%S)" mkdir -p "$OUT_DIR" REPORT_FILE="$OUT_DIR/multinode_nccl_pdf_matrix_${TS}.${FORMAT}" +ARTIFACT_DIR="$OUT_DIR/multinode_nccl_pdf_matrix_${TS}_artifacts" PREFLIGHT_CMD=(bash "$PROJECT_DIR/scripts/multinode_nccl_deep_diagnose.sh" preflight) MATRIX_CMD=( "$PYTHON_BIN" "$PROJECT_DIR/gpu_tester.py" @@ -105,6 +106,7 @@ MATRIX_CMD=( echo "Project: $PROJECT_DIR" echo "Config: $CONFIG_FILE" echo "Report: $REPORT_FILE" +echo "Artifacts: $ARTIFACT_DIR" echo "Matrix: 2 nodes x {1,2,4,8} GPUs per node; all_reduce_perf + alltoall_perf; 16G" if ((DRY_RUN)); then @@ -117,6 +119,7 @@ if ((DRY_RUN)); then exit 0 fi printf 'DRY RUN matrix:' + printf ' MULTINODE_NCCL_ARTIFACT_DIR=%q' "$ARTIFACT_DIR" printf ' %q' "${MATRIX_CMD[@]}" printf '\n' exit 0 @@ -135,8 +138,10 @@ if ((PREFLIGHT_ONLY)); then exit 0 fi -"${MATRIX_CMD[@]}" +mkdir -p "$ARTIFACT_DIR" +MULTINODE_NCCL_ARTIFACT_DIR="$ARTIFACT_DIR" "${MATRIX_CMD[@]}" status=$? echo "Report written to: $REPORT_FILE" +echo "Artifacts written to: $ARTIFACT_DIR" exit "$status"