From 1a8cf6cbbbf97cac60720147bf9042ab265c31a4 Mon Sep 17 00:00:00 2001 From: cs Date: Sat, 23 May 2026 19:36:53 +0800 Subject: [PATCH] Archive multinode NCCL raw artifacts --- docs/multinode_nccl_deep_diagnose_runbook.md | 8 +++++ modules/multinode_nccl_test.py | 35 +++++++++++++++++-- modules/report.py | 2 ++ ...ts_multinode_nccl_handoff_plan_20260523.md | 2 +- ...ts_multinode_nccl_latest_index_20260523.md | 10 +++++- scripts/run_multinode_nccl_pdf_matrix.sh | 7 +++- 6 files changed, 59 insertions(+), 5 deletions(-) diff --git a/docs/multinode_nccl_deep_diagnose_runbook.md b/docs/multinode_nccl_deep_diagnose_runbook.md index 8bd082e..433d1ce 100644 --- a/docs/multinode_nccl_deep_diagnose_runbook.md +++ b/docs/multinode_nccl_deep_diagnose_runbook.md @@ -34,6 +34,14 @@ bash scripts/run_multinode_nccl_pdf_matrix.sh 它会跑 2 机 x 1/2/4/8 GPU per node 的 `all_reduce_perf` 和 `alltoall_perf`,输出到 `reports/multinode_nccl_pdf_matrix_YYYYMMDD_HHMMSS.md`。 +同时会生成: + +```text +reports/multinode_nccl_pdf_matrix_YYYYMMDD_HHMMSS_artifacts/ +``` + +每个 case 保存完整 `*.cmd.txt`、`*.stdout.txt`、`*.stderr.txt` 和解析后的 `*.json`,用于复核原始 NCCL 输出。 + 默认输出目录为: ```text diff --git a/modules/multinode_nccl_test.py b/modules/multinode_nccl_test.py index 4e72525..c5afed6 100644 --- a/modules/multinode_nccl_test.py +++ b/modules/multinode_nccl_test.py @@ -1,5 +1,6 @@ """Multi-node NCCL benchmark wrapper for nccl-tests via mpirun.""" +import json import os import re import shutil @@ -36,6 +37,7 @@ class MultiNodeNCCLTest: self.cfg = config.get("multinode_nccl", {}) or {} self.tools_dir = resolve_tools_dir(config) self.console = Console() + self.artifact_dir = os.environ.get("MULTINODE_NCCL_ARTIFACT_DIR") or self.cfg.get("artifact_dir") def _find_mpirun(self) -> Optional[str]: configured = self.cfg.get("mpirun_path") @@ -252,6 +254,7 @@ class MultiNodeNCCLTest: "hosts": hosts, "preflight": preflight, "tests": results, + "artifact_dir": self.artifact_dir, "timestamp": datetime.now().isoformat(), } @@ -294,7 +297,7 @@ class MultiNodeNCCLTest: try: r = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout, env=self._runtime_env()) except subprocess.TimeoutExpired: - return { + result = { "label": topo["label"], "nodes": nodes, "gpus_per_node": gpus_per_node, @@ -305,6 +308,8 @@ class MultiNodeNCCLTest: "error": f"timeout after {timeout}s", "started_at": started, } + self._write_artifacts(label, topo, result, "", "") + return result parsed = self._parse_nccl_output(r.stdout) net_diag = self._parse_network_diagnostics(r.stdout + "\n" + r.stderr) @@ -312,7 +317,7 @@ class MultiNodeNCCLTest: wrong = sum(row.get("wrong", 0) for row in parsed["by_size"]) has_bw = parsed["peak_busbw_gbps"] > 0 status = "PASS" if r.returncode == 0 and has_bw and wrong == 0 and parsed["peak_busbw_gbps"] >= threshold else "FAIL" - return { + result = { "label": topo["label"], "nodes": nodes, "gpus_per_node": gpus_per_node, @@ -335,6 +340,26 @@ class MultiNodeNCCLTest: "started_at": started, "finished_at": datetime.now().isoformat(), } + self._write_artifacts(label, topo, result, r.stdout, r.stderr) + return result + + def _write_artifacts(self, label: str, topo: dict, result: dict, stdout: str, stderr: str): + if not self.artifact_dir: + return + os.makedirs(self.artifact_dir, exist_ok=True) + prefix = _safe_name(f"{label}_{topo.get('nodes')}x{topo.get('gpus_per_node')}_{topo.get('label')}") + base = os.path.join(self.artifact_dir, prefix) + with open(base + ".cmd.txt", "w") as f: + f.write(result.get("command", "")) + f.write("\n") + with open(base + ".stdout.txt", "w") as f: + f.write(stdout) + with open(base + ".stderr.txt", "w") as f: + f.write(stderr) + artifact_result = {k: v for k, v in result.items() if k not in ("stdout_tail", "stderr_tail")} + with open(base + ".json", "w") as f: + json.dump(artifact_result, f, indent=2, default=str) + result["artifact_prefix"] = base def _threshold_for(self, label: str, topo: dict = None) -> float: if topo and topo.get("min_peak_busbw_gbps") is not None: @@ -485,3 +510,9 @@ def _format_gbps(value) -> str: if numeric.is_integer(): return f"{numeric:.0f}" return f"{numeric:.2f}" + + +def _safe_name(value: str) -> str: + text = re.sub(r"[^A-Za-z0-9_.-]+", "_", value.strip()) + text = re.sub(r"_+", "_", text).strip("_") + return text[:160] or "case" diff --git a/modules/report.py b/modules/report.py index 79640c7..8411521 100644 --- a/modules/report.py +++ b/modules/report.py @@ -468,6 +468,8 @@ class ReportGenerator: if multinode and not multinode.get("error"): lines.append("## Multi-node NCCL / Cross Leaf\n") lines.append(f"Source: {multinode.get('source', 'unknown')} | Mode: {multinode.get('mode', 'unknown')}\n") + if multinode.get("artifact_dir"): + lines.append(f"- **Artifacts:** `{multinode.get('artifact_dir')}`") hosts = multinode.get("hosts", []) if hosts: host_text = ", ".join(f"{h.get('name') or h.get('addr')}({h.get('addr')})" for h in hosts) diff --git a/reports_multinode_nccl_handoff_plan_20260523.md b/reports_multinode_nccl_handoff_plan_20260523.md index 25b78cf..05df781 100644 --- a/reports_multinode_nccl_handoff_plan_20260523.md +++ b/reports_multinode_nccl_handoff_plan_20260523.md @@ -176,7 +176,7 @@ OUT_DIR=/root/test_gpu_scripts/reports/nccl_deep_diag_plugin_check_$(date +%Y%m% | `scripts/multinode_nccl_deep_diagnose.sh` | 可复跑诊断脚本 | | `scripts/nccl_environment_snapshot.sh` | 单节点 HCA/plugin/topo 快照脚本 | | `scripts/run_h100_single_node_all.sh` | 单节点原始 `test all` 报告入口 | -| `scripts/run_multinode_nccl_pdf_matrix.sh` | 多机多卡 PDF 矩阵报告入口 | +| `scripts/run_multinode_nccl_pdf_matrix.sh` | 多机多卡 PDF 矩阵报告入口;复跑时额外归档每个 case 的完整 `cmd/stdout/stderr/json` | | `configs/multinode_nccl_nccl227_pdf_matrix.yaml` | 多机多卡 PDF 矩阵配置 | ## 当前建议 diff --git a/reports_multinode_nccl_latest_index_20260523.md b/reports_multinode_nccl_latest_index_20260523.md index ef9bf8c..1aa52ef 100644 --- a/reports_multinode_nccl_latest_index_20260523.md +++ b/reports_multinode_nccl_latest_index_20260523.md @@ -30,7 +30,7 @@ | `scripts/multinode_nccl_deep_diagnose.sh` | 可复跑的多节点 NCCL 深度诊断脚本 | | `scripts/nccl_environment_snapshot.sh` | 单节点 NCCL/RDMA 环境等价性快照脚本,不启动 NCCL workload | | `scripts/run_h100_single_node_all.sh` | 单节点 H100 `test all` 原始报告入口,默认同时采环境快照 | -| `scripts/run_multinode_nccl_pdf_matrix.sh` | 多机多卡 PDF 矩阵入口,跑 2 机 x 1/2/4/8 GPU per node 的 allreduce/alltoall | +| `scripts/run_multinode_nccl_pdf_matrix.sh` | 多机多卡 PDF 矩阵入口,跑 2 机 x 1/2/4/8 GPU per node 的 allreduce/alltoall,并归档每个 case 的 command/stdout/stderr/parsed JSON | | `configs/multinode_nccl_nccl227_pdf_matrix.yaml` | 多机多卡 PDF 矩阵配置,固定 NCCL 2.27.7 和 `/data/nccl-tests-latest/build` | | `docs/multinode_nccl_deep_diagnose_runbook.md` | 诊断脚本中文 runbook | @@ -117,6 +117,14 @@ local copy: reports_multinode_nccl_pdf_matrix_20260523_112247.md summary: reports_multinode_nccl_pdf_matrix_run_20260523.md ``` +下一次用 `scripts/run_multinode_nccl_pdf_matrix.sh` 复跑时,还会生成: + +```text +/root/test_gpu_scripts/reports/multinode_nccl_pdf_matrix_YYYYMMDD_HHMMSS_artifacts/ +``` + +目录内按 case 保存完整 `cmd/stdout/stderr/json`,用于给网络/硬件侧复核原始 NCCL 输出。 + ## 当前证据摘要 ### HCA / rail diff --git a/scripts/run_multinode_nccl_pdf_matrix.sh b/scripts/run_multinode_nccl_pdf_matrix.sh index c61dcab..572ce04 100755 --- a/scripts/run_multinode_nccl_pdf_matrix.sh +++ b/scripts/run_multinode_nccl_pdf_matrix.sh @@ -92,6 +92,7 @@ TS="$(date +%Y%m%d_%H%M%S)" mkdir -p "$OUT_DIR" REPORT_FILE="$OUT_DIR/multinode_nccl_pdf_matrix_${TS}.${FORMAT}" +ARTIFACT_DIR="$OUT_DIR/multinode_nccl_pdf_matrix_${TS}_artifacts" PREFLIGHT_CMD=(bash "$PROJECT_DIR/scripts/multinode_nccl_deep_diagnose.sh" preflight) MATRIX_CMD=( "$PYTHON_BIN" "$PROJECT_DIR/gpu_tester.py" @@ -105,6 +106,7 @@ MATRIX_CMD=( echo "Project: $PROJECT_DIR" echo "Config: $CONFIG_FILE" echo "Report: $REPORT_FILE" +echo "Artifacts: $ARTIFACT_DIR" echo "Matrix: 2 nodes x {1,2,4,8} GPUs per node; all_reduce_perf + alltoall_perf; 16G" if ((DRY_RUN)); then @@ -117,6 +119,7 @@ if ((DRY_RUN)); then exit 0 fi printf 'DRY RUN matrix:' + printf ' MULTINODE_NCCL_ARTIFACT_DIR=%q' "$ARTIFACT_DIR" printf ' %q' "${MATRIX_CMD[@]}" printf '\n' exit 0 @@ -135,8 +138,10 @@ if ((PREFLIGHT_ONLY)); then exit 0 fi -"${MATRIX_CMD[@]}" +mkdir -p "$ARTIFACT_DIR" +MULTINODE_NCCL_ARTIFACT_DIR="$ARTIFACT_DIR" "${MATRIX_CMD[@]}" status=$? echo "Report written to: $REPORT_FILE" +echo "Artifacts written to: $ARTIFACT_DIR" exit "$status"