Archive multinode NCCL raw artifacts

This commit is contained in:
cs 2026-05-23 19:36:53 +08:00
parent 63c32fd75d
commit 1a8cf6cbbb
6 changed files with 59 additions and 5 deletions

View File

@ -34,6 +34,14 @@ bash scripts/run_multinode_nccl_pdf_matrix.sh
它会跑 2 机 x 1/2/4/8 GPU per node 的 `all_reduce_perf` 和 `alltoall_perf`,输出到
`reports/multinode_nccl_pdf_matrix_YYYYMMDD_HHMMSS.md`
同时会生成:
```text
reports/multinode_nccl_pdf_matrix_YYYYMMDD_HHMMSS_artifacts/
```
每个 case 保存完整 `*.cmd.txt`、`*.stdout.txt`、`*.stderr.txt` 和解析后的 `*.json`,用于复核原始 NCCL 输出。
默认输出目录为:
```text

View File

@ -1,5 +1,6 @@
"""Multi-node NCCL benchmark wrapper for nccl-tests via mpirun."""
import json
import os
import re
import shutil
@ -36,6 +37,7 @@ class MultiNodeNCCLTest:
self.cfg = config.get("multinode_nccl", {}) or {}
self.tools_dir = resolve_tools_dir(config)
self.console = Console()
self.artifact_dir = os.environ.get("MULTINODE_NCCL_ARTIFACT_DIR") or self.cfg.get("artifact_dir")
def _find_mpirun(self) -> Optional[str]:
configured = self.cfg.get("mpirun_path")
@ -252,6 +254,7 @@ class MultiNodeNCCLTest:
"hosts": hosts,
"preflight": preflight,
"tests": results,
"artifact_dir": self.artifact_dir,
"timestamp": datetime.now().isoformat(),
}
@ -294,7 +297,7 @@ class MultiNodeNCCLTest:
try:
r = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout, env=self._runtime_env())
except subprocess.TimeoutExpired:
return {
result = {
"label": topo["label"],
"nodes": nodes,
"gpus_per_node": gpus_per_node,
@ -305,6 +308,8 @@ class MultiNodeNCCLTest:
"error": f"timeout after {timeout}s",
"started_at": started,
}
self._write_artifacts(label, topo, result, "", "")
return result
parsed = self._parse_nccl_output(r.stdout)
net_diag = self._parse_network_diagnostics(r.stdout + "\n" + r.stderr)
@ -312,7 +317,7 @@ class MultiNodeNCCLTest:
wrong = sum(row.get("wrong", 0) for row in parsed["by_size"])
has_bw = parsed["peak_busbw_gbps"] > 0
status = "PASS" if r.returncode == 0 and has_bw and wrong == 0 and parsed["peak_busbw_gbps"] >= threshold else "FAIL"
return {
result = {
"label": topo["label"],
"nodes": nodes,
"gpus_per_node": gpus_per_node,
@ -335,6 +340,26 @@ class MultiNodeNCCLTest:
"started_at": started,
"finished_at": datetime.now().isoformat(),
}
self._write_artifacts(label, topo, result, r.stdout, r.stderr)
return result
def _write_artifacts(self, label: str, topo: dict, result: dict, stdout: str, stderr: str):
    """Archive the command, raw streams and result dict of a single case.

    Does nothing when ``self.artifact_dir`` is falsy. Otherwise the
    directory is created if needed and four files sharing one sanitized
    prefix are written into it:

    * ``<prefix>.cmd.txt``    - ``result["command"]`` followed by a newline
    * ``<prefix>.stdout.txt`` - ``stdout`` as passed in
    * ``<prefix>.stderr.txt`` - ``stderr`` as passed in
    * ``<prefix>.json``       - ``result`` without ``stdout_tail`` and
      ``stderr_tail``

    Args:
        label: Case label; first component of the file prefix.
        topo: Topology dict; its ``nodes``, ``gpus_per_node`` and ``label``
            entries are folded into the file prefix.
        result: Result dict for the case. It is mutated: on success the
            key ``artifact_prefix`` is set to the shared path prefix.
        stdout: Captured standard output (``None`` is treated as empty).
        stderr: Captured standard error (``None`` is treated as empty).
    """
    if not self.artifact_dir:
        return
    os.makedirs(self.artifact_dir, exist_ok=True)
    prefix = _safe_name(f"{label}_{topo.get('nodes')}x{topo.get('gpus_per_node')}_{topo.get('label')}")
    base = os.path.join(self.artifact_dir, prefix)
    # Always write UTF-8: the locale default differs between hosts and can
    # fail to encode non-ASCII characters in the command or the output.
    with open(base + ".cmd.txt", "w", encoding="utf-8") as f:
        f.write(result.get("command", ""))
        f.write("\n")
    with open(base + ".stdout.txt", "w", encoding="utf-8") as f:
        f.write(stdout or "")
    with open(base + ".stderr.txt", "w", encoding="utf-8") as f:
        f.write(stderr or "")
    # The tails duplicate what the full stdout/stderr files already hold.
    artifact_result = {k: v for k, v in result.items() if k not in ("stdout_tail", "stderr_tail")}
    with open(base + ".json", "w", encoding="utf-8") as f:
        json.dump(artifact_result, f, indent=2, default=str)
    # Recorded after the JSON dump, so the archived JSON does not contain it.
    result["artifact_prefix"] = base
def _threshold_for(self, label: str, topo: dict = None) -> float:
if topo and topo.get("min_peak_busbw_gbps") is not None:
@ -485,3 +510,9 @@ def _format_gbps(value) -> str:
if numeric.is_integer():
return f"{numeric:.0f}"
return f"{numeric:.2f}"
def _safe_name(value: str) -> str:
text = re.sub(r"[^A-Za-z0-9_.-]+", "_", value.strip())
text = re.sub(r"_+", "_", text).strip("_")
return text[:160] or "case"

View File

@ -468,6 +468,8 @@ class ReportGenerator:
if multinode and not multinode.get("error"):
lines.append("## Multi-node NCCL / Cross Leaf\n")
lines.append(f"Source: {multinode.get('source', 'unknown')} | Mode: {multinode.get('mode', 'unknown')}\n")
if multinode.get("artifact_dir"):
lines.append(f"- **Artifacts:** `{multinode.get('artifact_dir')}`")
hosts = multinode.get("hosts", [])
if hosts:
host_text = ", ".join(f"{h.get('name') or h.get('addr')}({h.get('addr')})" for h in hosts)

View File

@ -176,7 +176,7 @@ OUT_DIR=/root/test_gpu_scripts/reports/nccl_deep_diag_plugin_check_$(date +%Y%m%
| `scripts/multinode_nccl_deep_diagnose.sh` | 可复跑诊断脚本 |
| `scripts/nccl_environment_snapshot.sh` | 单节点 HCA/plugin/topo 快照脚本 |
| `scripts/run_h100_single_node_all.sh` | 单节点原始 `test all` 报告入口 |
| `scripts/run_multinode_nccl_pdf_matrix.sh` | 多机多卡 PDF 矩阵报告入口 |
| `scripts/run_multinode_nccl_pdf_matrix.sh` | 多机多卡 PDF 矩阵报告入口;复跑时额外归档每个 case 的完整 `cmd/stdout/stderr/json` |
| `configs/multinode_nccl_nccl227_pdf_matrix.yaml` | 多机多卡 PDF 矩阵配置 |
## 当前建议

View File

@ -30,7 +30,7 @@
| `scripts/multinode_nccl_deep_diagnose.sh` | 可复跑的多节点 NCCL 深度诊断脚本 |
| `scripts/nccl_environment_snapshot.sh` | 单节点 NCCL/RDMA 环境等价性快照脚本,不启动 NCCL workload |
| `scripts/run_h100_single_node_all.sh` | 单节点 H100 `test all` 原始报告入口,默认同时采环境快照 |
| `scripts/run_multinode_nccl_pdf_matrix.sh` | 多机多卡 PDF 矩阵入口,跑 2 机 x 1/2/4/8 GPU per node 的 allreduce/alltoall |
| `scripts/run_multinode_nccl_pdf_matrix.sh` | 多机多卡 PDF 矩阵入口,跑 2 机 x 1/2/4/8 GPU per node 的 allreduce/alltoall,并归档每个 case 的 command/stdout/stderr/parsed JSON |
| `configs/multinode_nccl_nccl227_pdf_matrix.yaml` | 多机多卡 PDF 矩阵配置,固定 NCCL 2.27.7 和 `/data/nccl-tests-latest/build` |
| `docs/multinode_nccl_deep_diagnose_runbook.md` | 诊断脚本中文 runbook |
@ -117,6 +117,14 @@ local copy: reports_multinode_nccl_pdf_matrix_20260523_112247.md
summary: reports_multinode_nccl_pdf_matrix_run_20260523.md
```
下一次用 `scripts/run_multinode_nccl_pdf_matrix.sh` 复跑时,还会生成:
```text
/root/test_gpu_scripts/reports/multinode_nccl_pdf_matrix_YYYYMMDD_HHMMSS_artifacts/
```
目录内按 case 保存完整 `cmd/stdout/stderr/json`,用于给网络/硬件侧复核原始 NCCL 输出。
## 当前证据摘要
### HCA / rail

View File

@ -92,6 +92,7 @@ TS="$(date +%Y%m%d_%H%M%S)"
mkdir -p "$OUT_DIR"
REPORT_FILE="$OUT_DIR/multinode_nccl_pdf_matrix_${TS}.${FORMAT}"
ARTIFACT_DIR="$OUT_DIR/multinode_nccl_pdf_matrix_${TS}_artifacts"
PREFLIGHT_CMD=(bash "$PROJECT_DIR/scripts/multinode_nccl_deep_diagnose.sh" preflight)
MATRIX_CMD=(
"$PYTHON_BIN" "$PROJECT_DIR/gpu_tester.py"
@ -105,6 +106,7 @@ MATRIX_CMD=(
echo "Project: $PROJECT_DIR"
echo "Config: $CONFIG_FILE"
echo "Report: $REPORT_FILE"
echo "Artifacts: $ARTIFACT_DIR"
echo "Matrix: 2 nodes x {1,2,4,8} GPUs per node; all_reduce_perf + alltoall_perf; 16G"
if ((DRY_RUN)); then
@ -117,6 +119,7 @@ if ((DRY_RUN)); then
exit 0
fi
printf 'DRY RUN matrix:'
printf ' MULTINODE_NCCL_ARTIFACT_DIR=%q' "$ARTIFACT_DIR"
printf ' %q' "${MATRIX_CMD[@]}"
printf '\n'
exit 0
@ -135,8 +138,10 @@ if ((PREFLIGHT_ONLY)); then
exit 0
fi
"${MATRIX_CMD[@]}"
mkdir -p "$ARTIFACT_DIR"
MULTINODE_NCCL_ARTIFACT_DIR="$ARTIFACT_DIR" "${MATRIX_CMD[@]}"
status=$?
echo "Report written to: $REPORT_FILE"
echo "Artifacts written to: $ARTIFACT_DIR"
exit "$status"