Archive multinode NCCL raw artifacts
This commit is contained in:
parent
63c32fd75d
commit
1a8cf6cbbb
@ -34,6 +34,14 @@ bash scripts/run_multinode_nccl_pdf_matrix.sh
|
||||
它会跑 2 机 x 1/2/4/8 GPU per node 的 `all_reduce_perf` 和 `alltoall_perf`,输出到
|
||||
`reports/multinode_nccl_pdf_matrix_YYYYMMDD_HHMMSS.md`。
|
||||
|
||||
同时会生成:
|
||||
|
||||
```text
|
||||
reports/multinode_nccl_pdf_matrix_YYYYMMDD_HHMMSS_artifacts/
|
||||
```
|
||||
|
||||
每个 case 保存完整 `*.cmd.txt`、`*.stdout.txt`、`*.stderr.txt` 和解析后的 `*.json`,用于复核原始 NCCL 输出。
|
||||
|
||||
默认输出目录为:
|
||||
|
||||
```text
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
"""Multi-node NCCL benchmark wrapper for nccl-tests via mpirun."""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
@ -36,6 +37,7 @@ class MultiNodeNCCLTest:
|
||||
self.cfg = config.get("multinode_nccl", {}) or {}
|
||||
self.tools_dir = resolve_tools_dir(config)
|
||||
self.console = Console()
|
||||
self.artifact_dir = os.environ.get("MULTINODE_NCCL_ARTIFACT_DIR") or self.cfg.get("artifact_dir")
|
||||
|
||||
def _find_mpirun(self) -> Optional[str]:
|
||||
configured = self.cfg.get("mpirun_path")
|
||||
@ -252,6 +254,7 @@ class MultiNodeNCCLTest:
|
||||
"hosts": hosts,
|
||||
"preflight": preflight,
|
||||
"tests": results,
|
||||
"artifact_dir": self.artifact_dir,
|
||||
"timestamp": datetime.now().isoformat(),
|
||||
}
|
||||
|
||||
@ -294,7 +297,7 @@ class MultiNodeNCCLTest:
|
||||
try:
|
||||
r = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout, env=self._runtime_env())
|
||||
except subprocess.TimeoutExpired:
|
||||
return {
|
||||
result = {
|
||||
"label": topo["label"],
|
||||
"nodes": nodes,
|
||||
"gpus_per_node": gpus_per_node,
|
||||
@ -305,6 +308,8 @@ class MultiNodeNCCLTest:
|
||||
"error": f"timeout after {timeout}s",
|
||||
"started_at": started,
|
||||
}
|
||||
self._write_artifacts(label, topo, result, "", "")
|
||||
return result
|
||||
|
||||
parsed = self._parse_nccl_output(r.stdout)
|
||||
net_diag = self._parse_network_diagnostics(r.stdout + "\n" + r.stderr)
|
||||
@ -312,7 +317,7 @@ class MultiNodeNCCLTest:
|
||||
wrong = sum(row.get("wrong", 0) for row in parsed["by_size"])
|
||||
has_bw = parsed["peak_busbw_gbps"] > 0
|
||||
status = "PASS" if r.returncode == 0 and has_bw and wrong == 0 and parsed["peak_busbw_gbps"] >= threshold else "FAIL"
|
||||
return {
|
||||
result = {
|
||||
"label": topo["label"],
|
||||
"nodes": nodes,
|
||||
"gpus_per_node": gpus_per_node,
|
||||
@ -335,6 +340,26 @@ class MultiNodeNCCLTest:
|
||||
"started_at": started,
|
||||
"finished_at": datetime.now().isoformat(),
|
||||
}
|
||||
self._write_artifacts(label, topo, result, r.stdout, r.stderr)
|
||||
return result
|
||||
|
||||
def _write_artifacts(self, label: str, topo: dict, result: dict, stdout: str, stderr: str):
    """Archive the raw output of one benchmark case next to the report.

    Four files sharing one prefix are written into ``self.artifact_dir``:
    ``<prefix>.cmd.txt`` (the ``"command"`` entry of *result*),
    ``<prefix>.stdout.txt``, ``<prefix>.stderr.txt`` and ``<prefix>.json``
    (a copy of *result* without the ``stdout_tail`` / ``stderr_tail`` keys).

    Args:
        label: Case label; combined with the topology fields to build the
            file-name prefix.
        topo: Topology description; ``nodes``, ``gpus_per_node`` and
            ``label`` are read with ``dict.get``.
        result: Result dict for the case. Mutated in place: on success
            ``result["artifact_prefix"]`` is set to the common path prefix,
            on failure ``result["artifact_error"]`` describes the error.
        stdout: Full captured standard output (``None`` is treated as empty).
        stderr: Full captured standard error (``None`` is treated as empty).

    Returns:
        None. Nothing is written when ``self.artifact_dir`` is unset/empty.
    """
    if not self.artifact_dir:
        return

    prefix = _safe_name(f"{label}_{topo.get('nodes')}x{topo.get('gpus_per_node')}_{topo.get('label')}")
    base = os.path.join(self.artifact_dir, prefix)

    # The tails are excerpts of the full streams archived below, so they are
    # left out of the JSON copy.
    artifact_result = {k: v for k, v in result.items() if k not in ("stdout_tail", "stderr_tail")}

    text_files = (
        (".cmd.txt", f"{result.get('command') or ''}\n"),
        (".stdout.txt", stdout or ""),
        (".stderr.txt", stderr or ""),
    )

    try:
        os.makedirs(self.artifact_dir, exist_ok=True)
        for suffix, text in text_files:
            # Explicit UTF-8 keeps the archive independent of the locale of
            # the machine that ran the benchmark.
            with open(base + suffix, "w", encoding="utf-8", errors="replace") as f:
                f.write(text)
        with open(base + ".json", "w", encoding="utf-8") as f:
            json.dump(artifact_result, f, indent=2, default=str)
    except OSError as exc:
        # Archiving is auxiliary: a full disk or a permission problem must
        # not discard the benchmark result that was already collected.
        result["artifact_error"] = f"{type(exc).__name__}: {exc}"
        return

    result["artifact_prefix"] = base
|
||||
|
||||
def _threshold_for(self, label: str, topo: dict = None) -> float:
|
||||
if topo and topo.get("min_peak_busbw_gbps") is not None:
|
||||
@ -485,3 +510,9 @@ def _format_gbps(value) -> str:
|
||||
if numeric.is_integer():
|
||||
return f"{numeric:.0f}"
|
||||
return f"{numeric:.2f}"
|
||||
|
||||
|
||||
def _safe_name(value: str) -> str:
|
||||
text = re.sub(r"[^A-Za-z0-9_.-]+", "_", value.strip())
|
||||
text = re.sub(r"_+", "_", text).strip("_")
|
||||
return text[:160] or "case"
|
||||
|
||||
@ -468,6 +468,8 @@ class ReportGenerator:
|
||||
if multinode and not multinode.get("error"):
|
||||
lines.append("## Multi-node NCCL / Cross Leaf\n")
|
||||
lines.append(f"Source: {multinode.get('source', 'unknown')} | Mode: {multinode.get('mode', 'unknown')}\n")
|
||||
if multinode.get("artifact_dir"):
|
||||
lines.append(f"- **Artifacts:** `{multinode.get('artifact_dir')}`")
|
||||
hosts = multinode.get("hosts", [])
|
||||
if hosts:
|
||||
host_text = ", ".join(f"{h.get('name') or h.get('addr')}({h.get('addr')})" for h in hosts)
|
||||
|
||||
@ -176,7 +176,7 @@ OUT_DIR=/root/test_gpu_scripts/reports/nccl_deep_diag_plugin_check_$(date +%Y%m%
|
||||
| `scripts/multinode_nccl_deep_diagnose.sh` | 可复跑诊断脚本 |
|
||||
| `scripts/nccl_environment_snapshot.sh` | 单节点 HCA/plugin/topo 快照脚本 |
|
||||
| `scripts/run_h100_single_node_all.sh` | 单节点原始 `test all` 报告入口 |
|
||||
| `scripts/run_multinode_nccl_pdf_matrix.sh` | 多机多卡 PDF 矩阵报告入口 |
|
||||
| `scripts/run_multinode_nccl_pdf_matrix.sh` | 多机多卡 PDF 矩阵报告入口;复跑时额外归档每个 case 的完整 `cmd/stdout/stderr/json` |
|
||||
| `configs/multinode_nccl_nccl227_pdf_matrix.yaml` | 多机多卡 PDF 矩阵配置 |
|
||||
|
||||
## 当前建议
|
||||
|
||||
@ -30,7 +30,7 @@
|
||||
| `scripts/multinode_nccl_deep_diagnose.sh` | 可复跑的多节点 NCCL 深度诊断脚本 |
|
||||
| `scripts/nccl_environment_snapshot.sh` | 单节点 NCCL/RDMA 环境等价性快照脚本,不启动 NCCL workload |
|
||||
| `scripts/run_h100_single_node_all.sh` | 单节点 H100 `test all` 原始报告入口,默认同时采环境快照 |
|
||||
| `scripts/run_multinode_nccl_pdf_matrix.sh` | 多机多卡 PDF 矩阵入口,跑 2 机 x 1/2/4/8 GPU per node 的 allreduce/alltoall |
|
||||
| `scripts/run_multinode_nccl_pdf_matrix.sh` | 多机多卡 PDF 矩阵入口,跑 2 机 x 1/2/4/8 GPU per node 的 allreduce/alltoall,并归档每个 case 的 command/stdout/stderr/parsed JSON |
|
||||
| `configs/multinode_nccl_nccl227_pdf_matrix.yaml` | 多机多卡 PDF 矩阵配置,固定 NCCL 2.27.7 和 `/data/nccl-tests-latest/build` |
|
||||
| `docs/multinode_nccl_deep_diagnose_runbook.md` | 诊断脚本中文 runbook |
|
||||
|
||||
@ -117,6 +117,14 @@ local copy: reports_multinode_nccl_pdf_matrix_20260523_112247.md
|
||||
summary: reports_multinode_nccl_pdf_matrix_run_20260523.md
|
||||
```
|
||||
|
||||
下一次用 `scripts/run_multinode_nccl_pdf_matrix.sh` 复跑时,还会生成:
|
||||
|
||||
```text
|
||||
/root/test_gpu_scripts/reports/multinode_nccl_pdf_matrix_YYYYMMDD_HHMMSS_artifacts/
|
||||
```
|
||||
|
||||
目录内按 case 保存完整 `cmd/stdout/stderr/json`,用于给网络/硬件侧复核原始 NCCL 输出。
|
||||
|
||||
## 当前证据摘要
|
||||
|
||||
### HCA / rail
|
||||
|
||||
@ -92,6 +92,7 @@ TS="$(date +%Y%m%d_%H%M%S)"
|
||||
mkdir -p "$OUT_DIR"
|
||||
|
||||
REPORT_FILE="$OUT_DIR/multinode_nccl_pdf_matrix_${TS}.${FORMAT}"
|
||||
ARTIFACT_DIR="$OUT_DIR/multinode_nccl_pdf_matrix_${TS}_artifacts"
|
||||
PREFLIGHT_CMD=(bash "$PROJECT_DIR/scripts/multinode_nccl_deep_diagnose.sh" preflight)
|
||||
MATRIX_CMD=(
|
||||
"$PYTHON_BIN" "$PROJECT_DIR/gpu_tester.py"
|
||||
@ -105,6 +106,7 @@ MATRIX_CMD=(
|
||||
echo "Project: $PROJECT_DIR"
|
||||
echo "Config: $CONFIG_FILE"
|
||||
echo "Report: $REPORT_FILE"
|
||||
echo "Artifacts: $ARTIFACT_DIR"
|
||||
echo "Matrix: 2 nodes x {1,2,4,8} GPUs per node; all_reduce_perf + alltoall_perf; 16G"
|
||||
|
||||
if ((DRY_RUN)); then
|
||||
@ -117,6 +119,7 @@ if ((DRY_RUN)); then
|
||||
exit 0
|
||||
fi
|
||||
printf 'DRY RUN matrix:'
|
||||
printf ' MULTINODE_NCCL_ARTIFACT_DIR=%q' "$ARTIFACT_DIR"
|
||||
printf ' %q' "${MATRIX_CMD[@]}"
|
||||
printf '\n'
|
||||
exit 0
|
||||
@ -135,8 +138,10 @@ if ((PREFLIGHT_ONLY)); then
|
||||
exit 0
|
||||
fi
|
||||
|
||||
"${MATRIX_CMD[@]}"
|
||||
mkdir -p "$ARTIFACT_DIR"
|
||||
MULTINODE_NCCL_ARTIFACT_DIR="$ARTIFACT_DIR" "${MATRIX_CMD[@]}"
|
||||
status=$?
|
||||
|
||||
echo "Report written to: $REPORT_FILE"
|
||||
echo "Artifacts written to: $ARTIFACT_DIR"
|
||||
exit "$status"
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user