Add multi-node NCCL sweep test

2026-05-23 13:03:26 +08:00 · 2026-05-23 13:03:26 +08:00 · ac91f1aeb5
commit ac91f1aeb5
parent 2a51be1ba3
7 changed files with 1055 additions and 12 deletions
--- a/README.md
+++ b/README.md
@ -375,6 +375,27 @@ nccl:
  repeats: 3
  max_stddev_pct: 3

+multinode_nccl:
+  enabled: false                        # true 时纳入 --test all
+  hosts:
+    - {name: nccl-gpu-1, addr: 172.72.8.12, slots: 8}
+    - {name: nccl-gpu-2, addr: 172.72.8.16, slots: 8}
+  tests: [all_reduce_perf, alltoall_perf]
+  topologies:
+    - {nodes: 2, gpus_per_node: 8}
+  mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun
+  extra_ld_library_path:                # 传给远端 rank 的 MPI/NCCL/CUDA 库路径
+    - /usr/mpi/gcc/openmpi-4.1.9a1/lib
+    - /root/gpu-test-venv/lib/python3.10/site-packages/nvidia/nccl/lib
+    - /usr/local/cuda-12.4/targets/x86_64-linux/lib
+  begin_size: 1k
+  end_size: 16g
+  step_factor: 2
+  warmup_iters: 10
+  socket_ifname: bond0
+  ib_gid_index: 3
+  ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7
+
 stress:
  duration_sec: 1800                   # 压力测试时长
  use_gpu_burn: false                  # 默认走 PyTorch GEMM stress
@ -539,16 +560,14 @@ report:
 └── 异常: 检查 IB 线缆、交换机配置、子网管理器

 步骤 3: 多节点 NCCL 测试
-├── 在每个节点上配置:
-│   export MASTER_ADDR=<主节点IP>
-│   export MASTER_PORT=29500
-│   export NCCL_SOCKET_IFNAME=ib0    # IB 网卡名
-│   export NCCL_DEBUG=INFO
-├── 运行 nccl-tests 手动测试:
-│   mpirun -np <总GPU数> -hostfile hosts \
-│     /opt/gpu-test-tools/nccl-tests/build/all_reduce_perf \
-│     -b 8 -e 256M -f 2 -g 1 -w 5 -n 20
-└── 确认: 多节点 AllReduce 带宽正常
+├── 在发起节点确认 mpirun、nccl-tests、跨节点 root SSH 可用
+├── 配置 configs/default.yaml 的 multinode_nccl.hosts / IB 参数
+├── 执行 PDF 风格 sweep:
+│   python3 gpu_tester.py --test multinode-nccl --report --format md
+├── 默认命令口径:
+│   mpirun -H <node1>:8,<node2>:8 --map-by ppr:8:node -np 16 \
+│     all_reduce_perf/alltoall_perf -b 1k -e 16g -f 2 -g 1 -w 10
+└── 确认: Peak Bus BW、Peak Size、wrong_count 正常

 步骤 4: 训练验证
 ├── python3 gpu_tester.py --test training
--- a/configs/default.yaml
+++ b/configs/default.yaml
@ -41,6 +41,52 @@ nccl:
  repeats: 3
  max_stddev_pct: 3

+multinode_nccl:
+  enabled: false
+  mode: sweep
+  hosts:
+    - name: nccl-gpu-1
+      addr: 172.72.8.12
+      slots: 8
+    - name: nccl-gpu-2
+      addr: 172.72.8.16
+      slots: 8
+  ssh_user: root
+  ssh_preflight: true
+  mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun
+  mpi_ld_preload: null
+  extra_ld_library_path:
+    - /usr/mpi/gcc/openmpi-4.1.9a1/lib
+    - /root/gpu-test-venv/lib/python3.10/site-packages/nvidia/nccl/lib
+    - /usr/local/cuda-12.4/targets/x86_64-linux/lib
+  nccl_tests_dir: null  # null = tools.install_dir/nccl-tests/build
+  tests:
+    - all_reduce_perf
+    - alltoall_perf
+  topologies:
+    - nodes: 2
+      gpus_per_node: 8
+  begin_size: 1k
+  end_size: 16g
+  step_factor: 2
+  warmup_iters: 10
+  gpus_per_rank: 1
+  timeout_sec: 1800
+  socket_ifname: bond0
+  ib_gid_index: 3
+  ib_sl: 5
+  ib_tc: 136
+  ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7
+  ib_timeout: 22
+  qps_per_connection: 4
+  min_nchannels: 4
+  net_plugin: none
+  nvls_enable: 1
+  split_data_on_qps: 1
+  min_peak_busbw_gbps:
+    allreduce: 480
+    alltoall: 75
+
 stress:
  duration_sec: 1800
  production_duration_sec: 1800
--- a/gpu_tester.py
+++ b/gpu_tester.py
@ -28,6 +28,7 @@ from modules.stress_test import StressTest
 from modules.rdma_test import RDMATest
 from modules.nvlink_test import NVLinkTest
 from modules.dcgm_test import DCGMTest
+from modules.multinode_nccl_test import MultiNodeNCCLTest
 from modules.report import ReportGenerator
 from modules.gpu_specs import detect_gpu_type, get_gpu_specs, get_gpu_label, get_supported_gpus, validate_driver_compatibility

@ -55,6 +56,44 @@ DEFAULT_CONFIG = {
        "repeats": 3,
        "max_stddev_pct": 3,
    },
+    "multinode_nccl": {
+        "enabled": False,
+        "mode": "sweep",
+        "hosts": [
+            {"name": "nccl-gpu-1", "addr": "172.72.8.12", "slots": 8},
+            {"name": "nccl-gpu-2", "addr": "172.72.8.16", "slots": 8},
+        ],
+        "ssh_user": "root",
+        "ssh_preflight": True,
+        "mpirun_path": "/usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun",
+        "mpi_ld_preload": None,
+        "extra_ld_library_path": [
+            "/usr/mpi/gcc/openmpi-4.1.9a1/lib",
+            "/root/gpu-test-venv/lib/python3.10/site-packages/nvidia/nccl/lib",
+            "/usr/local/cuda-12.4/targets/x86_64-linux/lib",
+        ],
+        "nccl_tests_dir": None,
+        "tests": ["all_reduce_perf", "alltoall_perf"],
+        "topologies": [{"nodes": 2, "gpus_per_node": 8}],
+        "begin_size": "1k",
+        "end_size": "16g",
+        "step_factor": 2,
+        "warmup_iters": 10,
+        "gpus_per_rank": 1,
+        "timeout_sec": 1800,
+        "socket_ifname": "bond0",
+        "ib_gid_index": 3,
+        "ib_sl": 5,
+        "ib_tc": 136,
+        "ib_hca": "mlx5_0,mlx5_1,mlx5_6,mlx5_7",
+        "ib_timeout": 22,
+        "qps_per_connection": 4,
+        "min_nchannels": 4,
+        "net_plugin": "none",
+        "nvls_enable": 1,
+        "split_data_on_qps": 1,
+        "min_peak_busbw_gbps": {"allreduce": 480, "alltoall": 75},
+    },
    "stress": {
        "duration_sec": 1800,
        "production_duration_sec": 1800,
@ -191,7 +230,8 @@ def interactive_menu(config: dict):
        ("8", "NVLink/NVSwitch Test", "nvlink"),
        ("9", "DCGM Diagnostic", "dcgm"),
        ("10", "Training Simulation", "training"),
-        ("11", "Full Test Suite (All Tests)", "all"),
+        ("11", "Multi-node NCCL Test", "multinode_nccl"),
+        ("12", "Full Test Suite (All Tests)", "all"),
        ("0", "Generate Report", "report"),
    ]

@ -218,6 +258,7 @@ def interactive_menu(config: dict):
            "nvlink": "NVLink links, speed, and error counters",
            "dcgm": "DCGM diag -r 3 production diagnostic",
            "training": "Simulate LLM training with PyTorch",
+            "multinode_nccl": "Cross-node NCCL via mpirun/nccl-tests",
            "all": "Run all tests sequentially",
            "report": "Export results to JSON/HTML",
        }
@ -326,6 +367,12 @@ def _run_test(test_name: str, config: dict, console: Console) -> dict:
            m.print_results(result)
            return result

+        elif test_name == "multinode_nccl":
+            m = MultiNodeNCCLTest(config)
+            result = m.run()
+            m.print_results(result)
+            return result
+
        elif test_name == "all":
            return _run_full_suite(config, console)

@ -356,6 +403,8 @@ def _run_full_suite(config: dict, console: Console) -> dict:
        ("dcgm", "DCGM Diagnostic", DCGMTest),
        ("training", "Training Simulation", TrainingSim),
    ]
+    if (config.get("multinode_nccl", {}) or {}).get("enabled"):
+        tests.append(("multinode_nccl", "Multi-node NCCL Test", MultiNodeNCCLTest))

    for i, (key, name, mod_cls) in enumerate(tests, 1):
        console.print(f"\n[bold cyan][{i}/{len(tests)}] {name}[/bold cyan]")
@ -435,6 +484,7 @@ Examples:
   python gpu_tester.py --test benchmark --type memory
   python gpu_tester.py --test benchmark --type compute --dtype fp16
   python gpu_tester.py --test nccl            # NCCL test
+   python gpu_tester.py --test multinode-nccl  # Cross-node NCCL test
   python gpu_tester.py --test nvlink          # NVLink/NVSwitch test
   python gpu_tester.py --test dcgm            # DCGM diagnostic
   python gpu_tester.py --test training        # Training sim
@ -442,7 +492,7 @@ Examples:
   python gpu_tester.py --report --format json --output report.json
        """,
    )
-    parser.add_argument("--test", choices=["gpu-info", "health", "benchmark", "nccl", "stress", "rdma", "nvlink", "dcgm", "training", "all"],
+    parser.add_argument("--test", choices=["gpu-info", "health", "benchmark", "nccl", "multinode-nccl", "stress", "rdma", "nvlink", "dcgm", "training", "all"],
                        help="Run a specific test")
    parser.add_argument("--type", choices=["memory", "compute"], help="Benchmark type (with --test benchmark)")
    parser.add_argument("--dtype", choices=["fp32", "tf32", "fp16", "bf16", "fp8", "fp64", "int8"],
@ -499,6 +549,7 @@ Examples:
        "health": "health",
        "benchmark": None,
        "nccl": "nccl",
+        "multinode-nccl": "multinode_nccl",
        "stress": "stress",
        "rdma": "rdma",
        "nvlink": "nvlink",
--- a/modules/multinode_nccl_test.py
+++ b/modules/multinode_nccl_test.py
@ -0,0 +1,388 @@
+"""Multi-node NCCL benchmark wrapper for nccl-tests via mpirun."""
+
+import os
+import re
+import shutil
+import subprocess
+from datetime import datetime
+from typing import Optional
+
+from rich.console import Console
+from rich.table import Table
+
+from modules.gpu_specs import resolve_tools_dir
+
+
+# Accepted spellings for entries of the ``tests`` config list, mapped to the
+# nccl-tests binary name. Lookups are done on the lower-cased entry; names that
+# are not listed here are ignored by the test resolver.
+_TEST_ALIASES = {
+    "allreduce": "all_reduce_perf",
+    "all_reduce": "all_reduce_perf",
+    "all_reduce_perf": "all_reduce_perf",
+    "alltoall": "alltoall_perf",
+    "all_to_all": "alltoall_perf",
+    "alltoall_perf": "alltoall_perf",
+}
+
+# Short operation label for each nccl-tests binary. The label is used as the
+# key of the per-operation results and of the ``min_peak_busbw_gbps`` table.
+_OP_LABELS = {
+    "all_reduce_perf": "allreduce",
+    "alltoall_perf": "alltoall",
+}
+
+
+class MultiNodeNCCLTest:
+    """Run cross-node NCCL tests with a PDF-style message-size sweep."""
+
+    def __init__(self, config: dict):
+        self.config = config
+        self.cfg = config.get("multinode_nccl", {}) or {}
+        self.tools_dir = resolve_tools_dir(config)
+        self.console = Console()
+
+    def _find_mpirun(self) -> Optional[str]:
+        configured = self.cfg.get("mpirun_path")
+        if configured and os.path.isfile(str(configured)) and os.access(str(configured), os.X_OK):
+            return str(configured)
+        for cmd in ["mpirun", "mpiexec", os.path.join(self.tools_dir, "mpi", "bin", "mpirun")]:
+            found = shutil.which(cmd)
+            if found:
+                return found
+        return None
+
+    def _find_nccl_test(self, binary_name: str) -> Optional[str]:
+        configured = self.cfg.get("nccl_tests_dir")
+        candidates = []
+        if configured:
+            candidates.append(os.path.join(configured, binary_name))
+        candidates.append(os.path.join(self.tools_dir, "nccl-tests", "build", binary_name))
+        found = shutil.which(binary_name)
+        if found:
+            candidates.insert(0, found)
+
+        for path in candidates:
+            if path and os.path.isfile(path) and os.access(path, os.X_OK):
+                return path
+        return None
+
+    def _tests(self) -> list[str]:
+        configured = self.cfg.get("tests") or ["all_reduce_perf", "alltoall_perf"]
+        tests = []
+        for name in configured:
+            binary = _TEST_ALIASES.get(str(name).lower())
+            if binary and binary not in tests:
+                tests.append(binary)
+        return tests
+
+    def _hosts(self) -> list[dict]:
+        hosts = self.cfg.get("hosts") or []
+        normalized = []
+        for host in hosts:
+            if isinstance(host, str):
+                normalized.append({"addr": host, "slots": 8})
+            elif isinstance(host, dict):
+                normalized.append({
+                    "name": host.get("name") or host.get("addr"),
+                    "addr": host.get("addr") or host.get("host") or host.get("ip"),
+                    "slots": int(host.get("slots", 8)),
+                })
+        return [h for h in normalized if h.get("addr")]
+
+    def _topologies(self) -> list[dict]:
+        topologies = self.cfg.get("topologies") or [{"nodes": 2, "gpus_per_node": 8}]
+        normalized = []
+        for topo in topologies:
+            nodes = int(topo.get("nodes", 2))
+            gpus_per_node = int(topo.get("gpus_per_node", topo.get("gpn", 8)))
+            normalized.append({
+                "nodes": nodes,
+                "gpus_per_node": gpus_per_node,
+                "label": topo.get("label") or f"{nodes} nodes x {gpus_per_node} GPUs",
+            })
+        return normalized
+
+    def _env_exports(self) -> list[tuple[str, str]]:
+        env_cfg = {
+            "NCCL_DEBUG": self.cfg.get("debug", "WARN"),
+            "NCCL_SOCKET_IFNAME": self.cfg.get("socket_ifname"),
+            "NCCL_IB_GID_INDEX": self.cfg.get("ib_gid_index"),
+            "NCCL_IB_SL": self.cfg.get("ib_sl"),
+            "NCCL_IB_TC": self.cfg.get("ib_tc"),
+            "NCCL_IB_HCA": self.cfg.get("ib_hca"),
+            "NCCL_IB_TIMEOUT": self.cfg.get("ib_timeout"),
+            "NCCL_IB_QPS_PER_CONNECTION": self.cfg.get("qps_per_connection"),
+            "NCCL_MIN_NCHANNELS": self.cfg.get("min_nchannels"),
+            "NCCL_NET_PLUGIN": self.cfg.get("net_plugin"),
+            "NCCL_NVLS_ENABLE": self.cfg.get("nvls_enable"),
+            "NCCL_IB_SPLIT_DATA_ON_QPS": self.cfg.get("split_data_on_qps"),
+        }
+        mpi_ld_preload = self._mpi_ld_preload()
+        if mpi_ld_preload:
+            env_cfg["LD_PRELOAD"] = mpi_ld_preload
+        extra_ld_library_path = self._extra_ld_library_path()
+        if extra_ld_library_path:
+            existing = os.environ.get("LD_LIBRARY_PATH", "")
+            env_cfg["LD_LIBRARY_PATH"] = ":".join(
+                [extra_ld_library_path] + ([existing] if existing else [])
+            )
+        return [(k, str(v)) for k, v in env_cfg.items() if v is not None]
+
+    def _mpi_ld_preload(self) -> str:
+        preload = self.cfg.get("mpi_ld_preload")
+        if isinstance(preload, list):
+            return " ".join(str(p) for p in preload if p)
+        return str(preload) if preload else ""
+
+    def _runtime_env(self) -> dict:
+        env = os.environ.copy()
+        mpi_ld_preload = self._mpi_ld_preload()
+        if mpi_ld_preload:
+            env["LD_PRELOAD"] = mpi_ld_preload
+        extra_ld_library_path = self._extra_ld_library_path()
+        if extra_ld_library_path:
+            existing = env.get("LD_LIBRARY_PATH", "")
+            env["LD_LIBRARY_PATH"] = ":".join(
+                [extra_ld_library_path] + ([existing] if existing else [])
+            )
+        return env
+
+    def _extra_ld_library_path(self) -> str:
+        paths = self.cfg.get("extra_ld_library_path")
+        if isinstance(paths, list):
+            return ":".join(str(p) for p in paths if p)
+        return str(paths) if paths else ""
+
+    def _preflight(self, mpirun: Optional[str], tests: list[str], hosts: list[dict]) -> dict:
+        checks = []
+        checks.append({"name": "mpirun", "status": "PASS" if mpirun else "FAIL", "detail": mpirun or "not found"})
+        checks.append({"name": "hosts", "status": "PASS" if len(hosts) >= 2 else "FAIL", "detail": f"{len(hosts)} configured"})
+        for binary in tests:
+            path = self._find_nccl_test(binary)
+            checks.append({"name": binary, "status": "PASS" if path else "FAIL", "detail": path or "not found"})
+
+        if self.cfg.get("ssh_preflight", True):
+            user = self.cfg.get("ssh_user", "root")
+            for host in hosts:
+                target = f"{user}@{host['addr']}"
+                cmd = ["ssh", "-o", "BatchMode=yes", "-o", "ConnectTimeout=5", target, "hostname"]
+                try:
+                    r = subprocess.run(cmd, capture_output=True, text=True, timeout=8, env=self._runtime_env())
+                    detail = r.stdout.strip() or r.stderr.strip()[:120]
+                    checks.append({
+                        "name": f"ssh {host['addr']}",
+                        "status": "PASS" if r.returncode == 0 else "WARN",
+                        "detail": detail,
+                    })
+                except Exception as e:
+                    checks.append({"name": f"ssh {host['addr']}", "status": "WARN", "detail": str(e)})
+
+        return {
+            "checks": checks,
+            "passed": all(c["status"] == "PASS" for c in checks if not c["name"].startswith("ssh ")),
+        }
+
+    def run(self) -> dict:
+        mpirun = self._find_mpirun()
+        tests = self._tests()
+        hosts = self._hosts()
+        topologies = self._topologies()
+        preflight = self._preflight(mpirun, tests, hosts)
+
+        if not preflight["passed"]:
+            return {
+                "passed": False,
+                "source": "nccl-tests-mpirun",
+                "mode": self.cfg.get("mode", "sweep"),
+                "hosts": hosts,
+                "preflight": preflight,
+                "tests": {},
+                "error": "multinode NCCL preflight failed",
+                "timestamp": datetime.now().isoformat(),
+            }
+
+        results = {}
+        for binary in tests:
+            label = _OP_LABELS[binary]
+            binary_path = self._find_nccl_test(binary)
+            op_results = []
+            for topo in topologies:
+                op_results.append(self._run_topology(mpirun, binary_path, label, hosts, topo))
+            results[label] = {"binary": binary_path, "topologies": op_results}
+
+        passed = all(
+            topo.get("status") == "PASS"
+            for op in results.values()
+            for topo in op.get("topologies", [])
+        )
+        return {
+            "passed": passed,
+            "source": "nccl-tests-mpirun",
+            "mode": self.cfg.get("mode", "sweep"),
+            "hosts": hosts,
+            "preflight": preflight,
+            "tests": results,
+            "timestamp": datetime.now().isoformat(),
+        }
+
+    def _run_topology(self, mpirun: str, binary: str, label: str, hosts: list[dict], topo: dict) -> dict:
+        nodes = topo["nodes"]
+        gpus_per_node = topo["gpus_per_node"]
+        selected_hosts = hosts[:nodes]
+        host_arg = ",".join(f"{h['addr']}:{gpus_per_node}" for h in selected_hosts)
+        ranks = nodes * gpus_per_node
+
+        cmd = [
+            mpirun,
+            "--allow-run-as-root",
+            "--mca", "btl_openib_warn_no_device_params_found", "0",
+            "--mca", "btl_tcp_if_include", str(self.cfg.get("socket_ifname", "bond0")),
+            "-H", host_arg,
+            "--map-by", f"ppr:{gpus_per_node}:node",
+            "-np", str(ranks),
+        ]
+        for key, value in self._env_exports():
+            cmd.extend(["-x", f"{key}={value}"])
+
+        cmd.extend([
+            binary,
+            "-b", str(self.cfg.get("begin_size", "1k")),
+            "-e", str(self.cfg.get("end_size", "16g")),
+            "-g", str(self.cfg.get("gpus_per_rank", 1)),
+            "-f", str(self.cfg.get("step_factor", 2)),
+            "-w", str(self.cfg.get("warmup_iters", 10)),
+        ])
+        if self.cfg.get("iters") is not None:
+            cmd.extend(["-n", str(self.cfg["iters"])])
+
+        timeout = int(self.cfg.get("timeout_sec", 1800))
+        started = datetime.now().isoformat()
+        try:
+            r = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout, env=self._runtime_env())
+        except subprocess.TimeoutExpired:
+            return {
+                "label": topo["label"],
+                "nodes": nodes,
+                "gpus_per_node": gpus_per_node,
+                "ranks": ranks,
+                "hosts": selected_hosts,
+                "command": " ".join(cmd),
+                "status": "FAIL",
+                "error": f"timeout after {timeout}s",
+                "started_at": started,
+            }
+
+        parsed = self._parse_nccl_output(r.stdout)
+        threshold = self._threshold_for(label)
+        wrong = sum(row.get("wrong", 0) for row in parsed["by_size"])
+        has_bw = parsed["peak_busbw_gbps"] > 0
+        status = "PASS" if r.returncode == 0 and has_bw and wrong == 0 and parsed["peak_busbw_gbps"] >= threshold else "FAIL"
+        return {
+            "label": topo["label"],
+            "nodes": nodes,
+            "gpus_per_node": gpus_per_node,
+            "ranks": ranks,
+            "hosts": selected_hosts,
+            "command": " ".join(cmd),
+            "returncode": r.returncode,
+            "status": status,
+            "peak_busbw_gbps": parsed["peak_busbw_gbps"],
+            "peak_algbw_gbps": parsed["peak_algbw_gbps"],
+            "peak_size": parsed["peak_size"],
+            "avg_busbw_gbps": parsed["avg_busbw_gbps"],
+            "min_required_gbps": threshold,
+            "wrong_count": wrong,
+            "by_size": parsed["by_size"],
+            "stderr_tail": r.stderr[-1200:],
+            "stdout_tail": r.stdout[-1200:],
+            "started_at": started,
+            "finished_at": datetime.now().isoformat(),
+        }
+
+    def _threshold_for(self, label: str) -> float:
+        thresholds = self.cfg.get("min_peak_busbw_gbps") or {}
+        if isinstance(thresholds, dict):
+            return float(thresholds.get(label, 0) or 0)
+        return float(thresholds or 0)
+
+    @staticmethod
+    def _parse_nccl_output(stdout: str) -> dict:
+        rows = []
+        avg_bus = 0.0
+        for line in stdout.splitlines():
+            stripped = line.strip()
+            if not stripped:
+                continue
+            avg_match = re.search(r"Avg bus bandwidth\s*:\s*([0-9.]+)", stripped)
+            if avg_match:
+                avg_bus = float(avg_match.group(1))
+                continue
+            if stripped.startswith("#"):
+                continue
+            parts = stripped.split()
+            if len(parts) < 9:
+                continue
+            try:
+                size_bytes = int(parts[0])
+                time_us = float(parts[5])
+                algbw = float(parts[6])
+                busbw = float(parts[7])
+                wrong = int(parts[8])
+            except (ValueError, IndexError):
+                continue
+            rows.append({
+                "size_bytes": size_bytes,
+                "size": _format_size(size_bytes),
+                "time_us": time_us,
+                "algbw_gbps": algbw,
+                "busbw_gbps": busbw,
+                "wrong": wrong,
+            })
+
+        peak_row = max(rows, key=lambda r: r["busbw_gbps"], default={})
+        return {
+            "peak_busbw_gbps": round(float(peak_row.get("busbw_gbps", 0)), 2),
+            "peak_algbw_gbps": round(float(peak_row.get("algbw_gbps", 0)), 2),
+            "peak_size": peak_row.get("size", ""),
+            "avg_busbw_gbps": round(avg_bus, 2),
+            "by_size": rows,
+        }
+
+    @staticmethod
+    def print_results(results: dict, console: Console = None):
+        c = console or Console()
+        if results.get("error"):
+            c.print(f"[bold red]Multi-node NCCL failed: {results['error']}[/bold red]")
+        else:
+            c.print("[bold green]Multi-node NCCL complete[/bold green]" if results.get("passed") else "[bold red]Multi-node NCCL failed[/bold red]")
+
+        preflight = results.get("preflight", {})
+        if preflight.get("checks"):
+            table = Table(title="Preflight")
+            table.add_column("Check")
+            table.add_column("Status")
+            table.add_column("Detail")
+            for check in preflight["checks"]:
+                table.add_row(check["name"], check["status"], str(check.get("detail", "")))
+            c.print(table)
+
+        for op, data in (results.get("tests") or {}).items():
+            table = Table(title=f"Multi-node NCCL {op}")
+            table.add_column("Topology")
+            table.add_column("Peak Bus BW")
+            table.add_column("Peak Size")
+            table.add_column("Threshold")
+            table.add_column("Status")
+            for topo in data.get("topologies", []):
+                table.add_row(
+                    topo.get("label", ""),
+                    f"{topo.get('peak_busbw_gbps', 0):.2f} GB/s",
+                    str(topo.get("peak_size", "")),
+                    f">= {topo.get('min_required_gbps', 0):.0f} GB/s" if topo.get("min_required_gbps") else "-",
+                    topo.get("status", "?"),
+                )
+            c.print(table)
+
+
+def _format_size(size_bytes: int) -> str:
+    units = [("G", 1024 ** 3), ("M", 1024 ** 2), ("K", 1024)]
+    for suffix, factor in units:
+        if size_bytes >= factor and size_bytes % factor == 0:
+            return f"{size_bytes // factor}{suffix}"
+    return str(size_bytes)
--- a/modules/report.py
+++ b/modules/report.py
@ -464,6 +464,47 @@ class ReportGenerator:
            passed = nccl.get("passed", False)
            lines.append(f"**Overall: {'PASS' if passed else 'FAIL'}**\n")

+        multinode = results.get("multinode_nccl")
+        if multinode and not multinode.get("error"):
+            lines.append("## Multi-node NCCL / Cross Leaf\n")
+            lines.append(f"Source: {multinode.get('source', 'unknown')} | Mode: {multinode.get('mode', 'unknown')}\n")
+            hosts = multinode.get("hosts", [])
+            if hosts:
+                host_text = ", ".join(f"{h.get('name') or h.get('addr')}({h.get('addr')})" for h in hosts)
+                lines.append(f"- **Hosts:** {host_text}")
+            preflight = multinode.get("preflight", {})
+            if preflight.get("checks"):
+                failed_checks = [c for c in preflight["checks"] if c.get("status") == "FAIL"]
+                warn_checks = [c for c in preflight["checks"] if c.get("status") == "WARN"]
+                lines.append(f"- **Preflight:** {'PASS' if not failed_checks else 'FAIL'}"
+                             f"{f' ({len(warn_checks)} warnings)' if warn_checks else ''}")
+            lines.append("")
+            for op, data in (multinode.get("tests") or {}).items():
+                lines.append(f"### Multi-node NCCL {op}\n")
+                lines.append("| Topology | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |")
+                lines.append("|----------|-------------|-----------|------------|-----------|--------|")
+                for topo in data.get("topologies", []):
+                    threshold = topo.get("min_required_gbps", 0) or 0
+                    threshold_text = f">= {threshold:.0f} GB/s" if threshold else "-"
+                    lines.append(
+                        f"| {topo.get('label', '')} | {topo.get('peak_busbw_gbps', 0):.2f} GB/s | "
+                        f"{topo.get('peak_size', '')} | {topo.get('avg_busbw_gbps', 0):.2f} GB/s | "
+                        f"{threshold_text} | {topo.get('status', '?')} |"
+                    )
+                lines.append("")
+            lines.append(f"**Overall: {'PASS' if multinode.get('passed') else 'FAIL'}**\n")
+        elif multinode and multinode.get("error"):
+            lines.append("## Multi-node NCCL / Cross Leaf\n")
+            lines.append(f"**Overall: FAIL** ({multinode.get('error')})\n")
+            preflight = multinode.get("preflight", {})
+            if preflight.get("checks"):
+                lines.append("| Check | Status | Detail |")
+                lines.append("|-------|--------|--------|")
+                for check in preflight["checks"]:
+                    detail = str(check.get("detail", "")).replace("\n", " ")
+                    lines.append(f"| {check.get('name', '')} | {check.get('status', '')} | {detail} |")
+                lines.append("")
+
        # --- Stress Test ---
        stress = results.get("stress")
        if stress and not stress.get("error"):
@ -836,6 +877,15 @@ class ReportGenerator:
            else:
                items.append(("NCCL", "FAIL"))

+        if "multinode_nccl" in results:
+            mn = results["multinode_nccl"]
+            if mn.get("error"):
+                items.append(("Multi-node NCCL", f"ERROR: {mn['error']}"))
+            elif mn.get("passed"):
+                items.append(("Multi-node NCCL", "PASS"))
+            else:
+                items.append(("Multi-node NCCL", "FAIL"))
+
        # Stress
        if "stress" in results:
            s = results["stress"]
--- a/reports_multinode_nccl_smoke_256m_aikubeworker0012.json
+++ b/reports_multinode_nccl_smoke_256m_aikubeworker0012.json
@ -0,0 +1,439 @@
+{
+  "multinode_nccl": {
+    "passed": false,
+    "source": "nccl-tests-mpirun",
+    "mode": "sweep",
+    "hosts": [
+      {
+        "name": "nccl-gpu-1",
+        "addr": "172.72.8.12",
+        "slots": 8
+      },
+      {
+        "name": "nccl-gpu-2",
+        "addr": "172.72.8.16",
+        "slots": 8
+      }
+    ],
+    "preflight": {
+      "checks": [
+        {
+          "name": "mpirun",
+          "status": "PASS",
+          "detail": "/usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun"
+        },
+        {
+          "name": "hosts",
+          "status": "PASS",
+          "detail": "2 configured"
+        },
+        {
+          "name": "all_reduce_perf",
+          "status": "PASS",
+          "detail": "/opt/gpu-test-tools/nccl-tests/build/all_reduce_perf"
+        },
+        {
+          "name": "alltoall_perf",
+          "status": "PASS",
+          "detail": "/opt/gpu-test-tools/nccl-tests/build/alltoall_perf"
+        },
+        {
+          "name": "ssh 172.72.8.12",
+          "status": "WARN",
+          "detail": "Host key verification failed."
+        },
+        {
+          "name": "ssh 172.72.8.16",
+          "status": "PASS",
+          "detail": "aikubeworker0016"
+        }
+      ],
+      "passed": true
+    },
+    "tests": {
+      "allreduce": {
+        "binary": "/opt/gpu-test-tools/nccl-tests/build/all_reduce_perf",
+        "topologies": [
+          {
+            "label": "2 nodes x 8 GPUs",
+            "nodes": 2,
+            "gpus_per_node": 8,
+            "ranks": 16,
+            "hosts": [
+              {
+                "name": "nccl-gpu-1",
+                "addr": "172.72.8.12",
+                "slots": 8
+              },
+              {
+                "name": "nccl-gpu-2",
+                "addr": "172.72.8.16",
+                "slots": 8
+              }
+            ],
+            "command": "/usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun --allow-run-as-root --mca btl_openib_warn_no_device_params_found 0 --mca btl_tcp_if_include bond0 -H 172.72.8.12:8,172.72.8.16:8 --map-by ppr:8:node -np 16 -x NCCL_DEBUG=WARN -x NCCL_SOCKET_IFNAME=bond0 -x NCCL_IB_GID_INDEX=3 -x NCCL_IB_SL=5 -x NCCL_IB_TC=136 -x NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7 -x NCCL_IB_TIMEOUT=22 -x NCCL_IB_QPS_PER_CONNECTION=4 -x NCCL_MIN_NCHANNELS=4 -x NCCL_NET_PLUGIN=none -x NCCL_NVLS_ENABLE=1 -x NCCL_IB_SPLIT_DATA_ON_QPS=1 -x LD_LIBRARY_PATH=/usr/mpi/gcc/openmpi-4.1.9a1/lib:/root/gpu-test-venv/lib/python3.10/site-packages/nvidia/nccl/lib:/usr/local/cuda-12.4/targets/x86_64-linux/lib /opt/gpu-test-tools/nccl-tests/build/all_reduce_perf -b 1k -e 256M -g 1 -f 2 -w 2",
+            "returncode": 0,
+            "status": "FAIL",
+            "peak_busbw_gbps": 39.32,
+            "peak_algbw_gbps": 20.97,
+            "peak_size": "4M",
+            "avg_busbw_gbps": 9.1,
+            "min_required_gbps": 100.0,
+            "wrong_count": 0,
+            "by_size": [
+              {
+                "size_bytes": 1024,
+                "size": "1K",
+                "time_us": 80.32,
+                "algbw_gbps": 0.01,
+                "busbw_gbps": 0.02,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 2048,
+                "size": "2K",
+                "time_us": 35.79,
+                "algbw_gbps": 0.06,
+                "busbw_gbps": 0.11,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 4096,
+                "size": "4K",
+                "time_us": 37.49,
+                "algbw_gbps": 0.11,
+                "busbw_gbps": 0.2,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 8192,
+                "size": "8K",
+                "time_us": 40.32,
+                "algbw_gbps": 0.2,
+                "busbw_gbps": 0.38,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 16384,
+                "size": "16K",
+                "time_us": 43.04,
+                "algbw_gbps": 0.38,
+                "busbw_gbps": 0.71,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 32768,
+                "size": "32K",
+                "time_us": 43.32,
+                "algbw_gbps": 0.76,
+                "busbw_gbps": 1.42,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 65536,
+                "size": "64K",
+                "time_us": 47.45,
+                "algbw_gbps": 1.38,
+                "busbw_gbps": 2.59,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 131072,
+                "size": "128K",
+                "time_us": 89.3,
+                "algbw_gbps": 1.47,
+                "busbw_gbps": 2.75,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 262144,
+                "size": "256K",
+                "time_us": 165.38,
+                "algbw_gbps": 1.59,
+                "busbw_gbps": 2.97,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 524288,
+                "size": "512K",
+                "time_us": 4292.69,
+                "algbw_gbps": 0.12,
+                "busbw_gbps": 0.23,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 1048576,
+                "size": "1M",
+                "time_us": 139.29,
+                "algbw_gbps": 7.53,
+                "busbw_gbps": 14.12,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 2097152,
+                "size": "2M",
+                "time_us": 4195.12,
+                "algbw_gbps": 0.5,
+                "busbw_gbps": 0.94,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 4194304,
+                "size": "4M",
+                "time_us": 199.99,
+                "algbw_gbps": 20.97,
+                "busbw_gbps": 39.32,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 8388608,
+                "size": "8M",
+                "time_us": 6159.0,
+                "algbw_gbps": 1.36,
+                "busbw_gbps": 2.55,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 16777216,
+                "size": "16M",
+                "time_us": 6336.73,
+                "algbw_gbps": 2.65,
+                "busbw_gbps": 4.96,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 33554432,
+                "size": "32M",
+                "time_us": 12623.3,
+                "algbw_gbps": 2.66,
+                "busbw_gbps": 4.98,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 67108864,
+                "size": "64M",
+                "time_us": 17005.6,
+                "algbw_gbps": 3.95,
+                "busbw_gbps": 7.4,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 134217728,
+                "size": "128M",
+                "time_us": 23826.7,
+                "algbw_gbps": 5.63,
+                "busbw_gbps": 10.56,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 268435456,
+                "size": "256M",
+                "time_us": 47356.5,
+                "algbw_gbps": 5.67,
+                "busbw_gbps": 10.63,
+                "wrong": 0
+              }
+            ],
+            "stderr_tail": "",
+            "stdout_tail": "   6.25       0\n     1048576        262144     float     sum      -1   139.29    7.53   14.12       0  3552.34    0.30    0.55       0\n     2097152        524288     float     sum      -1  4195.12    0.50    0.94       0   158.81   13.21   24.76       0\n     4194304       1048576     float     sum      -1   199.99   20.97   39.32       0  3623.39    1.16    2.17       0\n     8388608       2097152     float     sum      -1  6159.00    1.36    2.55       0   324.45   25.85   48.48       0\n    16777216       4194304     float     sum      -1  6336.73    2.65    4.96       0   600.96   27.92   52.35       0\n    33554432       8388608     float     sum      -1  12623.3    2.66    4.98       0   949.39   35.34   66.27       0\n    67108864      16777216     float     sum      -1  17005.6    3.95    7.40       0  17175.5    3.91    7.33       0\n   134217728      33554432     float     sum      -1  23826.7    5.63   10.56       0  25793.0    5.20    9.76       0\n   268435456      67108864     float     sum      -1  47356.5    5.67   10.63       0  43195.8    6.21   11.65       0\n# Out of bounds values : 0 OK\n# Avg bus bandwidth    : 9.0956 \n#\n# Collective test concluded: all_reduce_perf\n#\n\n",
+            "started_at": "2026-05-23T04:59:28.584786",
+            "finished_at": "2026-05-23T04:59:54.886123"
+          }
+        ]
+      },
+      "alltoall": {
+        "binary": "/opt/gpu-test-tools/nccl-tests/build/alltoall_perf",
+        "topologies": [
+          {
+            "label": "2 nodes x 8 GPUs",
+            "nodes": 2,
+            "gpus_per_node": 8,
+            "ranks": 16,
+            "hosts": [
+              {
+                "name": "nccl-gpu-1",
+                "addr": "172.72.8.12",
+                "slots": 8
+              },
+              {
+                "name": "nccl-gpu-2",
+                "addr": "172.72.8.16",
+                "slots": 8
+              }
+            ],
+            "command": "/usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun --allow-run-as-root --mca btl_openib_warn_no_device_params_found 0 --mca btl_tcp_if_include bond0 -H 172.72.8.12:8,172.72.8.16:8 --map-by ppr:8:node -np 16 -x NCCL_DEBUG=WARN -x NCCL_SOCKET_IFNAME=bond0 -x NCCL_IB_GID_INDEX=3 -x NCCL_IB_SL=5 -x NCCL_IB_TC=136 -x NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7 -x NCCL_IB_TIMEOUT=22 -x NCCL_IB_QPS_PER_CONNECTION=4 -x NCCL_MIN_NCHANNELS=4 -x NCCL_NET_PLUGIN=none -x NCCL_NVLS_ENABLE=1 -x NCCL_IB_SPLIT_DATA_ON_QPS=1 -x LD_LIBRARY_PATH=/usr/mpi/gcc/openmpi-4.1.9a1/lib:/root/gpu-test-venv/lib/python3.10/site-packages/nvidia/nccl/lib:/usr/local/cuda-12.4/targets/x86_64-linux/lib /opt/gpu-test-tools/nccl-tests/build/alltoall_perf -b 1k -e 256M -g 1 -f 2 -w 2",
+            "returncode": 0,
+            "status": "FAIL",
+            "peak_busbw_gbps": 8.64,
+            "peak_algbw_gbps": 9.21,
+            "peak_size": "2M",
+            "avg_busbw_gbps": 2.19,
+            "min_required_gbps": 20.0,
+            "wrong_count": 0,
+            "by_size": [
+              {
+                "size_bytes": 1024,
+                "size": "1K",
+                "time_us": 58.44,
+                "algbw_gbps": 0.02,
+                "busbw_gbps": 0.02,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 2048,
+                "size": "2K",
+                "time_us": 47.2,
+                "algbw_gbps": 0.04,
+                "busbw_gbps": 0.04,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 4096,
+                "size": "4K",
+                "time_us": 47.68,
+                "algbw_gbps": 0.09,
+                "busbw_gbps": 0.08,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 8192,
+                "size": "8K",
+                "time_us": 48.78,
+                "algbw_gbps": 0.17,
+                "busbw_gbps": 0.16,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 16384,
+                "size": "16K",
+                "time_us": 79.34,
+                "algbw_gbps": 0.21,
+                "busbw_gbps": 0.19,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 32768,
+                "size": "32K",
+                "time_us": 68.8,
+                "algbw_gbps": 0.48,
+                "busbw_gbps": 0.45,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 65536,
+                "size": "64K",
+                "time_us": 49.86,
+                "algbw_gbps": 1.31,
+                "busbw_gbps": 1.23,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 131072,
+                "size": "128K",
+                "time_us": 52.89,
+                "algbw_gbps": 2.48,
+                "busbw_gbps": 2.32,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 262144,
+                "size": "256K",
+                "time_us": 3861.98,
+                "algbw_gbps": 0.07,
+                "busbw_gbps": 0.06,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 524288,
+                "size": "512K",
+                "time_us": 83.38,
+                "algbw_gbps": 6.29,
+                "busbw_gbps": 5.89,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 1048576,
+                "size": "1M",
+                "time_us": 182.32,
+                "algbw_gbps": 5.75,
+                "busbw_gbps": 5.39,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 2097152,
+                "size": "2M",
+                "time_us": 227.67,
+                "algbw_gbps": 9.21,
+                "busbw_gbps": 8.64,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 4194304,
+                "size": "4M",
+                "time_us": 6482.39,
+                "algbw_gbps": 0.65,
+                "busbw_gbps": 0.61,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 8388608,
+                "size": "8M",
+                "time_us": 10348.9,
+                "algbw_gbps": 0.81,
+                "busbw_gbps": 0.76,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 16777216,
+                "size": "16M",
+                "time_us": 18616.5,
+                "algbw_gbps": 0.9,
+                "busbw_gbps": 0.84,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 33554432,
+                "size": "32M",
+                "time_us": 17170.7,
+                "algbw_gbps": 1.95,
+                "busbw_gbps": 1.83,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 67108864,
+                "size": "64M",
+                "time_us": 35735.6,
+                "algbw_gbps": 1.88,
+                "busbw_gbps": 1.76,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 134217728,
+                "size": "128M",
+                "time_us": 69388.5,
+                "algbw_gbps": 1.93,
+                "busbw_gbps": 1.81,
+                "wrong": 0
+              },
+              {
+                "size_bytes": 268435456,
+                "size": "256M",
+                "time_us": 96873.9,
+                "algbw_gbps": 2.77,
+                "busbw_gbps": 2.6,
+                "wrong": 0
+              }
+            ],
+            "stderr_tail": "",
+            "stdout_tail": "56    6.85    6.42    N/A\n     1048576         16384     float    none      -1   182.32    5.75    5.39       0   169.19    6.20    5.81    N/A\n     2097152         32768     float    none      -1   227.67    9.21    8.64       0  3664.15    0.57    0.54    N/A\n     4194304         65536     float    none      -1  6482.39    0.65    0.61       0   553.24    7.58    7.11    N/A\n     8388608        131072     float    none      -1  10348.9    0.81    0.76       0   803.01   10.45    9.79    N/A\n    16777216        262144     float    none      -1  18616.5    0.90    0.84       0  4237.22    3.96    3.71    N/A\n    33554432        524288     float    none      -1  17170.7    1.95    1.83       0  20849.4    1.61    1.51    N/A\n    67108864       1048576     float    none      -1  35735.6    1.88    1.76       0  34524.7    1.94    1.82    N/A\n   134217728       2097152     float    none      -1  69388.5    1.93    1.81       0  63535.3    2.11    1.98    N/A\n   268435456       4194304     float    none      -1  96873.9    2.77    2.60       0   100742    2.66    2.50    N/A\n# Out of bounds values : 0 OK\n# Avg bus bandwidth    : 2.19061 \n#\n# Collective test concluded: alltoall_perf\n#\n\n",
+            "started_at": "2026-05-23T04:59:54.886310",
+            "finished_at": "2026-05-23T05:00:28.796555"
+          }
+        ]
+      }
+    },
+    "timestamp": "2026-05-23T05:00:28.796580"
+  },
+  "timestamp": "2026-05-23T05:00:28.807561",
+  "hostname": "aikubeworker0012"
+}
--- a/reports_multinode_nccl_smoke_256m_aikubeworker0012.md
+++ b/reports_multinode_nccl_smoke_256m_aikubeworker0012.md
@ -0,0 +1,50 @@
+# GPU Test Report
+
+- **Date:** 2026-05-23T05:00:28.807561
+- **Host:** aikubeworker0012
+
+## Overall Acceptance Verdict
+
+**Result: FAIL**
+
+Missing required evidence:
+- GPU Info
+- Health Check
+- Memory Bandwidth
+- Compute Throughput
+- NVLink/NVSwitch
+- NCCL
+- Stress Test
+- RDMA
+- DCGM
+- Training
+
+## Summary
+
+| Test | Result |
+|------|--------|
+| Multi-node NCCL | FAIL |
+
+## Multi-node NCCL / Cross Leaf
+
+Source: nccl-tests-mpirun | Mode: sweep
+
+- **Hosts:** nccl-gpu-1(172.72.8.12), nccl-gpu-2(172.72.8.16)
+- **Preflight:** PASS (1 warning)
+
+### Multi-node NCCL allreduce
+
+| Topology | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
+|----------|-------------|-----------|------------|-----------|--------|
+| 2 nodes x 8 GPUs | 39.32 GB/s | 4M | 9.10 GB/s | >= 100 GB/s | FAIL |
+
+### Multi-node NCCL alltoall
+
+| Topology | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
+|----------|-------------|-----------|------------|-----------|--------|
+| 2 nodes x 8 GPUs | 8.64 GB/s | 2M | 2.19 GB/s | >= 20 GB/s | FAIL |
+
+**Overall: FAIL**
+
+---
+*Generated by GPU Test Suite v0.2.0*