diff --git a/README.md b/README.md index 1af08c4..eed4791 100644 --- a/README.md +++ b/README.md @@ -375,6 +375,27 @@ nccl: repeats: 3 max_stddev_pct: 3 +multinode_nccl: + enabled: false # true 时纳入 --test all + hosts: + - {name: nccl-gpu-1, addr: 172.72.8.12, slots: 8} + - {name: nccl-gpu-2, addr: 172.72.8.16, slots: 8} + tests: [all_reduce_perf, alltoall_perf] + topologies: + - {nodes: 2, gpus_per_node: 8} + mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun + extra_ld_library_path: # 传给远端 rank 的 MPI/NCCL/CUDA 库路径 + - /usr/mpi/gcc/openmpi-4.1.9a1/lib + - /root/gpu-test-venv/lib/python3.10/site-packages/nvidia/nccl/lib + - /usr/local/cuda-12.4/targets/x86_64-linux/lib + begin_size: 1k + end_size: 16g + step_factor: 2 + warmup_iters: 10 + socket_ifname: bond0 + ib_gid_index: 3 + ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7 + stress: duration_sec: 1800 # 压力测试时长 use_gpu_burn: false # 默认走 PyTorch GEMM stress @@ -539,16 +560,14 @@ report: └── 异常: 检查 IB 线缆、交换机配置、子网管理器 步骤 3: 多节点 NCCL 测试 -├── 在每个节点上配置: -│ export MASTER_ADDR=<主节点IP> -│ export MASTER_PORT=29500 -│ export NCCL_SOCKET_IFNAME=ib0 # IB 网卡名 -│ export NCCL_DEBUG=INFO -├── 运行 nccl-tests 手动测试: -│ mpirun -np <总GPU数> -hostfile hosts \ -│ /opt/gpu-test-tools/nccl-tests/build/all_reduce_perf \ -│ -b 8 -e 256M -f 2 -g 1 -w 5 -n 20 -└── 确认: 多节点 AllReduce 带宽正常 +├── 在发起节点确认 mpirun、nccl-tests、跨节点 root SSH 可用 +├── 配置 configs/default.yaml 的 multinode_nccl.hosts / IB 参数 +├── 执行 PDF 风格 sweep: +│ python3 gpu_tester.py --test multinode-nccl --report --format md +├── 默认命令口径: +│ mpirun -H :8,:8 --map-by ppr:8:node -np 16 \ +│ all_reduce_perf/alltoall_perf -b 1k -e 16g -f 2 -g 1 -w 10 +└── 确认: Peak Bus BW、Peak Size、wrong_count 正常 步骤 4: 训练验证 ├── python3 gpu_tester.py --test training diff --git a/configs/default.yaml b/configs/default.yaml index 66f1cdf..1a1c8e2 100644 --- a/configs/default.yaml +++ b/configs/default.yaml @@ -41,6 +41,52 @@ nccl: repeats: 3 max_stddev_pct: 3 +multinode_nccl: + enabled: false + mode: sweep + hosts: + - name: nccl-gpu-1 + addr: 172.72.8.12 + slots: 8 + - name: nccl-gpu-2 + addr: 172.72.8.16 + slots: 8 + ssh_user: root + ssh_preflight: true + mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun + mpi_ld_preload: null + extra_ld_library_path: + - /usr/mpi/gcc/openmpi-4.1.9a1/lib + - /root/gpu-test-venv/lib/python3.10/site-packages/nvidia/nccl/lib + - /usr/local/cuda-12.4/targets/x86_64-linux/lib + nccl_tests_dir: null # null = tools.install_dir/nccl-tests/build + tests: + - all_reduce_perf + - alltoall_perf + topologies: + - nodes: 2 + gpus_per_node: 8 + begin_size: 1k + end_size: 16g + step_factor: 2 + warmup_iters: 10 + gpus_per_rank: 1 + timeout_sec: 1800 + socket_ifname: bond0 + ib_gid_index: 3 + ib_sl: 5 + ib_tc: 136 + ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7 + ib_timeout: 22 + qps_per_connection: 4 + min_nchannels: 4 + net_plugin: none + nvls_enable: 1 + split_data_on_qps: 1 + min_peak_busbw_gbps: + allreduce: 480 + alltoall: 75 + stress: duration_sec: 1800 production_duration_sec: 1800 diff --git a/gpu_tester.py b/gpu_tester.py index 15bc694..35d89de 100644 --- a/gpu_tester.py +++ b/gpu_tester.py @@ -28,6 +28,7 @@ from modules.stress_test import StressTest from modules.rdma_test import RDMATest from modules.nvlink_test import NVLinkTest from modules.dcgm_test import DCGMTest +from modules.multinode_nccl_test import MultiNodeNCCLTest from modules.report import ReportGenerator from modules.gpu_specs import detect_gpu_type, get_gpu_specs, get_gpu_label, get_supported_gpus, validate_driver_compatibility @@ -55,6 +56,44 @@ DEFAULT_CONFIG = { "repeats": 3, "max_stddev_pct": 3, }, + "multinode_nccl": { + "enabled": False, + "mode": "sweep", + "hosts": [ + {"name": "nccl-gpu-1", "addr": "172.72.8.12", "slots": 8}, + {"name": "nccl-gpu-2", "addr": "172.72.8.16", "slots": 8}, + ], + "ssh_user": "root", + "ssh_preflight": True, + "mpirun_path": "/usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun", + "mpi_ld_preload": None, + "extra_ld_library_path": [ + "/usr/mpi/gcc/openmpi-4.1.9a1/lib", + "/root/gpu-test-venv/lib/python3.10/site-packages/nvidia/nccl/lib", + "/usr/local/cuda-12.4/targets/x86_64-linux/lib", + ], + "nccl_tests_dir": None, + "tests": ["all_reduce_perf", "alltoall_perf"], + "topologies": [{"nodes": 2, "gpus_per_node": 8}], + "begin_size": "1k", + "end_size": "16g", + "step_factor": 2, + "warmup_iters": 10, + "gpus_per_rank": 1, + "timeout_sec": 1800, + "socket_ifname": "bond0", + "ib_gid_index": 3, + "ib_sl": 5, + "ib_tc": 136, + "ib_hca": "mlx5_0,mlx5_1,mlx5_6,mlx5_7", + "ib_timeout": 22, + "qps_per_connection": 4, + "min_nchannels": 4, + "net_plugin": "none", + "nvls_enable": 1, + "split_data_on_qps": 1, + "min_peak_busbw_gbps": {"allreduce": 480, "alltoall": 75}, + }, "stress": { "duration_sec": 1800, "production_duration_sec": 1800, @@ -191,7 +230,8 @@ def interactive_menu(config: dict): ("8", "NVLink/NVSwitch Test", "nvlink"), ("9", "DCGM Diagnostic", "dcgm"), ("10", "Training Simulation", "training"), - ("11", "Full Test Suite (All Tests)", "all"), + ("11", "Multi-node NCCL Test", "multinode_nccl"), + ("12", "Full Test Suite (All Tests)", "all"), ("0", "Generate Report", "report"), ] @@ -218,6 +258,7 @@ def interactive_menu(config: dict): "nvlink": "NVLink links, speed, and error counters", "dcgm": "DCGM diag -r 3 production diagnostic", "training": "Simulate LLM training with PyTorch", + "multinode_nccl": "Cross-node NCCL via mpirun/nccl-tests", "all": "Run all tests sequentially", "report": "Export results to JSON/HTML", } @@ -326,6 +367,12 @@ def _run_test(test_name: str, config: dict, console: Console) -> dict: m.print_results(result) return result + elif test_name == "multinode_nccl": + m = MultiNodeNCCLTest(config) + result = m.run() + m.print_results(result) + return result + elif test_name == "all": return _run_full_suite(config, console) @@ -356,6 +403,8 @@ def _run_full_suite(config: dict, console: Console) -> dict: ("dcgm", "DCGM Diagnostic", DCGMTest), ("training", "Training Simulation", TrainingSim), ] + if (config.get("multinode_nccl", {}) or {}).get("enabled"): + tests.append(("multinode_nccl", "Multi-node NCCL Test", MultiNodeNCCLTest)) for i, (key, name, mod_cls) in enumerate(tests, 1): console.print(f"\n[bold cyan][{i}/{len(tests)}] {name}[/bold cyan]") @@ -435,6 +484,7 @@ Examples: python gpu_tester.py --test benchmark --type memory python gpu_tester.py --test benchmark --type compute --dtype fp16 python gpu_tester.py --test nccl # NCCL test + python gpu_tester.py --test multinode-nccl # Cross-node NCCL test python gpu_tester.py --test nvlink # NVLink/NVSwitch test python gpu_tester.py --test dcgm # DCGM diagnostic python gpu_tester.py --test training # Training sim @@ -442,7 +492,7 @@ Examples: python gpu_tester.py --report --format json --output report.json """, ) - parser.add_argument("--test", choices=["gpu-info", "health", "benchmark", "nccl", "stress", "rdma", "nvlink", "dcgm", "training", "all"], + parser.add_argument("--test", choices=["gpu-info", "health", "benchmark", "nccl", "multinode-nccl", "stress", "rdma", "nvlink", "dcgm", "training", "all"], help="Run a specific test") parser.add_argument("--type", choices=["memory", "compute"], help="Benchmark type (with --test benchmark)") parser.add_argument("--dtype", choices=["fp32", "tf32", "fp16", "bf16", "fp8", "fp64", "int8"], @@ -499,6 +549,7 @@ Examples: "health": "health", "benchmark": None, "nccl": "nccl", + "multinode-nccl": "multinode_nccl", "stress": "stress", "rdma": "rdma", "nvlink": "nvlink", diff --git a/modules/multinode_nccl_test.py b/modules/multinode_nccl_test.py new file mode 100644 index 0000000..1063ec2 --- /dev/null +++ b/modules/multinode_nccl_test.py @@ -0,0 +1,388 @@ +"""Multi-node NCCL benchmark wrapper for nccl-tests via mpirun.""" + +import os +import re +import shutil +import subprocess +from datetime import datetime +from typing import Optional + +from rich.console import Console +from rich.table import Table + +from modules.gpu_specs import resolve_tools_dir + + +_TEST_ALIASES = { + "allreduce": "all_reduce_perf", + "all_reduce": "all_reduce_perf", + "all_reduce_perf": "all_reduce_perf", + "alltoall": "alltoall_perf", + "all_to_all": "alltoall_perf", + "alltoall_perf": "alltoall_perf", +} + +_OP_LABELS = { + "all_reduce_perf": "allreduce", + "alltoall_perf": "alltoall", +} + + +class MultiNodeNCCLTest: + """Run cross-node NCCL tests with a PDF-style message-size sweep.""" + + def __init__(self, config: dict): + self.config = config + self.cfg = config.get("multinode_nccl", {}) or {} + self.tools_dir = resolve_tools_dir(config) + self.console = Console() + + def _find_mpirun(self) -> Optional[str]: + configured = self.cfg.get("mpirun_path") + if configured and os.path.isfile(str(configured)) and os.access(str(configured), os.X_OK): + return str(configured) + for cmd in ["mpirun", "mpiexec", os.path.join(self.tools_dir, "mpi", "bin", "mpirun")]: + found = shutil.which(cmd) + if found: + return found + return None + + def _find_nccl_test(self, binary_name: str) -> Optional[str]: + configured = self.cfg.get("nccl_tests_dir") + candidates = [] + if configured: + candidates.append(os.path.join(configured, binary_name)) + candidates.append(os.path.join(self.tools_dir, "nccl-tests", "build", binary_name)) + found = shutil.which(binary_name) + if found: + candidates.insert(0, found) + + for path in candidates: + if path and os.path.isfile(path) and os.access(path, os.X_OK): + return path + return None + + def _tests(self) -> list[str]: + configured = self.cfg.get("tests") or ["all_reduce_perf", "alltoall_perf"] + tests = [] + for name in configured: + binary = _TEST_ALIASES.get(str(name).lower()) + if binary and binary not in tests: + tests.append(binary) + return tests + + def _hosts(self) -> list[dict]: + hosts = self.cfg.get("hosts") or [] + normalized = [] + for host in hosts: + if isinstance(host, str): + normalized.append({"addr": host, "slots": 8}) + elif isinstance(host, dict): + normalized.append({ + "name": host.get("name") or host.get("addr"), + "addr": host.get("addr") or host.get("host") or host.get("ip"), + "slots": int(host.get("slots", 8)), + }) + return [h for h in normalized if h.get("addr")] + + def _topologies(self) -> list[dict]: + topologies = self.cfg.get("topologies") or [{"nodes": 2, "gpus_per_node": 8}] + normalized = [] + for topo in topologies: + nodes = int(topo.get("nodes", 2)) + gpus_per_node = int(topo.get("gpus_per_node", topo.get("gpn", 8))) + normalized.append({ + "nodes": nodes, + "gpus_per_node": gpus_per_node, + "label": topo.get("label") or f"{nodes} nodes x {gpus_per_node} GPUs", + }) + return normalized + + def _env_exports(self) -> list[tuple[str, str]]: + env_cfg = { + "NCCL_DEBUG": self.cfg.get("debug", "WARN"), + "NCCL_SOCKET_IFNAME": self.cfg.get("socket_ifname"), + "NCCL_IB_GID_INDEX": self.cfg.get("ib_gid_index"), + "NCCL_IB_SL": self.cfg.get("ib_sl"), + "NCCL_IB_TC": self.cfg.get("ib_tc"), + "NCCL_IB_HCA": self.cfg.get("ib_hca"), + "NCCL_IB_TIMEOUT": self.cfg.get("ib_timeout"), + "NCCL_IB_QPS_PER_CONNECTION": self.cfg.get("qps_per_connection"), + "NCCL_MIN_NCHANNELS": self.cfg.get("min_nchannels"), + "NCCL_NET_PLUGIN": self.cfg.get("net_plugin"), + "NCCL_NVLS_ENABLE": self.cfg.get("nvls_enable"), + "NCCL_IB_SPLIT_DATA_ON_QPS": self.cfg.get("split_data_on_qps"), + } + mpi_ld_preload = self._mpi_ld_preload() + if mpi_ld_preload: + env_cfg["LD_PRELOAD"] = mpi_ld_preload + extra_ld_library_path = self._extra_ld_library_path() + if extra_ld_library_path: + existing = os.environ.get("LD_LIBRARY_PATH", "") + env_cfg["LD_LIBRARY_PATH"] = ":".join( + [extra_ld_library_path] + ([existing] if existing else []) + ) + return [(k, str(v)) for k, v in env_cfg.items() if v is not None] + + def _mpi_ld_preload(self) -> str: + preload = self.cfg.get("mpi_ld_preload") + if isinstance(preload, list): + return " ".join(str(p) for p in preload if p) + return str(preload) if preload else "" + + def _runtime_env(self) -> dict: + env = os.environ.copy() + mpi_ld_preload = self._mpi_ld_preload() + if mpi_ld_preload: + env["LD_PRELOAD"] = mpi_ld_preload + extra_ld_library_path = self._extra_ld_library_path() + if extra_ld_library_path: + existing = env.get("LD_LIBRARY_PATH", "") + env["LD_LIBRARY_PATH"] = ":".join( + [extra_ld_library_path] + ([existing] if existing else []) + ) + return env + + def _extra_ld_library_path(self) -> str: + paths = self.cfg.get("extra_ld_library_path") + if isinstance(paths, list): + return ":".join(str(p) for p in paths if p) + return str(paths) if paths else "" + + def _preflight(self, mpirun: Optional[str], tests: list[str], hosts: list[dict]) -> dict: + checks = [] + checks.append({"name": "mpirun", "status": "PASS" if mpirun else "FAIL", "detail": mpirun or "not found"}) + checks.append({"name": "hosts", "status": "PASS" if len(hosts) >= 2 else "FAIL", "detail": f"{len(hosts)} configured"}) + for binary in tests: + path = self._find_nccl_test(binary) + checks.append({"name": binary, "status": "PASS" if path else "FAIL", "detail": path or "not found"}) + + if self.cfg.get("ssh_preflight", True): + user = self.cfg.get("ssh_user", "root") + for host in hosts: + target = f"{user}@{host['addr']}" + cmd = ["ssh", "-o", "BatchMode=yes", "-o", "ConnectTimeout=5", target, "hostname"] + try: + r = subprocess.run(cmd, capture_output=True, text=True, timeout=8, env=self._runtime_env()) + detail = r.stdout.strip() or r.stderr.strip()[:120] + checks.append({ + "name": f"ssh {host['addr']}", + "status": "PASS" if r.returncode == 0 else "WARN", + "detail": detail, + }) + except Exception as e: + checks.append({"name": f"ssh {host['addr']}", "status": "WARN", "detail": str(e)}) + + return { + "checks": checks, + "passed": all(c["status"] == "PASS" for c in checks if not c["name"].startswith("ssh ")), + } + + def run(self) -> dict: + mpirun = self._find_mpirun() + tests = self._tests() + hosts = self._hosts() + topologies = self._topologies() + preflight = self._preflight(mpirun, tests, hosts) + + if not preflight["passed"]: + return { + "passed": False, + "source": "nccl-tests-mpirun", + "mode": self.cfg.get("mode", "sweep"), + "hosts": hosts, + "preflight": preflight, + "tests": {}, + "error": "multinode NCCL preflight failed", + "timestamp": datetime.now().isoformat(), + } + + results = {} + for binary in tests: + label = _OP_LABELS[binary] + binary_path = self._find_nccl_test(binary) + op_results = [] + for topo in topologies: + op_results.append(self._run_topology(mpirun, binary_path, label, hosts, topo)) + results[label] = {"binary": binary_path, "topologies": op_results} + + passed = all( + topo.get("status") == "PASS" + for op in results.values() + for topo in op.get("topologies", []) + ) + return { + "passed": passed, + "source": "nccl-tests-mpirun", + "mode": self.cfg.get("mode", "sweep"), + "hosts": hosts, + "preflight": preflight, + "tests": results, + "timestamp": datetime.now().isoformat(), + } + + def _run_topology(self, mpirun: str, binary: str, label: str, hosts: list[dict], topo: dict) -> dict: + nodes = topo["nodes"] + gpus_per_node = topo["gpus_per_node"] + selected_hosts = hosts[:nodes] + host_arg = ",".join(f"{h['addr']}:{gpus_per_node}" for h in selected_hosts) + ranks = nodes * gpus_per_node + + cmd = [ + mpirun, + "--allow-run-as-root", + "--mca", "btl_openib_warn_no_device_params_found", "0", + "--mca", "btl_tcp_if_include", str(self.cfg.get("socket_ifname", "bond0")), + "-H", host_arg, + "--map-by", f"ppr:{gpus_per_node}:node", + "-np", str(ranks), + ] + for key, value in self._env_exports(): + cmd.extend(["-x", f"{key}={value}"]) + + cmd.extend([ + binary, + "-b", str(self.cfg.get("begin_size", "1k")), + "-e", str(self.cfg.get("end_size", "16g")), + "-g", str(self.cfg.get("gpus_per_rank", 1)), + "-f", str(self.cfg.get("step_factor", 2)), + "-w", str(self.cfg.get("warmup_iters", 10)), + ]) + if self.cfg.get("iters") is not None: + cmd.extend(["-n", str(self.cfg["iters"])]) + + timeout = int(self.cfg.get("timeout_sec", 1800)) + started = datetime.now().isoformat() + try: + r = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout, env=self._runtime_env()) + except subprocess.TimeoutExpired: + return { + "label": topo["label"], + "nodes": nodes, + "gpus_per_node": gpus_per_node, + "ranks": ranks, + "hosts": selected_hosts, + "command": " ".join(cmd), + "status": "FAIL", + "error": f"timeout after {timeout}s", + "started_at": started, + } + + parsed = self._parse_nccl_output(r.stdout) + threshold = self._threshold_for(label) + wrong = sum(row.get("wrong", 0) for row in parsed["by_size"]) + has_bw = parsed["peak_busbw_gbps"] > 0 + status = "PASS" if r.returncode == 0 and has_bw and wrong == 0 and parsed["peak_busbw_gbps"] >= threshold else "FAIL" + return { + "label": topo["label"], + "nodes": nodes, + "gpus_per_node": gpus_per_node, + "ranks": ranks, + "hosts": selected_hosts, + "command": " ".join(cmd), + "returncode": r.returncode, + "status": status, + "peak_busbw_gbps": parsed["peak_busbw_gbps"], + "peak_algbw_gbps": parsed["peak_algbw_gbps"], + "peak_size": parsed["peak_size"], + "avg_busbw_gbps": parsed["avg_busbw_gbps"], + "min_required_gbps": threshold, + "wrong_count": wrong, + "by_size": parsed["by_size"], + "stderr_tail": r.stderr[-1200:], + "stdout_tail": r.stdout[-1200:], + "started_at": started, + "finished_at": datetime.now().isoformat(), + } + + def _threshold_for(self, label: str) -> float: + thresholds = self.cfg.get("min_peak_busbw_gbps") or {} + if isinstance(thresholds, dict): + return float(thresholds.get(label, 0) or 0) + return float(thresholds or 0) + + @staticmethod + def _parse_nccl_output(stdout: str) -> dict: + rows = [] + avg_bus = 0.0 + for line in stdout.splitlines(): + stripped = line.strip() + if not stripped: + continue + avg_match = re.search(r"Avg bus bandwidth\s*:\s*([0-9.]+)", stripped) + if avg_match: + avg_bus = float(avg_match.group(1)) + continue + if stripped.startswith("#"): + continue + parts = stripped.split() + if len(parts) < 9: + continue + try: + size_bytes = int(parts[0]) + time_us = float(parts[5]) + algbw = float(parts[6]) + busbw = float(parts[7]) + wrong = int(parts[8]) + except (ValueError, IndexError): + continue + rows.append({ + "size_bytes": size_bytes, + "size": _format_size(size_bytes), + "time_us": time_us, + "algbw_gbps": algbw, + "busbw_gbps": busbw, + "wrong": wrong, + }) + + peak_row = max(rows, key=lambda r: r["busbw_gbps"], default={}) + return { + "peak_busbw_gbps": round(float(peak_row.get("busbw_gbps", 0)), 2), + "peak_algbw_gbps": round(float(peak_row.get("algbw_gbps", 0)), 2), + "peak_size": peak_row.get("size", ""), + "avg_busbw_gbps": round(avg_bus, 2), + "by_size": rows, + } + + @staticmethod + def print_results(results: dict, console: Console = None): + c = console or Console() + if results.get("error"): + c.print(f"[bold red]Multi-node NCCL failed: {results['error']}[/bold red]") + else: + c.print("[bold green]Multi-node NCCL complete[/bold green]" if results.get("passed") else "[bold red]Multi-node NCCL failed[/bold red]") + + preflight = results.get("preflight", {}) + if preflight.get("checks"): + table = Table(title="Preflight") + table.add_column("Check") + table.add_column("Status") + table.add_column("Detail") + for check in preflight["checks"]: + table.add_row(check["name"], check["status"], str(check.get("detail", ""))) + c.print(table) + + for op, data in (results.get("tests") or {}).items(): + table = Table(title=f"Multi-node NCCL {op}") + table.add_column("Topology") + table.add_column("Peak Bus BW") + table.add_column("Peak Size") + table.add_column("Threshold") + table.add_column("Status") + for topo in data.get("topologies", []): + table.add_row( + topo.get("label", ""), + f"{topo.get('peak_busbw_gbps', 0):.2f} GB/s", + str(topo.get("peak_size", "")), + f">= {topo.get('min_required_gbps', 0):.0f} GB/s" if topo.get("min_required_gbps") else "-", + topo.get("status", "?"), + ) + c.print(table) + + +def _format_size(size_bytes: int) -> str: + units = [("G", 1024 ** 3), ("M", 1024 ** 2), ("K", 1024)] + for suffix, factor in units: + if size_bytes >= factor and size_bytes % factor == 0: + return f"{size_bytes // factor}{suffix}" + return str(size_bytes) diff --git a/modules/report.py b/modules/report.py index 2f6f1ec..b82170b 100644 --- a/modules/report.py +++ b/modules/report.py @@ -464,6 +464,47 @@ class ReportGenerator: passed = nccl.get("passed", False) lines.append(f"**Overall: {'PASS' if passed else 'FAIL'}**\n") + multinode = results.get("multinode_nccl") + if multinode and not multinode.get("error"): + lines.append("## Multi-node NCCL / Cross Leaf\n") + lines.append(f"Source: {multinode.get('source', 'unknown')} | Mode: {multinode.get('mode', 'unknown')}\n") + hosts = multinode.get("hosts", []) + if hosts: + host_text = ", ".join(f"{h.get('name') or h.get('addr')}({h.get('addr')})" for h in hosts) + lines.append(f"- **Hosts:** {host_text}") + preflight = multinode.get("preflight", {}) + if preflight.get("checks"): + failed_checks = [c for c in preflight["checks"] if c.get("status") == "FAIL"] + warn_checks = [c for c in preflight["checks"] if c.get("status") == "WARN"] + lines.append(f"- **Preflight:** {'PASS' if not failed_checks else 'FAIL'}" + f"{f' ({len(warn_checks)} warnings)' if warn_checks else ''}") + lines.append("") + for op, data in (multinode.get("tests") or {}).items(): + lines.append(f"### Multi-node NCCL {op}\n") + lines.append("| Topology | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |") + lines.append("|----------|-------------|-----------|------------|-----------|--------|") + for topo in data.get("topologies", []): + threshold = topo.get("min_required_gbps", 0) or 0 + threshold_text = f">= {threshold:.0f} GB/s" if threshold else "-" + lines.append( + f"| {topo.get('label', '')} | {topo.get('peak_busbw_gbps', 0):.2f} GB/s | " + f"{topo.get('peak_size', '')} | {topo.get('avg_busbw_gbps', 0):.2f} GB/s | " + f"{threshold_text} | {topo.get('status', '?')} |" + ) + lines.append("") + lines.append(f"**Overall: {'PASS' if multinode.get('passed') else 'FAIL'}**\n") + elif multinode and multinode.get("error"): + lines.append("## Multi-node NCCL / Cross Leaf\n") + lines.append(f"**Overall: FAIL** ({multinode.get('error')})\n") + preflight = multinode.get("preflight", {}) + if preflight.get("checks"): + lines.append("| Check | Status | Detail |") + lines.append("|-------|--------|--------|") + for check in preflight["checks"]: + detail = str(check.get("detail", "")).replace("\n", " ") + lines.append(f"| {check.get('name', '')} | {check.get('status', '')} | {detail} |") + lines.append("") + # --- Stress Test --- stress = results.get("stress") if stress and not stress.get("error"): @@ -836,6 +877,15 @@ class ReportGenerator: else: items.append(("NCCL", "FAIL")) + if "multinode_nccl" in results: + mn = results["multinode_nccl"] + if mn.get("error"): + items.append(("Multi-node NCCL", f"ERROR: {mn['error']}")) + elif mn.get("passed"): + items.append(("Multi-node NCCL", "PASS")) + else: + items.append(("Multi-node NCCL", "FAIL")) + # Stress if "stress" in results: s = results["stress"] diff --git a/reports_multinode_nccl_smoke_256m_aikubeworker0012.json b/reports_multinode_nccl_smoke_256m_aikubeworker0012.json new file mode 100644 index 0000000..72c30ce --- /dev/null +++ b/reports_multinode_nccl_smoke_256m_aikubeworker0012.json @@ -0,0 +1,439 @@ +{ + "multinode_nccl": { + "passed": false, + "source": "nccl-tests-mpirun", + "mode": "sweep", + "hosts": [ + { + "name": "nccl-gpu-1", + "addr": "172.72.8.12", + "slots": 8 + }, + { + "name": "nccl-gpu-2", + "addr": "172.72.8.16", + "slots": 8 + } + ], + "preflight": { + "checks": [ + { + "name": "mpirun", + "status": "PASS", + "detail": "/usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun" + }, + { + "name": "hosts", + "status": "PASS", + "detail": "2 configured" + }, + { + "name": "all_reduce_perf", + "status": "PASS", + "detail": "/opt/gpu-test-tools/nccl-tests/build/all_reduce_perf" + }, + { + "name": "alltoall_perf", + "status": "PASS", + "detail": "/opt/gpu-test-tools/nccl-tests/build/alltoall_perf" + }, + { + "name": "ssh 172.72.8.12", + "status": "WARN", + "detail": "Host key verification failed." + }, + { + "name": "ssh 172.72.8.16", + "status": "PASS", + "detail": "aikubeworker0016" + } + ], + "passed": true + }, + "tests": { + "allreduce": { + "binary": "/opt/gpu-test-tools/nccl-tests/build/all_reduce_perf", + "topologies": [ + { + "label": "2 nodes x 8 GPUs", + "nodes": 2, + "gpus_per_node": 8, + "ranks": 16, + "hosts": [ + { + "name": "nccl-gpu-1", + "addr": "172.72.8.12", + "slots": 8 + }, + { + "name": "nccl-gpu-2", + "addr": "172.72.8.16", + "slots": 8 + } + ], + "command": "/usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun --allow-run-as-root --mca btl_openib_warn_no_device_params_found 0 --mca btl_tcp_if_include bond0 -H 172.72.8.12:8,172.72.8.16:8 --map-by ppr:8:node -np 16 -x NCCL_DEBUG=WARN -x NCCL_SOCKET_IFNAME=bond0 -x NCCL_IB_GID_INDEX=3 -x NCCL_IB_SL=5 -x NCCL_IB_TC=136 -x NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7 -x NCCL_IB_TIMEOUT=22 -x NCCL_IB_QPS_PER_CONNECTION=4 -x NCCL_MIN_NCHANNELS=4 -x NCCL_NET_PLUGIN=none -x NCCL_NVLS_ENABLE=1 -x NCCL_IB_SPLIT_DATA_ON_QPS=1 -x LD_LIBRARY_PATH=/usr/mpi/gcc/openmpi-4.1.9a1/lib:/root/gpu-test-venv/lib/python3.10/site-packages/nvidia/nccl/lib:/usr/local/cuda-12.4/targets/x86_64-linux/lib /opt/gpu-test-tools/nccl-tests/build/all_reduce_perf -b 1k -e 256M -g 1 -f 2 -w 2", + "returncode": 0, + "status": "FAIL", + "peak_busbw_gbps": 39.32, + "peak_algbw_gbps": 20.97, + "peak_size": "4M", + "avg_busbw_gbps": 9.1, + "min_required_gbps": 100.0, + "wrong_count": 0, + "by_size": [ + { + "size_bytes": 1024, + "size": "1K", + "time_us": 80.32, + "algbw_gbps": 0.01, + "busbw_gbps": 0.02, + "wrong": 0 + }, + { + "size_bytes": 2048, + "size": "2K", + "time_us": 35.79, + "algbw_gbps": 0.06, + "busbw_gbps": 0.11, + "wrong": 0 + }, + { + "size_bytes": 4096, + "size": "4K", + "time_us": 37.49, + "algbw_gbps": 0.11, + "busbw_gbps": 0.2, + "wrong": 0 + }, + { + "size_bytes": 8192, + "size": "8K", + "time_us": 40.32, + "algbw_gbps": 0.2, + "busbw_gbps": 0.38, + "wrong": 0 + }, + { + "size_bytes": 16384, + "size": "16K", + "time_us": 43.04, + "algbw_gbps": 0.38, + "busbw_gbps": 0.71, + "wrong": 0 + }, + { + "size_bytes": 32768, + "size": "32K", + "time_us": 43.32, + "algbw_gbps": 0.76, + "busbw_gbps": 1.42, + "wrong": 0 + }, + { + "size_bytes": 65536, + "size": "64K", + "time_us": 47.45, + "algbw_gbps": 1.38, + "busbw_gbps": 2.59, + "wrong": 0 + }, + { + "size_bytes": 131072, + "size": "128K", + "time_us": 89.3, + "algbw_gbps": 1.47, + "busbw_gbps": 2.75, + "wrong": 0 + }, + { + "size_bytes": 262144, + "size": "256K", + "time_us": 165.38, + "algbw_gbps": 1.59, + "busbw_gbps": 2.97, + "wrong": 0 + }, + { + "size_bytes": 524288, + "size": "512K", + "time_us": 4292.69, + "algbw_gbps": 0.12, + "busbw_gbps": 0.23, + "wrong": 0 + }, + { + "size_bytes": 1048576, + "size": "1M", + "time_us": 139.29, + "algbw_gbps": 7.53, + "busbw_gbps": 14.12, + "wrong": 0 + }, + { + "size_bytes": 2097152, + "size": "2M", + "time_us": 4195.12, + "algbw_gbps": 0.5, + "busbw_gbps": 0.94, + "wrong": 0 + }, + { + "size_bytes": 4194304, + "size": "4M", + "time_us": 199.99, + "algbw_gbps": 20.97, + "busbw_gbps": 39.32, + "wrong": 0 + }, + { + "size_bytes": 8388608, + "size": "8M", + "time_us": 6159.0, + "algbw_gbps": 1.36, + "busbw_gbps": 2.55, + "wrong": 0 + }, + { + "size_bytes": 16777216, + "size": "16M", + "time_us": 6336.73, + "algbw_gbps": 2.65, + "busbw_gbps": 4.96, + "wrong": 0 + }, + { + "size_bytes": 33554432, + "size": "32M", + "time_us": 12623.3, + "algbw_gbps": 2.66, + "busbw_gbps": 4.98, + "wrong": 0 + }, + { + "size_bytes": 67108864, + "size": "64M", + "time_us": 17005.6, + "algbw_gbps": 3.95, + "busbw_gbps": 7.4, + "wrong": 0 + }, + { + "size_bytes": 134217728, + "size": "128M", + "time_us": 23826.7, + "algbw_gbps": 5.63, + "busbw_gbps": 10.56, + "wrong": 0 + }, + { + "size_bytes": 268435456, + "size": "256M", + "time_us": 47356.5, + "algbw_gbps": 5.67, + "busbw_gbps": 10.63, + "wrong": 0 + } + ], + "stderr_tail": "", + "stdout_tail": " 6.25 0\n 1048576 262144 float sum -1 139.29 7.53 14.12 0 3552.34 0.30 0.55 0\n 2097152 524288 float sum -1 4195.12 0.50 0.94 0 158.81 13.21 24.76 0\n 4194304 1048576 float sum -1 199.99 20.97 39.32 0 3623.39 1.16 2.17 0\n 8388608 2097152 float sum -1 6159.00 1.36 2.55 0 324.45 25.85 48.48 0\n 16777216 4194304 float sum -1 6336.73 2.65 4.96 0 600.96 27.92 52.35 0\n 33554432 8388608 float sum -1 12623.3 2.66 4.98 0 949.39 35.34 66.27 0\n 67108864 16777216 float sum -1 17005.6 3.95 7.40 0 17175.5 3.91 7.33 0\n 134217728 33554432 float sum -1 23826.7 5.63 10.56 0 25793.0 5.20 9.76 0\n 268435456 67108864 float sum -1 47356.5 5.67 10.63 0 43195.8 6.21 11.65 0\n# Out of bounds values : 0 OK\n# Avg bus bandwidth : 9.0956 \n#\n# Collective test concluded: all_reduce_perf\n#\n\n", + "started_at": "2026-05-23T04:59:28.584786", + "finished_at": "2026-05-23T04:59:54.886123" + } + ] + }, + "alltoall": { + "binary": "/opt/gpu-test-tools/nccl-tests/build/alltoall_perf", + "topologies": [ + { + "label": "2 nodes x 8 GPUs", + "nodes": 2, + "gpus_per_node": 8, + "ranks": 16, + "hosts": [ + { + "name": "nccl-gpu-1", + "addr": "172.72.8.12", + "slots": 8 + }, + { + "name": "nccl-gpu-2", + "addr": "172.72.8.16", + "slots": 8 + } + ], + "command": "/usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun --allow-run-as-root --mca btl_openib_warn_no_device_params_found 0 --mca btl_tcp_if_include bond0 -H 172.72.8.12:8,172.72.8.16:8 --map-by ppr:8:node -np 16 -x NCCL_DEBUG=WARN -x NCCL_SOCKET_IFNAME=bond0 -x NCCL_IB_GID_INDEX=3 -x NCCL_IB_SL=5 -x NCCL_IB_TC=136 -x NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7 -x NCCL_IB_TIMEOUT=22 -x NCCL_IB_QPS_PER_CONNECTION=4 -x NCCL_MIN_NCHANNELS=4 -x NCCL_NET_PLUGIN=none -x NCCL_NVLS_ENABLE=1 -x NCCL_IB_SPLIT_DATA_ON_QPS=1 -x LD_LIBRARY_PATH=/usr/mpi/gcc/openmpi-4.1.9a1/lib:/root/gpu-test-venv/lib/python3.10/site-packages/nvidia/nccl/lib:/usr/local/cuda-12.4/targets/x86_64-linux/lib /opt/gpu-test-tools/nccl-tests/build/alltoall_perf -b 1k -e 256M -g 1 -f 2 -w 2", + "returncode": 0, + "status": "FAIL", + "peak_busbw_gbps": 8.64, + "peak_algbw_gbps": 9.21, + "peak_size": "2M", + "avg_busbw_gbps": 2.19, + "min_required_gbps": 20.0, + "wrong_count": 0, + "by_size": [ + { + "size_bytes": 1024, + "size": "1K", + "time_us": 58.44, + "algbw_gbps": 0.02, + "busbw_gbps": 0.02, + "wrong": 0 + }, + { + "size_bytes": 2048, + "size": "2K", + "time_us": 47.2, + "algbw_gbps": 0.04, + "busbw_gbps": 0.04, + "wrong": 0 + }, + { + "size_bytes": 4096, + "size": "4K", + "time_us": 47.68, + "algbw_gbps": 0.09, + "busbw_gbps": 0.08, + "wrong": 0 + }, + { + "size_bytes": 8192, + "size": "8K", + "time_us": 48.78, + "algbw_gbps": 0.17, + "busbw_gbps": 0.16, + "wrong": 0 + }, + { + "size_bytes": 16384, + "size": "16K", + "time_us": 79.34, + "algbw_gbps": 0.21, + "busbw_gbps": 0.19, + "wrong": 0 + }, + { + "size_bytes": 32768, + "size": "32K", + "time_us": 68.8, + "algbw_gbps": 0.48, + "busbw_gbps": 0.45, + "wrong": 0 + }, + { + "size_bytes": 65536, + "size": "64K", + "time_us": 49.86, + "algbw_gbps": 1.31, + "busbw_gbps": 1.23, + "wrong": 0 + }, + { + "size_bytes": 131072, + "size": "128K", + "time_us": 52.89, + "algbw_gbps": 2.48, + "busbw_gbps": 2.32, + "wrong": 0 + }, + { + "size_bytes": 262144, + "size": "256K", + "time_us": 3861.98, + "algbw_gbps": 0.07, + "busbw_gbps": 0.06, + "wrong": 0 + }, + { + "size_bytes": 524288, + "size": "512K", + "time_us": 83.38, + "algbw_gbps": 6.29, + "busbw_gbps": 5.89, + "wrong": 0 + }, + { + "size_bytes": 1048576, + "size": "1M", + "time_us": 182.32, + "algbw_gbps": 5.75, + "busbw_gbps": 5.39, + "wrong": 0 + }, + { + "size_bytes": 2097152, + "size": "2M", + "time_us": 227.67, + "algbw_gbps": 9.21, + "busbw_gbps": 8.64, + "wrong": 0 + }, + { + "size_bytes": 4194304, + "size": "4M", + "time_us": 6482.39, + "algbw_gbps": 0.65, + "busbw_gbps": 0.61, + "wrong": 0 + }, + { + "size_bytes": 8388608, + "size": "8M", + "time_us": 10348.9, + "algbw_gbps": 0.81, + "busbw_gbps": 0.76, + "wrong": 0 + }, + { + "size_bytes": 16777216, + "size": "16M", + "time_us": 18616.5, + "algbw_gbps": 0.9, + "busbw_gbps": 0.84, + "wrong": 0 + }, + { + "size_bytes": 33554432, + "size": "32M", + "time_us": 17170.7, + "algbw_gbps": 1.95, + "busbw_gbps": 1.83, + "wrong": 0 + }, + { + "size_bytes": 67108864, + "size": "64M", + "time_us": 35735.6, + "algbw_gbps": 1.88, + "busbw_gbps": 1.76, + "wrong": 0 + }, + { + "size_bytes": 134217728, + "size": "128M", + "time_us": 69388.5, + "algbw_gbps": 1.93, + "busbw_gbps": 1.81, + "wrong": 0 + }, + { + "size_bytes": 268435456, + "size": "256M", + "time_us": 96873.9, + "algbw_gbps": 2.77, + "busbw_gbps": 2.6, + "wrong": 0 + } + ], + "stderr_tail": "", + "stdout_tail": "56 6.85 6.42 N/A\n 1048576 16384 float none -1 182.32 5.75 5.39 0 169.19 6.20 5.81 N/A\n 2097152 32768 float none -1 227.67 9.21 8.64 0 3664.15 0.57 0.54 N/A\n 4194304 65536 float none -1 6482.39 0.65 0.61 0 553.24 7.58 7.11 N/A\n 8388608 131072 float none -1 10348.9 0.81 0.76 0 803.01 10.45 9.79 N/A\n 16777216 262144 float none -1 18616.5 0.90 0.84 0 4237.22 3.96 3.71 N/A\n 33554432 524288 float none -1 17170.7 1.95 1.83 0 20849.4 1.61 1.51 N/A\n 67108864 1048576 float none -1 35735.6 1.88 1.76 0 34524.7 1.94 1.82 N/A\n 134217728 2097152 float none -1 69388.5 1.93 1.81 0 63535.3 2.11 1.98 N/A\n 268435456 4194304 float none -1 96873.9 2.77 2.60 0 100742 2.66 2.50 N/A\n# Out of bounds values : 0 OK\n# Avg bus bandwidth : 2.19061 \n#\n# Collective test concluded: alltoall_perf\n#\n\n", + "started_at": "2026-05-23T04:59:54.886310", + "finished_at": "2026-05-23T05:00:28.796555" + } + ] + } + }, + "timestamp": "2026-05-23T05:00:28.796580" + }, + "timestamp": "2026-05-23T05:00:28.807561", + "hostname": "aikubeworker0012" +} \ No newline at end of file diff --git a/reports_multinode_nccl_smoke_256m_aikubeworker0012.md b/reports_multinode_nccl_smoke_256m_aikubeworker0012.md new file mode 100644 index 0000000..57fea2a --- /dev/null +++ b/reports_multinode_nccl_smoke_256m_aikubeworker0012.md @@ -0,0 +1,50 @@ +# GPU Test Report + +- **Date:** 2026-05-23T05:00:28.807561 +- **Host:** aikubeworker0012 + +## Overall Acceptance Verdict + +**Result: FAIL** + +Missing required evidence: +- GPU Info +- Health Check +- Memory Bandwidth +- Compute Throughput +- NVLink/NVSwitch +- NCCL +- Stress Test +- RDMA +- DCGM +- Training + +## Summary + +| Test | Result | +|------|--------| +| Multi-node NCCL | FAIL | + +## Multi-node NCCL / Cross Leaf + +Source: nccl-tests-mpirun | Mode: sweep + +- **Hosts:** nccl-gpu-1(172.72.8.12), nccl-gpu-2(172.72.8.16) +- **Preflight:** PASS (1 warnings) + +### Multi-node NCCL allreduce + +| Topology | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status | +|----------|-------------|-----------|------------|-----------|--------| +| 2 nodes x 8 GPUs | 39.32 GB/s | 4M | 9.10 GB/s | >= 100 GB/s | FAIL | + +### Multi-node NCCL alltoall + +| Topology | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status | +|----------|-------------|-----------|------------|-----------|--------| +| 2 nodes x 8 GPUs | 8.64 GB/s | 2M | 2.19 GB/s | >= 20 GB/s | FAIL | + +**Overall: FAIL** + +--- +*Generated by GPU Test Suite v0.2.0* \ No newline at end of file