Add multi-node NCCL sweep test
This commit is contained in:
parent
2a51be1ba3
commit
ac91f1aeb5
39
README.md
39
README.md
@ -375,6 +375,27 @@ nccl:
|
||||
repeats: 3
|
||||
max_stddev_pct: 3
|
||||
|
||||
multinode_nccl:
|
||||
enabled: false # true 时纳入 --test all
|
||||
hosts:
|
||||
- {name: nccl-gpu-1, addr: 172.72.8.12, slots: 8}
|
||||
- {name: nccl-gpu-2, addr: 172.72.8.16, slots: 8}
|
||||
tests: [all_reduce_perf, alltoall_perf]
|
||||
topologies:
|
||||
- {nodes: 2, gpus_per_node: 8}
|
||||
mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun
|
||||
extra_ld_library_path: # 传给远端 rank 的 MPI/NCCL/CUDA 库路径
|
||||
- /usr/mpi/gcc/openmpi-4.1.9a1/lib
|
||||
- /root/gpu-test-venv/lib/python3.10/site-packages/nvidia/nccl/lib
|
||||
- /usr/local/cuda-12.4/targets/x86_64-linux/lib
|
||||
begin_size: 1k
|
||||
end_size: 16g
|
||||
step_factor: 2
|
||||
warmup_iters: 10
|
||||
socket_ifname: bond0
|
||||
ib_gid_index: 3
|
||||
ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7
|
||||
|
||||
stress:
|
||||
duration_sec: 1800 # 压力测试时长
|
||||
use_gpu_burn: false # 默认走 PyTorch GEMM stress
|
||||
@ -539,16 +560,14 @@ report:
|
||||
└── 异常: 检查 IB 线缆、交换机配置、子网管理器
|
||||
|
||||
步骤 3: 多节点 NCCL 测试
|
||||
├── 在每个节点上配置:
|
||||
│ export MASTER_ADDR=<主节点IP>
|
||||
│ export MASTER_PORT=29500
|
||||
│ export NCCL_SOCKET_IFNAME=ib0 # IB 网卡名
|
||||
│ export NCCL_DEBUG=INFO
|
||||
├── 运行 nccl-tests 手动测试:
|
||||
│ mpirun -np <总GPU数> -hostfile hosts \
|
||||
│ /opt/gpu-test-tools/nccl-tests/build/all_reduce_perf \
|
||||
│ -b 8 -e 256M -f 2 -g 1 -w 5 -n 20
|
||||
└── 确认: 多节点 AllReduce 带宽正常
|
||||
├── 在发起节点确认 mpirun、nccl-tests、跨节点 root SSH 可用
|
||||
├── 配置 configs/default.yaml 的 multinode_nccl.hosts / IB 参数
|
||||
├── 执行 PDF 风格 sweep:
|
||||
│ python3 gpu_tester.py --test multinode-nccl --report --format md
|
||||
├── 默认命令口径:
|
||||
│ mpirun -H <node1>:8,<node2>:8 --map-by ppr:8:node -np 16 \
|
||||
│ all_reduce_perf/alltoall_perf -b 1k -e 16g -f 2 -g 1 -w 10
|
||||
└── 确认: Peak Bus BW、Peak Size、wrong_count 正常
|
||||
|
||||
步骤 4: 训练验证
|
||||
├── python3 gpu_tester.py --test training
|
||||
|
||||
@ -41,6 +41,52 @@ nccl:
|
||||
repeats: 3
|
||||
max_stddev_pct: 3
|
||||
|
||||
multinode_nccl:
|
||||
enabled: false
|
||||
mode: sweep
|
||||
hosts:
|
||||
- name: nccl-gpu-1
|
||||
addr: 172.72.8.12
|
||||
slots: 8
|
||||
- name: nccl-gpu-2
|
||||
addr: 172.72.8.16
|
||||
slots: 8
|
||||
ssh_user: root
|
||||
ssh_preflight: true
|
||||
mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun
|
||||
mpi_ld_preload: null
|
||||
extra_ld_library_path:
|
||||
- /usr/mpi/gcc/openmpi-4.1.9a1/lib
|
||||
- /root/gpu-test-venv/lib/python3.10/site-packages/nvidia/nccl/lib
|
||||
- /usr/local/cuda-12.4/targets/x86_64-linux/lib
|
||||
nccl_tests_dir: null # null = tools.install_dir/nccl-tests/build
|
||||
tests:
|
||||
- all_reduce_perf
|
||||
- alltoall_perf
|
||||
topologies:
|
||||
- nodes: 2
|
||||
gpus_per_node: 8
|
||||
begin_size: 1k
|
||||
end_size: 16g
|
||||
step_factor: 2
|
||||
warmup_iters: 10
|
||||
gpus_per_rank: 1
|
||||
timeout_sec: 1800
|
||||
socket_ifname: bond0
|
||||
ib_gid_index: 3
|
||||
ib_sl: 5
|
||||
ib_tc: 136
|
||||
ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7
|
||||
ib_timeout: 22
|
||||
qps_per_connection: 4
|
||||
min_nchannels: 4
|
||||
net_plugin: none
|
||||
nvls_enable: 1
|
||||
split_data_on_qps: 1
|
||||
min_peak_busbw_gbps:
|
||||
allreduce: 480
|
||||
alltoall: 75
|
||||
|
||||
stress:
|
||||
duration_sec: 1800
|
||||
production_duration_sec: 1800
|
||||
|
||||
@ -28,6 +28,7 @@ from modules.stress_test import StressTest
|
||||
from modules.rdma_test import RDMATest
|
||||
from modules.nvlink_test import NVLinkTest
|
||||
from modules.dcgm_test import DCGMTest
|
||||
from modules.multinode_nccl_test import MultiNodeNCCLTest
|
||||
from modules.report import ReportGenerator
|
||||
from modules.gpu_specs import detect_gpu_type, get_gpu_specs, get_gpu_label, get_supported_gpus, validate_driver_compatibility
|
||||
|
||||
@ -55,6 +56,44 @@ DEFAULT_CONFIG = {
|
||||
"repeats": 3,
|
||||
"max_stddev_pct": 3,
|
||||
},
|
||||
"multinode_nccl": {
|
||||
"enabled": False,
|
||||
"mode": "sweep",
|
||||
"hosts": [
|
||||
{"name": "nccl-gpu-1", "addr": "172.72.8.12", "slots": 8},
|
||||
{"name": "nccl-gpu-2", "addr": "172.72.8.16", "slots": 8},
|
||||
],
|
||||
"ssh_user": "root",
|
||||
"ssh_preflight": True,
|
||||
"mpirun_path": "/usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun",
|
||||
"mpi_ld_preload": None,
|
||||
"extra_ld_library_path": [
|
||||
"/usr/mpi/gcc/openmpi-4.1.9a1/lib",
|
||||
"/root/gpu-test-venv/lib/python3.10/site-packages/nvidia/nccl/lib",
|
||||
"/usr/local/cuda-12.4/targets/x86_64-linux/lib",
|
||||
],
|
||||
"nccl_tests_dir": None,
|
||||
"tests": ["all_reduce_perf", "alltoall_perf"],
|
||||
"topologies": [{"nodes": 2, "gpus_per_node": 8}],
|
||||
"begin_size": "1k",
|
||||
"end_size": "16g",
|
||||
"step_factor": 2,
|
||||
"warmup_iters": 10,
|
||||
"gpus_per_rank": 1,
|
||||
"timeout_sec": 1800,
|
||||
"socket_ifname": "bond0",
|
||||
"ib_gid_index": 3,
|
||||
"ib_sl": 5,
|
||||
"ib_tc": 136,
|
||||
"ib_hca": "mlx5_0,mlx5_1,mlx5_6,mlx5_7",
|
||||
"ib_timeout": 22,
|
||||
"qps_per_connection": 4,
|
||||
"min_nchannels": 4,
|
||||
"net_plugin": "none",
|
||||
"nvls_enable": 1,
|
||||
"split_data_on_qps": 1,
|
||||
"min_peak_busbw_gbps": {"allreduce": 480, "alltoall": 75},
|
||||
},
|
||||
"stress": {
|
||||
"duration_sec": 1800,
|
||||
"production_duration_sec": 1800,
|
||||
@ -191,7 +230,8 @@ def interactive_menu(config: dict):
|
||||
("8", "NVLink/NVSwitch Test", "nvlink"),
|
||||
("9", "DCGM Diagnostic", "dcgm"),
|
||||
("10", "Training Simulation", "training"),
|
||||
("11", "Full Test Suite (All Tests)", "all"),
|
||||
("11", "Multi-node NCCL Test", "multinode_nccl"),
|
||||
("12", "Full Test Suite (All Tests)", "all"),
|
||||
("0", "Generate Report", "report"),
|
||||
]
|
||||
|
||||
@ -218,6 +258,7 @@ def interactive_menu(config: dict):
|
||||
"nvlink": "NVLink links, speed, and error counters",
|
||||
"dcgm": "DCGM diag -r 3 production diagnostic",
|
||||
"training": "Simulate LLM training with PyTorch",
|
||||
"multinode_nccl": "Cross-node NCCL via mpirun/nccl-tests",
|
||||
"all": "Run all tests sequentially",
|
||||
"report": "Export results to JSON/HTML",
|
||||
}
|
||||
@ -326,6 +367,12 @@ def _run_test(test_name: str, config: dict, console: Console) -> dict:
|
||||
m.print_results(result)
|
||||
return result
|
||||
|
||||
elif test_name == "multinode_nccl":
|
||||
m = MultiNodeNCCLTest(config)
|
||||
result = m.run()
|
||||
m.print_results(result)
|
||||
return result
|
||||
|
||||
elif test_name == "all":
|
||||
return _run_full_suite(config, console)
|
||||
|
||||
@ -356,6 +403,8 @@ def _run_full_suite(config: dict, console: Console) -> dict:
|
||||
("dcgm", "DCGM Diagnostic", DCGMTest),
|
||||
("training", "Training Simulation", TrainingSim),
|
||||
]
|
||||
if (config.get("multinode_nccl", {}) or {}).get("enabled"):
|
||||
tests.append(("multinode_nccl", "Multi-node NCCL Test", MultiNodeNCCLTest))
|
||||
|
||||
for i, (key, name, mod_cls) in enumerate(tests, 1):
|
||||
console.print(f"\n[bold cyan][{i}/{len(tests)}] {name}[/bold cyan]")
|
||||
@ -435,6 +484,7 @@ Examples:
|
||||
python gpu_tester.py --test benchmark --type memory
|
||||
python gpu_tester.py --test benchmark --type compute --dtype fp16
|
||||
python gpu_tester.py --test nccl # NCCL test
|
||||
python gpu_tester.py --test multinode-nccl # Cross-node NCCL test
|
||||
python gpu_tester.py --test nvlink # NVLink/NVSwitch test
|
||||
python gpu_tester.py --test dcgm # DCGM diagnostic
|
||||
python gpu_tester.py --test training # Training sim
|
||||
@ -442,7 +492,7 @@ Examples:
|
||||
python gpu_tester.py --report --format json --output report.json
|
||||
""",
|
||||
)
|
||||
parser.add_argument("--test", choices=["gpu-info", "health", "benchmark", "nccl", "stress", "rdma", "nvlink", "dcgm", "training", "all"],
|
||||
parser.add_argument("--test", choices=["gpu-info", "health", "benchmark", "nccl", "multinode-nccl", "stress", "rdma", "nvlink", "dcgm", "training", "all"],
|
||||
help="Run a specific test")
|
||||
parser.add_argument("--type", choices=["memory", "compute"], help="Benchmark type (with --test benchmark)")
|
||||
parser.add_argument("--dtype", choices=["fp32", "tf32", "fp16", "bf16", "fp8", "fp64", "int8"],
|
||||
@ -499,6 +549,7 @@ Examples:
|
||||
"health": "health",
|
||||
"benchmark": None,
|
||||
"nccl": "nccl",
|
||||
"multinode-nccl": "multinode_nccl",
|
||||
"stress": "stress",
|
||||
"rdma": "rdma",
|
||||
"nvlink": "nvlink",
|
||||
|
||||
388
modules/multinode_nccl_test.py
Normal file
388
modules/multinode_nccl_test.py
Normal file
@ -0,0 +1,388 @@
|
||||
"""Multi-node NCCL benchmark wrapper for nccl-tests via mpirun."""
|
||||
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
from rich.console import Console
|
||||
from rich.table import Table
|
||||
|
||||
from modules.gpu_specs import resolve_tools_dir
|
||||
|
||||
|
||||
_TEST_ALIASES = {
|
||||
"allreduce": "all_reduce_perf",
|
||||
"all_reduce": "all_reduce_perf",
|
||||
"all_reduce_perf": "all_reduce_perf",
|
||||
"alltoall": "alltoall_perf",
|
||||
"all_to_all": "alltoall_perf",
|
||||
"alltoall_perf": "alltoall_perf",
|
||||
}
|
||||
|
||||
_OP_LABELS = {
|
||||
"all_reduce_perf": "allreduce",
|
||||
"alltoall_perf": "alltoall",
|
||||
}
|
||||
|
||||
|
||||
class MultiNodeNCCLTest:
|
||||
"""Run cross-node NCCL tests with a PDF-style message-size sweep."""
|
||||
|
||||
def __init__(self, config: dict):
    """Store the full config and cache the ``multinode_nccl`` section.

    Args:
        config: Whole tester configuration. The ``multinode_nccl`` key may
            be missing or null; both collapse to an empty dict.
    """
    self.config = config
    section = config.get("multinode_nccl")
    self.cfg = section or {}
    self.tools_dir = resolve_tools_dir(config)
    self.console = Console()
|
||||
|
||||
def _find_mpirun(self) -> Optional[str]:
|
||||
configured = self.cfg.get("mpirun_path")
|
||||
if configured and os.path.isfile(str(configured)) and os.access(str(configured), os.X_OK):
|
||||
return str(configured)
|
||||
for cmd in ["mpirun", "mpiexec", os.path.join(self.tools_dir, "mpi", "bin", "mpirun")]:
|
||||
found = shutil.which(cmd)
|
||||
if found:
|
||||
return found
|
||||
return None
|
||||
|
||||
def _find_nccl_test(self, binary_name: str) -> Optional[str]:
|
||||
configured = self.cfg.get("nccl_tests_dir")
|
||||
candidates = []
|
||||
if configured:
|
||||
candidates.append(os.path.join(configured, binary_name))
|
||||
candidates.append(os.path.join(self.tools_dir, "nccl-tests", "build", binary_name))
|
||||
found = shutil.which(binary_name)
|
||||
if found:
|
||||
candidates.insert(0, found)
|
||||
|
||||
for path in candidates:
|
||||
if path and os.path.isfile(path) and os.access(path, os.X_OK):
|
||||
return path
|
||||
return None
|
||||
|
||||
def _tests(self) -> list[str]:
    """Return the configured tests as canonical nccl-tests binary names.

    Names are matched case-insensitively against ``_TEST_ALIASES``; unknown
    names are dropped and duplicates collapse to their first occurrence.
    An empty or missing ``tests`` setting selects both supported binaries.
    """
    requested = self.cfg.get("tests") or ["all_reduce_perf", "alltoall_perf"]
    resolved = (_TEST_ALIASES.get(str(entry).lower()) for entry in requested)
    # dict.fromkeys keeps first-seen order while removing duplicates.
    return list(dict.fromkeys(binary for binary in resolved if binary))
|
||||
|
||||
def _hosts(self) -> list[dict]:
|
||||
hosts = self.cfg.get("hosts") or []
|
||||
normalized = []
|
||||
for host in hosts:
|
||||
if isinstance(host, str):
|
||||
normalized.append({"addr": host, "slots": 8})
|
||||
elif isinstance(host, dict):
|
||||
normalized.append({
|
||||
"name": host.get("name") or host.get("addr"),
|
||||
"addr": host.get("addr") or host.get("host") or host.get("ip"),
|
||||
"slots": int(host.get("slots", 8)),
|
||||
})
|
||||
return [h for h in normalized if h.get("addr")]
|
||||
|
||||
def _topologies(self) -> list[dict]:
|
||||
topologies = self.cfg.get("topologies") or [{"nodes": 2, "gpus_per_node": 8}]
|
||||
normalized = []
|
||||
for topo in topologies:
|
||||
nodes = int(topo.get("nodes", 2))
|
||||
gpus_per_node = int(topo.get("gpus_per_node", topo.get("gpn", 8)))
|
||||
normalized.append({
|
||||
"nodes": nodes,
|
||||
"gpus_per_node": gpus_per_node,
|
||||
"label": topo.get("label") or f"{nodes} nodes x {gpus_per_node} GPUs",
|
||||
})
|
||||
return normalized
|
||||
|
||||
def _env_exports(self) -> list[tuple[str, str]]:
|
||||
env_cfg = {
|
||||
"NCCL_DEBUG": self.cfg.get("debug", "WARN"),
|
||||
"NCCL_SOCKET_IFNAME": self.cfg.get("socket_ifname"),
|
||||
"NCCL_IB_GID_INDEX": self.cfg.get("ib_gid_index"),
|
||||
"NCCL_IB_SL": self.cfg.get("ib_sl"),
|
||||
"NCCL_IB_TC": self.cfg.get("ib_tc"),
|
||||
"NCCL_IB_HCA": self.cfg.get("ib_hca"),
|
||||
"NCCL_IB_TIMEOUT": self.cfg.get("ib_timeout"),
|
||||
"NCCL_IB_QPS_PER_CONNECTION": self.cfg.get("qps_per_connection"),
|
||||
"NCCL_MIN_NCHANNELS": self.cfg.get("min_nchannels"),
|
||||
"NCCL_NET_PLUGIN": self.cfg.get("net_plugin"),
|
||||
"NCCL_NVLS_ENABLE": self.cfg.get("nvls_enable"),
|
||||
"NCCL_IB_SPLIT_DATA_ON_QPS": self.cfg.get("split_data_on_qps"),
|
||||
}
|
||||
mpi_ld_preload = self._mpi_ld_preload()
|
||||
if mpi_ld_preload:
|
||||
env_cfg["LD_PRELOAD"] = mpi_ld_preload
|
||||
extra_ld_library_path = self._extra_ld_library_path()
|
||||
if extra_ld_library_path:
|
||||
existing = os.environ.get("LD_LIBRARY_PATH", "")
|
||||
env_cfg["LD_LIBRARY_PATH"] = ":".join(
|
||||
[extra_ld_library_path] + ([existing] if existing else [])
|
||||
)
|
||||
return [(k, str(v)) for k, v in env_cfg.items() if v is not None]
|
||||
|
||||
def _mpi_ld_preload(self) -> str:
|
||||
preload = self.cfg.get("mpi_ld_preload")
|
||||
if isinstance(preload, list):
|
||||
return " ".join(str(p) for p in preload if p)
|
||||
return str(preload) if preload else ""
|
||||
|
||||
def _runtime_env(self) -> dict:
|
||||
env = os.environ.copy()
|
||||
mpi_ld_preload = self._mpi_ld_preload()
|
||||
if mpi_ld_preload:
|
||||
env["LD_PRELOAD"] = mpi_ld_preload
|
||||
extra_ld_library_path = self._extra_ld_library_path()
|
||||
if extra_ld_library_path:
|
||||
existing = env.get("LD_LIBRARY_PATH", "")
|
||||
env["LD_LIBRARY_PATH"] = ":".join(
|
||||
[extra_ld_library_path] + ([existing] if existing else [])
|
||||
)
|
||||
return env
|
||||
|
||||
def _extra_ld_library_path(self) -> str:
|
||||
paths = self.cfg.get("extra_ld_library_path")
|
||||
if isinstance(paths, list):
|
||||
return ":".join(str(p) for p in paths if p)
|
||||
return str(paths) if paths else ""
|
||||
|
||||
def _preflight(self, mpirun: Optional[str], tests: list[str], hosts: list[dict]) -> dict:
|
||||
checks = []
|
||||
checks.append({"name": "mpirun", "status": "PASS" if mpirun else "FAIL", "detail": mpirun or "not found"})
|
||||
checks.append({"name": "hosts", "status": "PASS" if len(hosts) >= 2 else "FAIL", "detail": f"{len(hosts)} configured"})
|
||||
for binary in tests:
|
||||
path = self._find_nccl_test(binary)
|
||||
checks.append({"name": binary, "status": "PASS" if path else "FAIL", "detail": path or "not found"})
|
||||
|
||||
if self.cfg.get("ssh_preflight", True):
|
||||
user = self.cfg.get("ssh_user", "root")
|
||||
for host in hosts:
|
||||
target = f"{user}@{host['addr']}"
|
||||
cmd = ["ssh", "-o", "BatchMode=yes", "-o", "ConnectTimeout=5", target, "hostname"]
|
||||
try:
|
||||
r = subprocess.run(cmd, capture_output=True, text=True, timeout=8, env=self._runtime_env())
|
||||
detail = r.stdout.strip() or r.stderr.strip()[:120]
|
||||
checks.append({
|
||||
"name": f"ssh {host['addr']}",
|
||||
"status": "PASS" if r.returncode == 0 else "WARN",
|
||||
"detail": detail,
|
||||
})
|
||||
except Exception as e:
|
||||
checks.append({"name": f"ssh {host['addr']}", "status": "WARN", "detail": str(e)})
|
||||
|
||||
return {
|
||||
"checks": checks,
|
||||
"passed": all(c["status"] == "PASS" for c in checks if not c["name"].startswith("ssh ")),
|
||||
}
|
||||
|
||||
def run(self) -> dict:
    """Run every configured test on every configured topology.

    Returns:
        A report dict with ``passed``, ``source``, ``mode``, ``hosts``,
        ``preflight``, ``tests`` and ``timestamp``. ``error`` is added when
        the preflight fails or when nothing was executed. ``passed`` is
        true only if at least one topology ran and all of them report
        ``"PASS"``.
    """
    mpirun = self._find_mpirun()
    tests = self._tests()
    hosts = self._hosts()
    topologies = self._topologies()
    preflight = self._preflight(mpirun, tests, hosts)

    report = {
        "passed": False,
        "source": "nccl-tests-mpirun",
        "mode": self.cfg.get("mode", "sweep"),
        "hosts": hosts,
        "preflight": preflight,
        "tests": {},
    }
    if not preflight["passed"]:
        report["error"] = "multinode NCCL preflight failed"
        report["timestamp"] = datetime.now().isoformat()
        return report

    results = {}
    statuses = []
    for binary in tests:
        label = _OP_LABELS[binary]
        binary_path = self._find_nccl_test(binary)
        op_results = [
            self._run_topology(mpirun, binary_path, label, hosts, topo)
            for topo in topologies
        ]
        statuses.extend(topo_result.get("status") for topo_result in op_results)
        results[label] = {"binary": binary_path, "topologies": op_results}

    report["tests"] = results
    if statuses:
        report["passed"] = all(status == "PASS" for status in statuses)
    else:
        # all() over nothing is True; an empty run must not count as a pass.
        report["error"] = "no multinode NCCL tests were executed"
    report["timestamp"] = datetime.now().isoformat()
    return report
|
||||
|
||||
def _run_topology(self, mpirun: str, binary: str, label: str, hosts: list[dict], topo: dict) -> dict:
|
||||
nodes = topo["nodes"]
|
||||
gpus_per_node = topo["gpus_per_node"]
|
||||
selected_hosts = hosts[:nodes]
|
||||
host_arg = ",".join(f"{h['addr']}:{gpus_per_node}" for h in selected_hosts)
|
||||
ranks = nodes * gpus_per_node
|
||||
|
||||
cmd = [
|
||||
mpirun,
|
||||
"--allow-run-as-root",
|
||||
"--mca", "btl_openib_warn_no_device_params_found", "0",
|
||||
"--mca", "btl_tcp_if_include", str(self.cfg.get("socket_ifname", "bond0")),
|
||||
"-H", host_arg,
|
||||
"--map-by", f"ppr:{gpus_per_node}:node",
|
||||
"-np", str(ranks),
|
||||
]
|
||||
for key, value in self._env_exports():
|
||||
cmd.extend(["-x", f"{key}={value}"])
|
||||
|
||||
cmd.extend([
|
||||
binary,
|
||||
"-b", str(self.cfg.get("begin_size", "1k")),
|
||||
"-e", str(self.cfg.get("end_size", "16g")),
|
||||
"-g", str(self.cfg.get("gpus_per_rank", 1)),
|
||||
"-f", str(self.cfg.get("step_factor", 2)),
|
||||
"-w", str(self.cfg.get("warmup_iters", 10)),
|
||||
])
|
||||
if self.cfg.get("iters") is not None:
|
||||
cmd.extend(["-n", str(self.cfg["iters"])])
|
||||
|
||||
timeout = int(self.cfg.get("timeout_sec", 1800))
|
||||
started = datetime.now().isoformat()
|
||||
try:
|
||||
r = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout, env=self._runtime_env())
|
||||
except subprocess.TimeoutExpired:
|
||||
return {
|
||||
"label": topo["label"],
|
||||
"nodes": nodes,
|
||||
"gpus_per_node": gpus_per_node,
|
||||
"ranks": ranks,
|
||||
"hosts": selected_hosts,
|
||||
"command": " ".join(cmd),
|
||||
"status": "FAIL",
|
||||
"error": f"timeout after {timeout}s",
|
||||
"started_at": started,
|
||||
}
|
||||
|
||||
parsed = self._parse_nccl_output(r.stdout)
|
||||
threshold = self._threshold_for(label)
|
||||
wrong = sum(row.get("wrong", 0) for row in parsed["by_size"])
|
||||
has_bw = parsed["peak_busbw_gbps"] > 0
|
||||
status = "PASS" if r.returncode == 0 and has_bw and wrong == 0 and parsed["peak_busbw_gbps"] >= threshold else "FAIL"
|
||||
return {
|
||||
"label": topo["label"],
|
||||
"nodes": nodes,
|
||||
"gpus_per_node": gpus_per_node,
|
||||
"ranks": ranks,
|
||||
"hosts": selected_hosts,
|
||||
"command": " ".join(cmd),
|
||||
"returncode": r.returncode,
|
||||
"status": status,
|
||||
"peak_busbw_gbps": parsed["peak_busbw_gbps"],
|
||||
"peak_algbw_gbps": parsed["peak_algbw_gbps"],
|
||||
"peak_size": parsed["peak_size"],
|
||||
"avg_busbw_gbps": parsed["avg_busbw_gbps"],
|
||||
"min_required_gbps": threshold,
|
||||
"wrong_count": wrong,
|
||||
"by_size": parsed["by_size"],
|
||||
"stderr_tail": r.stderr[-1200:],
|
||||
"stdout_tail": r.stdout[-1200:],
|
||||
"started_at": started,
|
||||
"finished_at": datetime.now().isoformat(),
|
||||
}
|
||||
|
||||
def _threshold_for(self, label: str) -> float:
|
||||
thresholds = self.cfg.get("min_peak_busbw_gbps") or {}
|
||||
if isinstance(thresholds, dict):
|
||||
return float(thresholds.get(label, 0) or 0)
|
||||
return float(thresholds or 0)
|
||||
|
||||
@staticmethod
|
||||
def _parse_nccl_output(stdout: str) -> dict:
|
||||
rows = []
|
||||
avg_bus = 0.0
|
||||
for line in stdout.splitlines():
|
||||
stripped = line.strip()
|
||||
if not stripped:
|
||||
continue
|
||||
avg_match = re.search(r"Avg bus bandwidth\s*:\s*([0-9.]+)", stripped)
|
||||
if avg_match:
|
||||
avg_bus = float(avg_match.group(1))
|
||||
continue
|
||||
if stripped.startswith("#"):
|
||||
continue
|
||||
parts = stripped.split()
|
||||
if len(parts) < 9:
|
||||
continue
|
||||
try:
|
||||
size_bytes = int(parts[0])
|
||||
time_us = float(parts[5])
|
||||
algbw = float(parts[6])
|
||||
busbw = float(parts[7])
|
||||
wrong = int(parts[8])
|
||||
except (ValueError, IndexError):
|
||||
continue
|
||||
rows.append({
|
||||
"size_bytes": size_bytes,
|
||||
"size": _format_size(size_bytes),
|
||||
"time_us": time_us,
|
||||
"algbw_gbps": algbw,
|
||||
"busbw_gbps": busbw,
|
||||
"wrong": wrong,
|
||||
})
|
||||
|
||||
peak_row = max(rows, key=lambda r: r["busbw_gbps"], default={})
|
||||
return {
|
||||
"peak_busbw_gbps": round(float(peak_row.get("busbw_gbps", 0)), 2),
|
||||
"peak_algbw_gbps": round(float(peak_row.get("algbw_gbps", 0)), 2),
|
||||
"peak_size": peak_row.get("size", ""),
|
||||
"avg_busbw_gbps": round(avg_bus, 2),
|
||||
"by_size": rows,
|
||||
}
|
||||
|
||||
@staticmethod
def print_results(results: dict, console: Console = None):
    """Render a run report as rich tables.

    Prints a one-line verdict, then the preflight checks (if any), then one
    table per operation with a row per topology.

    Args:
        results: Report dict as produced by ``run``.
        console: Console to print to; a new one is created when omitted.
    """
    out = console if console else Console()

    if results.get("error"):
        out.print(f"[bold red]Multi-node NCCL failed: {results['error']}[/bold red]")
    elif results.get("passed"):
        out.print("[bold green]Multi-node NCCL complete[/bold green]")
    else:
        out.print("[bold red]Multi-node NCCL failed[/bold red]")

    checks = results.get("preflight", {}).get("checks")
    if checks:
        preflight_table = Table(title="Preflight")
        for heading in ("Check", "Status", "Detail"):
            preflight_table.add_column(heading)
        for entry in checks:
            preflight_table.add_row(entry["name"], entry["status"], str(entry.get("detail", "")))
        out.print(preflight_table)

    per_operation = results.get("tests") or {}
    for op, data in per_operation.items():
        op_table = Table(title=f"Multi-node NCCL {op}")
        for heading in ("Topology", "Peak Bus BW", "Peak Size", "Threshold", "Status"):
            op_table.add_column(heading)
        for topo in data.get("topologies", []):
            required = topo.get("min_required_gbps")
            # No threshold configured (or zero) is shown as a dash.
            threshold_cell = f">= {required:.0f} GB/s" if required else "-"
            op_table.add_row(
                topo.get("label", ""),
                f"{topo.get('peak_busbw_gbps', 0):.2f} GB/s",
                str(topo.get("peak_size", "")),
                threshold_cell,
                topo.get("status", "?"),
            )
        out.print(op_table)
|
||||
|
||||
|
||||
def _format_size(size_bytes: int) -> str:
|
||||
units = [("G", 1024 ** 3), ("M", 1024 ** 2), ("K", 1024)]
|
||||
for suffix, factor in units:
|
||||
if size_bytes >= factor and size_bytes % factor == 0:
|
||||
return f"{size_bytes // factor}{suffix}"
|
||||
return str(size_bytes)
|
||||
@ -464,6 +464,47 @@ class ReportGenerator:
|
||||
passed = nccl.get("passed", False)
|
||||
lines.append(f"**Overall: {'PASS' if passed else 'FAIL'}**\n")
|
||||
|
||||
multinode = results.get("multinode_nccl")
|
||||
if multinode and not multinode.get("error"):
|
||||
lines.append("## Multi-node NCCL / Cross Leaf\n")
|
||||
lines.append(f"Source: {multinode.get('source', 'unknown')} | Mode: {multinode.get('mode', 'unknown')}\n")
|
||||
hosts = multinode.get("hosts", [])
|
||||
if hosts:
|
||||
host_text = ", ".join(f"{h.get('name') or h.get('addr')}({h.get('addr')})" for h in hosts)
|
||||
lines.append(f"- **Hosts:** {host_text}")
|
||||
preflight = multinode.get("preflight", {})
|
||||
if preflight.get("checks"):
|
||||
failed_checks = [c for c in preflight["checks"] if c.get("status") == "FAIL"]
|
||||
warn_checks = [c for c in preflight["checks"] if c.get("status") == "WARN"]
|
||||
lines.append(f"- **Preflight:** {'PASS' if not failed_checks else 'FAIL'}"
|
||||
f"{f' ({len(warn_checks)} warnings)' if warn_checks else ''}")
|
||||
lines.append("")
|
||||
for op, data in (multinode.get("tests") or {}).items():
|
||||
lines.append(f"### Multi-node NCCL {op}\n")
|
||||
lines.append("| Topology | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |")
|
||||
lines.append("|----------|-------------|-----------|------------|-----------|--------|")
|
||||
for topo in data.get("topologies", []):
|
||||
threshold = topo.get("min_required_gbps", 0) or 0
|
||||
threshold_text = f">= {threshold:.0f} GB/s" if threshold else "-"
|
||||
lines.append(
|
||||
f"| {topo.get('label', '')} | {topo.get('peak_busbw_gbps', 0):.2f} GB/s | "
|
||||
f"{topo.get('peak_size', '')} | {topo.get('avg_busbw_gbps', 0):.2f} GB/s | "
|
||||
f"{threshold_text} | {topo.get('status', '?')} |"
|
||||
)
|
||||
lines.append("")
|
||||
lines.append(f"**Overall: {'PASS' if multinode.get('passed') else 'FAIL'}**\n")
|
||||
elif multinode and multinode.get("error"):
|
||||
lines.append("## Multi-node NCCL / Cross Leaf\n")
|
||||
lines.append(f"**Overall: FAIL** ({multinode.get('error')})\n")
|
||||
preflight = multinode.get("preflight", {})
|
||||
if preflight.get("checks"):
|
||||
lines.append("| Check | Status | Detail |")
|
||||
lines.append("|-------|--------|--------|")
|
||||
for check in preflight["checks"]:
|
||||
detail = str(check.get("detail", "")).replace("\n", " ")
|
||||
lines.append(f"| {check.get('name', '')} | {check.get('status', '')} | {detail} |")
|
||||
lines.append("")
|
||||
|
||||
# --- Stress Test ---
|
||||
stress = results.get("stress")
|
||||
if stress and not stress.get("error"):
|
||||
@ -836,6 +877,15 @@ class ReportGenerator:
|
||||
else:
|
||||
items.append(("NCCL", "FAIL"))
|
||||
|
||||
if "multinode_nccl" in results:
|
||||
mn = results["multinode_nccl"]
|
||||
if mn.get("error"):
|
||||
items.append(("Multi-node NCCL", f"ERROR: {mn['error']}"))
|
||||
elif mn.get("passed"):
|
||||
items.append(("Multi-node NCCL", "PASS"))
|
||||
else:
|
||||
items.append(("Multi-node NCCL", "FAIL"))
|
||||
|
||||
# Stress
|
||||
if "stress" in results:
|
||||
s = results["stress"]
|
||||
|
||||
439
reports_multinode_nccl_smoke_256m_aikubeworker0012.json
Normal file
439
reports_multinode_nccl_smoke_256m_aikubeworker0012.json
Normal file
@ -0,0 +1,439 @@
|
||||
{
|
||||
"multinode_nccl": {
|
||||
"passed": false,
|
||||
"source": "nccl-tests-mpirun",
|
||||
"mode": "sweep",
|
||||
"hosts": [
|
||||
{
|
||||
"name": "nccl-gpu-1",
|
||||
"addr": "172.72.8.12",
|
||||
"slots": 8
|
||||
},
|
||||
{
|
||||
"name": "nccl-gpu-2",
|
||||
"addr": "172.72.8.16",
|
||||
"slots": 8
|
||||
}
|
||||
],
|
||||
"preflight": {
|
||||
"checks": [
|
||||
{
|
||||
"name": "mpirun",
|
||||
"status": "PASS",
|
||||
"detail": "/usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun"
|
||||
},
|
||||
{
|
||||
"name": "hosts",
|
||||
"status": "PASS",
|
||||
"detail": "2 configured"
|
||||
},
|
||||
{
|
||||
"name": "all_reduce_perf",
|
||||
"status": "PASS",
|
||||
"detail": "/opt/gpu-test-tools/nccl-tests/build/all_reduce_perf"
|
||||
},
|
||||
{
|
||||
"name": "alltoall_perf",
|
||||
"status": "PASS",
|
||||
"detail": "/opt/gpu-test-tools/nccl-tests/build/alltoall_perf"
|
||||
},
|
||||
{
|
||||
"name": "ssh 172.72.8.12",
|
||||
"status": "WARN",
|
||||
"detail": "Host key verification failed."
|
||||
},
|
||||
{
|
||||
"name": "ssh 172.72.8.16",
|
||||
"status": "PASS",
|
||||
"detail": "aikubeworker0016"
|
||||
}
|
||||
],
|
||||
"passed": true
|
||||
},
|
||||
"tests": {
|
||||
"allreduce": {
|
||||
"binary": "/opt/gpu-test-tools/nccl-tests/build/all_reduce_perf",
|
||||
"topologies": [
|
||||
{
|
||||
"label": "2 nodes x 8 GPUs",
|
||||
"nodes": 2,
|
||||
"gpus_per_node": 8,
|
||||
"ranks": 16,
|
||||
"hosts": [
|
||||
{
|
||||
"name": "nccl-gpu-1",
|
||||
"addr": "172.72.8.12",
|
||||
"slots": 8
|
||||
},
|
||||
{
|
||||
"name": "nccl-gpu-2",
|
||||
"addr": "172.72.8.16",
|
||||
"slots": 8
|
||||
}
|
||||
],
|
||||
"command": "/usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun --allow-run-as-root --mca btl_openib_warn_no_device_params_found 0 --mca btl_tcp_if_include bond0 -H 172.72.8.12:8,172.72.8.16:8 --map-by ppr:8:node -np 16 -x NCCL_DEBUG=WARN -x NCCL_SOCKET_IFNAME=bond0 -x NCCL_IB_GID_INDEX=3 -x NCCL_IB_SL=5 -x NCCL_IB_TC=136 -x NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7 -x NCCL_IB_TIMEOUT=22 -x NCCL_IB_QPS_PER_CONNECTION=4 -x NCCL_MIN_NCHANNELS=4 -x NCCL_NET_PLUGIN=none -x NCCL_NVLS_ENABLE=1 -x NCCL_IB_SPLIT_DATA_ON_QPS=1 -x LD_LIBRARY_PATH=/usr/mpi/gcc/openmpi-4.1.9a1/lib:/root/gpu-test-venv/lib/python3.10/site-packages/nvidia/nccl/lib:/usr/local/cuda-12.4/targets/x86_64-linux/lib /opt/gpu-test-tools/nccl-tests/build/all_reduce_perf -b 1k -e 256M -g 1 -f 2 -w 2",
|
||||
"returncode": 0,
|
||||
"status": "FAIL",
|
||||
"peak_busbw_gbps": 39.32,
|
||||
"peak_algbw_gbps": 20.97,
|
||||
"peak_size": "4M",
|
||||
"avg_busbw_gbps": 9.1,
|
||||
"min_required_gbps": 100.0,
|
||||
"wrong_count": 0,
|
||||
"by_size": [
|
||||
{
|
||||
"size_bytes": 1024,
|
||||
"size": "1K",
|
||||
"time_us": 80.32,
|
||||
"algbw_gbps": 0.01,
|
||||
"busbw_gbps": 0.02,
|
||||
"wrong": 0
|
||||
},
|
||||
{
|
||||
"size_bytes": 2048,
|
||||
"size": "2K",
|
||||
"time_us": 35.79,
|
||||
"algbw_gbps": 0.06,
|
||||
"busbw_gbps": 0.11,
|
||||
"wrong": 0
|
||||
},
|
||||
{
|
||||
"size_bytes": 4096,
|
||||
"size": "4K",
|
||||
"time_us": 37.49,
|
||||
"algbw_gbps": 0.11,
|
||||
"busbw_gbps": 0.2,
|
||||
"wrong": 0
|
||||
},
|
||||
{
|
||||
"size_bytes": 8192,
|
||||
"size": "8K",
|
||||
"time_us": 40.32,
|
||||
"algbw_gbps": 0.2,
|
||||
"busbw_gbps": 0.38,
|
||||
"wrong": 0
|
||||
},
|
||||
{
|
||||
"size_bytes": 16384,
|
||||
"size": "16K",
|
||||
"time_us": 43.04,
|
||||
"algbw_gbps": 0.38,
|
||||
"busbw_gbps": 0.71,
|
||||
"wrong": 0
|
||||
},
|
||||
{
|
||||
"size_bytes": 32768,
|
||||
"size": "32K",
|
||||
"time_us": 43.32,
|
||||
"algbw_gbps": 0.76,
|
||||
"busbw_gbps": 1.42,
|
||||
"wrong": 0
|
||||
},
|
||||
{
|
||||
"size_bytes": 65536,
|
||||
"size": "64K",
|
||||
"time_us": 47.45,
|
||||
"algbw_gbps": 1.38,
|
||||
"busbw_gbps": 2.59,
|
||||
"wrong": 0
|
||||
},
|
||||
{
|
||||
"size_bytes": 131072,
|
||||
"size": "128K",
|
||||
"time_us": 89.3,
|
||||
"algbw_gbps": 1.47,
|
||||
"busbw_gbps": 2.75,
|
||||
"wrong": 0
|
||||
},
|
||||
{
|
||||
"size_bytes": 262144,
|
||||
"size": "256K",
|
||||
"time_us": 165.38,
|
||||
"algbw_gbps": 1.59,
|
||||
"busbw_gbps": 2.97,
|
||||
"wrong": 0
|
||||
},
|
||||
{
|
||||
"size_bytes": 524288,
|
||||
"size": "512K",
|
||||
"time_us": 4292.69,
|
||||
"algbw_gbps": 0.12,
|
||||
"busbw_gbps": 0.23,
|
||||
"wrong": 0
|
||||
},
|
||||
{
|
||||
"size_bytes": 1048576,
|
||||
"size": "1M",
|
||||
"time_us": 139.29,
|
||||
"algbw_gbps": 7.53,
|
||||
"busbw_gbps": 14.12,
|
||||
"wrong": 0
|
||||
},
|
||||
{
|
||||
"size_bytes": 2097152,
|
||||
"size": "2M",
|
||||
"time_us": 4195.12,
|
||||
"algbw_gbps": 0.5,
|
||||
"busbw_gbps": 0.94,
|
||||
"wrong": 0
|
||||
},
|
||||
{
|
||||
"size_bytes": 4194304,
|
||||
"size": "4M",
|
||||
"time_us": 199.99,
|
||||
"algbw_gbps": 20.97,
|
||||
"busbw_gbps": 39.32,
|
||||
"wrong": 0
|
||||
},
|
||||
{
|
||||
"size_bytes": 8388608,
|
||||
"size": "8M",
|
||||
"time_us": 6159.0,
|
||||
"algbw_gbps": 1.36,
|
||||
"busbw_gbps": 2.55,
|
||||
"wrong": 0
|
||||
},
|
||||
{
|
||||
"size_bytes": 16777216,
|
||||
"size": "16M",
|
||||
"time_us": 6336.73,
|
||||
"algbw_gbps": 2.65,
|
||||
"busbw_gbps": 4.96,
|
||||
"wrong": 0
|
||||
},
|
||||
{
|
||||
"size_bytes": 33554432,
|
||||
"size": "32M",
|
||||
"time_us": 12623.3,
|
||||
"algbw_gbps": 2.66,
|
||||
"busbw_gbps": 4.98,
|
||||
"wrong": 0
|
||||
},
|
||||
{
|
||||
"size_bytes": 67108864,
|
||||
"size": "64M",
|
||||
"time_us": 17005.6,
|
||||
"algbw_gbps": 3.95,
|
||||
"busbw_gbps": 7.4,
|
||||
"wrong": 0
|
||||
},
|
||||
{
|
||||
"size_bytes": 134217728,
|
||||
"size": "128M",
|
||||
"time_us": 23826.7,
|
||||
"algbw_gbps": 5.63,
|
||||
"busbw_gbps": 10.56,
|
||||
"wrong": 0
|
||||
},
|
||||
{
|
||||
"size_bytes": 268435456,
|
||||
"size": "256M",
|
||||
"time_us": 47356.5,
|
||||
"algbw_gbps": 5.67,
|
||||
"busbw_gbps": 10.63,
|
||||
"wrong": 0
|
||||
}
|
||||
],
|
||||
"stderr_tail": "",
|
||||
"stdout_tail": " 6.25 0\n 1048576 262144 float sum -1 139.29 7.53 14.12 0 3552.34 0.30 0.55 0\n 2097152 524288 float sum -1 4195.12 0.50 0.94 0 158.81 13.21 24.76 0\n 4194304 1048576 float sum -1 199.99 20.97 39.32 0 3623.39 1.16 2.17 0\n 8388608 2097152 float sum -1 6159.00 1.36 2.55 0 324.45 25.85 48.48 0\n 16777216 4194304 float sum -1 6336.73 2.65 4.96 0 600.96 27.92 52.35 0\n 33554432 8388608 float sum -1 12623.3 2.66 4.98 0 949.39 35.34 66.27 0\n 67108864 16777216 float sum -1 17005.6 3.95 7.40 0 17175.5 3.91 7.33 0\n 134217728 33554432 float sum -1 23826.7 5.63 10.56 0 25793.0 5.20 9.76 0\n 268435456 67108864 float sum -1 47356.5 5.67 10.63 0 43195.8 6.21 11.65 0\n# Out of bounds values : 0 OK\n# Avg bus bandwidth : 9.0956 \n#\n# Collective test concluded: all_reduce_perf\n#\n\n",
|
||||
"started_at": "2026-05-23T04:59:28.584786",
|
||||
"finished_at": "2026-05-23T04:59:54.886123"
|
||||
}
|
||||
]
|
||||
},
|
||||
"alltoall": {
|
||||
"binary": "/opt/gpu-test-tools/nccl-tests/build/alltoall_perf",
|
||||
"topologies": [
|
||||
{
|
||||
"label": "2 nodes x 8 GPUs",
|
||||
"nodes": 2,
|
||||
"gpus_per_node": 8,
|
||||
"ranks": 16,
|
||||
"hosts": [
|
||||
{
|
||||
"name": "nccl-gpu-1",
|
||||
"addr": "172.72.8.12",
|
||||
"slots": 8
|
||||
},
|
||||
{
|
||||
"name": "nccl-gpu-2",
|
||||
"addr": "172.72.8.16",
|
||||
"slots": 8
|
||||
}
|
||||
],
|
||||
"command": "/usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun --allow-run-as-root --mca btl_openib_warn_no_device_params_found 0 --mca btl_tcp_if_include bond0 -H 172.72.8.12:8,172.72.8.16:8 --map-by ppr:8:node -np 16 -x NCCL_DEBUG=WARN -x NCCL_SOCKET_IFNAME=bond0 -x NCCL_IB_GID_INDEX=3 -x NCCL_IB_SL=5 -x NCCL_IB_TC=136 -x NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7 -x NCCL_IB_TIMEOUT=22 -x NCCL_IB_QPS_PER_CONNECTION=4 -x NCCL_MIN_NCHANNELS=4 -x NCCL_NET_PLUGIN=none -x NCCL_NVLS_ENABLE=1 -x NCCL_IB_SPLIT_DATA_ON_QPS=1 -x LD_LIBRARY_PATH=/usr/mpi/gcc/openmpi-4.1.9a1/lib:/root/gpu-test-venv/lib/python3.10/site-packages/nvidia/nccl/lib:/usr/local/cuda-12.4/targets/x86_64-linux/lib /opt/gpu-test-tools/nccl-tests/build/alltoall_perf -b 1k -e 256M -g 1 -f 2 -w 2",
|
||||
"returncode": 0,
|
||||
"status": "FAIL",
|
||||
"peak_busbw_gbps": 8.64,
|
||||
"peak_algbw_gbps": 9.21,
|
||||
"peak_size": "2M",
|
||||
"avg_busbw_gbps": 2.19,
|
||||
"min_required_gbps": 20.0,
|
||||
"wrong_count": 0,
|
||||
"by_size": [
|
||||
{
|
||||
"size_bytes": 1024,
|
||||
"size": "1K",
|
||||
"time_us": 58.44,
|
||||
"algbw_gbps": 0.02,
|
||||
"busbw_gbps": 0.02,
|
||||
"wrong": 0
|
||||
},
|
||||
{
|
||||
"size_bytes": 2048,
|
||||
"size": "2K",
|
||||
"time_us": 47.2,
|
||||
"algbw_gbps": 0.04,
|
||||
"busbw_gbps": 0.04,
|
||||
"wrong": 0
|
||||
},
|
||||
{
|
||||
"size_bytes": 4096,
|
||||
"size": "4K",
|
||||
"time_us": 47.68,
|
||||
"algbw_gbps": 0.09,
|
||||
"busbw_gbps": 0.08,
|
||||
"wrong": 0
|
||||
},
|
||||
{
|
||||
"size_bytes": 8192,
|
||||
"size": "8K",
|
||||
"time_us": 48.78,
|
||||
"algbw_gbps": 0.17,
|
||||
"busbw_gbps": 0.16,
|
||||
"wrong": 0
|
||||
},
|
||||
{
|
||||
"size_bytes": 16384,
|
||||
"size": "16K",
|
||||
"time_us": 79.34,
|
||||
"algbw_gbps": 0.21,
|
||||
"busbw_gbps": 0.19,
|
||||
"wrong": 0
|
||||
},
|
||||
{
|
||||
"size_bytes": 32768,
|
||||
"size": "32K",
|
||||
"time_us": 68.8,
|
||||
"algbw_gbps": 0.48,
|
||||
"busbw_gbps": 0.45,
|
||||
"wrong": 0
|
||||
},
|
||||
{
|
||||
"size_bytes": 65536,
|
||||
"size": "64K",
|
||||
"time_us": 49.86,
|
||||
"algbw_gbps": 1.31,
|
||||
"busbw_gbps": 1.23,
|
||||
"wrong": 0
|
||||
},
|
||||
{
|
||||
"size_bytes": 131072,
|
||||
"size": "128K",
|
||||
"time_us": 52.89,
|
||||
"algbw_gbps": 2.48,
|
||||
"busbw_gbps": 2.32,
|
||||
"wrong": 0
|
||||
},
|
||||
{
|
||||
"size_bytes": 262144,
|
||||
"size": "256K",
|
||||
"time_us": 3861.98,
|
||||
"algbw_gbps": 0.07,
|
||||
"busbw_gbps": 0.06,
|
||||
"wrong": 0
|
||||
},
|
||||
{
|
||||
"size_bytes": 524288,
|
||||
"size": "512K",
|
||||
"time_us": 83.38,
|
||||
"algbw_gbps": 6.29,
|
||||
"busbw_gbps": 5.89,
|
||||
"wrong": 0
|
||||
},
|
||||
{
|
||||
"size_bytes": 1048576,
|
||||
"size": "1M",
|
||||
"time_us": 182.32,
|
||||
"algbw_gbps": 5.75,
|
||||
"busbw_gbps": 5.39,
|
||||
"wrong": 0
|
||||
},
|
||||
{
|
||||
"size_bytes": 2097152,
|
||||
"size": "2M",
|
||||
"time_us": 227.67,
|
||||
"algbw_gbps": 9.21,
|
||||
"busbw_gbps": 8.64,
|
||||
"wrong": 0
|
||||
},
|
||||
{
|
||||
"size_bytes": 4194304,
|
||||
"size": "4M",
|
||||
"time_us": 6482.39,
|
||||
"algbw_gbps": 0.65,
|
||||
"busbw_gbps": 0.61,
|
||||
"wrong": 0
|
||||
},
|
||||
{
|
||||
"size_bytes": 8388608,
|
||||
"size": "8M",
|
||||
"time_us": 10348.9,
|
||||
"algbw_gbps": 0.81,
|
||||
"busbw_gbps": 0.76,
|
||||
"wrong": 0
|
||||
},
|
||||
{
|
||||
"size_bytes": 16777216,
|
||||
"size": "16M",
|
||||
"time_us": 18616.5,
|
||||
"algbw_gbps": 0.9,
|
||||
"busbw_gbps": 0.84,
|
||||
"wrong": 0
|
||||
},
|
||||
{
|
||||
"size_bytes": 33554432,
|
||||
"size": "32M",
|
||||
"time_us": 17170.7,
|
||||
"algbw_gbps": 1.95,
|
||||
"busbw_gbps": 1.83,
|
||||
"wrong": 0
|
||||
},
|
||||
{
|
||||
"size_bytes": 67108864,
|
||||
"size": "64M",
|
||||
"time_us": 35735.6,
|
||||
"algbw_gbps": 1.88,
|
||||
"busbw_gbps": 1.76,
|
||||
"wrong": 0
|
||||
},
|
||||
{
|
||||
"size_bytes": 134217728,
|
||||
"size": "128M",
|
||||
"time_us": 69388.5,
|
||||
"algbw_gbps": 1.93,
|
||||
"busbw_gbps": 1.81,
|
||||
"wrong": 0
|
||||
},
|
||||
{
|
||||
"size_bytes": 268435456,
|
||||
"size": "256M",
|
||||
"time_us": 96873.9,
|
||||
"algbw_gbps": 2.77,
|
||||
"busbw_gbps": 2.6,
|
||||
"wrong": 0
|
||||
}
|
||||
],
|
||||
"stderr_tail": "",
|
||||
"stdout_tail": "56 6.85 6.42 N/A\n 1048576 16384 float none -1 182.32 5.75 5.39 0 169.19 6.20 5.81 N/A\n 2097152 32768 float none -1 227.67 9.21 8.64 0 3664.15 0.57 0.54 N/A\n 4194304 65536 float none -1 6482.39 0.65 0.61 0 553.24 7.58 7.11 N/A\n 8388608 131072 float none -1 10348.9 0.81 0.76 0 803.01 10.45 9.79 N/A\n 16777216 262144 float none -1 18616.5 0.90 0.84 0 4237.22 3.96 3.71 N/A\n 33554432 524288 float none -1 17170.7 1.95 1.83 0 20849.4 1.61 1.51 N/A\n 67108864 1048576 float none -1 35735.6 1.88 1.76 0 34524.7 1.94 1.82 N/A\n 134217728 2097152 float none -1 69388.5 1.93 1.81 0 63535.3 2.11 1.98 N/A\n 268435456 4194304 float none -1 96873.9 2.77 2.60 0 100742 2.66 2.50 N/A\n# Out of bounds values : 0 OK\n# Avg bus bandwidth : 2.19061 \n#\n# Collective test concluded: alltoall_perf\n#\n\n",
|
||||
"started_at": "2026-05-23T04:59:54.886310",
|
||||
"finished_at": "2026-05-23T05:00:28.796555"
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"timestamp": "2026-05-23T05:00:28.796580"
|
||||
},
|
||||
"timestamp": "2026-05-23T05:00:28.807561",
|
||||
"hostname": "aikubeworker0012"
|
||||
}
|
||||
50
reports_multinode_nccl_smoke_256m_aikubeworker0012.md
Normal file
50
reports_multinode_nccl_smoke_256m_aikubeworker0012.md
Normal file
@ -0,0 +1,50 @@
|
||||
# GPU Test Report
|
||||
|
||||
- **Date:** 2026-05-23T05:00:28.807561
|
||||
- **Host:** aikubeworker0012
|
||||
|
||||
## Overall Acceptance Verdict
|
||||
|
||||
**Result: FAIL**
|
||||
|
||||
Missing required evidence:
|
||||
- GPU Info
|
||||
- Health Check
|
||||
- Memory Bandwidth
|
||||
- Compute Throughput
|
||||
- NVLink/NVSwitch
|
||||
- NCCL
|
||||
- Stress Test
|
||||
- RDMA
|
||||
- DCGM
|
||||
- Training
|
||||
|
||||
## Summary
|
||||
|
||||
| Test | Result |
|
||||
|------|--------|
|
||||
| Multi-node NCCL | FAIL |
|
||||
|
||||
## Multi-node NCCL / Cross Leaf
|
||||
|
||||
Source: nccl-tests-mpirun | Mode: sweep
|
||||
|
||||
- **Hosts:** nccl-gpu-1(172.72.8.12), nccl-gpu-2(172.72.8.16)
|
||||
- **Preflight:** PASS (1 warnings)
|
||||
|
||||
### Multi-node NCCL allreduce
|
||||
|
||||
| Topology | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
|
||||
|----------|-------------|-----------|------------|-----------|--------|
|
||||
| 2 nodes x 8 GPUs | 39.32 GB/s | 4M | 9.10 GB/s | >= 100 GB/s | FAIL |
|
||||
|
||||
### Multi-node NCCL alltoall
|
||||
|
||||
| Topology | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
|
||||
|----------|-------------|-----------|------------|-----------|--------|
|
||||
| 2 nodes x 8 GPUs | 8.64 GB/s | 2M | 2.19 GB/s | >= 20 GB/s | FAIL |
|
||||
|
||||
**Overall: FAIL**
|
||||
|
||||
---
|
||||
*Generated by GPU Test Suite v0.2.0*
|
||||
Loading…
x
Reference in New Issue
Block a user