From 4b17bafd531a6013d93d49887e6e98447b4d26ca Mon Sep 17 00:00:00 2001 From: cs Date: Sat, 23 May 2026 13:03:26 +0800 Subject: [PATCH] Add multi-node NCCL sweep test --- README.md | 39 +- configs/default.yaml | 46 ++ gpu_tester.py | 55 ++- modules/report.py | 50 ++ ...node_nccl_smoke_256m_aikubeworker0012.json | 439 ++++++++++++++++++ ...tinode_nccl_smoke_256m_aikubeworker0012.md | 50 ++ 6 files changed, 667 insertions(+), 12 deletions(-) create mode 100644 reports_multinode_nccl_smoke_256m_aikubeworker0012.json create mode 100644 reports_multinode_nccl_smoke_256m_aikubeworker0012.md diff --git a/README.md b/README.md index 1af08c4..eed4791 100644 --- a/README.md +++ b/README.md @@ -375,6 +375,27 @@ nccl: repeats: 3 max_stddev_pct: 3 +multinode_nccl: + enabled: false # true 时纳入 --test all + hosts: + - {name: nccl-gpu-1, addr: 172.72.8.12, slots: 8} + - {name: nccl-gpu-2, addr: 172.72.8.16, slots: 8} + tests: [all_reduce_perf, alltoall_perf] + topologies: + - {nodes: 2, gpus_per_node: 8} + mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun + extra_ld_library_path: # 传给远端 rank 的 MPI/NCCL/CUDA 库路径 + - /usr/mpi/gcc/openmpi-4.1.9a1/lib + - /root/gpu-test-venv/lib/python3.10/site-packages/nvidia/nccl/lib + - /usr/local/cuda-12.4/targets/x86_64-linux/lib + begin_size: 1k + end_size: 16g + step_factor: 2 + warmup_iters: 10 + socket_ifname: bond0 + ib_gid_index: 3 + ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7 + stress: duration_sec: 1800 # 压力测试时长 use_gpu_burn: false # 默认走 PyTorch GEMM stress @@ -539,16 +560,14 @@ report: └── 异常: 检查 IB 线缆、交换机配置、子网管理器 步骤 3: 多节点 NCCL 测试 -├── 在每个节点上配置: -│ export MASTER_ADDR=<主节点IP> -│ export MASTER_PORT=29500 -│ export NCCL_SOCKET_IFNAME=ib0 # IB 网卡名 -│ export NCCL_DEBUG=INFO -├── 运行 nccl-tests 手动测试: -│ mpirun -np <总GPU数> -hostfile hosts \ -│ /opt/gpu-test-tools/nccl-tests/build/all_reduce_perf \ -│ -b 8 -e 256M -f 2 -g 1 -w 5 -n 20 -└── 确认: 多节点 AllReduce 带宽正常 +├── 在发起节点确认 mpirun、nccl-tests、跨节点 root SSH 可用 +├── 配置 configs/default.yaml 的 
multinode_nccl.hosts / IB 参数 +├── 执行 PDF 风格 sweep: +│ python3 gpu_tester.py --test multinode-nccl --report --format md +├── 默认命令口径: +│ mpirun -H <节点1IP>:8,<节点2IP>:8 --map-by ppr:8:node -np 16 \ +│ all_reduce_perf/alltoall_perf -b 1k -e 16g -f 2 -g 1 -w 10 +└── 确认: Peak Bus BW、Peak Size、wrong_count 正常 步骤 4: 训练验证 ├── python3 gpu_tester.py --test training diff --git a/configs/default.yaml b/configs/default.yaml index a432c11..09a3921 100644 --- a/configs/default.yaml +++ b/configs/default.yaml @@ -48,6 +48,52 @@ nccl: test_allgather: false test_sendrecv: false +multinode_nccl: + enabled: false + mode: sweep + hosts: + - name: nccl-gpu-1 + addr: 172.72.8.12 + slots: 8 + - name: nccl-gpu-2 + addr: 172.72.8.16 + slots: 8 + ssh_user: root + ssh_preflight: true + mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun + mpi_ld_preload: null + extra_ld_library_path: + - /usr/mpi/gcc/openmpi-4.1.9a1/lib + - /root/gpu-test-venv/lib/python3.10/site-packages/nvidia/nccl/lib + - /usr/local/cuda-12.4/targets/x86_64-linux/lib + nccl_tests_dir: null # null = tools.install_dir/nccl-tests/build + tests: + - all_reduce_perf + - alltoall_perf + topologies: + - nodes: 2 + gpus_per_node: 8 + begin_size: 1k + end_size: 16g + step_factor: 2 + warmup_iters: 10 + gpus_per_rank: 1 + timeout_sec: 1800 + socket_ifname: bond0 + ib_gid_index: 3 + ib_sl: 5 + ib_tc: 136 + ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7 + ib_timeout: 22 + qps_per_connection: 4 + min_nchannels: 4 + net_plugin: none + nvls_enable: 1 + split_data_on_qps: 1 + min_peak_busbw_gbps: + allreduce: 480 + alltoall: 75 + stress: duration_sec: 600 # 10 min — reaches thermal steady state, validates throttle/jitter beyond warmup use_doubles: false diff --git a/gpu_tester.py b/gpu_tester.py index 15bc694..35d89de 100644 --- a/gpu_tester.py +++ b/gpu_tester.py @@ -28,6 +28,7 @@ from modules.stress_test import StressTest from modules.rdma_test import RDMATest from modules.nvlink_test import NVLinkTest from modules.dcgm_test import DCGMTest +from
modules.multinode_nccl_test import MultiNodeNCCLTest from modules.report import ReportGenerator from modules.gpu_specs import detect_gpu_type, get_gpu_specs, get_gpu_label, get_supported_gpus, validate_driver_compatibility @@ -55,6 +56,44 @@ DEFAULT_CONFIG = { "repeats": 3, "max_stddev_pct": 3, }, + "multinode_nccl": { + "enabled": False, + "mode": "sweep", + "hosts": [ + {"name": "nccl-gpu-1", "addr": "172.72.8.12", "slots": 8}, + {"name": "nccl-gpu-2", "addr": "172.72.8.16", "slots": 8}, + ], + "ssh_user": "root", + "ssh_preflight": True, + "mpirun_path": "/usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun", + "mpi_ld_preload": None, + "extra_ld_library_path": [ + "/usr/mpi/gcc/openmpi-4.1.9a1/lib", + "/root/gpu-test-venv/lib/python3.10/site-packages/nvidia/nccl/lib", + "/usr/local/cuda-12.4/targets/x86_64-linux/lib", + ], + "nccl_tests_dir": None, + "tests": ["all_reduce_perf", "alltoall_perf"], + "topologies": [{"nodes": 2, "gpus_per_node": 8}], + "begin_size": "1k", + "end_size": "16g", + "step_factor": 2, + "warmup_iters": 10, + "gpus_per_rank": 1, + "timeout_sec": 1800, + "socket_ifname": "bond0", + "ib_gid_index": 3, + "ib_sl": 5, + "ib_tc": 136, + "ib_hca": "mlx5_0,mlx5_1,mlx5_6,mlx5_7", + "ib_timeout": 22, + "qps_per_connection": 4, + "min_nchannels": 4, + "net_plugin": "none", + "nvls_enable": 1, + "split_data_on_qps": 1, + "min_peak_busbw_gbps": {"allreduce": 480, "alltoall": 75}, + }, "stress": { "duration_sec": 1800, "production_duration_sec": 1800, @@ -191,7 +230,8 @@ def interactive_menu(config: dict): ("8", "NVLink/NVSwitch Test", "nvlink"), ("9", "DCGM Diagnostic", "dcgm"), ("10", "Training Simulation", "training"), - ("11", "Full Test Suite (All Tests)", "all"), + ("11", "Multi-node NCCL Test", "multinode_nccl"), + ("12", "Full Test Suite (All Tests)", "all"), ("0", "Generate Report", "report"), ] @@ -218,6 +258,7 @@ def interactive_menu(config: dict): "nvlink": "NVLink links, speed, and error counters", "dcgm": "DCGM diag -r 3 production diagnostic", 
"training": "Simulate LLM training with PyTorch", + "multinode_nccl": "Cross-node NCCL via mpirun/nccl-tests", "all": "Run all tests sequentially", "report": "Export results to JSON/HTML", } @@ -326,6 +367,12 @@ def _run_test(test_name: str, config: dict, console: Console) -> dict: m.print_results(result) return result + elif test_name == "multinode_nccl": + m = MultiNodeNCCLTest(config) + result = m.run() + m.print_results(result) + return result + elif test_name == "all": return _run_full_suite(config, console) @@ -356,6 +403,8 @@ def _run_full_suite(config: dict, console: Console) -> dict: ("dcgm", "DCGM Diagnostic", DCGMTest), ("training", "Training Simulation", TrainingSim), ] + if (config.get("multinode_nccl", {}) or {}).get("enabled"): + tests.append(("multinode_nccl", "Multi-node NCCL Test", MultiNodeNCCLTest)) for i, (key, name, mod_cls) in enumerate(tests, 1): console.print(f"\n[bold cyan][{i}/{len(tests)}] {name}[/bold cyan]") @@ -435,6 +484,7 @@ Examples: python gpu_tester.py --test benchmark --type memory python gpu_tester.py --test benchmark --type compute --dtype fp16 python gpu_tester.py --test nccl # NCCL test + python gpu_tester.py --test multinode-nccl # Cross-node NCCL test python gpu_tester.py --test nvlink # NVLink/NVSwitch test python gpu_tester.py --test dcgm # DCGM diagnostic python gpu_tester.py --test training # Training sim @@ -442,7 +492,7 @@ Examples: python gpu_tester.py --report --format json --output report.json """, ) - parser.add_argument("--test", choices=["gpu-info", "health", "benchmark", "nccl", "stress", "rdma", "nvlink", "dcgm", "training", "all"], + parser.add_argument("--test", choices=["gpu-info", "health", "benchmark", "nccl", "multinode-nccl", "stress", "rdma", "nvlink", "dcgm", "training", "all"], help="Run a specific test") parser.add_argument("--type", choices=["memory", "compute"], help="Benchmark type (with --test benchmark)") parser.add_argument("--dtype", choices=["fp32", "tf32", "fp16", "bf16", "fp8", "fp64", 
"int8"], @@ -499,6 +549,7 @@ Examples: "health": "health", "benchmark": None, "nccl": "nccl", + "multinode-nccl": "multinode_nccl", "stress": "stress", "rdma": "rdma", "nvlink": "nvlink", diff --git a/modules/report.py b/modules/report.py index 2f6f1ec..b82170b 100644 --- a/modules/report.py +++ b/modules/report.py @@ -464,6 +464,47 @@ class ReportGenerator: passed = nccl.get("passed", False) lines.append(f"**Overall: {'PASS' if passed else 'FAIL'}**\n") + multinode = results.get("multinode_nccl") + if multinode and not multinode.get("error"): + lines.append("## Multi-node NCCL / Cross Leaf\n") + lines.append(f"Source: {multinode.get('source', 'unknown')} | Mode: {multinode.get('mode', 'unknown')}\n") + hosts = multinode.get("hosts", []) + if hosts: + host_text = ", ".join(f"{h.get('name') or h.get('addr')}({h.get('addr')})" for h in hosts) + lines.append(f"- **Hosts:** {host_text}") + preflight = multinode.get("preflight", {}) + if preflight.get("checks"): + failed_checks = [c for c in preflight["checks"] if c.get("status") == "FAIL"] + warn_checks = [c for c in preflight["checks"] if c.get("status") == "WARN"] + lines.append(f"- **Preflight:** {'PASS' if not failed_checks else 'FAIL'}" + f"{f' ({len(warn_checks)} warnings)' if warn_checks else ''}") + lines.append("") + for op, data in (multinode.get("tests") or {}).items(): + lines.append(f"### Multi-node NCCL {op}\n") + lines.append("| Topology | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |") + lines.append("|----------|-------------|-----------|------------|-----------|--------|") + for topo in data.get("topologies", []): + threshold = topo.get("min_required_gbps", 0) or 0 + threshold_text = f">= {threshold:.0f} GB/s" if threshold else "-" + lines.append( + f"| {topo.get('label', '')} | {topo.get('peak_busbw_gbps', 0):.2f} GB/s | " + f"{topo.get('peak_size', '')} | {topo.get('avg_busbw_gbps', 0):.2f} GB/s | " + f"{threshold_text} | {topo.get('status', '?')} |" + ) + lines.append("") + 
lines.append(f"**Overall: {'PASS' if multinode.get('passed') else 'FAIL'}**\n") + elif multinode and multinode.get("error"): + lines.append("## Multi-node NCCL / Cross Leaf\n") + lines.append(f"**Overall: FAIL** ({multinode.get('error')})\n") + preflight = multinode.get("preflight", {}) + if preflight.get("checks"): + lines.append("| Check | Status | Detail |") + lines.append("|-------|--------|--------|") + for check in preflight["checks"]: + detail = str(check.get("detail", "")).replace("\n", " ") + lines.append(f"| {check.get('name', '')} | {check.get('status', '')} | {detail} |") + lines.append("") + # --- Stress Test --- stress = results.get("stress") if stress and not stress.get("error"): @@ -836,6 +877,15 @@ class ReportGenerator: else: items.append(("NCCL", "FAIL")) + if "multinode_nccl" in results: + mn = results["multinode_nccl"] + if mn.get("error"): + items.append(("Multi-node NCCL", f"ERROR: {mn['error']}")) + elif mn.get("passed"): + items.append(("Multi-node NCCL", "PASS")) + else: + items.append(("Multi-node NCCL", "FAIL")) + # Stress if "stress" in results: s = results["stress"] diff --git a/reports_multinode_nccl_smoke_256m_aikubeworker0012.json b/reports_multinode_nccl_smoke_256m_aikubeworker0012.json new file mode 100644 index 0000000..72c30ce --- /dev/null +++ b/reports_multinode_nccl_smoke_256m_aikubeworker0012.json @@ -0,0 +1,439 @@ +{ + "multinode_nccl": { + "passed": false, + "source": "nccl-tests-mpirun", + "mode": "sweep", + "hosts": [ + { + "name": "nccl-gpu-1", + "addr": "172.72.8.12", + "slots": 8 + }, + { + "name": "nccl-gpu-2", + "addr": "172.72.8.16", + "slots": 8 + } + ], + "preflight": { + "checks": [ + { + "name": "mpirun", + "status": "PASS", + "detail": "/usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun" + }, + { + "name": "hosts", + "status": "PASS", + "detail": "2 configured" + }, + { + "name": "all_reduce_perf", + "status": "PASS", + "detail": "/opt/gpu-test-tools/nccl-tests/build/all_reduce_perf" + }, + { + "name": "alltoall_perf", + 
"status": "PASS", + "detail": "/opt/gpu-test-tools/nccl-tests/build/alltoall_perf" + }, + { + "name": "ssh 172.72.8.12", + "status": "WARN", + "detail": "Host key verification failed." + }, + { + "name": "ssh 172.72.8.16", + "status": "PASS", + "detail": "aikubeworker0016" + } + ], + "passed": true + }, + "tests": { + "allreduce": { + "binary": "/opt/gpu-test-tools/nccl-tests/build/all_reduce_perf", + "topologies": [ + { + "label": "2 nodes x 8 GPUs", + "nodes": 2, + "gpus_per_node": 8, + "ranks": 16, + "hosts": [ + { + "name": "nccl-gpu-1", + "addr": "172.72.8.12", + "slots": 8 + }, + { + "name": "nccl-gpu-2", + "addr": "172.72.8.16", + "slots": 8 + } + ], + "command": "/usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun --allow-run-as-root --mca btl_openib_warn_no_device_params_found 0 --mca btl_tcp_if_include bond0 -H 172.72.8.12:8,172.72.8.16:8 --map-by ppr:8:node -np 16 -x NCCL_DEBUG=WARN -x NCCL_SOCKET_IFNAME=bond0 -x NCCL_IB_GID_INDEX=3 -x NCCL_IB_SL=5 -x NCCL_IB_TC=136 -x NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7 -x NCCL_IB_TIMEOUT=22 -x NCCL_IB_QPS_PER_CONNECTION=4 -x NCCL_MIN_NCHANNELS=4 -x NCCL_NET_PLUGIN=none -x NCCL_NVLS_ENABLE=1 -x NCCL_IB_SPLIT_DATA_ON_QPS=1 -x LD_LIBRARY_PATH=/usr/mpi/gcc/openmpi-4.1.9a1/lib:/root/gpu-test-venv/lib/python3.10/site-packages/nvidia/nccl/lib:/usr/local/cuda-12.4/targets/x86_64-linux/lib /opt/gpu-test-tools/nccl-tests/build/all_reduce_perf -b 1k -e 256M -g 1 -f 2 -w 2", + "returncode": 0, + "status": "FAIL", + "peak_busbw_gbps": 39.32, + "peak_algbw_gbps": 20.97, + "peak_size": "4M", + "avg_busbw_gbps": 9.1, + "min_required_gbps": 100.0, + "wrong_count": 0, + "by_size": [ + { + "size_bytes": 1024, + "size": "1K", + "time_us": 80.32, + "algbw_gbps": 0.01, + "busbw_gbps": 0.02, + "wrong": 0 + }, + { + "size_bytes": 2048, + "size": "2K", + "time_us": 35.79, + "algbw_gbps": 0.06, + "busbw_gbps": 0.11, + "wrong": 0 + }, + { + "size_bytes": 4096, + "size": "4K", + "time_us": 37.49, + "algbw_gbps": 0.11, + 
"busbw_gbps": 0.2, + "wrong": 0 + }, + { + "size_bytes": 8192, + "size": "8K", + "time_us": 40.32, + "algbw_gbps": 0.2, + "busbw_gbps": 0.38, + "wrong": 0 + }, + { + "size_bytes": 16384, + "size": "16K", + "time_us": 43.04, + "algbw_gbps": 0.38, + "busbw_gbps": 0.71, + "wrong": 0 + }, + { + "size_bytes": 32768, + "size": "32K", + "time_us": 43.32, + "algbw_gbps": 0.76, + "busbw_gbps": 1.42, + "wrong": 0 + }, + { + "size_bytes": 65536, + "size": "64K", + "time_us": 47.45, + "algbw_gbps": 1.38, + "busbw_gbps": 2.59, + "wrong": 0 + }, + { + "size_bytes": 131072, + "size": "128K", + "time_us": 89.3, + "algbw_gbps": 1.47, + "busbw_gbps": 2.75, + "wrong": 0 + }, + { + "size_bytes": 262144, + "size": "256K", + "time_us": 165.38, + "algbw_gbps": 1.59, + "busbw_gbps": 2.97, + "wrong": 0 + }, + { + "size_bytes": 524288, + "size": "512K", + "time_us": 4292.69, + "algbw_gbps": 0.12, + "busbw_gbps": 0.23, + "wrong": 0 + }, + { + "size_bytes": 1048576, + "size": "1M", + "time_us": 139.29, + "algbw_gbps": 7.53, + "busbw_gbps": 14.12, + "wrong": 0 + }, + { + "size_bytes": 2097152, + "size": "2M", + "time_us": 4195.12, + "algbw_gbps": 0.5, + "busbw_gbps": 0.94, + "wrong": 0 + }, + { + "size_bytes": 4194304, + "size": "4M", + "time_us": 199.99, + "algbw_gbps": 20.97, + "busbw_gbps": 39.32, + "wrong": 0 + }, + { + "size_bytes": 8388608, + "size": "8M", + "time_us": 6159.0, + "algbw_gbps": 1.36, + "busbw_gbps": 2.55, + "wrong": 0 + }, + { + "size_bytes": 16777216, + "size": "16M", + "time_us": 6336.73, + "algbw_gbps": 2.65, + "busbw_gbps": 4.96, + "wrong": 0 + }, + { + "size_bytes": 33554432, + "size": "32M", + "time_us": 12623.3, + "algbw_gbps": 2.66, + "busbw_gbps": 4.98, + "wrong": 0 + }, + { + "size_bytes": 67108864, + "size": "64M", + "time_us": 17005.6, + "algbw_gbps": 3.95, + "busbw_gbps": 7.4, + "wrong": 0 + }, + { + "size_bytes": 134217728, + "size": "128M", + "time_us": 23826.7, + "algbw_gbps": 5.63, + "busbw_gbps": 10.56, + "wrong": 0 + }, + { + "size_bytes": 268435456, + 
"size": "256M", + "time_us": 47356.5, + "algbw_gbps": 5.67, + "busbw_gbps": 10.63, + "wrong": 0 + } + ], + "stderr_tail": "", + "stdout_tail": " 6.25 0\n 1048576 262144 float sum -1 139.29 7.53 14.12 0 3552.34 0.30 0.55 0\n 2097152 524288 float sum -1 4195.12 0.50 0.94 0 158.81 13.21 24.76 0\n 4194304 1048576 float sum -1 199.99 20.97 39.32 0 3623.39 1.16 2.17 0\n 8388608 2097152 float sum -1 6159.00 1.36 2.55 0 324.45 25.85 48.48 0\n 16777216 4194304 float sum -1 6336.73 2.65 4.96 0 600.96 27.92 52.35 0\n 33554432 8388608 float sum -1 12623.3 2.66 4.98 0 949.39 35.34 66.27 0\n 67108864 16777216 float sum -1 17005.6 3.95 7.40 0 17175.5 3.91 7.33 0\n 134217728 33554432 float sum -1 23826.7 5.63 10.56 0 25793.0 5.20 9.76 0\n 268435456 67108864 float sum -1 47356.5 5.67 10.63 0 43195.8 6.21 11.65 0\n# Out of bounds values : 0 OK\n# Avg bus bandwidth : 9.0956 \n#\n# Collective test concluded: all_reduce_perf\n#\n\n", + "started_at": "2026-05-23T04:59:28.584786", + "finished_at": "2026-05-23T04:59:54.886123" + } + ] + }, + "alltoall": { + "binary": "/opt/gpu-test-tools/nccl-tests/build/alltoall_perf", + "topologies": [ + { + "label": "2 nodes x 8 GPUs", + "nodes": 2, + "gpus_per_node": 8, + "ranks": 16, + "hosts": [ + { + "name": "nccl-gpu-1", + "addr": "172.72.8.12", + "slots": 8 + }, + { + "name": "nccl-gpu-2", + "addr": "172.72.8.16", + "slots": 8 + } + ], + "command": "/usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun --allow-run-as-root --mca btl_openib_warn_no_device_params_found 0 --mca btl_tcp_if_include bond0 -H 172.72.8.12:8,172.72.8.16:8 --map-by ppr:8:node -np 16 -x NCCL_DEBUG=WARN -x NCCL_SOCKET_IFNAME=bond0 -x NCCL_IB_GID_INDEX=3 -x NCCL_IB_SL=5 -x NCCL_IB_TC=136 -x NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7 -x NCCL_IB_TIMEOUT=22 -x NCCL_IB_QPS_PER_CONNECTION=4 -x NCCL_MIN_NCHANNELS=4 -x NCCL_NET_PLUGIN=none -x NCCL_NVLS_ENABLE=1 -x NCCL_IB_SPLIT_DATA_ON_QPS=1 -x 
LD_LIBRARY_PATH=/usr/mpi/gcc/openmpi-4.1.9a1/lib:/root/gpu-test-venv/lib/python3.10/site-packages/nvidia/nccl/lib:/usr/local/cuda-12.4/targets/x86_64-linux/lib /opt/gpu-test-tools/nccl-tests/build/alltoall_perf -b 1k -e 256M -g 1 -f 2 -w 2", + "returncode": 0, + "status": "FAIL", + "peak_busbw_gbps": 8.64, + "peak_algbw_gbps": 9.21, + "peak_size": "2M", + "avg_busbw_gbps": 2.19, + "min_required_gbps": 20.0, + "wrong_count": 0, + "by_size": [ + { + "size_bytes": 1024, + "size": "1K", + "time_us": 58.44, + "algbw_gbps": 0.02, + "busbw_gbps": 0.02, + "wrong": 0 + }, + { + "size_bytes": 2048, + "size": "2K", + "time_us": 47.2, + "algbw_gbps": 0.04, + "busbw_gbps": 0.04, + "wrong": 0 + }, + { + "size_bytes": 4096, + "size": "4K", + "time_us": 47.68, + "algbw_gbps": 0.09, + "busbw_gbps": 0.08, + "wrong": 0 + }, + { + "size_bytes": 8192, + "size": "8K", + "time_us": 48.78, + "algbw_gbps": 0.17, + "busbw_gbps": 0.16, + "wrong": 0 + }, + { + "size_bytes": 16384, + "size": "16K", + "time_us": 79.34, + "algbw_gbps": 0.21, + "busbw_gbps": 0.19, + "wrong": 0 + }, + { + "size_bytes": 32768, + "size": "32K", + "time_us": 68.8, + "algbw_gbps": 0.48, + "busbw_gbps": 0.45, + "wrong": 0 + }, + { + "size_bytes": 65536, + "size": "64K", + "time_us": 49.86, + "algbw_gbps": 1.31, + "busbw_gbps": 1.23, + "wrong": 0 + }, + { + "size_bytes": 131072, + "size": "128K", + "time_us": 52.89, + "algbw_gbps": 2.48, + "busbw_gbps": 2.32, + "wrong": 0 + }, + { + "size_bytes": 262144, + "size": "256K", + "time_us": 3861.98, + "algbw_gbps": 0.07, + "busbw_gbps": 0.06, + "wrong": 0 + }, + { + "size_bytes": 524288, + "size": "512K", + "time_us": 83.38, + "algbw_gbps": 6.29, + "busbw_gbps": 5.89, + "wrong": 0 + }, + { + "size_bytes": 1048576, + "size": "1M", + "time_us": 182.32, + "algbw_gbps": 5.75, + "busbw_gbps": 5.39, + "wrong": 0 + }, + { + "size_bytes": 2097152, + "size": "2M", + "time_us": 227.67, + "algbw_gbps": 9.21, + "busbw_gbps": 8.64, + "wrong": 0 + }, + { + "size_bytes": 4194304, + "size": 
"4M", + "time_us": 6482.39, + "algbw_gbps": 0.65, + "busbw_gbps": 0.61, + "wrong": 0 + }, + { + "size_bytes": 8388608, + "size": "8M", + "time_us": 10348.9, + "algbw_gbps": 0.81, + "busbw_gbps": 0.76, + "wrong": 0 + }, + { + "size_bytes": 16777216, + "size": "16M", + "time_us": 18616.5, + "algbw_gbps": 0.9, + "busbw_gbps": 0.84, + "wrong": 0 + }, + { + "size_bytes": 33554432, + "size": "32M", + "time_us": 17170.7, + "algbw_gbps": 1.95, + "busbw_gbps": 1.83, + "wrong": 0 + }, + { + "size_bytes": 67108864, + "size": "64M", + "time_us": 35735.6, + "algbw_gbps": 1.88, + "busbw_gbps": 1.76, + "wrong": 0 + }, + { + "size_bytes": 134217728, + "size": "128M", + "time_us": 69388.5, + "algbw_gbps": 1.93, + "busbw_gbps": 1.81, + "wrong": 0 + }, + { + "size_bytes": 268435456, + "size": "256M", + "time_us": 96873.9, + "algbw_gbps": 2.77, + "busbw_gbps": 2.6, + "wrong": 0 + } + ], + "stderr_tail": "", + "stdout_tail": "56 6.85 6.42 N/A\n 1048576 16384 float none -1 182.32 5.75 5.39 0 169.19 6.20 5.81 N/A\n 2097152 32768 float none -1 227.67 9.21 8.64 0 3664.15 0.57 0.54 N/A\n 4194304 65536 float none -1 6482.39 0.65 0.61 0 553.24 7.58 7.11 N/A\n 8388608 131072 float none -1 10348.9 0.81 0.76 0 803.01 10.45 9.79 N/A\n 16777216 262144 float none -1 18616.5 0.90 0.84 0 4237.22 3.96 3.71 N/A\n 33554432 524288 float none -1 17170.7 1.95 1.83 0 20849.4 1.61 1.51 N/A\n 67108864 1048576 float none -1 35735.6 1.88 1.76 0 34524.7 1.94 1.82 N/A\n 134217728 2097152 float none -1 69388.5 1.93 1.81 0 63535.3 2.11 1.98 N/A\n 268435456 4194304 float none -1 96873.9 2.77 2.60 0 100742 2.66 2.50 N/A\n# Out of bounds values : 0 OK\n# Avg bus bandwidth : 2.19061 \n#\n# Collective test concluded: alltoall_perf\n#\n\n", + "started_at": "2026-05-23T04:59:54.886310", + "finished_at": "2026-05-23T05:00:28.796555" + } + ] + } + }, + "timestamp": "2026-05-23T05:00:28.796580" + }, + "timestamp": "2026-05-23T05:00:28.807561", + "hostname": "aikubeworker0012" +} \ No newline at end of file diff --git 
a/reports_multinode_nccl_smoke_256m_aikubeworker0012.md b/reports_multinode_nccl_smoke_256m_aikubeworker0012.md new file mode 100644 index 0000000..57fea2a --- /dev/null +++ b/reports_multinode_nccl_smoke_256m_aikubeworker0012.md @@ -0,0 +1,50 @@ +# GPU Test Report + +- **Date:** 2026-05-23T05:00:28.807561 +- **Host:** aikubeworker0012 + +## Overall Acceptance Verdict + +**Result: FAIL** + +Missing required evidence: +- GPU Info +- Health Check +- Memory Bandwidth +- Compute Throughput +- NVLink/NVSwitch +- NCCL +- Stress Test +- RDMA +- DCGM +- Training + +## Summary + +| Test | Result | +|------|--------| +| Multi-node NCCL | FAIL | + +## Multi-node NCCL / Cross Leaf + +Source: nccl-tests-mpirun | Mode: sweep + +- **Hosts:** nccl-gpu-1(172.72.8.12), nccl-gpu-2(172.72.8.16) +- **Preflight:** PASS (1 warnings) + +### Multi-node NCCL allreduce + +| Topology | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status | +|----------|-------------|-----------|------------|-----------|--------| +| 2 nodes x 8 GPUs | 39.32 GB/s | 4M | 9.10 GB/s | >= 100 GB/s | FAIL | + +### Multi-node NCCL alltoall + +| Topology | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status | +|----------|-------------|-----------|------------|-----------|--------| +| 2 nodes x 8 GPUs | 8.64 GB/s | 2M | 2.19 GB/s | >= 20 GB/s | FAIL | + +**Overall: FAIL** + +--- +*Generated by GPU Test Suite v0.2.0* \ No newline at end of file