From 86f15544d7092d57b069a013b5ed9a2475272595 Mon Sep 17 00:00:00 2001 From: cs Date: Sat, 23 May 2026 10:41:09 +0800 Subject: [PATCH] Add H100 acceptance test coverage and reports --- .gitignore | 1 + H100_test_all_vs_PDF_覆盖对比.md | 85 ++ H100验收_vs_test_all_差距分析.md | 100 ++ README.md | 98 +- docs/h100_test_all_metrics_guide_cn.md | 255 +++++ docs/multinode_nccl_concepts.md | 362 +++++++ gpu_tester.py | 169 +++- modules/dcgm_test.py | 231 +++++ modules/health_check.py | 42 + modules/nccl_test.py | 171 ++-- modules/nvlink_test.py | 188 ++++ modules/report.py | 357 ++++++- modules/stress_test.py | 294 +++++- modules/training_sim.py | 288 +++++- reports_all_aikubeworker0016.json | 921 ++++++++++++++++++ reports_all_aikubeworker0016.md | 157 +++ ...cgm_r3_aikubeworker0012_20260522_200338.md | 65 ++ ...cgm_r3_aikubeworker0016_20260522_200538.md | 65 ++ reports_nvbandwidth_aikubeworker0012.json | 70 ++ reports_nvbandwidth_aikubeworker0012.md | 38 + reports_nvbandwidth_aikubeworker0016.json | 70 ++ reports_nvbandwidth_aikubeworker0016.md | 38 + reports_rdma_aikubeworker0012.json | 157 +++ reports_rdma_aikubeworker0016.json | 157 +++ ...ounter_aikubeworker0012_20260522_194808.md | 62 ++ ...ounter_aikubeworker0016_20260522_194828.md | 62 ++ reports_rdma_cross_node_mlx5_0_20260523.md | 50 + reports_rdma_single_node_summary.md | 73 ++ reports_single_gpu_aikubeworker0012.json | 292 ++++++ reports_single_gpu_aikubeworker0012.md | 54 + reports_single_gpu_aikubeworker0016.json | 292 ++++++ reports_single_gpu_aikubeworker0016.md | 54 + ...stress_smoke_reasons_aikubeworker0012.json | 165 ++++ ...s_stress_smoke_reasons_aikubeworker0012.md | 29 + ...stress_smoke_reasons_aikubeworker0016.json | 165 ++++ ...s_stress_smoke_reasons_aikubeworker0016.md | 29 + ...latest_aikubeworker0012_20260522_203246.md | 322 ++++++ ...latest_aikubeworker0016_20260522_203447.md | 322 ++++++ ...rts_test_all_latest_summary_cn_20260523.md | 101 ++ ...ll_pdf_aikubeworker0012_20260522_182656.md | 259 +++++ 
...ll_pdf_aikubeworker0016_20260522_182856.md | 259 +++++ ...warmup_aikubeworker0012_20260522_194528.md | 43 + ...warmup_aikubeworker0016_20260522_194609.md | 43 + ...all_aikubeworker0016_中文结果与验收差距.md | 73 ++ 44 files changed, 6938 insertions(+), 190 deletions(-) create mode 100644 H100_test_all_vs_PDF_覆盖对比.md create mode 100644 H100验收_vs_test_all_差距分析.md create mode 100644 docs/h100_test_all_metrics_guide_cn.md create mode 100644 docs/multinode_nccl_concepts.md create mode 100644 modules/dcgm_test.py create mode 100644 modules/nvlink_test.py create mode 100644 reports_all_aikubeworker0016.json create mode 100644 reports_all_aikubeworker0016.md create mode 100644 reports_dcgm_r3_aikubeworker0012_20260522_200338.md create mode 100644 reports_dcgm_r3_aikubeworker0016_20260522_200538.md create mode 100644 reports_nvbandwidth_aikubeworker0012.json create mode 100644 reports_nvbandwidth_aikubeworker0012.md create mode 100644 reports_nvbandwidth_aikubeworker0016.json create mode 100644 reports_nvbandwidth_aikubeworker0016.md create mode 100644 reports_rdma_aikubeworker0012.json create mode 100644 reports_rdma_aikubeworker0016.json create mode 100644 reports_rdma_counter_aikubeworker0012_20260522_194808.md create mode 100644 reports_rdma_counter_aikubeworker0016_20260522_194828.md create mode 100644 reports_rdma_cross_node_mlx5_0_20260523.md create mode 100644 reports_rdma_single_node_summary.md create mode 100644 reports_single_gpu_aikubeworker0012.json create mode 100644 reports_single_gpu_aikubeworker0012.md create mode 100644 reports_single_gpu_aikubeworker0016.json create mode 100644 reports_single_gpu_aikubeworker0016.md create mode 100644 reports_stress_smoke_reasons_aikubeworker0012.json create mode 100644 reports_stress_smoke_reasons_aikubeworker0012.md create mode 100644 reports_stress_smoke_reasons_aikubeworker0016.json create mode 100644 reports_stress_smoke_reasons_aikubeworker0016.md create mode 100644 
reports_test_all_latest_aikubeworker0012_20260522_203246.md create mode 100644 reports_test_all_latest_aikubeworker0016_20260522_203447.md create mode 100644 reports_test_all_latest_summary_cn_20260523.md create mode 100644 reports_test_all_pdf_aikubeworker0012_20260522_182656.md create mode 100644 reports_test_all_pdf_aikubeworker0016_20260522_182856.md create mode 100644 reports_training_warmup_aikubeworker0012_20260522_194528.md create mode 100644 reports_training_warmup_aikubeworker0016_20260522_194609.md create mode 100644 test_all_aikubeworker0016_中文结果与验收差距.md diff --git a/.gitignore b/.gitignore index 934bb96..99f18a6 100644 --- a/.gitignore +++ b/.gitignore @@ -15,3 +15,4 @@ reports/ venv/ .qoder/* .claude/settings.local.json +.omx/ diff --git a/H100_test_all_vs_PDF_覆盖对比.md b/H100_test_all_vs_PDF_覆盖对比.md new file mode 100644 index 0000000..f6d112a --- /dev/null +++ b/H100_test_all_vs_PDF_覆盖对比.md @@ -0,0 +1,85 @@ +# H100 PDF 验收项 vs 当前 `test all` 覆盖对比 + +对比对象: + +- PDF:`/Users/d-robotics/Downloads/H100_production_acceptance.pdf` +- 当前脚本:`python gpu_tester.py --config configs/default.yaml --test all --report --format md` +- 范围:单节点 8 卡 H100。跨节点 NCCL/RDMA 暂不纳入本轮。 + +## 结论 + +当前 `test all` 已经从“功能巡检”扩成了“接近生产验收”的单节点套件:GPU 健康、NVLink/NVSwitch、HBM/PCIe/NVLink 带宽、计算、NCCL、压力、RDMA 本机端口、DCGM、训练模拟都会进入同一个 all。 + +最新 stress smoke 已确认 PyTorch BF16 GEMM 压力能把两台机器压到 PDF 要求的功耗区间: + +- `aikubeworker0012`:45 秒 smoke,稳态平均功耗约 `697-698W/卡`,TFLOPS jitter `4.07%`,XID `0`,但温差 `12C`、`clocks_throttle_reasons.active=0x4`,按 PDF 严格 FAIL。 +- `aikubeworker0016`:45 秒 smoke,稳态平均功耗约 `697-699W/卡`,TFLOPS jitter `3.77%`,XID `0`,但温差 `8C`、`clocks_throttle_reasons.active=0x4`,按 PDF 严格 FAIL。 + +也就是说,当前卡点已经不是“脚本压不满 H100”,而是机器在满功耗压力下没有满足 PDF 的 `温差 <=5C` 和 `Throttle Reasons 全程 0x0` 两个严格门槛。 + +但如果严格按 PDF 做最终验收,现在还差这些: + +1. 24 小时类指标未覆盖:PDF 要求 SBE 24h 增长率、长稳态观察;当前 `all` 是单次快照 + 30 分钟压力,不等于 24 小时老化。 +2. 跨节点项目本轮故意不测:PDF 的 IB/RDMA 生产验收通常要双端 `ib_write_bw/read_bw/lat`、`ibping`;当前按你的要求先做单节点,跨节点未纳入。 +3. 
PFC/ECN/AER 的覆盖依赖机器暴露的系统计数器:脚本会读能找到的 sysfs 计数器和 dmesg,但如果交换机侧 PFC/ECN 不在主机暴露,仍需要网络侧补证据。 +4. NCCL 1MB 档会被严格阈值打失败:实测 1MB AllReduce bus BW 约 23 GB/s,而 256MB AllReduce 已通过 `nccl-tests` 验证,约 421 GB/s;如果 PDF 要求 1MB 也达到 405 GB/s,这项不是“没测”,而是会被判 FAIL。 +5. Stress 已能达到功耗和 jitter 要求,但短测已经暴露温差和 throttle strict FAIL;完整 1800 秒只会给出更正式的证据,不会自动改变这个判据。 + +## 覆盖表 + +| PDF 验收项 | 当前 `test all` 状态 | 还少什么 | +|---|---:|---| +| GPU 基本信息、Driver/CUDA | 已覆盖 | 无;会记录 driver、CUDA、GPU 型号 | +| 温度阈值:稳态 ≤75C、峰值 ≤85C | 已覆盖健康快照;压力项覆盖 ≤80C | 24h 稳态曲线不在一次 all 内 | +| idle power ≤100W/card | 部分覆盖 | 当前 health 会采功耗,但 idle 判据还不是独立验收项 | +| stress power ≥630W/card | 已覆盖;短测两台约 697-699W/卡 | 完整 1800 秒仍待跑 | +| throttle reasons active=0x0 | 已覆盖;短测两台出现 0x4 | 按 PDF 严格判 FAIL;不是脚本跳过项 | +| DBE/SBE/retired pages | 部分覆盖 | retired pages 和内核错误已查;SBE 24h 增长率未覆盖 | +| PCIe Gen5 x16 | 部分覆盖 | GPU 信息/拓扑可见;Replay/AER 依赖 dmesg/sysfs,可能还需额外主板侧证据 | +| Fabric Manager active 且无 ERROR | 已覆盖 | 无;health 会查 systemd 和 journal | +| NVLink:18 links/GPU、25GB/s/link、错误为 0 | 已覆盖 | 无;新增 `nvlink` 项 | +| D2D/H2D/D2H 带宽 | 已覆盖 | 依赖 `nvbandwidth`,两台已具备 | +| 8x8 P2P matrix off-diagonal mean/min/deviation | 已覆盖 | 无;由 nvbandwidth JSON 解析 | +| Compute FP32/TF32/FP16/BF16/FP8/FP64/INT8 | 已覆盖 | INT8 为 PyTorch `_int_mm` 路径,若要供应商标准 INT8 kernel 需再换实现 | +| NCCL AllReduce/AllGather/ReduceScatter/Broadcast/SendRecv/AllToAll | 已覆盖 | 无;`nccl-tests` 已在两台编好 | +| NCCL 1MB/256MB/2GB,repeat 3,stddev ≤3% | 已覆盖 | 严格按 PDF 阈值时 1MB 档大概率 FAIL;256MB AllReduce 两台 `nccl-tests` 实测约 421GB/s | +| Stress ≥30min,BF16/FP16 GEMM 8192,1s telemetry | 已覆盖;默认 BF16 GEMM `24576`,1s telemetry,warmup 后稳态判定 | 完整 1800 秒待执行;短测已暴露温差/throttle FAIL | +| DCGM `dcgmi diag -r 3` | 已覆盖;DCGM 4.5.3 已安装,服务已启用 | 两台完整 `-r 3` 已 PASS;日志见 `/root/test_gpu_scripts/reports/dcgm_r3_*_20260522_17010*.log` | +| RDMA 端口 ACTIVE、400Gbps | 部分覆盖 | 单节点可查端口;严格双端吞吐/时延本轮不跑 | +| RDMA write/read bw ≥47GB/s、latency ≤2/3.5us | 部分覆盖 | 单机 localhost/perftest 不等价跨节点线速验收 | +| PFC/ECN errors=0、ibping 双向 OK | 部分覆盖 | 主机能读到的计数器会查;交换机侧/跨节点 
ibping 未覆盖 | +| 1.5B synthetic Transformer BF16,8 卡,≥45k tokens/s | 已覆盖 DDP 路径 | 8 进程 DDP smoke 已通过;完整 50 step 长跑待执行 | +| 任一子项 FAIL 则总体验收 FAIL | 已覆盖 | `all` 现在会按 strict verdict 退出非 0 | + +## 如果现在直接跑 `all` + +推荐命令: + +```bash +cd /root/test_gpu_scripts +/root/gpu-test-venv/bin/python gpu_tester.py --config configs/default.yaml --test all --report --format json --output reports/h100_all_$(hostname)_$(date +%Y%m%d_%H%M%S).json +``` + +如果要直接生成中文 Markdown 报告,用这个: + +```bash +cd /root/test_gpu_scripts +/root/gpu-test-venv/bin/python gpu_tester.py --config configs/default.yaml --test all --report --format md --output reports/h100_all_$(hostname)_$(date +%Y%m%d_%H%M%S).md +``` + +预计行为: + +- 会跑完整单节点项目,压力默认 1800 秒,默认使用 PyTorch BF16 GEMM 压力并采 1 秒 telemetry/XID。 +- stress 默认矩阵为 `24576`,用于把 H100 压到 ≥630W/卡;PDF 只要求 `matrix_size >=8192`,这里是为了满足功耗门槛。 +- NCCL 会跑 6 个 op × 3 个 message size × 3 次 repeat。 +- DCGM 会跑 `dcgmi diag -r 3 -n gpu:8 -j`;DCGM 工具链已安装并启动,`diag -r 1` 与两台独立 `r3` 长跑均已 PASS。 +- NCCL 1MB 档按 405GB/s 阈值也会失败;256MB AllReduce 已验证走 `nccl-tests`,两台约 421GB/s。 +- stress 按 PDF 严格口径预计会 FAIL:当前短测证据显示温差超过 5C,且 throttle active 出现 `0x4`。 +- 跨节点 RDMA/NCCL 不在这次单节点 all 里。 + +## 当前最小补齐清单 + +1. 如果要严格 RDMA 生产验收,下一轮用两台机器做 server/client 双端测试。 +2. 执行完整 1.5B DDP 50 step 训练验收并归档 tokens/s、jitter、显存和 loss。 +3. 执行完整 1800 秒 stress 并归档 1 秒 telemetry、XID、throttle、功耗和温度;当前预期会因温差/throttle FAIL。 +4. 如果要 24 小时验收,增加一个 24h monitor 模式,记录 SBE 增长率、XID、温度、功耗、降频曲线。 diff --git a/H100验收_vs_test_all_差距分析.md b/H100验收_vs_test_all_差距分析.md new file mode 100644 index 0000000..5599d0c --- /dev/null +++ b/H100验收_vs_test_all_差距分析.md @@ -0,0 +1,100 @@ +# H100 生产验收标准 vs 当前 `gpu_tester.py --test all` 覆盖差距 + +对比文件:`/Users/d-robotics/Downloads/H100_production_acceptance.pdf` + +对比对象:当前仓库执行 `python gpu_tester.py --test all --report --format md/json` + +## 结论 + +当前仓库的 `test all` 能覆盖验收文档里的大类框架,但还不是完整的 H100 生产验收。 + +它会跑 8 个模块: + +1. GPU Information +2. Health Check +3. Memory Benchmark +4. Compute Benchmark +5. NCCL Test +6. 
GPU Stress Test +7. RDMA/IB Test +8. Training Simulation + +但是按照 PDF 的生产验收标准,仍缺少这些关键项: + +- NVLink 每卡 18 条链路的 active/速率/错误计数逐项验收 +- DCGM `dcgmi diag -r 3` +- 30-60 分钟 burn-in 和 1 秒级温度/功耗/throttle/XID 采样 +- NCCL 官方 `nccl-tests` 的性能验收,包括 1MB/256MB/2GB 三个消息大小、重复 3 次取最差值、标准差 +- RDMA 生产口径:4MB 带宽、8B 延迟、PFC/ECN 错误、ibping 双向 +- 8 卡逐卡 compute 一致性,要求同 dtype 极差/均值 <= 3% +- FP64、INT8 计算项 +- 训练项应为 8 卡 1.5B synthetic Transformer,并按 45k tokens/s、step 抖动、显存、loss 健康度验收 + +## 覆盖矩阵 + +| PDF 验收项 | `test all` 是否覆盖 | 当前覆盖程度 | 主要缺口 | +| --- | --- | --- | --- | +| 1. 健康检查 | 部分覆盖 | 温度、功耗、ECC、PCIe、时钟、throttle、persistence、IB 设备 | idle 功耗 <=100W 未单独判定;stress 功耗 >=630W 未判定;retired pages 未查;24h SBE 增长率未查;AER/Replay errors 未查;fabricmanager 服务和 ERROR 日志未查 | +| 2. NVLink 拓扑与链路 | 部分覆盖 | GPU info 会保存 `nvidia-smi topo -m` | 未跑 `nvidia-smi nvlink -s/-c/-e`;未验证每卡 18 条 NVLink;未验证每条 25GB/s;未验证 CRC/Replay/Recovery error = 0 | +| 3. Memory Bandwidth | 部分覆盖 | 会用 nvbandwidth 测 H2D、D2H、D2D write/read/bidir | 未输出完整 8x8 P2P 矩阵;未验非对角均值 >=360GB/s、最小值 >=320GB/s、相对均值偏差 <=±5%;D2D 口径和 PDF 的单卡/P2P 验收口径还没完全对齐 | +| 4. Compute Throughput | 大部分覆盖 | 默认配置已是 matrix_size=8192、warmup=50、iterations=500、use_compile=true;H100 绝对 TFLOPS 阈值在 `gpu_specs.py` 里有 | 目前测试结果是整体/单进程口径,未真正逐 GPU 分别测出 8 卡极差/均值;未测 FP64、INT8 | +| 5. NCCL Multi-GPU | 部分覆盖,依赖工具 | 代码支持 nccl-tests;若缺 binary 会 fallback torchrun 功能连通性 | 当前远端没装好 nccl-tests,实际会退化成功能测试且失败/无性能数据;默认只启 allreduce/alltoall/broadcast,未启 allgather/reducescatter/sendrecv;消息大小不是 1MB/256MB/2GB 三点;未重复 3 次取 worst;未统计标准差 | +| 6. Stress/Burn-in | 部分覆盖 | 会跑 stress,默认 60 秒;无 gpu-burn 时用 PyTorch fallback | PDF 要 >=30min,推荐 60min;要 FP16/BF16 大 GEMM matrix >=8192;要每分钟 TFLOPS 抖动、温度 <=80、卡间温差 <=5、功耗 >=630W、throttle=0、XID=0;当前 PyTorch fallback 只分配约 64MB/卡,压力不够 | +| 7. DCGM 诊断 | 未覆盖 | 无 | 没有执行 `dcgmi diag -r 3`,也没有解析 Software/Deployment/Hardware/Integration/Stress/Power 子项 | +| 8. 
RDMA/IB | 部分覆盖 | 会发现 IB 设备,跑 ib_write_bw/read_bw/write_lat/read_lat | 当前脚本用 `localhost`,不是跨节点;msg_size 是 64KB,不是 4MB;latency 没指定 8B;阈值是 50GB/s 和 10us,不是 PDF 的 write/read >=47GB/s、write_lat <=2us、read_lat <=3.5us;未查 PFC/ECN、ibping 双向 | +| 9. Training Simulation | 部分覆盖 | 会跑 GPT-2 或 synthetic transformer,输出 tokens/s、step time、显存、loss | 当前 synthetic 是约 1.47B 参数但实际单进程 `.cuda()`,不是 8 卡分布式训练;未按 45k tokens/s、step 抖动 <=±3%、peak <=70GB/卡、NaN/Inf 做硬判定 | +| 10. 总体 Verdict | 部分覆盖 | report 有 summary | 当前 `all` 的 pass/fail 逻辑偏“模块是否报错”,不是 PDF 的任一子项 FAIL 即整机禁上生产 | + +## 如果现在直接执行 `test all`,能得到什么 + +会得到一份“单节点综合体检/基准测试报告”,包含: + +- 8 张 H100 的基础信息、驱动/CUDA、PCIe、显存、温度、功耗 +- 健康检查结果 +- nvbandwidth 的 H2D/D2H/D2D 汇总带宽 +- FP32/TF32/FP16/BF16/FP8 计算吞吐 +- NCCL 测试结果,如果 nccl-tests 缺失会退化到 torchrun fallback +- 60 秒 stress 结果 +- 本机 localhost RDMA/IB 结果 +- 训练模拟结果 + +这份报告能作为“快速冒烟 + 单机初筛”,不能直接作为 PDF 标准下的“生产验收合格报告”。 + +## 当前两台机器执行前置状态 + +已经确认: + +- `nvbandwidth` 已装好并能被项目脚本调用 +- PyTorch CUDA 环境已装好 +- RDMA perftest 工具已存在 +- `nccl-tests` 和 `gpu-burn` 目前没有按 PDF 生产验收口径准备好 + +另外,我刚才误触发的 `test all`: + +- `aikubeworker0016` 已经在跑单节点 `test all`,当前到 Training Simulation +- `aikubeworker0012` 没有成功启动 + +## 要补齐到 PDF 验收口径,需要加的最小清单 + +1. 安装/修复 `nccl-tests`,确保真正输出 bus BW,而不是 torchrun fallback。 +2. 安装/修复 `gpu-burn`,或把 PyTorch stress 改成真正高占用 FP16/BF16 GEMM,并支持 30/60 分钟。 +3. 增加 NVLink 专项:`nvidia-smi nvlink -s/-c/-e`,按 18 条/卡、25GB/s、error=0 判定。 +4. 增加 DCGM 专项:`dcgmi diag -r 3`,解析子项 PASS/FAIL。 +5. 增加 telemetry 采样:stress 期间每 1 秒采温度、功耗、throttle、XID;计算稳态功耗、温差、抖动。 +6. 修改 RDMA:支持指定 server/client、4MB 带宽、8B 延迟、双向 ibping、PFC/ECN 计数。 +7. 修改 NCCL 配置:全 op 开启,按 1MB/256MB/2GB 三个 size,重复 3 次取最差值和标准差。 +8. 修改 Compute:逐 GPU 分别跑,计算同 dtype 极差/均值;增加 FP64、INT8。 +9. 修改 Training Simulation:明确 8 卡 1.5B synthetic 分布式训练,加入 tokens/s、step 抖动、显存、loss NaN/Inf 的 PASS/FAIL。 +10. 
修改最终 verdict:按 PDF 规则,任一子项 FAIL 就整机不通过。 + +## 建议执行策略 + +现在直接跑: + +```bash +/root/gpu-test-venv/bin/python gpu_tester.py --test all --report --format md --output reports_all/test_all.md +``` + +得到的是“当前仓库 all 覆盖范围报告”。 + +要拿来做生产验收,需要先补齐上面的缺口,尤其是 `nccl-tests`、`gpu-burn`、NVLink、DCGM、长时间 burn-in、跨节点 RDMA。 diff --git a/README.md b/README.md index ebe1ae6..1af08c4 100644 --- a/README.md +++ b/README.md @@ -159,7 +159,7 @@ python3 gpu_tester.py [3] Memory Benchmark (nvbandwidth) [4] Compute Benchmark [5] NCCL Multi-GPU Test - [6] GPU Stress Test (gpu-burn) + [6] GPU Stress Test (PyTorch/gpu-burn) [7] RDMA/IB Test [8] Training Simulation [9] Full Test Suite (All Tests) @@ -279,33 +279,35 @@ python3 gpu_tester.py --config /path/to/config.yaml --test all | FP16 | 312 TFLOPS | 990 TFLOPS | 2,250 TFLOPS | 3,500 TFLOPS | | BF16 | 312 TFLOPS | 990 TFLOPS | 2,250 TFLOPS | 3,500 TFLOPS | | FP8 | N/A | 1,979 TFLOPS | 4,500 TFLOPS | 7,000 TFLOPS | +| FP64 | 9.7 TFLOPS | 67 TFLOPS | TBD | TBD | +| INT8 | 624 TOPS | 1,979 TOPS | TBD | TBD | -默认配置:4096×4096 矩阵,10 次 warmup,100 次迭代。 +默认配置:8192×8192 矩阵,50 次 warmup,500 次迭代;逐 GPU 跑 FP32/TF32/FP16/BF16/FP8/FP64/INT8,并按同 dtype 的极差/均值判断一致性。 ### 5. NCCL Multi-GPU Test(多卡通信) -优先使用官方 nccl-tests(通过 mpirun 调用),不可用时 torchrun fallback。 +优先使用官方 nccl-tests(通过 mpirun 调用)并解析真实 bus BW;如果只能走 torchrun fallback,验收结果会标记 FAIL。 | 操作 | 说明 | |---|---| | AllReduce | 最常用的集合通信 | | AllToAll | 模型并行关键操作 | | Broadcast | 参数同步 | -| ReduceScatter | 可选 | -| AllGather | 可选 | -| SendRecv | 可选 | +| ReduceScatter | 必测 | +| AllGather | 必测 | +| SendRecv | 必测 | -默认测试数据量范围 8B ~ 256MB,5 次 warmup,20 次迭代。 +默认按 PDF 口径测试 1MB、256MB、2GB 三个 size,每个 op 重复 3 次,取 worst bus BW 和标准差;标准差超过 3% 判 FAIL。 **NVLink 参考带宽:** A100/A800 ≥ 240 GB/s | H100/H200 ≥ 360 GB/s | B200/B300 ≥ 720 GB/s(40% NVLink 峰值) ### 6. 
GPU Stress Test(压力测试) -使用 gpu-burn 进行长时满载测试,验证热稳定性和内存正确性。 +默认使用 PyTorch BF16/FP16 GEMM 进行长时高功耗满载测试;也可在配置中启用 gpu-burn。测试期间采集温度、功耗、throttle、XID,并计算稳态功耗、温差和 TFLOPS 抖动。 | 参数 | 默认值 | 说明 | |---|---|---| -| duration_sec | 60 | 测试时长(秒) | +| duration_sec | 1800 | 测试时长(秒) | | use_tensor_cores | true | 使用 Tensor Core | | memory_pct | 90 | 内存占用比例 | @@ -320,18 +322,18 @@ python3 gpu_tester.py --config /path/to/config.yaml --test all | 写延迟 | ib_write_lat | | 读延迟 | ib_read_lat | -**参考阈值:** 带宽 ≥ 50 GB/s, 延迟 ≤ 10 μs +**参考阈值:** 端口 ACTIVE 且 ≥400Gbps;4MB 写/读带宽 ≥47GB/s;8B 写延迟 ≤2μs、读延迟 ≤3.5μs;PFC/ECN/CNP/congestion 计数为 0。 ### 8. Training Simulation(训练模拟) -使用真实或合成模型模拟训练负载。 +默认跑 8 卡 DDP synthetic 1.5B Transformer 训练模拟。 | 模式 | 说明 | |---|---| -| 真实模型 | 加载 HuggingFace GPT-2(需安装 transformers) | -| 合成模型 | 6 层 Transformer(无需额外依赖) | +| DDP 合成模型 | 约 1.5B 参数,8 卡 torchrun | +| 单进程 fallback | 仅用于调试;生产验收按 FAIL | -输出:tokens/sec、步时、峰值显存、最终 loss。 +输出:tokens/sec、步时、warmup 后 step 抖动、峰值显存、最终 loss,并检查 loss 是否 NaN/Inf。 --- @@ -351,14 +353,14 @@ benchmark: nvbandwidth_buffer_mb: 512 # nvbandwidth 缓冲区大小 nvbandwidth_samples: 3 # nvbandwidth 采样次数 compute: - dtypes: [fp32, tf32, fp16, bf16, fp8] - matrix_size: 4096 # GEMM 矩阵维度 - warmup: 10 - iterations: 100 + dtypes: [fp32, tf32, fp16, bf16, fp8, fp64, int8] + matrix_size: 8192 # GEMM 矩阵维度 + warmup: 50 + iterations: 500 health: - temp_warning: 80 # 温度警告阈值 °C - temp_critical: 90 # 温度严重阈值 °C + temp_warning: 75 # 温度警告阈值 °C + temp_critical: 85 # 温度严重阈值 °C power_limit: null # null = 自动匹配 GPU TDP nccl: @@ -366,26 +368,62 @@ nccl: test_allreduce: true test_alltoall: true test_broadcast: true + test_reduce_scatter: true + test_allgather: true + test_sendrecv: true + message_sizes: [1M, 256M, 2G] + repeats: 3 + max_stddev_pct: 3 stress: - duration_sec: 60 # 压力测试时长 + duration_sec: 1800 # 压力测试时长 + use_gpu_burn: false # 默认走 PyTorch GEMM stress + dtype: bf16 + matrix_size: 24576 + telemetry_interval_sec: 1 + min_power_watts: 630 + max_tflops_jitter_pct: 5 + 
require_tflops_jitter: true use_tensor_cores: true rdma: - min_bandwidth_gbps: 50 # RDMA 最低可接受带宽 - max_latency_us: 10 # RDMA 最大可接受延迟 - msg_size: 65536 # 测试消息大小 + min_bandwidth_gbps: 47 # RDMA 最低可接受带宽 + min_port_rate_gbps: 400 # IB 端口最低速率 + max_write_latency_us: 2.0 + max_read_latency_us: 3.5 + msg_size: 4194304 # 4MB 带宽测试消息 + latency_msg_size: 8 # 8B 延迟测试消息 + server_addr: null # client 模式 perftest 对端 IP + ibping_target: null # ibping 对端 LID/GID,不是 IP + role: auto # auto / server / client + pfc_ecn_counters: true + +nvlink: + expected_links_per_gpu: 18 + expected_link_speed_gbps: 25 + require_zero_errors: true + +dcgm: + diag_level: 3 + timeout_sec: 3600 + expected_num_gpus: 8 + json_output: true + require_subtests: true training: - model: gpt2 # HuggingFace 模型名 + model: synthetic_1.5b # 8 卡 synthetic Transformer batch_size: 8 seq_length: 2048 num_steps: 50 + warmup_steps: 5 dtype: bf16 + mode: ddp + min_tokens_per_sec: 45000 + max_step_jitter_pct: 3 report: output_dir: ./reports - format: json # json 或 html + format: json # json / html / md ``` --- @@ -493,9 +531,11 @@ report: 步骤 2: RDMA 网络测试 ├── python3 gpu_tester.py --test rdma ├── 确认: IB 设备被识别 -├── 确认: 端口状态 Active -├── 确认: 写带宽 ≥ 50 GB/s -├── 确认: 延迟 ≤ 10 μs +├── 确认: 端口状态 ACTIVE 且 ≥400Gbps +├── 确认: 4MB 写/读带宽 ≥47 GB/s +├── 确认: 8B 写延迟 ≤2 μs、读延迟 ≤3.5 μs +├── 确认: ibping 双向连通 +├── 确认: PFC/ECN/CNP/congestion 计数为 0 └── 异常: 检查 IB 线缆、交换机配置、子网管理器 步骤 3: 多节点 NCCL 测试 diff --git a/docs/h100_test_all_metrics_guide_cn.md b/docs/h100_test_all_metrics_guide_cn.md new file mode 100644 index 0000000..37abd28 --- /dev/null +++ b/docs/h100_test_all_metrics_guide_cn.md @@ -0,0 +1,255 @@ +# H100 `test all` 指标说明 + +本文解释 `gpu_tester.py --test all` 报告里每一项指标的意义、它在验收中代表什么,以及异常时通常应该优先排查什么。 + +适用报告: + +- `reports_test_all_latest_aikubeworker0012_20260522_203246.md` +- `reports_test_all_latest_aikubeworker0016_20260522_203447.md` +- `reports_test_all_latest_summary_cn_20260523.md` + +## 总体判定 + +| 指标 | 意义 | 怎么看 | +|---|---|---| +| `Overall 
Acceptance Verdict` | 整机验收结论 | 按 PDF 生产验收规则,任一必测子项 FAIL,则整机 FAIL | +| `Suite complete: x/10 tests passed` | 10 个测试模块里通过了几个 | 用来快速看整体健康度,但最终以 `Overall Acceptance Verdict` 为准 | +| `PASS` | 达到当前配置阈值 | 表示该指标在当前测试口径下通过 | +| `FAIL` | 未达到当前配置阈值,或证据不足 | 表示该项不能作为生产验收通过证据 | +| `WARN` | 旧报告或非强制警告口径 | 当前 PDF 生产验收里,关键性能未达标应按 FAIL 处理 | + +## GPU Info + +GPU Info 是基础盘点项,用来确认机器硬件、驱动和 CUDA 环境是否符合预期。 + +| 指标 | 意义 | 异常影响 | +|---|---|---| +| GPU count | 当前系统识别到的 GPU 数量 | H100 8 卡机器如果不是 8 张,后续所有多卡测试都不可信 | +| GPU model | GPU 型号,例如 H100 | 型号不对会导致阈值、峰值、验收口径都不对 | +| Driver version | NVIDIA 驱动版本 | 版本过旧可能影响 CUDA、NCCL、DCGM、NVLink 工具 | +| CUDA version | CUDA 运行时或驱动支持版本 | CUDA 不匹配会导致 PyTorch、nccl-tests 或编译工具异常 | +| GPU UUID / PCI bus id | GPU 唯一标识和 PCIe 拓扑位置 | 用于定位具体故障卡、对应槽位和链路 | + +这项通常不直接代表性能好坏,它是确认“测的是不是目标机器、目标 GPU、目标软件栈”。 + +## Health Check + +Health Check 是空闲或轻负载状态下的基础健康检查。 + +| 指标 | 意义 | 怎么看 | +|---|---|---| +| Temperature | 当前 GPU 温度 | 空闲温度过高可能说明散热、风道、环境温度异常 | +| Power | 当前功耗 | 空闲功耗异常高可能说明有残留进程或功耗状态异常 | +| ECC errors | 显存纠错错误 | 单比特错误过多或双比特错误通常需要重点关注硬件稳定性 | +| PCIe | PCIe 代际和宽度,例如 Gen5 x16 | 降速或降宽会影响 CPU-GPU、RDMA、部分数据搬运性能 | +| Throttle | 当前是否触发限速 | 空闲状态下非 idle throttle 不正常,可能影响后续性能 | +| XID / NVRM events | 驱动或 GPU 错误事件 | 出现新 XID 通常说明硬件、驱动、供电或内核态异常 | + +Health PASS 只能说明基础状态正常,不代表满载性能一定达标。 + +## Memory Bandwidth + +Memory Bandwidth 衡量数据搬运能力,包括 CPU 到 GPU、GPU 到 CPU、GPU 到 GPU。 + +| 指标 | 意义 | 代表什么 | +|---|---|---| +| H2D | Host to Device,CPU 内存到 GPU 显存带宽 | 受 PCIe、NUMA、CPU 内存、驱动影响 | +| D2H | Device to Host,GPU 显存到 CPU 内存带宽 | 受 PCIe、NUMA、CPU 内存、驱动影响 | +| D2D | Device to Device,GPU 到 GPU 带宽 | 单节点多卡通常主要受 NVLink/NVSwitch 影响 | +| Efficiency | 实测值相对理论或配置阈值的比例 | 用于快速判断是否达到预期带宽 | + +H2D/D2H 主要看 PCIe 和 CPU 侧链路是否正常。D2D 更接近多卡训练、NCCL 和 P2P 通信的基础能力。 + +## Compute Throughput + +Compute Throughput 衡量 GPU 在不同数值格式下的矩阵计算吞吐,单位通常是 TFLOPS。 + +| 指标 | 意义 | 常见用途 | +|---|---|---| +| FP32 | 32 位浮点性能 | 传统科学计算、部分模型训练和验证 | +| TF32 | TensorFloat-32 Tensor Core 性能 | NVIDIA Ampere/Hopper 上常见的 FP32 加速路径 | +| FP16 | 16 位浮点 
Tensor Core 性能 | 深度学习训练和推理常用 | +| BF16 | bfloat16 Tensor Core 性能 | 大模型训练常用,数值范围比 FP16 更稳 | +| FP8 | 8 位浮点 Tensor Core 性能 | 新一代低精度训练/推理加速 | +| FP64 | 64 位双精度性能 | HPC、科学计算、仿真 | +| INT8 | 8 位整数性能 | 推理、量化模型 | +| Achieved | 实测吞吐 | 越接近峰值越好 | +| Peak | 理论峰值或规格峰值 | 用来计算效率 | +| Threshold | 当前验收阈值 | 低于阈值则 FAIL | +| Efficiency | `Achieved / Peak` | 衡量实测利用率 | + +### Compute Consistency + +Consistency 是看同一种 dtype 下,不同 GPU 之间性能是否均衡。 + +| 指标 | 意义 | 异常含义 | +|---|---|---| +| Min | 8 张 GPU 里最慢卡的实测值 | 用于发现拖后腿的卡 | +| Mean | 8 张 GPU 平均值 | 用于看整体水平 | +| Max | 8 张 GPU 里最快卡的实测值 | 和 Min 一起计算离散度 | +| Spread | `(Max - Min) / Mean` | 反映卡间性能差异 | + +Spread 超过阈值通常说明某些卡受温度、功耗、PCIe、后台负载、时钟策略或硬件状态影响。即使平均性能还可以,卡间差异过大也会拖慢分布式训练。 + +## NVLink / NVSwitch + +NVLink/NVSwitch 测试确认 GPU 间高速互联是否完整、速率是否正确、错误计数是否干净。 + +| 指标 | 意义 | 怎么看 | +|---|---|---| +| Active Links | 每张 GPU 当前活跃 NVLink 数 | H100 8 卡 SXM 常见期望是每卡 18 条 | +| Expected Links | 配置期望链路数 | 少一条都可能影响拓扑和 NCCL 性能 | +| Link speed | 单条链路速率 | 速率不对说明链路降级或识别异常 | +| Error counters | NVLink 错误计数,例如 CRC/replay/recovery | 非零可能说明链路质量或硬件问题 | + +NVLink PASS 表示链路状态看起来正常,但 NCCL 仍可能因算法、拓扑、消息大小、NCCL 参数或系统噪声而不达标。 + +## DCGM Diagnostic + +DCGM 是 NVIDIA 官方诊断工具。`dcgmi diag -r 3` 是比较完整的生产诊断级别。 + +| 子项 | 意义 | +|---|---| +| Deployment/software | 驱动、库、系统软件依赖检查 | +| Hardware/memory | GPU 显存健康检查 | +| Hardware/diagnostic | GPU 硬件基础诊断 | +| Hardware/nvbandwidth | GPU/NVLink/NVSwitch 带宽诊断 | +| Integration/pcie | PCIe 集成和链路相关检查 | +| Stress/targeted_stress | DCGM 自带目标压力测试 | +| Stress/targeted_power | DCGM 自带目标功耗压力测试 | +| summary | 该分类汇总结果 | + +DCGM PASS 是强证据,说明官方诊断没有发现明显硬件故障。但它不替代项目里的 NCCL、RDMA、长时间 telemetry 和训练模拟验收。 + +## NCCL Multi-GPU + +NCCL 测试衡量单节点多 GPU 集合通信能力。它直接关系到多卡训练效率。 + +| 指标 | 意义 | 为什么重要 | +|---|---|---| +| source | 测试来源 | 必须是 `nccl-tests` 才有真实 bus BW;`torchrun_fallback` 只能说明功能连通,不是性能验收 | +| bus BW | NCCL 报告的总线等效带宽 | 用来衡量通信是否吃满 NVLink/NVSwitch | +| message size | 消息大小,例如 1M、256M、2G | 小消息看延迟和调度,中大消息看带宽 | +| repeats | 重复次数 | 减少偶然波动,当前按 3 次取样 | +| worst bus BW | 多次结果里的最差值 | 
生产验收更关注最差情况 | +| mean bus BW | 多次平均值 | 反映稳定水平 | +| stddev | 标准差或波动 | 波动大说明通信稳定性不足 | + +### NCCL op 含义 + +| Op | 意义 | 常见场景 | +|---|---|---| +| allreduce | 每张卡都有一份数据,做规约后每张卡都拿到结果 | 数据并行梯度同步最常见 | +| allgather | 每张卡收集所有卡的数据分片 | 模型并行、张量并行、参数/激活收集 | +| reducescatter | 先规约再把结果切分给各卡 | ZeRO、优化器状态切分、分布式训练常用 | +| broadcast | 一张卡把数据广播给其他卡 | 参数同步、初始化权重分发 | +| sendrecv | 点对点发送和接收 | pipeline、定制通信、拓扑验证 | +| alltoall | 每张卡向每张卡交换不同数据 | MoE、专家并行、shuffle 类通信 | + +NCCL 小消息失败常见于延迟、调度或阈值口径较严;大消息失败更偏向链路带宽、拓扑、NCCL 参数或 NVSwitch/PCIe/NUMA 配置问题。 + +## Stress Test + +Stress Test 是长时间高负载稳定性测试。它不是只看“能不能跑完”,还要看满载期间的温度、功耗、限速和错误事件。 + +| 指标 | 意义 | 怎么看 | +|---|---|---| +| duration | 实际压力测试时长 | 生产验收通常需要 30/60 分钟 | +| source | 压力来源,例如 `pytorch` 或 `gpu-burn` | 说明用什么负载压 GPU | +| dtype | 压力计算的数据类型,例如 BF16 | 影响 Tensor Core、功耗和温度 | +| matrix_size | GEMM 矩阵边长 | 越大越容易形成持续高占用 | +| memory_pct | 目标显存占用比例 | 避免只测很小负载 | +| Avg steady power | 稳态平均功耗 | 判断是否真的把卡压起来 | +| Max steady temp | 稳态最高温度 | 判断散热上限 | +| Temp delta | 8 卡之间最高温和最低温的差 | 差异过大说明风道、散热或卡位不均衡 | +| TFLOPS jitter | 稳态吞吐波动 | 波动大说明性能不稳定 | +| Throttle events | 限速事件数量 | 非 idle throttle 会影响性能稳定性 | +| XID events | 压测期间新增 XID 错误 | 出现 XID 通常是严重风险 | + +### Throttle 常见含义 + +| 代码 | 常见含义 | 解释 | +|---|---|---| +| `0x1` | idle throttle | 空闲状态限速,通常不算真实问题 | +| `0x4` | `sw_power_cap` | 达到软件功耗上限,性能可能被功耗墙限制 | +| `0x8` | hardware slowdown | 硬件触发降速 | +| `0x20` | software thermal slowdown | 软件温度策略触发降速 | +| `0x40` | hardware thermal slowdown | 温度触发硬件降速 | +| `0x80` | power brake | 外部供电或硬件功率保护 | + +当前报告里的 `sw_power_cap` 表示负载确实压到了功耗墙附近,但验收口径把非 idle throttle 作为失败原因之一,因为它会影响长时间稳定输出。 + +## RDMA / InfiniBand + +RDMA 测试衡量 IB 网卡和网络链路性能。单节点 loopback 和跨节点 server/client 是两种不同证据,不能混用。 + +| 指标 | 意义 | 怎么看 | +|---|---|---| +| Device | IB 设备名,例如 `mlx5_0` | 对应具体 HCA/端口 | +| Port | 端口号 | 通常是 port 1 | +| State | 端口状态,例如 ACTIVE/DOWN | ACTIVE 才能作为可用链路 | +| Rate | 端口速率,例如 400 Gb/sec | 低于期望说明链路降级或接错网络 | +| GID/LID | IB 寻址信息 | `ibping` 和跨节点定位会用到 | +| ib_write_bw | RDMA write 带宽 | 客户端向远端写数据的吞吐 | +| ib_read_bw | RDMA 
read 带宽 | 客户端从远端读数据的吞吐 | +| ib_write_lat | RDMA write 延迟 | 小消息写延迟 | +| ib_read_lat | RDMA read 延迟 | 小消息读延迟 | +| ibping | IB 层连通性测试 | 看 LID/GID 层是否可达 | +| PFC/ECN/CNP counters | 拥塞和流控相关计数 | 非零或增长可能说明网络拥塞/丢包/流控问题 | + +### 单节点与跨节点的区别 + +| 口径 | 意义 | 能证明什么 | 不能证明什么 | +|---|---|---|---| +| `local_loopback` | 在同一台机器本地启动 perftest server/client | 工具、设备、单机端口基本可用 | 不能证明两台机器之间 RDMA 网络达标 | +| server/client 跨节点 | 一台做 server,另一台做 client | 能证明实际跨节点 RDMA 带宽/延迟 | 需要明确 server_addr、ib_device、ib_port、ibping_target | + +RDMA read 带宽低于 write 带宽很常见,但生产验收会给 read/write 各自设置阈值。read 不过线时,需要排查 HCA 固件、BIOS、PCIe、NUMA、RoCE/IB 配置、交换机、PFC/ECN、线缆和端口速率。 + +## Training Simulation + +Training Simulation 用一个合成 1.5B Transformer 训练负载验证 8 卡分布式训练是否能稳定运行。 + +| 指标 | 意义 | 怎么看 | +|---|---|---| +| Model | 模型类型 | 当前是 synthetic 1.5B,不依赖真实数据集 | +| Parameters | 参数量 | 用来确认负载规模是否达到预期 | +| GPU Count | 参与训练的 GPU 数 | 生产口径要求 8 卡 DDP | +| DType | 训练数值格式,例如 BF16 | 大模型训练常用 BF16 | +| Batch Size | 每步 batch 大小 | 影响吞吐和显存 | +| Seq Length | 序列长度 | 影响计算量和显存 | +| Steps | 计入统计的训练步数 | 步数太少会导致统计不稳 | +| Warmup Steps | 预热步数 | 避免把 CUDA 初始化、编译、缓存冷启动计入性能 | +| Avg Step Time | 平均每步耗时 | 越低越好 | +| Throughput | tokens/sec | 训练吞吐核心指标 | +| Samples/sec | 每秒样本数 | 辅助衡量数据处理速度 | +| Peak Memory | 峰值显存 | 看是否接近 OOM 或显存利用不足 | +| Final Loss | 最后 loss | 用于确认数值是有限值,没有 NaN/Inf | +| Step Jitter | step 时间抖动 | 抖动大说明训练不稳定 | +| Distributed Mode | 分布式模式 | 必须是 `ddp` 才满足 8 卡分布式口径 | + +Training PASS 说明 8 卡 DDP 训练路径、NCCL 功能连通、PyTorch CUDA 和基本数值稳定性都没问题。但它不能替代 NCCL 性能测试,因为训练负载可能没有覆盖所有通信模式和消息大小。 + +## 常见误读 + +1. `DCGM PASS` 不等于整机验收 PASS。DCGM 是官方诊断的一部分,不覆盖全部业务性能门槛。 +2. `Training PASS` 不等于 NCCL 性能 PASS。训练能跑,只说明功能链路通;NCCL bus BW 仍可能不达标。 +3. `NVLink PASS` 不等于 NCCL PASS。链路数量和错误计数正常,不代表所有 NCCL op/size 都达到阈值。 +4. `ibping PASS` 不等于 RDMA 带宽 PASS。`ibping` 只证明连通性,不证明吞吐和延迟达标。 +5. `local_loopback` 不能当作跨节点 RDMA 证据。跨节点验收必须有 server/client 两端证据。 +6. Stress 跑满 30 分钟不等于 PASS。温差、功耗、throttle、XID、jitter 都要一起看。 +7. 
小消息 NCCL 低不一定是链路断了,可能是延迟、算法、启动开销或阈值口径导致;但生产验收仍按阈值判定。 + +## 排查优先级建议 + +| 失败项 | 优先看什么 | +|---|---| +| Compute FAIL | GPU 时钟、功耗策略、MIG/MPS、后台进程、PyTorch/CUDA 版本、benchmark 算法是否用到目标 Tensor Core 路径 | +| NCCL FAIL | `NCCL_DEBUG=INFO`、拓扑、NVSwitch/NVLink、NCCL 算法、消息大小、PCIe/NUMA、进程绑核 | +| Stress FAIL | 机箱风道、风扇、环境温度、功耗上限、`nvidia-smi -q -d POWER,CLOCK,TEMPERATURE` | +| RDMA FAIL | 端口速率、HCA 固件、线缆、交换机、PFC/ECN、NUMA、BIOS、跨节点 server/client 配置 | +| Training FAIL | torchrun、NCCL 环境变量、CUDA OOM、loss NaN/Inf、DDP 初始化、网络/共享内存 | + +## 一句话版 + +这套报告不是只看 GPU 能不能亮、训练能不能跑,而是同时验证:硬件识别、基础健康、显存和互联带宽、计算吞吐、多卡通信、长时间满载稳定性、IB/RDMA 网络、官方 DCGM 诊断和 8 卡训练业务路径。任何一个关键项 FAIL,按生产验收都应判整机不通过。 diff --git a/docs/multinode_nccl_concepts.md b/docs/multinode_nccl_concepts.md new file mode 100644 index 0000000..1c6039d --- /dev/null +++ b/docs/multinode_nccl_concepts.md @@ -0,0 +1,362 @@ +# 多机多卡 NCCL 测试概念说明 + +本文先讲概念,不涉及脚本改造。目标是理解两台 8 卡 H100 服务器做多机多卡通信测试时,应该从哪些层次逐步验证,以及每一层到底在证明什么。 + +当前示例机器: + +| 别名 | 主机名 | 内网 IP | GPU | +|---|---|---|---| +| nccl-gpu-1 | aikubeworker0012 | 172.72.8.12 | 8 x H100 | +| nccl-gpu-2 | aikubeworker0016 | 172.72.8.16 | 8 x H100 | + +两台机器合起来就是 16 张 GPU。多机 NCCL 测试的核心问题是:这 16 张 GPU 是否能通过正确的 GPU、NVLink、PCIe、IB/RDMA 网络路径,高效且正确地完成集体通信。 + +## 1. 总体思路 + +多机多卡通信测试是一个自底向上的过程。越底层越接近硬件和链路,越上层越接近真实训练业务。 + +```mermaid +flowchart TD + L0["0. 物理与基础连通
电源 / GPU / 网卡 / 线缆 / 交换机 / SSH"] --> L1["1. 系统识别层
nvidia-smi / lspci / ibstat / ibdev2netdev"] + L1 --> L2["2. 单机 GPU 健康层
温度 / 功耗 / ECC / PCIe / Throttling / NVLink Topo"] + L2 --> L3["3. 单机 GPU 性能层
HBM 带宽 / H2D-D2H / FP32-TF32-FP16-BF16-FP8 算力"] + L3 --> L4["4. 单机多卡通信层
单节点 8 卡 NCCL over NVLink/NVSwitch"] + L4 --> L5["5. 跨机网络与 RDMA 层
IP 连通 / IB Active / RDMA 带宽 / RDMA 延迟"] + L5 --> L6["6. 跨机 NCCL 层
两机 16 卡 AllReduce / AllGather / ReduceScatter / Broadcast / AllToAll"] + L6 --> L7["7. 训练负载层
torchrun / Megatron / DeepSpeed / 业务训练压测"] +``` + +最重要的原则: + +**上层失败,不一定是上层问题。** + +比如两机 `all_reduce_perf` 失败,原因可能在 NCCL,也可能在 SSH、MPI、IB、GID、网卡选择、驱动版本、CUDA 版本、NCCL 版本或 GPU Direct RDMA。 + +所以排查顺序应该是: + +```text +基础连通 -> 单机健康 -> 单机性能 -> 单机 NCCL -> 跨机 RDMA -> 跨机 NCCL -> 训练业务 +``` + +## 2. 两机 16 卡通信路径 + +单机内部主要走 NVLink/NVSwitch;跨机器时,数据必须经过 GPU、PCIe/NVLink、网卡、交换机和对端网卡。 + +```mermaid +flowchart LR + subgraph A["aikubeworker0012 / 172.72.8.12"] + A0["GPU0"] --- ASW["NVSwitch / NVLink"] + A1["GPU1"] --- ASW + A2["..."] --- ASW + A7["GPU7"] --- ASW + ASW --> ANIC["IB/RDMA NIC(s)"] + end + + subgraph NET["InfiniBand / RoCE Fabric"] + SW["IB Switch"] + end + + subgraph B["aikubeworker0016 / 172.72.8.16"] + BNIC["IB/RDMA NIC(s)"] --> BSW["NVSwitch / NVLink"] + B0["GPU0"] --- BSW + B1["GPU1"] --- BSW + B2["..."] --- BSW + B7["GPU7"] --- BSW + end + + ANIC <--> SW + SW <--> BNIC +``` + +这里有两个不同的通信域: + +| 通信域 | 典型路径 | 主要测试 | +|---|---|---| +| 单机内 8 卡 | GPU -> NVLink/NVSwitch -> GPU | 单机 NCCL、NVLink topo、D2D | +| 跨机器 16 卡 | GPU -> NIC -> IB/RDMA 网络 -> NIC -> GPU | RDMA、跨机 NCCL | + +这两个域的性能阈值不能混用。单机 NVSwitch 很快,跨机 RDMA 一般慢一些,跨机 NCCL 的瓶颈通常在 IB/RDMA 网络。 + +## 3. 
每一层要测什么 + +### 3.1 基础连通层 + +这一层只证明机器能访问、身份和地址正确。 + +要确认: + +| 检查项 | 目的 | +|---|---| +| SSH 互通 | MPI/NCCL 多机启动依赖远端拉起进程 | +| hostname 正确 | 避免登录错机器 | +| IP 正确 | 确认使用的是训练网络或 IB/RDMA 对应网络 | +| 时间同步 | 长时间训练日志和超时排查更可靠 | + +这一层不证明 GPU 或 RDMA 性能,只证明“机器能互相找到”。 + +### 3.2 系统识别层 + +这一层证明系统能看见 GPU 和网卡。 + +常见信息: + +| 工具 | 看什么 | +|---|---| +| `nvidia-smi` | GPU 数量、型号、驱动、CUDA、温度、功耗 | +| `nvidia-smi topo -m` | GPU、NIC、CPU NUMA、NVLink/NVSwitch 拓扑 | +| `ibstat` | IB 设备、端口状态、链路速率 | +| `ibdev2netdev` | mlx5 设备和网络接口的映射 | +| `/sys/class/infiniband` | 端口状态、link layer、rate、GID | + +这一层很关键,因为 NCCL 经常因为选错网卡而跑到 TCP 或错误的接口上。 + +### 3.3 单机 GPU 健康层 + +这一层证明每台机器自己是健康的。 + +```mermaid +flowchart LR + H["单机健康检查"] --> T["温度"] + H --> P["功耗"] + H --> E["ECC 错误"] + H --> PCIE["PCIe Gen/Width"] + H --> C["SM/Mem Clock"] + H --> TH["Throttling"] + H --> PM["Persistence Mode"] +``` + +如果某张卡温度过高、ECC double-bit、PCIe 降级或 throttling,后面的 NCCL 测试即使能跑,结果也不可信。 + +### 3.4 单机 GPU 性能层 + +这一层证明每台机器的 GPU 本身性能正常。 + +| 测试 | 证明什么 | +|---|---| +| HBM/D2D 带宽 | GPU 显存和设备间拷贝能力 | +| H2D/D2H 带宽 | CPU/Host 到 GPU 的 PCIe 路径 | +| FP32/TF32 | 基础矩阵计算能力 | +| FP16/BF16/FP8 | 训练常用 Tensor Core 能力 | + +这一步是单机验收。它不能证明两台机器之间通信正常,但可以排除“某台机器本身 GPU 算力或带宽异常”。 + +### 3.5 单机多卡 NCCL 层 + +这一层验证单台机器 8 卡之间的集体通信。 + +```mermaid +flowchart TD + S["单机 8 卡 NCCL"] --> AR["AllReduce"] + S --> AG["AllGather"] + S --> RS["ReduceScatter"] + S --> BC["Broadcast"] + S --> AT["AllToAll"] +``` + +单机 NCCL 主要看 NVLink/NVSwitch 通信路径是否正常。常见指标: + +| 指标 | 含义 | +|---|---| +| `algbw` | 算法视角的有效带宽 | +| `busbw` | 总线视角的带宽,更适合比较通信链路利用率 | +| `#wrong` | 结果错误数量,必须是 0 | + +单机测试通过后,只能说明单台服务器内部 8 卡通信正常。 + +### 3.6 跨机 RDMA 层 + +这一层验证两台机器之间的网络和 RDMA 能力,不涉及 NCCL。 + +```mermaid +sequenceDiagram + participant N1 as aikubeworker0012 + participant FAB as IB/RDMA Fabric + participant N2 as aikubeworker0016 + + N1->>N2: ping / ssh + N1->>FAB: ib_write_bw client + FAB->>N2: ib_write_bw server + N1->>FAB: ib_read_bw client + FAB->>N2: ib_read_bw server + N1->>N2: ib_write_lat / ib_read_lat 
+``` + +这一层要回答: + +| 问题 | 说明 | +|---|---| +| IB 端口是否 Active | 没 Active 就不用跑 NCCL | +| RDMA 带宽是否达标 | 证明网络数据面能跑起来 | +| RDMA 延迟是否正常 | 高延迟会影响小消息和训练同步 | +| 是否是 InfiniBand/RoCE | 两者环境变量和排障点不同 | + +如果 RDMA 层失败,跨机 NCCL 大概率也会失败或退化到 TCP。 + +### 3.7 跨机 NCCL 层 + +这一层才是真正的多机多卡 NCCL 测试。 + +两台 8 卡机器通常是: + +```text +2 nodes x 8 GPUs = 16 ranks +每个 rank 绑定 1 张 GPU +``` + +概念上是: + +```mermaid +flowchart LR + subgraph N1["Node 1: 172.72.8.12"] + R0["rank 0 / GPU0"] + R1["rank 1 / GPU1"] + R2["..."] + R7["rank 7 / GPU7"] + end + + subgraph N2["Node 2: 172.72.8.16"] + R8["rank 8 / GPU0"] + R9["rank 9 / GPU1"] + R10["..."] + R15["rank 15 / GPU7"] + end + + R0 <--> R8 + R1 <--> R9 + R7 <--> R15 + N1 <--> N2 +``` + +典型测试项: + +| NCCL 测试 | 训练里对应什么 | +|---|---| +| AllReduce | 数据并行梯度同步 | +| ReduceScatter | ZeRO/FSDP 梯度切分 | +| AllGather | ZeRO/FSDP 参数聚合 | +| Broadcast | 参数广播、初始化 | +| AllToAll | MoE、专家并行、部分并行策略 | +| SendRecv | 点对点通信、pipeline parallel | + +跨机 NCCL 要看: + +| 指标 | 判定 | +|---|---| +| 是否成功启动 16 rank | MPI/SSH/路径/环境是否正常 | +| `#wrong == 0` | 正确性必须过 | +| `busbw` | 跨节点通信链路利用率 | +| 是否走 IB/RDMA | 需要从 `NCCL_DEBUG=INFO` 确认 | +| 是否退化 TCP | 如果退化,性能会明显偏低 | + +## 4. NCCL 为什么要分单机和跨机 + +单机 8 卡通信和跨机 16 卡通信的瓶颈不同。 + +```mermaid +flowchart TD + A["NCCL 性能结果"] --> B{"测试范围"} + B --> C["单机 8 卡"] + B --> D["跨机 16 卡"] + + C --> C1["主要瓶颈:NVLink / NVSwitch"] + C --> C2["阈值可参考 GPU NVLink 能力"] + + D --> D1["主要瓶颈:IB/RDMA 网络"] + D --> D2["阈值应参考网卡数量、速率、拓扑和 rail 数"] +``` + +所以不能用单机 NVLink 的阈值直接判断跨机 NCCL。跨机要根据真实网络能力设阈值,例如: + +| 网络配置 | 理论上限理解 | +|---|---| +| 单张 400G 网卡 | 约 50 GB/s 单向原始带宽 | +| 8 张 400G 网卡 | 约 400 GB/s 原始聚合带宽 | +| 实测 NCCL busbw | 会受拓扑、GDR、rail、NUMA、交换机、NCCL 算法影响 | + +实际验收时,应该先知道每台机器有几张 IB/RDMA 网卡、每张速率多少、GPU 到 NIC 的拓扑关系,再定跨机 NCCL 阈值。 + +## 5. 
常见失败位置 + +```mermaid +flowchart TD + F["跨机 NCCL 失败"] --> A["启动失败"] + F --> B["能启动但很慢"] + F --> C["运行中 timeout"] + F --> D["结果 #wrong 非 0"] + + A --> A1["SSH 不通"] + A --> A2["远端路径不存在"] + A --> A3["MPI 环境不一致"] + A --> A4["root 运行未允许"] + + B --> B1["NCCL_SOCKET_IFNAME 选错"] + B --> B2["没走 IB/RDMA,退化 TCP"] + B --> B3["NCCL_IB_HCA 没选对"] + B --> B4["GPU Direct RDMA 没生效"] + + C --> C1["IB 端口不稳定"] + C --> C2["交换机/PFC/ECN 问题"] + C --> C3["NCCL timeout 配置"] + C --> C4["驱动/CUDA/NCCL 版本不兼容"] + + D --> D1["通信正确性失败"] + D --> D2["必须 FAIL,不能只看带宽"] +``` + +## 6. 推荐验收顺序 + +下面是面向两台 8 卡机器的推荐顺序: + +```mermaid +flowchart TD + A["Step 1: 两台机器基础信息"] --> B["Step 2: 两台机器单机 GPU 健康"] + B --> C["Step 3: 两台机器单机 benchmark"] + C --> D["Step 4: 两台机器分别跑单机 8 卡 NCCL"] + D --> E["Step 5: 两台机器互测 RDMA bandwidth/latency"] + E --> F["Step 6: 两机 16 卡 NCCL correctness"] + F --> G["Step 7: 两机 16 卡 NCCL performance"] + G --> H["Step 8: 两机训练 demo 或业务压测"] +``` + +每一步的意义: + +| 步骤 | 目的 | +|---|---| +| Step 1 | 确认没有登录错机器,基础网络和环境存在 | +| Step 2 | 排除 GPU 健康问题 | +| Step 3 | 排除 GPU 单卡/单机性能问题 | +| Step 4 | 排除单机 NVLink/NVSwitch/NCCL 问题 | +| Step 5 | 排除跨机 RDMA 问题 | +| Step 6 | 先证明 NCCL 正确性 | +| Step 7 | 再证明 NCCL 性能 | +| Step 8 | 最后用真实训练形态验证稳定性 | + +## 7. 对当前脚本的映射 + +当前脚本已有模块和上面层次的关系: + +| 当前模块 | 覆盖层次 | 备注 | +|---|---|---| +| `gpu_info` | 系统识别层 | 单机 | +| `health` | 单机 GPU 健康层 | 单机 | +| `benchmark` | 单机 GPU 性能层 | 单机 | +| `nccl` | 单机多卡通信层 | 当前主要是单机 | +| `rdma` | RDMA 检查 | 当前偏本机检查,不是两机互测 | +| `stress` | 稳定性 | 单机 | +| `training` | 训练负载层 | 当前偏单机 | +| 建议新增 `multi_node_nccl` | 跨机 NCCL 层 | 专门处理 hostfile、mpirun、多节点环境、结果解析 | + +如果未来要扩展脚本,比较自然的方向是新增一个多机模块,而不是把所有逻辑塞进现有 `nccl` 模块。 + +## 8. 
最小概念模型 + +记住这句话即可: + +```text +单机 NCCL 验证 GPU 之间的 NVLink/NVSwitch。 +跨机 RDMA 验证机器之间的网络。 +跨机 NCCL 验证 NCCL 是否能把 GPU 和网络组合起来,为真实训练提供高效通信。 +``` + +因此,多机多卡测试不是一个命令,而是一条验证链路。 + diff --git a/gpu_tester.py b/gpu_tester.py index 4cfa47c..15bc694 100644 --- a/gpu_tester.py +++ b/gpu_tester.py @@ -5,6 +5,7 @@ import argparse import json import os import signal +import socket import sys import time from datetime import datetime @@ -25,6 +26,8 @@ from modules.nccl_test import NCCLTest from modules.training_sim import TrainingSim from modules.stress_test import StressTest from modules.rdma_test import RDMATest +from modules.nvlink_test import NVLinkTest +from modules.dcgm_test import DCGMTest from modules.report import ReportGenerator from modules.gpu_specs import detect_gpu_type, get_gpu_specs, get_gpu_label, get_supported_gpus, validate_driver_compatibility @@ -32,43 +35,87 @@ DEFAULT_CONFIG = { "benchmark": { "memory": {"size_mb": 4096, "iterations": 10, "nvbandwidth_buffer_mb": 512, "nvbandwidth_samples": 3}, "compute": { - "dtypes": ["fp32", "tf32", "fp16", "bf16", "fp8"], - "matrix_size": 4096, - "warmup": 10, - "iterations": 100, + "dtypes": ["fp32", "tf32", "fp16", "bf16", "fp8", "fp64", "int8"], + "matrix_size": 8192, + "warmup": 50, + "iterations": 500, + "use_compile": True, }, }, - "health": {"temp_warning": 80, "temp_critical": 90, "power_limit": None}, + "health": {"temp_warning": 75, "temp_critical": 85, "power_limit": None}, "nccl": { "min_bandwidth_gbps": None, "test_allreduce": True, "test_alltoall": True, "test_broadcast": True, - "test_reduce_scatter": False, - "test_allgather": False, - "test_sendrecv": False, + "test_reduce_scatter": True, + "test_allgather": True, + "test_sendrecv": True, + "message_sizes": ["1M", "256M", "2G"], + "repeats": 3, + "max_stddev_pct": 3, }, "stress": { - "duration_sec": 60, + "duration_sec": 1800, + "production_duration_sec": 1800, + "use_gpu_burn": False, "use_doubles": False, "use_tensor_cores": True, "memory_pct": 90, 
"gpus": "all", + "dtype": "bf16", + "matrix_size": 24576, + "telemetry_interval_sec": 1, + "warmup_sec": 60, + "min_steady_samples": 10, + "max_temp_c": 80, + "max_temp_delta_c": 5, + "min_power_watts": 630, + "max_tflops_jitter_pct": 5, + "require_tflops_jitter": True, }, "rdma": { - "min_bandwidth_gbps": 50, - "max_latency_us": 10, + "min_bandwidth_gbps": 47, + "min_port_rate_gbps": 400, + "max_latency_us": 3.5, + "max_write_latency_us": 2.0, + "max_read_latency_us": 3.5, "ib_iterations": 1000, - "msg_size": 65536, + "msg_size": 4194304, + "latency_msg_size": 8, "ib_device": None, "ib_port": 1, + "server_addr": None, + "ibping_target": None, + "ibping_count": 5, + "role": "auto", + "pfc_ecn_counters": True, + }, + "nvlink": { + "expected_links_per_gpu": 18, + "expected_link_speed_gbps": 25, + "require_zero_errors": True, + }, + "dcgm": { + "diag_level": 3, + "timeout_sec": 1200, + "expected_num_gpus": 8, + "json_output": True, + "require_subtests": True, }, "training": { - "model": "gpt2", + "model": "synthetic_1.5b", "batch_size": 8, "seq_length": 2048, "num_steps": 50, + "warmup_steps": 5, "dtype": "bf16", + "mode": "ddp", + "synthetic_params_b": 1.5, + "min_tokens_per_sec": 45000, + "max_step_jitter_pct": 3, + "max_peak_memory_gb": 70, + "require_distributed": True, }, "report": {"output_dir": "./reports", "format": "json"}, "tools": {"install_dir": "/opt/gpu-test-tools"}, @@ -131,7 +178,7 @@ def interactive_menu(config: dict): if not check_prerequisites(console): return - results_store: dict = {"timestamp": datetime.now().isoformat(), "tests": {}} + results_store: dict = {"timestamp": datetime.now().isoformat(), "hostname": socket.gethostname(), "tests": {}} menu_items = [ ("1", "GPU Information", "gpu_info"), @@ -139,10 +186,12 @@ def interactive_menu(config: dict): ("3", "Memory Benchmark (nvbandwidth)", "memory_bench"), ("4", "Compute Benchmark", "compute_bench"), ("5", "NCCL Multi-GPU Test", "nccl"), - ("6", "GPU Stress Test (gpu-burn)", "stress"), + 
("6", "GPU Stress Test (PyTorch/gpu-burn)", "stress"), ("7", "RDMA/IB Test", "rdma"), - ("8", "Training Simulation", "training"), - ("9", "Full Test Suite (All Tests)", "all"), + ("8", "NVLink/NVSwitch Test", "nvlink"), + ("9", "DCGM Diagnostic", "dcgm"), + ("10", "Training Simulation", "training"), + ("11", "Full Test Suite (All Tests)", "all"), ("0", "Generate Report", "report"), ] @@ -164,8 +213,10 @@ def interactive_menu(config: dict): "memory_bench": "HBM bandwidth via nvbandwidth", "compute_bench": "GEMM TFLOPS across FP32/TF32/FP16/BF16/FP8", "nccl": "AllReduce, AllToAll, Broadcast via nccl-tests", - "stress": "Long-running GPU stress via gpu-burn", + "stress": "Long-running high-power GEMM stress with telemetry", "rdma": "InfiniBand bandwidth & latency (ib_write_bw)", + "nvlink": "NVLink links, speed, and error counters", + "dcgm": "DCGM diag -r 3 production diagnostic", "training": "Simulate LLM training with PyTorch", "all": "Run all tests sequentially", "report": "Export results to JSON/HTML", @@ -257,6 +308,18 @@ def _run_test(test_name: str, config: dict, console: Console) -> dict: m.print_results(result) return result + elif test_name == "nvlink": + m = NVLinkTest(config) + result = m.run() + m.print_results(result) + return result + + elif test_name == "dcgm": + m = DCGMTest(config) + result = m.run() + m.print_results(result) + return result + elif test_name == "training": m = TrainingSim(config) result = m.run() @@ -280,15 +343,17 @@ def _run_test(test_name: str, config: dict, console: Console) -> dict: def _run_full_suite(config: dict, console: Console) -> dict: """Run all tests sequentially.""" console.print(Panel("[bold cyan]Running Full Test Suite[/bold cyan]", box=box.DOUBLE)) - all_results: dict = {"timestamp": datetime.now().isoformat()} + all_results: dict = {"timestamp": datetime.now().isoformat(), "hostname": socket.gethostname()} tests = [ ("gpu_info", "GPU Information", GPUInfo), ("health", "Health Check", HealthCheck), ("memory_bench", 
"Memory Benchmark", lambda c: Benchmark(c)), ("compute_bench", "Compute Benchmark", lambda c: Benchmark(c)), + ("nvlink", "NVLink/NVSwitch Test", NVLinkTest), ("nccl", "NCCL Test", NCCLTest), ("stress", "GPU Stress Test", StressTest), ("rdma", "RDMA/IB Test", RDMATest), + ("dcgm", "DCGM Diagnostic", DCGMTest), ("training", "Training Simulation", TrainingSim), ] @@ -313,14 +378,49 @@ def _run_full_suite(config: dict, console: Console) -> dict: # Summary console.print("\n" + "=" * 60) # Only count test results, exclude metadata like timestamp - test_results = {k: v for k, v in all_results.items() if k != "timestamp"} - passed = sum(1 for v in test_results.values() if not isinstance(v, dict) or "error" not in v) + test_results = {k: v for k, v in all_results.items() if k not in ("timestamp", "hostname")} + passed = sum(1 for v in test_results.values() if _test_result_passed(v)) total = len(test_results) color = "green" if passed == total else ("yellow" if passed > 0 else "red") console.print(f"[bold {color}]Suite complete: {passed}/{total} tests passed[/bold {color}]") return all_results +def _test_result_passed(result) -> bool: + """Strict production verdict helper for full-suite exit status.""" + if not isinstance(result, dict): + return True + if result.get("error"): + return False + if result.get("skipped") or result.get("status") == "SKIP": + return False + if result.get("source") == "torchrun_fallback": + return False + if "passed" in result: + return bool(result.get("passed")) + if "memory" in result: + mem = result["memory"] + if isinstance(mem, dict) and "passed" in mem: + return bool(mem.get("passed")) + if mem.get("error") or mem.get("source") == "pytorch": + return False + eff = mem.get("d2d_efficiency_pct") or mem.get("efficiency_pct") or 0 + return eff >= 80 + if "compute" in result: + comp = result["compute"] + if isinstance(comp, dict) and "passed" in comp: + return bool(comp.get("passed")) + thresholds = comp.get("pass_thresholds_tflops", {}) or {} + 
per_dtype = comp.get("per_dtype_tflops", {}) + for dt, threshold in thresholds.items(): + val = per_dtype.get(dt) + if not isinstance(val, (int, float)) or val < threshold: + return False + consistency = comp.get("consistency", {}) + return not any(not c.get("passed", False) for c in consistency.values()) + return True + + def main(): gpu_list_str = " / ".join(g.upper() for g in get_supported_gpus()) parser = argparse.ArgumentParser( @@ -335,15 +435,17 @@ Examples: python gpu_tester.py --test benchmark --type memory python gpu_tester.py --test benchmark --type compute --dtype fp16 python gpu_tester.py --test nccl # NCCL test + python gpu_tester.py --test nvlink # NVLink/NVSwitch test + python gpu_tester.py --test dcgm # DCGM diagnostic python gpu_tester.py --test training # Training sim python gpu_tester.py --test all # Full suite python gpu_tester.py --report --format json --output report.json """, ) - parser.add_argument("--test", choices=["gpu-info", "health", "benchmark", "nccl", "stress", "rdma", "training", "all"], + parser.add_argument("--test", choices=["gpu-info", "health", "benchmark", "nccl", "stress", "rdma", "nvlink", "dcgm", "training", "all"], help="Run a specific test") parser.add_argument("--type", choices=["memory", "compute"], help="Benchmark type (with --test benchmark)") - parser.add_argument("--dtype", choices=["fp32", "tf32", "fp16", "bf16", "fp8"], + parser.add_argument("--dtype", choices=["fp32", "tf32", "fp16", "bf16", "fp8", "fp64", "int8"], help="Compute benchmark dtype (with --test benchmark --type compute)") parser.add_argument("--interactive", action="store_true", help="Force interactive mode") parser.add_argument("--report", action="store_true", help="Generate report from last results") @@ -399,6 +501,8 @@ Examples: "nccl": "nccl", "stress": "stress", "rdma": "rdma", + "nvlink": "nvlink", + "dcgm": "dcgm", "training": "training", "all": "all", } @@ -415,19 +519,30 @@ Examples: result = bench.run() Benchmark.print_results(result) if 
args.report: - ReportGenerator(config).generate({"benchmark": result, "timestamp": datetime.now().isoformat()}, + ReportGenerator(config).generate({ + "benchmark": result, + "timestamp": datetime.now().isoformat(), + "hostname": socket.gethostname(), + }, fmt=args.format, output=args.output) + sys.exit(0 if _test_result_passed(result) else 1) elif args.test == "all": results = _run_full_suite(config, console) if args.report: ReportGenerator(config).generate(results, fmt=args.format, output=args.output) - has_errors = any("error" in v for v in results.values() if isinstance(v, dict)) - sys.exit(1 if has_errors else 0) + failed = any(not _test_result_passed(v) for k, v in results.items() if k not in ("timestamp", "hostname")) + sys.exit(1 if failed else 0) else: result = _run_test(test_map[args.test], config, console) if args.report and result: - ReportGenerator(config).generate({args.test: result, "timestamp": datetime.now().isoformat()}, + report_key = test_map[args.test] or args.test + ReportGenerator(config).generate({ + report_key: result, + "timestamp": datetime.now().isoformat(), + "hostname": socket.gethostname(), + }, fmt=args.format, output=args.output) + sys.exit(0 if _test_result_passed(result) else 1) if __name__ == "__main__": diff --git a/modules/dcgm_test.py b/modules/dcgm_test.py new file mode 100644 index 0000000..e7b4f49 --- /dev/null +++ b/modules/dcgm_test.py @@ -0,0 +1,231 @@ +"""DCGM diagnostic acceptance wrapper.""" + +import json +import os +import re +import shutil +import signal +import subprocess +from datetime import datetime +from typing import Optional + +from rich.console import Console +from rich.table import Table + + +class DCGMTest: + def __init__(self, config: dict): + self.config = config + self.console = Console() + self.cfg = config.get("dcgm", {}) + + def run(self) -> dict: + dcgmi = shutil.which("dcgmi") + if not dcgmi: + return { + "passed": False, + "error": "dcgmi not found", + "timestamp": datetime.now().isoformat(), + } 
+ + level = str(self.cfg.get("diag_level", 3)) + timeout = int(self.cfg.get("timeout_sec", 1200)) + cmd = [dcgmi, "diag", "-r", level] + expected_gpus = self.cfg.get("expected_num_gpus") + if expected_gpus: + cmd.extend(["-n", f"gpu:{int(expected_gpus)}"]) + if self.cfg.get("json_output", True): + cmd.append("-j") + + try: + r = self._run_with_process_group_timeout(cmd, timeout) + except subprocess.TimeoutExpired as e: + output = ((e.output or "") + "\n" + (e.stderr or "")).strip() + return { + "passed": False, + "error": f"dcgmi diag -r {level} timeout after {timeout}s", + "command": cmd, + "raw_output_tail": output[-8000:], + "timestamp": datetime.now().isoformat(), + } + + output = r.stdout + "\n" + r.stderr + subtests = self._parse_json_output(output) or self._parse_output(output) + strict_statuses = {"PASS"} + failed = [s for s in subtests if s["status"] not in strict_statuses] + require_subtests = bool(self.cfg.get("require_subtests", True)) + passed = r.returncode == 0 and not failed and (bool(subtests) or not require_subtests) + return { + "passed": passed, + "returncode": r.returncode, + "level": int(level), + "command": cmd, + "expected_num_gpus": int(expected_gpus) if expected_gpus else None, + "subtests": subtests, + "raw_output_tail": output[-8000:], + "timestamp": datetime.now().isoformat(), + } + + @staticmethod + def _run_with_process_group_timeout(cmd: list[str], timeout: int) -> subprocess.CompletedProcess: + proc = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + start_new_session=True, + ) + try: + stdout, stderr = proc.communicate(timeout=timeout) + except subprocess.TimeoutExpired as e: + try: + os.killpg(proc.pid, signal.SIGTERM) + stdout, stderr = proc.communicate(timeout=10) + except subprocess.TimeoutExpired: + os.killpg(proc.pid, signal.SIGKILL) + stdout, stderr = proc.communicate(timeout=10) + raise subprocess.TimeoutExpired(cmd, timeout, output=stdout, stderr=stderr) from e + return 
subprocess.CompletedProcess(cmd, proc.returncode, stdout, stderr) + + @classmethod + def _parse_json_output(cls, output: str) -> list[dict]: + text = output.strip() + if not text: + return [] + try: + payload = json.loads(text) + except json.JSONDecodeError: + m = re.search(r"(\{.*\})", text, re.S) + if not m: + return [] + try: + payload = json.loads(m.group(1)) + except json.JSONDecodeError: + return [] + + dcgm_payload = payload.get("DCGM Diagnostic") if isinstance(payload, dict) else None + if isinstance(dcgm_payload, dict): + parsed = cls._parse_dcgm_diagnostic_json(dcgm_payload) + if parsed: + return parsed + + subtests = [] + + def walk(node, path: list[str]): + if isinstance(node, dict): + node_name = ( + node.get("name") + or node.get("testName") + or node.get("test_name") + or node.get("category") + or node.get("category_name") + ) + child_path = [*path, str(node_name)] if node_name else path + status = node.get("status") or node.get("result") or node.get("Result") + if isinstance(status, str): + name = ( + node_name + or " / ".join(path[-3:]) + ) + normalized = cls._normalize_status(status) + if normalized: + subtests.append({ + "name": str(name)[:160], + "status": normalized, + "raw": json.dumps(node, default=str)[:1000], + }) + for key, value in node.items(): + walk(value, [*child_path, str(key)]) + elif isinstance(node, list): + for idx, item in enumerate(node): + walk(item, [*path, str(idx)]) + + walk(payload, []) + return subtests + + @classmethod + def _parse_dcgm_diagnostic_json(cls, payload: dict) -> list[dict]: + subtests = [] + for category in payload.get("test_categories", []) or []: + category_name = str(category.get("category") or "DCGM") + for test in category.get("tests", []) or []: + test_name = str(test.get("name") or "unnamed") + for result in test.get("results", []) or []: + status = cls._normalize_status(str(result.get("status", ""))) + if not status: + continue + entity_group = result.get("entity_group") or "entity" + entity_id = 
result.get("entity_id", "unknown") + name = f"{category_name}/{test_name}/{entity_group}{entity_id}" + subtests.append({ + "name": name[:160], + "status": status, + "raw": json.dumps(result, default=str)[:1000], + }) + summary = test.get("test_summary") or {} + status = cls._normalize_status(str(summary.get("status", ""))) + if status: + subtests.append({ + "name": f"{category_name}/{test_name}/summary"[:160], + "status": status, + "raw": json.dumps(summary, default=str)[:1000], + }) + return subtests + + @staticmethod + def _normalize_status(status: str) -> str: + s = status.strip().upper() + aliases = { + "PASS": "PASS", + "PASSED": "PASS", + "OK": "PASS", + "FAIL": "FAIL", + "FAILED": "FAIL", + "ERROR": "ERROR", + "WARN": "WARN", + "WARNING": "WARN", + "SKIP": "SKIP", + "SKIPPED": "SKIP", + "NOT_RUN": "SKIP", + "NOT RUN": "SKIP", + } + return aliases.get(s, s if s in {"PASS", "FAIL", "ERROR", "WARN", "SKIP"} else "") + + @staticmethod + def _parse_output(output: str) -> list[dict]: + subtests = [] + for line in output.splitlines(): + stripped = line.strip() + if not stripped: + continue + m = re.search(r"(.+?)\s*[:|]\s*(PASS|FAIL|WARN|ERROR|SKIP)\b", stripped, re.I) + if not m: + m = re.search(r"\b(PASS|FAIL|WARN|ERROR|SKIP)\b\s*[-:|]\s*(.+)", stripped, re.I) + if m: + status = DCGMTest._normalize_status(m.group(1)) + name = m.group(2).strip() + else: + continue + else: + name = m.group(1).strip(" .|-") + status = DCGMTest._normalize_status(m.group(2)) + if name and len(name) < 160: + subtests.append({"name": name, "status": status, "raw": stripped}) + return subtests + + @staticmethod + def print_results(results: dict, console: Optional[Console] = None): + c = console or Console() + if results.get("error"): + c.print(f"[bold red]DCGM error: {results['error']}[/bold red]") + return + passed = results.get("passed", False) + c.print("[bold green]✓ DCGM diag PASSED[/bold green]" if passed else "[bold red]✗ DCGM diag FAILED[/bold red]") + subtests = 
results.get("subtests", []) + if subtests: + table = Table(box=None, padding=(0, 1)) + table.add_column("Subtest") + table.add_column("Status", style="bold") + for s in subtests: + table.add_row(s.get("name", ""), s.get("status", "")) + c.print(table) diff --git a/modules/health_check.py b/modules/health_check.py index dd64071..1e446f6 100644 --- a/modules/health_check.py +++ b/modules/health_check.py @@ -171,6 +171,10 @@ class HealthCheck: gpu_health.append({"index": i, "status": worst, "checks": checks}) system_health = self._check_system() + for key in ("fabricmanager", "retired_pages", "kernel_errors"): + item = system_health.get(key, {}) + if isinstance(item, dict) and item.get("status") == "FAIL": + overall_pass = False return { "passed": overall_pass, @@ -228,6 +232,9 @@ class HealthCheck: rdma_devs = os.listdir("/sys/class/infiniband_verbs") nccl_env = {k: v for k, v in os.environ.items() if k.startswith("NCCL_")} + fabric = self._check_fabricmanager() + retired = self._check_retired_pages() + kernel_errors = self._check_kernel_errors() return { "nvidia_persistenced": {"installed": persistd, "running": persistd_running}, @@ -238,6 +245,41 @@ class HealthCheck: "infiniband_devices": ib_devs, "rdma_devices": rdma_devs, "nccl_env_vars": nccl_env, + "fabricmanager": fabric, + "retired_pages": retired, + "kernel_errors": kernel_errors, + } + + def _check_fabricmanager(self) -> dict: + r = self._run_cmd(["systemctl", "is-active", "nvidia-fabricmanager"], timeout=5) + active = r == "active" + logs = self._run_cmd(["journalctl", "-u", "nvidia-fabricmanager", "-n", "200", "--no-pager"], timeout=10) or "" + has_error = "ERROR" in logs.upper() or "FAILED" in logs.upper() + return { + "active": active, + "has_error_logs": has_error, + "status": "PASS" if active and not has_error else "FAIL", + } + + def _check_retired_pages(self) -> dict: + raw = self._run_cmd(["nvidia-smi", "-q", "-d", "PAGE_RETIREMENT"], timeout=30) or "" + nums = [int(x) for x in 
__import__("re").findall(r"Retired Pages.*?:\s*(\d+)", raw, flags=__import__("re").I)] + pending = "Pending Page Blacklist" in raw and "Yes" in raw + total = sum(nums) + return { + "retired_pages": total, + "pending_blacklist": pending, + "status": "PASS" if total == 0 and not pending else "FAIL", + } + + def _check_kernel_errors(self) -> dict: + raw = self._run_cmd(["dmesg", "--ctime", "--level=err,crit,alert,emerg"], timeout=10) or "" + upper = raw.upper() + hits = [line for line in raw.splitlines() if any(k in line.upper() for k in ("XID", "AER", "PCIE", "NVRM"))] + return { + "count": len(hits), + "tail": hits[-20:], + "status": "PASS" if not hits else "FAIL", } @staticmethod diff --git a/modules/nccl_test.py b/modules/nccl_test.py index fd9ab6a..9bc47d1 100644 --- a/modules/nccl_test.py +++ b/modules/nccl_test.py @@ -5,6 +5,8 @@ import os import re import shutil import subprocess +import statistics +import sys from datetime import datetime from typing import Optional @@ -70,6 +72,38 @@ class NCCLTest: return p return None + def _message_sizes(self) -> list[str]: + return list(self.nccl_cfg.get("message_sizes") or ["1M", "256M", "2G"]) + + def _repeats(self) -> int: + return int(self.nccl_cfg.get("repeats", 3)) + + def _max_stddev_pct(self) -> float: + return float(self.nccl_cfg.get("max_stddev_pct", 3)) + + def _runtime_env(self) -> dict: + env = {**os.environ, "NCCL_DEBUG": "WARN"} + lib_dirs = [] + + nccl_home = env.get("NCCL_HOME") or self.nccl_cfg.get("nccl_home") + if nccl_home: + lib_dirs.append(os.path.join(str(nccl_home), "lib")) + + for path in sys.path: + lib_dirs.append(os.path.join(path, "nvidia", "nccl", "lib")) + + venv_root = os.path.dirname(os.path.dirname(sys.executable)) + lib_dirs.extend(glob.glob(os.path.join(venv_root, "lib", "python*", "site-packages", "nvidia", "nccl", "lib"))) + + existing = env.get("LD_LIBRARY_PATH", "") + valid_dirs = [] + for d in lib_dirs: + if d and os.path.isdir(d) and d not in valid_dirs: + valid_dirs.append(d) + 
if valid_dirs: + env["LD_LIBRARY_PATH"] = ":".join(valid_dirs + ([existing] if existing else [])) + return env + def run(self) -> dict: gpu_count = 0 if TORCH_AVAILABLE: @@ -89,7 +123,7 @@ class NCCLTest: if self.nccl_cfg.get("test_reduce_scatter", False): tests.append(("reduce_scatter_perf", "ReduceScatter")) if self.nccl_cfg.get("test_allgather", False): - tests.append(("allgather_perf", "AllGather")) + tests.append(("all_gather_perf", "AllGather")) if self.nccl_cfg.get("test_sendrecv", False): tests.append(("sendrecv_perf", "SendRecv")) @@ -170,39 +204,7 @@ class NCCLTest: if not binary: return {"status": "SKIP", "error": f"{binary_name} not found"} - cmd = [ - binary, - "-b", "8M", - "-e", "8G", - "-f", "2", - "-g", str(gpu_count), - "-w", "5", - "-n", "20", - ] - - try: - env = os.environ.copy() - env["NCCL_DEBUG"] = "WARN" - r = subprocess.run(cmd, capture_output=True, text=True, timeout=180, env=env) - - combined = r.stdout + r.stderr - # Check for NCCL/CUDA compatibility errors - if "CUDA driver version is insufficient" in combined or \ - "Test NCCL failure" in combined: - error_msg = "NCCL/CUDA driver version mismatch" \ - if "CUDA driver version" in combined \ - else "NCCL test failure (library incompatibility)" - return {"status": "FAIL", "error": error_msg} - - if r.returncode != 0: - return {"status": "FAIL", "error": r.stderr[:300]} - - return self._parse_nccl_output(r.stdout, min_bw) - - except subprocess.TimeoutExpired: - return {"status": "FAIL", "error": "timeout"} - except Exception as e: - return {"status": "FAIL", "error": str(e)} + return self._run_nccl_matrix([binary, "-g", str(gpu_count)], min_bw) def _run_one_nccl_test_mpirun(self, binary_name: str, label: str, gpu_count: int, mpirun: str, min_bw: float) -> dict: @@ -218,37 +220,64 @@ class NCCLTest: "-x", "NCCL_DEBUG=WARN", "-x", "CUDA_VISIBLE_DEVICES=" + ",".join(str(i) for i in range(gpu_count)), binary, - "-b", "8", - "-e", "256M", - "-f", "2", "-g", "1", - "-w", "5", - "-n", "20", ] + 
return self._run_nccl_matrix(cmd, min_bw) + + def _run_nccl_matrix(self, base_cmd: list[str], min_bw: float) -> dict: + size_results = [] + failures = [] + env = self._runtime_env() + try: - env = os.environ.copy() - env["NCCL_DEBUG"] = "WARN" - r = subprocess.run(cmd, capture_output=True, text=True, timeout=180, env=env) - - combined = r.stdout + r.stderr - if "CUDA driver version is insufficient" in combined or \ - "Test NCCL failure" in combined: - error_msg = "NCCL/CUDA driver version mismatch" \ - if "CUDA driver version" in combined \ - else "NCCL test failure (library incompatibility)" - return {"status": "FAIL", "error": error_msg} - - if r.returncode != 0: - return {"status": "FAIL", "error": r.stderr[:300]} - - return self._parse_nccl_output(r.stdout, min_bw) + for size in self._message_sizes(): + runs = [] + for _ in range(self._repeats()): + cmd = [*base_cmd, "-b", size, "-e", size, "-f", "2", "-w", "5", "-n", "20"] + r = subprocess.run(cmd, capture_output=True, text=True, timeout=300, env=env) + combined = r.stdout + r.stderr + if "CUDA driver version is insufficient" in combined or "Test NCCL failure" in combined: + failures.append({"size": size, "error": "NCCL/CUDA/library failure"}) + continue + if r.returncode != 0: + failures.append({"size": size, "error": r.stderr[:300]}) + continue + parsed = self._parse_nccl_output(r.stdout, min_bw) + runs.append(parsed.get("best_busbw_gbps", 0)) + if runs: + worst = min(runs) + mean = sum(runs) / len(runs) + std_pct = (statistics.pstdev(runs) / mean * 100) if len(runs) > 1 and mean else 0 + size_results.append({ + "size": size, + "runs_busbw_gbps": [round(v, 1) for v in runs], + "worst_busbw_gbps": round(worst, 1), + "mean_busbw_gbps": round(mean, 1), + "stddev_pct": round(std_pct, 2), + "status": "PASS" if worst >= min_bw and std_pct <= self._max_stddev_pct() else "FAIL", + }) + else: + size_results.append({"size": size, "status": "FAIL", "runs_busbw_gbps": []}) except subprocess.TimeoutExpired: return 
{"status": "FAIL", "error": "timeout"} except Exception as e: return {"status": "FAIL", "error": str(e)} + best_bus = max((r.get("mean_busbw_gbps", 0) for r in size_results), default=0) + worst_bus = min((r.get("worst_busbw_gbps", 0) for r in size_results if r.get("runs_busbw_gbps")), default=0) + passed = bool(size_results) and all(r.get("status") == "PASS" for r in size_results) and not failures + return { + "status": "PASS" if passed else "FAIL", + "best_busbw_gbps": round(best_bus, 1), + "worst_busbw_gbps": round(worst_bus, 1), + "min_required_gbps": min_bw, + "max_stddev_pct": self._max_stddev_pct(), + "by_size": size_results, + "failures": failures, + } + @staticmethod def _parse_nccl_output(stdout: str, min_bw: float) -> dict: """Parse nccl-tests tabular output and extract bandwidth results.""" @@ -363,7 +392,7 @@ dist.destroy_process_group() r = subprocess.run( [torchrun_cmd, f"--nproc_per_node={gpu_count}", tmp.name], capture_output=True, text=True, timeout=120, - env={**os.environ, "NCCL_DEBUG": "WARN"}, + env=self._runtime_env(), ) os.unlink(tmp.name) @@ -390,10 +419,15 @@ dist.destroy_process_group() } return { - "passed": all_passed, + # torchrun fallback is a functional smoke only. It never proves + # production bus bandwidth, so it must not satisfy acceptance. 
+ "passed": False, + "functional_passed": all_passed, "source": "torchrun_fallback", "tests": tests, "gpu_count": gpu_count, + "error": None if all_passed else "torchrun functional NCCL smoke failed", + "acceptance_gap": "nccl-tests bus bandwidth was not measured", } except Exception as e: return {"passed": False, "source": "torchrun_fallback", "error": str(e)} @@ -410,7 +444,8 @@ dist.destroy_process_group() if source == "torchrun_fallback": # Connectivity check mode - verdict = "[bold green]✓ NCCL Connectivity OK[/bold green]" if passed else "[bold red]✗ NCCL Connectivity FAILED[/bold red]" + functional = results.get("functional_passed", passed) + verdict = "[bold yellow]⚠ NCCL bus BW NOT VERIFIED[/bold yellow]" if functional else "[bold red]✗ NCCL Connectivity FAILED[/bold red]" c.print(f"{verdict} [dim](basic check via torchrun)[/dim]") tests = results.get("tests", {}) @@ -427,7 +462,7 @@ dist.destroy_process_group() else: c.print(f" [{s_color}]{op_name}[/{s_color}]") - c.print("\n[yellow]Note: functional connectivity test only (no performance data)[/yellow]") + c.print("\n[yellow]Note: functional connectivity test only (no bus bandwidth data; acceptance FAIL)[/yellow]") else: # nccl-tests mode verdict = "[bold green]✓ NCCL tests PASSED[/bold green]" if passed else "[bold yellow]⚠ NCCL tests WARNING[/bold yellow]" @@ -448,12 +483,16 @@ dist.destroy_process_group() if by_size: t = Table(box=None, padding=(0, 1)) t.add_column("Size", style="bold", justify="right") - t.add_column("Time (us)", justify="right") - t.add_column("Alg BW (GB/s)", justify="right") - t.add_column("Bus BW (GB/s)", justify="right") + t.add_column("Worst Bus BW", justify="right") + t.add_column("Mean Bus BW", justify="right") + t.add_column("StdDev", justify="right") + t.add_column("Status", justify="right") for r in by_size: - sz = r.get("size", 0) - sz_str = f"{sz/1024:.0f}K" if sz < 1048576 else f"{sz/1048576:.0f}M" - t.add_row(sz_str, f"{r.get('time_us',0):.1f}", - 
f"{r.get('algbw_gbps',0):.1f}", f"{r.get('busbw_gbps',0):.1f}") + t.add_row( + str(r.get("size", "")), + f"{r.get('worst_busbw_gbps', 0):.1f}", + f"{r.get('mean_busbw_gbps', 0):.1f}", + f"{r.get('stddev_pct', 0):.2f}%", + r.get("status", "?"), + ) c.print(t) diff --git a/modules/nvlink_test.py b/modules/nvlink_test.py new file mode 100644 index 0000000..ecf257b --- /dev/null +++ b/modules/nvlink_test.py @@ -0,0 +1,188 @@ +"""NVLink / NVSwitch production acceptance checks.""" + +import re +import shutil +import subprocess +from datetime import datetime +from typing import Optional + +from rich.console import Console +from rich.table import Table + + +class NVLinkTest: + def __init__(self, config: dict): + self.config = config + self.console = Console() + self.cfg = config.get("nvlink", {}) + + def _run(self, args: list[str], timeout: int = 60) -> tuple[int, str, str]: + if not shutil.which("nvidia-smi"): + return 127, "", "nvidia-smi not found" + r = subprocess.run(["nvidia-smi", *args], capture_output=True, text=True, timeout=timeout) + return r.returncode, r.stdout, r.stderr + + def run(self) -> dict: + expected_links = int(self.cfg.get("expected_links_per_gpu", 18)) + expected_speed = float(self.cfg.get("expected_link_speed_gbps", 25)) + require_zero_errors = bool(self.cfg.get("require_zero_errors", True)) + + rc_s, out_s, err_s = self._run(["nvlink", "-s"]) + rc_c, out_c, err_c = self._run(["nvlink", "-c"]) + rc_e, out_e, err_e = self._run(["nvlink", "-e"]) + + if rc_s != 0: + return { + "passed": False, + "error": (err_s or out_s or "nvidia-smi nvlink -s failed")[:1000], + "timestamp": datetime.now().isoformat(), + } + + links = self._parse_status(out_s) + if not links: + return { + "passed": False, + "error": "no NVLink status entries parsed from nvidia-smi nvlink -s", + "raw_status": out_s[-4000:], + "timestamp": datetime.now().isoformat(), + } + speeds = self._parse_speeds(out_c) if rc_c == 0 else {} + status_speeds = self._parse_speeds(out_s) + for gpu, 
gpu_speeds in status_speeds.items(): + speeds.setdefault(gpu, {}).update({k: v for k, v in gpu_speeds.items() if k not in speeds.get(gpu, {})}) + errors = self._parse_errors(out_e) if rc_e == 0 else {} + + gpu_results = [] + overall = True + for gpu, gpu_links in sorted(links.items(), key=lambda x: int(x[0])): + active = sum(1 for l in gpu_links.values() if l.get("active")) + inactive = [lid for lid, l in gpu_links.items() if not l.get("active")] + speed_bad = [] + for lid in gpu_links: + speed = speeds.get(gpu, {}).get(lid) + if speed is not None and speed < expected_speed: + speed_bad.append({"link": lid, "speed_gbps": speed}) + err_bad = [] + if require_zero_errors: + for lid, counters in errors.get(gpu, {}).items(): + total = sum(v for v in counters.values() if isinstance(v, int)) + if total: + err_bad.append({"link": lid, "counters": counters}) + + passed = active == expected_links and not inactive and not speed_bad and not err_bad + if not passed: + overall = False + gpu_results.append({ + "gpu": int(gpu), + "active_links": active, + "expected_links": expected_links, + "inactive_links": inactive, + "speed_issues": speed_bad, + "error_issues": err_bad, + "passed": passed, + }) + + return { + "passed": overall, + "expected_links_per_gpu": expected_links, + "expected_link_speed_gbps": expected_speed, + "require_zero_errors": require_zero_errors, + "gpus": gpu_results, + "raw_status": out_s[-4000:], + "raw_speed": out_c[-4000:] if out_c else "", + "raw_errors": out_e[-4000:] if out_e else "", + "timestamp": datetime.now().isoformat(), + } + + @staticmethod + def _parse_status(text: str) -> dict[str, dict[str, dict]]: + result: dict[str, dict[str, dict]] = {} + gpu = None + for line in text.splitlines(): + m_gpu = re.search(r"GPU\s+(\d+)", line, re.I) + if m_gpu: + gpu = m_gpu.group(1) + result.setdefault(gpu, {}) + continue + if gpu is None: + continue + m_link = re.search(r"Link\s+(\d+).*?(Active|Inactive|Disabled|Off|Down)", line, re.I) + if m_link: + state = 
m_link.group(2) + result[gpu][m_link.group(1)] = { + "state": state, + "active": state.lower() == "active", + "raw": line.strip(), + } + continue + m_speed = re.search(r"Link\s+(\d+).*?([0-9.]+)\s*GB/s", line, re.I) + if m_speed: + result[gpu][m_speed.group(1)] = { + "state": "Active", + "active": True, + "raw": line.strip(), + } + return result + + @staticmethod + def _parse_speeds(text: str) -> dict[str, dict[str, float]]: + result: dict[str, dict[str, float]] = {} + gpu = None + for line in text.splitlines(): + m_gpu = re.search(r"GPU\s+(\d+)", line, re.I) + if m_gpu: + gpu = m_gpu.group(1) + result.setdefault(gpu, {}) + continue + if gpu is None: + continue + m_link = re.search(r"Link\s+(\d+).*?([0-9.]+)\s*GB/s", line, re.I) + if m_link: + result[gpu][m_link.group(1)] = float(m_link.group(2)) + return result + + @staticmethod + def _parse_errors(text: str) -> dict[str, dict[str, dict[str, int]]]: + result: dict[str, dict[str, dict[str, int]]] = {} + gpu = None + link = None + for line in text.splitlines(): + m_gpu = re.search(r"GPU\s+(\d+)", line, re.I) + if m_gpu: + gpu = m_gpu.group(1) + result.setdefault(gpu, {}) + continue + m_link = re.search(r"Link\s+(\d+)", line, re.I) + if m_link and gpu is not None: + link = m_link.group(1) + result[gpu].setdefault(link, {}) + if gpu is None or link is None: + continue + for name in ("CRC", "Replay", "Recovery"): + m = re.search(rf"{name}[^0-9]*(\d+)", line, re.I) + if m: + result[gpu][link][name.lower()] = int(m.group(1)) + return result + + @staticmethod + def print_results(results: dict, console: Optional[Console] = None): + c = console or Console() + if results.get("error"): + c.print(f"[bold red]NVLink error: {results['error']}[/bold red]") + return + passed = results.get("passed", False) + c.print("[bold green]✓ NVLink PASSED[/bold green]" if passed else "[bold red]✗ NVLink FAILED[/bold red]") + table = Table(box=None, padding=(0, 1)) + table.add_column("GPU", style="bold") + table.add_column("Active Links", 
justify="right") + table.add_column("Issues") + for g in results.get("gpus", []): + issues = [] + if g.get("inactive_links"): + issues.append("inactive=" + ",".join(g["inactive_links"])) + if g.get("speed_issues"): + issues.append(f"speed={len(g['speed_issues'])}") + if g.get("error_issues"): + issues.append(f"errors={len(g['error_issues'])}") + table.add_row(str(g["gpu"]), f"{g['active_links']}/{g['expected_links']}", "; ".join(issues) or "OK") + c.print(table) diff --git a/modules/report.py b/modules/report.py index d9e1eba..2f6f1ec 100644 --- a/modules/report.py +++ b/modules/report.py @@ -93,8 +93,8 @@ class ReportGenerator: def _generate_html(self, results: dict, output: str) -> str: import socket - hostname = socket.gethostname() - timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + hostname = results.get("hostname") or socket.gethostname() + timestamp = results.get("timestamp") or datetime.now().strftime("%Y-%m-%d %H:%M:%S") sections = [] @@ -178,8 +178,8 @@ class ReportGenerator: def _generate_markdown(self, results: dict, output: str) -> str: import socket - hostname = socket.gethostname() - timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + hostname = results.get("hostname") or socket.gethostname() + timestamp = results.get("timestamp") or datetime.now().strftime("%Y-%m-%d %H:%M:%S") lines: list[str] = [] @@ -201,6 +201,21 @@ class ReportGenerator: # --- Summary table --- summary_items = self._build_summary(results) if summary_items: + verdict, failures, missing = self._overall_acceptance_verdict(summary_items) + lines.append("## Overall Acceptance Verdict\n") + lines.append(f"**Result: {verdict}**") + lines.append("") + if failures: + lines.append("Failed or unverified items:") + for name, status in failures: + lines.append(f"- {name}: {status}") + lines.append("") + if missing: + lines.append("Missing required evidence:") + for name in missing: + lines.append(f"- {name}") + lines.append("") + lines.append("## Summary\n") lines.append("| 
Test | Result |") lines.append("|------|--------|") @@ -319,8 +334,6 @@ class ReportGenerator: if use_abs and thr: if val >= thr: status = "PASS" - elif val >= thr * 0.9: - status = "WARN" else: status = "FAIL" lines.append(f"| {dt.upper()} | {val:.1f} | {pk:.0f} | >= {thr} | {status} |") @@ -331,30 +344,123 @@ class ReportGenerator: overall_status = status lines.append("") if use_abs: + if any(not row.get("passed", False) for row in (comp_data.get("consistency", {}) or {}).values()): + overall_status = "FAIL" lines.append(f"**Verdict: {overall_status}** (absolute TFLOPS thresholds; worst efficiency {worst_eff:.1f}%)\n") else: overall_status = "PASS" if worst_eff >= 80 else ("WARN" if worst_eff >= 50 else "FAIL") lines.append(f"**Verdict: {overall_status}** (worst efficiency {worst_eff:.1f}%)\n") + consistency = comp_data.get("consistency", {}) or {} + if consistency: + lines.append("### Compute Consistency\n") + lines.append("| DType | Min | Mean | Max | Spread | Limit | Status |") + lines.append("|-------|-----|------|-----|--------|-------|--------|") + for dt, row in consistency.items(): + status = "PASS" if row.get("passed") else "FAIL" + lines.append( + f"| {dt.upper()} | {row.get('min_tflops', 0):.1f} | " + f"{row.get('mean_tflops', 0):.1f} | {row.get('max_tflops', 0):.1f} | " + f"{row.get('spread_pct', 0):.2f}% | <= {row.get('max_allowed_pct', 3)}% | {status} |" + ) + lines.append("") + + per_gpu = comp_data.get("per_gpu", []) or [] + dtype_order = [dt for dt in per_dtype.keys() if not isinstance(per_dtype.get(dt), str)] + if per_gpu and dtype_order: + lines.append("### Compute Per-GPU TFLOPS\n") + headers = ["GPU", *[dt.upper() for dt in dtype_order]] + lines.append("| " + " | ".join(headers) + " |") + lines.append("|" + "|".join(["---"] * len(headers)) + "|") + for row in per_gpu: + cells = [str(row.get("index", ""))] + for dt in dtype_order: + val = row.get(dt, "") + cells.append(f"{val:.1f}" if isinstance(val, (int, float)) else str(val)) + 
lines.append("| " + " | ".join(cells) + " |") + lines.append("") + + # --- NCCL --- + nvlink = results.get("nvlink") + if nvlink and not nvlink.get("error"): + lines.append("## NVLink/NVSwitch\n") + lines.append(f"**Overall: {'PASS' if nvlink.get('passed') else 'FAIL'}**\n") + lines.append("| GPU | Active Links | Issues |") + lines.append("|-----|--------------|--------|") + for g in nvlink.get("gpus", []): + issues = [] + if g.get("inactive_links"): + issues.append("inactive=" + ",".join(g["inactive_links"])) + if g.get("speed_issues"): + issues.append(f"speed issues={len(g['speed_issues'])}") + if g.get("error_issues"): + issues.append(f"errors={len(g['error_issues'])}") + lines.append(f"| {g.get('gpu')} | {g.get('active_links')}/{g.get('expected_links')} | {', '.join(issues) or 'OK'} |") + lines.append("") + elif nvlink and nvlink.get("error"): + lines.append("## NVLink/NVSwitch\n") + lines.append(f"**Overall: FAIL** ({nvlink.get('error')})\n") + + dcgm = results.get("dcgm") + if dcgm and not dcgm.get("error"): + lines.append("## DCGM Diagnostic\n") + lines.append(f"**Overall: {'PASS' if dcgm.get('passed') else 'FAIL'}**\n") + if dcgm.get("subtests"): + lines.append("| Subtest | Status |") + lines.append("|---------|--------|") + for s in dcgm.get("subtests", []): + lines.append(f"| {s.get('name', '')} | {s.get('status', '')} |") + lines.append("") + elif dcgm and dcgm.get("error"): + lines.append("## DCGM Diagnostic\n") + lines.append(f"**Overall: FAIL** ({dcgm.get('error')})\n") + # --- NCCL --- nccl = results.get("nccl") if nccl and not nccl.get("error"): lines.append("## NCCL Multi-GPU\n") lines.append(f"Source: {nccl.get('source', 'unknown')} | " f"GPUs: {nccl.get('gpu_count', '?')}\n") + if nccl.get("source") == "torchrun_fallback": + lines.append("> Functional NCCL smoke only: nccl-tests bus bandwidth was not measured, so this does not satisfy production acceptance.\n") tests = nccl.get("tests", {}) if tests: - lines.append("| Operation | Bus BW (GB/s) | 
Threshold | Status |") - lines.append("|-----------|---------------|-----------|--------|") + lines.append("> Summary reports the best Bus BW observed for each operation. PASS/FAIL is evaluated across every tested message size and repeat run shown in the detail table below.\n") + lines.append("| Operation | Best Bus BW (GB/s) | Failed Sizes | Threshold | Status |") + lines.append("|-----------|--------------------|--------------|-----------|--------|") for op, data in tests.items(): if isinstance(data, dict) and not data.get("error"): bw = data.get("best_busbw_gbps", 0) req = data.get("min_required_gbps", 0) status = data.get("status", "?") - lines.append(f"| {op} | {bw:.1f} | >= {req:.0f} | {status} |") + failed_sizes = [ + str(row.get("size", "?")) + for row in data.get("by_size", []) + if row.get("status") != "PASS" + ] + failed_sizes_text = ", ".join(failed_sizes) if failed_sizes else "-" + lines.append(f"| {op} | {bw:.1f} | {failed_sizes_text} | >= {req:.0f} | {status} |") elif isinstance(data, dict) and data.get("error"): - lines.append(f"| {op} | - | - | ERROR: {data['error']} |") + lines.append(f"| {op} | - | - | - | ERROR: {data['error']} |") lines.append("") + for op, data in tests.items(): + by_size = data.get("by_size", []) if isinstance(data, dict) else [] + if not by_size: + continue + lines.append(f"### NCCL {op} by size\n") + lines.append("| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status |") + lines.append("|------|---------------------|-------|------|--------|-----------|--------|") + for row in by_size: + runs = ", ".join(str(v) for v in row.get("runs_busbw_gbps", [])) + lines.append( + f"| {row.get('size', '')} | {runs} | " + f"{row.get('worst_busbw_gbps', 0):.1f} | " + f"{row.get('mean_busbw_gbps', 0):.1f} | " + f"{row.get('stddev_pct', 0):.2f}% | " + f">= {data.get('min_required_gbps', 0):.0f} | " + f"{row.get('status', '?')} |" + ) + lines.append("") passed = nccl.get("passed", False) lines.append(f"**Overall: {'PASS' 
if passed else 'FAIL'}**\n") @@ -368,6 +474,21 @@ class ReportGenerator: source = stress.get("source", "unknown") lines.append(f"- **Source:** {source}") lines.append(f"- **Duration:** {elapsed:.0f}s (requested {duration}s)") + telemetry = stress.get("telemetry") or {} + if telemetry: + lines.append(f"- **Telemetry samples:** {telemetry.get('samples', 0)}") + lines.append(f"- **Max temp:** {telemetry.get('max_temp_c', {})}") + lines.append(f"- **Avg power:** {telemetry.get('avg_power_w', {})}") + lines.append(f"- **Temp delta:** {telemetry.get('temp_delta_c', 'N/A')} C") + lines.append(f"- **TFLOPS jitter:** {telemetry.get('tflops_jitter_pct', 'N/A')}%") + lines.append(f"- **Steady TFLOPS samples:** {telemetry.get('steady_tflops_samples', 0)}") + lines.append(f"- **Throttle events:** {telemetry.get('throttle_event_count', len(telemetry.get('throttle_events', [])))}") + lines.append(f"- **XID events:** {len(telemetry.get('xid_events', []))}") + failures = telemetry.get("failures") or [] + if failures: + lines.append("- **Failure reasons:**") + for reason in failures: + lines.append(f" - {reason}") lines.append(f"- **Result: {'PASS' if passed else 'FAIL'}**") lines.append("") @@ -378,26 +499,70 @@ class ReportGenerator: lines.append(f"**Overall: SKIP** [{rdma.get('reason', 'no IB hardware detected')}]\n") elif rdma and not rdma.get("error"): lines.append("## RDMA/InfiniBand\n") + rdma_legacy_note = self._rdma_legacy_note(rdma) + if rdma_legacy_note: + lines.append(f"> {rdma_legacy_note}\n") + port_checks = rdma.get("port_checks", []) + if port_checks: + lines.append("### RDMA Port Checks\n") + lines.append("| Device | Port | State | Rate | Required | Status |") + lines.append("|--------|------|-------|------|----------|--------|") + for p in port_checks: + lines.append( + f"| {p.get('device', '')} | {p.get('port', '')} | " + f"{p.get('state', '')} | {p.get('rate', '')} | " + f">= {p.get('min_rate_gbps', 400):.0f}Gbps ACTIVE | {p.get('status', '?')} |" + ) + 
lines.append("") bw_tests = rdma.get("bandwidth_tests", []) lat_tests = rdma.get("latency_tests", []) - if bw_tests or lat_tests: + ibping_tests = rdma.get("ibping_tests", []) + if bw_tests or lat_tests or ibping_tests: lines.append("| Test | Value | Threshold | Status |") lines.append("|------|-------|-----------|--------|") for bt in bw_tests: - if not bt.get("error"): + if bt.get("error"): + lines.append(f"| {bt.get('test', 'ib_bw')} | {bt.get('error')} | required runnable test | {bt.get('status', 'FAIL')} |") + else: + threshold, status = self._rdma_bandwidth_verdict(bt) lines.append(f"| {bt['test']} | {bt.get('bandwidth_gbps', 0):.1f} GB/s | " - f">= {bt.get('min_required_gbps', 0)} GB/s | {bt.get('status', '?')} |") + f">= {threshold:g} GB/s | {status} |") for lt in lat_tests: - if not lt.get("error"): + if lt.get("error"): + lines.append(f"| {lt.get('test', 'ib_lat')} | {lt.get('error')} | required runnable test | {lt.get('status', 'FAIL')} |") + else: + threshold, status = self._rdma_latency_verdict(lt) lines.append(f"| {lt['test']} | {lt.get('latency_us', 0):.2f} us | " - f"<= {lt.get('max_allowed_us', 0)} us | {lt.get('status', '?')} |") + f"<= {threshold:g} us | {status} |") + for it in ibping_tests: + direction = it.get("direction") or it.get("role", "N/A") + if it.get("error"): + lines.append(f"| {it.get('test', 'ibping')} | {it.get('error')} | bidirectional peer evidence | {it.get('status', 'FAIL')} |") + else: + lines.append(f"| {it['test']} | {direction} target={it.get('target', 'N/A')} count={it.get('count', 'N/A')} | " + f"0% packet loss | {it.get('status', '?')} |") lines.append("") + fabric = rdma.get("fabric_counters") or {} + if fabric: + counters = fabric.get("counters", {}) + lines.append(f"- **PFC/ECN/CNP/congestion counters checked:** {len(counters)}") + lines.append(f"- **PFC/ECN/CNP/congestion non-zero:** {'yes' if fabric.get('failed') else 'no'}") + if not counters: + lines.append("- **PFC/ECN/CNP/congestion evidence:** missing") + 
failures = rdma.get("failures") or [] + if not failures: + failures = self._rdma_failure_reasons(rdma) + if failures: + lines.append("- **Failure reasons:**") + for reason in failures: + lines.append(f" - {reason}") passed = rdma.get("passed", False) lines.append(f"**Overall: {'PASS' if passed else 'FAIL'}**\n") # --- Training --- training = results.get("training") if training and not training.get("error"): + training_status, training_detail, training_missing = self._training_verdict(training) lines.append("## Training Simulation\n") lines.append("| Metric | Value |") lines.append("|--------|-------|") @@ -405,8 +570,14 @@ class ReportGenerator: lines.append(f"| Params | {training.get('total_params_m', 0):.1f}M |") lines.append(f"| Throughput | {training.get('throughput_tokens_per_sec', 0):.0f} tokens/sec |") lines.append(f"| Avg Step Time | {training.get('avg_step_time_ms', 0):.1f} ms |") + lines.append(f"| Warmup Steps | {training.get('warmup_steps', 'N/A')} |") lines.append(f"| Peak Memory | {training.get('peak_memory_gb', 0):.1f} GB |") lines.append(f"| Final Loss | {training.get('final_loss', 'N/A')} |") + lines.append(f"| Step Jitter | {training.get('step_jitter_pct', 'N/A')}% |") + lines.append(f"| Distributed Mode | {training.get('distributed_mode', 'N/A')} |") + if training_missing: + lines.append(f"| Acceptance Gaps | missing {', '.join(training_missing)} |") + lines.append(f"| Verdict | {training_status} ({training_detail}) |") lines.append("") # --- Footer --- @@ -441,6 +612,101 @@ class ReportGenerator: return bench["compute"] return {} + @staticmethod + def _training_verdict(training: dict) -> tuple[str, str, list[str]]: + """Return report status for both current and legacy training result schemas.""" + tps = float(training.get("throughput_tokens_per_sec", 0) or 0) + if "passed" in training: + status = "PASS" if training.get("passed") else "FAIL" + return status, f"{tps:.0f} tokens/sec", [] + + required = ["passed", "step_jitter_pct", 
"distributed_mode", "loss_finite"] + missing = [k for k in required if k not in training] + return "UNVERIFIED", f"{tps:.0f} tokens/sec; legacy result lacks explicit acceptance verdict", missing + + def _rdma_cfg_value(self, key: str, default: float) -> float: + try: + return float((self.config.get("rdma", {}) or {}).get(key, default)) + except (TypeError, ValueError): + return default + + def _rdma_bandwidth_verdict(self, row: dict) -> tuple[float, str]: + threshold = self._rdma_cfg_value("min_bandwidth_gbps", 47.0) + value = float(row.get("bandwidth_gbps", 0) or 0) + return threshold, "PASS" if value >= threshold else "FAIL" + + def _rdma_latency_verdict(self, row: dict) -> tuple[float, str]: + name = row.get("test", "") + if name == "ib_write_lat": + threshold = self._rdma_cfg_value("max_write_latency_us", 2.0) + elif name == "ib_read_lat": + threshold = self._rdma_cfg_value("max_read_latency_us", 3.5) + else: + threshold = self._rdma_cfg_value("max_latency_us", 3.5) + value = float(row.get("latency_us", 0) or 0) + return threshold, "PASS" if 0 < value <= threshold else "FAIL" + + def _rdma_legacy_note(self, rdma: dict) -> str: + """Flag old RDMA result schemas whose embedded thresholds were looser.""" + for row in rdma.get("bandwidth_tests", []) or []: + if row.get("min_required_gbps") != self._rdma_cfg_value("min_bandwidth_gbps", 47.0): + return ( + "Legacy RDMA result re-evaluated with current PDF acceptance thresholds; " + "old WARN statuses and old 50GB/s/10us limits are not used for verdict." + ) + for row in rdma.get("latency_tests", []) or []: + threshold, _ = self._rdma_latency_verdict(row) + if row.get("max_allowed_us") != threshold: + return ( + "Legacy RDMA result re-evaluated with current PDF acceptance thresholds; " + "old WARN statuses and old 50GB/s/10us limits are not used for verdict." 
+ ) + return "" + + def _rdma_failure_reasons(self, rdma: dict) -> list[str]: + failures = [] + for row in rdma.get("bandwidth_tests", []) or []: + threshold, status = self._rdma_bandwidth_verdict(row) + if status != "PASS": + failures.append( + f"{row.get('test')} bandwidth {row.get('bandwidth_gbps', 0)}GB/s < {threshold:g}GB/s" + ) + for row in rdma.get("latency_tests", []) or []: + threshold, status = self._rdma_latency_verdict(row) + if status != "PASS": + failures.append( + f"{row.get('test')} latency {row.get('latency_us', 0)}us > {threshold:g}us" + ) + for row in rdma.get("ibping_tests", []) or []: + if row.get("status") != "PASS": + failures.append(f"{row.get('test')} failed") + return failures + + @staticmethod + def _overall_acceptance_verdict(summary_items: list[tuple[str, str]]) -> tuple[str, list[tuple[str, str]], list[str]]: + """PDF-style machine verdict: every required item must be present and PASS.""" + required = [ + "GPU Info", + "Health Check", + "Memory Bandwidth", + "Compute Throughput", + "NVLink/NVSwitch", + "NCCL", + "Stress Test", + "RDMA", + "DCGM", + "Training", + ] + status_by_name = dict(summary_items) + missing = [name for name in required if name not in status_by_name] + failures = [ + (name, status) + for name, status in summary_items + if name in required and not str(status).startswith("PASS") + ] + verdict = "PASS" if not missing and not failures else "FAIL" + return verdict, failures, missing + def _build_summary(self, results: dict) -> list[tuple[str, str]]: """Build summary verdict list from results.""" items = [] @@ -473,7 +739,7 @@ class ReportGenerator: d2d = mem.get("d2d_bandwidth_gbps") or 0 items.append(("Memory Bandwidth", f"WARN ({d2d:.0f} GB/s via PyTorch fallback)")) else: - eff = mem.get("efficiency_pct") or 0 + eff = mem.get("d2d_efficiency_pct") or mem.get("efficiency_pct") or 0 verdict = "PASS" if eff >= 80 else ("WARN" if eff >= 60 else "FAIL") items.append(("Memory Bandwidth", f"{verdict} ({eff:.1f}%)")) @@ 
-491,25 +757,43 @@ class ReportGenerator: rank = {"PASS": 0, "WARN": 1, "FAIL": 2} worst_status = "PASS" worst_dt = None + lowest_margin = None for dt, thr in pass_thresholds.items(): val = per_dtype.get(dt) if not isinstance(val, (int, float)): continue if val >= thr: st = "PASS" - elif val >= thr * 0.9: - st = "WARN" else: st = "FAIL" + margin = val / thr if thr else 0 + if lowest_margin is None or margin < lowest_margin: + lowest_margin = margin + worst_dt = dt if rank[st] > rank[worst_status]: worst_status = st - worst_dt = dt if worst_dt: - items.append(( - "Compute Throughput", - f"{worst_status} (worst {worst_dt.upper()} " - f"{per_dtype[worst_dt]:.0f} vs >= {pass_thresholds[worst_dt]})" - )) + consistency = comp.get("consistency", {}) or {} + failed_consistency = [ + (dt, row) + for dt, row in consistency.items() + if not row.get("passed", False) + ] + if failed_consistency: + worst_status = "FAIL" + fail_dt, fail_row = failed_consistency[0] + items.append(( + "Compute Throughput", + f"FAIL ({fail_dt.upper()} spread " + f"{fail_row.get('spread_pct', 0):.2f}% > " + f"{fail_row.get('max_allowed_pct', 3)}%)" + )) + else: + items.append(( + "Compute Throughput", + f"{worst_status} (worst {worst_dt.upper()} " + f"{per_dtype[worst_dt]:.0f} vs >= {pass_thresholds[worst_dt]})" + )) else: items.append(("Compute Throughput", f"{worst_status}")) else: @@ -521,11 +805,32 @@ class ReportGenerator: else: items.append(("Compute Throughput", "N/A")) + # NCCL + if "nvlink" in results: + nvl = results["nvlink"] + if nvl.get("error"): + items.append(("NVLink/NVSwitch", f"ERROR: {nvl['error']}")) + elif nvl.get("passed"): + items.append(("NVLink/NVSwitch", "PASS")) + else: + items.append(("NVLink/NVSwitch", "FAIL")) + + if "dcgm" in results: + d = results["dcgm"] + if d.get("error"): + items.append(("DCGM", f"ERROR: {d['error']}")) + elif d.get("passed"): + items.append(("DCGM", "PASS")) + else: + items.append(("DCGM", "FAIL")) + # NCCL if "nccl" in results: n = 
results["nccl"] if n.get("error"): items.append(("NCCL", f"ERROR: {n['error']}")) + elif n.get("source") == "torchrun_fallback": + items.append(("NCCL", "FAIL (no nccl-tests bus BW)")) elif n.get("passed"): items.append(("NCCL", "PASS")) else: @@ -559,7 +864,7 @@ class ReportGenerator: if t.get("error"): items.append(("Training", f"ERROR: {t['error']}")) else: - tps = t.get("throughput_tokens_per_sec", 0) - items.append(("Training", f"PASS ({tps:.0f} tokens/sec)")) + status, detail, _missing = self._training_verdict(t) + items.append(("Training", f"{status} ({detail})")) return items diff --git a/modules/stress_test.py b/modules/stress_test.py index 8b69d1c..460b3b1 100644 --- a/modules/stress_test.py +++ b/modules/stress_test.py @@ -1,9 +1,10 @@ -"""GPU stress test module — wraps gpu-burn for long-running stability tests.""" +"""GPU stress test module — gpu-burn or PyTorch GEMM with telemetry.""" import glob import os import shutil import subprocess +import threading import time from datetime import datetime @@ -46,7 +47,7 @@ class StressTest: memory_pct = cfg.get("memory_pct", 90) target_gpus = cfg.get("gpus", "all") - gpu_burn = self._find_gpu_burn() + gpu_burn = self._find_gpu_burn() if cfg.get("use_gpu_burn", False) else "" if gpu_burn: # Try gpu-burn first @@ -60,7 +61,7 @@ class StressTest: return result - self.console.print("[yellow]gpu_burn not found, using PyTorch stress test[/yellow]") + self.console.print("[yellow]Using PyTorch stress test[/yellow]") return self._run_pytorch_stress(duration_sec, memory_pct) def _run_gpu_burn(self, gpu_burn: str, duration: int, @@ -77,12 +78,26 @@ class StressTest: cmd.append(str(duration)) t0 = time.time() + xid_before = self._collect_xid_events() + interval = int(self.stress_cfg.get("telemetry_interval_sec", 1)) + telemetry = [] + stop_sampling = threading.Event() + sampler = threading.Thread( + target=self._sample_telemetry, + args=(telemetry, stop_sampling, interval), + daemon=True, + ) + sampler.start() try: r = 
subprocess.run(cmd, capture_output=True, text=True, timeout=duration + 120) elapsed = round(time.time() - t0, 1) + stop_sampling.set() + sampler.join(timeout=interval + 1) output = r.stdout + r.stderr - passed = r.returncode == 0 + xid_events = self._new_xid_events(xid_before, self._collect_xid_events()) + telemetry_summary = self._evaluate_telemetry(telemetry, [], xid_events) + passed = r.returncode == 0 and telemetry_summary.get("passed", False) gpu_results = [] for line in output.split("\n"): @@ -96,25 +111,36 @@ class StressTest: "duration_sec": duration, "elapsed_sec": elapsed, "gpu_results": gpu_results, + "telemetry": telemetry_summary, "raw_output_tail": output[-500:] if output else "", "timestamp": datetime.now().isoformat(), } except subprocess.TimeoutExpired: + stop_sampling.set() return { "source": "gpu-burn", "passed": False, "duration_sec": duration, "error": "timeout", + "telemetry": self._evaluate_telemetry( + telemetry, [], self._new_xid_events(xid_before, self._collect_xid_events()) + ), "timestamp": datetime.now().isoformat(), } except Exception as e: + stop_sampling.set() return { "source": "gpu-burn", "passed": False, "error": str(e), + "telemetry": self._evaluate_telemetry( + telemetry, [], self._new_xid_events(xid_before, self._collect_xid_events()) + ), "timestamp": datetime.now().isoformat(), } + finally: + stop_sampling.set() def _run_pytorch_stress(self, duration: int, memory_pct: int = 90) -> dict: try: @@ -127,58 +153,79 @@ class StressTest: gpu_count = torch.cuda.device_count() self.console.print(f"[cyan]PyTorch Stress Test ({duration}s, {gpu_count} GPUs, target {memory_pct}% memory)[/cyan]") + dtype_name = self.stress_cfg.get("dtype", "bf16") + matrix_size = int(self.stress_cfg.get("matrix_size", 8192)) + interval = int(self.stress_cfg.get("telemetry_interval_sec", 1)) + dtype_map = {"fp16": torch.float16, "bf16": torch.bfloat16, "fp32": torch.float32} + dtype = dtype_map.get(dtype_name, torch.bfloat16) + gpu_status = {} + telemetry = 
[] + stop_sampling = threading.Event() t0 = time.time() + xid_before = self._collect_xid_events() try: + sampler = threading.Thread( + target=self._sample_telemetry, + args=(telemetry, stop_sampling, interval), + daemon=True, + ) + sampler.start() tensors = {} + ballast = {} + pass_tflops = [] for i in range(gpu_count): with torch.cuda.device(i): - # Get actual free memory (accounting for other processes) free_mem, total_mem = torch.cuda.mem_get_info(i) - - # Calculate allocation from configured memory_pct - target_mem = int(total_mem * memory_pct / 100) - - # Cap at actual free memory with 5% safety margin - alloc_bytes = min(target_mem, int(free_mem * 0.95)) - - # matmul(A, A.T) needs 2x input memory (input + output) - mem_side = int((alloc_bytes / 4 / 2) ** 0.5) - # Cap compute matrix so a single matmul completes in ~2s on H100/H200 - # (FP32 ≈ 67 TFLOPS → 2*4096³/67e12 ≈ 2s). Without this cap, a 141GB - # HBM yields side ≈ 131K → single matmul ~68s × 8 GPUs serial → loop - # overshoots a 60s duration request by 10×+. 
- MAX_COMPUTE_SIDE = 4096 - side = min(mem_side, MAX_COMPUTE_SIDE) - - actual_mem_mb = side * side * 4 / 1024 / 1024 + side = matrix_size + elem = torch.tensor([], dtype=dtype).element_size() + compute_bytes = side * side * elem * 3 + target_mem = min(int(total_mem * memory_pct / 100), int(free_mem * 0.90)) + ballast_bytes = max(0, target_mem - compute_bytes) + if ballast_bytes: + ballast_elems = ballast_bytes // 2 + ballast[i] = torch.empty(ballast_elems, device=f"cuda:{i}", dtype=torch.float16) + actual_mem_mb = (compute_bytes + ballast_bytes) / 1024 / 1024 total_mem_mb = total_mem / 1024 / 1024 free_mem_mb = free_mem / 1024 / 1024 - + self.console.print( f" [dim]GPU {i}: total {total_mem_mb:.0f}MB, free {free_mem_mb:.0f}MB, " f"alloc {actual_mem_mb:.0f}MB ({actual_mem_mb/total_mem_mb*100:.0f}%) - " - f"matrix {side}x{side}[/dim]" + f"{dtype_name} matrix {side}x{side}[/dim]" + ) + tensors[i] = ( + torch.randn(side, side, device=f"cuda:{i}", dtype=dtype), + torch.randn(side, side, device=f"cuda:{i}", dtype=dtype), + torch.empty(side, side, device=f"cuda:{i}", dtype=dtype), ) - tensors[i] = torch.randn(side, side, device=f"cuda:{i}", dtype=torch.float32) self.console.print(f"\n[cyan]Starting stress test for {duration} seconds...[/cyan]") elapsed_check = 0 while time.time() - t0 < duration: + loop_start = time.perf_counter() # Dispatch matmul on all GPUs in parallel — do NOT synchronize between # GPUs, otherwise the 8 GPUs run serially and overshoot the duration. 
for i in range(gpu_count): with torch.cuda.device(i): - tensors[i] = torch.matmul(tensors[i], tensors[i].T) + a, b, out = tensors[i] + torch.matmul(a, b, out=out) # Single sync per pass — waits for all 8 streams concurrently for i in range(gpu_count): with torch.cuda.device(i): torch.cuda.synchronize() + loop_elapsed = time.perf_counter() - loop_start + current_elapsed = time.time() - t0 + if loop_elapsed > 0: + flops = gpu_count * 2 * (matrix_size ** 3) + pass_tflops.append({ + "elapsed_sec": current_elapsed, + "tflops": flops / loop_elapsed / 1e12, + }) # Show progress every 10 seconds - current_elapsed = time.time() - t0 if int(current_elapsed) != int(elapsed_check) and int(current_elapsed) % 10 == 0: self.console.print(f" [dim]Running {int(current_elapsed)}s / {duration}s[/dim]") elapsed_check = current_elapsed @@ -198,21 +245,196 @@ class StressTest: "duration_sec": duration, "error": error_msg, "gpu_status": gpu_status, + "telemetry": self._evaluate_telemetry( + telemetry, pass_tflops if "pass_tflops" in locals() else [], + self._new_xid_events(xid_before, self._collect_xid_events()), + ), } finally: + stop_sampling.set() tensors.clear() + ballast.clear() torch.cuda.empty_cache() elapsed = round(time.time() - t0, 1) + xid_events = self._new_xid_events(xid_before, self._collect_xid_events()) + telemetry_summary = self._evaluate_telemetry(telemetry, pass_tflops, xid_events) + passed = all(v == "PASS" for v in gpu_status.values()) and telemetry_summary.get("passed", False) return { "source": "pytorch", - "passed": True, + "passed": passed, "duration_sec": duration, "elapsed_sec": elapsed, "gpu_status": gpu_status, + "telemetry": telemetry_summary, "timestamp": datetime.now().isoformat(), } + def _sample_telemetry(self, telemetry: list, stop_event: threading.Event, interval: int): + query = "index,temperature.gpu,power.draw,clocks_throttle_reasons.active" + while not stop_event.is_set(): + try: + r = subprocess.run( + ["nvidia-smi", f"--query-gpu={query}", 
"--format=csv,noheader,nounits"], + capture_output=True, text=True, timeout=10, + ) + if r.returncode == 0: + sample = {"time": time.time(), "gpus": []} + for line in r.stdout.splitlines(): + parts = [p.strip() for p in line.split(",")] + if len(parts) >= 4: + sample["gpus"].append({ + "index": int(parts[0]), + "temp_c": float(parts[1]), + "power_w": float(parts[2]), + "throttle": parts[3], + }) + telemetry.append(sample) + except Exception: + pass + stop_event.wait(interval) + + def _collect_xid_events(self) -> list[str]: + try: + r = subprocess.run( + ["dmesg", "--color=never"], + capture_output=True, text=True, timeout=10, + ) + if r.returncode != 0: + return [] + return [ + line.strip() + for line in r.stdout.splitlines() + if any(token in line.upper() for token in ("XID", "NVRM: XID")) + ] + except Exception: + return [] + + @staticmethod + def _new_xid_events(before: list[str], after: list[str]) -> list[str]: + seen = set(before) + return [line for line in after if line not in seen] + + def _evaluate_telemetry(self, telemetry: list, pass_tflops: list, xid_events: list[str] | None = None) -> dict: + cfg = self.stress_cfg + max_temp = float(cfg.get("max_temp_c", 80)) + max_delta = float(cfg.get("max_temp_delta_c", 5)) + min_power = float(cfg.get("min_power_watts", 630)) + max_jitter = float(cfg.get("max_tflops_jitter_pct", 5)) + require_jitter = bool(cfg.get("require_tflops_jitter", True)) + duration = float(cfg.get("duration_sec", 60)) + requested_warmup = float(cfg.get("warmup_sec", 60)) + warmup_sec = min(requested_warmup, max(0.0, duration * 0.2)) + min_steady_samples = int(cfg.get("min_steady_samples", 10)) + temps = {} + powers = {} + throttle_bad = [] + xid_events = xid_events or [] + steady_telemetry = [ + sample for sample in telemetry + if sample.get("time", 0) - telemetry[0].get("time", 0) >= warmup_sec + ] if telemetry else [] + evaluation_samples = steady_telemetry if len(steady_telemetry) >= min_steady_samples else telemetry + for sample in 
evaluation_samples: + for g in sample.get("gpus", []): + idx = g["index"] + temps.setdefault(idx, []).append(g["temp_c"]) + powers.setdefault(idx, []).append(g["power_w"]) + try: + bitmask = int(str(g["throttle"]), 16) + except ValueError: + bitmask = 0 + real_throttle = bitmask & ~0x1 + if real_throttle: + throttle_bad.append({ + "gpu": idx, + "throttle": g["throttle"], + "real_throttle": f"0x{real_throttle:x}", + }) + max_temps = {idx: max(vals) for idx, vals in temps.items() if vals} + avg_powers = {idx: sum(vals) / len(vals) for idx, vals in powers.items() if vals} + temp_delta = (max(max_temps.values()) - min(max_temps.values())) if len(max_temps) >= 2 else 0 + jitter = 0 + steady_tflops = [] + for item in pass_tflops: + if isinstance(item, dict): + if float(item.get("elapsed_sec", 0)) >= warmup_sec: + steady_tflops.append(float(item.get("tflops", 0))) + else: + steady_tflops.append(float(item)) + if len(steady_tflops) < 2 and pass_tflops: + steady_tflops = [ + float(item.get("tflops", 0)) if isinstance(item, dict) else float(item) + for item in pass_tflops + ] + if steady_tflops: + mean = sum(steady_tflops) / len(steady_tflops) + jitter = max(abs(v - mean) / mean * 100 for v in steady_tflops) if mean else 0 + failures = [] + temp_failures = {idx: v for idx, v in max_temps.items() if v > max_temp} + power_failures = {idx: v for idx, v in avg_powers.items() if v < min_power} + if not evaluation_samples: + failures.append("no telemetry samples available for evaluation") + if temp_failures: + failures.append( + "max temperature above threshold: " + + ", ".join(f"GPU {idx} {val:.1f}C" for idx, val in sorted(temp_failures.items())) + ) + if temp_delta > max_delta: + failures.append(f"GPU temperature delta {temp_delta:.1f}C exceeds {max_delta:.1f}C") + if power_failures: + failures.append( + "average steady-state power below threshold: " + + ", ".join(f"GPU {idx} {val:.1f}W" for idx, val in sorted(power_failures.items())) + ) + if throttle_bad: + failures.append( + 
f"non-idle throttle reasons observed in {len(throttle_bad)} samples " + f"(first: GPU {throttle_bad[0]['gpu']} {throttle_bad[0]['real_throttle']})" + ) + if xid_events: + failures.append(f"{len(xid_events)} new XID/NVRM XID events observed") + if require_jitter and len(steady_tflops) < 2: + failures.append( + f"insufficient steady TFLOPS samples for jitter evaluation: {len(steady_tflops)} < 2" + ) + if jitter > max_jitter: + failures.append(f"TFLOPS jitter {jitter:.2f}% exceeds {max_jitter:.2f}%") + passed = ( + bool(evaluation_samples) + and all(v <= max_temp for v in max_temps.values()) + and temp_delta <= max_delta + and all(v >= min_power for v in avg_powers.values()) + and not throttle_bad + and not xid_events + and (not require_jitter or len(steady_tflops) >= 2) + and jitter <= max_jitter + ) + return { + "passed": passed, + "samples": len(telemetry), + "steady_samples": len(evaluation_samples), + "warmup_sec": round(warmup_sec, 1), + "max_temp_c": {k: round(v, 1) for k, v in max_temps.items()}, + "avg_power_w": {k: round(v, 1) for k, v in avg_powers.items()}, + "temp_delta_c": round(temp_delta, 1), + "throttle_events": throttle_bad[:20], + "throttle_event_count": len(throttle_bad), + "xid_events": xid_events[-20:], + "tflops_jitter_pct": round(jitter, 2), + "steady_tflops_samples": len(steady_tflops), + "failures": failures, + "thresholds": { + "max_temp_c": max_temp, + "max_temp_delta_c": max_delta, + "min_power_w": min_power, + "max_tflops_jitter_pct": max_jitter, + "require_tflops_jitter": require_jitter, + "warmup_sec": requested_warmup, + "min_steady_samples": min_steady_samples, + }, + } + @staticmethod def print_results(results: dict, console: Console = None): c = console or Console() @@ -245,5 +467,21 @@ class StressTest: color = "green" if status == "PASS" else "red" c.print(f" GPU {gid}: [{color}]{status}[/{color}]") + telemetry = results.get("telemetry") or {} + if telemetry: + c.print("\n Telemetry:") + c.print(f" Samples: 
{telemetry.get('samples', 0)} total, {telemetry.get('steady_samples', 0)} evaluated after {telemetry.get('warmup_sec', 0)}s warmup") + c.print(f" Avg steady power: {telemetry.get('avg_power_w', {})}") + c.print(f" Max steady temp: {telemetry.get('max_temp_c', {})}") + c.print(f" Temp delta: {telemetry.get('temp_delta_c', 'N/A')} C") + c.print(f" TFLOPS jitter: {telemetry.get('tflops_jitter_pct', 'N/A')}%") + c.print(f" Throttle events: {telemetry.get('throttle_event_count', len(telemetry.get('throttle_events', [])))}") + c.print(f" XID events: {len(telemetry.get('xid_events', []))}") + failures = telemetry.get("failures", []) + if failures: + c.print(" [red]Failure reasons:[/red]") + for reason in failures: + c.print(f" [red]- {reason}[/red]") + if results.get("error"): c.print(f" [red]Error: {results['error']}[/red]") diff --git a/modules/training_sim.py b/modules/training_sim.py index dc7f5a3..af93850 100644 --- a/modules/training_sim.py +++ b/modules/training_sim.py @@ -1,8 +1,13 @@ """Training simulation module - LLM training workload with PyTorch.""" +import json +import os +import sys +import tempfile import time import subprocess import shutil +import math from datetime import datetime from typing import Optional @@ -36,6 +41,7 @@ class TrainingSim: batch_size = self.train_cfg.get("batch_size", 8) seq_length = self.train_cfg.get("seq_length", 2048) num_steps = self.train_cfg.get("num_steps", 50) + warmup_steps = int(self.train_cfg.get("warmup_steps", 5)) dtype_str = self.train_cfg.get("dtype", "bf16") dtype_map = { @@ -47,7 +53,13 @@ class TrainingSim: self.console.print(f"[cyan]Training Simulation[/cyan]") self.console.print(f" Model: {model_name} | Batch: {batch_size} | Seq: {seq_length} | " - f"DType: {dtype_str} | Steps: {num_steps} | GPUs: {gpu_count}") + f"DType: {dtype_str} | Steps: {num_steps} | Warmup: {warmup_steps} | GPUs: {gpu_count}") + + if self.train_cfg.get("mode", "ddp") == "ddp" and gpu_count > 1: + ddp_result = 
self._run_synthetic_ddp(gpu_count, batch_size, seq_length, num_steps, dtype_str) + if ddp_result.get("passed") or not self.train_cfg.get("allow_fallback", False): + return ddp_result + self.console.print("[yellow]DDP synthetic training failed, falling back to single-process synthetic path[/yellow]") try: from transformers import AutoModelForCausalLM, AutoTokenizer @@ -87,9 +99,10 @@ class TrainingSim: BarColumn(), TextColumn("{task.completed}/{task.total}"), TimeElapsedColumn(), console=self.console, ) as progress: - task = progress.add_task("Training steps...", total=num_steps) + total_steps = num_steps + warmup_steps + task = progress.add_task("Training steps...", total=total_steps) - for step in range(num_steps): + for step in range(total_steps): torch.cuda.synchronize() t0 = time.perf_counter() @@ -119,8 +132,15 @@ class TrainingSim: progress.advance(task) - avg_step_time = sum(step_times) / len(step_times) + measured_steps = step_times[warmup_steps:] if len(step_times) > warmup_steps else step_times + avg_step_time = sum(measured_steps) / len(measured_steps) throughput = batch_size * seq_length / avg_step_time + jitter = self._jitter_pct(measured_steps) + peak_mem = round(max(mem_usage) if mem_usage else 0, 2) + final_loss = float(loss.item()) if hasattr(loss, "item") else float("nan") + passed = self._acceptance_pass(throughput, jitter, peak_mem, final_loss) + if self.train_cfg.get("require_distributed", True): + passed = False return { "model": model_name, @@ -130,11 +150,18 @@ class TrainingSim: "batch_size": batch_size, "seq_length": seq_length, "num_steps": num_steps, + "warmup_steps": warmup_steps, + "total_steps": total_steps, "avg_step_time_ms": round(avg_step_time * 1000, 1), "throughput_tokens_per_sec": round(throughput, 0), "throughput_samples_per_sec": round(batch_size / avg_step_time, 2), - "peak_memory_gb": round(max(mem_usage) if mem_usage else 0, 2), - "final_loss": round(loss.item(), 4) if hasattr(loss, 'item') else None, + "peak_memory_gb": 
peak_mem, + "final_loss": round(final_loss, 4), + "step_jitter_pct": round(jitter, 2), + "distributed_mode": "device_map", + "loss_finite": math.isfinite(final_loss), + "passed": passed, + "acceptance_gap": "8-GPU DDP was not used" if self.train_cfg.get("require_distributed", True) else "", "timestamp": datetime.now().isoformat(), } @@ -142,6 +169,196 @@ class TrainingSim: self.console.print(f"[yellow]Model loading failed: {e}[/yellow]") return self._run_synthetic(gpu_count, batch_size, seq_length, num_steps, dtype) + def _run_synthetic_ddp(self, gpu_count: int, batch_size: int, seq_length: int, + num_steps: int, dtype_str: str) -> dict: + """Run the 1.5B synthetic Transformer with one process per GPU.""" + torchrun = os.path.join(os.path.dirname(sys.executable), "torchrun") + if not os.path.isfile(torchrun): + torchrun = shutil.which("torchrun") or "" + if not torchrun: + return { + "model": "synthetic_transformer_1.5b", + "gpu_count": gpu_count, + "distributed_mode": "ddp", + "passed": False, + "error": "torchrun not found", + "timestamp": datetime.now().isoformat(), + } + + script = r''' +import json +import math +import os +import time +import torch +import torch.distributed as dist +from torch.nn.parallel import DistributedDataParallel as DDP + +def main(): + local_rank = int(os.environ["LOCAL_RANK"]) + world_size = int(os.environ["WORLD_SIZE"]) + torch.cuda.set_device(local_rank) + dist.init_process_group("nccl") + + global_batch = int(os.environ["TRAIN_BATCH_SIZE"]) + local_batch = max(1, global_batch // world_size) + seq_length = int(os.environ["TRAIN_SEQ_LENGTH"]) + num_steps = int(os.environ["TRAIN_NUM_STEPS"]) + warmup_steps = int(os.environ.get("TRAIN_WARMUP_STEPS", "5")) + total_steps = num_steps + warmup_steps + dtype_name = os.environ.get("TRAIN_DTYPE", "bf16") + dtype = {"fp16": torch.float16, "bf16": torch.bfloat16, "fp32": torch.float32}.get(dtype_name, torch.bfloat16) + + hidden_size = 4096 + num_layers = 6 + num_heads = 32 + vocab_size = 32000 + 
+ class SyntheticTransformer(torch.nn.Module): + def __init__(self): + super().__init__() + self.embed = torch.nn.Embedding(vocab_size, hidden_size) + self.layers = torch.nn.ModuleList([ + torch.nn.TransformerEncoderLayer( + d_model=hidden_size, + nhead=num_heads, + dim_feedforward=hidden_size * 4, + batch_first=True, + dtype=dtype, + ) for _ in range(num_layers) + ]) + self.head = torch.nn.Linear(hidden_size, vocab_size, dtype=dtype) + + def forward(self, x): + h = self.embed(x).to(dtype) + for layer in self.layers: + h = layer(h) + return self.head(h) + + model = SyntheticTransformer().cuda() + total_params = sum(p.numel() for p in model.parameters()) + model = DDP(model, device_ids=[local_rank], output_device=local_rank) + optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4) + input_ids = torch.randint(0, vocab_size, (local_batch, seq_length), device="cuda") + step_times = [] + last_loss = torch.tensor(float("nan"), device="cuda") + torch.cuda.reset_peak_memory_stats(local_rank) + + for _ in range(total_steps): + torch.cuda.synchronize() + t0 = time.perf_counter() + with torch.amp.autocast("cuda", dtype=dtype, enabled=dtype in (torch.float16, torch.bfloat16)): + logits = model(input_ids) + loss = torch.nn.functional.cross_entropy(logits.reshape(-1, vocab_size), input_ids.reshape(-1)) + loss.backward() + optimizer.step() + optimizer.zero_grad(set_to_none=True) + torch.cuda.synchronize() + step_times.append(time.perf_counter() - t0) + last_loss = loss.detach() + + peak_mem = torch.tensor(torch.cuda.max_memory_allocated(local_rank) / 1024**3, device="cuda") + dist.all_reduce(peak_mem, op=dist.ReduceOp.MAX) + finite = torch.tensor(1 if math.isfinite(float(last_loss.item())) else 0, device="cuda") + dist.all_reduce(finite, op=dist.ReduceOp.MIN) + + if dist.get_rank() == 0: + measured_steps = step_times[warmup_steps:] if len(step_times) > warmup_steps else step_times + avg_step = sum(measured_steps) / len(measured_steps) + mean = avg_step + jitter = max(abs(v - 
mean) / mean * 100 for v in measured_steps) if mean else 0.0 + throughput = global_batch * seq_length / avg_step if avg_step else 0.0 + print("TRAINING_DDP_JSON=" + json.dumps({ + "model": "synthetic_transformer_1.5b", + "total_params_m": round(total_params / 1e6, 1), + "num_layers": num_layers, + "hidden_size": hidden_size, + "gpu_count": world_size, + "dtype": dtype_name, + "batch_size": global_batch, + "local_batch_size": local_batch, + "seq_length": seq_length, + "num_steps": num_steps, + "warmup_steps": warmup_steps, + "total_steps": total_steps, + "avg_step_time_ms": round(avg_step * 1000, 1), + "throughput_tokens_per_sec": round(throughput, 0), + "throughput_samples_per_sec": round(global_batch / avg_step, 2) if avg_step else 0, + "peak_memory_gb": round(float(peak_mem.item()), 2), + "final_loss": round(float(last_loss.item()), 4), + "step_jitter_pct": round(jitter, 2), + "distributed_mode": "ddp", + "loss_finite": bool(int(finite.item())), + }), flush=True) + dist.destroy_process_group() + +if __name__ == "__main__": + main() +''' + tmp = tempfile.NamedTemporaryFile("w", suffix="_training_ddp.py", delete=False) + tmp.write(script) + tmp.close() + + env = { + **os.environ, + "TRAIN_BATCH_SIZE": str(batch_size), + "TRAIN_SEQ_LENGTH": str(seq_length), + "TRAIN_NUM_STEPS": str(num_steps), + "TRAIN_WARMUP_STEPS": str(int(self.train_cfg.get("warmup_steps", 5))), + "TRAIN_DTYPE": dtype_str, + "NCCL_DEBUG": os.environ.get("NCCL_DEBUG", "WARN"), + } + cmd = [torchrun, f"--nproc_per_node={gpu_count}", tmp.name] + self.console.print(f" Running synthetic 1.5B DDP via torchrun ({gpu_count} processes)...") + try: + timeout = int(self.train_cfg.get("timeout_sec", max(600, num_steps * 180))) + r = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout, env=env) + except subprocess.TimeoutExpired: + os.unlink(tmp.name) + return { + "model": "synthetic_transformer_1.5b", + "gpu_count": gpu_count, + "distributed_mode": "ddp", + "passed": False, + "error": 
"training_ddp_timeout", + "timestamp": datetime.now().isoformat(), + } + finally: + if os.path.exists(tmp.name): + try: + os.unlink(tmp.name) + except OSError: + pass + + marker = "TRAINING_DDP_JSON=" + payload = None + for line in (r.stdout + "\n" + r.stderr).splitlines(): + if marker in line: + payload = line.split(marker, 1)[1].strip() + if r.returncode != 0 or not payload: + return { + "model": "synthetic_transformer_1.5b", + "gpu_count": gpu_count, + "distributed_mode": "ddp", + "passed": False, + "error": (r.stderr or r.stdout or "training_ddp_failed")[-1000:], + "timestamp": datetime.now().isoformat(), + } + + result = json.loads(payload) + loss_value = float(result.get("final_loss", "nan")) + passed = self._acceptance_pass( + float(result.get("throughput_tokens_per_sec", 0)), + float(result.get("step_jitter_pct", 999)), + float(result.get("peak_memory_gb", 999)), + loss_value, + ) and bool(result.get("loss_finite", False)) and result.get("gpu_count") == gpu_count + result.update({ + "passed": passed, + "timestamp": datetime.now().isoformat(), + }) + return result + def _run_synthetic(self, gpu_count, batch_size, seq_length, num_steps, dtype) -> dict: self.console.print(" Running synthetic training benchmark...") @@ -170,11 +387,17 @@ class TrainingSim: h = layer(h) return self.head(h) - model = SyntheticTransformer().cuda() + model = SyntheticTransformer() total_params = sum(p.numel() for p in model.parameters()) self.console.print(f" Synthetic params: {total_params / 1e6:.1f}M") + distributed_mode = "single_gpu" + if gpu_count > 1: + model = torch.nn.DataParallel(model).cuda() + distributed_mode = "data_parallel" + else: + model = model.cuda() model.train() optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4) @@ -183,14 +406,17 @@ class TrainingSim: step_times = [] mem_usage = [] + warmup_steps = int(self.train_cfg.get("warmup_steps", 5)) + total_steps = num_steps + warmup_steps + with Progress( SpinnerColumn(), 
TextColumn("[progress.description]{task.description}"), BarColumn(), TextColumn("{task.completed}/{task.total}"), TimeElapsedColumn(), console=self.console, ) as progress: - task = progress.add_task("Synthetic training...", total=num_steps) + task = progress.add_task("Synthetic training...", total=total_steps) - for step in range(num_steps): + for step in range(total_steps): torch.cuda.synchronize() t0 = time.perf_counter() @@ -206,14 +432,22 @@ class TrainingSim: elapsed = time.perf_counter() - t0 step_times.append(elapsed) - mem_used = torch.cuda.max_memory_allocated() / 1024**3 + mem_used = max(torch.cuda.max_memory_allocated(i) for i in range(gpu_count)) / 1024**3 mem_usage.append(mem_used) - torch.cuda.reset_peak_memory_stats() + for i in range(gpu_count): + torch.cuda.reset_peak_memory_stats(i) progress.advance(task) - avg_step_time = sum(step_times) / len(step_times) + measured_steps = step_times[warmup_steps:] if len(step_times) > warmup_steps else step_times + avg_step_time = sum(measured_steps) / len(measured_steps) throughput = batch_size * seq_length / avg_step_time + jitter = self._jitter_pct(measured_steps) + peak_mem = round(max(mem_usage) if mem_usage else 0, 2) + final_loss = float(loss.item()) + passed = self._acceptance_pass(throughput, jitter, peak_mem, final_loss) + if self.train_cfg.get("require_distributed", True): + passed = False return { "model": "synthetic_transformer", @@ -225,14 +459,36 @@ class TrainingSim: "batch_size": batch_size, "seq_length": seq_length, "num_steps": num_steps, + "warmup_steps": warmup_steps, + "total_steps": total_steps, "avg_step_time_ms": round(avg_step_time * 1000, 1), "throughput_tokens_per_sec": round(throughput, 0), "throughput_samples_per_sec": round(batch_size / avg_step_time, 2), - "peak_memory_gb": round(max(mem_usage) if mem_usage else 0, 2), - "final_loss": round(loss.item(), 4), + "peak_memory_gb": peak_mem, + "final_loss": round(final_loss, 4), + "step_jitter_pct": round(jitter, 2), + 
"distributed_mode": distributed_mode, + "loss_finite": math.isfinite(final_loss), + "passed": passed, + "acceptance_gap": "8-GPU DDP was not used" if self.train_cfg.get("require_distributed", True) else "", "timestamp": datetime.now().isoformat(), } + @staticmethod + def _jitter_pct(step_times: list[float]) -> float: + if not step_times: + return 0.0 + mean = sum(step_times) / len(step_times) + return max(abs(v - mean) / mean * 100 for v in step_times) if mean else 0.0 + + def _acceptance_pass(self, throughput: float, jitter: float, peak_mem: float, loss_value: float) -> bool: + return ( + throughput >= float(self.train_cfg.get("min_tokens_per_sec", 45000)) + and jitter <= float(self.train_cfg.get("max_step_jitter_pct", 3)) + and peak_mem <= float(self.train_cfg.get("max_peak_memory_gb", 70)) + and math.isfinite(loss_value) + ) + @staticmethod def print_results(results: dict, console: Console = None): c = console or Console() @@ -254,11 +510,15 @@ class TrainingSim: ("Batch Size", str(results.get("batch_size", "N/A"))), ("Seq Length", str(results.get("seq_length", "N/A"))), ("Steps", str(results.get("num_steps", "N/A"))), + ("Warmup Steps", str(results.get("warmup_steps", "N/A"))), ("Avg Step Time", f"{results.get('avg_step_time_ms', 'N/A')} ms"), ("Throughput", f"{results.get('throughput_tokens_per_sec', 'N/A')} tokens/s"), ("Samples/sec", f"{results.get('throughput_samples_per_sec', 'N/A')}"), ("Peak Memory", f"{results.get('peak_memory_gb', 'N/A')} GB"), ("Final Loss", str(results.get("final_loss", "N/A"))), + ("Step Jitter", f"{results.get('step_jitter_pct', 'N/A')}%"), + ("Distributed Mode", results.get("distributed_mode", "N/A")), + ("Verdict", "PASS" if results.get("passed") else "FAIL"), ] for label, val in metrics: table.add_row(label, str(val)) diff --git a/reports_all_aikubeworker0016.json b/reports_all_aikubeworker0016.json new file mode 100644 index 0000000..d3db53f --- /dev/null +++ b/reports_all_aikubeworker0016.json @@ -0,0 +1,921 @@ +{ + 
"timestamp": "2026-05-22T15:49:02.368516", + "gpu_info": { + "driver_version": "580.159.03", + "cuda_version": "13.0", + "gpu_count": 8, + "gpus": [ + { + "index": 0, + "name": "NVIDIA H100 80GB HBM3", + "uuid": "GPU-dfbc9513-255d-4fe7-2b77-7b1ec3972e75", + "pci_bus_id": "00000000:18:00.0", + "pcie_link_gen": 5, + "pcie_link_width": 16, + "vram_total_mb": 81559, + "vram_used_mb": 4, + "vram_free_mb": 81076, + "power_draw": 69.98, + "power_limit": 700.0, + "clock_sm": 345, + "clock_mem": 2619, + "temperature": 21, + "fan_speed": 0, + "persistence_mode": false, + "compute_mode": "Default", + "serial_number": "1651924016120", + "ecc_errors_single": 0, + "ecc_errors_double": 0 + }, + { + "index": 1, + "name": "NVIDIA H100 80GB HBM3", + "uuid": "GPU-bb845ef7-d7b5-f011-9395-ea74274e2282", + "pci_bus_id": "00000000:2A:00.0", + "pcie_link_gen": 5, + "pcie_link_width": 16, + "vram_total_mb": 81559, + "vram_used_mb": 4, + "vram_free_mb": 81076, + "power_draw": 67.54, + "power_limit": 700.0, + "clock_sm": 345, + "clock_mem": 2619, + "temperature": 21, + "fan_speed": 0, + "persistence_mode": false, + "compute_mode": "Default", + "serial_number": "1651924015483", + "ecc_errors_single": 0, + "ecc_errors_double": 0 + }, + { + "index": 2, + "name": "NVIDIA H100 80GB HBM3", + "uuid": "GPU-3720cf13-2a34-be38-27be-0a7adc4addc4", + "pci_bus_id": "00000000:3A:00.0", + "pcie_link_gen": 5, + "pcie_link_width": 16, + "vram_total_mb": 81559, + "vram_used_mb": 4, + "vram_free_mb": 81076, + "power_draw": 66.82, + "power_limit": 700.0, + "clock_sm": 345, + "clock_mem": 2619, + "temperature": 22, + "fan_speed": 0, + "persistence_mode": false, + "compute_mode": "Default", + "serial_number": "1651924025595", + "ecc_errors_single": 0, + "ecc_errors_double": 0 + }, + { + "index": 3, + "name": "NVIDIA H100 80GB HBM3", + "uuid": "GPU-87080b2d-ac43-be0d-d574-c193078850ae", + "pci_bus_id": "00000000:5D:00.0", + "pcie_link_gen": 5, + "pcie_link_width": 16, + "vram_total_mb": 81559, + "vram_used_mb": 4, 
+ "vram_free_mb": 81076, + "power_draw": 67.02, + "power_limit": 700.0, + "clock_sm": 345, + "clock_mem": 2619, + "temperature": 21, + "fan_speed": 0, + "persistence_mode": false, + "compute_mode": "Default", + "serial_number": "1651924016862", + "ecc_errors_single": 0, + "ecc_errors_double": 0 + }, + { + "index": 4, + "name": "NVIDIA H100 80GB HBM3", + "uuid": "GPU-599bd883-cc5c-a5dd-6c33-c15f7049da48", + "pci_bus_id": "00000000:9A:00.0", + "pcie_link_gen": 5, + "pcie_link_width": 16, + "vram_total_mb": 81559, + "vram_used_mb": 4, + "vram_free_mb": 81076, + "power_draw": 67.24, + "power_limit": 700.0, + "clock_sm": 345, + "clock_mem": 2619, + "temperature": 21, + "fan_speed": 0, + "persistence_mode": false, + "compute_mode": "Default", + "serial_number": "1651924025670", + "ecc_errors_single": 0, + "ecc_errors_double": 0 + }, + { + "index": 5, + "name": "NVIDIA H100 80GB HBM3", + "uuid": "GPU-a1c6bba4-61b0-e623-06c9-9c88635e26fe", + "pci_bus_id": "00000000:AB:00.0", + "pcie_link_gen": 5, + "pcie_link_width": 16, + "vram_total_mb": 81559, + "vram_used_mb": 4, + "vram_free_mb": 81076, + "power_draw": 69.31, + "power_limit": 700.0, + "clock_sm": 345, + "clock_mem": 2619, + "temperature": 23, + "fan_speed": 0, + "persistence_mode": false, + "compute_mode": "Default", + "serial_number": "1651924027166", + "ecc_errors_single": 0, + "ecc_errors_double": 0 + }, + { + "index": 6, + "name": "NVIDIA H100 80GB HBM3", + "uuid": "GPU-98745a0c-39bd-3e56-d6ca-54ba3647ab6d", + "pci_bus_id": "00000000:BA:00.0", + "pcie_link_gen": 5, + "pcie_link_width": 16, + "vram_total_mb": 81559, + "vram_used_mb": 4, + "vram_free_mb": 81076, + "power_draw": 67.84, + "power_limit": 700.0, + "clock_sm": 345, + "clock_mem": 2619, + "temperature": 21, + "fan_speed": 0, + "persistence_mode": false, + "compute_mode": "Default", + "serial_number": "1651924026234", + "ecc_errors_single": 0, + "ecc_errors_double": 0 + }, + { + "index": 7, + "name": "NVIDIA H100 80GB HBM3", + "uuid": 
"GPU-8c73bd8b-666b-357e-ac5d-c75ac7a759db", + "pci_bus_id": "00000000:DB:00.0", + "pcie_link_gen": 5, + "pcie_link_width": 16, + "vram_total_mb": 81559, + "vram_used_mb": 4, + "vram_free_mb": 81076, + "power_draw": 66.21, + "power_limit": 700.0, + "clock_sm": 345, + "clock_mem": 2619, + "temperature": 21, + "fan_speed": 0, + "persistence_mode": false, + "compute_mode": "Default", + "serial_number": "1651924027255", + "ecc_errors_single": 0, + "ecc_errors_double": 0 + } + ], + "topology": "\t\u001b[4mGPU0\tGPU1\tGPU2\tGPU3\tGPU4\tGPU5\tGPU6\tGPU7\tNIC0\tNIC1\tNIC2\tNIC3\tNIC4\tNIC5\tNIC6\tNIC7\tNIC8\tNIC9\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\u001b[0m\nGPU0\t X \tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\tPIX\tNODE\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t0-55,112-167\t0\t\tN/A\nGPU1\tNV18\t X \tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\tNODE\tPIX\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t0-55,112-167\t0\t\tN/A\nGPU2\tNV18\tNV18\t X \tNV18\tNV18\tNV18\tNV18\tNV18\tNODE\tNODE\tPIX\tPIX\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t0-55,112-167\t0\t\tN/A\nGPU3\tNV18\tNV18\tNV18\t X \tNV18\tNV18\tNV18\tNV18\tNODE\tNODE\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t0-55,112-167\t0\t\tN/A\nGPU4\tNV18\tNV18\tNV18\tNV18\t X \tNV18\tNV18\tNV18\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tPIX\tNODE\tNODE\tNODE\t56-111,168-223\t1\t\tN/A\nGPU5\tNV18\tNV18\tNV18\tNV18\tNV18\t X \tNV18\tNV18\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\tPIX\tNODE\tNODE\t56-111,168-223\t1\t\tN/A\nGPU6\tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\t X \tNV18\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tPIX\tPIX\t56-111,168-223\t1\t\tN/A\nGPU7\tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\t X \tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tNODE\tNODE\t56-111,168-223\t1\t\tN/A\nNIC0\tPIX\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t X \tNODE\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC1\tNODE\tPIX\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tNODE\t X 
\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC2\tNODE\tNODE\tPIX\tNODE\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\t X \tPIX\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC3\tNODE\tNODE\tPIX\tNODE\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tPIX\t X \tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC4\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tNODE\tNODE\t X \tPIX\tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC5\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tNODE\tNODE\tPIX\t X \tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC6\tSYS\tSYS\tSYS\tSYS\tPIX\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\t X \tNODE\tNODE\tNODE\t\t\t\t\nNIC7\tSYS\tSYS\tSYS\tSYS\tNODE\tPIX\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\t X \tNODE\tNODE\t\t\t\t\nNIC8\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tPIX\tNODE\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\t X \tPIX\t\t\t\t\nNIC9\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tPIX\tNODE\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n NIC4: mlx5_4\n NIC5: mlx5_5\n NIC6: mlx5_6\n NIC7: mlx5_7\n NIC8: mlx5_8\n NIC9: mlx5_9\n\n", + "timestamp": "2026-05-22T15:49:09.197459", + "detected_gpu_type": "h100", + "gpu_label": "H100 SXM5" + }, + "health": { + "passed": true, + "gpu_health": [ + { + "index": 0, + "status": "WARN", + "checks": { + "temperature": { + "value": 21, + "status": "PASS", + "threshold": 75 + }, + "power": { + "value": 69.86, + "limit": 700.0, + "status": "PASS" + }, 
+ "ecc_errors": { + "single": 0, + "double": 0, + "status": "PASS" + }, + "memory_errors": { + "status": "PASS" + }, + "pcie_link": { + "gen": 5, + "width": 16, + "status": "PASS" + }, + "clock_speed": { + "sm": 345, + "mem": 2619, + "status": "PASS" + }, + "throttling": { + "status": "PASS", + "reasons": [] + }, + "persistence_mode": { + "enabled": false, + "status": "WARN" + } + } + }, + { + "index": 1, + "status": "WARN", + "checks": { + "temperature": { + "value": 21, + "status": "PASS", + "threshold": 75 + }, + "power": { + "value": 67.48, + "limit": 700.0, + "status": "PASS" + }, + "ecc_errors": { + "single": 0, + "double": 0, + "status": "PASS" + }, + "memory_errors": { + "status": "PASS" + }, + "pcie_link": { + "gen": 5, + "width": 16, + "status": "PASS" + }, + "clock_speed": { + "sm": 345, + "mem": 2619, + "status": "PASS" + }, + "throttling": { + "status": "PASS", + "reasons": [] + }, + "persistence_mode": { + "enabled": false, + "status": "WARN" + } + } + }, + { + "index": 2, + "status": "WARN", + "checks": { + "temperature": { + "value": 22, + "status": "PASS", + "threshold": 75 + }, + "power": { + "value": 66.76, + "limit": 700.0, + "status": "PASS" + }, + "ecc_errors": { + "single": 0, + "double": 0, + "status": "PASS" + }, + "memory_errors": { + "status": "PASS" + }, + "pcie_link": { + "gen": 5, + "width": 16, + "status": "PASS" + }, + "clock_speed": { + "sm": 345, + "mem": 2619, + "status": "PASS" + }, + "throttling": { + "status": "PASS", + "reasons": [] + }, + "persistence_mode": { + "enabled": false, + "status": "WARN" + } + } + }, + { + "index": 3, + "status": "WARN", + "checks": { + "temperature": { + "value": 21, + "status": "PASS", + "threshold": 75 + }, + "power": { + "value": 67.06, + "limit": 700.0, + "status": "PASS" + }, + "ecc_errors": { + "single": 0, + "double": 0, + "status": "PASS" + }, + "memory_errors": { + "status": "PASS" + }, + "pcie_link": { + "gen": 5, + "width": 16, + "status": "PASS" + }, + "clock_speed": { + "sm": 345, + 
"mem": 2619, + "status": "PASS" + }, + "throttling": { + "status": "PASS", + "reasons": [] + }, + "persistence_mode": { + "enabled": false, + "status": "WARN" + } + } + }, + { + "index": 4, + "status": "WARN", + "checks": { + "temperature": { + "value": 21, + "status": "PASS", + "threshold": 75 + }, + "power": { + "value": 67.23, + "limit": 700.0, + "status": "PASS" + }, + "ecc_errors": { + "single": 0, + "double": 0, + "status": "PASS" + }, + "memory_errors": { + "status": "PASS" + }, + "pcie_link": { + "gen": 5, + "width": 16, + "status": "PASS" + }, + "clock_speed": { + "sm": 345, + "mem": 2619, + "status": "PASS" + }, + "throttling": { + "status": "PASS", + "reasons": [] + }, + "persistence_mode": { + "enabled": false, + "status": "WARN" + } + } + }, + { + "index": 5, + "status": "WARN", + "checks": { + "temperature": { + "value": 23, + "status": "PASS", + "threshold": 75 + }, + "power": { + "value": 69.27, + "limit": 700.0, + "status": "PASS" + }, + "ecc_errors": { + "single": 0, + "double": 0, + "status": "PASS" + }, + "memory_errors": { + "status": "PASS" + }, + "pcie_link": { + "gen": 5, + "width": 16, + "status": "PASS" + }, + "clock_speed": { + "sm": 345, + "mem": 2619, + "status": "PASS" + }, + "throttling": { + "status": "PASS", + "reasons": [] + }, + "persistence_mode": { + "enabled": false, + "status": "WARN" + } + } + }, + { + "index": 6, + "status": "WARN", + "checks": { + "temperature": { + "value": 21, + "status": "PASS", + "threshold": 75 + }, + "power": { + "value": 67.81, + "limit": 700.0, + "status": "PASS" + }, + "ecc_errors": { + "single": 0, + "double": 0, + "status": "PASS" + }, + "memory_errors": { + "status": "PASS" + }, + "pcie_link": { + "gen": 5, + "width": 16, + "status": "PASS" + }, + "clock_speed": { + "sm": 345, + "mem": 2619, + "status": "PASS" + }, + "throttling": { + "status": "PASS", + "reasons": [] + }, + "persistence_mode": { + "enabled": false, + "status": "WARN" + } + } + }, + { + "index": 7, + "status": "WARN", + 
"checks": { + "temperature": { + "value": 21, + "status": "PASS", + "threshold": 75 + }, + "power": { + "value": 66.3, + "limit": 700.0, + "status": "PASS" + }, + "ecc_errors": { + "single": 0, + "double": 0, + "status": "PASS" + }, + "memory_errors": { + "status": "PASS" + }, + "pcie_link": { + "gen": 5, + "width": 16, + "status": "PASS" + }, + "clock_speed": { + "sm": 345, + "mem": 2619, + "status": "PASS" + }, + "throttling": { + "status": "PASS", + "reasons": [] + }, + "persistence_mode": { + "enabled": false, + "status": "WARN" + } + } + } + ], + "system_health": { + "nvidia_persistenced": { + "installed": true, + "running": false + }, + "hugepages": { + "configured": false, + "count": 0 + }, + "swap": { + "enabled": true + }, + "transparent_hugepage": "madvise", + "file_descriptors": { + "soft": 1024, + "max": 1048576 + }, + "infiniband_devices": [ + "mlx5_4", + "mlx5_2", + "mlx5_0", + "mlx5_9", + "mlx5_7", + "mlx5_5", + "mlx5_3", + "mlx5_1", + "mlx5_8", + "mlx5_6" + ], + "rdma_devices": [ + "abi_version", + "uverbs4", + "uverbs2", + "uverbs0", + "uverbs9", + "uverbs7", + "uverbs5", + "uverbs3", + "uverbs1", + "uverbs8", + "uverbs6" + ], + "nccl_env_vars": {} + }, + "timestamp": "2026-05-22T15:49:11.294816", + "detected_gpu_type": "h100" + }, + "memory_bench": { + "memory": { + "source": "nvbandwidth", + "h2d_bandwidth_gbps": 55.5, + "d2h_bandwidth_gbps": 55.3, + "d2d_bandwidth_gbps": 486.5, + "h2d_peak_gbps": 64, + "d2h_peak_gbps": 64, + "d2d_peak_gbps": 450.0, + "h2d_efficiency_pct": 86.7, + "d2h_efficiency_pct": 86.4, + "d2d_efficiency_pct": 108.1, + "peak_bandwidth_gbps": 3400, + "efficiency_pct": 108.1, + "results_by_test": { + "h2d": 55.5, + "d2h": 55.3, + "d2d_write": 397.4, + "d2d_read": 395.1, + "d2d_bidir": 486.5 + }, + "per_gpu": [] + } + }, + "compute_bench": { + "compute": { + "per_dtype_tflops": { + "fp32": 51.9, + "tf32": 357.0, + "fp16": 664.0, + "bf16": 700.1, + "fp8": 1116.2 + }, + "peak_tflops": { + "fp32": 67, + "tf32": 495, + "fp16": 990, 
+ "bf16": 990, + "fp8": 1979 + }, + "efficiency_pct": { + "fp32": 77.5, + "tf32": 72.1, + "fp16": 67.1, + "bf16": 70.7, + "fp8": 56.4 + }, + "pass_thresholds_tflops": { + "fp32": 54, + "tf32": 444, + "fp16": 734, + "bf16": 745, + "fp8": 1400 + }, + "per_gpu": [ + { + "index": 0, + "fp32": 51.9, + "tf32": 357.0, + "fp16": 664.0, + "bf16": 700.1, + "fp8": 1116.2 + }, + { + "index": 1, + "fp32": 51.9, + "tf32": 357.0, + "fp16": 664.0, + "bf16": 700.1, + "fp8": 1116.2 + }, + { + "index": 2, + "fp32": 51.9, + "tf32": 357.0, + "fp16": 664.0, + "bf16": 700.1, + "fp8": 1116.2 + }, + { + "index": 3, + "fp32": 51.9, + "tf32": 357.0, + "fp16": 664.0, + "bf16": 700.1, + "fp8": 1116.2 + }, + { + "index": 4, + "fp32": 51.9, + "tf32": 357.0, + "fp16": 664.0, + "bf16": 700.1, + "fp8": 1116.2 + }, + { + "index": 5, + "fp32": 51.9, + "tf32": 357.0, + "fp16": 664.0, + "bf16": 700.1, + "fp8": 1116.2 + }, + { + "index": 6, + "fp32": 51.9, + "tf32": 357.0, + "fp16": 664.0, + "bf16": 700.1, + "fp8": 1116.2 + }, + { + "index": 7, + "fp32": 51.9, + "tf32": 357.0, + "fp16": 664.0, + "bf16": 700.1, + "fp8": 1116.2 + } + ], + "matrix_size": 8192, + "warmup": 50, + "iterations": 500 + } + }, + "nccl": { + "passed": false, + "source": "torchrun_fallback", + "tests": { + "NCCL version 2.21.5+cuda12.4": { + "status": "FAIL", + "error": null + }, + "allreduce": { + "status": "PASS", + "error": null + }, + "broadcast": { + "status": "PASS", + "error": null + }, + "allgather": { + "status": "PASS", + "error": null + }, + "reducescatter": { + "status": "PASS", + "error": null + }, + "alltoall": { + "status": "PASS", + "error": null + } + }, + "gpu_count": 8 + }, + "stress": { + "source": "pytorch", + "passed": true, + "duration_sec": 60, + "elapsed_sec": 60.0, + "gpu_status": { + "0": "PASS", + "1": "PASS", + "2": "PASS", + "3": "PASS", + "4": "PASS", + "5": "PASS", + "6": "PASS", + "7": "PASS" + }, + "timestamp": "2026-05-22T15:51:56.803540" + }, + "rdma": { + "passed": false, + "devices": [ + { + 
"name": "mlx5_0", + "ports": [ + { + "port": "1", + "rate": "400 Gb/sec (4X NDR)", + "state": "4: ACTIVE", + "phys_state": "5: LinkUp", + "gid": "fe80:0000:0000:0000:58a2:e103:0088:81e0" + } + ] + }, + { + "name": "mlx5_1", + "ports": [ + { + "port": "1", + "rate": "400 Gb/sec (4X NDR)", + "state": "4: ACTIVE", + "phys_state": "5: LinkUp", + "gid": "fe80:0000:0000:0000:9c63:c003:0054:e00a" + } + ] + }, + { + "name": "mlx5_2", + "ports": [ + { + "port": "1", + "rate": "25 Gb/sec (1X EDR)", + "state": "4: ACTIVE", + "phys_state": "5: LinkUp", + "gid": "fe80:0000:0000:0000:a02d:75ff:feae:2bcf" + } + ] + }, + { + "name": "mlx5_3", + "ports": [ + { + "port": "1", + "rate": "25 Gb/sec (1X EDR)", + "state": "1: DOWN", + "phys_state": "3: Disabled", + "gid": "fe80:0000:0000:0000:c670:bdff:fefd:5bd9" + } + ] + }, + { + "name": "mlx5_4", + "ports": [ + { + "port": "1", + "rate": "100 Gb/sec (2X HDR)", + "state": "4: ACTIVE", + "phys_state": "5: LinkUp", + "gid": "fe80:0000:0000:0000:9c63:c003:005f:58ec" + } + ] + }, + { + "name": "mlx5_5", + "ports": [ + { + "port": "1", + "rate": "100 Gb/sec (2X HDR)", + "state": "4: ACTIVE", + "phys_state": "5: LinkUp", + "gid": "fe80:0000:0000:0000:9c63:c003:005f:58ed" + } + ] + }, + { + "name": "mlx5_6", + "ports": [ + { + "port": "1", + "rate": "400 Gb/sec (4X NDR)", + "state": "4: ACTIVE", + "phys_state": "5: LinkUp", + "gid": "fe80:0000:0000:0000:9c63:c003:0055:0e56" + } + ] + }, + { + "name": "mlx5_7", + "ports": [ + { + "port": "1", + "rate": "400 Gb/sec (4X NDR)", + "state": "4: ACTIVE", + "phys_state": "5: LinkUp", + "gid": "fe80:0000:0000:0000:a088:c203:00f0:286c" + } + ] + }, + { + "name": "mlx5_8", + "ports": [ + { + "port": "1", + "rate": "25 Gb/sec (1X EDR)", + "state": "4: ACTIVE", + "phys_state": "5: LinkUp", + "gid": "fe80:0000:0000:0000:a02d:75ff:feae:2bcf" + } + ] + }, + { + "name": "mlx5_9", + "ports": [ + { + "port": "1", + "rate": "25 Gb/sec (1X EDR)", + "state": "1: DOWN", + "phys_state": "3: Disabled", + "gid": 
"fe80:0000:0000:0000:c670:bdff:fefd:569d" + } + ] + } + ], + "bandwidth_tests": [ + { + "test": "ib_write_bw", + "status": "WARN", + "bandwidth_gbps": 0.13, + "min_required_gbps": 50 + }, + { + "test": "ib_read_bw", + "status": "WARN", + "bandwidth_gbps": 0.13, + "min_required_gbps": 50 + } + ], + "latency_tests": [ + { + "test": "ib_write_lat", + "status": "PASS", + "latency_us": 4.1, + "max_allowed_us": 10 + }, + { + "test": "ib_read_lat", + "status": "WARN", + "latency_us": 16.0, + "max_allowed_us": 10 + } + ], + "timestamp": "2026-05-22T15:52:03.507540" + }, + "training": { + "model": "synthetic_transformer", + "total_params_m": 1470.5, + "num_layers": 6, + "hidden_size": 4096, + "gpu_count": 8, + "dtype": "bfloat16", + "batch_size": 8, + "seq_length": 2048, + "num_steps": 50, + "avg_step_time_ms": 312.3, + "throughput_tokens_per_sec": 52471.0, + "throughput_samples_per_sec": 25.62, + "peak_memory_gb": 27.31, + "final_loss": 0.0041, + "timestamp": "2026-05-22T15:52:32.650522" + } +} \ No newline at end of file diff --git a/reports_all_aikubeworker0016.md b/reports_all_aikubeworker0016.md new file mode 100644 index 0000000..80dda75 --- /dev/null +++ b/reports_all_aikubeworker0016.md @@ -0,0 +1,157 @@ +# GPU Test Report + +- **Date:** 2026-05-22T15:49:02.368516 +- **Host:** aikubeworker0016 +- **GPU:** NVIDIA H100 80GB HBM3 x8 +- **Driver:** 580.159.03 | **CUDA:** 13.0 + +## Overall Acceptance Verdict + +**Result: FAIL** + +Failed or unverified items: +- Compute Throughput: FAIL (worst FP32 52 vs >= 54) +- NCCL: FAIL (no nccl-tests bus BW) +- RDMA: FAIL +- Training: UNVERIFIED (52471 tokens/sec; legacy result lacks explicit acceptance verdict) + +Missing required evidence: +- NVLink/NVSwitch +- DCGM + +## Summary + +| Test | Result | +|------|--------| +| GPU Info | PASS (8 GPUs detected) | +| Health Check | PASS | +| Memory Bandwidth | PASS (108.1%) | +| Compute Throughput | FAIL (worst FP32 52 vs >= 54) | +| NCCL | FAIL (no nccl-tests bus BW) | +| Stress Test | 
PASS | +| RDMA | FAIL | +| Training | UNVERIFIED (52471 tokens/sec; legacy result lacks explicit acceptance verdict) | + +## GPU Information + +| GPU | Model | VRAM | Temp | Power | SM Clock | +|-----|-------|------|------|-------|----------| +| 0 | NVIDIA H100 80GB HBM3 | 81559 MB | 21C | 70/700W | 345 MHz | +| 1 | NVIDIA H100 80GB HBM3 | 81559 MB | 21C | 68/700W | 345 MHz | +| 2 | NVIDIA H100 80GB HBM3 | 81559 MB | 22C | 67/700W | 345 MHz | +| 3 | NVIDIA H100 80GB HBM3 | 81559 MB | 21C | 67/700W | 345 MHz | +| 4 | NVIDIA H100 80GB HBM3 | 81559 MB | 21C | 67/700W | 345 MHz | +| 5 | NVIDIA H100 80GB HBM3 | 81559 MB | 23C | 69/700W | 345 MHz | +| 6 | NVIDIA H100 80GB HBM3 | 81559 MB | 21C | 68/700W | 345 MHz | +| 7 | NVIDIA H100 80GB HBM3 | 81559 MB | 21C | 66/700W | 345 MHz | + +## Health Check + +**Overall: PASS** + +| GPU | Temp | Power | ECC | PCIe | Throttle | Status | +|-----|------|-------|-----|------|----------|--------| +| 0 | 21C PASS | 70W PASS | S:0 D:0 | Gen5x16 | PASS | **WARN** | +| 1 | 21C PASS | 67W PASS | S:0 D:0 | Gen5x16 | PASS | **WARN** | +| 2 | 22C PASS | 67W PASS | S:0 D:0 | Gen5x16 | PASS | **WARN** | +| 3 | 21C PASS | 67W PASS | S:0 D:0 | Gen5x16 | PASS | **WARN** | +| 4 | 21C PASS | 67W PASS | S:0 D:0 | Gen5x16 | PASS | **WARN** | +| 5 | 23C PASS | 69W PASS | S:0 D:0 | Gen5x16 | PASS | **WARN** | +| 6 | 21C PASS | 68W PASS | S:0 D:0 | Gen5x16 | PASS | **WARN** | +| 7 | 21C PASS | 66W PASS | S:0 D:0 | Gen5x16 | PASS | **WARN** | + +## Memory Bandwidth + +Source: nvbandwidth + +| Metric | Value | Peak | Efficiency | +|--------|-------|------|------------| +| H2D (PCIe) | 55.5 GB/s | 64 GB/s | 86.7% | +| D2H (PCIe) | 55.3 GB/s | 64 GB/s | 86.4% | +| D2D (NVLink) | 486.5 GB/s | 450 GB/s | 108.1% | + +**Verdict: PASS** (D2D efficiency 108.1%) + +## Compute Throughput + +| DType | Achieved (TFLOPS) | Peak | Threshold | Status | +|-------|-------------------|------|------------|--------| +| FP32 | 51.9 | 67 | >= 54 | FAIL | +| TF32 | 357.0 | 495 
| >= 444 | FAIL | +| FP16 | 664.0 | 990 | >= 734 | FAIL | +| BF16 | 700.1 | 990 | >= 745 | FAIL | +| FP8 | 1116.2 | 1979 | >= 1400 | FAIL | + +**Verdict: FAIL** (absolute TFLOPS thresholds; worst efficiency 56.4%) + +### Compute Per-GPU TFLOPS + +| GPU | FP32 | TF32 | FP16 | BF16 | FP8 | +|---|---|---|---|---|---| +| 0 | 51.9 | 357.0 | 664.0 | 700.1 | 1116.2 | +| 1 | 51.9 | 357.0 | 664.0 | 700.1 | 1116.2 | +| 2 | 51.9 | 357.0 | 664.0 | 700.1 | 1116.2 | +| 3 | 51.9 | 357.0 | 664.0 | 700.1 | 1116.2 | +| 4 | 51.9 | 357.0 | 664.0 | 700.1 | 1116.2 | +| 5 | 51.9 | 357.0 | 664.0 | 700.1 | 1116.2 | +| 6 | 51.9 | 357.0 | 664.0 | 700.1 | 1116.2 | +| 7 | 51.9 | 357.0 | 664.0 | 700.1 | 1116.2 | + +## NCCL Multi-GPU + +Source: torchrun_fallback | GPUs: 8 + +> Functional NCCL smoke only: nccl-tests bus bandwidth was not measured, so this does not satisfy production acceptance. + +| Operation | Bus BW (GB/s) | Threshold | Status | +|-----------|---------------|-----------|--------| +| NCCL version 2.21.5+cuda12.4 | 0.0 | >= 0 | FAIL | +| allreduce | 0.0 | >= 0 | PASS | +| broadcast | 0.0 | >= 0 | PASS | +| allgather | 0.0 | >= 0 | PASS | +| reducescatter | 0.0 | >= 0 | PASS | +| alltoall | 0.0 | >= 0 | PASS | + +**Overall: FAIL** + +## Stress Test + +- **Source:** pytorch +- **Duration:** 60s (requested 60s) +- **Result: PASS** + +## RDMA/InfiniBand + +> Legacy RDMA result re-evaluated with current PDF acceptance thresholds; old WARN statuses and old 50GB/s/10us limits are not used for verdict. 
+ +| Test | Value | Threshold | Status | +|------|-------|-----------|--------| +| ib_write_bw | 0.1 GB/s | >= 47 GB/s | FAIL | +| ib_read_bw | 0.1 GB/s | >= 47 GB/s | FAIL | +| ib_write_lat | 4.10 us | <= 2 us | FAIL | +| ib_read_lat | 16.00 us | <= 3.5 us | FAIL | + +- **Failure reasons:** + - ib_write_bw bandwidth 0.13GB/s < 47GB/s + - ib_read_bw bandwidth 0.13GB/s < 47GB/s + - ib_write_lat latency 4.1us > 2us + - ib_read_lat latency 16.0us > 3.5us +**Overall: FAIL** + +## Training Simulation + +| Metric | Value | +|--------|-------| +| Model | synthetic_transformer | +| Params | 1470.5M | +| Throughput | 52471 tokens/sec | +| Avg Step Time | 312.3 ms | +| Peak Memory | 27.3 GB | +| Final Loss | 0.0041 | +| Step Jitter | N/A% | +| Distributed Mode | N/A | +| Acceptance Gaps | missing passed, step_jitter_pct, distributed_mode, loss_finite | +| Verdict | UNVERIFIED (52471 tokens/sec; legacy result lacks explicit acceptance verdict) | + +--- +*Generated by GPU Test Suite v0.2.0* \ No newline at end of file diff --git a/reports_dcgm_r3_aikubeworker0012_20260522_200338.md b/reports_dcgm_r3_aikubeworker0012_20260522_200338.md new file mode 100644 index 0000000..1663b83 --- /dev/null +++ b/reports_dcgm_r3_aikubeworker0012_20260522_200338.md @@ -0,0 +1,65 @@ +# GPU Test Report + +- **Date:** 2026-05-22T20:26:56.947796 +- **Host:** aikubeworker0012 + +## Overall Acceptance Verdict + +**Result: FAIL** + +Missing required evidence: +- GPU Info +- Health Check +- Memory Bandwidth +- Compute Throughput +- NVLink/NVSwitch +- NCCL +- Stress Test +- RDMA +- Training + +## Summary + +| Test | Result | +|------|--------| +| DCGM | PASS | + +## DCGM Diagnostic + +**Overall: PASS** + +| Subtest | Status | +|---------|--------| +| Hardware/nvbandwidth/GPU6 | PASS | +| Hardware/nvbandwidth/GPU7 | PASS | +| Hardware/nvbandwidth/summary | PASS | +| Integration/pcie/GPU0 | PASS | +| Integration/pcie/GPU1 | PASS | +| Integration/pcie/GPU2 | PASS | +| Integration/pcie/GPU3 | PASS | +| 
Integration/pcie/GPU4 | PASS | +| Integration/pcie/GPU5 | PASS | +| Integration/pcie/GPU6 | PASS | +| Integration/pcie/GPU7 | PASS | +| Integration/pcie/summary | PASS | +| Stress/targeted_stress/GPU0 | PASS | +| Stress/targeted_stress/GPU1 | PASS | +| Stress/targeted_stress/GPU2 | PASS | +| Stress/targeted_stress/GPU3 | PASS | +| Stress/targeted_stress/GPU4 | PASS | +| Stress/targeted_stress/GPU5 | PASS | +| Stress/targeted_stress/GPU6 | PASS | +| Stress/targeted_stress/GPU7 | PASS | +| Stress/targeted_stress/summary | PASS | +| Stress/targeted_power/GPU0 | PASS | +| Stress/targeted_power/GPU1 | PASS | +| Stress/targeted_power/GPU2 | PASS | +| Stress/targeted_power/GPU3 | PASS | +| Stress/targeted_power/GPU4 | PASS | +| Stress/targeted_power/GPU5 | PASS | +| Stress/targeted_power/GPU6 | PASS | +| Stress/targeted_power/GPU7 | PASS | +| Stress/targeted_power/summary | PASS | + +--- +*Generated by GPU Test Suite v0.2.0* \ No newline at end of file diff --git a/reports_dcgm_r3_aikubeworker0016_20260522_200538.md b/reports_dcgm_r3_aikubeworker0016_20260522_200538.md new file mode 100644 index 0000000..f51b5bf --- /dev/null +++ b/reports_dcgm_r3_aikubeworker0016_20260522_200538.md @@ -0,0 +1,65 @@ +# GPU Test Report + +- **Date:** 2026-05-22T20:28:58.716266 +- **Host:** aikubeworker0016 + +## Overall Acceptance Verdict + +**Result: FAIL** + +Missing required evidence: +- GPU Info +- Health Check +- Memory Bandwidth +- Compute Throughput +- NVLink/NVSwitch +- NCCL +- Stress Test +- RDMA +- Training + +## Summary + +| Test | Result | +|------|--------| +| DCGM | PASS | + +## DCGM Diagnostic + +**Overall: PASS** + +| Subtest | Status | +|---------|--------| +| Hardware/nvbandwidth/GPU6 | PASS | +| Hardware/nvbandwidth/GPU7 | PASS | +| Hardware/nvbandwidth/summary | PASS | +| Integration/pcie/GPU0 | PASS | +| Integration/pcie/GPU1 | PASS | +| Integration/pcie/GPU2 | PASS | +| Integration/pcie/GPU3 | PASS | +| Integration/pcie/GPU4 | PASS | +| Integration/pcie/GPU5 | PASS | 
+| Integration/pcie/GPU6 | PASS | +| Integration/pcie/GPU7 | PASS | +| Integration/pcie/summary | PASS | +| Stress/targeted_stress/GPU0 | PASS | +| Stress/targeted_stress/GPU1 | PASS | +| Stress/targeted_stress/GPU2 | PASS | +| Stress/targeted_stress/GPU3 | PASS | +| Stress/targeted_stress/GPU4 | PASS | +| Stress/targeted_stress/GPU5 | PASS | +| Stress/targeted_stress/GPU6 | PASS | +| Stress/targeted_stress/GPU7 | PASS | +| Stress/targeted_stress/summary | PASS | +| Stress/targeted_power/GPU0 | PASS | +| Stress/targeted_power/GPU1 | PASS | +| Stress/targeted_power/GPU2 | PASS | +| Stress/targeted_power/GPU3 | PASS | +| Stress/targeted_power/GPU4 | PASS | +| Stress/targeted_power/GPU5 | PASS | +| Stress/targeted_power/GPU6 | PASS | +| Stress/targeted_power/GPU7 | PASS | +| Stress/targeted_power/summary | PASS | + +--- +*Generated by GPU Test Suite v0.2.0* \ No newline at end of file diff --git a/reports_nvbandwidth_aikubeworker0012.json b/reports_nvbandwidth_aikubeworker0012.json new file mode 100644 index 0000000..05a0587 --- /dev/null +++ b/reports_nvbandwidth_aikubeworker0012.json @@ -0,0 +1,70 @@ +{ + "benchmark": { + "memory": { + "source": "nvbandwidth", + "h2d_bandwidth_gbps": 55.5, + "d2h_bandwidth_gbps": 54.8, + "d2d_bandwidth_gbps": 0.0, + "h2d_peak_gbps": 64, + "d2h_peak_gbps": 64, + "d2d_peak_gbps": 450.0, + "h2d_efficiency_pct": 86.7, + "d2h_efficiency_pct": 85.6, + "d2d_efficiency_pct": null, + "peak_bandwidth_gbps": 3400, + "efficiency_pct": null, + "results_by_test": { + "h2d": 55.5, + "d2h": 54.8, + "d2d_write": 0.0, + "d2d_read": 0.0, + "d2d_bidir": 0.0 + }, + "per_gpu": [] + }, + "compute": { + "per_dtype_tflops": { + "fp32": 52.2, + "tf32": 360.7, + "fp16": 680.0, + "bf16": 707.6, + "fp8": 1142.4 + }, + "peak_tflops": { + "fp32": 67, + "tf32": 495, + "fp16": 990, + "bf16": 990, + "fp8": 1979 + }, + "efficiency_pct": { + "fp32": 77.9, + "tf32": 72.9, + "fp16": 68.7, + "bf16": 71.5, + "fp8": 57.7 + }, + "pass_thresholds_tflops": { + "fp32": 54, + 
"tf32": 444, + "fp16": 734, + "bf16": 745, + "fp8": 1400 + }, + "per_gpu": [ + { + "index": 0, + "fp32": 52.2, + "tf32": 360.7, + "fp16": 680.0, + "bf16": 707.6, + "fp8": 1142.4 + } + ], + "matrix_size": 8192, + "warmup": 50, + "iterations": 500 + } + }, + "timestamp": "2026-05-22T15:35:16.675924" +} \ No newline at end of file diff --git a/reports_nvbandwidth_aikubeworker0012.md b/reports_nvbandwidth_aikubeworker0012.md new file mode 100644 index 0000000..bf571ab --- /dev/null +++ b/reports_nvbandwidth_aikubeworker0012.md @@ -0,0 +1,38 @@ +# GPU Test Report + +- **Date:** 2026-05-22 15:37:12 +- **Host:** aikubeworker0012 + +## Summary + +| Test | Result | +|------|--------| +| Memory Bandwidth | FAIL (0.0%) | +| Compute Throughput | FAIL (worst TF32 361 vs >= 444) | + +## Memory Bandwidth + +Source: nvbandwidth + +| Metric | Value | Peak | Efficiency | +|--------|-------|------|------------| +| H2D (PCIe) | 55.5 GB/s | 64 GB/s | 86.7% | +| D2H (PCIe) | 54.8 GB/s | 64 GB/s | 85.6% | +| D2D (NVLink) | 0.0 GB/s | 450 GB/s | 0.0% | + +**Verdict: FAIL** (D2D efficiency 0.0%) + +## Compute Throughput + +| DType | Achieved (TFLOPS) | Peak | Threshold | Status | +|-------|-------------------|------|------------|--------| +| FP32 | 52.2 | 67 | >= 54 | WARN | +| TF32 | 360.7 | 495 | >= 444 | FAIL | +| FP16 | 680.0 | 990 | >= 734 | WARN | +| BF16 | 707.6 | 990 | >= 745 | WARN | +| FP8 | 1142.4 | 1979 | >= 1400 | FAIL | + +**Verdict: FAIL** (absolute TFLOPS thresholds; worst efficiency 57.7%) + +--- +*Generated by GPU Test Suite v0.2.0* \ No newline at end of file diff --git a/reports_nvbandwidth_aikubeworker0016.json b/reports_nvbandwidth_aikubeworker0016.json new file mode 100644 index 0000000..34ac61c --- /dev/null +++ b/reports_nvbandwidth_aikubeworker0016.json @@ -0,0 +1,70 @@ +{ + "benchmark": { + "memory": { + "source": "nvbandwidth", + "h2d_bandwidth_gbps": 55.5, + "d2h_bandwidth_gbps": 55.0, + "d2d_bandwidth_gbps": 0.0, + "h2d_peak_gbps": 64, + "d2h_peak_gbps": 64, + 
"d2d_peak_gbps": 450.0, + "h2d_efficiency_pct": 86.7, + "d2h_efficiency_pct": 85.9, + "d2d_efficiency_pct": null, + "peak_bandwidth_gbps": 3400, + "efficiency_pct": null, + "results_by_test": { + "h2d": 55.5, + "d2h": 55.0, + "d2d_write": 0.0, + "d2d_read": 0.0, + "d2d_bidir": 0.0 + }, + "per_gpu": [] + }, + "compute": { + "per_dtype_tflops": { + "fp32": 52.2, + "tf32": 357.5, + "fp16": 665.3, + "bf16": 697.1, + "fp8": 1138.8 + }, + "peak_tflops": { + "fp32": 67, + "tf32": 495, + "fp16": 990, + "bf16": 990, + "fp8": 1979 + }, + "efficiency_pct": { + "fp32": 77.9, + "tf32": 72.2, + "fp16": 67.2, + "bf16": 70.4, + "fp8": 57.5 + }, + "pass_thresholds_tflops": { + "fp32": 54, + "tf32": 444, + "fp16": 734, + "bf16": 745, + "fp8": 1400 + }, + "per_gpu": [ + { + "index": 0, + "fp32": 52.2, + "tf32": 357.5, + "fp16": 665.3, + "bf16": 697.1, + "fp8": 1138.8 + } + ], + "matrix_size": 8192, + "warmup": 50, + "iterations": 500 + } + }, + "timestamp": "2026-05-22T15:35:19.219299" +} \ No newline at end of file diff --git a/reports_nvbandwidth_aikubeworker0016.md b/reports_nvbandwidth_aikubeworker0016.md new file mode 100644 index 0000000..01320cf --- /dev/null +++ b/reports_nvbandwidth_aikubeworker0016.md @@ -0,0 +1,38 @@ +# GPU Test Report + +- **Date:** 2026-05-22 15:37:18 +- **Host:** aikubeworker0016 + +## Summary + +| Test | Result | +|------|--------| +| Memory Bandwidth | FAIL (0.0%) | +| Compute Throughput | FAIL (worst TF32 358 vs >= 444) | + +## Memory Bandwidth + +Source: nvbandwidth + +| Metric | Value | Peak | Efficiency | +|--------|-------|------|------------| +| H2D (PCIe) | 55.5 GB/s | 64 GB/s | 86.7% | +| D2H (PCIe) | 55.0 GB/s | 64 GB/s | 85.9% | +| D2D (NVLink) | 0.0 GB/s | 450 GB/s | 0.0% | + +**Verdict: FAIL** (D2D efficiency 0.0%) + +## Compute Throughput + +| DType | Achieved (TFLOPS) | Peak | Threshold | Status | +|-------|-------------------|------|------------|--------| +| FP32 | 52.2 | 67 | >= 54 | WARN | +| TF32 | 357.5 | 495 | >= 444 | FAIL | +| 
FP16 | 665.3 | 990 | >= 734 | WARN | +| BF16 | 697.1 | 990 | >= 745 | WARN | +| FP8 | 1138.8 | 1979 | >= 1400 | FAIL | + +**Verdict: FAIL** (absolute TFLOPS thresholds; worst efficiency 57.5%) + +--- +*Generated by GPU Test Suite v0.2.0* \ No newline at end of file diff --git a/reports_rdma_aikubeworker0012.json b/reports_rdma_aikubeworker0012.json new file mode 100644 index 0000000..93d7644 --- /dev/null +++ b/reports_rdma_aikubeworker0012.json @@ -0,0 +1,157 @@ +{ + "rdma": { + "passed": false, + "devices": [ + { + "name": "mlx5_0", + "ports": [ + { + "port": "1", + "rate": "400 Gb/sec (4X NDR)", + "state": "4: ACTIVE", + "phys_state": "5: LinkUp", + "gid": "fe80:0000:0000:0000:58a2:e103:0093:3898" + } + ] + }, + { + "name": "mlx5_1", + "ports": [ + { + "port": "1", + "rate": "400 Gb/sec (4X NDR)", + "state": "4: ACTIVE", + "phys_state": "5: LinkUp", + "gid": "fe80:0000:0000:0000:58a2:e103:0093:3db0" + } + ] + }, + { + "name": "mlx5_2", + "ports": [ + { + "port": "1", + "rate": "25 Gb/sec (1X EDR)", + "state": "4: ACTIVE", + "phys_state": "5: LinkUp", + "gid": "fe80:0000:0000:0000:5c3f:b8ff:fe5e:7832" + } + ] + }, + { + "name": "mlx5_3", + "ports": [ + { + "port": "1", + "rate": "25 Gb/sec (1X EDR)", + "state": "1: DOWN", + "phys_state": "3: Disabled", + "gid": "fe80:0000:0000:0000:5e25:73ff:fe4e:eac1" + } + ] + }, + { + "name": "mlx5_4", + "ports": [ + { + "port": "1", + "rate": "100 Gb/sec (2X HDR)", + "state": "4: ACTIVE", + "phys_state": "5: LinkUp", + "gid": "fe80:0000:0000:0000:9c63:c003:005f:63cc" + } + ] + }, + { + "name": "mlx5_5", + "ports": [ + { + "port": "1", + "rate": "100 Gb/sec (2X HDR)", + "state": "4: ACTIVE", + "phys_state": "5: LinkUp", + "gid": "fe80:0000:0000:0000:9c63:c003:005f:63cd" + } + ] + }, + { + "name": "mlx5_6", + "ports": [ + { + "port": "1", + "rate": "400 Gb/sec (4X NDR)", + "state": "4: ACTIVE", + "phys_state": "5: LinkUp", + "gid": "fe80:0000:0000:0000:58a2:e103:0093:3bf4" + } + ] + }, + { + "name": "mlx5_7", + "ports": [ + { + 
"port": "1", + "rate": "400 Gb/sec (4X NDR)", + "state": "4: ACTIVE", + "phys_state": "5: LinkUp", + "gid": "fe80:0000:0000:0000:58a2:e103:0093:3e28" + } + ] + }, + { + "name": "mlx5_8", + "ports": [ + { + "port": "1", + "rate": "25 Gb/sec (1X EDR)", + "state": "4: ACTIVE", + "phys_state": "5: LinkUp", + "gid": "fe80:0000:0000:0000:5c3f:b8ff:fe5e:7832" + } + ] + }, + { + "name": "mlx5_9", + "ports": [ + { + "port": "1", + "rate": "25 Gb/sec (1X EDR)", + "state": "1: DOWN", + "phys_state": "3: Disabled", + "gid": "fe80:0000:0000:0000:5e25:73ff:fe63:1717" + } + ] + } + ], + "bandwidth_tests": [ + { + "test": "ib_write_bw", + "status": "WARN", + "bandwidth_gbps": 0.13, + "min_required_gbps": 50 + }, + { + "test": "ib_read_bw", + "status": "WARN", + "bandwidth_gbps": 0.13, + "min_required_gbps": 50 + } + ], + "latency_tests": [ + { + "test": "ib_write_lat", + "status": "PASS", + "latency_us": 4.53, + "max_allowed_us": 10 + }, + { + "test": "ib_read_lat", + "status": "WARN", + "latency_us": 16.0, + "max_allowed_us": 10 + } + ], + "timestamp": "2026-05-22T15:41:20.534115" + }, + "timestamp": "2026-05-22T15:41:20.544589" +} \ No newline at end of file diff --git a/reports_rdma_aikubeworker0016.json b/reports_rdma_aikubeworker0016.json new file mode 100644 index 0000000..5e98f8a --- /dev/null +++ b/reports_rdma_aikubeworker0016.json @@ -0,0 +1,157 @@ +{ + "rdma": { + "passed": false, + "devices": [ + { + "name": "mlx5_0", + "ports": [ + { + "port": "1", + "rate": "400 Gb/sec (4X NDR)", + "state": "4: ACTIVE", + "phys_state": "5: LinkUp", + "gid": "fe80:0000:0000:0000:58a2:e103:0088:81e0" + } + ] + }, + { + "name": "mlx5_1", + "ports": [ + { + "port": "1", + "rate": "400 Gb/sec (4X NDR)", + "state": "4: ACTIVE", + "phys_state": "5: LinkUp", + "gid": "fe80:0000:0000:0000:9c63:c003:0054:e00a" + } + ] + }, + { + "name": "mlx5_2", + "ports": [ + { + "port": "1", + "rate": "25 Gb/sec (1X EDR)", + "state": "4: ACTIVE", + "phys_state": "5: LinkUp", + "gid": 
"fe80:0000:0000:0000:a02d:75ff:feae:2bcf" + } + ] + }, + { + "name": "mlx5_3", + "ports": [ + { + "port": "1", + "rate": "25 Gb/sec (1X EDR)", + "state": "1: DOWN", + "phys_state": "3: Disabled", + "gid": "fe80:0000:0000:0000:c670:bdff:fefd:5bd9" + } + ] + }, + { + "name": "mlx5_4", + "ports": [ + { + "port": "1", + "rate": "100 Gb/sec (2X HDR)", + "state": "4: ACTIVE", + "phys_state": "5: LinkUp", + "gid": "fe80:0000:0000:0000:9c63:c003:005f:58ec" + } + ] + }, + { + "name": "mlx5_5", + "ports": [ + { + "port": "1", + "rate": "100 Gb/sec (2X HDR)", + "state": "4: ACTIVE", + "phys_state": "5: LinkUp", + "gid": "fe80:0000:0000:0000:9c63:c003:005f:58ed" + } + ] + }, + { + "name": "mlx5_6", + "ports": [ + { + "port": "1", + "rate": "400 Gb/sec (4X NDR)", + "state": "4: ACTIVE", + "phys_state": "5: LinkUp", + "gid": "fe80:0000:0000:0000:9c63:c003:0055:0e56" + } + ] + }, + { + "name": "mlx5_7", + "ports": [ + { + "port": "1", + "rate": "400 Gb/sec (4X NDR)", + "state": "4: ACTIVE", + "phys_state": "5: LinkUp", + "gid": "fe80:0000:0000:0000:a088:c203:00f0:286c" + } + ] + }, + { + "name": "mlx5_8", + "ports": [ + { + "port": "1", + "rate": "25 Gb/sec (1X EDR)", + "state": "4: ACTIVE", + "phys_state": "5: LinkUp", + "gid": "fe80:0000:0000:0000:a02d:75ff:feae:2bcf" + } + ] + }, + { + "name": "mlx5_9", + "ports": [ + { + "port": "1", + "rate": "25 Gb/sec (1X EDR)", + "state": "1: DOWN", + "phys_state": "3: Disabled", + "gid": "fe80:0000:0000:0000:c670:bdff:fefd:569d" + } + ] + } + ], + "bandwidth_tests": [ + { + "test": "ib_write_bw", + "status": "WARN", + "bandwidth_gbps": 0.13, + "min_required_gbps": 50 + }, + { + "test": "ib_read_bw", + "status": "WARN", + "bandwidth_gbps": 0.13, + "min_required_gbps": 50 + } + ], + "latency_tests": [ + { + "test": "ib_write_lat", + "status": "PASS", + "latency_us": 4.22, + "max_allowed_us": 10 + }, + { + "test": "ib_read_lat", + "status": "WARN", + "latency_us": 16.0, + "max_allowed_us": 10 + } + ], + "timestamp": 
"2026-05-22T15:41:07.851101" + }, + "timestamp": "2026-05-22T15:41:07.861558" +} \ No newline at end of file diff --git a/reports_rdma_counter_aikubeworker0012_20260522_194808.md b/reports_rdma_counter_aikubeworker0012_20260522_194808.md new file mode 100644 index 0000000..f254bef --- /dev/null +++ b/reports_rdma_counter_aikubeworker0012_20260522_194808.md @@ -0,0 +1,62 @@ +# GPU Test Report + +- **Date:** 2026-05-22T19:48:26.622179 +- **Host:** aikubeworker0012 + +## Overall Acceptance Verdict + +**Result: FAIL** + +Failed or unverified items: +- RDMA: FAIL + +Missing required evidence: +- GPU Info +- Health Check +- Memory Bandwidth +- Compute Throughput +- NVLink/NVSwitch +- NCCL +- Stress Test +- DCGM +- Training + +## Summary + +| Test | Result | +|------|--------| +| RDMA | FAIL | + +## RDMA/InfiniBand + +### RDMA Port Checks + +| Device | Port | State | Rate | Required | Status | +|--------|------|-------|------|----------|--------| +| mlx5_0 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS | +| mlx5_1 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS | +| mlx5_4 | 1 | 4: ACTIVE | 100 Gb/sec (2X HDR) | >= 400Gbps ACTIVE | FAIL | +| mlx5_5 | 1 | 4: ACTIVE | 100 Gb/sec (2X HDR) | >= 400Gbps ACTIVE | FAIL | +| mlx5_6 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS | +| mlx5_7 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS | + +| Test | Value | Threshold | Status | +|------|-------|-----------|--------| +| ib_write_bw | 49.3 GB/s | >= 47 GB/s | PASS | +| ib_read_bw | 39.2 GB/s | >= 47 GB/s | FAIL | +| ib_write_lat | 4.49 us | <= 2 us | FAIL | +| ib_read_lat | 16.00 us | <= 3.5 us | FAIL | +| ibping | target=0x58 count=5 | 0% packet loss | PASS | + +- **PFC/ECN/CNP/congestion counters checked:** 146 +- **PFC/ECN/CNP/congestion non-zero:** no +- **Failure reasons:** + - mlx5_4 port 1 state/rate failed (4: ACTIVE, 100 Gb/sec (2X HDR); required >= 400.0Gbps ACTIVE) + - mlx5_5 port 1 state/rate failed 
(4: ACTIVE, 100 Gb/sec (2X HDR); required >= 400.0Gbps ACTIVE) + - ib_read_bw bandwidth 39.21GB/s < 47GB/s + - ib_write_lat latency 4.49us > 2.0us + - ib_read_lat latency 16.0us > 3.5us +**Overall: FAIL** + +--- +*Generated by GPU Test Suite v0.2.0* \ No newline at end of file diff --git a/reports_rdma_counter_aikubeworker0016_20260522_194828.md b/reports_rdma_counter_aikubeworker0016_20260522_194828.md new file mode 100644 index 0000000..a72f917 --- /dev/null +++ b/reports_rdma_counter_aikubeworker0016_20260522_194828.md @@ -0,0 +1,62 @@ +# GPU Test Report + +- **Date:** 2026-05-22T19:48:45.899570 +- **Host:** aikubeworker0016 + +## Overall Acceptance Verdict + +**Result: FAIL** + +Failed or unverified items: +- RDMA: FAIL + +Missing required evidence: +- GPU Info +- Health Check +- Memory Bandwidth +- Compute Throughput +- NVLink/NVSwitch +- NCCL +- Stress Test +- DCGM +- Training + +## Summary + +| Test | Result | +|------|--------| +| RDMA | FAIL | + +## RDMA/InfiniBand + +### RDMA Port Checks + +| Device | Port | State | Rate | Required | Status | +|--------|------|-------|------|----------|--------| +| mlx5_0 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS | +| mlx5_1 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS | +| mlx5_4 | 1 | 4: ACTIVE | 100 Gb/sec (2X HDR) | >= 400Gbps ACTIVE | FAIL | +| mlx5_5 | 1 | 4: ACTIVE | 100 Gb/sec (2X HDR) | >= 400Gbps ACTIVE | FAIL | +| mlx5_6 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS | +| mlx5_7 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS | + +| Test | Value | Threshold | Status | +|------|-------|-----------|--------| +| ib_write_bw | 48.1 GB/s | >= 47 GB/s | PASS | +| ib_read_bw | 40.3 GB/s | >= 47 GB/s | FAIL | +| ib_write_lat | 4.28 us | <= 2 us | FAIL | +| ib_read_lat | 16.00 us | <= 3.5 us | FAIL | +| ibping | target=0x4b count=5 | 0% packet loss | PASS | + +- **PFC/ECN/CNP/congestion counters checked:** 146 +- **PFC/ECN/CNP/congestion 
non-zero:** no +- **Failure reasons:** + - mlx5_4 port 1 state/rate failed (4: ACTIVE, 100 Gb/sec (2X HDR); required >= 400.0Gbps ACTIVE) + - mlx5_5 port 1 state/rate failed (4: ACTIVE, 100 Gb/sec (2X HDR); required >= 400.0Gbps ACTIVE) + - ib_read_bw bandwidth 40.3GB/s < 47GB/s + - ib_write_lat latency 4.28us > 2.0us + - ib_read_lat latency 16.0us > 3.5us +**Overall: FAIL** + +--- +*Generated by GPU Test Suite v0.2.0* \ No newline at end of file diff --git a/reports_rdma_cross_node_mlx5_0_20260523.md b/reports_rdma_cross_node_mlx5_0_20260523.md new file mode 100644 index 0000000..dfdfb8a --- /dev/null +++ b/reports_rdma_cross_node_mlx5_0_20260523.md @@ -0,0 +1,50 @@ +# RDMA Cross-node Evidence Report + +- **Date:** 2026-05-23 Asia/Shanghai +- **Scope:** `aikubeworker0012` <-> `aikubeworker0016`, single rail `mlx5_0`, port 1 +- **Client/server bootstrap IPs:** `172.72.8.12` and `172.72.8.16` +- **Bandwidth message size:** 4MB +- **Latency message size:** 8B +- **Iterations:** 1000 + +## Port Evidence + +| Host | Device | State | Rate | Link | LID | +|---|---|---|---|---|---| +| aikubeworker0012 | mlx5_0/1 | ACTIVE | 400 Gb/sec (4X NDR) | InfiniBand | 0x58 | +| aikubeworker0016 | mlx5_0/1 | ACTIVE | 400 Gb/sec (4X NDR) | InfiniBand | 0x4b | + +## Cross-node Perftest Results + +| Direction | Test | Value | PDF Threshold | Status | +|---|---|---:|---:|---| +| 0016 -> 0012 | ib_write_bw | 49.35 GB/s | >= 47 GB/s | PASS | +| 0016 -> 0012 | ib_read_bw | 44.36 GB/s | >= 47 GB/s | FAIL | +| 0016 -> 0012 | ib_write_lat avg | 2.17 us | <= 2.0 us | FAIL | +| 0016 -> 0012 | ib_read_lat avg | 4.05 us | <= 3.5 us | FAIL | +| 0012 -> 0016 | ib_write_bw | 48.38 GB/s | >= 47 GB/s | PASS | +| 0012 -> 0016 | ib_read_bw | 44.37 GB/s | >= 47 GB/s | FAIL | +| 0012 -> 0016 | ib_write_lat avg | 2.13 us | <= 2.0 us | FAIL | +| 0012 -> 0016 | ib_read_lat avg | 4.08 us | <= 3.5 us | FAIL | + +## Bidirectional ibping + +| Direction | Target LID | Result | +|---|---|---| +| 0016 -> 0012 | 0x58 
| 5 transmitted, 5 received, 0% packet loss; avg 0.005 ms | +| 0012 -> 0016 | 0x4b | 5 transmitted, 5 received, 0% packet loss; avg 0.005 ms | + +## Fabric Counters + +| Host | PFC/ECN/CNP/congestion Counters Checked | Non-zero Counters | Status | +|---|---:|---:|---| +| aikubeworker0012 | 146 | 0 | PASS | +| aikubeworker0016 | 146 | 0 | PASS | + +## Verdict + +**RDMA cross-node verdict: FAIL** + +Reason: bidirectional connectivity is good, PFC/ECN/CNP/congestion counters are clean, and write bandwidth passes. However read bandwidth is below 47 GB/s in both directions, write latency is slightly above 2.0 us in both directions, and read latency is above 3.5 us in both directions. + +Note: `modules/rdma_test.py` was corrected on 2026-05-23 to parse `ib_write_lat` / `ib_read_lat` `t_avg[usec]` rather than the 99.9 percentile column. Older reports that show `read_lat` around 16 us are therefore not the current parser output. diff --git a/reports_rdma_single_node_summary.md b/reports_rdma_single_node_summary.md new file mode 100644 index 0000000..c1c95de --- /dev/null +++ b/reports_rdma_single_node_summary.md @@ -0,0 +1,73 @@ +# Single-node RDMA/IB Report + +Generated: 2026-05-22 23:41 Asia/Shanghai + +Scope: project CLI `gpu_tester.py --test rdma --report --format json`, run separately on each host. + +Important note: the current repository RDMA test is single-node only. In `modules/rdma_test.py`, the perftest client connects to `localhost`, so this report validates local IB device discovery and local perftest behavior. It does not validate cross-node RDMA bandwidth between `aikubeworker0012` and `aikubeworker0016`. 
+ +## Summary + +| Host | Devices Found | Active 400G Ports | Active 100G Ports | Down Ports | Overall | +| --- | ---: | --- | --- | --- | --- | +| aikubeworker0012 / 172.72.8.12 | 10 | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | mlx5_4, mlx5_5 | mlx5_3, mlx5_9 | WARN | +| aikubeworker0016 / 172.72.8.16 | 10 | mlx5_0, mlx5_1, mlx5_6, mlx5_7 | mlx5_4, mlx5_5 | mlx5_3, mlx5_9 | WARN | + +## Bandwidth + +The bandwidth numbers below are from the repo's local `localhost` RDMA perftest path. + +| Host | ib_write_bw | Threshold | Status | ib_read_bw | Threshold | Status | +| --- | ---: | ---: | --- | ---: | ---: | --- | +| aikubeworker0012 | 0.13 GB/s | 50 GB/s | WARN | 0.13 GB/s | 50 GB/s | WARN | +| aikubeworker0016 | 0.13 GB/s | 50 GB/s | WARN | 0.13 GB/s | 50 GB/s | WARN | + +## Latency + +| Host | ib_write_lat | Limit | Status | ib_read_lat | Limit | Status | +| --- | ---: | ---: | --- | ---: | ---: | --- | +| aikubeworker0012 | 4.53 us | 10 us | PASS | 16.00 us | 10 us | WARN | +| aikubeworker0016 | 4.22 us | 10 us | PASS | 16.00 us | 10 us | WARN | + +## Device Inventory + +### aikubeworker0012 + +| Device | Port | State | Physical State | Rate | +| --- | --- | --- | --- | --- | +| mlx5_0 | 1 | ACTIVE | LinkUp | 400 Gb/sec (4X NDR) | +| mlx5_1 | 1 | ACTIVE | LinkUp | 400 Gb/sec (4X NDR) | +| mlx5_2 | 1 | ACTIVE | LinkUp | 25 Gb/sec (1X EDR) | +| mlx5_3 | 1 | DOWN | Disabled | 25 Gb/sec (1X EDR) | +| mlx5_4 | 1 | ACTIVE | LinkUp | 100 Gb/sec (2X HDR) | +| mlx5_5 | 1 | ACTIVE | LinkUp | 100 Gb/sec (2X HDR) | +| mlx5_6 | 1 | ACTIVE | LinkUp | 400 Gb/sec (4X NDR) | +| mlx5_7 | 1 | ACTIVE | LinkUp | 400 Gb/sec (4X NDR) | +| mlx5_8 | 1 | ACTIVE | LinkUp | 25 Gb/sec (1X EDR) | +| mlx5_9 | 1 | DOWN | Disabled | 25 Gb/sec (1X EDR) | + +### aikubeworker0016 + +| Device | Port | State | Physical State | Rate | +| --- | --- | --- | --- | --- | +| mlx5_0 | 1 | ACTIVE | LinkUp | 400 Gb/sec (4X NDR) | +| mlx5_1 | 1 | ACTIVE | LinkUp | 400 Gb/sec (4X NDR) | +| mlx5_2 | 1 | ACTIVE | LinkUp | 
25 Gb/sec (1X EDR) | +| mlx5_3 | 1 | DOWN | Disabled | 25 Gb/sec (1X EDR) | +| mlx5_4 | 1 | ACTIVE | LinkUp | 100 Gb/sec (2X HDR) | +| mlx5_5 | 1 | ACTIVE | LinkUp | 100 Gb/sec (2X HDR) | +| mlx5_6 | 1 | ACTIVE | LinkUp | 400 Gb/sec (4X NDR) | +| mlx5_7 | 1 | ACTIVE | LinkUp | 400 Gb/sec (4X NDR) | +| mlx5_8 | 1 | ACTIVE | LinkUp | 25 Gb/sec (1X EDR) | +| mlx5_9 | 1 | DOWN | Disabled | 25 Gb/sec (1X EDR) | + +## Files + +Raw JSON: + +- `reports_rdma_aikubeworker0012.json` +- `reports_rdma_aikubeworker0016.json` + +Markdown summary: + +- `reports_rdma_single_node_summary.md` diff --git a/reports_single_gpu_aikubeworker0012.json b/reports_single_gpu_aikubeworker0012.json new file mode 100644 index 0000000..6cc5a37 --- /dev/null +++ b/reports_single_gpu_aikubeworker0012.json @@ -0,0 +1,292 @@ +{ + "timestamp": "2026-05-22T15:26:26.973586", + "gpu_info": { + "driver_version": "580.159.03", + "cuda_version": "13.0", + "gpu_count": 8, + "gpus": [ + { + "index": 0, + "name": "NVIDIA H100 80GB HBM3", + "uuid": "GPU-7658c03c-7659-9886-041e-545c21d53e12", + "pci_bus_id": "00000000:18:00.0", + "pcie_link_gen": 5, + "pcie_link_width": 16, + "vram_total_mb": 81559, + "vram_used_mb": 4, + "vram_free_mb": 81076, + "power_draw": 69.72, + "power_limit": 700.0, + "clock_sm": 345, + "clock_mem": 2619, + "temperature": 25, + "fan_speed": 0, + "persistence_mode": false, + "compute_mode": "Default", + "serial_number": "1654923030411", + "ecc_errors_single": 0, + "ecc_errors_double": 0 + }, + { + "index": 1, + "name": "NVIDIA H100 80GB HBM3", + "uuid": "GPU-6392d40b-893b-9fc2-4284-a3f1d8c4d7f1", + "pci_bus_id": "00000000:2A:00.0", + "pcie_link_gen": 5, + "pcie_link_width": 16, + "vram_total_mb": 81559, + "vram_used_mb": 0, + "vram_free_mb": 81079, + "power_draw": 73.17, + "power_limit": 700.0, + "clock_sm": 345, + "clock_mem": 2619, + "temperature": 25, + "fan_speed": 0, + "persistence_mode": false, + "compute_mode": "Default", + "serial_number": "1654724063165", + "ecc_errors_single": 
0, + "ecc_errors_double": 0 + }, + { + "index": 2, + "name": "NVIDIA H100 80GB HBM3", + "uuid": "GPU-2ae38735-10de-fb0b-fb20-9d1b5b434558", + "pci_bus_id": "00000000:3A:00.0", + "pcie_link_gen": 5, + "pcie_link_width": 16, + "vram_total_mb": 81559, + "vram_used_mb": 0, + "vram_free_mb": 81079, + "power_draw": 68.71, + "power_limit": 700.0, + "clock_sm": 345, + "clock_mem": 2619, + "temperature": 26, + "fan_speed": 0, + "persistence_mode": false, + "compute_mode": "Default", + "serial_number": "1654823036530", + "ecc_errors_single": 0, + "ecc_errors_double": 0 + }, + { + "index": 3, + "name": "NVIDIA H100 80GB HBM3", + "uuid": "GPU-ec62123f-0c48-6dbd-49e4-8b231b3fed0e", + "pci_bus_id": "00000000:5D:00.0", + "pcie_link_gen": 5, + "pcie_link_width": 16, + "vram_total_mb": 81559, + "vram_used_mb": 0, + "vram_free_mb": 81079, + "power_draw": 69.73, + "power_limit": 700.0, + "clock_sm": 345, + "clock_mem": 2619, + "temperature": 25, + "fan_speed": 0, + "persistence_mode": false, + "compute_mode": "Default", + "serial_number": "1654923021638", + "ecc_errors_single": 0, + "ecc_errors_double": 0 + }, + { + "index": 4, + "name": "NVIDIA H100 80GB HBM3", + "uuid": "GPU-b64fc270-109e-1543-fb0c-be7feecf14f1", + "pci_bus_id": "00000000:9A:00.0", + "pcie_link_gen": 5, + "pcie_link_width": 16, + "vram_total_mb": 81559, + "vram_used_mb": 0, + "vram_free_mb": 81079, + "power_draw": 68.84, + "power_limit": 700.0, + "clock_sm": 345, + "clock_mem": 2619, + "temperature": 24, + "fan_speed": 0, + "persistence_mode": false, + "compute_mode": "Default", + "serial_number": "1655023033179", + "ecc_errors_single": 0, + "ecc_errors_double": 0 + }, + { + "index": 5, + "name": "NVIDIA H100 80GB HBM3", + "uuid": "GPU-15ab7baf-9010-7cf3-5462-eeb09f8dbe65", + "pci_bus_id": "00000000:AB:00.0", + "pcie_link_gen": 5, + "pcie_link_width": 16, + "vram_total_mb": 81559, + "vram_used_mb": 0, + "vram_free_mb": 81079, + "power_draw": 69.94, + "power_limit": 700.0, + "clock_sm": 345, + "clock_mem": 2619, + 
"temperature": 27, + "fan_speed": 0, + "persistence_mode": false, + "compute_mode": "Default", + "serial_number": "1655023034225", + "ecc_errors_single": 0, + "ecc_errors_double": 0 + }, + { + "index": 6, + "name": "NVIDIA H100 80GB HBM3", + "uuid": "GPU-225f6f3c-6fef-d1e2-5428-d90f665fb3d3", + "pci_bus_id": "00000000:BA:00.0", + "pcie_link_gen": 5, + "pcie_link_width": 16, + "vram_total_mb": 81559, + "vram_used_mb": 0, + "vram_free_mb": 81079, + "power_draw": 70.46, + "power_limit": 700.0, + "clock_sm": 345, + "clock_mem": 2619, + "temperature": 25, + "fan_speed": 0, + "persistence_mode": false, + "compute_mode": "Default", + "serial_number": "1654923078278", + "ecc_errors_single": 0, + "ecc_errors_double": 0 + }, + { + "index": 7, + "name": "NVIDIA H100 80GB HBM3", + "uuid": "GPU-79aeb6a8-c00c-6edb-956f-779ef56950a3", + "pci_bus_id": "00000000:DB:00.0", + "pcie_link_gen": 5, + "pcie_link_width": 16, + "vram_total_mb": 81559, + "vram_used_mb": 0, + "vram_free_mb": 81079, + "power_draw": 71.76, + "power_limit": 700.0, + "clock_sm": 345, + "clock_mem": 2619, + "temperature": 24, + "fan_speed": 0, + "persistence_mode": false, + "compute_mode": "Default", + "serial_number": "1654024031464", + "ecc_errors_single": 0, + "ecc_errors_double": 0 + } + ], + "topology": "\t\u001b[4mGPU0\tGPU1\tGPU2\tGPU3\tGPU4\tGPU5\tGPU6\tGPU7\tNIC0\tNIC1\tNIC2\tNIC3\tNIC4\tNIC5\tNIC6\tNIC7\tNIC8\tNIC9\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\u001b[0m\nGPU0\t X \tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\tPIX\tNODE\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t0-55,112-167\t0\t\tN/A\nGPU1\tNV18\t X \tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\tNODE\tPIX\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t0-55,112-167\t0\t\tN/A\nGPU2\tNV18\tNV18\t X \tNV18\tNV18\tNV18\tNV18\tNV18\tNODE\tNODE\tPIX\tPIX\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t0-55,112-167\t0\t\tN/A\nGPU3\tNV18\tNV18\tNV18\t X 
\tNV18\tNV18\tNV18\tNV18\tNODE\tNODE\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t0-55,112-167\t0\t\tN/A\nGPU4\tNV18\tNV18\tNV18\tNV18\t X \tNV18\tNV18\tNV18\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tPIX\tNODE\tNODE\tNODE\t56-111,168-223\t1\t\tN/A\nGPU5\tNV18\tNV18\tNV18\tNV18\tNV18\t X \tNV18\tNV18\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\tPIX\tNODE\tNODE\t56-111,168-223\t1\t\tN/A\nGPU6\tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\t X \tNV18\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tPIX\tPIX\t56-111,168-223\t1\t\tN/A\nGPU7\tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\t X \tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tNODE\tNODE\t56-111,168-223\t1\t\tN/A\nNIC0\tPIX\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t X \tNODE\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC1\tNODE\tPIX\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tNODE\t X \tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC2\tNODE\tNODE\tPIX\tNODE\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\t X \tPIX\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC3\tNODE\tNODE\tPIX\tNODE\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tPIX\t X \tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC4\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tNODE\tNODE\t X \tPIX\tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC5\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tNODE\tNODE\tPIX\t X \tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC6\tSYS\tSYS\tSYS\tSYS\tPIX\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\t X \tNODE\tNODE\tNODE\t\t\t\t\nNIC7\tSYS\tSYS\tSYS\tSYS\tNODE\tPIX\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\t X \tNODE\tNODE\t\t\t\t\nNIC8\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tPIX\tNODE\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\t X \tPIX\t\t\t\t\nNIC9\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tPIX\tNODE\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection 
traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n NIC4: mlx5_4\n NIC5: mlx5_5\n NIC6: mlx5_6\n NIC7: mlx5_7\n NIC8: mlx5_8\n NIC9: mlx5_9\n\n", + "timestamp": "2026-05-22T15:26:34.187409", + "detected_gpu_type": "h100", + "gpu_label": "H100 SXM5" + }, + "memory_bench": { + "memory": { + "source": "pytorch", + "h2d_bandwidth_gbps": 11.8, + "d2h_bandwidth_gbps": 9.9, + "d2d_bandwidth_gbps": 829.1, + "peak_bandwidth_gbps": 3400, + "efficiency_pct": 24.4, + "test_sizes_mb": [ + 1, + 4, + 16, + 64, + 256, + 1024, + 4096 + ], + "bandwidth_by_size": { + "1": { + "h2d_gbps": 3.8, + "d2h_gbps": 1.4, + "d2d_gbps": 40.6 + }, + "4": { + "h2d_gbps": 7.6, + "d2h_gbps": 9.9, + "d2d_gbps": 141.5 + }, + "16": { + "h2d_gbps": 11.0, + "d2h_gbps": 1.9, + "d2d_gbps": 450.3 + }, + "64": { + "h2d_gbps": 11.8, + "d2h_gbps": 1.4, + "d2d_gbps": 726.5 + }, + "256": { + "h2d_gbps": 9.0, + "d2h_gbps": 1.4, + "d2d_gbps": 793.8 + }, + "1024": { + "h2d_gbps": 5.5, + "d2h_gbps": 1.4, + "d2d_gbps": 821.2 + }, + "4096": { + "h2d_gbps": 5.9, + "d2h_gbps": 1.4, + "d2d_gbps": 829.1 + } + }, + "per_gpu": [] + } + }, + "compute_bench": { + "compute": { + "per_dtype_tflops": { + "fp32": 52.0, + "tf32": 362.3, + "fp16": 691.0, + "bf16": 713.0, + "fp8": 1148.8 + }, + "peak_tflops": { + "fp32": 67, + "tf32": 495, + "fp16": 990, + "bf16": 990, + "fp8": 1979 + }, + "efficiency_pct": { + "fp32": 77.6, + "tf32": 73.2, + "fp16": 69.8, + "bf16": 72.0, + "fp8": 58.0 + }, + "pass_thresholds_tflops": { + "fp32": 54, + "tf32": 444, + "fp16": 734, + "bf16": 745, + "fp8": 1400 + }, + "per_gpu": [ + { + "index": 0, + "fp32": 52.0, + "tf32": 362.3, + "fp16": 691.0, + "bf16": 713.0, + "fp8": 1148.8 + } + ], + "matrix_size": 
8192, + "warmup": 50, + "iterations": 500 + } + } +} \ No newline at end of file diff --git a/reports_single_gpu_aikubeworker0012.md b/reports_single_gpu_aikubeworker0012.md new file mode 100644 index 0000000..3a6c3c9 --- /dev/null +++ b/reports_single_gpu_aikubeworker0012.md @@ -0,0 +1,54 @@ +# GPU Test Report + +- **Date:** 2026-05-22 15:27:51 +- **Host:** aikubeworker0012 +- **GPU:** NVIDIA H100 80GB HBM3 x8 +- **Driver:** 580.159.03 | **CUDA:** 13.0 + +## Summary + +| Test | Result | +|------|--------| +| GPU Info | PASS (8 GPUs detected) | +| Memory Bandwidth | WARN (829 GB/s via PyTorch fallback) | +| Compute Throughput | FAIL (worst TF32 362 vs >= 444) | + +## GPU Information + +| GPU | Model | VRAM | Temp | Power | SM Clock | +|-----|-------|------|------|-------|----------| +| 0 | NVIDIA H100 80GB HBM3 | 81559 MB | 25C | 70/700W | 345 MHz | +| 1 | NVIDIA H100 80GB HBM3 | 81559 MB | 25C | 73/700W | 345 MHz | +| 2 | NVIDIA H100 80GB HBM3 | 81559 MB | 26C | 69/700W | 345 MHz | +| 3 | NVIDIA H100 80GB HBM3 | 81559 MB | 25C | 70/700W | 345 MHz | +| 4 | NVIDIA H100 80GB HBM3 | 81559 MB | 24C | 69/700W | 345 MHz | +| 5 | NVIDIA H100 80GB HBM3 | 81559 MB | 27C | 70/700W | 345 MHz | +| 6 | NVIDIA H100 80GB HBM3 | 81559 MB | 25C | 70/700W | 345 MHz | +| 7 | NVIDIA H100 80GB HBM3 | 81559 MB | 24C | 72/700W | 345 MHz | + +## Memory Bandwidth + +Source: pytorch + +| Metric | Value | Peak | Efficiency | +|--------|-------|------|------------| +| H2D (PCIe) | 11.8 GB/s | 0 GB/s | 0.0% | +| D2H (PCIe) | 9.9 GB/s | 0 GB/s | 0.0% | +| D2D (NVLink) | 829.1 GB/s | 3400 GB/s | 24.4% | + +**Verdict: WARN** (D2D 829.1 GB/s via PyTorch fallback; nvbandwidth unavailable — figure is indicative only, not a true HBM peak) + +## Compute Throughput + +| DType | Achieved (TFLOPS) | Peak | Threshold | Status | +|-------|-------------------|------|------------|--------| +| FP32 | 52.0 | 67 | >= 54 | WARN | +| TF32 | 362.3 | 495 | >= 444 | FAIL | +| FP16 | 691.0 | 990 | >= 734 | WARN | +| 
BF16 | 713.0 | 990 | >= 745 | WARN | +| FP8 | 1148.8 | 1979 | >= 1400 | FAIL | + +**Verdict: FAIL** (absolute TFLOPS thresholds; worst efficiency 58.0%) + +--- +*Generated by GPU Test Suite v0.2.0* \ No newline at end of file diff --git a/reports_single_gpu_aikubeworker0016.json b/reports_single_gpu_aikubeworker0016.json new file mode 100644 index 0000000..4b3c442 --- /dev/null +++ b/reports_single_gpu_aikubeworker0016.json @@ -0,0 +1,292 @@ +{ + "timestamp": "2026-05-22T15:26:29.511252", + "gpu_info": { + "driver_version": "580.159.03", + "cuda_version": "13.0", + "gpu_count": 8, + "gpus": [ + { + "index": 0, + "name": "NVIDIA H100 80GB HBM3", + "uuid": "GPU-dfbc9513-255d-4fe7-2b77-7b1ec3972e75", + "pci_bus_id": "00000000:18:00.0", + "pcie_link_gen": 5, + "pcie_link_width": 16, + "vram_total_mb": 81559, + "vram_used_mb": 4, + "vram_free_mb": 81076, + "power_draw": 69.81, + "power_limit": 700.0, + "clock_sm": 345, + "clock_mem": 2619, + "temperature": 20, + "fan_speed": 0, + "persistence_mode": false, + "compute_mode": "Default", + "serial_number": "1651924016120", + "ecc_errors_single": 0, + "ecc_errors_double": 0 + }, + { + "index": 1, + "name": "NVIDIA H100 80GB HBM3", + "uuid": "GPU-bb845ef7-d7b5-f011-9395-ea74274e2282", + "pci_bus_id": "00000000:2A:00.0", + "pcie_link_gen": 5, + "pcie_link_width": 16, + "vram_total_mb": 81559, + "vram_used_mb": 0, + "vram_free_mb": 81079, + "power_draw": 67.45, + "power_limit": 700.0, + "clock_sm": 345, + "clock_mem": 2619, + "temperature": 20, + "fan_speed": 0, + "persistence_mode": false, + "compute_mode": "Default", + "serial_number": "1651924015483", + "ecc_errors_single": 0, + "ecc_errors_double": 0 + }, + { + "index": 2, + "name": "NVIDIA H100 80GB HBM3", + "uuid": "GPU-3720cf13-2a34-be38-27be-0a7adc4addc4", + "pci_bus_id": "00000000:3A:00.0", + "pcie_link_gen": 5, + "pcie_link_width": 16, + "vram_total_mb": 81559, + "vram_used_mb": 0, + "vram_free_mb": 81079, + "power_draw": 66.69, + "power_limit": 700.0, + "clock_sm": 
345, + "clock_mem": 2619, + "temperature": 21, + "fan_speed": 0, + "persistence_mode": false, + "compute_mode": "Default", + "serial_number": "1651924025595", + "ecc_errors_single": 0, + "ecc_errors_double": 0 + }, + { + "index": 3, + "name": "NVIDIA H100 80GB HBM3", + "uuid": "GPU-87080b2d-ac43-be0d-d574-c193078850ae", + "pci_bus_id": "00000000:5D:00.0", + "pcie_link_gen": 5, + "pcie_link_width": 16, + "vram_total_mb": 81559, + "vram_used_mb": 0, + "vram_free_mb": 81079, + "power_draw": 66.86, + "power_limit": 700.0, + "clock_sm": 345, + "clock_mem": 2619, + "temperature": 20, + "fan_speed": 0, + "persistence_mode": false, + "compute_mode": "Default", + "serial_number": "1651924016862", + "ecc_errors_single": 0, + "ecc_errors_double": 0 + }, + { + "index": 4, + "name": "NVIDIA H100 80GB HBM3", + "uuid": "GPU-599bd883-cc5c-a5dd-6c33-c15f7049da48", + "pci_bus_id": "00000000:9A:00.0", + "pcie_link_gen": 5, + "pcie_link_width": 16, + "vram_total_mb": 81559, + "vram_used_mb": 0, + "vram_free_mb": 81079, + "power_draw": 67.07, + "power_limit": 700.0, + "clock_sm": 345, + "clock_mem": 2619, + "temperature": 20, + "fan_speed": 0, + "persistence_mode": false, + "compute_mode": "Default", + "serial_number": "1651924025670", + "ecc_errors_single": 0, + "ecc_errors_double": 0 + }, + { + "index": 5, + "name": "NVIDIA H100 80GB HBM3", + "uuid": "GPU-a1c6bba4-61b0-e623-06c9-9c88635e26fe", + "pci_bus_id": "00000000:AB:00.0", + "pcie_link_gen": 5, + "pcie_link_width": 16, + "vram_total_mb": 81559, + "vram_used_mb": 0, + "vram_free_mb": 81079, + "power_draw": 69.12, + "power_limit": 700.0, + "clock_sm": 345, + "clock_mem": 2619, + "temperature": 22, + "fan_speed": 0, + "persistence_mode": false, + "compute_mode": "Default", + "serial_number": "1651924027166", + "ecc_errors_single": 0, + "ecc_errors_double": 0 + }, + { + "index": 6, + "name": "NVIDIA H100 80GB HBM3", + "uuid": "GPU-98745a0c-39bd-3e56-d6ca-54ba3647ab6d", + "pci_bus_id": "00000000:BA:00.0", + "pcie_link_gen": 5, + 
"pcie_link_width": 16, + "vram_total_mb": 81559, + "vram_used_mb": 0, + "vram_free_mb": 81079, + "power_draw": 67.61, + "power_limit": 700.0, + "clock_sm": 345, + "clock_mem": 2619, + "temperature": 20, + "fan_speed": 0, + "persistence_mode": false, + "compute_mode": "Default", + "serial_number": "1651924026234", + "ecc_errors_single": 0, + "ecc_errors_double": 0 + }, + { + "index": 7, + "name": "NVIDIA H100 80GB HBM3", + "uuid": "GPU-8c73bd8b-666b-357e-ac5d-c75ac7a759db", + "pci_bus_id": "00000000:DB:00.0", + "pcie_link_gen": 5, + "pcie_link_width": 16, + "vram_total_mb": 81559, + "vram_used_mb": 0, + "vram_free_mb": 81079, + "power_draw": 66.19, + "power_limit": 700.0, + "clock_sm": 345, + "clock_mem": 2619, + "temperature": 20, + "fan_speed": 0, + "persistence_mode": false, + "compute_mode": "Default", + "serial_number": "1651924027255", + "ecc_errors_single": 0, + "ecc_errors_double": 0 + } + ], + "topology": "\t\u001b[4mGPU0\tGPU1\tGPU2\tGPU3\tGPU4\tGPU5\tGPU6\tGPU7\tNIC0\tNIC1\tNIC2\tNIC3\tNIC4\tNIC5\tNIC6\tNIC7\tNIC8\tNIC9\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\u001b[0m\nGPU0\t X \tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\tPIX\tNODE\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t0-55,112-167\t0\t\tN/A\nGPU1\tNV18\t X \tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\tNODE\tPIX\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t0-55,112-167\t0\t\tN/A\nGPU2\tNV18\tNV18\t X \tNV18\tNV18\tNV18\tNV18\tNV18\tNODE\tNODE\tPIX\tPIX\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t0-55,112-167\t0\t\tN/A\nGPU3\tNV18\tNV18\tNV18\t X \tNV18\tNV18\tNV18\tNV18\tNODE\tNODE\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t0-55,112-167\t0\t\tN/A\nGPU4\tNV18\tNV18\tNV18\tNV18\t X \tNV18\tNV18\tNV18\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tPIX\tNODE\tNODE\tNODE\t56-111,168-223\t1\t\tN/A\nGPU5\tNV18\tNV18\tNV18\tNV18\tNV18\t X \tNV18\tNV18\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\tPIX\tNODE\tNODE\t56-111,168-223\t1\t\tN/A\nGPU6\tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\t X 
\tNV18\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tPIX\tPIX\t56-111,168-223\t1\t\tN/A\nGPU7\tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\t X \tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tNODE\tNODE\t56-111,168-223\t1\t\tN/A\nNIC0\tPIX\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t X \tNODE\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC1\tNODE\tPIX\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tNODE\t X \tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC2\tNODE\tNODE\tPIX\tNODE\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\t X \tPIX\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC3\tNODE\tNODE\tPIX\tNODE\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tPIX\t X \tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC4\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tNODE\tNODE\t X \tPIX\tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC5\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tNODE\tNODE\tPIX\t X \tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC6\tSYS\tSYS\tSYS\tSYS\tPIX\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\t X \tNODE\tNODE\tNODE\t\t\t\t\nNIC7\tSYS\tSYS\tSYS\tSYS\tNODE\tPIX\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\t X \tNODE\tNODE\t\t\t\t\nNIC8\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tPIX\tNODE\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\t X \tPIX\t\t\t\t\nNIC9\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tPIX\tNODE\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n NIC4: mlx5_4\n NIC5: mlx5_5\n NIC6: mlx5_6\n NIC7: mlx5_7\n NIC8: 
mlx5_8\n NIC9: mlx5_9\n\n", + "timestamp": "2026-05-22T15:26:36.627805", + "detected_gpu_type": "h100", + "gpu_label": "H100 SXM5" + }, + "memory_bench": { + "memory": { + "source": "pytorch", + "h2d_bandwidth_gbps": 11.8, + "d2h_bandwidth_gbps": 10.1, + "d2d_bandwidth_gbps": 829.0, + "peak_bandwidth_gbps": 3400, + "efficiency_pct": 24.4, + "test_sizes_mb": [ + 1, + 4, + 16, + 64, + 256, + 1024, + 4096 + ], + "bandwidth_by_size": { + "1": { + "h2d_gbps": 3.6, + "d2h_gbps": 1.4, + "d2d_gbps": 40.3 + }, + "4": { + "h2d_gbps": 7.7, + "d2h_gbps": 10.1, + "d2d_gbps": 159.5 + }, + "16": { + "h2d_gbps": 10.9, + "d2h_gbps": 1.9, + "d2d_gbps": 439.5 + }, + "64": { + "h2d_gbps": 11.8, + "d2h_gbps": 1.4, + "d2d_gbps": 740.5 + }, + "256": { + "h2d_gbps": 9.0, + "d2h_gbps": 1.4, + "d2d_gbps": 792.1 + }, + "1024": { + "h2d_gbps": 8.4, + "d2h_gbps": 1.4, + "d2d_gbps": 818.9 + }, + "4096": { + "h2d_gbps": 6.1, + "d2h_gbps": 1.4, + "d2d_gbps": 829.0 + } + }, + "per_gpu": [] + } + }, + "compute_bench": { + "compute": { + "per_dtype_tflops": { + "fp32": 51.9, + "tf32": 357.8, + "fp16": 667.2, + "bf16": 699.1, + "fp8": 1146.2 + }, + "peak_tflops": { + "fp32": 67, + "tf32": 495, + "fp16": 990, + "bf16": 990, + "fp8": 1979 + }, + "efficiency_pct": { + "fp32": 77.5, + "tf32": 72.3, + "fp16": 67.4, + "bf16": 70.6, + "fp8": 57.9 + }, + "pass_thresholds_tflops": { + "fp32": 54, + "tf32": 444, + "fp16": 734, + "bf16": 745, + "fp8": 1400 + }, + "per_gpu": [ + { + "index": 0, + "fp32": 51.9, + "tf32": 357.8, + "fp16": 667.2, + "bf16": 699.1, + "fp8": 1146.2 + } + ], + "matrix_size": 8192, + "warmup": 50, + "iterations": 500 + } + } +} \ No newline at end of file diff --git a/reports_single_gpu_aikubeworker0016.md b/reports_single_gpu_aikubeworker0016.md new file mode 100644 index 0000000..49f9f45 --- /dev/null +++ b/reports_single_gpu_aikubeworker0016.md @@ -0,0 +1,54 @@ +# GPU Test Report + +- **Date:** 2026-05-22 15:27:53 +- **Host:** aikubeworker0016 +- **GPU:** NVIDIA H100 80GB HBM3 x8 +- 
**Driver:** 580.159.03 | **CUDA:** 13.0 + +## Summary + +| Test | Result | +|------|--------| +| GPU Info | PASS (8 GPUs detected) | +| Memory Bandwidth | WARN (829 GB/s via PyTorch fallback) | +| Compute Throughput | FAIL (worst TF32 358 vs >= 444) | + +## GPU Information + +| GPU | Model | VRAM | Temp | Power | SM Clock | +|-----|-------|------|------|-------|----------| +| 0 | NVIDIA H100 80GB HBM3 | 81559 MB | 20C | 70/700W | 345 MHz | +| 1 | NVIDIA H100 80GB HBM3 | 81559 MB | 20C | 67/700W | 345 MHz | +| 2 | NVIDIA H100 80GB HBM3 | 81559 MB | 21C | 67/700W | 345 MHz | +| 3 | NVIDIA H100 80GB HBM3 | 81559 MB | 20C | 67/700W | 345 MHz | +| 4 | NVIDIA H100 80GB HBM3 | 81559 MB | 20C | 67/700W | 345 MHz | +| 5 | NVIDIA H100 80GB HBM3 | 81559 MB | 22C | 69/700W | 345 MHz | +| 6 | NVIDIA H100 80GB HBM3 | 81559 MB | 20C | 68/700W | 345 MHz | +| 7 | NVIDIA H100 80GB HBM3 | 81559 MB | 20C | 66/700W | 345 MHz | + +## Memory Bandwidth + +Source: pytorch + +| Metric | Value | Peak | Efficiency | +|--------|-------|------|------------| +| H2D (PCIe) | 11.8 GB/s | 0 GB/s | 0.0% | +| D2H (PCIe) | 10.1 GB/s | 0 GB/s | 0.0% | +| D2D (NVLink) | 829.0 GB/s | 3400 GB/s | 24.4% | + +**Verdict: WARN** (D2D 829.0 GB/s via PyTorch fallback; nvbandwidth unavailable — figure is indicative only, not a true HBM peak) + +## Compute Throughput + +| DType | Achieved (TFLOPS) | Peak | Threshold | Status | +|-------|-------------------|------|------------|--------| +| FP32 | 51.9 | 67 | >= 54 | WARN | +| TF32 | 357.8 | 495 | >= 444 | FAIL | +| FP16 | 667.2 | 990 | >= 734 | WARN | +| BF16 | 699.1 | 990 | >= 745 | WARN | +| FP8 | 1146.2 | 1979 | >= 1400 | FAIL | + +**Verdict: FAIL** (absolute TFLOPS thresholds; worst efficiency 57.9%) + +--- +*Generated by GPU Test Suite v0.2.0* \ No newline at end of file diff --git a/reports_stress_smoke_reasons_aikubeworker0012.json b/reports_stress_smoke_reasons_aikubeworker0012.json new file mode 100644 index 0000000..2722c96 --- /dev/null +++ 
b/reports_stress_smoke_reasons_aikubeworker0012.json @@ -0,0 +1,165 @@ +{ + "stress": { + "source": "pytorch", + "passed": false, + "duration_sec": 45, + "elapsed_sec": 45.4, + "gpu_status": { + "0": "PASS", + "1": "PASS", + "2": "PASS", + "3": "PASS", + "4": "PASS", + "5": "PASS", + "6": "PASS", + "7": "PASS" + }, + "telemetry": { + "passed": false, + "samples": 39, + "steady_samples": 31, + "warmup_sec": 9.0, + "max_temp_c": { + "0": 59.0, + "1": 58.0, + "2": 65.0, + "3": 54.0, + "4": 59.0, + "5": 66.0, + "6": 62.0, + "7": 55.0 + }, + "avg_power_w": { + "0": 697.0, + "1": 697.4, + "2": 697.9, + "3": 698.0, + "4": 697.8, + "5": 697.6, + "6": 697.9, + "7": 698.2 + }, + "temp_delta_c": 12.0, + "throttle_events": [ + { + "gpu": 0, + "throttle": "0x0000000000000004", + "real_throttle": "0x4" + }, + { + "gpu": 1, + "throttle": "0x0000000000000004", + "real_throttle": "0x4" + }, + { + "gpu": 2, + "throttle": "0x0000000000000004", + "real_throttle": "0x4" + }, + { + "gpu": 3, + "throttle": "0x0000000000000004", + "real_throttle": "0x4" + }, + { + "gpu": 4, + "throttle": "0x0000000000000004", + "real_throttle": "0x4" + }, + { + "gpu": 5, + "throttle": "0x0000000000000004", + "real_throttle": "0x4" + }, + { + "gpu": 6, + "throttle": "0x0000000000000004", + "real_throttle": "0x4" + }, + { + "gpu": 7, + "throttle": "0x0000000000000004", + "real_throttle": "0x4" + }, + { + "gpu": 0, + "throttle": "0x0000000000000004", + "real_throttle": "0x4" + }, + { + "gpu": 1, + "throttle": "0x0000000000000004", + "real_throttle": "0x4" + }, + { + "gpu": 2, + "throttle": "0x0000000000000004", + "real_throttle": "0x4" + }, + { + "gpu": 3, + "throttle": "0x0000000000000004", + "real_throttle": "0x4" + }, + { + "gpu": 4, + "throttle": "0x0000000000000004", + "real_throttle": "0x4" + }, + { + "gpu": 5, + "throttle": "0x0000000000000004", + "real_throttle": "0x4" + }, + { + "gpu": 6, + "throttle": "0x0000000000000004", + "real_throttle": "0x4" + }, + { + "gpu": 7, + "throttle": 
"0x0000000000000004", + "real_throttle": "0x4" + }, + { + "gpu": 0, + "throttle": "0x0000000000000004", + "real_throttle": "0x4" + }, + { + "gpu": 1, + "throttle": "0x0000000000000004", + "real_throttle": "0x4" + }, + { + "gpu": 2, + "throttle": "0x0000000000000004", + "real_throttle": "0x4" + }, + { + "gpu": 3, + "throttle": "0x0000000000000004", + "real_throttle": "0x4" + } + ], + "throttle_event_count": 248, + "xid_events": [], + "tflops_jitter_pct": 4.07, + "steady_tflops_samples": 781, + "failures": [ + "GPU temperature delta 12.0C exceeds 5.0C", + "non-idle throttle reasons observed in 248 samples (first: GPU 0 0x4)" + ], + "thresholds": { + "max_temp_c": 80.0, + "max_temp_delta_c": 5.0, + "min_power_w": 630.0, + "max_tflops_jitter_pct": 5.0, + "warmup_sec": 10.0, + "min_steady_samples": 10 + } + }, + "timestamp": "2026-05-22T17:52:09.074859" + }, + "timestamp": "2026-05-22T17:52:09.082873" +} \ No newline at end of file diff --git a/reports_stress_smoke_reasons_aikubeworker0012.md b/reports_stress_smoke_reasons_aikubeworker0012.md new file mode 100644 index 0000000..cea30e2 --- /dev/null +++ b/reports_stress_smoke_reasons_aikubeworker0012.md @@ -0,0 +1,29 @@ +# GPU Test Report + +- **Date:** 2026-05-22T17:52:09.082873 +- **Host:** aikubeworker0012 + +## Summary + +| Test | Result | +|------|--------| +| Stress Test | FAIL | + +## Stress Test + +- **Source:** pytorch +- **Duration:** 45s (requested 45s) +- **Telemetry samples:** 39 +- **Max temp:** {'0': 59.0, '1': 58.0, '2': 65.0, '3': 54.0, '4': 59.0, '5': 66.0, '6': 62.0, '7': 55.0} +- **Avg power:** {'0': 697.0, '1': 697.4, '2': 697.9, '3': 698.0, '4': 697.8, '5': 697.6, '6': 697.9, '7': 698.2} +- **Temp delta:** 12.0 C +- **TFLOPS jitter:** 4.07% +- **Throttle events:** 248 +- **XID events:** 0 +- **Failure reasons:** + - GPU temperature delta 12.0C exceeds 5.0C + - non-idle throttle reasons observed in 248 samples (first: GPU 0 0x4) +- **Result: FAIL** + +--- +*Generated by GPU Test Suite v0.2.0* \ No 
newline at end of file diff --git a/reports_stress_smoke_reasons_aikubeworker0016.json b/reports_stress_smoke_reasons_aikubeworker0016.json new file mode 100644 index 0000000..8d39f58 --- /dev/null +++ b/reports_stress_smoke_reasons_aikubeworker0016.json @@ -0,0 +1,165 @@ +{ + "stress": { + "source": "pytorch", + "passed": false, + "duration_sec": 45, + "elapsed_sec": 45.4, + "gpu_status": { + "0": "PASS", + "1": "PASS", + "2": "PASS", + "3": "PASS", + "4": "PASS", + "5": "PASS", + "6": "PASS", + "7": "PASS" + }, + "telemetry": { + "passed": false, + "samples": 39, + "steady_samples": 31, + "warmup_sec": 9.0, + "max_temp_c": { + "0": 50.0, + "1": 56.0, + "2": 57.0, + "3": 52.0, + "4": 51.0, + "5": 58.0, + "6": 53.0, + "7": 51.0 + }, + "avg_power_w": { + "0": 698.3, + "1": 698.5, + "2": 697.6, + "3": 697.9, + "4": 697.8, + "5": 698.0, + "6": 697.5, + "7": 698.0 + }, + "temp_delta_c": 8.0, + "throttle_events": [ + { + "gpu": 0, + "throttle": "0x0000000000000004", + "real_throttle": "0x4" + }, + { + "gpu": 1, + "throttle": "0x0000000000000004", + "real_throttle": "0x4" + }, + { + "gpu": 2, + "throttle": "0x0000000000000004", + "real_throttle": "0x4" + }, + { + "gpu": 3, + "throttle": "0x0000000000000004", + "real_throttle": "0x4" + }, + { + "gpu": 4, + "throttle": "0x0000000000000004", + "real_throttle": "0x4" + }, + { + "gpu": 5, + "throttle": "0x0000000000000004", + "real_throttle": "0x4" + }, + { + "gpu": 6, + "throttle": "0x0000000000000004", + "real_throttle": "0x4" + }, + { + "gpu": 7, + "throttle": "0x0000000000000004", + "real_throttle": "0x4" + }, + { + "gpu": 0, + "throttle": "0x0000000000000004", + "real_throttle": "0x4" + }, + { + "gpu": 1, + "throttle": "0x0000000000000004", + "real_throttle": "0x4" + }, + { + "gpu": 2, + "throttle": "0x0000000000000004", + "real_throttle": "0x4" + }, + { + "gpu": 3, + "throttle": "0x0000000000000004", + "real_throttle": "0x4" + }, + { + "gpu": 4, + "throttle": "0x0000000000000004", + "real_throttle": "0x4" + }, + { + 
"gpu": 5, + "throttle": "0x0000000000000004", + "real_throttle": "0x4" + }, + { + "gpu": 6, + "throttle": "0x0000000000000004", + "real_throttle": "0x4" + }, + { + "gpu": 7, + "throttle": "0x0000000000000004", + "real_throttle": "0x4" + }, + { + "gpu": 0, + "throttle": "0x0000000000000004", + "real_throttle": "0x4" + }, + { + "gpu": 1, + "throttle": "0x0000000000000004", + "real_throttle": "0x4" + }, + { + "gpu": 2, + "throttle": "0x0000000000000004", + "real_throttle": "0x4" + }, + { + "gpu": 3, + "throttle": "0x0000000000000004", + "real_throttle": "0x4" + } + ], + "throttle_event_count": 248, + "xid_events": [], + "tflops_jitter_pct": 3.77, + "steady_tflops_samples": 787, + "failures": [ + "GPU temperature delta 8.0C exceeds 5.0C", + "non-idle throttle reasons observed in 248 samples (first: GPU 0 0x4)" + ], + "thresholds": { + "max_temp_c": 80.0, + "max_temp_delta_c": 5.0, + "min_power_w": 630.0, + "max_tflops_jitter_pct": 5.0, + "warmup_sec": 10.0, + "min_steady_samples": 10 + } + }, + "timestamp": "2026-05-22T17:53:02.058687" + }, + "timestamp": "2026-05-22T17:53:02.066792" +} \ No newline at end of file diff --git a/reports_stress_smoke_reasons_aikubeworker0016.md b/reports_stress_smoke_reasons_aikubeworker0016.md new file mode 100644 index 0000000..9f9c3ab --- /dev/null +++ b/reports_stress_smoke_reasons_aikubeworker0016.md @@ -0,0 +1,29 @@ +# GPU Test Report + +- **Date:** 2026-05-22T17:53:02.066792 +- **Host:** aikubeworker0016 + +## Summary + +| Test | Result | +|------|--------| +| Stress Test | FAIL | + +## Stress Test + +- **Source:** pytorch +- **Duration:** 45s (requested 45s) +- **Telemetry samples:** 39 +- **Max temp:** {'0': 50.0, '1': 56.0, '2': 57.0, '3': 52.0, '4': 51.0, '5': 58.0, '6': 53.0, '7': 51.0} +- **Avg power:** {'0': 698.3, '1': 698.5, '2': 697.6, '3': 697.9, '4': 697.8, '5': 698.0, '6': 697.5, '7': 698.0} +- **Temp delta:** 8.0 C +- **TFLOPS jitter:** 3.77% +- **Throttle events:** 248 +- **XID events:** 0 +- **Failure reasons:** + - 
GPU temperature delta 8.0C exceeds 5.0C + - non-idle throttle reasons observed in 248 samples (first: GPU 0 0x4) +- **Result: FAIL** + +--- +*Generated by GPU Test Suite v0.2.0* \ No newline at end of file diff --git a/reports_test_all_latest_aikubeworker0012_20260522_203246.md b/reports_test_all_latest_aikubeworker0012_20260522_203246.md new file mode 100644 index 0000000..8853d18 --- /dev/null +++ b/reports_test_all_latest_aikubeworker0012_20260522_203246.md @@ -0,0 +1,322 @@ +# GPU Test Report + +- **Date:** 2026-05-22T20:32:51.687830 +- **Host:** aikubeworker0012 +- **GPU:** NVIDIA H100 80GB HBM3 x8 +- **Driver:** 580.159.03 | **CUDA:** 13.0 + +## Overall Acceptance Verdict + +**Result: FAIL** + +Failed or unverified items: +- Compute Throughput: FAIL (FP16 spread 3.04% > 3%) +- NCCL: FAIL +- Stress Test: FAIL +- RDMA: FAIL + +## Summary + +| Test | Result | +|------|--------| +| GPU Info | PASS (8 GPUs detected) | +| Health Check | PASS | +| Memory Bandwidth | PASS (108.1%) | +| Compute Throughput | FAIL (FP16 spread 3.04% > 3%) | +| NVLink/NVSwitch | PASS | +| DCGM | PASS | +| NCCL | FAIL | +| Stress Test | FAIL | +| RDMA | FAIL | +| Training | PASS (216498 tokens/sec) | + +## GPU Information + +| GPU | Model | VRAM | Temp | Power | SM Clock | +|-----|-------|------|------|-------|----------| +| 0 | NVIDIA H100 80GB HBM3 | 81559 MB | 25C | 69/700W | 345 MHz | +| 1 | NVIDIA H100 80GB HBM3 | 81559 MB | 25C | 73/700W | 345 MHz | +| 2 | NVIDIA H100 80GB HBM3 | 81559 MB | 26C | 69/700W | 345 MHz | +| 3 | NVIDIA H100 80GB HBM3 | 81559 MB | 24C | 69/700W | 345 MHz | +| 4 | NVIDIA H100 80GB HBM3 | 81559 MB | 24C | 69/700W | 345 MHz | +| 5 | NVIDIA H100 80GB HBM3 | 81559 MB | 27C | 70/700W | 345 MHz | +| 6 | NVIDIA H100 80GB HBM3 | 81559 MB | 25C | 70/700W | 345 MHz | +| 7 | NVIDIA H100 80GB HBM3 | 81559 MB | 24C | 71/700W | 345 MHz | + +## Health Check + +**Overall: PASS** + +| GPU | Temp | Power | ECC | PCIe | Throttle | Status | 
+|-----|------|-------|-----|------|----------|--------| +| 0 | 25C PASS | 69W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** | +| 1 | 25C PASS | 73W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** | +| 2 | 26C PASS | 69W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** | +| 3 | 24C PASS | 70W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** | +| 4 | 24C PASS | 69W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** | +| 5 | 27C PASS | 70W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** | +| 6 | 25C PASS | 70W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** | +| 7 | 24C PASS | 71W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** | + +## Memory Bandwidth + +Source: nvbandwidth + +| Metric | Value | Peak | Efficiency | +|--------|-------|------|------------| +| H2D (PCIe) | 55.4 GB/s | 64 GB/s | 86.6% | +| D2H (PCIe) | 54.0 GB/s | 64 GB/s | 84.4% | +| D2D (NVLink) | 486.5 GB/s | 450 GB/s | 108.1% | + +**Verdict: PASS** (D2D efficiency 108.1%) + +## Compute Throughput + +| DType | Achieved (TFLOPS) | Peak | Threshold | Status | +|-------|-------------------|------|------------|--------| +| FP32 | 51.9 | 67 | >= 54 | FAIL | +| TF32 | 364.9 | 495 | >= 444 | FAIL | +| FP16 | 680.0 | 990 | >= 734 | FAIL | +| BF16 | 713.2 | 990 | >= 745 | FAIL | +| FP8 | 1170.4 | 1979 | >= 1400 | FAIL | +| FP64 | 46.9 | 67 | >= 63 | FAIL | +| INT8 | 100.4 | 1979 | >= 1536 | FAIL | + +**Verdict: FAIL** (absolute TFLOPS thresholds; worst efficiency 5.1%) + +### Compute Consistency + +| DType | Min | Mean | Max | Spread | Limit | Status | +|-------|-----|------|-----|--------|-------|--------| +| FP32 | 51.9 | 52.0 | 52.1 | 0.38% | <= 3% | PASS | +| TF32 | 361.0 | 364.9 | 369.0 | 2.19% | <= 3% | PASS | +| FP16 | 667.3 | 680.0 | 688.0 | 3.04% | <= 3% | FAIL | +| BF16 | 703.0 | 713.3 | 735.7 | 4.58% | <= 3% | FAIL | +| FP8 | 1156.9 | 1170.5 | 1186.1 | 2.49% | <= 3% | PASS | +| FP64 | 45.9 | 46.9 | 47.5 | 3.41% | <= 3% | FAIL | +| INT8 | 100.4 | 100.4 | 100.4 | 0.00% | <= 3% | PASS | + +### Compute Per-GPU TFLOPS + +| GPU | FP32 | TF32 | 
FP16 | BF16 | FP8 | FP64 | INT8 | +|---|---|---|---|---|---|---|---| +| 0 | 52.0 | 369.0 | 688.0 | 735.7 | 1186.1 | 47.5 | 100.4 | +| 1 | 51.9 | 365.6 | 675.3 | 711.6 | 1171.0 | 47.0 | 100.4 | +| 2 | 51.9 | 364.9 | 685.7 | 715.3 | 1175.3 | 47.1 | 100.4 | +| 3 | 51.9 | 364.0 | 679.9 | 704.0 | 1167.6 | 47.4 | 100.4 | +| 4 | 51.9 | 367.7 | 681.2 | 719.0 | 1178.0 | 46.6 | 100.4 | +| 5 | 52.0 | 364.3 | 680.8 | 712.3 | 1165.5 | 46.8 | 100.4 | +| 6 | 52.1 | 362.9 | 681.8 | 703.0 | 1156.9 | 46.9 | 100.4 | +| 7 | 51.9 | 361.0 | 667.3 | 705.3 | 1163.2 | 45.9 | 100.4 | + +## NVLink/NVSwitch + +**Overall: PASS** + +| GPU | Active Links | Issues | +|-----|--------------|--------| +| 0 | 18/18 | OK | +| 1 | 18/18 | OK | +| 2 | 18/18 | OK | +| 3 | 18/18 | OK | +| 4 | 18/18 | OK | +| 5 | 18/18 | OK | +| 6 | 18/18 | OK | +| 7 | 18/18 | OK | + +## DCGM Diagnostic + +**Overall: PASS** + +| Subtest | Status | +|---------|--------| +| Deployment/software/GPU0 | PASS | +| Deployment/software/GPU1 | PASS | +| Deployment/software/GPU2 | PASS | +| Deployment/software/GPU3 | PASS | +| Deployment/software/GPU4 | PASS | +| Deployment/software/GPU5 | PASS | +| Deployment/software/GPU6 | PASS | +| Deployment/software/GPU7 | PASS | +| Deployment/software/summary | PASS | +| Hardware/memory/GPU0 | PASS | +| Hardware/memory/GPU1 | PASS | +| Hardware/memory/GPU2 | PASS | +| Hardware/memory/GPU3 | PASS | +| Hardware/memory/GPU4 | PASS | +| Hardware/memory/GPU5 | PASS | +| Hardware/memory/GPU6 | PASS | +| Hardware/memory/GPU7 | PASS | +| Hardware/memory/summary | PASS | +| Hardware/diagnostic/GPU0 | PASS | +| Hardware/diagnostic/GPU1 | PASS | +| Hardware/diagnostic/GPU2 | PASS | +| Hardware/diagnostic/GPU3 | PASS | +| Hardware/diagnostic/GPU4 | PASS | +| Hardware/diagnostic/GPU5 | PASS | +| Hardware/diagnostic/GPU6 | PASS | +| Hardware/diagnostic/GPU7 | PASS | +| Hardware/diagnostic/summary | PASS | +| Hardware/nvbandwidth/GPU0 | PASS | +| Hardware/nvbandwidth/GPU1 | PASS | +| 
Hardware/nvbandwidth/GPU2 | PASS | +| Hardware/nvbandwidth/GPU3 | PASS | +| Hardware/nvbandwidth/GPU4 | PASS | +| Hardware/nvbandwidth/GPU5 | PASS | +| Hardware/nvbandwidth/GPU6 | PASS | +| Hardware/nvbandwidth/GPU7 | PASS | +| Hardware/nvbandwidth/summary | PASS | +| Integration/pcie/GPU0 | PASS | +| Integration/pcie/GPU1 | PASS | +| Integration/pcie/GPU2 | PASS | +| Integration/pcie/GPU3 | PASS | +| Integration/pcie/GPU4 | PASS | +| Integration/pcie/GPU5 | PASS | +| Integration/pcie/GPU6 | PASS | +| Integration/pcie/GPU7 | PASS | +| Integration/pcie/summary | PASS | +| Stress/targeted_stress/GPU0 | PASS | +| Stress/targeted_stress/GPU1 | PASS | +| Stress/targeted_stress/GPU2 | PASS | +| Stress/targeted_stress/GPU3 | PASS | +| Stress/targeted_stress/GPU4 | PASS | +| Stress/targeted_stress/GPU5 | PASS | +| Stress/targeted_stress/GPU6 | PASS | +| Stress/targeted_stress/GPU7 | PASS | +| Stress/targeted_stress/summary | PASS | +| Stress/targeted_power/GPU0 | PASS | +| Stress/targeted_power/GPU1 | PASS | +| Stress/targeted_power/GPU2 | PASS | +| Stress/targeted_power/GPU3 | PASS | +| Stress/targeted_power/GPU4 | PASS | +| Stress/targeted_power/GPU5 | PASS | +| Stress/targeted_power/GPU6 | PASS | +| Stress/targeted_power/GPU7 | PASS | +| Stress/targeted_power/summary | PASS | + +## NCCL Multi-GPU + +Source: nccl-tests | GPUs: 8 + +| Operation | Bus BW (GB/s) | Threshold | Status | +|-----------|---------------|-----------|--------| +| allreduce | 472.3 | >= 405 | FAIL | +| alltoall | 343.3 | >= 315 | FAIL | +| broadcast | 364.1 | >= 360 | FAIL | +| reducescatter | 352.8 | >= 405 | FAIL | +| allgather | 366.4 | >= 405 | FAIL | +| sendrecv | 369.0 | >= 360 | FAIL | + +### NCCL allreduce by size + +| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status | +|------|---------------------|-------|------|--------|-----------|--------| +| 1M | 24.9, 25.0, 24.7 | 24.7 | 24.9 | 0.50% | >= 405 | FAIL | +| 256M | 421.6, 421.8, 421.6 | 421.6 | 421.7 | 0.02% | >= 405 
| PASS | +| 2G | 472.8, 472.7, 471.5 | 471.5 | 472.3 | 0.13% | >= 405 | PASS | + +### NCCL alltoall by size + +| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status | +|------|---------------------|-------|------|--------|-----------|--------| +| 1M | 8.1, 8.0, 8.0 | 8.0 | 8.0 | 0.59% | >= 315 | FAIL | +| 256M | 305.3, 314.9, 313.1 | 305.3 | 311.1 | 1.34% | >= 315 | FAIL | +| 2G | 342.1, 342.5, 345.4 | 342.1 | 343.3 | 0.43% | >= 315 | PASS | + +### NCCL broadcast by size + +| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status | +|------|---------------------|-------|------|--------|-----------|--------| +| 1M | 14.5, 14.6, 14.2 | 14.2 | 14.4 | 1.18% | >= 360 | FAIL | +| 256M | 344.2, 345.9, 344.6 | 344.2 | 344.9 | 0.21% | >= 360 | FAIL | +| 2G | 364.2, 364.0, 364.1 | 364.0 | 364.1 | 0.02% | >= 360 | PASS | + +### NCCL reducescatter by size + +| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status | +|------|---------------------|-------|------|--------|-----------|--------| +| 1M | 14.1, 13.8, 14.2 | 13.8 | 14.0 | 1.21% | >= 405 | FAIL | +| 256M | 328.6, 328.3, 328.2 | 328.2 | 328.4 | 0.05% | >= 405 | FAIL | +| 2G | 352.6, 352.4, 353.3 | 352.4 | 352.8 | 0.11% | >= 405 | FAIL | + +### NCCL allgather by size + +| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status | +|------|---------------------|-------|------|--------|-----------|--------| +| 1M | 14.6, 14.3, 14.4 | 14.3 | 14.4 | 0.86% | >= 405 | FAIL | +| 256M | 350.5, 350.4, 349.9 | 349.9 | 350.3 | 0.07% | >= 405 | FAIL | +| 2G | 366.3, 366.6, 366.2 | 366.2 | 366.4 | 0.05% | >= 405 | FAIL | + +### NCCL sendrecv by size + +| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status | +|------|---------------------|-------|------|--------|-----------|--------| +| 1M | 18.4, 18.4, 18.4 | 18.4 | 18.4 | 0.00% | >= 360 | FAIL | +| 256M | 350.9, 351.6, 351.4 | 350.9 | 351.3 | 0.08% | >= 360 | FAIL | +| 2G | 368.9, 369.1, 368.9 | 
368.9 | 369.0 | 0.03% | >= 360 | PASS | + +**Overall: FAIL** + +## Stress Test + +- **Source:** pytorch +- **Duration:** 1800s (requested 1800s) +- **Telemetry samples:** 1266 +- **Max temp:** {0: 60.0, 1: 60.0, 2: 68.0, 3: 56.0, 4: 60.0, 5: 68.0, 6: 64.0, 7: 56.0} +- **Avg power:** {0: 697.7, 1: 697.5, 2: 697.1, 3: 697.8, 4: 697.8, 5: 697.9, 6: 697.7, 7: 698.3} +- **Temp delta:** 12.0 C +- **TFLOPS jitter:** 4.37% +- **Steady TFLOPS samples:** 37672 +- **Throttle events:** 9712 +- **XID events:** 0 +- **Failure reasons:** + - GPU temperature delta 12.0C exceeds 5.0C + - non-idle throttle reasons observed in 9712 samples (first: GPU 0 0x4) +- **Result: FAIL** + +## RDMA/InfiniBand + +### RDMA Port Checks + +| Device | Port | State | Rate | Required | Status | +|--------|------|-------|------|----------|--------| +| mlx5_0 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS | +| mlx5_1 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS | +| mlx5_4 | 1 | 4: ACTIVE | 100 Gb/sec (2X HDR) | >= 400Gbps ACTIVE | FAIL | +| mlx5_5 | 1 | 4: ACTIVE | 100 Gb/sec (2X HDR) | >= 400Gbps ACTIVE | FAIL | +| mlx5_6 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS | +| mlx5_7 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS | + +| Test | Value | Threshold | Status | +|------|-------|-----------|--------| +| ib_write_bw | 49.5 GB/s | >= 47 GB/s | PASS | +| ib_read_bw | 39.1 GB/s | >= 47 GB/s | FAIL | +| ib_write_lat | 1.25 us | <= 2 us | PASS | +| ib_read_lat | 2.60 us | <= 3.5 us | PASS | +| ibping | local_loopback target=0x58 count=5 | 0% packet loss | PASS | + +- **PFC/ECN/CNP/congestion counters checked:** 146 +- **PFC/ECN/CNP/congestion non-zero:** no +- **Failure reasons:** + - mlx5_4 port 1 state/rate failed (4: ACTIVE, 100 Gb/sec (2X HDR); required >= 400.0Gbps ACTIVE) + - mlx5_5 port 1 state/rate failed (4: ACTIVE, 100 Gb/sec (2X HDR); required >= 400.0Gbps ACTIVE) + - ib_read_bw bandwidth 39.12GB/s < 47GB/s 
+**Overall: FAIL** + +## Training Simulation + +| Metric | Value | +|--------|-------| +| Model | synthetic_transformer_1.5b | +| Params | 1470.5M | +| Throughput | 216498 tokens/sec | +| Avg Step Time | 75.7 ms | +| Warmup Steps | 5 | +| Peak Memory | 18.1 GB | +| Final Loss | 0.0039 | +| Step Jitter | 1.89% | +| Distributed Mode | ddp | +| Verdict | PASS (216498 tokens/sec) | + +--- +*Generated by GPU Test Suite v0.2.0* \ No newline at end of file diff --git a/reports_test_all_latest_aikubeworker0016_20260522_203447.md b/reports_test_all_latest_aikubeworker0016_20260522_203447.md new file mode 100644 index 0000000..3a4077f --- /dev/null +++ b/reports_test_all_latest_aikubeworker0016_20260522_203447.md @@ -0,0 +1,322 @@ +# GPU Test Report + +- **Date:** 2026-05-22T20:34:52.129246 +- **Host:** aikubeworker0016 +- **GPU:** NVIDIA H100 80GB HBM3 x8 +- **Driver:** 580.159.03 | **CUDA:** 13.0 + +## Overall Acceptance Verdict + +**Result: FAIL** + +Failed or unverified items: +- Compute Throughput: FAIL (BF16 spread 3.44% > 3%) +- NCCL: FAIL +- Stress Test: FAIL +- RDMA: FAIL + +## Summary + +| Test | Result | +|------|--------| +| GPU Info | PASS (8 GPUs detected) | +| Health Check | PASS | +| Memory Bandwidth | PASS (108.1%) | +| Compute Throughput | FAIL (BF16 spread 3.44% > 3%) | +| NVLink/NVSwitch | PASS | +| DCGM | PASS | +| NCCL | FAIL | +| Stress Test | FAIL | +| RDMA | FAIL | +| Training | PASS (216683 tokens/sec) | + +## GPU Information + +| GPU | Model | VRAM | Temp | Power | SM Clock | +|-----|-------|------|------|-------|----------| +| 0 | NVIDIA H100 80GB HBM3 | 81559 MB | 20C | 70/700W | 345 MHz | +| 1 | NVIDIA H100 80GB HBM3 | 81559 MB | 21C | 68/700W | 345 MHz | +| 2 | NVIDIA H100 80GB HBM3 | 81559 MB | 21C | 67/700W | 345 MHz | +| 3 | NVIDIA H100 80GB HBM3 | 81559 MB | 20C | 67/700W | 345 MHz | +| 4 | NVIDIA H100 80GB HBM3 | 81559 MB | 20C | 68/700W | 345 MHz | +| 5 | NVIDIA H100 80GB HBM3 | 81559 MB | 22C | 69/700W | 345 MHz | +| 6 | NVIDIA H100 80GB 
HBM3 | 81559 MB | 20C | 68/700W | 345 MHz | +| 7 | NVIDIA H100 80GB HBM3 | 81559 MB | 20C | 66/700W | 345 MHz | + +## Health Check + +**Overall: PASS** + +| GPU | Temp | Power | ECC | PCIe | Throttle | Status | +|-----|------|-------|-----|------|----------|--------| +| 0 | 20C PASS | 70W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** | +| 1 | 21C PASS | 68W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** | +| 2 | 21C PASS | 67W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** | +| 3 | 20C PASS | 67W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** | +| 4 | 20C PASS | 68W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** | +| 5 | 22C PASS | 69W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** | +| 6 | 20C PASS | 68W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** | +| 7 | 20C PASS | 66W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** | + +## Memory Bandwidth + +Source: nvbandwidth + +| Metric | Value | Peak | Efficiency | +|--------|-------|------|------------| +| H2D (PCIe) | 55.4 GB/s | 64 GB/s | 86.6% | +| D2H (PCIe) | 54.4 GB/s | 64 GB/s | 85.0% | +| D2D (NVLink) | 486.6 GB/s | 450 GB/s | 108.1% | + +**Verdict: PASS** (D2D efficiency 108.1%) + +## Compute Throughput + +| DType | Achieved (TFLOPS) | Peak | Threshold | Status | +|-------|-------------------|------|------------|--------| +| FP32 | 52.1 | 67 | >= 54 | FAIL | +| TF32 | 366.7 | 495 | >= 444 | FAIL | +| FP16 | 682.7 | 990 | >= 734 | FAIL | +| BF16 | 717.3 | 990 | >= 745 | FAIL | +| FP8 | 1173.5 | 1979 | >= 1400 | FAIL | +| FP64 | 47.4 | 67 | >= 63 | FAIL | +| INT8 | 100.4 | 1979 | >= 1536 | FAIL | + +**Verdict: FAIL** (absolute TFLOPS thresholds; worst efficiency 5.1%) + +### Compute Consistency + +| DType | Min | Mean | Max | Spread | Limit | Status | +|-------|-----|------|-----|--------|-------|--------| +| FP32 | 51.9 | 52.1 | 52.2 | 0.58% | <= 3% | PASS | +| TF32 | 362.3 | 366.7 | 369.2 | 1.88% | <= 3% | PASS | +| FP16 | 674.4 | 682.7 | 693.1 | 2.74% | <= 3% | PASS | +| BF16 | 705.3 | 717.2 | 730.0 | 3.44% | <= 3% | FAIL | +| FP8 | 1155.2 
| 1173.5 | 1186.2 | 2.64% | <= 3% | PASS | +| FP64 | 46.3 | 47.4 | 48.5 | 4.64% | <= 3% | FAIL | +| INT8 | 100.4 | 100.4 | 100.4 | 0.00% | <= 3% | PASS | + +### Compute Per-GPU TFLOPS + +| GPU | FP32 | TF32 | FP16 | BF16 | FP8 | FP64 | INT8 | +|---|---|---|---|---|---|---|---| +| 0 | 52.2 | 362.3 | 674.4 | 714.3 | 1159.0 | 46.3 | 100.4 | +| 1 | 51.9 | 366.5 | 674.7 | 721.4 | 1185.4 | 47.7 | 100.4 | +| 2 | 52.2 | 367.4 | 693.1 | 730.0 | 1185.7 | 48.5 | 100.4 | +| 3 | 52.2 | 367.8 | 682.2 | 708.2 | 1163.4 | 47.4 | 100.4 | +| 4 | 52.0 | 366.4 | 686.9 | 714.1 | 1186.2 | 47.3 | 100.4 | +| 5 | 52.0 | 369.2 | 679.9 | 721.1 | 1155.2 | 47.3 | 100.4 | +| 6 | 51.9 | 365.1 | 677.7 | 705.3 | 1169.0 | 47.0 | 100.4 | +| 7 | 52.2 | 369.0 | 692.8 | 723.5 | 1184.3 | 47.6 | 100.4 | + +## NVLink/NVSwitch + +**Overall: PASS** + +| GPU | Active Links | Issues | +|-----|--------------|--------| +| 0 | 18/18 | OK | +| 1 | 18/18 | OK | +| 2 | 18/18 | OK | +| 3 | 18/18 | OK | +| 4 | 18/18 | OK | +| 5 | 18/18 | OK | +| 6 | 18/18 | OK | +| 7 | 18/18 | OK | + +## DCGM Diagnostic + +**Overall: PASS** + +| Subtest | Status | +|---------|--------| +| Deployment/software/GPU0 | PASS | +| Deployment/software/GPU1 | PASS | +| Deployment/software/GPU2 | PASS | +| Deployment/software/GPU3 | PASS | +| Deployment/software/GPU4 | PASS | +| Deployment/software/GPU5 | PASS | +| Deployment/software/GPU6 | PASS | +| Deployment/software/GPU7 | PASS | +| Deployment/software/summary | PASS | +| Hardware/memory/GPU0 | PASS | +| Hardware/memory/GPU1 | PASS | +| Hardware/memory/GPU2 | PASS | +| Hardware/memory/GPU3 | PASS | +| Hardware/memory/GPU4 | PASS | +| Hardware/memory/GPU5 | PASS | +| Hardware/memory/GPU6 | PASS | +| Hardware/memory/GPU7 | PASS | +| Hardware/memory/summary | PASS | +| Hardware/diagnostic/GPU0 | PASS | +| Hardware/diagnostic/GPU1 | PASS | +| Hardware/diagnostic/GPU2 | PASS | +| Hardware/diagnostic/GPU3 | PASS | +| Hardware/diagnostic/GPU4 | PASS | +| Hardware/diagnostic/GPU5 | PASS | +| 
Hardware/diagnostic/GPU6 | PASS | +| Hardware/diagnostic/GPU7 | PASS | +| Hardware/diagnostic/summary | PASS | +| Hardware/nvbandwidth/GPU0 | PASS | +| Hardware/nvbandwidth/GPU1 | PASS | +| Hardware/nvbandwidth/GPU2 | PASS | +| Hardware/nvbandwidth/GPU3 | PASS | +| Hardware/nvbandwidth/GPU4 | PASS | +| Hardware/nvbandwidth/GPU5 | PASS | +| Hardware/nvbandwidth/GPU6 | PASS | +| Hardware/nvbandwidth/GPU7 | PASS | +| Hardware/nvbandwidth/summary | PASS | +| Integration/pcie/GPU0 | PASS | +| Integration/pcie/GPU1 | PASS | +| Integration/pcie/GPU2 | PASS | +| Integration/pcie/GPU3 | PASS | +| Integration/pcie/GPU4 | PASS | +| Integration/pcie/GPU5 | PASS | +| Integration/pcie/GPU6 | PASS | +| Integration/pcie/GPU7 | PASS | +| Integration/pcie/summary | PASS | +| Stress/targeted_stress/GPU0 | PASS | +| Stress/targeted_stress/GPU1 | PASS | +| Stress/targeted_stress/GPU2 | PASS | +| Stress/targeted_stress/GPU3 | PASS | +| Stress/targeted_stress/GPU4 | PASS | +| Stress/targeted_stress/GPU5 | PASS | +| Stress/targeted_stress/GPU6 | PASS | +| Stress/targeted_stress/GPU7 | PASS | +| Stress/targeted_stress/summary | PASS | +| Stress/targeted_power/GPU0 | PASS | +| Stress/targeted_power/GPU1 | PASS | +| Stress/targeted_power/GPU2 | PASS | +| Stress/targeted_power/GPU3 | PASS | +| Stress/targeted_power/GPU4 | PASS | +| Stress/targeted_power/GPU5 | PASS | +| Stress/targeted_power/GPU6 | PASS | +| Stress/targeted_power/GPU7 | PASS | +| Stress/targeted_power/summary | PASS | + +## NCCL Multi-GPU + +Source: nccl-tests | GPUs: 8 + +| Operation | Bus BW (GB/s) | Threshold | Status | +|-----------|---------------|-----------|--------| +| allreduce | 472.4 | >= 405 | FAIL | +| alltoall | 344.3 | >= 315 | FAIL | +| broadcast | 363.6 | >= 360 | FAIL | +| reducescatter | 353.1 | >= 405 | FAIL | +| allgather | 366.4 | >= 405 | FAIL | +| sendrecv | 368.9 | >= 360 | FAIL | + +### NCCL allreduce by size + +| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status | 
+|------|---------------------|-------|------|--------|-----------|--------| +| 1M | 24.9, 24.4, 24.9 | 24.4 | 24.7 | 0.95% | >= 405 | FAIL | +| 256M | 421.9, 421.1, 421.9 | 421.1 | 421.6 | 0.09% | >= 405 | PASS | +| 2G | 472.6, 472.0, 472.5 | 472.0 | 472.4 | 0.06% | >= 405 | PASS | + +### NCCL alltoall by size + +| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status | +|------|---------------------|-------|------|--------|-----------|--------| +| 1M | 7.9, 7.8, 8.1 | 7.8 | 7.9 | 1.57% | >= 315 | FAIL | +| 256M | 298.7, 312.7, 303.2 | 298.7 | 304.9 | 1.91% | >= 315 | FAIL | +| 2G | 342.2, 345.4, 345.2 | 342.2 | 344.3 | 0.43% | >= 315 | PASS | + +### NCCL broadcast by size + +| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status | +|------|---------------------|-------|------|--------|-----------|--------| +| 1M | 14.5, 14.3, 14.4 | 14.3 | 14.4 | 0.57% | >= 360 | FAIL | +| 256M | 344.1, 344.3, 344.8 | 344.1 | 344.4 | 0.09% | >= 360 | FAIL | +| 2G | 364.0, 363.6, 363.3 | 363.3 | 363.6 | 0.08% | >= 360 | PASS | + +### NCCL reducescatter by size + +| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status | +|------|---------------------|-------|------|--------|-----------|--------| +| 1M | 14.0, 14.2, 14.3 | 14.0 | 14.2 | 0.88% | >= 405 | FAIL | +| 256M | 328.8, 328.7, 328.4 | 328.4 | 328.6 | 0.05% | >= 405 | FAIL | +| 2G | 351.9, 353.8, 353.6 | 351.9 | 353.1 | 0.24% | >= 405 | FAIL | + +### NCCL allgather by size + +| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status | +|------|---------------------|-------|------|--------|-----------|--------| +| 1M | 14.4, 13.9, 14.0 | 13.9 | 14.1 | 1.53% | >= 405 | FAIL | +| 256M | 350.2, 350.4, 350.7 | 350.2 | 350.4 | 0.06% | >= 405 | FAIL | +| 2G | 366.9, 366.4, 366.0 | 366.0 | 366.4 | 0.10% | >= 405 | FAIL | + +### NCCL sendrecv by size + +| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status | 
+|------|---------------------|-------|------|--------|-----------|--------| +| 1M | 18.4, 18.3, 18.5 | 18.3 | 18.4 | 0.44% | >= 360 | FAIL | +| 256M | 351.1, 351.4, 351.3 | 351.1 | 351.3 | 0.04% | >= 360 | FAIL | +| 2G | 368.9, 368.8, 368.9 | 368.8 | 368.9 | 0.01% | >= 360 | PASS | + +**Overall: FAIL** + +## Stress Test + +- **Source:** pytorch +- **Duration:** 1800s (requested 1800s) +- **Telemetry samples:** 1295 +- **Max temp:** {0: 51.0, 1: 59.0, 2: 61.0, 3: 53.0, 4: 53.0, 5: 62.0, 6: 56.0, 7: 52.0} +- **Avg power:** {0: 698.8, 1: 697.8, 2: 698.1, 3: 697.9, 4: 697.9, 5: 698.2, 6: 698.0, 7: 697.8} +- **Temp delta:** 11.0 C +- **TFLOPS jitter:** 3.4% +- **Steady TFLOPS samples:** 37874 +- **Throttle events:** 9944 +- **XID events:** 0 +- **Failure reasons:** + - GPU temperature delta 11.0C exceeds 5.0C + - non-idle throttle reasons observed in 9944 samples (first: GPU 0 0x4) +- **Result: FAIL** + +## RDMA/InfiniBand + +### RDMA Port Checks + +| Device | Port | State | Rate | Required | Status | +|--------|------|-------|------|----------|--------| +| mlx5_0 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS | +| mlx5_1 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS | +| mlx5_4 | 1 | 4: ACTIVE | 100 Gb/sec (2X HDR) | >= 400Gbps ACTIVE | FAIL | +| mlx5_5 | 1 | 4: ACTIVE | 100 Gb/sec (2X HDR) | >= 400Gbps ACTIVE | FAIL | +| mlx5_6 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS | +| mlx5_7 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS | + +| Test | Value | Threshold | Status | +|------|-------|-----------|--------| +| ib_write_bw | 48.6 GB/s | >= 47 GB/s | PASS | +| ib_read_bw | 40.3 GB/s | >= 47 GB/s | FAIL | +| ib_write_lat | 1.29 us | <= 2 us | PASS | +| ib_read_lat | 2.59 us | <= 3.5 us | PASS | +| ibping | local_loopback target=0x4b count=5 | 0% packet loss | PASS | + +- **PFC/ECN/CNP/congestion counters checked:** 146 +- **PFC/ECN/CNP/congestion non-zero:** no +- **Failure reasons:** + - 
mlx5_4 port 1 state/rate failed (4: ACTIVE, 100 Gb/sec (2X HDR); required >= 400.0Gbps ACTIVE) + - mlx5_5 port 1 state/rate failed (4: ACTIVE, 100 Gb/sec (2X HDR); required >= 400.0Gbps ACTIVE) + - ib_read_bw bandwidth 40.29GB/s < 47GB/s +**Overall: FAIL** + +## Training Simulation + +| Metric | Value | +|--------|-------| +| Model | synthetic_transformer_1.5b | +| Params | 1470.5M | +| Throughput | 216683 tokens/sec | +| Avg Step Time | 75.6 ms | +| Warmup Steps | 5 | +| Peak Memory | 18.1 GB | +| Final Loss | 0.0039 | +| Step Jitter | 1.2% | +| Distributed Mode | ddp | +| Verdict | PASS (216683 tokens/sec) | + +--- +*Generated by GPU Test Suite v0.2.0* \ No newline at end of file diff --git a/reports_test_all_latest_summary_cn_20260523.md b/reports_test_all_latest_summary_cn_20260523.md new file mode 100644 index 0000000..9ef9449 --- /dev/null +++ b/reports_test_all_latest_summary_cn_20260523.md @@ -0,0 +1,101 @@ +# H100 单节点 test all 中文汇总 + +生成时间:2026-05-23 +测试范围:`aikubeworker0012`、`aikubeworker0016` 单节点 `python gpu_tester.py --test all --report --format md` + +原始报告: + +- `reports_test_all_latest_aikubeworker0012_20260522_203246.md` +- `reports_test_all_latest_aikubeworker0016_20260522_203447.md` + +## 总结论 + +| 机器 | Suite | PDF 验收结论 | 主要失败项 | +|---|---:|---|---| +| aikubeworker0012 | 6/10 PASS | FAIL | Compute、NCCL、Stress、RDMA | +| aikubeworker0016 | 6/10 PASS | FAIL | Compute、NCCL、Stress、RDMA | + +按 PDF 口径,任一必测子项 FAIL,则整机 FAIL。因此两台机器当前都不通过生产验收。 + +## 通过项 + +| 项目 | aikubeworker0012 | aikubeworker0016 | 说明 | +|---|---|---|---| +| GPU Info | PASS | PASS | 8 张 H100 | +| Health | PASS | PASS | 温度、空闲功耗、ECC、PCIe、空闲 throttle 正常 | +| Memory Bandwidth | PASS | PASS | D2D 效率均约 108.1% | +| NVLink/NVSwitch | PASS | PASS | 8 卡均 18/18 links | +| DCGM diag -r 3 | PASS | PASS | software、memory、diagnostic、nvbandwidth、pcie、targeted stress/power 全 PASS | +| Training Simulation | PASS | PASS | 8 卡 DDP synthetic 1.5B,loss finite | + +Training 结果: + +| 机器 | Throughput | Step jitter | 
Peak memory | Verdict | +|---|---:|---:|---:|---| +| aikubeworker0012 | 216498 tokens/s | 1.89% | 18.08 GB | PASS | +| aikubeworker0016 | 216683 tokens/s | 1.20% | 18.08 GB | PASS | + +## 失败项 + +### Compute + +两台机器都未达到当前 H100 绝对 TFLOPS 阈值,且部分 dtype 的跨 GPU spread 超过 3%。 + +| 机器 | 代表性失败 | +|---|---| +| aikubeworker0012 | FP16 spread 3.04%,BF16 spread 4.58%,FP64 spread 3.41%;FP32/TF32/FP16/BF16/FP8/FP64/INT8 绝对阈值均 FAIL | +| aikubeworker0016 | BF16 spread 3.44%,FP64 spread 4.64%;FP32/TF32/FP16/BF16/FP8/FP64/INT8 绝对阈值均 FAIL | + +### NCCL + +NCCL 已经使用真实 `nccl-tests` bus BW,不是 torchrun fallback。失败主要来自小 size 以及部分 256M/2G op 未达阈值。 + +| 机器 | allreduce best | alltoall best | broadcast best | reducescatter best | allgather best | sendrecv best | Verdict | +|---|---:|---:|---:|---:|---:|---:|---| +| aikubeworker0012 | 472.3 | 343.3 | 364.1 | 352.8 | 366.4 | 369.0 | FAIL | +| aikubeworker0016 | 472.4 | 344.3 | 363.6 | 353.1 | 366.4 | 368.9 | FAIL | + +关键原因: + +- `1M` size 在所有 op 上都明显低于阈值。 +- `reducescatter`、`allgather` 的 2G 也低于 405 GB/s 阈值。 +- `broadcast/sendrecv` 的 256M 低于 360 GB/s 阈值。 + +### Stress + +两台机器的 1800 秒 PyTorch BF16 GEMM 压力测试均跑满,但 telemetry 判定 FAIL。 + +| 机器 | 平均稳态功耗 | 最高温度范围 | 温差 | TFLOPS jitter | throttle events | XID | Verdict | +|---|---|---|---:|---:|---:|---:|---| +| aikubeworker0012 | 约 697-698W/GPU | 56-68C | 12C | 4.37% | 9712 | 0 | FAIL | +| aikubeworker0016 | 约 698W/GPU | 51-62C | 11C | 3.40% | 9944 | 0 | FAIL | + +失败原因: + +- GPU 间温差超过 5C 阈值。 +- 观测到大量非 idle throttle,首个原因是 `0x4`,即 `sw_power_cap`。 + +### RDMA/InfiniBand + +本轮 `test all` 是单节点 RDMA 路径,`ibping` 显示为 `local_loopback`。这份结果不能替代跨节点 RDMA 验收,但仍反映单节点 perftest read bandwidth 未达标。 + +| 机器 | ib_write_bw | ib_read_bw | ib_write_lat | ib_read_lat | Verdict | +|---|---:|---:|---:|---:|---| +| aikubeworker0012 | 49.5 GB/s PASS | 39.1 GB/s FAIL | 1.25 us PASS | 2.60 us PASS | FAIL | +| aikubeworker0016 | 48.6 GB/s PASS | 40.3 GB/s FAIL | 1.29 us PASS | 2.59 us PASS | FAIL | + +另外,两台机器都有 `mlx5_4`、`mlx5_5` 处于 
ACTIVE 但速率为 100 Gb/sec,低于当前 400G 端口阈值,因此 RDMA port check 也有 FAIL。 + +## 当前阻塞 + +1. Compute 阈值口径较严,当前实测绝对 TFLOPS 全 dtype 未达配置阈值,尤其 INT8 路径仅约 100 TFLOPS。 +2. NCCL 真实 bus BW 已可测,但多 op/size 未达 PDF 阈值。 +3. Stress 负载可跑满 30 分钟,但温差和 `sw_power_cap` throttle 导致 FAIL。 +4. 单节点 RDMA read bandwidth 未达 47 GB/s,且部分 IB 端口速率低于 400G。 +5. 跨节点 RDMA 需要继续使用单独 server/client 报告;不能把本轮 `local_loopback` 当作跨节点验收。 + +## 状态判断 + +脚本能力已经基本补齐到 PDF 验收口径:真实 nccl-tests、30 分钟 stress telemetry、NVLink、DCGM r3、RDMA perftest/ibping/counter、逐 GPU compute、8 卡 DDP training、最终任一 FAIL 即整机 FAIL 都已经跑通。 + +当前剩余问题主要不是脚本缺项,而是两台机器的实际验收数据有多项未达标。 diff --git a/reports_test_all_pdf_aikubeworker0012_20260522_182656.md b/reports_test_all_pdf_aikubeworker0012_20260522_182656.md new file mode 100644 index 0000000..283d875 --- /dev/null +++ b/reports_test_all_pdf_aikubeworker0012_20260522_182656.md @@ -0,0 +1,259 @@ +# GPU Test Report + +- **Date:** 2026-05-22T18:27:01.103760 +- **Host:** aikubeworker0012 +- **GPU:** NVIDIA H100 80GB HBM3 x8 +- **Driver:** 580.159.03 | **CUDA:** 13.0 + +## Overall Acceptance Verdict + +**Result: FAIL** + +Failed or unverified items: +- Compute Throughput: FAIL (worst FP32 52 vs >= 54) +- DCGM: ERROR: dcgmi diag -r 3 timeout after 1200s +- NCCL: FAIL +- Stress Test: FAIL +- RDMA: FAIL +- Training: FAIL (188741 tokens/sec) + +## Summary + +| Test | Result | +|------|--------| +| GPU Info | PASS (8 GPUs detected) | +| Health Check | PASS | +| Memory Bandwidth | PASS (108.1%) | +| Compute Throughput | FAIL (worst FP32 52 vs >= 54) | +| NVLink/NVSwitch | PASS | +| DCGM | ERROR: dcgmi diag -r 3 timeout after 1200s | +| NCCL | FAIL | +| Stress Test | FAIL | +| RDMA | FAIL | +| Training | FAIL (188741 tokens/sec) | + +## GPU Information + +| GPU | Model | VRAM | Temp | Power | SM Clock | +|-----|-------|------|------|-------|----------| +| 0 | NVIDIA H100 80GB HBM3 | 81559 MB | 25C | 70/700W | 345 MHz | +| 1 | NVIDIA H100 80GB HBM3 | 81559 MB | 25C | 73/700W | 345 MHz | +| 2 | NVIDIA H100 80GB HBM3 | 
81559 MB | 26C | 69/700W | 345 MHz | +| 3 | NVIDIA H100 80GB HBM3 | 81559 MB | 24C | 70/700W | 345 MHz | +| 4 | NVIDIA H100 80GB HBM3 | 81559 MB | 24C | 69/700W | 345 MHz | +| 5 | NVIDIA H100 80GB HBM3 | 81559 MB | 27C | 70/700W | 345 MHz | +| 6 | NVIDIA H100 80GB HBM3 | 81559 MB | 25C | 71/700W | 345 MHz | +| 7 | NVIDIA H100 80GB HBM3 | 81559 MB | 24C | 72/700W | 345 MHz | + +## Health Check + +**Overall: PASS** + +| GPU | Temp | Power | ECC | PCIe | Throttle | Status | +|-----|------|-------|-----|------|----------|--------| +| 0 | 25C PASS | 70W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** | +| 1 | 25C PASS | 73W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** | +| 2 | 26C PASS | 69W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** | +| 3 | 24C PASS | 70W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** | +| 4 | 24C PASS | 69W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** | +| 5 | 27C PASS | 70W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** | +| 6 | 25C PASS | 71W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** | +| 7 | 24C PASS | 72W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** | + +## Memory Bandwidth + +Source: nvbandwidth + +| Metric | Value | Peak | Efficiency | +|--------|-------|------|------------| +| H2D (PCIe) | 55.5 GB/s | 64 GB/s | 86.7% | +| D2H (PCIe) | 54.3 GB/s | 64 GB/s | 84.8% | +| D2D (NVLink) | 486.6 GB/s | 450 GB/s | 108.1% | + +**Verdict: PASS** (D2D efficiency 108.1%) + +## Compute Throughput + +| DType | Achieved (TFLOPS) | Peak | Threshold | Status | +|-------|-------------------|------|------------|--------| +| FP32 | 52.0 | 67 | >= 54 | FAIL | +| TF32 | 364.8 | 495 | >= 444 | FAIL | +| FP16 | 685.0 | 990 | >= 734 | FAIL | +| BF16 | 715.9 | 990 | >= 745 | FAIL | +| FP8 | 1166.6 | 1979 | >= 1400 | FAIL | +| FP64 | 46.9 | 0 | >= 63 | FAIL | +| INT8 | 100.4 | 0 | >= 1536 | FAIL | + +**Verdict: FAIL** (absolute TFLOPS thresholds; worst efficiency 58.9%) + +### Compute Consistency + +| DType | Min | Mean | Max | Spread | Limit | Status | 
+|-------|-----|------|-----|--------|-------|--------| +| FP32 | 51.9 | 52.0 | 52.2 | 0.58% | <= 3% | PASS | +| TF32 | 360.9 | 364.9 | 368.2 | 2.00% | <= 3% | PASS | +| FP16 | 676.0 | 685.0 | 689.9 | 2.03% | <= 3% | PASS | +| BF16 | 697.3 | 715.9 | 730.2 | 4.60% | <= 3% | FAIL | +| FP8 | 1141.8 | 1166.6 | 1180.3 | 3.30% | <= 3% | FAIL | +| FP64 | 45.8 | 46.9 | 47.7 | 4.05% | <= 3% | FAIL | +| INT8 | 100.4 | 100.4 | 100.4 | 0.00% | <= 3% | PASS | + +### Compute Per-GPU TFLOPS + +| GPU | FP32 | TF32 | FP16 | BF16 | FP8 | FP64 | INT8 | +|---|---|---|---|---|---|---|---| +| 0 | 51.9 | 368.2 | 689.5 | 730.2 | 1180.3 | 47.1 | 100.4 | +| 1 | 51.9 | 366.8 | 688.7 | 721.6 | 1170.1 | 47.7 | 100.4 | +| 2 | 51.9 | 366.3 | 689.9 | 711.3 | 1167.8 | 47.2 | 100.4 | +| 3 | 51.9 | 363.0 | 677.6 | 699.2 | 1176.3 | 46.6 | 100.4 | +| 4 | 52.2 | 365.3 | 685.0 | 725.4 | 1163.0 | 46.8 | 100.4 | +| 5 | 52.1 | 363.9 | 684.2 | 725.0 | 1172.1 | 46.9 | 100.4 | +| 6 | 51.9 | 364.4 | 688.8 | 717.3 | 1161.2 | 46.9 | 100.4 | +| 7 | 51.9 | 360.9 | 676.0 | 697.3 | 1141.8 | 45.8 | 100.4 | + +## NVLink/NVSwitch + +**Overall: PASS** + +| GPU | Active Links | Issues | +|-----|--------------|--------| +| 0 | 18/18 | OK | +| 1 | 18/18 | OK | +| 2 | 18/18 | OK | +| 3 | 18/18 | OK | +| 4 | 18/18 | OK | +| 5 | 18/18 | OK | +| 6 | 18/18 | OK | +| 7 | 18/18 | OK | + +## DCGM Diagnostic + +**Overall: FAIL** (dcgmi diag -r 3 timeout after 1200s) + +## NCCL Multi-GPU + +Source: nccl-tests | GPUs: 8 + +| Operation | Bus BW (GB/s) | Threshold | Status | +|-----------|---------------|-----------|--------| +| allreduce | 472.4 | >= 405 | FAIL | +| alltoall | 344.4 | >= 315 | FAIL | +| broadcast | 363.8 | >= 360 | FAIL | +| reducescatter | 353.0 | >= 405 | FAIL | +| allgather | 366.4 | >= 405 | FAIL | +| sendrecv | 368.9 | >= 360 | FAIL | + +### NCCL allreduce by size + +| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status | 
+|------|---------------------|-------|------|--------|-----------|--------| +| 1M | 24.0, 24.9, 24.7 | 24.0 | 24.5 | 1.57% | >= 405 | FAIL | +| 256M | 421.4, 421.7, 421.4 | 421.4 | 421.5 | 0.03% | >= 405 | PASS | +| 2G | 471.8, 473.0, 472.3 | 471.8 | 472.4 | 0.10% | >= 405 | PASS | + +### NCCL alltoall by size + +| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status | +|------|---------------------|-------|------|--------|-----------|--------| +| 1M | 8.1, 8.0, 8.0 | 8.0 | 8.0 | 0.59% | >= 315 | FAIL | +| 256M | 312.3, 310.9, 319.2 | 310.9 | 314.1 | 1.15% | >= 315 | FAIL | +| 2G | 343.1, 346.2, 344.0 | 343.1 | 344.4 | 0.38% | >= 315 | PASS | + +### NCCL broadcast by size + +| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status | +|------|---------------------|-------|------|--------|-----------|--------| +| 1M | 14.6, 13.6, 14.5 | 13.6 | 14.2 | 3.16% | >= 360 | FAIL | +| 256M | 343.8, 344.2, 344.5 | 343.8 | 344.2 | 0.08% | >= 360 | FAIL | +| 2G | 363.5, 363.3, 364.7 | 363.3 | 363.8 | 0.17% | >= 360 | PASS | + +### NCCL reducescatter by size + +| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status | +|------|---------------------|-------|------|--------|-----------|--------| +| 1M | 14.1, 14.3, 14.3 | 14.1 | 14.2 | 0.66% | >= 405 | FAIL | +| 256M | 328.1, 328.3, 328.3 | 328.1 | 328.2 | 0.03% | >= 405 | FAIL | +| 2G | 354.0, 352.6, 352.3 | 352.3 | 353.0 | 0.21% | >= 405 | FAIL | + +### NCCL allgather by size + +| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status | +|------|---------------------|-------|------|--------|-----------|--------| +| 1M | 14.5, 14.5, 14.3 | 14.3 | 14.4 | 0.65% | >= 405 | FAIL | +| 256M | 350.7, 350.7, 350.5 | 350.5 | 350.6 | 0.03% | >= 405 | FAIL | +| 2G | 366.6, 366.3, 366.3 | 366.3 | 366.4 | 0.04% | >= 405 | FAIL | + +### NCCL sendrecv by size + +| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status | 
+|------|---------------------|-------|------|--------|-----------|--------| +| 1M | 18.5, 18.4, 18.1 | 18.1 | 18.3 | 0.93% | >= 360 | FAIL | +| 256M | 352.3, 350.6, 350.5 | 350.5 | 351.1 | 0.24% | >= 360 | FAIL | +| 2G | 368.8, 369.0, 368.8 | 368.8 | 368.9 | 0.03% | >= 360 | PASS | + +**Overall: FAIL** + +## Stress Test + +- **Source:** pytorch +- **Duration:** 1800s (requested 1800s) +- **Telemetry samples:** 1541 +- **Max temp:** {0: 60.0, 1: 60.0, 2: 68.0, 3: 56.0, 4: 60.0, 5: 68.0, 6: 65.0, 7: 56.0} +- **Avg power:** {0: 697.7, 1: 697.4, 2: 697.2, 3: 697.7, 4: 697.5, 5: 698.0, 6: 697.8, 7: 698.4} +- **Temp delta:** 12.0 C +- **TFLOPS jitter:** 3.16% +- **Steady TFLOPS samples:** 37676 +- **Throttle events:** 11912 +- **XID events:** 0 +- **Failure reasons:** + - GPU temperature delta 12.0C exceeds 5.0C + - non-idle throttle reasons observed in 11912 samples (first: GPU 0 0x4) +- **Result: FAIL** + +## RDMA/InfiniBand + +### RDMA Port Checks + +| Device | Port | State | Rate | Required | Status | +|--------|------|-------|------|----------|--------| +| mlx5_0 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS | +| mlx5_1 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS | +| mlx5_4 | 1 | 4: ACTIVE | 100 Gb/sec (2X HDR) | >= 400Gbps ACTIVE | FAIL | +| mlx5_5 | 1 | 4: ACTIVE | 100 Gb/sec (2X HDR) | >= 400Gbps ACTIVE | FAIL | +| mlx5_6 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS | +| mlx5_7 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS | + +| Test | Value | Threshold | Status | +|------|-------|-----------|--------| +| ib_write_bw | 49.2 GB/s | >= 47 GB/s | PASS | +| ib_read_bw | 39.1 GB/s | >= 47 GB/s | FAIL | +| ib_write_lat | 5.68 us | <= 2 us | FAIL | +| ib_read_lat | 16.00 us | <= 3.5 us | FAIL | +| ibping | target=0x58 count=5 | 0% packet loss | PASS | + +- **PFC/ECN/CNP/congestion counters checked:** 0 +- **PFC/ECN/CNP/congestion non-zero:** no +- **Failure reasons:** + - mlx5_4 port 1 
state/rate failed (4: ACTIVE, 100 Gb/sec (2X HDR); required >= 400.0Gbps ACTIVE) + - mlx5_5 port 1 state/rate failed (4: ACTIVE, 100 Gb/sec (2X HDR); required >= 400.0Gbps ACTIVE) + - ib_read_bw bandwidth 39.11GB/s < 47GB/s + - ib_write_lat latency 5.68us > 2.0us + - ib_read_lat latency 16.0us > 3.5us +**Overall: FAIL** + +## Training Simulation + +| Metric | Value | +|--------|-------| +| Model | synthetic_transformer_1.5b | +| Params | 1470.5M | +| Throughput | 188741 tokens/sec | +| Avg Step Time | 86.8 ms | +| Peak Memory | 18.1 GB | +| Final Loss | 0.0041 | +| Step Jitter | 626.74% | +| Distributed Mode | ddp | +| Verdict | FAIL (188741 tokens/sec) | + +--- +*Generated by GPU Test Suite v0.2.0* \ No newline at end of file diff --git a/reports_test_all_pdf_aikubeworker0016_20260522_182856.md b/reports_test_all_pdf_aikubeworker0016_20260522_182856.md new file mode 100644 index 0000000..dbee788 --- /dev/null +++ b/reports_test_all_pdf_aikubeworker0016_20260522_182856.md @@ -0,0 +1,259 @@ +# GPU Test Report + +- **Date:** 2026-05-22T18:29:01.245683 +- **Host:** aikubeworker0016 +- **GPU:** NVIDIA H100 80GB HBM3 x8 +- **Driver:** 580.159.03 | **CUDA:** 13.0 + +## Overall Acceptance Verdict + +**Result: FAIL** + +Failed or unverified items: +- Compute Throughput: FAIL (worst FP32 52 vs >= 54) +- DCGM: ERROR: dcgmi diag -r 3 timeout after 1200s +- NCCL: FAIL +- Stress Test: FAIL +- RDMA: FAIL +- Training: FAIL (193836 tokens/sec) + +## Summary + +| Test | Result | +|------|--------| +| GPU Info | PASS (8 GPUs detected) | +| Health Check | PASS | +| Memory Bandwidth | PASS (108.1%) | +| Compute Throughput | FAIL (worst FP32 52 vs >= 54) | +| NVLink/NVSwitch | PASS | +| DCGM | ERROR: dcgmi diag -r 3 timeout after 1200s | +| NCCL | FAIL | +| Stress Test | FAIL | +| RDMA | FAIL | +| Training | FAIL (193836 tokens/sec) | + +## GPU Information + +| GPU | Model | VRAM | Temp | Power | SM Clock | +|-----|-------|------|------|-------|----------| +| 0 | NVIDIA H100 80GB HBM3 
| 81559 MB | 19C | 70/700W | 345 MHz | +| 1 | NVIDIA H100 80GB HBM3 | 81559 MB | 20C | 67/700W | 345 MHz | +| 2 | NVIDIA H100 80GB HBM3 | 81559 MB | 20C | 67/700W | 345 MHz | +| 3 | NVIDIA H100 80GB HBM3 | 81559 MB | 19C | 67/700W | 345 MHz | +| 4 | NVIDIA H100 80GB HBM3 | 81559 MB | 19C | 67/700W | 345 MHz | +| 5 | NVIDIA H100 80GB HBM3 | 81559 MB | 21C | 69/700W | 345 MHz | +| 6 | NVIDIA H100 80GB HBM3 | 81559 MB | 19C | 68/700W | 345 MHz | +| 7 | NVIDIA H100 80GB HBM3 | 81559 MB | 19C | 66/700W | 345 MHz | + +## Health Check + +**Overall: PASS** + +| GPU | Temp | Power | ECC | PCIe | Throttle | Status | +|-----|------|-------|-----|------|----------|--------| +| 0 | 19C PASS | 70W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** | +| 1 | 20C PASS | 67W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** | +| 2 | 20C PASS | 67W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** | +| 3 | 19C PASS | 67W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** | +| 4 | 19C PASS | 67W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** | +| 5 | 21C PASS | 69W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** | +| 6 | 19C PASS | 68W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** | +| 7 | 19C PASS | 66W PASS | S:0 D:0 | Gen5x16 | PASS | **PASS** | + +## Memory Bandwidth + +Source: nvbandwidth + +| Metric | Value | Peak | Efficiency | +|--------|-------|------|------------| +| H2D (PCIe) | 55.5 GB/s | 64 GB/s | 86.7% | +| D2H (PCIe) | 54.7 GB/s | 64 GB/s | 85.5% | +| D2D (NVLink) | 486.6 GB/s | 450 GB/s | 108.1% | + +**Verdict: PASS** (D2D efficiency 108.1%) + +## Compute Throughput + +| DType | Achieved (TFLOPS) | Peak | Threshold | Status | +|-------|-------------------|------|------------|--------| +| FP32 | 52.0 | 67 | >= 54 | FAIL | +| TF32 | 366.2 | 495 | >= 444 | FAIL | +| FP16 | 684.8 | 990 | >= 734 | FAIL | +| BF16 | 720.7 | 990 | >= 745 | FAIL | +| FP8 | 1180.3 | 1979 | >= 1400 | FAIL | +| FP64 | 47.3 | 0 | >= 63 | FAIL | +| INT8 | 100.5 | 0 | >= 1536 | FAIL | + +**Verdict: FAIL** (absolute TFLOPS thresholds; worst 
efficiency 59.6%) + +### Compute Consistency + +| DType | Min | Mean | Max | Spread | Limit | Status | +|-------|-----|------|-----|--------|-------|--------| +| FP32 | 51.9 | 52.0 | 52.2 | 0.58% | <= 3% | PASS | +| TF32 | 361.1 | 366.2 | 368.9 | 2.13% | <= 3% | PASS | +| FP16 | 672.6 | 684.8 | 695.0 | 3.27% | <= 3% | FAIL | +| BF16 | 703.6 | 720.7 | 734.2 | 4.25% | <= 3% | FAIL | +| FP8 | 1158.6 | 1180.3 | 1241.8 | 7.05% | <= 3% | FAIL | +| FP64 | 46.7 | 47.3 | 48.0 | 2.75% | <= 3% | PASS | +| INT8 | 100.4 | 100.5 | 101.1 | 0.70% | <= 3% | PASS | + +### Compute Per-GPU TFLOPS + +| GPU | FP32 | TF32 | FP16 | BF16 | FP8 | FP64 | INT8 | +|---|---|---|---|---|---|---|---| +| 0 | 51.9 | 361.1 | 673.3 | 703.6 | 1158.6 | 46.7 | 100.4 | +| 1 | 52.0 | 367.0 | 684.0 | 725.7 | 1184.3 | 47.3 | 100.4 | +| 2 | 52.2 | 368.7 | 695.0 | 734.2 | 1197.7 | 48.0 | 100.4 | +| 3 | 51.9 | 367.8 | 688.0 | 708.1 | 1174.8 | 47.3 | 100.4 | +| 4 | 52.0 | 365.2 | 688.4 | 718.2 | 1160.5 | 47.0 | 101.1 | +| 5 | 52.1 | 368.9 | 684.2 | 733.7 | 1160.5 | 47.3 | 100.4 | +| 6 | 51.9 | 364.0 | 672.6 | 715.6 | 1164.4 | 47.1 | 100.4 | +| 7 | 51.9 | 367.0 | 692.5 | 726.5 | 1241.8 | 47.6 | 100.4 | + +## NVLink/NVSwitch + +**Overall: PASS** + +| GPU | Active Links | Issues | +|-----|--------------|--------| +| 0 | 18/18 | OK | +| 1 | 18/18 | OK | +| 2 | 18/18 | OK | +| 3 | 18/18 | OK | +| 4 | 18/18 | OK | +| 5 | 18/18 | OK | +| 6 | 18/18 | OK | +| 7 | 18/18 | OK | + +## DCGM Diagnostic + +**Overall: FAIL** (dcgmi diag -r 3 timeout after 1200s) + +## NCCL Multi-GPU + +Source: nccl-tests | GPUs: 8 + +| Operation | Bus BW (GB/s) | Threshold | Status | +|-----------|---------------|-----------|--------| +| allreduce | 472.5 | >= 405 | FAIL | +| alltoall | 344.2 | >= 315 | FAIL | +| broadcast | 363.8 | >= 360 | FAIL | +| reducescatter | 352.5 | >= 405 | FAIL | +| allgather | 366.8 | >= 405 | FAIL | +| sendrecv | 369.0 | >= 360 | FAIL | + +### NCCL allreduce by size + +| Size | Runs Bus BW (GB/s) | Worst | Mean | 
StdDev | Threshold | Status | +|------|---------------------|-------|------|--------|-----------|--------| +| 1M | 24.7, 24.1, 24.5 | 24.1 | 24.4 | 1.02% | >= 405 | FAIL | +| 256M | 421.8, 422.1, 421.4 | 421.4 | 421.8 | 0.07% | >= 405 | PASS | +| 2G | 472.8, 472.2, 472.6 | 472.2 | 472.5 | 0.05% | >= 405 | PASS | + +### NCCL alltoall by size + +| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status | +|------|---------------------|-------|------|--------|-----------|--------| +| 1M | 8.0, 8.0, 7.9 | 7.9 | 8.0 | 0.59% | >= 315 | FAIL | +| 256M | 326.8, 315.4, 315.8 | 315.4 | 319.3 | 1.65% | >= 315 | PASS | +| 2G | 344.2, 343.8, 344.6 | 343.8 | 344.2 | 0.09% | >= 315 | PASS | + +### NCCL broadcast by size + +| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status | +|------|---------------------|-------|------|--------|-----------|--------| +| 1M | 14.4, 14.2, 14.1 | 14.1 | 14.2 | 0.88% | >= 360 | FAIL | +| 256M | 345.3, 344.9, 344.4 | 344.4 | 344.9 | 0.11% | >= 360 | FAIL | +| 2G | 363.6, 363.9, 363.8 | 363.6 | 363.8 | 0.03% | >= 360 | PASS | + +### NCCL reducescatter by size + +| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status | +|------|---------------------|-------|------|--------|-----------|--------| +| 1M | 14.3, 14.1, 14.1 | 14.1 | 14.2 | 0.67% | >= 405 | FAIL | +| 256M | 328.2, 328.3, 328.4 | 328.2 | 328.3 | 0.02% | >= 405 | FAIL | +| 2G | 352.2, 352.7, 352.6 | 352.2 | 352.5 | 0.06% | >= 405 | FAIL | + +### NCCL allgather by size + +| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status | +|------|---------------------|-------|------|--------|-----------|--------| +| 1M | 14.2, 14.5, 14.3 | 14.2 | 14.3 | 0.87% | >= 405 | FAIL | +| 256M | 350.6, 350.6, 350.5 | 350.5 | 350.6 | 0.01% | >= 405 | FAIL | +| 2G | 367.0, 366.8, 366.5 | 366.5 | 366.8 | 0.06% | >= 405 | FAIL | + +### NCCL sendrecv by size + +| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status | 
+|------|---------------------|-------|------|--------|-----------|--------| +| 1M | 18.4, 18.2, 18.6 | 18.2 | 18.4 | 0.89% | >= 360 | FAIL | +| 256M | 350.7, 350.8, 351.1 | 350.7 | 350.9 | 0.05% | >= 360 | FAIL | +| 2G | 369.0, 369.0, 368.9 | 368.9 | 369.0 | 0.01% | >= 360 | PASS | + +**Overall: FAIL** + +## Stress Test + +- **Source:** pytorch +- **Duration:** 1800s (requested 1800s) +- **Telemetry samples:** 1541 +- **Max temp:** {0: 51.0, 1: 59.0, 2: 62.0, 3: 53.0, 4: 53.0, 5: 62.0, 6: 57.0, 7: 53.0} +- **Avg power:** {0: 698.7, 1: 698.0, 2: 698.1, 3: 697.9, 4: 697.7, 5: 698.2, 6: 698.0, 7: 697.7} +- **Temp delta:** 11.0 C +- **TFLOPS jitter:** 3.05% +- **Steady TFLOPS samples:** 37841 +- **Throttle events:** 11912 +- **XID events:** 0 +- **Failure reasons:** + - GPU temperature delta 11.0C exceeds 5.0C + - non-idle throttle reasons observed in 11912 samples (first: GPU 0 0x4) +- **Result: FAIL** + +## RDMA/InfiniBand + +### RDMA Port Checks + +| Device | Port | State | Rate | Required | Status | +|--------|------|-------|------|----------|--------| +| mlx5_0 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS | +| mlx5_1 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS | +| mlx5_4 | 1 | 4: ACTIVE | 100 Gb/sec (2X HDR) | >= 400Gbps ACTIVE | FAIL | +| mlx5_5 | 1 | 4: ACTIVE | 100 Gb/sec (2X HDR) | >= 400Gbps ACTIVE | FAIL | +| mlx5_6 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS | +| mlx5_7 | 1 | 4: ACTIVE | 400 Gb/sec (4X NDR) | >= 400Gbps ACTIVE | PASS | + +| Test | Value | Threshold | Status | +|------|-------|-----------|--------| +| ib_write_bw | 48.4 GB/s | >= 47 GB/s | PASS | +| ib_read_bw | 40.3 GB/s | >= 47 GB/s | FAIL | +| ib_write_lat | 2.44 us | <= 2 us | FAIL | +| ib_read_lat | 16.00 us | <= 3.5 us | FAIL | +| ibping | target=0x4b count=5 | 0% packet loss | PASS | + +- **PFC/ECN/CNP/congestion counters checked:** 0 +- **PFC/ECN/CNP/congestion non-zero:** no +- **Failure reasons:** + - mlx5_4 port 1 
state/rate failed (4: ACTIVE, 100 Gb/sec (2X HDR); required >= 400.0Gbps ACTIVE) + - mlx5_5 port 1 state/rate failed (4: ACTIVE, 100 Gb/sec (2X HDR); required >= 400.0Gbps ACTIVE) + - ib_read_bw bandwidth 40.29GB/s < 47GB/s + - ib_write_lat latency 2.44us > 2.0us + - ib_read_lat latency 16.0us > 3.5us +**Overall: FAIL** + +## Training Simulation + +| Metric | Value | +|--------|-------| +| Model | synthetic_transformer_1.5b | +| Params | 1470.5M | +| Throughput | 193836 tokens/sec | +| Avg Step Time | 84.5 ms | +| Peak Memory | 18.1 GB | +| Final Loss | 0.004 | +| Step Jitter | 521.24% | +| Distributed Mode | ddp | +| Verdict | FAIL (193836 tokens/sec) | + +--- +*Generated by GPU Test Suite v0.2.0* \ No newline at end of file diff --git a/reports_training_warmup_aikubeworker0012_20260522_194528.md b/reports_training_warmup_aikubeworker0012_20260522_194528.md new file mode 100644 index 0000000..948e866 --- /dev/null +++ b/reports_training_warmup_aikubeworker0012_20260522_194528.md @@ -0,0 +1,43 @@ +# GPU Test Report + +- **Date:** 2026-05-22T19:46:07.450315 +- **Host:** aikubeworker0012 + +## Overall Acceptance Verdict + +**Result: FAIL** + +Missing required evidence: +- GPU Info +- Health Check +- Memory Bandwidth +- Compute Throughput +- NVLink/NVSwitch +- NCCL +- Stress Test +- RDMA +- DCGM + +## Summary + +| Test | Result | +|------|--------| +| Training | PASS (216654 tokens/sec) | + +## Training Simulation + +| Metric | Value | +|--------|-------| +| Model | synthetic_transformer_1.5b | +| Params | 1470.5M | +| Throughput | 216654 tokens/sec | +| Avg Step Time | 75.6 ms | +| Warmup Steps | 5 | +| Peak Memory | 18.1 GB | +| Final Loss | 0.0039 | +| Step Jitter | 0.87% | +| Distributed Mode | ddp | +| Verdict | PASS (216654 tokens/sec) | + +--- +*Generated by GPU Test Suite v0.2.0* \ No newline at end of file diff --git a/reports_training_warmup_aikubeworker0016_20260522_194609.md b/reports_training_warmup_aikubeworker0016_20260522_194609.md new file mode 100644 
index 0000000..61570ca --- /dev/null +++ b/reports_training_warmup_aikubeworker0016_20260522_194609.md @@ -0,0 +1,43 @@ +# GPU Test Report + +- **Date:** 2026-05-22T19:46:48.023650 +- **Host:** aikubeworker0016 + +## Overall Acceptance Verdict + +**Result: FAIL** + +Missing required evidence: +- GPU Info +- Health Check +- Memory Bandwidth +- Compute Throughput +- NVLink/NVSwitch +- NCCL +- Stress Test +- RDMA +- DCGM + +## Summary + +| Test | Result | +|------|--------| +| Training | PASS (217236 tokens/sec) | + +## Training Simulation + +| Metric | Value | +|--------|-------| +| Model | synthetic_transformer_1.5b | +| Params | 1470.5M | +| Throughput | 217236 tokens/sec | +| Avg Step Time | 75.4 ms | +| Warmup Steps | 5 | +| Peak Memory | 18.1 GB | +| Final Loss | 0.0039 | +| Step Jitter | 1.23% | +| Distributed Mode | ddp | +| Verdict | PASS (217236 tokens/sec) | + +--- +*Generated by GPU Test Suite v0.2.0* \ No newline at end of file diff --git a/test_all_aikubeworker0016_中文结果与验收差距.md b/test_all_aikubeworker0016_中文结果与验收差距.md new file mode 100644 index 0000000..d05e25a --- /dev/null +++ b/test_all_aikubeworker0016_中文结果与验收差距.md @@ -0,0 +1,73 @@ +# aikubeworker0016 `test all` 中文结果与 H100 验收差距 + +测试命令: + +```bash +/root/gpu-test-venv/bin/python gpu_tester.py --test all --report --format json --output reports_all/test_all.json +``` + +测试机器:`aikubeworker0016 / 172.72.8.16` + +原始结果:`reports_all_aikubeworker0016.json` + +## 先说结论 + +项目输出里最后显示 `Suite complete: 8/8 tests passed`,但这个结论不能直接当成生产验收 PASS。 + +原因是当前 `all` 的汇总逻辑主要看模块有没有抛 `error`,没有把 `nccl.passed=false` 和 `rdma.passed=false` 当成整套失败。因此按 PDF 的生产验收口径,这台机器目前不能算完整验收通过。 + +## 本次 `test all` 实际结果 + +| 模块 | 当前结果 | 关键数据 | 按 PDF 验收看 | +| --- | --- | --- | --- | +| GPU 信息 | 已覆盖 | 8 张 H100,Driver 580.159.03,CUDA 13.0 | 基础信息 OK,但 NVLink 链路专项不足 | +| 健康检查 | PASS | health.passed=true | 基础健康 OK,但缺 retired pages、AER/Replay、fabricmanager 日志、stress 期间采样 | +| Memory | 有结果 | H2D 55.5 GB/s,D2H 55.3 GB/s,D2D 486.5 GB/s | 单项看起来不错,但缺 8x8 P2P 
矩阵验收 | +| Compute | 有结果 | FP32 51.9,TF32 357.0,FP16 664.0,BF16 700.1,FP8 1116.2 TFLOPS | 对 PDF 绝对门槛不全通过 | +| NCCL | 实际不合格 | source=torchrun_fallback,`nccl.passed=false`,无 bus BW 性能数据 | 不满足 PDF NCCL 性能验收 | +| Stress | PASS | PyTorch fallback,60 秒,8 GPU 状态 PASS | 不满足 PDF 的 30/60 分钟 burn-in;负载只有约 64MB/卡,压力明显不够 | +| RDMA/IB | 实际不合格 | ib_write_bw/read_bw 0.13 GB/s WARN;write_lat 4.10us PASS;read_lat 16us WARN | 当前是 localhost 单节点口径,不满足 PDF RDMA 生产验收 | +| Training | 有结果 | synthetic 1.47B,52471 tokens/s,peak 27.31GB,loss 0.0041 | tokens/s 过线,但代码实际不是 8 卡分布式训练验收 | + +## Compute 对 PDF 门槛的判断 + +PDF H100 PASS 门槛: + +| DType | 本次结果 | PDF PASS 门槛 | 判断 | +| --- | ---: | ---: | --- | +| FP32 | 51.9 TFLOPS | >= 54 | WARN | +| TF32 | 357.0 TFLOPS | >= 444 | FAIL | +| FP16 | 664.0 TFLOPS | >= 734 | WARN | +| BF16 | 700.1 TFLOPS | >= 745 | WARN | +| FP8 | 1116.2 TFLOPS | >= 1400 | FAIL | +| FP64 | 未测 | >= 63 | 缺失 | +| INT8 | 未测 | >= 1536 | 缺失 | + +说明:PDF 里 WARN 区间是 PASS 门槛的 90%-100%。TF32 和 FP8 低于 90% 门槛,所以按 PDF 是 FAIL。 + +## 如果只执行当前仓库 `test all`,少了什么 + +1. 少 NVLink 专项验收:没有逐卡检查 18 条链路、25GB/s 速率、CRC/Replay/Recovery error = 0。 +2. 少 DCGM 诊断:没有 `dcgmi diag -r 3`。 +3. 少长时间 burn-in:当前是 60 秒,不是 30/60 分钟。 +4. 少 stress 期间 1 秒级采样:温度、功耗、throttle、XID、TFLOPS 抖动都没按 PDF 统计。 +5. 少真正 NCCL 性能:当前退化到 torchrun fallback,没有 `nccl-tests` bus BW。 +6. 少 NCCL 全操作和三档消息:PDF 要 AllReduce/AllGather/ReduceScatter/Broadcast/SendRecv/AllToAll,且 1MB/256MB/2GB 都过线。 +7. 少 NCCL 重复 3 次取最差值和标准差 <=3%。 +8. 少完整 P2P 8x8 矩阵:没有非对角均值、最小值、偏差判断。 +9. 少逐 GPU compute 一致性:没有真正分别测 8 卡同 dtype 极差/均值 <=3%。 +10. 少 FP64 和 INT8。 +11. 少 RDMA 生产口径:当前 `localhost`,64KB message,阈值 10us;PDF 要 4MB BW、8B latency、write/read >=47GB/s、write_lat <=2us、read_lat <=3.5us。 +12. 少 PFC/ECN 错误计数和 ibping 双向。 +13. 少真正 8 卡分布式 Training Simulation 验收。 +14. 少严格最终 verdict:当前代码会把 `passed=false` 的模块也计入“通过”,这是验收逻辑漏洞。 + +## 建议 + +`test all` 可以继续作为快速初筛跑,但如果目标是对齐 `H100_production_acceptance.pdf`,需要把它升级成“生产验收模式”。优先级如下: + +1. 先修汇总 verdict:任何子模块 `passed=false` 必须导致整机 FAIL。 +2. 
先装好 `nccl-tests` 和 `gpu-burn`,否则 NCCL/Stress 都不是生产口径。 +3. 增加 NVLink、DCGM、长时间 telemetry、P2P 矩阵。 +4. 改 RDMA 为生产参数,且支持跨节点。 +5. 改 compute 为逐 GPU 验收,training 改为 8 卡分布式验收。