921 lines
24 KiB
JSON
921 lines
24 KiB
JSON
{
|
|
"timestamp": "2026-05-22T15:49:02.368516",
|
|
"gpu_info": {
|
|
"driver_version": "580.159.03",
|
|
"cuda_version": "13.0",
|
|
"gpu_count": 8,
|
|
"gpus": [
|
|
{
|
|
"index": 0,
|
|
"name": "NVIDIA H100 80GB HBM3",
|
|
"uuid": "GPU-dfbc9513-255d-4fe7-2b77-7b1ec3972e75",
|
|
"pci_bus_id": "00000000:18:00.0",
|
|
"pcie_link_gen": 5,
|
|
"pcie_link_width": 16,
|
|
"vram_total_mb": 81559,
|
|
"vram_used_mb": 4,
|
|
"vram_free_mb": 81076,
|
|
"power_draw": 69.98,
|
|
"power_limit": 700.0,
|
|
"clock_sm": 345,
|
|
"clock_mem": 2619,
|
|
"temperature": 21,
|
|
"fan_speed": 0,
|
|
"persistence_mode": false,
|
|
"compute_mode": "Default",
|
|
"serial_number": "1651924016120",
|
|
"ecc_errors_single": 0,
|
|
"ecc_errors_double": 0
|
|
},
|
|
{
|
|
"index": 1,
|
|
"name": "NVIDIA H100 80GB HBM3",
|
|
"uuid": "GPU-bb845ef7-d7b5-f011-9395-ea74274e2282",
|
|
"pci_bus_id": "00000000:2A:00.0",
|
|
"pcie_link_gen": 5,
|
|
"pcie_link_width": 16,
|
|
"vram_total_mb": 81559,
|
|
"vram_used_mb": 4,
|
|
"vram_free_mb": 81076,
|
|
"power_draw": 67.54,
|
|
"power_limit": 700.0,
|
|
"clock_sm": 345,
|
|
"clock_mem": 2619,
|
|
"temperature": 21,
|
|
"fan_speed": 0,
|
|
"persistence_mode": false,
|
|
"compute_mode": "Default",
|
|
"serial_number": "1651924015483",
|
|
"ecc_errors_single": 0,
|
|
"ecc_errors_double": 0
|
|
},
|
|
{
|
|
"index": 2,
|
|
"name": "NVIDIA H100 80GB HBM3",
|
|
"uuid": "GPU-3720cf13-2a34-be38-27be-0a7adc4addc4",
|
|
"pci_bus_id": "00000000:3A:00.0",
|
|
"pcie_link_gen": 5,
|
|
"pcie_link_width": 16,
|
|
"vram_total_mb": 81559,
|
|
"vram_used_mb": 4,
|
|
"vram_free_mb": 81076,
|
|
"power_draw": 66.82,
|
|
"power_limit": 700.0,
|
|
"clock_sm": 345,
|
|
"clock_mem": 2619,
|
|
"temperature": 22,
|
|
"fan_speed": 0,
|
|
"persistence_mode": false,
|
|
"compute_mode": "Default",
|
|
"serial_number": "1651924025595",
|
|
"ecc_errors_single": 0,
|
|
"ecc_errors_double": 0
|
|
},
|
|
{
|
|
"index": 3,
|
|
"name": "NVIDIA H100 80GB HBM3",
|
|
"uuid": "GPU-87080b2d-ac43-be0d-d574-c193078850ae",
|
|
"pci_bus_id": "00000000:5D:00.0",
|
|
"pcie_link_gen": 5,
|
|
"pcie_link_width": 16,
|
|
"vram_total_mb": 81559,
|
|
"vram_used_mb": 4,
|
|
"vram_free_mb": 81076,
|
|
"power_draw": 67.02,
|
|
"power_limit": 700.0,
|
|
"clock_sm": 345,
|
|
"clock_mem": 2619,
|
|
"temperature": 21,
|
|
"fan_speed": 0,
|
|
"persistence_mode": false,
|
|
"compute_mode": "Default",
|
|
"serial_number": "1651924016862",
|
|
"ecc_errors_single": 0,
|
|
"ecc_errors_double": 0
|
|
},
|
|
{
|
|
"index": 4,
|
|
"name": "NVIDIA H100 80GB HBM3",
|
|
"uuid": "GPU-599bd883-cc5c-a5dd-6c33-c15f7049da48",
|
|
"pci_bus_id": "00000000:9A:00.0",
|
|
"pcie_link_gen": 5,
|
|
"pcie_link_width": 16,
|
|
"vram_total_mb": 81559,
|
|
"vram_used_mb": 4,
|
|
"vram_free_mb": 81076,
|
|
"power_draw": 67.24,
|
|
"power_limit": 700.0,
|
|
"clock_sm": 345,
|
|
"clock_mem": 2619,
|
|
"temperature": 21,
|
|
"fan_speed": 0,
|
|
"persistence_mode": false,
|
|
"compute_mode": "Default",
|
|
"serial_number": "1651924025670",
|
|
"ecc_errors_single": 0,
|
|
"ecc_errors_double": 0
|
|
},
|
|
{
|
|
"index": 5,
|
|
"name": "NVIDIA H100 80GB HBM3",
|
|
"uuid": "GPU-a1c6bba4-61b0-e623-06c9-9c88635e26fe",
|
|
"pci_bus_id": "00000000:AB:00.0",
|
|
"pcie_link_gen": 5,
|
|
"pcie_link_width": 16,
|
|
"vram_total_mb": 81559,
|
|
"vram_used_mb": 4,
|
|
"vram_free_mb": 81076,
|
|
"power_draw": 69.31,
|
|
"power_limit": 700.0,
|
|
"clock_sm": 345,
|
|
"clock_mem": 2619,
|
|
"temperature": 23,
|
|
"fan_speed": 0,
|
|
"persistence_mode": false,
|
|
"compute_mode": "Default",
|
|
"serial_number": "1651924027166",
|
|
"ecc_errors_single": 0,
|
|
"ecc_errors_double": 0
|
|
},
|
|
{
|
|
"index": 6,
|
|
"name": "NVIDIA H100 80GB HBM3",
|
|
"uuid": "GPU-98745a0c-39bd-3e56-d6ca-54ba3647ab6d",
|
|
"pci_bus_id": "00000000:BA:00.0",
|
|
"pcie_link_gen": 5,
|
|
"pcie_link_width": 16,
|
|
"vram_total_mb": 81559,
|
|
"vram_used_mb": 4,
|
|
"vram_free_mb": 81076,
|
|
"power_draw": 67.84,
|
|
"power_limit": 700.0,
|
|
"clock_sm": 345,
|
|
"clock_mem": 2619,
|
|
"temperature": 21,
|
|
"fan_speed": 0,
|
|
"persistence_mode": false,
|
|
"compute_mode": "Default",
|
|
"serial_number": "1651924026234",
|
|
"ecc_errors_single": 0,
|
|
"ecc_errors_double": 0
|
|
},
|
|
{
|
|
"index": 7,
|
|
"name": "NVIDIA H100 80GB HBM3",
|
|
"uuid": "GPU-8c73bd8b-666b-357e-ac5d-c75ac7a759db",
|
|
"pci_bus_id": "00000000:DB:00.0",
|
|
"pcie_link_gen": 5,
|
|
"pcie_link_width": 16,
|
|
"vram_total_mb": 81559,
|
|
"vram_used_mb": 4,
|
|
"vram_free_mb": 81076,
|
|
"power_draw": 66.21,
|
|
"power_limit": 700.0,
|
|
"clock_sm": 345,
|
|
"clock_mem": 2619,
|
|
"temperature": 21,
|
|
"fan_speed": 0,
|
|
"persistence_mode": false,
|
|
"compute_mode": "Default",
|
|
"serial_number": "1651924027255",
|
|
"ecc_errors_single": 0,
|
|
"ecc_errors_double": 0
|
|
}
|
|
],
|
|
"topology": "\t\u001b[4mGPU0\tGPU1\tGPU2\tGPU3\tGPU4\tGPU5\tGPU6\tGPU7\tNIC0\tNIC1\tNIC2\tNIC3\tNIC4\tNIC5\tNIC6\tNIC7\tNIC8\tNIC9\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\u001b[0m\nGPU0\t X \tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\tPIX\tNODE\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t0-55,112-167\t0\t\tN/A\nGPU1\tNV18\t X \tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\tNODE\tPIX\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t0-55,112-167\t0\t\tN/A\nGPU2\tNV18\tNV18\t X \tNV18\tNV18\tNV18\tNV18\tNV18\tNODE\tNODE\tPIX\tPIX\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t0-55,112-167\t0\t\tN/A\nGPU3\tNV18\tNV18\tNV18\t X \tNV18\tNV18\tNV18\tNV18\tNODE\tNODE\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t0-55,112-167\t0\t\tN/A\nGPU4\tNV18\tNV18\tNV18\tNV18\t X \tNV18\tNV18\tNV18\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tPIX\tNODE\tNODE\tNODE\t56-111,168-223\t1\t\tN/A\nGPU5\tNV18\tNV18\tNV18\tNV18\tNV18\t X \tNV18\tNV18\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\tPIX\tNODE\tNODE\t56-111,168-223\t1\t\tN/A\nGPU6\tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\t X \tNV18\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tPIX\tPIX\t56-111,168-223\t1\t\tN/A\nGPU7\tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\t X \tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tNODE\tNODE\t56-111,168-223\t1\t\tN/A\nNIC0\tPIX\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t X \tNODE\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC1\tNODE\tPIX\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tNODE\t X \tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC2\tNODE\tNODE\tPIX\tNODE\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\t X \tPIX\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC3\tNODE\tNODE\tPIX\tNODE\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tPIX\t X \tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC4\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tNODE\tNODE\t X \tPIX\tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC5\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tNODE\tNODE\tPIX\t X \tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC6\tSYS\tSYS\tSYS\tSYS\tPIX\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\t X \tNODE\tNODE\tNODE\t\t\t\t\nNIC7\tSYS\tSYS\tSYS\tSYS\tNODE\tPIX\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\t X \tNODE\tNODE\t\t\t\t\nNIC8\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tPIX\tNODE\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\t X \tPIX\t\t\t\t\nNIC9\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tPIX\tNODE\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n NIC4: mlx5_4\n NIC5: mlx5_5\n NIC6: mlx5_6\n NIC7: mlx5_7\n NIC8: mlx5_8\n NIC9: mlx5_9\n\n",
|
|
"timestamp": "2026-05-22T15:49:09.197459",
|
|
"detected_gpu_type": "h100",
|
|
"gpu_label": "H100 SXM5"
|
|
},
|
|
"health": {
|
|
"passed": true,
|
|
"gpu_health": [
|
|
{
|
|
"index": 0,
|
|
"status": "WARN",
|
|
"checks": {
|
|
"temperature": {
|
|
"value": 21,
|
|
"status": "PASS",
|
|
"threshold": 75
|
|
},
|
|
"power": {
|
|
"value": 69.86,
|
|
"limit": 700.0,
|
|
"status": "PASS"
|
|
},
|
|
"ecc_errors": {
|
|
"single": 0,
|
|
"double": 0,
|
|
"status": "PASS"
|
|
},
|
|
"memory_errors": {
|
|
"status": "PASS"
|
|
},
|
|
"pcie_link": {
|
|
"gen": 5,
|
|
"width": 16,
|
|
"status": "PASS"
|
|
},
|
|
"clock_speed": {
|
|
"sm": 345,
|
|
"mem": 2619,
|
|
"status": "PASS"
|
|
},
|
|
"throttling": {
|
|
"status": "PASS",
|
|
"reasons": []
|
|
},
|
|
"persistence_mode": {
|
|
"enabled": false,
|
|
"status": "WARN"
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"index": 1,
|
|
"status": "WARN",
|
|
"checks": {
|
|
"temperature": {
|
|
"value": 21,
|
|
"status": "PASS",
|
|
"threshold": 75
|
|
},
|
|
"power": {
|
|
"value": 67.48,
|
|
"limit": 700.0,
|
|
"status": "PASS"
|
|
},
|
|
"ecc_errors": {
|
|
"single": 0,
|
|
"double": 0,
|
|
"status": "PASS"
|
|
},
|
|
"memory_errors": {
|
|
"status": "PASS"
|
|
},
|
|
"pcie_link": {
|
|
"gen": 5,
|
|
"width": 16,
|
|
"status": "PASS"
|
|
},
|
|
"clock_speed": {
|
|
"sm": 345,
|
|
"mem": 2619,
|
|
"status": "PASS"
|
|
},
|
|
"throttling": {
|
|
"status": "PASS",
|
|
"reasons": []
|
|
},
|
|
"persistence_mode": {
|
|
"enabled": false,
|
|
"status": "WARN"
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"index": 2,
|
|
"status": "WARN",
|
|
"checks": {
|
|
"temperature": {
|
|
"value": 22,
|
|
"status": "PASS",
|
|
"threshold": 75
|
|
},
|
|
"power": {
|
|
"value": 66.76,
|
|
"limit": 700.0,
|
|
"status": "PASS"
|
|
},
|
|
"ecc_errors": {
|
|
"single": 0,
|
|
"double": 0,
|
|
"status": "PASS"
|
|
},
|
|
"memory_errors": {
|
|
"status": "PASS"
|
|
},
|
|
"pcie_link": {
|
|
"gen": 5,
|
|
"width": 16,
|
|
"status": "PASS"
|
|
},
|
|
"clock_speed": {
|
|
"sm": 345,
|
|
"mem": 2619,
|
|
"status": "PASS"
|
|
},
|
|
"throttling": {
|
|
"status": "PASS",
|
|
"reasons": []
|
|
},
|
|
"persistence_mode": {
|
|
"enabled": false,
|
|
"status": "WARN"
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"index": 3,
|
|
"status": "WARN",
|
|
"checks": {
|
|
"temperature": {
|
|
"value": 21,
|
|
"status": "PASS",
|
|
"threshold": 75
|
|
},
|
|
"power": {
|
|
"value": 67.06,
|
|
"limit": 700.0,
|
|
"status": "PASS"
|
|
},
|
|
"ecc_errors": {
|
|
"single": 0,
|
|
"double": 0,
|
|
"status": "PASS"
|
|
},
|
|
"memory_errors": {
|
|
"status": "PASS"
|
|
},
|
|
"pcie_link": {
|
|
"gen": 5,
|
|
"width": 16,
|
|
"status": "PASS"
|
|
},
|
|
"clock_speed": {
|
|
"sm": 345,
|
|
"mem": 2619,
|
|
"status": "PASS"
|
|
},
|
|
"throttling": {
|
|
"status": "PASS",
|
|
"reasons": []
|
|
},
|
|
"persistence_mode": {
|
|
"enabled": false,
|
|
"status": "WARN"
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"index": 4,
|
|
"status": "WARN",
|
|
"checks": {
|
|
"temperature": {
|
|
"value": 21,
|
|
"status": "PASS",
|
|
"threshold": 75
|
|
},
|
|
"power": {
|
|
"value": 67.23,
|
|
"limit": 700.0,
|
|
"status": "PASS"
|
|
},
|
|
"ecc_errors": {
|
|
"single": 0,
|
|
"double": 0,
|
|
"status": "PASS"
|
|
},
|
|
"memory_errors": {
|
|
"status": "PASS"
|
|
},
|
|
"pcie_link": {
|
|
"gen": 5,
|
|
"width": 16,
|
|
"status": "PASS"
|
|
},
|
|
"clock_speed": {
|
|
"sm": 345,
|
|
"mem": 2619,
|
|
"status": "PASS"
|
|
},
|
|
"throttling": {
|
|
"status": "PASS",
|
|
"reasons": []
|
|
},
|
|
"persistence_mode": {
|
|
"enabled": false,
|
|
"status": "WARN"
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"index": 5,
|
|
"status": "WARN",
|
|
"checks": {
|
|
"temperature": {
|
|
"value": 23,
|
|
"status": "PASS",
|
|
"threshold": 75
|
|
},
|
|
"power": {
|
|
"value": 69.27,
|
|
"limit": 700.0,
|
|
"status": "PASS"
|
|
},
|
|
"ecc_errors": {
|
|
"single": 0,
|
|
"double": 0,
|
|
"status": "PASS"
|
|
},
|
|
"memory_errors": {
|
|
"status": "PASS"
|
|
},
|
|
"pcie_link": {
|
|
"gen": 5,
|
|
"width": 16,
|
|
"status": "PASS"
|
|
},
|
|
"clock_speed": {
|
|
"sm": 345,
|
|
"mem": 2619,
|
|
"status": "PASS"
|
|
},
|
|
"throttling": {
|
|
"status": "PASS",
|
|
"reasons": []
|
|
},
|
|
"persistence_mode": {
|
|
"enabled": false,
|
|
"status": "WARN"
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"index": 6,
|
|
"status": "WARN",
|
|
"checks": {
|
|
"temperature": {
|
|
"value": 21,
|
|
"status": "PASS",
|
|
"threshold": 75
|
|
},
|
|
"power": {
|
|
"value": 67.81,
|
|
"limit": 700.0,
|
|
"status": "PASS"
|
|
},
|
|
"ecc_errors": {
|
|
"single": 0,
|
|
"double": 0,
|
|
"status": "PASS"
|
|
},
|
|
"memory_errors": {
|
|
"status": "PASS"
|
|
},
|
|
"pcie_link": {
|
|
"gen": 5,
|
|
"width": 16,
|
|
"status": "PASS"
|
|
},
|
|
"clock_speed": {
|
|
"sm": 345,
|
|
"mem": 2619,
|
|
"status": "PASS"
|
|
},
|
|
"throttling": {
|
|
"status": "PASS",
|
|
"reasons": []
|
|
},
|
|
"persistence_mode": {
|
|
"enabled": false,
|
|
"status": "WARN"
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"index": 7,
|
|
"status": "WARN",
|
|
"checks": {
|
|
"temperature": {
|
|
"value": 21,
|
|
"status": "PASS",
|
|
"threshold": 75
|
|
},
|
|
"power": {
|
|
"value": 66.3,
|
|
"limit": 700.0,
|
|
"status": "PASS"
|
|
},
|
|
"ecc_errors": {
|
|
"single": 0,
|
|
"double": 0,
|
|
"status": "PASS"
|
|
},
|
|
"memory_errors": {
|
|
"status": "PASS"
|
|
},
|
|
"pcie_link": {
|
|
"gen": 5,
|
|
"width": 16,
|
|
"status": "PASS"
|
|
},
|
|
"clock_speed": {
|
|
"sm": 345,
|
|
"mem": 2619,
|
|
"status": "PASS"
|
|
},
|
|
"throttling": {
|
|
"status": "PASS",
|
|
"reasons": []
|
|
},
|
|
"persistence_mode": {
|
|
"enabled": false,
|
|
"status": "WARN"
|
|
}
|
|
}
|
|
}
|
|
],
|
|
"system_health": {
|
|
"nvidia_persistenced": {
|
|
"installed": true,
|
|
"running": false
|
|
},
|
|
"hugepages": {
|
|
"configured": false,
|
|
"count": 0
|
|
},
|
|
"swap": {
|
|
"enabled": true
|
|
},
|
|
"transparent_hugepage": "madvise",
|
|
"file_descriptors": {
|
|
"soft": 1024,
|
|
"max": 1048576
|
|
},
|
|
"infiniband_devices": [
|
|
"mlx5_4",
|
|
"mlx5_2",
|
|
"mlx5_0",
|
|
"mlx5_9",
|
|
"mlx5_7",
|
|
"mlx5_5",
|
|
"mlx5_3",
|
|
"mlx5_1",
|
|
"mlx5_8",
|
|
"mlx5_6"
|
|
],
|
|
"rdma_devices": [
|
|
"abi_version",
|
|
"uverbs4",
|
|
"uverbs2",
|
|
"uverbs0",
|
|
"uverbs9",
|
|
"uverbs7",
|
|
"uverbs5",
|
|
"uverbs3",
|
|
"uverbs1",
|
|
"uverbs8",
|
|
"uverbs6"
|
|
],
|
|
"nccl_env_vars": {}
|
|
},
|
|
"timestamp": "2026-05-22T15:49:11.294816",
|
|
"detected_gpu_type": "h100"
|
|
},
|
|
"memory_bench": {
|
|
"memory": {
|
|
"source": "nvbandwidth",
|
|
"h2d_bandwidth_gbps": 55.5,
|
|
"d2h_bandwidth_gbps": 55.3,
|
|
"d2d_bandwidth_gbps": 486.5,
|
|
"h2d_peak_gbps": 64,
|
|
"d2h_peak_gbps": 64,
|
|
"d2d_peak_gbps": 450.0,
|
|
"h2d_efficiency_pct": 86.7,
|
|
"d2h_efficiency_pct": 86.4,
|
|
"d2d_efficiency_pct": 108.1,
|
|
"peak_bandwidth_gbps": 3400,
|
|
"efficiency_pct": 108.1,
|
|
"results_by_test": {
|
|
"h2d": 55.5,
|
|
"d2h": 55.3,
|
|
"d2d_write": 397.4,
|
|
"d2d_read": 395.1,
|
|
"d2d_bidir": 486.5
|
|
},
|
|
"per_gpu": []
|
|
}
|
|
},
|
|
"compute_bench": {
|
|
"compute": {
|
|
"per_dtype_tflops": {
|
|
"fp32": 51.9,
|
|
"tf32": 357.0,
|
|
"fp16": 664.0,
|
|
"bf16": 700.1,
|
|
"fp8": 1116.2
|
|
},
|
|
"peak_tflops": {
|
|
"fp32": 67,
|
|
"tf32": 495,
|
|
"fp16": 990,
|
|
"bf16": 990,
|
|
"fp8": 1979
|
|
},
|
|
"efficiency_pct": {
|
|
"fp32": 77.5,
|
|
"tf32": 72.1,
|
|
"fp16": 67.1,
|
|
"bf16": 70.7,
|
|
"fp8": 56.4
|
|
},
|
|
"pass_thresholds_tflops": {
|
|
"fp32": 54,
|
|
"tf32": 444,
|
|
"fp16": 734,
|
|
"bf16": 745,
|
|
"fp8": 1400
|
|
},
|
|
"per_gpu": [
|
|
{
|
|
"index": 0,
|
|
"fp32": 51.9,
|
|
"tf32": 357.0,
|
|
"fp16": 664.0,
|
|
"bf16": 700.1,
|
|
"fp8": 1116.2
|
|
},
|
|
{
|
|
"index": 1,
|
|
"fp32": 51.9,
|
|
"tf32": 357.0,
|
|
"fp16": 664.0,
|
|
"bf16": 700.1,
|
|
"fp8": 1116.2
|
|
},
|
|
{
|
|
"index": 2,
|
|
"fp32": 51.9,
|
|
"tf32": 357.0,
|
|
"fp16": 664.0,
|
|
"bf16": 700.1,
|
|
"fp8": 1116.2
|
|
},
|
|
{
|
|
"index": 3,
|
|
"fp32": 51.9,
|
|
"tf32": 357.0,
|
|
"fp16": 664.0,
|
|
"bf16": 700.1,
|
|
"fp8": 1116.2
|
|
},
|
|
{
|
|
"index": 4,
|
|
"fp32": 51.9,
|
|
"tf32": 357.0,
|
|
"fp16": 664.0,
|
|
"bf16": 700.1,
|
|
"fp8": 1116.2
|
|
},
|
|
{
|
|
"index": 5,
|
|
"fp32": 51.9,
|
|
"tf32": 357.0,
|
|
"fp16": 664.0,
|
|
"bf16": 700.1,
|
|
"fp8": 1116.2
|
|
},
|
|
{
|
|
"index": 6,
|
|
"fp32": 51.9,
|
|
"tf32": 357.0,
|
|
"fp16": 664.0,
|
|
"bf16": 700.1,
|
|
"fp8": 1116.2
|
|
},
|
|
{
|
|
"index": 7,
|
|
"fp32": 51.9,
|
|
"tf32": 357.0,
|
|
"fp16": 664.0,
|
|
"bf16": 700.1,
|
|
"fp8": 1116.2
|
|
}
|
|
],
|
|
"matrix_size": 8192,
|
|
"warmup": 50,
|
|
"iterations": 500
|
|
}
|
|
},
|
|
"nccl": {
|
|
"passed": false,
|
|
"source": "torchrun_fallback",
|
|
"tests": {
|
|
"NCCL version 2.21.5+cuda12.4": {
|
|
"status": "FAIL",
|
|
"error": null
|
|
},
|
|
"allreduce": {
|
|
"status": "PASS",
|
|
"error": null
|
|
},
|
|
"broadcast": {
|
|
"status": "PASS",
|
|
"error": null
|
|
},
|
|
"allgather": {
|
|
"status": "PASS",
|
|
"error": null
|
|
},
|
|
"reducescatter": {
|
|
"status": "PASS",
|
|
"error": null
|
|
},
|
|
"alltoall": {
|
|
"status": "PASS",
|
|
"error": null
|
|
}
|
|
},
|
|
"gpu_count": 8
|
|
},
|
|
"stress": {
|
|
"source": "pytorch",
|
|
"passed": true,
|
|
"duration_sec": 60,
|
|
"elapsed_sec": 60.0,
|
|
"gpu_status": {
|
|
"0": "PASS",
|
|
"1": "PASS",
|
|
"2": "PASS",
|
|
"3": "PASS",
|
|
"4": "PASS",
|
|
"5": "PASS",
|
|
"6": "PASS",
|
|
"7": "PASS"
|
|
},
|
|
"timestamp": "2026-05-22T15:51:56.803540"
|
|
},
|
|
"rdma": {
|
|
"passed": false,
|
|
"devices": [
|
|
{
|
|
"name": "mlx5_0",
|
|
"ports": [
|
|
{
|
|
"port": "1",
|
|
"rate": "400 Gb/sec (4X NDR)",
|
|
"state": "4: ACTIVE",
|
|
"phys_state": "5: LinkUp",
|
|
"gid": "fe80:0000:0000:0000:58a2:e103:0088:81e0"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"name": "mlx5_1",
|
|
"ports": [
|
|
{
|
|
"port": "1",
|
|
"rate": "400 Gb/sec (4X NDR)",
|
|
"state": "4: ACTIVE",
|
|
"phys_state": "5: LinkUp",
|
|
"gid": "fe80:0000:0000:0000:9c63:c003:0054:e00a"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"name": "mlx5_2",
|
|
"ports": [
|
|
{
|
|
"port": "1",
|
|
"rate": "25 Gb/sec (1X EDR)",
|
|
"state": "4: ACTIVE",
|
|
"phys_state": "5: LinkUp",
|
|
"gid": "fe80:0000:0000:0000:a02d:75ff:feae:2bcf"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"name": "mlx5_3",
|
|
"ports": [
|
|
{
|
|
"port": "1",
|
|
"rate": "25 Gb/sec (1X EDR)",
|
|
"state": "1: DOWN",
|
|
"phys_state": "3: Disabled",
|
|
"gid": "fe80:0000:0000:0000:c670:bdff:fefd:5bd9"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"name": "mlx5_4",
|
|
"ports": [
|
|
{
|
|
"port": "1",
|
|
"rate": "100 Gb/sec (2X HDR)",
|
|
"state": "4: ACTIVE",
|
|
"phys_state": "5: LinkUp",
|
|
"gid": "fe80:0000:0000:0000:9c63:c003:005f:58ec"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"name": "mlx5_5",
|
|
"ports": [
|
|
{
|
|
"port": "1",
|
|
"rate": "100 Gb/sec (2X HDR)",
|
|
"state": "4: ACTIVE",
|
|
"phys_state": "5: LinkUp",
|
|
"gid": "fe80:0000:0000:0000:9c63:c003:005f:58ed"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"name": "mlx5_6",
|
|
"ports": [
|
|
{
|
|
"port": "1",
|
|
"rate": "400 Gb/sec (4X NDR)",
|
|
"state": "4: ACTIVE",
|
|
"phys_state": "5: LinkUp",
|
|
"gid": "fe80:0000:0000:0000:9c63:c003:0055:0e56"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"name": "mlx5_7",
|
|
"ports": [
|
|
{
|
|
"port": "1",
|
|
"rate": "400 Gb/sec (4X NDR)",
|
|
"state": "4: ACTIVE",
|
|
"phys_state": "5: LinkUp",
|
|
"gid": "fe80:0000:0000:0000:a088:c203:00f0:286c"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"name": "mlx5_8",
|
|
"ports": [
|
|
{
|
|
"port": "1",
|
|
"rate": "25 Gb/sec (1X EDR)",
|
|
"state": "4: ACTIVE",
|
|
"phys_state": "5: LinkUp",
|
|
"gid": "fe80:0000:0000:0000:a02d:75ff:feae:2bcf"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"name": "mlx5_9",
|
|
"ports": [
|
|
{
|
|
"port": "1",
|
|
"rate": "25 Gb/sec (1X EDR)",
|
|
"state": "1: DOWN",
|
|
"phys_state": "3: Disabled",
|
|
"gid": "fe80:0000:0000:0000:c670:bdff:fefd:569d"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"bandwidth_tests": [
|
|
{
|
|
"test": "ib_write_bw",
|
|
"status": "WARN",
|
|
"bandwidth_gbps": 0.13,
|
|
"min_required_gbps": 50
|
|
},
|
|
{
|
|
"test": "ib_read_bw",
|
|
"status": "WARN",
|
|
"bandwidth_gbps": 0.13,
|
|
"min_required_gbps": 50
|
|
}
|
|
],
|
|
"latency_tests": [
|
|
{
|
|
"test": "ib_write_lat",
|
|
"status": "PASS",
|
|
"latency_us": 4.1,
|
|
"max_allowed_us": 10
|
|
},
|
|
{
|
|
"test": "ib_read_lat",
|
|
"status": "WARN",
|
|
"latency_us": 16.0,
|
|
"max_allowed_us": 10
|
|
}
|
|
],
|
|
"timestamp": "2026-05-22T15:52:03.507540"
|
|
},
|
|
"training": {
|
|
"model": "synthetic_transformer",
|
|
"total_params_m": 1470.5,
|
|
"num_layers": 6,
|
|
"hidden_size": 4096,
|
|
"gpu_count": 8,
|
|
"dtype": "bfloat16",
|
|
"batch_size": 8,
|
|
"seq_length": 2048,
|
|
"num_steps": 50,
|
|
"avg_step_time_ms": 312.3,
|
|
"throughput_tokens_per_sec": 52471.0,
|
|
"throughput_samples_per_sec": 25.62,
|
|
"peak_memory_gb": 27.31,
|
|
"final_loss": 0.0041,
|
|
"timestamp": "2026-05-22T15:52:32.650522"
|
|
}
|
|
} |