test_gpu_scripts/reports_all_aikubeworker0016.json

921 lines
24 KiB
JSON

{
"timestamp": "2026-05-22T15:49:02.368516",
"gpu_info": {
"driver_version": "580.159.03",
"cuda_version": "13.0",
"gpu_count": 8,
"gpus": [
{
"index": 0,
"name": "NVIDIA H100 80GB HBM3",
"uuid": "GPU-dfbc9513-255d-4fe7-2b77-7b1ec3972e75",
"pci_bus_id": "00000000:18:00.0",
"pcie_link_gen": 5,
"pcie_link_width": 16,
"vram_total_mb": 81559,
"vram_used_mb": 4,
"vram_free_mb": 81076,
"power_draw": 69.98,
"power_limit": 700.0,
"clock_sm": 345,
"clock_mem": 2619,
"temperature": 21,
"fan_speed": 0,
"persistence_mode": false,
"compute_mode": "Default",
"serial_number": "1651924016120",
"ecc_errors_single": 0,
"ecc_errors_double": 0
},
{
"index": 1,
"name": "NVIDIA H100 80GB HBM3",
"uuid": "GPU-bb845ef7-d7b5-f011-9395-ea74274e2282",
"pci_bus_id": "00000000:2A:00.0",
"pcie_link_gen": 5,
"pcie_link_width": 16,
"vram_total_mb": 81559,
"vram_used_mb": 4,
"vram_free_mb": 81076,
"power_draw": 67.54,
"power_limit": 700.0,
"clock_sm": 345,
"clock_mem": 2619,
"temperature": 21,
"fan_speed": 0,
"persistence_mode": false,
"compute_mode": "Default",
"serial_number": "1651924015483",
"ecc_errors_single": 0,
"ecc_errors_double": 0
},
{
"index": 2,
"name": "NVIDIA H100 80GB HBM3",
"uuid": "GPU-3720cf13-2a34-be38-27be-0a7adc4addc4",
"pci_bus_id": "00000000:3A:00.0",
"pcie_link_gen": 5,
"pcie_link_width": 16,
"vram_total_mb": 81559,
"vram_used_mb": 4,
"vram_free_mb": 81076,
"power_draw": 66.82,
"power_limit": 700.0,
"clock_sm": 345,
"clock_mem": 2619,
"temperature": 22,
"fan_speed": 0,
"persistence_mode": false,
"compute_mode": "Default",
"serial_number": "1651924025595",
"ecc_errors_single": 0,
"ecc_errors_double": 0
},
{
"index": 3,
"name": "NVIDIA H100 80GB HBM3",
"uuid": "GPU-87080b2d-ac43-be0d-d574-c193078850ae",
"pci_bus_id": "00000000:5D:00.0",
"pcie_link_gen": 5,
"pcie_link_width": 16,
"vram_total_mb": 81559,
"vram_used_mb": 4,
"vram_free_mb": 81076,
"power_draw": 67.02,
"power_limit": 700.0,
"clock_sm": 345,
"clock_mem": 2619,
"temperature": 21,
"fan_speed": 0,
"persistence_mode": false,
"compute_mode": "Default",
"serial_number": "1651924016862",
"ecc_errors_single": 0,
"ecc_errors_double": 0
},
{
"index": 4,
"name": "NVIDIA H100 80GB HBM3",
"uuid": "GPU-599bd883-cc5c-a5dd-6c33-c15f7049da48",
"pci_bus_id": "00000000:9A:00.0",
"pcie_link_gen": 5,
"pcie_link_width": 16,
"vram_total_mb": 81559,
"vram_used_mb": 4,
"vram_free_mb": 81076,
"power_draw": 67.24,
"power_limit": 700.0,
"clock_sm": 345,
"clock_mem": 2619,
"temperature": 21,
"fan_speed": 0,
"persistence_mode": false,
"compute_mode": "Default",
"serial_number": "1651924025670",
"ecc_errors_single": 0,
"ecc_errors_double": 0
},
{
"index": 5,
"name": "NVIDIA H100 80GB HBM3",
"uuid": "GPU-a1c6bba4-61b0-e623-06c9-9c88635e26fe",
"pci_bus_id": "00000000:AB:00.0",
"pcie_link_gen": 5,
"pcie_link_width": 16,
"vram_total_mb": 81559,
"vram_used_mb": 4,
"vram_free_mb": 81076,
"power_draw": 69.31,
"power_limit": 700.0,
"clock_sm": 345,
"clock_mem": 2619,
"temperature": 23,
"fan_speed": 0,
"persistence_mode": false,
"compute_mode": "Default",
"serial_number": "1651924027166",
"ecc_errors_single": 0,
"ecc_errors_double": 0
},
{
"index": 6,
"name": "NVIDIA H100 80GB HBM3",
"uuid": "GPU-98745a0c-39bd-3e56-d6ca-54ba3647ab6d",
"pci_bus_id": "00000000:BA:00.0",
"pcie_link_gen": 5,
"pcie_link_width": 16,
"vram_total_mb": 81559,
"vram_used_mb": 4,
"vram_free_mb": 81076,
"power_draw": 67.84,
"power_limit": 700.0,
"clock_sm": 345,
"clock_mem": 2619,
"temperature": 21,
"fan_speed": 0,
"persistence_mode": false,
"compute_mode": "Default",
"serial_number": "1651924026234",
"ecc_errors_single": 0,
"ecc_errors_double": 0
},
{
"index": 7,
"name": "NVIDIA H100 80GB HBM3",
"uuid": "GPU-8c73bd8b-666b-357e-ac5d-c75ac7a759db",
"pci_bus_id": "00000000:DB:00.0",
"pcie_link_gen": 5,
"pcie_link_width": 16,
"vram_total_mb": 81559,
"vram_used_mb": 4,
"vram_free_mb": 81076,
"power_draw": 66.21,
"power_limit": 700.0,
"clock_sm": 345,
"clock_mem": 2619,
"temperature": 21,
"fan_speed": 0,
"persistence_mode": false,
"compute_mode": "Default",
"serial_number": "1651924027255",
"ecc_errors_single": 0,
"ecc_errors_double": 0
}
],
"topology": "\t\u001b[4mGPU0\tGPU1\tGPU2\tGPU3\tGPU4\tGPU5\tGPU6\tGPU7\tNIC0\tNIC1\tNIC2\tNIC3\tNIC4\tNIC5\tNIC6\tNIC7\tNIC8\tNIC9\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\u001b[0m\nGPU0\t X \tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\tPIX\tNODE\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t0-55,112-167\t0\t\tN/A\nGPU1\tNV18\t X \tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\tNODE\tPIX\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t0-55,112-167\t0\t\tN/A\nGPU2\tNV18\tNV18\t X \tNV18\tNV18\tNV18\tNV18\tNV18\tNODE\tNODE\tPIX\tPIX\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t0-55,112-167\t0\t\tN/A\nGPU3\tNV18\tNV18\tNV18\t X \tNV18\tNV18\tNV18\tNV18\tNODE\tNODE\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t0-55,112-167\t0\t\tN/A\nGPU4\tNV18\tNV18\tNV18\tNV18\t X \tNV18\tNV18\tNV18\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tPIX\tNODE\tNODE\tNODE\t56-111,168-223\t1\t\tN/A\nGPU5\tNV18\tNV18\tNV18\tNV18\tNV18\t X \tNV18\tNV18\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\tPIX\tNODE\tNODE\t56-111,168-223\t1\t\tN/A\nGPU6\tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\t X \tNV18\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tPIX\tPIX\t56-111,168-223\t1\t\tN/A\nGPU7\tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\t X \tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tNODE\tNODE\t56-111,168-223\t1\t\tN/A\nNIC0\tPIX\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t X \tNODE\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC1\tNODE\tPIX\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tNODE\t X \tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC2\tNODE\tNODE\tPIX\tNODE\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\t X \tPIX\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC3\tNODE\tNODE\tPIX\tNODE\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tPIX\t X \tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC4\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tNODE\tNODE\t X \tPIX\tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC5\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tNODE\tNODE\tPIX\t X \tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC6\tSYS\tSYS\tSYS\tSYS\tPIX\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\t X \tNODE\tNODE\tNODE\t\t\t\t\nNIC7\tSYS\tSYS\tSYS\tSYS\tNODE\tPIX\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\t X \tNODE\tNODE\t\t\t\t\nNIC8\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tPIX\tNODE\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\t X \tPIX\t\t\t\t\nNIC9\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tPIX\tNODE\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n NIC4: mlx5_4\n NIC5: mlx5_5\n NIC6: mlx5_6\n NIC7: mlx5_7\n NIC8: mlx5_8\n NIC9: mlx5_9\n\n",
"timestamp": "2026-05-22T15:49:09.197459",
"detected_gpu_type": "h100",
"gpu_label": "H100 SXM5"
},
"health": {
"passed": true,
"gpu_health": [
{
"index": 0,
"status": "WARN",
"checks": {
"temperature": {
"value": 21,
"status": "PASS",
"threshold": 75
},
"power": {
"value": 69.86,
"limit": 700.0,
"status": "PASS"
},
"ecc_errors": {
"single": 0,
"double": 0,
"status": "PASS"
},
"memory_errors": {
"status": "PASS"
},
"pcie_link": {
"gen": 5,
"width": 16,
"status": "PASS"
},
"clock_speed": {
"sm": 345,
"mem": 2619,
"status": "PASS"
},
"throttling": {
"status": "PASS",
"reasons": []
},
"persistence_mode": {
"enabled": false,
"status": "WARN"
}
}
},
{
"index": 1,
"status": "WARN",
"checks": {
"temperature": {
"value": 21,
"status": "PASS",
"threshold": 75
},
"power": {
"value": 67.48,
"limit": 700.0,
"status": "PASS"
},
"ecc_errors": {
"single": 0,
"double": 0,
"status": "PASS"
},
"memory_errors": {
"status": "PASS"
},
"pcie_link": {
"gen": 5,
"width": 16,
"status": "PASS"
},
"clock_speed": {
"sm": 345,
"mem": 2619,
"status": "PASS"
},
"throttling": {
"status": "PASS",
"reasons": []
},
"persistence_mode": {
"enabled": false,
"status": "WARN"
}
}
},
{
"index": 2,
"status": "WARN",
"checks": {
"temperature": {
"value": 22,
"status": "PASS",
"threshold": 75
},
"power": {
"value": 66.76,
"limit": 700.0,
"status": "PASS"
},
"ecc_errors": {
"single": 0,
"double": 0,
"status": "PASS"
},
"memory_errors": {
"status": "PASS"
},
"pcie_link": {
"gen": 5,
"width": 16,
"status": "PASS"
},
"clock_speed": {
"sm": 345,
"mem": 2619,
"status": "PASS"
},
"throttling": {
"status": "PASS",
"reasons": []
},
"persistence_mode": {
"enabled": false,
"status": "WARN"
}
}
},
{
"index": 3,
"status": "WARN",
"checks": {
"temperature": {
"value": 21,
"status": "PASS",
"threshold": 75
},
"power": {
"value": 67.06,
"limit": 700.0,
"status": "PASS"
},
"ecc_errors": {
"single": 0,
"double": 0,
"status": "PASS"
},
"memory_errors": {
"status": "PASS"
},
"pcie_link": {
"gen": 5,
"width": 16,
"status": "PASS"
},
"clock_speed": {
"sm": 345,
"mem": 2619,
"status": "PASS"
},
"throttling": {
"status": "PASS",
"reasons": []
},
"persistence_mode": {
"enabled": false,
"status": "WARN"
}
}
},
{
"index": 4,
"status": "WARN",
"checks": {
"temperature": {
"value": 21,
"status": "PASS",
"threshold": 75
},
"power": {
"value": 67.23,
"limit": 700.0,
"status": "PASS"
},
"ecc_errors": {
"single": 0,
"double": 0,
"status": "PASS"
},
"memory_errors": {
"status": "PASS"
},
"pcie_link": {
"gen": 5,
"width": 16,
"status": "PASS"
},
"clock_speed": {
"sm": 345,
"mem": 2619,
"status": "PASS"
},
"throttling": {
"status": "PASS",
"reasons": []
},
"persistence_mode": {
"enabled": false,
"status": "WARN"
}
}
},
{
"index": 5,
"status": "WARN",
"checks": {
"temperature": {
"value": 23,
"status": "PASS",
"threshold": 75
},
"power": {
"value": 69.27,
"limit": 700.0,
"status": "PASS"
},
"ecc_errors": {
"single": 0,
"double": 0,
"status": "PASS"
},
"memory_errors": {
"status": "PASS"
},
"pcie_link": {
"gen": 5,
"width": 16,
"status": "PASS"
},
"clock_speed": {
"sm": 345,
"mem": 2619,
"status": "PASS"
},
"throttling": {
"status": "PASS",
"reasons": []
},
"persistence_mode": {
"enabled": false,
"status": "WARN"
}
}
},
{
"index": 6,
"status": "WARN",
"checks": {
"temperature": {
"value": 21,
"status": "PASS",
"threshold": 75
},
"power": {
"value": 67.81,
"limit": 700.0,
"status": "PASS"
},
"ecc_errors": {
"single": 0,
"double": 0,
"status": "PASS"
},
"memory_errors": {
"status": "PASS"
},
"pcie_link": {
"gen": 5,
"width": 16,
"status": "PASS"
},
"clock_speed": {
"sm": 345,
"mem": 2619,
"status": "PASS"
},
"throttling": {
"status": "PASS",
"reasons": []
},
"persistence_mode": {
"enabled": false,
"status": "WARN"
}
}
},
{
"index": 7,
"status": "WARN",
"checks": {
"temperature": {
"value": 21,
"status": "PASS",
"threshold": 75
},
"power": {
"value": 66.3,
"limit": 700.0,
"status": "PASS"
},
"ecc_errors": {
"single": 0,
"double": 0,
"status": "PASS"
},
"memory_errors": {
"status": "PASS"
},
"pcie_link": {
"gen": 5,
"width": 16,
"status": "PASS"
},
"clock_speed": {
"sm": 345,
"mem": 2619,
"status": "PASS"
},
"throttling": {
"status": "PASS",
"reasons": []
},
"persistence_mode": {
"enabled": false,
"status": "WARN"
}
}
}
],
"system_health": {
"nvidia_persistenced": {
"installed": true,
"running": false
},
"hugepages": {
"configured": false,
"count": 0
},
"swap": {
"enabled": true
},
"transparent_hugepage": "madvise",
"file_descriptors": {
"soft": 1024,
"max": 1048576
},
"infiniband_devices": [
"mlx5_4",
"mlx5_2",
"mlx5_0",
"mlx5_9",
"mlx5_7",
"mlx5_5",
"mlx5_3",
"mlx5_1",
"mlx5_8",
"mlx5_6"
],
"rdma_devices": [
"abi_version",
"uverbs4",
"uverbs2",
"uverbs0",
"uverbs9",
"uverbs7",
"uverbs5",
"uverbs3",
"uverbs1",
"uverbs8",
"uverbs6"
],
"nccl_env_vars": {}
},
"timestamp": "2026-05-22T15:49:11.294816",
"detected_gpu_type": "h100"
},
"memory_bench": {
"memory": {
"source": "nvbandwidth",
"h2d_bandwidth_gbps": 55.5,
"d2h_bandwidth_gbps": 55.3,
"d2d_bandwidth_gbps": 486.5,
"h2d_peak_gbps": 64,
"d2h_peak_gbps": 64,
"d2d_peak_gbps": 450.0,
"h2d_efficiency_pct": 86.7,
"d2h_efficiency_pct": 86.4,
"d2d_efficiency_pct": 108.1,
"peak_bandwidth_gbps": 3400,
"efficiency_pct": 108.1,
"results_by_test": {
"h2d": 55.5,
"d2h": 55.3,
"d2d_write": 397.4,
"d2d_read": 395.1,
"d2d_bidir": 486.5
},
"per_gpu": []
}
},
"compute_bench": {
"compute": {
"per_dtype_tflops": {
"fp32": 51.9,
"tf32": 357.0,
"fp16": 664.0,
"bf16": 700.1,
"fp8": 1116.2
},
"peak_tflops": {
"fp32": 67,
"tf32": 495,
"fp16": 990,
"bf16": 990,
"fp8": 1979
},
"efficiency_pct": {
"fp32": 77.5,
"tf32": 72.1,
"fp16": 67.1,
"bf16": 70.7,
"fp8": 56.4
},
"pass_thresholds_tflops": {
"fp32": 54,
"tf32": 444,
"fp16": 734,
"bf16": 745,
"fp8": 1400
},
"per_gpu": [
{
"index": 0,
"fp32": 51.9,
"tf32": 357.0,
"fp16": 664.0,
"bf16": 700.1,
"fp8": 1116.2
},
{
"index": 1,
"fp32": 51.9,
"tf32": 357.0,
"fp16": 664.0,
"bf16": 700.1,
"fp8": 1116.2
},
{
"index": 2,
"fp32": 51.9,
"tf32": 357.0,
"fp16": 664.0,
"bf16": 700.1,
"fp8": 1116.2
},
{
"index": 3,
"fp32": 51.9,
"tf32": 357.0,
"fp16": 664.0,
"bf16": 700.1,
"fp8": 1116.2
},
{
"index": 4,
"fp32": 51.9,
"tf32": 357.0,
"fp16": 664.0,
"bf16": 700.1,
"fp8": 1116.2
},
{
"index": 5,
"fp32": 51.9,
"tf32": 357.0,
"fp16": 664.0,
"bf16": 700.1,
"fp8": 1116.2
},
{
"index": 6,
"fp32": 51.9,
"tf32": 357.0,
"fp16": 664.0,
"bf16": 700.1,
"fp8": 1116.2
},
{
"index": 7,
"fp32": 51.9,
"tf32": 357.0,
"fp16": 664.0,
"bf16": 700.1,
"fp8": 1116.2
}
],
"matrix_size": 8192,
"warmup": 50,
"iterations": 500
}
},
"nccl": {
"passed": false,
"source": "torchrun_fallback",
"tests": {
"NCCL version 2.21.5+cuda12.4": {
"status": "FAIL",
"error": null
},
"allreduce": {
"status": "PASS",
"error": null
},
"broadcast": {
"status": "PASS",
"error": null
},
"allgather": {
"status": "PASS",
"error": null
},
"reducescatter": {
"status": "PASS",
"error": null
},
"alltoall": {
"status": "PASS",
"error": null
}
},
"gpu_count": 8
},
"stress": {
"source": "pytorch",
"passed": true,
"duration_sec": 60,
"elapsed_sec": 60.0,
"gpu_status": {
"0": "PASS",
"1": "PASS",
"2": "PASS",
"3": "PASS",
"4": "PASS",
"5": "PASS",
"6": "PASS",
"7": "PASS"
},
"timestamp": "2026-05-22T15:51:56.803540"
},
"rdma": {
"passed": false,
"devices": [
{
"name": "mlx5_0",
"ports": [
{
"port": "1",
"rate": "400 Gb/sec (4X NDR)",
"state": "4: ACTIVE",
"phys_state": "5: LinkUp",
"gid": "fe80:0000:0000:0000:58a2:e103:0088:81e0"
}
]
},
{
"name": "mlx5_1",
"ports": [
{
"port": "1",
"rate": "400 Gb/sec (4X NDR)",
"state": "4: ACTIVE",
"phys_state": "5: LinkUp",
"gid": "fe80:0000:0000:0000:9c63:c003:0054:e00a"
}
]
},
{
"name": "mlx5_2",
"ports": [
{
"port": "1",
"rate": "25 Gb/sec (1X EDR)",
"state": "4: ACTIVE",
"phys_state": "5: LinkUp",
"gid": "fe80:0000:0000:0000:a02d:75ff:feae:2bcf"
}
]
},
{
"name": "mlx5_3",
"ports": [
{
"port": "1",
"rate": "25 Gb/sec (1X EDR)",
"state": "1: DOWN",
"phys_state": "3: Disabled",
"gid": "fe80:0000:0000:0000:c670:bdff:fefd:5bd9"
}
]
},
{
"name": "mlx5_4",
"ports": [
{
"port": "1",
"rate": "100 Gb/sec (2X HDR)",
"state": "4: ACTIVE",
"phys_state": "5: LinkUp",
"gid": "fe80:0000:0000:0000:9c63:c003:005f:58ec"
}
]
},
{
"name": "mlx5_5",
"ports": [
{
"port": "1",
"rate": "100 Gb/sec (2X HDR)",
"state": "4: ACTIVE",
"phys_state": "5: LinkUp",
"gid": "fe80:0000:0000:0000:9c63:c003:005f:58ed"
}
]
},
{
"name": "mlx5_6",
"ports": [
{
"port": "1",
"rate": "400 Gb/sec (4X NDR)",
"state": "4: ACTIVE",
"phys_state": "5: LinkUp",
"gid": "fe80:0000:0000:0000:9c63:c003:0055:0e56"
}
]
},
{
"name": "mlx5_7",
"ports": [
{
"port": "1",
"rate": "400 Gb/sec (4X NDR)",
"state": "4: ACTIVE",
"phys_state": "5: LinkUp",
"gid": "fe80:0000:0000:0000:a088:c203:00f0:286c"
}
]
},
{
"name": "mlx5_8",
"ports": [
{
"port": "1",
"rate": "25 Gb/sec (1X EDR)",
"state": "4: ACTIVE",
"phys_state": "5: LinkUp",
"gid": "fe80:0000:0000:0000:a02d:75ff:feae:2bcf"
}
]
},
{
"name": "mlx5_9",
"ports": [
{
"port": "1",
"rate": "25 Gb/sec (1X EDR)",
"state": "1: DOWN",
"phys_state": "3: Disabled",
"gid": "fe80:0000:0000:0000:c670:bdff:fefd:569d"
}
]
}
],
"bandwidth_tests": [
{
"test": "ib_write_bw",
"status": "WARN",
"bandwidth_gbps": 0.13,
"min_required_gbps": 50
},
{
"test": "ib_read_bw",
"status": "WARN",
"bandwidth_gbps": 0.13,
"min_required_gbps": 50
}
],
"latency_tests": [
{
"test": "ib_write_lat",
"status": "PASS",
"latency_us": 4.1,
"max_allowed_us": 10
},
{
"test": "ib_read_lat",
"status": "WARN",
"latency_us": 16.0,
"max_allowed_us": 10
}
],
"timestamp": "2026-05-22T15:52:03.507540"
},
"training": {
"model": "synthetic_transformer",
"total_params_m": 1470.5,
"num_layers": 6,
"hidden_size": 4096,
"gpu_count": 8,
"dtype": "bfloat16",
"batch_size": 8,
"seq_length": 2048,
"num_steps": 50,
"avg_step_time_ms": 312.3,
"throughput_tokens_per_sec": 52471.0,
"throughput_samples_per_sec": 25.62,
"peak_memory_gb": 27.31,
"final_loss": 0.0041,
"timestamp": "2026-05-22T15:52:32.650522"
}
}