{ "timestamp": "2026-05-22T15:49:02.368516", "gpu_info": { "driver_version": "580.159.03", "cuda_version": "13.0", "gpu_count": 8, "gpus": [ { "index": 0, "name": "NVIDIA H100 80GB HBM3", "uuid": "GPU-dfbc9513-255d-4fe7-2b77-7b1ec3972e75", "pci_bus_id": "00000000:18:00.0", "pcie_link_gen": 5, "pcie_link_width": 16, "vram_total_mb": 81559, "vram_used_mb": 4, "vram_free_mb": 81076, "power_draw": 69.98, "power_limit": 700.0, "clock_sm": 345, "clock_mem": 2619, "temperature": 21, "fan_speed": 0, "persistence_mode": false, "compute_mode": "Default", "serial_number": "1651924016120", "ecc_errors_single": 0, "ecc_errors_double": 0 }, { "index": 1, "name": "NVIDIA H100 80GB HBM3", "uuid": "GPU-bb845ef7-d7b5-f011-9395-ea74274e2282", "pci_bus_id": "00000000:2A:00.0", "pcie_link_gen": 5, "pcie_link_width": 16, "vram_total_mb": 81559, "vram_used_mb": 4, "vram_free_mb": 81076, "power_draw": 67.54, "power_limit": 700.0, "clock_sm": 345, "clock_mem": 2619, "temperature": 21, "fan_speed": 0, "persistence_mode": false, "compute_mode": "Default", "serial_number": "1651924015483", "ecc_errors_single": 0, "ecc_errors_double": 0 }, { "index": 2, "name": "NVIDIA H100 80GB HBM3", "uuid": "GPU-3720cf13-2a34-be38-27be-0a7adc4addc4", "pci_bus_id": "00000000:3A:00.0", "pcie_link_gen": 5, "pcie_link_width": 16, "vram_total_mb": 81559, "vram_used_mb": 4, "vram_free_mb": 81076, "power_draw": 66.82, "power_limit": 700.0, "clock_sm": 345, "clock_mem": 2619, "temperature": 22, "fan_speed": 0, "persistence_mode": false, "compute_mode": "Default", "serial_number": "1651924025595", "ecc_errors_single": 0, "ecc_errors_double": 0 }, { "index": 3, "name": "NVIDIA H100 80GB HBM3", "uuid": "GPU-87080b2d-ac43-be0d-d574-c193078850ae", "pci_bus_id": "00000000:5D:00.0", "pcie_link_gen": 5, "pcie_link_width": 16, "vram_total_mb": 81559, "vram_used_mb": 4, "vram_free_mb": 81076, "power_draw": 67.02, "power_limit": 700.0, "clock_sm": 345, "clock_mem": 2619, "temperature": 21, "fan_speed": 0, "persistence_mode": false, "compute_mode": "Default", "serial_number": "1651924016862", "ecc_errors_single": 0, "ecc_errors_double": 0 }, { "index": 4, "name": "NVIDIA H100 80GB HBM3", "uuid": "GPU-599bd883-cc5c-a5dd-6c33-c15f7049da48", "pci_bus_id": "00000000:9A:00.0", "pcie_link_gen": 5, "pcie_link_width": 16, "vram_total_mb": 81559, "vram_used_mb": 4, "vram_free_mb": 81076, "power_draw": 67.24, "power_limit": 700.0, "clock_sm": 345, "clock_mem": 2619, "temperature": 21, "fan_speed": 0, "persistence_mode": false, "compute_mode": "Default", "serial_number": "1651924025670", "ecc_errors_single": 0, "ecc_errors_double": 0 }, { "index": 5, "name": "NVIDIA H100 80GB HBM3", "uuid": "GPU-a1c6bba4-61b0-e623-06c9-9c88635e26fe", "pci_bus_id": "00000000:AB:00.0", "pcie_link_gen": 5, "pcie_link_width": 16, "vram_total_mb": 81559, "vram_used_mb": 4, "vram_free_mb": 81076, "power_draw": 69.31, "power_limit": 700.0, "clock_sm": 345, "clock_mem": 2619, "temperature": 23, "fan_speed": 0, "persistence_mode": false, "compute_mode": "Default", "serial_number": "1651924027166", "ecc_errors_single": 0, "ecc_errors_double": 0 }, { "index": 6, "name": "NVIDIA H100 80GB HBM3", "uuid": "GPU-98745a0c-39bd-3e56-d6ca-54ba3647ab6d", "pci_bus_id": "00000000:BA:00.0", "pcie_link_gen": 5, "pcie_link_width": 16, "vram_total_mb": 81559, "vram_used_mb": 4, "vram_free_mb": 81076, "power_draw": 67.84, "power_limit": 700.0, "clock_sm": 345, "clock_mem": 2619, "temperature": 21, "fan_speed": 0, "persistence_mode": false, "compute_mode": "Default", "serial_number": "1651924026234", "ecc_errors_single": 0, "ecc_errors_double": 0 }, { "index": 7, "name": "NVIDIA H100 80GB HBM3", "uuid": "GPU-8c73bd8b-666b-357e-ac5d-c75ac7a759db", "pci_bus_id": "00000000:DB:00.0", "pcie_link_gen": 5, "pcie_link_width": 16, "vram_total_mb": 81559, "vram_used_mb": 4, "vram_free_mb": 81076, "power_draw": 66.21, "power_limit": 700.0, "clock_sm": 345, "clock_mem": 2619, "temperature": 21, "fan_speed": 0, "persistence_mode": false, "compute_mode": "Default", "serial_number": "1651924027255", "ecc_errors_single": 0, "ecc_errors_double": 0 } ], "topology": "\t\u001b[4mGPU0\tGPU1\tGPU2\tGPU3\tGPU4\tGPU5\tGPU6\tGPU7\tNIC0\tNIC1\tNIC2\tNIC3\tNIC4\tNIC5\tNIC6\tNIC7\tNIC8\tNIC9\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\u001b[0m\nGPU0\t X \tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\tPIX\tNODE\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t0-55,112-167\t0\t\tN/A\nGPU1\tNV18\t X \tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\tNODE\tPIX\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t0-55,112-167\t0\t\tN/A\nGPU2\tNV18\tNV18\t X \tNV18\tNV18\tNV18\tNV18\tNV18\tNODE\tNODE\tPIX\tPIX\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t0-55,112-167\t0\t\tN/A\nGPU3\tNV18\tNV18\tNV18\t X \tNV18\tNV18\tNV18\tNV18\tNODE\tNODE\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t0-55,112-167\t0\t\tN/A\nGPU4\tNV18\tNV18\tNV18\tNV18\t X \tNV18\tNV18\tNV18\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tPIX\tNODE\tNODE\tNODE\t56-111,168-223\t1\t\tN/A\nGPU5\tNV18\tNV18\tNV18\tNV18\tNV18\t X \tNV18\tNV18\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\tPIX\tNODE\tNODE\t56-111,168-223\t1\t\tN/A\nGPU6\tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\t X \tNV18\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tPIX\tPIX\t56-111,168-223\t1\t\tN/A\nGPU7\tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\tNV18\t X \tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tNODE\tNODE\t56-111,168-223\t1\t\tN/A\nNIC0\tPIX\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t X \tNODE\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC1\tNODE\tPIX\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tNODE\t X \tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC2\tNODE\tNODE\tPIX\tNODE\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\t X \tPIX\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC3\tNODE\tNODE\tPIX\tNODE\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tPIX\t X \tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC4\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tNODE\tNODE\t X \tPIX\tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC5\tNODE\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tNODE\tNODE\tPIX\t X \tSYS\tSYS\tSYS\tSYS\t\t\t\t\nNIC6\tSYS\tSYS\tSYS\tSYS\tPIX\tNODE\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\t X \tNODE\tNODE\tNODE\t\t\t\t\nNIC7\tSYS\tSYS\tSYS\tSYS\tNODE\tPIX\tNODE\tNODE\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\t X \tNODE\tNODE\t\t\t\t\nNIC8\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tPIX\tNODE\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\t X \tPIX\t\t\t\t\nNIC9\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tPIX\tNODE\tSYS\tSYS\tSYS\tSYS\tSYS\tSYS\tNODE\tNODE\tPIX\t X \t\t\t\t\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n\nNIC Legend:\n\n NIC0: mlx5_0\n NIC1: mlx5_1\n NIC2: mlx5_2\n NIC3: mlx5_3\n NIC4: mlx5_4\n NIC5: mlx5_5\n NIC6: mlx5_6\n NIC7: mlx5_7\n NIC8: mlx5_8\n NIC9: mlx5_9\n\n", "timestamp": "2026-05-22T15:49:09.197459", "detected_gpu_type": "h100", "gpu_label": "H100 SXM5" }, "health": { "passed": true, "gpu_health": [ { "index": 0, "status": "WARN", "checks": { "temperature": { "value": 21, "status": "PASS", "threshold": 75 }, "power": { "value": 69.86, "limit": 700.0, "status": "PASS" }, "ecc_errors": { "single": 0, "double": 0, "status": "PASS" }, "memory_errors": { "status": "PASS" }, "pcie_link": { "gen": 5, "width": 16, "status": "PASS" }, "clock_speed": { "sm": 345, "mem": 2619, "status": "PASS" }, "throttling": { "status": "PASS", "reasons": [] }, "persistence_mode": { "enabled": false, "status": "WARN" } } }, { "index": 1, "status": "WARN", "checks": { "temperature": { "value": 21, "status": "PASS", "threshold": 75 }, "power": { "value": 67.48, "limit": 700.0, "status": "PASS" }, "ecc_errors": { "single": 0, "double": 0, "status": "PASS" }, "memory_errors": { "status": "PASS" }, "pcie_link": { "gen": 5, "width": 16, "status": "PASS" }, "clock_speed": { "sm": 345, "mem": 2619, "status": "PASS" }, "throttling": { "status": "PASS", "reasons": [] }, "persistence_mode": { "enabled": false, "status": "WARN" } } }, { "index": 2, "status": "WARN", "checks": { "temperature": { "value": 22, "status": "PASS", "threshold": 75 }, "power": { "value": 66.76, "limit": 700.0, "status": "PASS" }, "ecc_errors": { "single": 0, "double": 0, "status": "PASS" }, "memory_errors": { "status": "PASS" }, "pcie_link": { "gen": 5, "width": 16, "status": "PASS" }, "clock_speed": { "sm": 345, "mem": 2619, "status": "PASS" }, "throttling": { "status": "PASS", "reasons": [] }, "persistence_mode": { "enabled": false, "status": "WARN" } } }, { "index": 3, "status": "WARN", "checks": { "temperature": { "value": 21, "status": "PASS", "threshold": 75 }, "power": { "value": 67.06, "limit": 700.0, "status": "PASS" }, "ecc_errors": { "single": 0, "double": 0, "status": "PASS" }, "memory_errors": { "status": "PASS" }, "pcie_link": { "gen": 5, "width": 16, "status": "PASS" }, "clock_speed": { "sm": 345, "mem": 2619, "status": "PASS" }, "throttling": { "status": "PASS", "reasons": [] }, "persistence_mode": { "enabled": false, "status": "WARN" } } }, { "index": 4, "status": "WARN", "checks": { "temperature": { "value": 21, "status": "PASS", "threshold": 75 }, "power": { "value": 67.23, "limit": 700.0, "status": "PASS" }, "ecc_errors": { "single": 0, "double": 0, "status": "PASS" }, "memory_errors": { "status": "PASS" }, "pcie_link": { "gen": 5, "width": 16, "status": "PASS" }, "clock_speed": { "sm": 345, "mem": 2619, "status": "PASS" }, "throttling": { "status": "PASS", "reasons": [] }, "persistence_mode": { "enabled": false, "status": "WARN" } } }, { "index": 5, "status": "WARN", "checks": { "temperature": { "value": 23, "status": "PASS", "threshold": 75 }, "power": { "value": 69.27, "limit": 700.0, "status": "PASS" }, "ecc_errors": { "single": 0, "double": 0, "status": "PASS" }, "memory_errors": { "status": "PASS" }, "pcie_link": { "gen": 5, "width": 16, "status": "PASS" }, "clock_speed": { "sm": 345, "mem": 2619, "status": "PASS" }, "throttling": { "status": "PASS", "reasons": [] }, "persistence_mode": { "enabled": false, "status": "WARN" } } }, { "index": 6, "status": "WARN", "checks": { "temperature": { "value": 21, "status": "PASS", "threshold": 75 }, "power": { "value": 67.81, "limit": 700.0, "status": "PASS" }, "ecc_errors": { "single": 0, "double": 0, "status": "PASS" }, "memory_errors": { "status": "PASS" }, "pcie_link": { "gen": 5, "width": 16, "status": "PASS" }, "clock_speed": { "sm": 345, "mem": 2619, "status": "PASS" }, "throttling": { "status": "PASS", "reasons": [] }, "persistence_mode": { "enabled": false, "status": "WARN" } } }, { "index": 7, "status": "WARN", "checks": { "temperature": { "value": 21, "status": "PASS", "threshold": 75 }, "power": { "value": 66.3, "limit": 700.0, "status": "PASS" }, "ecc_errors": { "single": 0, "double": 0, "status": "PASS" }, "memory_errors": { "status": "PASS" }, "pcie_link": { "gen": 5, "width": 16, "status": "PASS" }, "clock_speed": { "sm": 345, "mem": 2619, "status": "PASS" }, "throttling": { "status": "PASS", "reasons": [] }, "persistence_mode": { "enabled": false, "status": "WARN" } } } ], "system_health": { "nvidia_persistenced": { "installed": true, "running": false }, "hugepages": { "configured": false, "count": 0 }, "swap": { "enabled": true }, "transparent_hugepage": "madvise", "file_descriptors": { "soft": 1024, "max": 1048576 }, "infiniband_devices": [ "mlx5_4", "mlx5_2", "mlx5_0", "mlx5_9", "mlx5_7", "mlx5_5", "mlx5_3", "mlx5_1", "mlx5_8", "mlx5_6" ], "rdma_devices": [ "abi_version", "uverbs4", "uverbs2", "uverbs0", "uverbs9", "uverbs7", "uverbs5", "uverbs3", "uverbs1", "uverbs8", "uverbs6" ], "nccl_env_vars": {} }, "timestamp": "2026-05-22T15:49:11.294816", "detected_gpu_type": "h100" }, "memory_bench": { "memory": { "source": "nvbandwidth", "h2d_bandwidth_gbps": 55.5, "d2h_bandwidth_gbps": 55.3, "d2d_bandwidth_gbps": 486.5, "h2d_peak_gbps": 64, "d2h_peak_gbps": 64, "d2d_peak_gbps": 450.0, "h2d_efficiency_pct": 86.7, "d2h_efficiency_pct": 86.4, "d2d_efficiency_pct": 108.1, "peak_bandwidth_gbps": 3400, "efficiency_pct": 108.1, "results_by_test": { "h2d": 55.5, "d2h": 55.3, "d2d_write": 397.4, "d2d_read": 395.1, "d2d_bidir": 486.5 }, "per_gpu": [] } }, "compute_bench": { "compute": { "per_dtype_tflops": { "fp32": 51.9, "tf32": 357.0, "fp16": 664.0, "bf16": 700.1, "fp8": 1116.2 }, "peak_tflops": { "fp32": 67, "tf32": 495, "fp16": 990, "bf16": 990, "fp8": 1979 }, "efficiency_pct": { "fp32": 77.5, "tf32": 72.1, "fp16": 67.1, "bf16": 70.7, "fp8": 56.4 }, "pass_thresholds_tflops": { "fp32": 54, "tf32": 444, "fp16": 734, "bf16": 745, "fp8": 1400 }, "per_gpu": [ { "index": 0, "fp32": 51.9, "tf32": 357.0, "fp16": 664.0, "bf16": 700.1, "fp8": 1116.2 }, { "index": 1, "fp32": 51.9, "tf32": 357.0, "fp16": 664.0, "bf16": 700.1, "fp8": 1116.2 }, { "index": 2, "fp32": 51.9, "tf32": 357.0, "fp16": 664.0, "bf16": 700.1, "fp8": 1116.2 }, { "index": 3, "fp32": 51.9, "tf32": 357.0, "fp16": 664.0, "bf16": 700.1, "fp8": 1116.2 }, { "index": 4, "fp32": 51.9, "tf32": 357.0, "fp16": 664.0, "bf16": 700.1, "fp8": 1116.2 }, { "index": 5, "fp32": 51.9, "tf32": 357.0, "fp16": 664.0, "bf16": 700.1, "fp8": 1116.2 }, { "index": 6, "fp32": 51.9, "tf32": 357.0, "fp16": 664.0, "bf16": 700.1, "fp8": 1116.2 }, { "index": 7, "fp32": 51.9, "tf32": 357.0, "fp16": 664.0, "bf16": 700.1, "fp8": 1116.2 } ], "matrix_size": 8192, "warmup": 50, "iterations": 500 } }, "nccl": { "passed": false, "source": "torchrun_fallback", "tests": { "NCCL version 2.21.5+cuda12.4": { "status": "FAIL", "error": null }, "allreduce": { "status": "PASS", "error": null }, "broadcast": { "status": "PASS", "error": null }, "allgather": { "status": "PASS", "error": null }, "reducescatter": { "status": "PASS", "error": null }, "alltoall": { "status": "PASS", "error": null } }, "gpu_count": 8 }, "stress": { "source": "pytorch", "passed": true, "duration_sec": 60, "elapsed_sec": 60.0, "gpu_status": { "0": "PASS", "1": "PASS", "2": "PASS", "3": "PASS", "4": "PASS", "5": "PASS", "6": "PASS", "7": "PASS" }, "timestamp": "2026-05-22T15:51:56.803540" }, "rdma": { "passed": false, "devices": [ { "name": "mlx5_0", "ports": [ { "port": "1", "rate": "400 Gb/sec (4X NDR)", "state": "4: ACTIVE", "phys_state": "5: LinkUp", "gid": "fe80:0000:0000:0000:58a2:e103:0088:81e0" } ] }, { "name": "mlx5_1", "ports": [ { "port": "1", "rate": "400 Gb/sec (4X NDR)", "state": "4: ACTIVE", "phys_state": "5: LinkUp", "gid": "fe80:0000:0000:0000:9c63:c003:0054:e00a" } ] }, { "name": "mlx5_2", "ports": [ { "port": "1", "rate": "25 Gb/sec (1X EDR)", "state": "4: ACTIVE", "phys_state": "5: LinkUp", "gid": "fe80:0000:0000:0000:a02d:75ff:feae:2bcf" } ] }, { "name": "mlx5_3", "ports": [ { "port": "1", "rate": "25 Gb/sec (1X EDR)", "state": "1: DOWN", "phys_state": "3: Disabled", "gid": "fe80:0000:0000:0000:c670:bdff:fefd:5bd9" } ] }, { "name": "mlx5_4", "ports": [ { "port": "1", "rate": "100 Gb/sec (2X HDR)", "state": "4: ACTIVE", "phys_state": "5: LinkUp", "gid": "fe80:0000:0000:0000:9c63:c003:005f:58ec" } ] }, { "name": "mlx5_5", "ports": [ { "port": "1", "rate": "100 Gb/sec (2X HDR)", "state": "4: ACTIVE", "phys_state": "5: LinkUp", "gid": "fe80:0000:0000:0000:9c63:c003:005f:58ed" } ] }, { "name": "mlx5_6", "ports": [ { "port": "1", "rate": "400 Gb/sec (4X NDR)", "state": "4: ACTIVE", "phys_state": "5: LinkUp", "gid": "fe80:0000:0000:0000:9c63:c003:0055:0e56" } ] }, { "name": "mlx5_7", "ports": [ { "port": "1", "rate": "400 Gb/sec (4X NDR)", "state": "4: ACTIVE", "phys_state": "5: LinkUp", "gid": "fe80:0000:0000:0000:a088:c203:00f0:286c" } ] }, { "name": "mlx5_8", "ports": [ { "port": "1", "rate": "25 Gb/sec (1X EDR)", "state": "4: ACTIVE", "phys_state": "5: LinkUp", "gid": "fe80:0000:0000:0000:a02d:75ff:feae:2bcf" } ] }, { "name": "mlx5_9", "ports": [ { "port": "1", "rate": "25 Gb/sec (1X EDR)", "state": "1: DOWN", "phys_state": "3: Disabled", "gid": "fe80:0000:0000:0000:c670:bdff:fefd:569d" } ] } ], "bandwidth_tests": [ { "test": "ib_write_bw", "status": "WARN", "bandwidth_gbps": 0.13, "min_required_gbps": 50 }, { "test": "ib_read_bw", "status": "WARN", "bandwidth_gbps": 0.13, "min_required_gbps": 50 } ], "latency_tests": [ { "test": "ib_write_lat", "status": "PASS", "latency_us": 4.1, "max_allowed_us": 10 }, { "test": "ib_read_lat", "status": "WARN", "latency_us": 16.0, "max_allowed_us": 10 } ], "timestamp": "2026-05-22T15:52:03.507540" }, "training": { "model": "synthetic_transformer", "total_params_m": 1470.5, "num_layers": 6, "hidden_size": 4096, "gpu_count": 8, "dtype": "bfloat16", "batch_size": 8, "seq_length": 2048, "num_steps": 50, "avg_step_time_ms": 312.3, "throughput_tokens_per_sec": 52471.0, "throughput_samples_per_sec": 25.62, "peak_memory_gb": 27.31, "final_loss": 0.0041, "timestamp": "2026-05-22T15:52:32.650522" } }