- gpu_specs: H100 新增 compute_pass_thresholds_tflops 字段 (fp32:54 / tf32:444 / fp16:734 / bf16:745 / fp8:1400),与 marketing peak 解耦,作为绝对 TFLOPS PASS 门槛
- benchmark: compute 结果中透出 pass_thresholds_tflops 供 report 使用
- report: compute 判定改用绝对 TFLOPS (PASS ≥门槛 / WARN ≥门槛×90% / FAIL <门槛×90%);表头切换为 Threshold 列;Memory D2D verdict 由 50/30 收紧至 80/60;无阈值配置的 GPU 保留旧 % 效率逻辑
- nccl: _OP_BW_FRACTIONS 收紧至 AllReduce/AllGather/ReduceScatter 0.45、Broadcast/SendRecv 0.40、AllToAll 0.35,与验收文档 §5 一致
- configs: benchmark 默认 matrix_size 4096→8192、warmup 10→50、iterations 100→500、use_compile 改 true;health temp_warning 80→75、temp_critical 90→85,匹配生产验收稳态温度要求

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
64 lines
1.1 KiB
YAML
64 lines
1.1 KiB
YAML
# GPU type: auto-detect or override to a100/a800/h100/h200/b200/b300
gpu_type: auto

# Benchmark defaults, grouped into memory and compute sub-sections.
# NOTE(review): nesting was reconstructed from a flattened listing; confirm
# that `memory` and `compute` are children of `benchmark`.
benchmark:
  memory:
    size_mb: 4096
    iterations: 10
    nvbandwidth_buffer_mb: 512
    nvbandwidth_samples: 3
  compute:
    # Data types listed for the compute benchmark.
    dtypes:
      - fp32
      - tf32
      - fp16
      - bf16
      - fp8
    matrix_size: 8192
    warmup: 50
    iterations: 500
    use_compile: true

# Health-check thresholds.
# NOTE(review): temperature units are presumably degrees Celsius; confirm.
health:
  temp_warning: 75
  temp_critical: 85
  power_limit: null  # null = auto-detect from GPU TDP per gpu_specs.py

# NCCL collective tests: one boolean toggle per operation.
nccl:
  # NOTE(review): the 40% figure below may be stale; confirm it against the
  # per-operation bandwidth fractions used by the NCCL checker.
  min_bandwidth_gbps: null  # null = auto-detect (40% of GPU NVLink BW)
  test_allreduce: true
  test_alltoall: true
  test_broadcast: true
  test_reduce_scatter: false
  test_allgather: false
  test_sendrecv: false

# Stress-test settings.
stress:
  duration_sec: 60
  use_doubles: false
  use_tensor_cores: true
  memory_pct: 90
  gpus: all

# RDMA / InfiniBand test settings.
rdma:
  min_bandwidth_gbps: 50
  max_latency_us: 10
  ib_iterations: 1000
  msg_size: 65536
  ib_device: null
  ib_port: 1

# Training test settings.
training:
  model: gpt2
  batch_size: 8
  seq_length: 2048
  num_steps: 50
  dtype: bf16

# Report output settings.
report:
  output_dir: ./reports
  format: json

# Location of the external test tools.
tools:
  install_dir: /opt/gpu-test-tools