- gpu_specs: 新增 H20/H20-3e (中国合规版 H200) 规格定义,并修复 GPU 名称匹配顺序,避免 "H200" 被 "H20" 子串误匹配
- benchmark(compute): 引入 L2 cache 规避的 matrix pool 轮换 + 可选 torch.compile(max-autotune),FP8 增加 _scaled_mm 探测,显著提升 FP16/BF16/FP8 实测吞吐准确性
- benchmark(memory): nvbandwidth 增加 --disableAffinity 规避 fabricmanager NVML 不兼容;全 0 结果时自动回退到 PyTorch;D2D 平均值排除对角线零值
- nccl: 各通信操作 (AllReduce/AllToAll/Broadcast 等) 使用独立带宽阈值比例,避免 AllToAll 误报 WARN
- rdma: 仅按 link_layer=InfiniBand 过滤端口,无 IB 硬件或全 DOWN 时直接 SKIP 而非报错
- stress: 计算矩阵尺寸封顶 4096,并改为先并发派发再统一同步,修复 8 卡串行执行导致 duration 严重超时的问题
- report: 兼容 RDMA SKIP 状态与 PyTorch 回退场景的 Memory 判定,避免回退结果被误判为 FAIL
- config: 新增 benchmark.compute.use_compile 开关

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
64 lines
1.1 KiB
YAML
64 lines
1.1 KiB
YAML
# GPU type: auto-detect or override to a100/a800/h100/h200/b200/b300
# NOTE(review): the accompanying commit message mentions newly added
# H20/H20-3e specs; confirm their override names and list them here if
# they are accepted values.
gpu_type: auto

# Benchmark settings, split into `memory` and `compute` sub-sections.
# Each sub-section has its own `iterations` key, so the nesting below is
# required to keep the two from colliding as duplicate keys.
benchmark:
  memory:
    size_mb: 4096
    iterations: 10
    nvbandwidth_buffer_mb: 512
    nvbandwidth_samples: 3
  compute:
    dtypes:
      - fp32
      - tf32
      - fp16
      - bf16
      - fp8
    matrix_size: 4096
    warmup: 10
    iterations: 100
    # Switch added together with the optional torch.compile(max-autotune)
    # path named in the accompanying commit message; presumably enables
    # that path when true — TODO confirm against the benchmark code.
    use_compile: false

# Health-check thresholds: two temperature levels and a power limit.
health:
  temp_warning: 80
  temp_critical: 90
  power_limit: null  # null = auto-detect from GPU TDP per gpu_specs.py

# NCCL settings: a bandwidth floor plus one on/off flag per collective.
nccl:
  # NOTE(review): the accompanying commit message says each collective now
  # uses its own bandwidth-threshold ratio; confirm the "40%" figure in the
  # inline comment below still describes the auto-detected value.
  min_bandwidth_gbps: null  # null = auto-detect (40% of GPU NVLink BW)
  test_allreduce: true
  test_alltoall: true
  test_broadcast: true
  test_reduce_scatter: false
  test_allgather: false
  test_sendrecv: false

# Stress-test settings.
stress:
  duration_sec: 60
  use_doubles: false
  use_tensor_cores: true
  memory_pct: 90
  gpus: all

# RDMA settings. `min_bandwidth_gbps` here is a separate key from the one
# under `nccl`; the nesting keeps the two apart.
rdma:
  min_bandwidth_gbps: 50
  max_latency_us: 10
  ib_iterations: 1000
  msg_size: 65536
  # NOTE(review): presumably null means "pick a device automatically" —
  # confirm against the RDMA test code.
  ib_device: null
  ib_port: 1

# Training-test settings.
training:
  model: gpt2
  batch_size: 8
  seq_length: 2048
  num_steps: 50
  dtype: bf16

# Report output location and format.
report:
  output_dir: ./reports
  format: json

# Directory used for the external test tools.
tools:
  install_dir: /opt/gpu-test-tools