# test_gpu_scripts/configs/default.yaml
# qinyusen 2cb776d7d5 fix: generic branding, wire up report generation, fix --config flag
# Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent)

# Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
# 2026-05-06 19:32:01 +08:00

# 63 lines
# 1.1 KiB
# YAML

# Default settings for the GPU test scripts, grouped by test section.
#
# NOTE(review): the nesting below was reconstructed from a copy whose
# indentation had been stripped. Read flat, the document repeats the keys
# `iterations` and `min_bandwidth_gbps`, so those keys must live in separate
# sections. `memory` and `compute` are placed under `benchmark`; every other
# section is kept at top level. Confirm against the config loader that
# health/nccl/stress/rdma/training are not expected under `benchmark`.

# GPU type: auto-detect or override to h100/h200/b200/b300
gpu_type: auto

benchmark:
  # Memory benchmark parameters (sizes in MB, per the key names).
  memory:
    size_mb: 4096
    iterations: 10
    nvbandwidth_buffer_mb: 512
    nvbandwidth_samples: 3

  # Compute benchmark parameters; one entry in `dtypes` per data type listed.
  compute:
    dtypes:
      - fp32
      - tf32
      - fp16
      - bf16
      - fp8
    matrix_size: 4096
    warmup: 10
    iterations: 100

# Health-check thresholds.
# NOTE(review): temperature units are not stated here (presumably degrees
# Celsius) - confirm with the consumer of these values.
health:
  temp_warning: 80
  temp_critical: 90
  power_limit: null  # null = auto-detect from GPU TDP (H100/H200: 700W, B200: 1000W, B300: 1200W)

# NCCL settings: a bandwidth floor plus one boolean toggle per collective.
nccl:
  min_bandwidth_gbps: null  # null = auto-detect (40% of GPU NVLink BW)
  test_allreduce: true
  test_alltoall: true
  test_broadcast: true
  test_reduce_scatter: false
  test_allgather: false
  test_sendrecv: false

# Stress-test settings (duration in seconds, memory as a percentage).
stress:
  duration_sec: 60
  use_doubles: false
  use_tensor_cores: true
  memory_pct: 90
  gpus: all

# RDMA settings (bandwidth in Gbps, latency in microseconds, per key names).
rdma:
  min_bandwidth_gbps: 50
  max_latency_us: 10
  ib_iterations: 1000
  msg_size: 65536
  # NOTE(review): null presumably means "pick a device automatically", as the
  # other null values in this file do - confirm.
  ib_device: null
  ib_port: 1

# Training-run settings.
training:
  model: gpt2
  batch_size: 8
  seq_length: 2048
  num_steps: 50
  dtype: bf16

# Report output location and format.
report:
  output_dir: ./reports
  format: json

tools:
  # NOTE(review): this path still carries an "h200" name although gpu_type
  # accepts h100/h200/b200/b300 - confirm the name is intentional before
  # renaming, since installers may already depend on it.
  install_dir: /opt/h200-test-tools