common:
  # The number of historical images
  img_history_size: 2
  # The number of future actions to predict
  action_chunk_size: 64
  # The number of cameras used by the model
  num_cameras: 3
  # Dimension of the state/action vector; we use the same unified
  # space for both states and actions
  # This MUST match the dimension defined in configs/state_vec.py
  state_dim: 128
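  # With the values above, each sample would carry
  # img_history_size x num_cameras = 2 x 3 = 6 images
  # (assuming one image per camera per history step)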

dataset:
  # The producer extracts data from the raw dataset and stores it
  # in a disk buffer. During training, the consumer reads samples
  # from the buffer at random, and the producer replaces samples
  # that have already been read with new data.

  # The path to the buffer (requires at least 400 GB)
  buf_path: /path/to/buffer
  # The number of chunks in the buffer
  buf_num_chunks: 512
  # The number of samples (steps rather than episodes) in each chunk
  buf_chunk_size: 512
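  # Total buffer capacity: buf_num_chunks x buf_chunk_size
  # = 512 x 512 = 262,144 samples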

  # Episodes with fewer than `epsd_len_thresh_low` steps are filtered out
  epsd_len_thresh_low: 32
  # For episodes longer than `epsd_len_thresh_high`, we randomly sample
  # `epsd_len_thresh_high` steps each time the episode is loaded,
  # to better balance the training datasets
  epsd_len_thresh_high: 2048
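  # For example, a 3000-step episode would contribute a random subset of
  # 2048 steps per load, while an episode with fewer than 32 steps is dropped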
  # How to fit the image size
  image_aspect_ratio: pad
  # Maximum number of language tokens
  tokenizer_max_length: 1024

model:
  # Config for the condition adaptors
  lang_adaptor: mlp2x_gelu
  img_adaptor: mlp2x_gelu
  state_adaptor: mlp3x_gelu
  lang_token_dim: 4096
  img_token_dim: 1152
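  # The 4096-dim language tokens and 1152-dim image tokens are consistent
  # with T5-XXL-class text features and SigLIP-so400m-class image features,
  # respectively (an assumed pairing inferred from the dimensions)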
  # Dim of an action or proprioception vector
  # (a `state` refers to either an action or a proprioception vector)
  state_token_dim: 128
  # Config for the RDT structure
  rdt:
    # 1B: num_heads 32, hidden_size 2048
    hidden_size: 2048
    depth: 28
    num_heads: 32
    cond_pos_embed_type: multimodal
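    # i.e. 28 Transformer blocks, each with 32 attention heads of per-head
    # dimension 2048 / 32 = 64 (assuming standard multi-head attention)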
  # For noise scheduler
  noise_scheduler:
    type: ddpm
    num_train_timesteps: 1000
    num_inference_timesteps: 5
    beta_schedule: squaredcos_cap_v2 # Critical choice
    prediction_type: sample
    clip_sample: False
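    # Training diffuses actions over 1000 timesteps, while inference denoises
    # the action chunk in only 5 steps; `prediction_type: sample` means the
    # network predicts the clean action chunk directly rather than the noise
    # (assuming diffusers-style scheduler semantics)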
  # For EMA (params averaging)
  # We do not use EMA currently
  ema:
    update_after_step: 0
    inv_gamma: 1.0
    power: 0.75
    min_value: 0.0
    max_value: 0.9999