common:
  # The number of historical images
  img_history_size: 2
  # The number of future actions to predict
  action_chunk_size: 64
  # The number of cameras used by the model
  num_cameras: 3
  # Dimension of the state/action vector; we use the same unified
  # space for both states and actions
  # This MUST match the dimension defined in configs/state_vec.py
  state_dim: 128
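  # With the values above, each sample would carry
  # img_history_size x num_cameras = 2 x 3 = 6 images
  # (assuming one image per camera per history step)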

dataset:
  # The producer extracts data from the raw dataset and stores it
  # in a disk buffer. During training, the consumer reads samples
  # from the buffer at random, and the producer replaces samples
  # that have already been read with new data.

  # The path to the buffer (requires at least 400 GB)
  buf_path: /path/to/buffer
  # The number of chunks in the buffer
  buf_num_chunks: 512
  # The number of samples (steps rather than episodes) in each chunk
  buf_chunk_size: 512
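  # Total buffer capacity: buf_num_chunks x buf_chunk_size
  # = 512 x 512 = 262,144 samples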

  # Episodes with fewer than `epsd_len_thresh_low` steps are filtered out
  epsd_len_thresh_low: 32
  # For episodes longer than `epsd_len_thresh_high`, we randomly sample
  # `epsd_len_thresh_high` steps each time the episode is loaded,
  # to better balance the training datasets
  epsd_len_thresh_high: 2048
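  # For example, a 3000-step episode would contribute a random subset of
  # 2048 steps per load, while an episode with fewer than 32 steps is dropped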
  # How to fit the image size
  image_aspect_ratio: pad
  # Maximum number of language tokens
  tokenizer_max_length: 1024

model:
  # Config for the condition adaptors
  lang_adaptor: mlp2x_gelu
  img_adaptor: mlp2x_gelu
  state_adaptor: mlp3x_gelu
  lang_token_dim: 4096
  img_token_dim: 1152
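  # The 4096-dim language tokens and 1152-dim image tokens are consistent
  # with T5-XXL-class text features and SigLIP-so400m-class image features,
  # respectively (an assumed pairing inferred from the dimensions)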
  # Dim of an action or proprioception vector
  # (a `state` refers to either an action or a proprioception vector)
  state_token_dim: 128
  # Config for the RDT structure
  rdt:
    # 1B: num_heads 32, hidden_size 2048
    hidden_size: 2048
    depth: 28
    num_heads: 32
    cond_pos_embed_type: multimodal
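    # i.e. 28 Transformer blocks, each with 32 attention heads of per-head
    # dimension 2048 / 32 = 64 (assuming standard multi-head attention)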
  # For noise scheduler
  noise_scheduler:
    type: ddpm
    num_train_timesteps: 1000
    num_inference_timesteps: 5
    beta_schedule: squaredcos_cap_v2 # Critical choice
    prediction_type: sample
    clip_sample: False
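    # Training diffuses actions over 1000 timesteps, while inference denoises
    # the action chunk in only 5 steps; `prediction_type: sample` means the
    # network predicts the clean action chunk directly rather than the noise
    # (assuming diffusers-style scheduler semantics)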
  # For EMA (params averaging)
  # We do not use EMA currently
  ema:
    update_after_step: 0
    inv_gamma: 1.0
    power: 0.75
    min_value: 0.0
    max_value: 0.9999