# Bash launch script (~50 lines, 1.5 KiB)
#!/bin/bash
# Launch RDT-1B pretraining across multiple nodes with DeepSpeed.
# Requires: hostfile.txt (node list), ./configs/zero2.json (ZeRO-2 config),
# and main.py in the working directory.
set -euo pipefail

# NCCL settings for multi-node communication over InfiniBand.
export NCCL_IB_HCA=mlx5_0:1,mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_IB_DISABLE=0
export NCCL_SOCKET_IFNAME=bond0
export NCCL_DEBUG=INFO
export NCCL_NVLS_ENABLE=0

# Pretrained encoder checkpoints and output location.
export TEXT_ENCODER_NAME="google/t5-v1_1-xxl"
export VISION_ENCODER_NAME="google/siglip-so400m-patch14-384"
export OUTPUT_DIR="./checkpoints/rdt-pretrain-1b"

# Build flags / CUTLASS location (adjust CUTLASS_PATH for your machine).
export CFLAGS="-I/usr/include"
export LDFLAGS="-L/usr/lib/x86_64-linux-gnu"
export CUTLASS_PATH="/path/to/cutlass"

export WANDB_PROJECT="robotics_diffusion_transformer"

# Create the checkpoint directory if missing; -p also creates the
# ./checkpoints parent, which a bare mkdir would fail on.
if [ ! -d "$OUTPUT_DIR" ]; then
  mkdir -p "$OUTPUT_DIR"
  echo "Folder '$OUTPUT_DIR' created"
else
  echo "Folder '$OUTPUT_DIR' already exists"
fi

# For run in a single node/machine
# accelerate launch main.py \
#   --deepspeed="./configs/zero2.json" \
#   ...

deepspeed --hostfile=hostfile.txt main.py \
  --deepspeed="./configs/zero2.json" \
  --pretrained_text_encoder_name_or_path="$TEXT_ENCODER_NAME" \
  --pretrained_vision_encoder_name_or_path="$VISION_ENCODER_NAME" \
  --output_dir="$OUTPUT_DIR" \
  --train_batch_size=32 \
  --sample_batch_size=64 \
  --max_train_steps=1000000 \
  --checkpointing_period=1000 \
  --sample_period=500 \
  --checkpoints_total_limit=40 \
  --lr_scheduler="constant" \
  --learning_rate=1e-4 \
  --mixed_precision="bf16" \
  --dataloader_num_workers=8 \
  --dataset_type="pretrain" \
  --report_to=wandb

# Use this to resume training from some previous checkpoint
# --resume_from_checkpoint="checkpoint-1000" \