#!/bin/bash
#
# Training launcher: resolves the configuration files, prepares the
# environment, starts training through `accelerate launch`, and finally
# writes an output JSON summary.

# Start timestamp in seconds since the epoch; subtracted from the end
# timestamp later to report the total runtime.
BEGIN_TIME=$(date +%s)

# Configuration name. It selects the YAML file below and is reused later as
# the W&B default run name and as the task name passed to Python entry points.
CONFIG_NAME="Train_Config_Default"
CONFIG_FILE="model_config/${CONFIG_NAME}.yml"
echo "CONFIG_FILE_PATH: $CONFIG_FILE"

### ============ Read input config and reload config YAML (begin) ============

#######################################
# Create a symbolic link unless something already exists at the link path.
# A bare `ln -s` is not safe to repeat: with GNU ln, when the link path is an
# existing directory (or a symlink to one) the new link is created *inside*
# it, so a second run of this script would produce a nested link.
# Arguments: $1 - link target
#            $2 - path of the link to create
# Outputs:   a note on stdout when the link path already exists
# Returns:   0 when the path already exists, otherwise the status of `ln`
#######################################
ensure_symlink() {
  local target=$1
  local link_path=$2
  # -L also catches a dangling symlink, for which -e is false.
  if [[ -e "$link_path" || -L "$link_path" ]]; then
    echo "Link path already exists, leaving it untouched: $link_path"
    return 0
  fi
  ln -s -- "$target" "$link_path"
}

# Expose the weights directory at ../weights (VISION_ENCODER_NAME points there).
ensure_symlink /home/qi.xiong/Temp/RDT-1B/input/weights ../weights

TRAIN_CONFIG_FILE="input/config.json"
echo "TRAIN_CONFIG_FILE_PATH: $TRAIN_CONFIG_FILE"

# The helper receives the input JSON path and the YAML config path.
# A failure is reported but does not stop the script, matching the original
# control flow.
if ! python scripts/read_config.py "$TRAIN_CONFIG_FILE" "$CONFIG_FILE"; then
  echo "WARNING: scripts/read_config.py exited with a non-zero status" >&2
fi

### ============ Read input config and reload config YAML (end) ============

# ---- NCCL settings -------------------------------------------------------
export NCCL_IB_HCA=mlx5_0:1,mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_DEBUG=INFO
export NCCL_NVLS_ENABLE=0
export NCCL_SOCKET_IFNAME=eth0
export NCCL_P2P_DISABLE=1
# NOTE(review): NCCL_IB_DISABLE=1 turns the InfiniBand transport off, so the
# NCCL_IB_HCA list above presumably has no effect - confirm which is intended.
export NCCL_IB_DISABLE=1

# ---- Encoders ------------------------------------------------------------
# TEXT_ENCODER_NAME is intentionally left commented out here. The launch
# command still expands $TEXT_ENCODER_NAME, so unless it is inherited from the
# calling environment that flag receives an empty value.
# export TEXT_ENCODER_NAME="google/t5-v1_1-xxl"
export VISION_ENCODER_NAME="../weights/siglip-so400m-patch14-384"

# ---- Compiler / linker search paths exported to child processes ----------
export CFLAGS="-I/usr/include"
export LDFLAGS="-L/usr/lib/x86_64-linux-gnu"

# ---- Weights & Biases ----------------------------------------------------
export WANDB_PROJECT="RDT-1B"
export WANDB_DEFAULT_RUN_NAME="$CONFIG_NAME"

# Abort early when the YAML config is missing: every training parameter read
# below comes from this file.
if [[ ! -f "$CONFIG_FILE" ]]; then
  # Diagnostics go to stderr so they are not mixed into captured stdout.
  echo "Config file $CONFIG_FILE does not exist!" >&2
  exit 1
fi

#######################################
# Read one key from the YAML config through scripts/read_yaml.py.
# Globals:   CONFIG_FILE (read)
# Arguments: $1 - key to look up
# Outputs:   whatever the helper script prints on stdout
# Returns:   exit status of the helper script
#######################################
read_yaml_value() {
  python scripts/read_yaml.py "$CONFIG_FILE" "$1"
}

#######################################
# Remove every double-quote character from a string.
# Arguments: $1 - input string
# Outputs:   the string without '"' characters, on stdout
#######################################
strip_double_quotes() {
  printf '%s\n' "${1//\"/}"
}

PRETRAINED_MODEL_NAME=$(read_yaml_value pretrained_model_name_or_path)
TRAIN_BATCH_SIZE=$(read_yaml_value train_batch_size)
SAMPLE_BATCH_SIZE=$(read_yaml_value sample_batch_size)
MAX_TRAIN_STEPS=$(read_yaml_value max_train_steps)
CHECKPOINTING_PERIOD=$(read_yaml_value checkpointing_period)
SAMPLE_PERIOD=$(read_yaml_value sample_period)
CHECKPOINTS_TOTAL_LIMIT=$(read_yaml_value checkpoints_total_limit)
# NOTE(review): LR_SCHEDULER and DATASET_TYPE are read here, but the launch
# command passes the literals "constant" and "finetune" instead - confirm
# whether the YAML values are meant to be honored.
LR_SCHEDULER=$(read_yaml_value lr_scheduler)
LEARNING_RATE=$(read_yaml_value learning_rate)
DATALOADER_NUM_WORKERS=$(read_yaml_value dataloader_num_workers)
DATASET_TYPE=$(read_yaml_value dataset_type)
STATE_NOISE_SNR=$(read_yaml_value state_noise_snr)
GRAD_ACCUM_STEPS=$(read_yaml_value gradient_accumulation_steps)
OUTPUT_DIR=$(read_yaml_value checkpoint_path)
CUDA_USE=$(read_yaml_value cuda_visible_device)

# These three values are stripped of double-quote characters before use as a
# model path, an output path and a device list.
PRETRAINED_MODEL_NAME=$(strip_double_quotes "$PRETRAINED_MODEL_NAME")
CUDA_USE=$(strip_double_quotes "$CUDA_USE")
OUTPUT_DIR=$(strip_double_quotes "$OUTPUT_DIR")

# Create the output path.
# An empty value (e.g. the key is missing from the YAML) would make
# `mkdir -p ""` fail and later place the training log at "/output.log",
# so stop here instead.
if [[ -z "$OUTPUT_DIR" ]]; then
  echo "checkpoint_path is empty in $CONFIG_FILE; cannot create output directory" >&2
  exit 1
fi

if [[ -d "$OUTPUT_DIR" ]]; then
  echo "Output directory already exists: $OUTPUT_DIR"
elif mkdir -p -- "$OUTPUT_DIR"; then
  echo "Created output directory: $OUTPUT_DIR"
else
  # Only report success when mkdir actually succeeded.
  echo "Failed to create output directory: $OUTPUT_DIR" >&2
  exit 1
fi

# Restrict the visible GPUs to the configured list. An empty value is not
# exported, because an empty CUDA_VISIBLE_DEVICES hides every device.
if [[ -n "$CUDA_USE" ]]; then
  export CUDA_VISIBLE_DEVICES="$CUDA_USE"
else
  echo "WARNING: cuda_visible_device is empty; leaving CUDA_VISIBLE_DEVICES unchanged" >&2
fi

# Run the data.compute_dataset_stat_hdf5 module for this task before training
# starts. The task name is quoted so it is always passed as a single argument.
python -m data.compute_dataset_stat_hdf5 --task_name "$CONFIG_NAME"

# Arguments for main.py, kept in an array so that every value is passed as
# exactly one word even if it contains spaces or glob characters.
launch_args=(
  --deepspeed="./configs/zero2.json"
  --pretrained_model_name_or_path="$PRETRAINED_MODEL_NAME"
  # TEXT_ENCODER_NAME is not set by this script (its export is commented
  # out); when it is not inherited either, the flag receives an empty value.
  --pretrained_text_encoder_name_or_path="${TEXT_ENCODER_NAME:-}"
  --pretrained_vision_encoder_name_or_path="$VISION_ENCODER_NAME"
  --output_dir="$OUTPUT_DIR"
  --train_batch_size="$TRAIN_BATCH_SIZE"
  --sample_batch_size="$SAMPLE_BATCH_SIZE"
  --max_train_steps="$MAX_TRAIN_STEPS"
  --checkpointing_period="$CHECKPOINTING_PERIOD"
  --sample_period="$SAMPLE_PERIOD"
  --checkpoints_total_limit="$CHECKPOINTS_TOTAL_LIMIT"
  # NOTE(review): hard-coded; the lr_scheduler value read from the YAML
  # config is not used here - confirm this is intentional.
  --lr_scheduler="constant"
  --learning_rate="$LEARNING_RATE"
  --mixed_precision="bf16"
  --dataloader_num_workers="$DATALOADER_NUM_WORKERS"
  --image_aug
  # NOTE(review): hard-coded; the dataset_type value read from the YAML
  # config is not used here - confirm this is intentional.
  --dataset_type="finetune"
  --state_noise_snr="$STATE_NOISE_SNR"
  --load_from_hdf5
  --report_to=wandb
  --precomp_lang_embed
  --gradient_accumulation_steps="$GRAD_ACCUM_STEPS"
  --model_config_path="$CONFIG_FILE"
  --CONFIG_NAME="$CONFIG_NAME"
  --output_log_path="$OUTPUT_DIR/output.log"
)

# A training failure is reported on stderr; the script still continues to the
# runtime report and output JSON steps, as the original control flow did.
if ! accelerate launch --main_process_port=28499 main.py "${launch_args[@]}"; then
  echo "WARNING: accelerate launch exited with a non-zero status" >&2
fi

# Total wall-clock runtime: end timestamp minus the BEGIN_TIME recorded at the
# top of the script, both in seconds since the epoch.
END_TIME=$(date +%s)
RUNTIME=$((END_TIME - BEGIN_TIME))
echo "Total runtime: $RUNTIME seconds"

### ============ Generate output JSON (begin) ============

# The helper receives the input JSON path, the output directory and the
# measured runtime in seconds.
python scripts/generate_output_json.py "$TRAIN_CONFIG_FILE" "$OUTPUT_DIR" "$RUNTIME"

### ============ Generate output JSON (end) ============