#!/bin/bash
#
# Launches a fine-tuning run described by model_config/<CONFIG_NAME>.yml.
#
# Steps, in order:
#   1. scripts/read_config.py is invoked with input/config.json and the YAML
#      path.
#   2. Hyper-parameters are read from the YAML through scripts/read_yaml.py.
#   3. data.compute_dataset_stat_hdf5 is run for the task.
#   4. main.py is started through `accelerate launch`, with
#      ./configs/zero2.json passed as the --deepspeed config.
#   5. scripts/generate_output_json.py is called with the input config, the
#      output directory and the wall-clock runtime in seconds.
#
# Optional environment:
#   TEXT_ENCODER_NAME  Forwarded as --pretrained_text_encoder_name_or_path.
#                      The in-script default is commented out, so the flag is
#                      passed with an empty value unless the caller exports it.
#
# Exit status:
#   1          a preparatory step failed (config, YAML read, dataset stats,
#              output directory);
#   non-zero   the training launcher's status when training failed;
#   otherwise  the status of scripts/generate_output_json.py.

# No `set -e`: failures are checked explicitly so that the output JSON is still
# generated after a failed training run.
set -u

BEGIN_TIME=$(date +%s)

CONFIG_NAME="Train_Config_Default"
CONFIG_FILE="model_config/$CONFIG_NAME.yml"
echo "CONFIG_FILE_PATH: $CONFIG_FILE"

#######################################
# Prints a message to stderr and terminates the script with status 1.
# Arguments: $* - message text
#######################################
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

#######################################
# Prints the value stored under one key of the model YAML.
# Globals:   CONFIG_FILE (read)
# Arguments: $1 - key name handed to scripts/read_yaml.py
# Outputs:   the value on stdout; a diagnostic on stderr on failure
# Returns:   0 on success, 1 if scripts/read_yaml.py exits non-zero
#######################################
read_yaml_value() {
  local key=$1
  local value
  # Declaration and assignment are separate so the exit status of the
  # command substitution is not masked by `local`.
  if ! value=$(python3 scripts/read_yaml.py "$CONFIG_FILE" "$key"); then
    printf 'Failed to read "%s" from %s\n' "$key" "$CONFIG_FILE" >&2
    return 1
  fi
  printf '%s\n' "$value"
}

### ============Read Input Config and ReLoad Config YAML===================
TRAIN_CONFIG_FILE="input/config.json"
echo "TRAIN_CONFIG_FILE_PATH: $TRAIN_CONFIG_FILE"
python3 scripts/read_config.py "$TRAIN_CONFIG_FILE" "$CONFIG_FILE" \
  || die "scripts/read_config.py failed for $TRAIN_CONFIG_FILE"
### ============Read Input Config and ReLoad Config YAML===================

# NOTE(review): NCCL_IB_DISABLE=1 is exported further down, so this HCA list
# presumably has no effect at the moment -- confirm before removing either.
export NCCL_IB_HCA=mlx5_0:1,mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_DEBUG=INFO
export NCCL_NVLS_ENABLE=0
export NCCL_SOCKET_IFNAME=eth0

# The text encoder default is intentionally left commented out; see the
# TEXT_ENCODER_NAME note in the header.
# export TEXT_ENCODER_NAME="google/t5-v1_1-xxl"
export VISION_ENCODER_NAME="../weights/siglip-so400m-patch14-384"

export CFLAGS="-I/usr/include"
export LDFLAGS="-L/usr/lib/x86_64-linux-gnu"
export WANDB_PROJECT="RDT-170M"
export WANDB_DEFAULT_RUN_NAME="$CONFIG_NAME"
export NCCL_P2P_DISABLE=1
export NCCL_IB_DISABLE=1

# check if YAML exist
# This runs after scripts/read_config.py, which receives the YAML path as its
# second argument.
if [[ ! -f "$CONFIG_FILE" ]]; then
  die "Config file $CONFIG_FILE does not exist!"
fi

# The YAML keys lr_scheduler and dataset_type are not read here: the launch
# below passes fixed values ("constant" and "finetune") for both.
PRETRAINED_MODEL_NAME=$(read_yaml_value pretrained_model_name_or_path) || exit 1
TRAIN_BATCH_SIZE=$(read_yaml_value train_batch_size) || exit 1
SAMPLE_BATCH_SIZE=$(read_yaml_value sample_batch_size) || exit 1
MAX_TRAIN_STEPS=$(read_yaml_value max_train_steps) || exit 1
CHECKPOINTING_PERIOD=$(read_yaml_value checkpointing_period) || exit 1
SAMPLE_PERIOD=$(read_yaml_value sample_period) || exit 1
CHECKPOINTS_TOTAL_LIMIT=$(read_yaml_value checkpoints_total_limit) || exit 1
LEARNING_RATE=$(read_yaml_value learning_rate) || exit 1
DATALOADER_NUM_WORKERS=$(read_yaml_value dataloader_num_workers) || exit 1
STATE_NOISE_SNR=$(read_yaml_value state_noise_snr) || exit 1
GRAD_ACCUM_STEPS=$(read_yaml_value gradient_accumulation_steps) || exit 1
OUTPUT_DIR=$(read_yaml_value checkpoint_path) || exit 1
CUDA_USE=$(read_yaml_value cuda_visible_device) || exit 1

export WANDB_MODE=disabled

# Remove every double-quote character from the path-like values.
PRETRAINED_MODEL_NAME=${PRETRAINED_MODEL_NAME//\"/}
CUDA_USE=${CUDA_USE//\"/}
OUTPUT_DIR=${OUTPUT_DIR//\"/}

# create output path
[[ -n "$OUTPUT_DIR" ]] || die "checkpoint_path in $CONFIG_FILE is empty"
if [[ ! -d "$OUTPUT_DIR" ]]; then
  mkdir -p "$OUTPUT_DIR" || die "Cannot create output directory: $OUTPUT_DIR"
  echo "Created output directory: $OUTPUT_DIR"
else
  echo "Output directory already exists: $OUTPUT_DIR"
fi

export CUDA_VISIBLE_DEVICES="$CUDA_USE"

python3 -m data.compute_dataset_stat_hdf5 --task_name "$CONFIG_NAME" \
  || die "data.compute_dataset_stat_hdf5 failed for task $CONFIG_NAME"

# Arguments are collected in an array so that each value reaches main.py as
# exactly one word, even if it contains whitespace or glob characters.
launch_args=(
  --deepspeed="./configs/zero2.json"
  --pretrained_model_name_or_path="$PRETRAINED_MODEL_NAME"
  --pretrained_text_encoder_name_or_path="${TEXT_ENCODER_NAME:-}"
  --pretrained_vision_encoder_name_or_path="$VISION_ENCODER_NAME"
  --output_dir="$OUTPUT_DIR"
  --train_batch_size="$TRAIN_BATCH_SIZE"
  --sample_batch_size="$SAMPLE_BATCH_SIZE"
  --max_train_steps="$MAX_TRAIN_STEPS"
  --checkpointing_period="$CHECKPOINTING_PERIOD"
  --sample_period="$SAMPLE_PERIOD"
  --checkpoints_total_limit="$CHECKPOINTS_TOTAL_LIMIT"
  --lr_scheduler="constant"
  --learning_rate="$LEARNING_RATE"
  --mixed_precision="bf16"
  --dataloader_num_workers="$DATALOADER_NUM_WORKERS"
  --image_aug
  --dataset_type="finetune"
  --state_noise_snr="$STATE_NOISE_SNR"
  --load_from_hdf5
  --report_to=wandb
  --precomp_lang_embed
  --gradient_accumulation_steps="$GRAD_ACCUM_STEPS"
  --model_config_path="$CONFIG_FILE"
  --CONFIG_NAME="$CONFIG_NAME"
  --output_log_path="$OUTPUT_DIR/output.log"
)

accelerate launch --main_process_port=28499 main.py "${launch_args[@]}"
train_status=$?

END_TIME=$(date +%s)
RUNTIME=$((END_TIME - BEGIN_TIME))
echo "Total runtime: $RUNTIME seconds"

### ============Generate Output JSON===================
# NOTE(review): the reason for the 10 s pause is not visible here; presumably
# it lets the launcher's processes finish writing -- TODO confirm.
sleep 10
python3 scripts/generate_output_json.py "$TRAIN_CONFIG_FILE" "$OUTPUT_DIR" "$RUNTIME"
json_status=$?
### ============Generate Output JSON===================

# The output JSON is produced even after a failed run, but the failure must
# not be hidden behind the JSON generator's exit status.
if ((train_status != 0)); then
  printf 'Training exited with status %d\n' "$train_status" >&2
  exit "$train_status"
fi
exit "$json_status"