# Files
# ScreenCoder_UI2Code/post-training/VLM-R1/run_exp.sh
# 2025-10-22 15:38:32 +08:00

# 110 lines
# 2.6 KiB
# Bash
# Executable File

#!/usr/bin/env bash
# Environment and experiment settings for the run named by RUN_NAME.
#
# Strict mode is enabled before anything else so that a failed `cd` or a
# misspelled variable stops the script instead of letting later steps run
# from the wrong directory or with empty values.
# NOTE: the shebang line only takes effect when it is the first line of the
# file; anywhere else it is an ordinary comment.
set -euo pipefail

# 1. Export paths
# Relative paths such as LOG_PATH are resolved from this directory, so failing
# to enter it is treated as fatal.
cd src/open-r1-multimodal || {
  echo "ERROR: cannot enter src/open-r1-multimodal from $PWD." >&2
  exit 1
}
export DEBUG_MODE="true"
# HOME_ROOT comes from the caller's environment. If it is unset or empty,
# PROJ_ROOT collapses to "/code_mllm"; warn about that here and let the path
# verification further down decide whether the run can continue.
if [[ -z "${HOME_ROOT:-}" ]]; then
  echo "WARNING: HOME_ROOT is not set; PROJ_ROOT resolves to /code_mllm." >&2
fi
export PROJ_ROOT="${HOME_ROOT:-}/code_mllm"
RUN_NAME="Qwen2.5-VL-7B-GRPO-websight"
export LOG_PATH="./debug_log_$RUN_NAME.txt"
export PLACEHOLDER_PATH="$PROJ_ROOT/VLM-R1/placeholder.jpg"
export CSS_PATH="$PROJ_ROOT/VLM-R1/tailwind.min.css"
image_folder="$PROJ_ROOT/LLaMA-Factory/data"
data_file_paths="$PROJ_ROOT/LLaMA-Factory/data/CodeMLLM/websight/train_rl.json"

# 2. Experiment parameters
model_name="Qwen/Qwen2.5-VL-7B-Instruct"
gpu_num="8"
bs_per_device=1
num_generations=8 # assert (bs_per_device x gpu_num) % num_generations == 0
resume="True"

# Ensure PROJ_ROOT is set
# Defensive only: PROJ_ROOT is assigned above, so this fires solely if that
# assignment is removed or emptied in a later edit.
if [[ -z "${PROJ_ROOT:-}" ]]; then
  echo "ERROR: PROJ_ROOT is not defined." >&2
  exit 1
fi
# 3. Declare expected type for each
# Keys are the NAMES of variables that hold a path; values say which kind of
# filesystem entry that path must be ("file" or "dir").
declare -A expected=(
  [PLACEHOLDER_PATH]=file
  [CSS_PATH]=file
  [image_folder]=dir
  [data_file_paths]=file
)

# 4. Test existence
# Every entry is checked (no early exit) so one run reports all problems.
all_good=true
for var in "${!expected[@]}"; do
  # Indirect expansion: read the variable whose name is stored in $var.
  # The ":-" default makes an unset variable show up as a missing path
  # instead of aborting with "unbound variable" under `set -u`.
  path="${!var:-}"
  type="${expected[$var]}"
  case "$type" in
    file)
      if [[ ! -f "$path" ]]; then
        echo "✗ File missing: $var=$path" >&2
        all_good=false
      else
        echo "✔ File exists: $var=$path"
      fi
      ;;
    dir)
      if [[ ! -d "$path" ]]; then
        echo "✗ Directory missing: $var=$path" >&2
        all_good=false
      else
        echo "✔ Directory exists: $var=$path"
      fi
      ;;
    *)
      # An unrecognised type is reported but does not fail the run.
      echo "WARNING: Unknown type for $var: $type" >&2
      ;;
  esac
done

# 5. Exit non-zero if any missing
# Compare as a string rather than executing the variable's value as a command.
if [[ "$all_good" != true ]]; then
  echo "One or more paths were missing." >&2
  exit 2
fi
echo "All paths verified successfully."
torchrun --nproc_per_node=$gpu_num \
--nnodes="1" \
--node_rank="0" \
--master_addr="127.0.0.1" \
--master_port="12346" \
src/open_r1/grpo_jsonl.py \
--deepspeed local_scripts/zero3.json \
--output_dir $PROJ_ROOT/VLM-R1/output/$RUN_NAME \
--model_name_or_path $model_name \
--dataset_name none \
--image_folders $image_folder\
--data_file_paths $data_file_paths \
--freeze_vision_modules true \
--max_pixels 1843200 \
--max_prompt_length 4096 \
--max_completion_length 2048 \
--num_generations $num_generations \
--per_device_train_batch_size $bs_per_device \
--gradient_accumulation_steps 1 \
--logging_steps 1 \
--bf16 \
--torch_dtype bfloat16 \
--data_seed 42 \
--report_to wandb \
--gradient_checkpointing true \
--attn_implementation flash_attention_2 \
--num_train_epochs 2 \
--run_name $RUN_NAME \
--save_steps 100 \
--save_only_model true \
--resume_from_checkpoint $resume \