mirror of
https://github.com/leigest519/ScreenCoder.git
synced 2026-03-27 23:32:30 +01:00
Add post-training folder
This commit is contained in:
266
post-training/LLaMA-Factory/examples/README.md
Normal file
266
post-training/LLaMA-Factory/examples/README.md
Normal file
@@ -0,0 +1,266 @@
|
||||
We provide diverse examples about fine-tuning LLMs.
|
||||
|
||||
Make sure to execute these commands in the `LLaMA-Factory` directory.
|
||||
|
||||
## Table of Contents
|
||||
|
||||
- [LoRA Fine-Tuning](#lora-fine-tuning)
|
||||
- [QLoRA Fine-Tuning](#qlora-fine-tuning)
|
||||
- [Full-Parameter Fine-Tuning](#full-parameter-fine-tuning)
|
||||
- [Merging LoRA Adapters and Quantization](#merging-lora-adapters-and-quantization)
|
||||
- [Inferring LoRA Fine-Tuned Models](#inferring-lora-fine-tuned-models)
|
||||
- [Extras](#extras)
|
||||
|
||||
Use `CUDA_VISIBLE_DEVICES` (GPU) or `ASCEND_RT_VISIBLE_DEVICES` (NPU) to choose computing devices.
|
||||
|
||||
By default, LLaMA-Factory uses all visible computing devices.
|
||||
|
||||
## Examples
|
||||
|
||||
### LoRA Fine-Tuning
|
||||
|
||||
#### (Continuous) Pre-Training
|
||||
|
||||
```bash
|
||||
llamafactory-cli train examples/train_lora/llama3_lora_pretrain.yaml
|
||||
```
|
||||
|
||||
#### Supervised Fine-Tuning
|
||||
|
||||
```bash
|
||||
llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml
|
||||
```
|
||||
|
||||
#### Multimodal Supervised Fine-Tuning
|
||||
|
||||
```bash
|
||||
llamafactory-cli train examples/train_lora/llava1_5_lora_sft.yaml
|
||||
llamafactory-cli train examples/train_lora/qwen2vl_lora_sft.yaml
|
||||
```
|
||||
|
||||
#### DPO/ORPO/SimPO Training
|
||||
|
||||
```bash
|
||||
llamafactory-cli train examples/train_lora/llama3_lora_dpo.yaml
|
||||
```
|
||||
|
||||
#### Multimodal DPO/ORPO/SimPO Training
|
||||
|
||||
```bash
|
||||
llamafactory-cli train examples/train_lora/qwen2vl_lora_dpo.yaml
|
||||
```
|
||||
|
||||
#### Reward Modeling
|
||||
|
||||
```bash
|
||||
llamafactory-cli train examples/train_lora/llama3_lora_reward.yaml
|
||||
```
|
||||
|
||||
#### PPO Training
|
||||
|
||||
```bash
|
||||
llamafactory-cli train examples/train_lora/llama3_lora_ppo.yaml
|
||||
```
|
||||
|
||||
#### KTO Training
|
||||
|
||||
```bash
|
||||
llamafactory-cli train examples/train_lora/llama3_lora_kto.yaml
|
||||
```
|
||||
|
||||
#### Preprocess Dataset
|
||||
|
||||
It is useful for large dataset, use `tokenized_path` in config to load the preprocessed dataset.
|
||||
|
||||
```bash
|
||||
llamafactory-cli train examples/train_lora/llama3_preprocess.yaml
|
||||
```
|
||||
|
||||
#### Evaluating on MMLU/CMMLU/C-Eval Benchmarks
|
||||
|
||||
```bash
|
||||
llamafactory-cli eval examples/train_lora/llama3_lora_eval.yaml
|
||||
```
|
||||
|
||||
#### Supervised Fine-Tuning on Multiple Nodes
|
||||
|
||||
```bash
|
||||
FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml
|
||||
FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml
|
||||
```
|
||||
|
||||
#### Supervised Fine-Tuning with DeepSpeed ZeRO-3 (Weight Sharding)
|
||||
|
||||
```bash
|
||||
FORCE_TORCHRUN=1 llamafactory-cli train examples/train_lora/llama3_lora_sft_ds3.yaml
|
||||
```
|
||||
|
||||
#### Supervised Fine-Tuning with Ray on 4 GPUs
|
||||
|
||||
```bash
|
||||
USE_RAY=1 llamafactory-cli train examples/train_lora/llama3_lora_sft_ray.yaml
|
||||
```
|
||||
|
||||
### QLoRA Fine-Tuning
|
||||
|
||||
#### Supervised Fine-Tuning with 4/8-bit Bitsandbytes/HQQ/EETQ Quantization (Recommended)
|
||||
|
||||
```bash
|
||||
llamafactory-cli train examples/train_qlora/llama3_lora_sft_otfq.yaml
|
||||
```
|
||||
|
||||
#### Supervised Fine-Tuning with 4-bit Bitsandbytes Quantization on Ascend NPU
|
||||
|
||||
```bash
|
||||
llamafactory-cli train examples/train_qlora/llama3_lora_sft_bnb_npu.yaml
|
||||
```
|
||||
|
||||
#### Supervised Fine-Tuning with 4/8-bit GPTQ Quantization
|
||||
|
||||
```bash
|
||||
llamafactory-cli train examples/train_qlora/llama3_lora_sft_gptq.yaml
|
||||
```
|
||||
|
||||
#### Supervised Fine-Tuning with 4-bit AWQ Quantization
|
||||
|
||||
```bash
|
||||
llamafactory-cli train examples/train_qlora/llama3_lora_sft_awq.yaml
|
||||
```
|
||||
|
||||
#### Supervised Fine-Tuning with 2-bit AQLM Quantization
|
||||
|
||||
```bash
|
||||
llamafactory-cli train examples/train_qlora/llama3_lora_sft_aqlm.yaml
|
||||
```
|
||||
|
||||
### Full-Parameter Fine-Tuning
|
||||
|
||||
#### Supervised Fine-Tuning on Single Node
|
||||
|
||||
```bash
|
||||
FORCE_TORCHRUN=1 llamafactory-cli train examples/train_full/llama3_full_sft.yaml
|
||||
```
|
||||
|
||||
#### Supervised Fine-Tuning on Multiple Nodes
|
||||
|
||||
```bash
|
||||
FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_full/llama3_full_sft.yaml
|
||||
FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_full/llama3_full_sft.yaml
|
||||
```
|
||||
|
||||
#### Multimodal Supervised Fine-Tuning
|
||||
|
||||
```bash
|
||||
FORCE_TORCHRUN=1 llamafactory-cli train examples/train_full/qwen2vl_full_sft.yaml
|
||||
```
|
||||
|
||||
### Merging LoRA Adapters and Quantization
|
||||
|
||||
#### Merge LoRA Adapters
|
||||
|
||||
Note: DO NOT use quantized model or `quantization_bit` when merging LoRA adapters.
|
||||
|
||||
```bash
|
||||
llamafactory-cli export examples/merge_lora/llama3_lora_sft.yaml
|
||||
```
|
||||
|
||||
#### Quantizing Model using AutoGPTQ
|
||||
|
||||
```bash
|
||||
llamafactory-cli export examples/merge_lora/llama3_gptq.yaml
|
||||
```
|
||||
|
||||
### Save Ollama modelfile
|
||||
|
||||
```bash
|
||||
llamafactory-cli export examples/merge_lora/llama3_full_sft.yaml
|
||||
```
|
||||
|
||||
### Inferring LoRA Fine-Tuned Models
|
||||
|
||||
#### Batch Generation using vLLM Tensor Parallel
|
||||
|
||||
```
|
||||
python scripts/vllm_infer.py --model_name_or_path path_to_merged_model --dataset alpaca_en_demo
|
||||
```
|
||||
|
||||
#### Use CLI ChatBox
|
||||
|
||||
```bash
|
||||
llamafactory-cli chat examples/inference/llama3_lora_sft.yaml
|
||||
```
|
||||
|
||||
#### Use Web UI ChatBox
|
||||
|
||||
```bash
|
||||
llamafactory-cli webchat examples/inference/llama3_lora_sft.yaml
|
||||
```
|
||||
|
||||
#### Launch OpenAI-style API
|
||||
|
||||
```bash
|
||||
llamafactory-cli api examples/inference/llama3_lora_sft.yaml
|
||||
```
|
||||
|
||||
### Extras
|
||||
|
||||
#### Full-Parameter Fine-Tuning using GaLore
|
||||
|
||||
```bash
|
||||
llamafactory-cli train examples/extras/galore/llama3_full_sft.yaml
|
||||
```
|
||||
|
||||
#### Full-Parameter Fine-Tuning using APOLLO
|
||||
|
||||
```bash
|
||||
llamafactory-cli train examples/extras/apollo/llama3_full_sft.yaml
|
||||
```
|
||||
|
||||
#### Full-Parameter Fine-Tuning using BAdam
|
||||
|
||||
```bash
|
||||
llamafactory-cli train examples/extras/badam/llama3_full_sft.yaml
|
||||
```
|
||||
|
||||
#### Full-Parameter Fine-Tuning using Adam-mini
|
||||
|
||||
```bash
|
||||
llamafactory-cli train examples/extras/adam_mini/qwen2_full_sft.yaml
|
||||
```
|
||||
|
||||
#### LoRA+ Fine-Tuning
|
||||
|
||||
```bash
|
||||
llamafactory-cli train examples/extras/loraplus/llama3_lora_sft.yaml
|
||||
```
|
||||
|
||||
#### PiSSA Fine-Tuning
|
||||
|
||||
```bash
|
||||
llamafactory-cli train examples/extras/pissa/llama3_lora_sft.yaml
|
||||
```
|
||||
|
||||
#### Mixture-of-Depths Fine-Tuning
|
||||
|
||||
```bash
|
||||
llamafactory-cli train examples/extras/mod/llama3_full_sft.yaml
|
||||
```
|
||||
|
||||
#### LLaMA-Pro Fine-Tuning
|
||||
|
||||
```bash
|
||||
bash examples/extras/llama_pro/expand.sh
|
||||
llamafactory-cli train examples/extras/llama_pro/llama3_freeze_sft.yaml
|
||||
```
|
||||
|
||||
#### FSDP+QLoRA Fine-Tuning
|
||||
|
||||
```bash
|
||||
bash examples/extras/fsdp_qlora/train.sh
|
||||
```
|
||||
|
||||
#### Computing BLEU and ROUGE Scores
|
||||
|
||||
```bash
|
||||
llamafactory-cli train examples/extras/nlg_eval/llama3_lora_predict.yaml
|
||||
```
|
||||
266
post-training/LLaMA-Factory/examples/README_zh.md
Normal file
266
post-training/LLaMA-Factory/examples/README_zh.md
Normal file
@@ -0,0 +1,266 @@
|
||||
我们提供了多样化的大模型微调示例脚本。
|
||||
|
||||
请确保在 `LLaMA-Factory` 目录下执行下述命令。
|
||||
|
||||
## 目录
|
||||
|
||||
- [LoRA 微调](#lora-微调)
|
||||
- [QLoRA 微调](#qlora-微调)
|
||||
- [全参数微调](#全参数微调)
|
||||
- [合并 LoRA 适配器与模型量化](#合并-lora-适配器与模型量化)
|
||||
- [推理 LoRA 模型](#推理-lora-模型)
|
||||
- [杂项](#杂项)
|
||||
|
||||
使用 `CUDA_VISIBLE_DEVICES`(GPU)或 `ASCEND_RT_VISIBLE_DEVICES`(NPU)选择计算设备。
|
||||
|
||||
LLaMA-Factory 默认使用所有可见的计算设备。
|
||||
|
||||
## 示例
|
||||
|
||||
### LoRA 微调
|
||||
|
||||
#### (增量)预训练
|
||||
|
||||
```bash
|
||||
llamafactory-cli train examples/train_lora/llama3_lora_pretrain.yaml
|
||||
```
|
||||
|
||||
#### 指令监督微调
|
||||
|
||||
```bash
|
||||
llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml
|
||||
```
|
||||
|
||||
#### 多模态指令监督微调
|
||||
|
||||
```bash
|
||||
llamafactory-cli train examples/train_lora/llava1_5_lora_sft.yaml
|
||||
llamafactory-cli train examples/train_lora/qwen2vl_lora_sft.yaml
|
||||
```
|
||||
|
||||
#### DPO/ORPO/SimPO 训练
|
||||
|
||||
```bash
|
||||
llamafactory-cli train examples/train_lora/llama3_lora_dpo.yaml
|
||||
```
|
||||
|
||||
#### 多模态 DPO/ORPO/SimPO 训练
|
||||
|
||||
```bash
|
||||
llamafactory-cli train examples/train_lora/qwen2vl_lora_dpo.yaml
|
||||
```
|
||||
|
||||
#### 奖励模型训练
|
||||
|
||||
```bash
|
||||
llamafactory-cli train examples/train_lora/llama3_lora_reward.yaml
|
||||
```
|
||||
|
||||
#### PPO 训练
|
||||
|
||||
```bash
|
||||
llamafactory-cli train examples/train_lora/llama3_lora_ppo.yaml
|
||||
```
|
||||
|
||||
#### KTO 训练
|
||||
|
||||
```bash
|
||||
llamafactory-cli train examples/train_lora/llama3_lora_kto.yaml
|
||||
```
|
||||
|
||||
#### 预处理数据集
|
||||
|
||||
对于大数据集有帮助,在配置中使用 `tokenized_path` 以加载预处理后的数据集。
|
||||
|
||||
```bash
|
||||
llamafactory-cli train examples/train_lora/llama3_preprocess.yaml
|
||||
```
|
||||
|
||||
#### 在 MMLU/CMMLU/C-Eval 上评估
|
||||
|
||||
```bash
|
||||
llamafactory-cli eval examples/train_lora/llama3_lora_eval.yaml
|
||||
```
|
||||
|
||||
#### 多机指令监督微调
|
||||
|
||||
```bash
|
||||
FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml
|
||||
FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml
|
||||
```
|
||||
|
||||
#### 使用 DeepSpeed ZeRO-3 平均分配显存
|
||||
|
||||
```bash
|
||||
FORCE_TORCHRUN=1 llamafactory-cli train examples/train_lora/llama3_lora_sft_ds3.yaml
|
||||
```
|
||||
|
||||
#### 使用 Ray 在 4 张 GPU 上微调
|
||||
|
||||
```bash
|
||||
USE_RAY=1 llamafactory-cli train examples/train_lora/llama3_lora_sft_ray.yaml
|
||||
```
|
||||
|
||||
### QLoRA 微调
|
||||
|
||||
#### 基于 4/8 比特 Bitsandbytes/HQQ/EETQ 量化进行指令监督微调(推荐)
|
||||
|
||||
```bash
|
||||
llamafactory-cli train examples/train_qlora/llama3_lora_sft_otfq.yaml
|
||||
```
|
||||
|
||||
#### 在 NPU 上基于 4 比特 Bitsandbytes 量化进行指令监督微调
|
||||
|
||||
```bash
|
||||
llamafactory-cli train examples/train_qlora/llama3_lora_sft_bnb_npu.yaml
|
||||
```
|
||||
|
||||
#### 基于 4/8 比特 GPTQ 量化进行指令监督微调
|
||||
|
||||
```bash
|
||||
llamafactory-cli train examples/train_qlora/llama3_lora_sft_gptq.yaml
|
||||
```
|
||||
|
||||
#### 基于 4 比特 AWQ 量化进行指令监督微调
|
||||
|
||||
```bash
|
||||
llamafactory-cli train examples/train_qlora/llama3_lora_sft_awq.yaml
|
||||
```
|
||||
|
||||
#### 基于 2 比特 AQLM 量化进行指令监督微调
|
||||
|
||||
```bash
|
||||
llamafactory-cli train examples/train_qlora/llama3_lora_sft_aqlm.yaml
|
||||
```
|
||||
|
||||
### 全参数微调
|
||||
|
||||
#### 在单机上进行指令监督微调
|
||||
|
||||
```bash
|
||||
FORCE_TORCHRUN=1 llamafactory-cli train examples/train_full/llama3_full_sft.yaml
|
||||
```
|
||||
|
||||
#### 在多机上进行指令监督微调
|
||||
|
||||
```bash
|
||||
FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_full/llama3_full_sft.yaml
|
||||
FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_full/llama3_full_sft.yaml
|
||||
```
|
||||
|
||||
#### 多模态指令监督微调
|
||||
|
||||
```bash
|
||||
FORCE_TORCHRUN=1 llamafactory-cli train examples/train_full/qwen2vl_full_sft.yaml
|
||||
```
|
||||
|
||||
### 合并 LoRA 适配器与模型量化
|
||||
|
||||
#### 合并 LoRA 适配器
|
||||
|
||||
注:请勿使用量化后的模型或 `quantization_bit` 参数来合并 LoRA 适配器。
|
||||
|
||||
```bash
|
||||
llamafactory-cli export examples/merge_lora/llama3_lora_sft.yaml
|
||||
```
|
||||
|
||||
#### 使用 AutoGPTQ 量化模型
|
||||
|
||||
```bash
|
||||
llamafactory-cli export examples/merge_lora/llama3_gptq.yaml
|
||||
```
|
||||
|
||||
### 保存 Ollama 配置文件
|
||||
|
||||
```bash
|
||||
llamafactory-cli export examples/merge_lora/llama3_full_sft.yaml
|
||||
```
|
||||
|
||||
### 推理 LoRA 模型
|
||||
|
||||
#### 使用 vLLM+TP 批量推理
|
||||
|
||||
```
|
||||
python scripts/vllm_infer.py --model_name_or_path path_to_merged_model --dataset alpaca_en_demo
|
||||
```
|
||||
|
||||
#### 使用命令行对话框
|
||||
|
||||
```bash
|
||||
llamafactory-cli chat examples/inference/llama3_lora_sft.yaml
|
||||
```
|
||||
|
||||
#### 使用浏览器对话框
|
||||
|
||||
```bash
|
||||
llamafactory-cli webchat examples/inference/llama3_lora_sft.yaml
|
||||
```
|
||||
|
||||
#### 启动 OpenAI 风格 API
|
||||
|
||||
```bash
|
||||
llamafactory-cli api examples/inference/llama3_lora_sft.yaml
|
||||
```
|
||||
|
||||
### 杂项
|
||||
|
||||
#### 使用 GaLore 进行全参数训练
|
||||
|
||||
```bash
|
||||
llamafactory-cli train examples/extras/galore/llama3_full_sft.yaml
|
||||
```
|
||||
|
||||
#### 使用 APOLLO 进行全参数训练
|
||||
|
||||
```bash
|
||||
llamafactory-cli train examples/extras/apollo/llama3_full_sft.yaml
|
||||
```
|
||||
|
||||
#### 使用 BAdam 进行全参数训练
|
||||
|
||||
```bash
|
||||
llamafactory-cli train examples/extras/badam/llama3_full_sft.yaml
|
||||
```
|
||||
|
||||
#### 使用 Adam-mini 进行全参数训练
|
||||
|
||||
```bash
|
||||
llamafactory-cli train examples/extras/adam_mini/qwen2_full_sft.yaml
|
||||
```
|
||||
|
||||
#### LoRA+ 微调
|
||||
|
||||
```bash
|
||||
llamafactory-cli train examples/extras/loraplus/llama3_lora_sft.yaml
|
||||
```
|
||||
|
||||
#### PiSSA 微调
|
||||
|
||||
```bash
|
||||
llamafactory-cli train examples/extras/pissa/llama3_lora_sft.yaml
|
||||
```
|
||||
|
||||
#### 深度混合微调
|
||||
|
||||
```bash
|
||||
llamafactory-cli train examples/extras/mod/llama3_full_sft.yaml
|
||||
```
|
||||
|
||||
#### LLaMA-Pro 微调
|
||||
|
||||
```bash
|
||||
bash examples/extras/llama_pro/expand.sh
|
||||
llamafactory-cli train examples/extras/llama_pro/llama3_freeze_sft.yaml
|
||||
```
|
||||
|
||||
#### FSDP+QLoRA 微调
|
||||
|
||||
```bash
|
||||
bash examples/extras/fsdp_qlora/train.sh
|
||||
```
|
||||
|
||||
#### 计算 BLEU 和 ROUGE 分数
|
||||
|
||||
```bash
|
||||
llamafactory-cli train examples/extras/nlg_eval/llama3_lora_predict.yaml
|
||||
```
|
||||
@@ -0,0 +1,25 @@
|
||||
compute_environment: LOCAL_MACHINE
|
||||
debug: false
|
||||
distributed_type: FSDP
|
||||
downcast_bf16: 'no'
|
||||
fsdp_config:
|
||||
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
|
||||
fsdp_backward_prefetch: BACKWARD_PRE
|
||||
fsdp_forward_prefetch: false
|
||||
fsdp_cpu_ram_efficient_loading: true
|
||||
fsdp_offload_params: false
|
||||
fsdp_sharding_strategy: FULL_SHARD
|
||||
fsdp_state_dict_type: FULL_STATE_DICT
|
||||
fsdp_sync_module_states: true
|
||||
fsdp_use_orig_params: true
|
||||
machine_rank: 0
|
||||
main_training_function: main
|
||||
mixed_precision: bf16 # or fp16
|
||||
num_machines: 1 # the number of nodes
|
||||
num_processes: 2 # the number of GPUs in all nodes
|
||||
rdzv_backend: static
|
||||
same_network: true
|
||||
tpu_env: []
|
||||
tpu_use_cluster: false
|
||||
tpu_use_sudo: false
|
||||
use_cpu: false
|
||||
@@ -0,0 +1,25 @@
|
||||
compute_environment: LOCAL_MACHINE
|
||||
debug: false
|
||||
distributed_type: FSDP
|
||||
downcast_bf16: 'no'
|
||||
fsdp_config:
|
||||
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
|
||||
fsdp_backward_prefetch: BACKWARD_PRE
|
||||
fsdp_forward_prefetch: false
|
||||
fsdp_cpu_ram_efficient_loading: true
|
||||
fsdp_offload_params: true # offload may affect training speed
|
||||
fsdp_sharding_strategy: FULL_SHARD
|
||||
fsdp_state_dict_type: FULL_STATE_DICT
|
||||
fsdp_sync_module_states: true
|
||||
fsdp_use_orig_params: true
|
||||
machine_rank: 0
|
||||
main_training_function: main
|
||||
mixed_precision: bf16 # or fp16
|
||||
num_machines: 1 # the number of nodes
|
||||
num_processes: 2 # the number of GPUs in all nodes
|
||||
rdzv_backend: static
|
||||
same_network: true
|
||||
tpu_env: []
|
||||
tpu_use_cluster: false
|
||||
tpu_use_sudo: false
|
||||
use_cpu: false
|
||||
@@ -0,0 +1,28 @@
|
||||
{
|
||||
"train_batch_size": "auto",
|
||||
"train_micro_batch_size_per_gpu": "auto",
|
||||
"gradient_accumulation_steps": "auto",
|
||||
"gradient_clipping": "auto",
|
||||
"zero_allow_untested_optimizer": true,
|
||||
"fp16": {
|
||||
"enabled": "auto",
|
||||
"loss_scale": 0,
|
||||
"loss_scale_window": 1000,
|
||||
"initial_scale_power": 16,
|
||||
"hysteresis": 2,
|
||||
"min_loss_scale": 1
|
||||
},
|
||||
"bf16": {
|
||||
"enabled": "auto"
|
||||
},
|
||||
"zero_optimization": {
|
||||
"stage": 0,
|
||||
"allgather_partitions": true,
|
||||
"allgather_bucket_size": 5e8,
|
||||
"overlap_comm": false,
|
||||
"reduce_scatter": true,
|
||||
"reduce_bucket_size": 5e8,
|
||||
"contiguous_gradients": true,
|
||||
"round_robin_gradients": true
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,28 @@
|
||||
{
|
||||
"train_batch_size": "auto",
|
||||
"train_micro_batch_size_per_gpu": "auto",
|
||||
"gradient_accumulation_steps": "auto",
|
||||
"gradient_clipping": "auto",
|
||||
"zero_allow_untested_optimizer": true,
|
||||
"fp16": {
|
||||
"enabled": "auto",
|
||||
"loss_scale": 0,
|
||||
"loss_scale_window": 1000,
|
||||
"initial_scale_power": 16,
|
||||
"hysteresis": 2,
|
||||
"min_loss_scale": 1
|
||||
},
|
||||
"bf16": {
|
||||
"enabled": "auto"
|
||||
},
|
||||
"zero_optimization": {
|
||||
"stage": 2,
|
||||
"allgather_partitions": true,
|
||||
"allgather_bucket_size": 5e8,
|
||||
"overlap_comm": false,
|
||||
"reduce_scatter": true,
|
||||
"reduce_bucket_size": 5e8,
|
||||
"contiguous_gradients": true,
|
||||
"round_robin_gradients": true
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,32 @@
|
||||
{
|
||||
"train_batch_size": "auto",
|
||||
"train_micro_batch_size_per_gpu": "auto",
|
||||
"gradient_accumulation_steps": "auto",
|
||||
"gradient_clipping": "auto",
|
||||
"zero_allow_untested_optimizer": true,
|
||||
"fp16": {
|
||||
"enabled": "auto",
|
||||
"loss_scale": 0,
|
||||
"loss_scale_window": 1000,
|
||||
"initial_scale_power": 16,
|
||||
"hysteresis": 2,
|
||||
"min_loss_scale": 1
|
||||
},
|
||||
"bf16": {
|
||||
"enabled": "auto"
|
||||
},
|
||||
"zero_optimization": {
|
||||
"stage": 2,
|
||||
"offload_optimizer": {
|
||||
"device": "cpu",
|
||||
"pin_memory": true
|
||||
},
|
||||
"allgather_partitions": true,
|
||||
"allgather_bucket_size": 5e8,
|
||||
"overlap_comm": false,
|
||||
"reduce_scatter": true,
|
||||
"reduce_bucket_size": 5e8,
|
||||
"contiguous_gradients": true,
|
||||
"round_robin_gradients": true
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,30 @@
|
||||
{
|
||||
"train_batch_size": "auto",
|
||||
"train_micro_batch_size_per_gpu": "auto",
|
||||
"gradient_accumulation_steps": "auto",
|
||||
"gradient_clipping": "auto",
|
||||
"zero_allow_untested_optimizer": true,
|
||||
"fp16": {
|
||||
"enabled": "auto",
|
||||
"loss_scale": 0,
|
||||
"loss_scale_window": 1000,
|
||||
"initial_scale_power": 16,
|
||||
"hysteresis": 2,
|
||||
"min_loss_scale": 1
|
||||
},
|
||||
"bf16": {
|
||||
"enabled": "auto"
|
||||
},
|
||||
"zero_optimization": {
|
||||
"stage": 3,
|
||||
"overlap_comm": false,
|
||||
"contiguous_gradients": true,
|
||||
"sub_group_size": 1e9,
|
||||
"reduce_bucket_size": "auto",
|
||||
"stage3_prefetch_bucket_size": "auto",
|
||||
"stage3_param_persistence_threshold": "auto",
|
||||
"stage3_max_live_parameters": 1e9,
|
||||
"stage3_max_reuse_distance": 1e9,
|
||||
"stage3_gather_16bit_weights_on_model_save": true
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,38 @@
|
||||
{
|
||||
"train_batch_size": "auto",
|
||||
"train_micro_batch_size_per_gpu": "auto",
|
||||
"gradient_accumulation_steps": "auto",
|
||||
"gradient_clipping": "auto",
|
||||
"zero_allow_untested_optimizer": true,
|
||||
"fp16": {
|
||||
"enabled": "auto",
|
||||
"loss_scale": 0,
|
||||
"loss_scale_window": 1000,
|
||||
"initial_scale_power": 16,
|
||||
"hysteresis": 2,
|
||||
"min_loss_scale": 1
|
||||
},
|
||||
"bf16": {
|
||||
"enabled": "auto"
|
||||
},
|
||||
"zero_optimization": {
|
||||
"stage": 3,
|
||||
"offload_optimizer": {
|
||||
"device": "cpu",
|
||||
"pin_memory": true
|
||||
},
|
||||
"offload_param": {
|
||||
"device": "cpu",
|
||||
"pin_memory": true
|
||||
},
|
||||
"overlap_comm": false,
|
||||
"contiguous_gradients": true,
|
||||
"sub_group_size": 1e9,
|
||||
"reduce_bucket_size": "auto",
|
||||
"stage3_prefetch_bucket_size": "auto",
|
||||
"stage3_param_persistence_threshold": "auto",
|
||||
"stage3_max_live_parameters": 1e9,
|
||||
"stage3_max_reuse_distance": 1e9,
|
||||
"stage3_gather_16bit_weights_on_model_save": true
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,43 @@
|
||||
### model
|
||||
model_name_or_path: Qwen/Qwen2-1.5B-Instruct
|
||||
trust_remote_code: true
|
||||
|
||||
### method
|
||||
stage: sft
|
||||
do_train: true
|
||||
finetuning_type: full
|
||||
use_adam_mini: true
|
||||
|
||||
### dataset
|
||||
dataset: identity,alpaca_en_demo
|
||||
template: qwen
|
||||
cutoff_len: 2048
|
||||
max_samples: 1000
|
||||
overwrite_cache: true
|
||||
preprocessing_num_workers: 16
|
||||
dataloader_num_workers: 4
|
||||
|
||||
### output
|
||||
output_dir: saves/qwen2-1_5b/full/sft
|
||||
logging_steps: 10
|
||||
save_steps: 500
|
||||
plot_loss: true
|
||||
overwrite_output_dir: true
|
||||
save_only_model: false
|
||||
report_to: none # choices: [none, wandb, tensorboard, swanlab, mlflow]
|
||||
|
||||
### train
|
||||
per_device_train_batch_size: 1
|
||||
gradient_accumulation_steps: 8
|
||||
learning_rate: 1.0e-5
|
||||
num_train_epochs: 3.0
|
||||
lr_scheduler_type: cosine
|
||||
warmup_ratio: 0.1
|
||||
bf16: true
|
||||
ddp_timeout: 180000000
|
||||
|
||||
### eval
|
||||
# val_size: 0.1
|
||||
# per_device_eval_batch_size: 1
|
||||
# eval_strategy: steps
|
||||
# eval_steps: 500
|
||||
@@ -0,0 +1,48 @@
|
||||
### model
|
||||
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
|
||||
trust_remote_code: true
|
||||
|
||||
### method
|
||||
stage: sft
|
||||
do_train: true
|
||||
finetuning_type: full
|
||||
use_apollo: true
|
||||
apollo_layerwise: true # choices: [true, false], use false for DDP training
|
||||
apollo_target: all
|
||||
apollo_rank: 128
|
||||
apollo_scale: 32.0
|
||||
apollo_scale_type: channel
|
||||
|
||||
### dataset
|
||||
dataset: identity,alpaca_en_demo
|
||||
template: llama3
|
||||
cutoff_len: 2048
|
||||
max_samples: 1000
|
||||
overwrite_cache: true
|
||||
preprocessing_num_workers: 16
|
||||
dataloader_num_workers: 4
|
||||
|
||||
### output
|
||||
output_dir: saves/llama3-8b/full/sft
|
||||
logging_steps: 10
|
||||
save_steps: 500
|
||||
plot_loss: true
|
||||
overwrite_output_dir: true
|
||||
save_only_model: false
|
||||
report_to: none # choices: [none, wandb, tensorboard, swanlab, mlflow]
|
||||
|
||||
### train
|
||||
per_device_train_batch_size: 1
|
||||
gradient_accumulation_steps: 1 # use 1 for layerwise apollo
|
||||
learning_rate: 1.0e-5
|
||||
num_train_epochs: 3.0
|
||||
lr_scheduler_type: cosine
|
||||
warmup_ratio: 0.1
|
||||
pure_bf16: true
|
||||
ddp_timeout: 180000000
|
||||
|
||||
### eval
|
||||
# val_size: 0.1
|
||||
# per_device_eval_batch_size: 1
|
||||
# eval_strategy: steps
|
||||
# eval_steps: 500
|
||||
@@ -0,0 +1,46 @@
|
||||
### model
|
||||
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
|
||||
trust_remote_code: true
|
||||
|
||||
### method
|
||||
stage: sft
|
||||
do_train: true
|
||||
finetuning_type: full
|
||||
use_badam: true
|
||||
badam_mode: layer
|
||||
badam_switch_mode: ascending
|
||||
badam_switch_interval: 50
|
||||
badam_verbose: 2
|
||||
# deepspeed: examples/deepspeed/ds_z3_config.json
|
||||
|
||||
### dataset
|
||||
dataset: identity,alpaca_en_demo
|
||||
template: llama3
|
||||
cutoff_len: 2048
|
||||
max_samples: 1000
|
||||
overwrite_cache: true
|
||||
preprocessing_num_workers: 16
|
||||
dataloader_num_workers: 4
|
||||
|
||||
### output
|
||||
output_dir: saves/llama3-8b/full/sft
|
||||
logging_steps: 10
|
||||
save_steps: 500
|
||||
plot_loss: true
|
||||
overwrite_output_dir: true
|
||||
save_only_model: false
|
||||
report_to: none # choices: [none, wandb, tensorboard, swanlab, mlflow]
|
||||
|
||||
### train
|
||||
per_device_train_batch_size: 1
|
||||
gradient_accumulation_steps: 8
|
||||
learning_rate: 1.0e-5
|
||||
num_train_epochs: 3.0
|
||||
lr_scheduler_type: cosine
|
||||
warmup_ratio: 0.1
|
||||
|
||||
### eval
|
||||
# val_size: 0.1
|
||||
# per_device_eval_batch_size: 1
|
||||
# eval_strategy: steps
|
||||
# eval_steps: 500
|
||||
@@ -0,0 +1,45 @@
|
||||
### model
|
||||
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
|
||||
quantization_bit: 4
|
||||
trust_remote_code: true
|
||||
|
||||
### method
|
||||
stage: sft
|
||||
do_train: true
|
||||
finetuning_type: lora
|
||||
lora_rank: 8
|
||||
lora_target: all
|
||||
|
||||
### dataset
|
||||
dataset: identity,alpaca_en_demo
|
||||
template: llama3
|
||||
cutoff_len: 2048
|
||||
max_samples: 1000
|
||||
overwrite_cache: true
|
||||
preprocessing_num_workers: 16
|
||||
dataloader_num_workers: 4
|
||||
|
||||
### output
|
||||
output_dir: saves/llama3-8b/lora/sft
|
||||
logging_steps: 10
|
||||
save_steps: 500
|
||||
plot_loss: true
|
||||
overwrite_output_dir: true
|
||||
save_only_model: false
|
||||
report_to: none # choices: [none, wandb, tensorboard, swanlab, mlflow]
|
||||
|
||||
### train
|
||||
per_device_train_batch_size: 1
|
||||
gradient_accumulation_steps: 8
|
||||
learning_rate: 1.0e-4
|
||||
num_train_epochs: 3.0
|
||||
lr_scheduler_type: cosine
|
||||
warmup_ratio: 0.1
|
||||
bf16: true
|
||||
ddp_timeout: 180000000
|
||||
|
||||
### eval
|
||||
# val_size: 0.1
|
||||
# per_device_eval_batch_size: 1
|
||||
# eval_strategy: steps
|
||||
# eval_steps: 500
|
||||
@@ -0,0 +1,6 @@
|
||||
#!/bin/bash
|
||||
# DO NOT use GPTQ/AWQ model in FSDP+QLoRA
|
||||
|
||||
CUDA_VISIBLE_DEVICES=0,1 accelerate launch \
|
||||
--config_file examples/accelerate/fsdp_config.yaml \
|
||||
src/train.py examples/extras/fsdp_qlora/llama3_lora_sft.yaml
|
||||
@@ -0,0 +1,47 @@
|
||||
### model
|
||||
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
|
||||
trust_remote_code: true
|
||||
|
||||
### method
|
||||
stage: sft
|
||||
do_train: true
|
||||
finetuning_type: full
|
||||
use_galore: true
|
||||
galore_layerwise: true # choices: [true, false], use false for DDP training
|
||||
galore_target: all
|
||||
galore_rank: 128
|
||||
galore_scale: 2.0
|
||||
|
||||
### dataset
|
||||
dataset: identity,alpaca_en_demo
|
||||
template: llama3
|
||||
cutoff_len: 2048
|
||||
max_samples: 1000
|
||||
overwrite_cache: true
|
||||
preprocessing_num_workers: 16
|
||||
dataloader_num_workers: 4
|
||||
|
||||
### output
|
||||
output_dir: saves/llama3-8b/full/sft
|
||||
logging_steps: 10
|
||||
save_steps: 500
|
||||
plot_loss: true
|
||||
overwrite_output_dir: true
|
||||
save_only_model: false
|
||||
report_to: none # choices: [none, wandb, tensorboard, swanlab, mlflow]
|
||||
|
||||
### train
|
||||
per_device_train_batch_size: 1
|
||||
gradient_accumulation_steps: 1 # use 1 for layerwise galore
|
||||
learning_rate: 1.0e-5
|
||||
num_train_epochs: 3.0
|
||||
lr_scheduler_type: cosine
|
||||
warmup_ratio: 0.1
|
||||
pure_bf16: true
|
||||
ddp_timeout: 180000000
|
||||
|
||||
### eval
|
||||
# val_size: 0.1
|
||||
# per_device_eval_batch_size: 1
|
||||
# eval_strategy: steps
|
||||
# eval_steps: 500
|
||||
@@ -0,0 +1,6 @@
|
||||
#!/bin/bash
|
||||
|
||||
python scripts/llama_pro.py \
|
||||
--model_name_or_path meta-llama/Meta-Llama-3-8B-Instruct \
|
||||
--output_dir models/llama3-8b-pro \
|
||||
--num_expand 8
|
||||
@@ -0,0 +1,45 @@
|
||||
### model
|
||||
model_name_or_path: models/llama3-8b-pro
|
||||
trust_remote_code: true
|
||||
|
||||
### method
|
||||
stage: sft
|
||||
do_train: true
|
||||
finetuning_type: freeze
|
||||
freeze_trainable_layers: 8
|
||||
freeze_trainable_modules: all
|
||||
use_llama_pro: true
|
||||
|
||||
### dataset
|
||||
dataset: identity,alpaca_en_demo
|
||||
template: llama3
|
||||
cutoff_len: 2048
|
||||
max_samples: 1000
|
||||
overwrite_cache: true
|
||||
preprocessing_num_workers: 16
|
||||
dataloader_num_workers: 4
|
||||
|
||||
### output
|
||||
output_dir: saves/llama3-8b-pro/freeze/sft
|
||||
logging_steps: 10
|
||||
save_steps: 500
|
||||
plot_loss: true
|
||||
overwrite_output_dir: true
|
||||
save_only_model: false
|
||||
report_to: none # choices: [none, wandb, tensorboard, swanlab, mlflow]
|
||||
|
||||
### train
|
||||
per_device_train_batch_size: 1
|
||||
gradient_accumulation_steps: 8
|
||||
learning_rate: 1.0e-4
|
||||
num_train_epochs: 3.0
|
||||
lr_scheduler_type: cosine
|
||||
warmup_ratio: 0.1
|
||||
bf16: true
|
||||
ddp_timeout: 180000000
|
||||
|
||||
### eval
|
||||
# val_size: 0.1
|
||||
# per_device_eval_batch_size: 1
|
||||
# eval_strategy: steps
|
||||
# eval_steps: 500
|
||||
@@ -0,0 +1,45 @@
|
||||
### model
|
||||
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
|
||||
trust_remote_code: true
|
||||
|
||||
### method
|
||||
stage: sft
|
||||
do_train: true
|
||||
finetuning_type: lora
|
||||
lora_rank: 8
|
||||
lora_target: all
|
||||
loraplus_lr_ratio: 16.0
|
||||
|
||||
### dataset
|
||||
dataset: identity,alpaca_en_demo
|
||||
template: llama3
|
||||
cutoff_len: 2048
|
||||
max_samples: 1000
|
||||
overwrite_cache: true
|
||||
preprocessing_num_workers: 16
|
||||
dataloader_num_workers: 4
|
||||
|
||||
### output
|
||||
output_dir: saves/llama3-8b/lora/sft
|
||||
logging_steps: 10
|
||||
save_steps: 500
|
||||
plot_loss: true
|
||||
overwrite_output_dir: true
|
||||
save_only_model: false
|
||||
report_to: none # choices: [none, wandb, tensorboard, swanlab, mlflow]
|
||||
|
||||
### train
|
||||
per_device_train_batch_size: 1
|
||||
gradient_accumulation_steps: 8
|
||||
learning_rate: 1.0e-4
|
||||
num_train_epochs: 3.0
|
||||
lr_scheduler_type: cosine
|
||||
warmup_ratio: 0.1
|
||||
bf16: true
|
||||
ddp_timeout: 180000000
|
||||
|
||||
### eval
|
||||
# val_size: 0.1
|
||||
# per_device_eval_batch_size: 1
|
||||
# eval_strategy: steps
|
||||
# eval_steps: 500
|
||||
@@ -0,0 +1,44 @@
|
||||
### model
|
||||
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
|
||||
trust_remote_code: true
|
||||
|
||||
### method
|
||||
stage: sft
|
||||
do_train: true
|
||||
finetuning_type: full
|
||||
mixture_of_depths: convert
|
||||
|
||||
### dataset
|
||||
dataset: identity,alpaca_en_demo
|
||||
template: llama3
|
||||
cutoff_len: 2048
|
||||
max_samples: 1000
|
||||
overwrite_cache: true
|
||||
preprocessing_num_workers: 16
|
||||
dataloader_num_workers: 4
|
||||
|
||||
### output
|
||||
output_dir: saves/llama3-8b-mod/full/sft
|
||||
logging_steps: 10
|
||||
save_steps: 500
|
||||
plot_loss: true
|
||||
overwrite_output_dir: true
|
||||
save_only_model: false
|
||||
report_to: none # choices: [none, wandb, tensorboard, swanlab, mlflow]
|
||||
|
||||
### train
|
||||
per_device_train_batch_size: 1
|
||||
gradient_accumulation_steps: 8
|
||||
optim: paged_adamw_8bit
|
||||
learning_rate: 1.0e-5
|
||||
num_train_epochs: 3.0
|
||||
lr_scheduler_type: cosine
|
||||
warmup_ratio: 0.1
|
||||
pure_bf16: true
|
||||
ddp_timeout: 180000000
|
||||
|
||||
### eval
|
||||
# val_size: 0.1
|
||||
# per_device_eval_batch_size: 1
|
||||
# eval_strategy: steps
|
||||
# eval_steps: 500
|
||||
@@ -0,0 +1,31 @@
|
||||
# The batch generation can be SLOW using this config.
|
||||
# For faster inference, we recommend to use `scripts/vllm_infer.py`.
|
||||
|
||||
### model
|
||||
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
|
||||
adapter_name_or_path: saves/llama3-8b/lora/sft
|
||||
trust_remote_code: true
|
||||
|
||||
### method
|
||||
stage: sft
|
||||
do_predict: true
|
||||
finetuning_type: lora
|
||||
|
||||
### dataset
|
||||
eval_dataset: identity,alpaca_en_demo
|
||||
template: llama3
|
||||
cutoff_len: 2048
|
||||
max_samples: 50
|
||||
overwrite_cache: true
|
||||
preprocessing_num_workers: 16
|
||||
dataloader_num_workers: 4
|
||||
|
||||
### output
|
||||
output_dir: saves/llama3-8b/lora/predict
|
||||
overwrite_output_dir: true
|
||||
report_to: none # choices: [none, wandb, tensorboard, swanlab, mlflow]
|
||||
|
||||
### eval
|
||||
per_device_eval_batch_size: 1
|
||||
predict_with_generate: true
|
||||
ddp_timeout: 180000000
|
||||
@@ -0,0 +1,5 @@
|
||||
#!/bin/bash
|
||||
|
||||
python scripts/pissa_init.py \
|
||||
--model_name_or_path meta-llama/Meta-Llama-3-8B-Instruct \
|
||||
--output_dir models/llama3-8b-pissa
|
||||
@@ -0,0 +1,47 @@
|
||||
### model
|
||||
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
|
||||
trust_remote_code: true
|
||||
|
||||
### method
|
||||
stage: sft
|
||||
do_train: true
|
||||
finetuning_type: lora
|
||||
lora_rank: 8
|
||||
lora_target: all
|
||||
pissa_init: true
|
||||
pissa_iter: 16
|
||||
pissa_convert: true
|
||||
|
||||
### dataset
|
||||
dataset: identity,alpaca_en_demo
|
||||
template: llama3
|
||||
cutoff_len: 2048
|
||||
max_samples: 1000
|
||||
overwrite_cache: true
|
||||
preprocessing_num_workers: 16
|
||||
dataloader_num_workers: 4
|
||||
|
||||
### output
|
||||
output_dir: saves/llama3-8b/lora/sft
|
||||
logging_steps: 10
|
||||
save_steps: 500
|
||||
plot_loss: true
|
||||
overwrite_output_dir: true
|
||||
save_only_model: false
|
||||
report_to: none # choices: [none, wandb, tensorboard, swanlab, mlflow]
|
||||
|
||||
### train
|
||||
per_device_train_batch_size: 1
|
||||
gradient_accumulation_steps: 8
|
||||
learning_rate: 1.0e-4
|
||||
num_train_epochs: 3.0
|
||||
lr_scheduler_type: cosine
|
||||
warmup_ratio: 0.1
|
||||
bf16: true
|
||||
ddp_timeout: 180000000
|
||||
|
||||
### eval
|
||||
# val_size: 0.1
|
||||
# per_device_eval_batch_size: 1
|
||||
# eval_strategy: steps
|
||||
# eval_steps: 500
|
||||
@@ -0,0 +1,4 @@
|
||||
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
|
||||
template: llama3
|
||||
infer_backend: huggingface # choices: [huggingface, vllm]
|
||||
trust_remote_code: true
|
||||
@@ -0,0 +1,4 @@
|
||||
model_name_or_path: saves/llama3-8b/full/sft
|
||||
template: llama3
|
||||
infer_backend: huggingface # choices: [huggingface, vllm]
|
||||
trust_remote_code: true
|
||||
@@ -0,0 +1,5 @@
|
||||
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
|
||||
adapter_name_or_path: saves/llama3-8b/lora/sft
|
||||
template: llama3
|
||||
infer_backend: huggingface # choices: [huggingface, vllm]
|
||||
trust_remote_code: true
|
||||
@@ -0,0 +1,4 @@
|
||||
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
|
||||
template: llama3
|
||||
infer_backend: sglang
|
||||
trust_remote_code: true
|
||||
@@ -0,0 +1,5 @@
|
||||
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
|
||||
template: llama3
|
||||
infer_backend: vllm
|
||||
vllm_enforce_eager: true
|
||||
trust_remote_code: true
|
||||
@@ -0,0 +1,4 @@
|
||||
model_name_or_path: llava-hf/llava-1.5-7b-hf
|
||||
template: llava
|
||||
infer_backend: huggingface # choices: [huggingface, vllm]
|
||||
trust_remote_code: true
|
||||
@@ -0,0 +1,4 @@
|
||||
model_name_or_path: Qwen/Qwen2-VL-7B-Instruct
|
||||
template: qwen2_vl
|
||||
infer_backend: huggingface # choices: [huggingface, vllm]
|
||||
trust_remote_code: true
|
||||
@@ -0,0 +1,10 @@
|
||||
### model
|
||||
model_name_or_path: saves/llama3-8b/full/sft
|
||||
template: llama3
|
||||
trust_remote_code: true
|
||||
|
||||
### export
|
||||
export_dir: output/llama3_full_sft
|
||||
export_size: 5
|
||||
export_device: cpu
|
||||
export_legacy_format: false
|
||||
@@ -0,0 +1,12 @@
|
||||
### model
|
||||
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
|
||||
template: llama3
|
||||
trust_remote_code: true
|
||||
|
||||
### export
|
||||
export_dir: output/llama3_gptq
|
||||
export_quantization_bit: 4
|
||||
export_quantization_dataset: data/c4_demo.json
|
||||
export_size: 5
|
||||
export_device: cpu
|
||||
export_legacy_format: false
|
||||
@@ -0,0 +1,13 @@
|
||||
### Note: DO NOT use quantized model or quantization_bit when merging lora adapters
|
||||
|
||||
### model
|
||||
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
|
||||
adapter_name_or_path: saves/llama3-8b/lora/sft
|
||||
template: llama3
|
||||
trust_remote_code: true
|
||||
|
||||
### export
|
||||
export_dir: output/llama3_lora_sft
|
||||
export_size: 5
|
||||
export_device: cpu
|
||||
export_legacy_format: false
|
||||
@@ -0,0 +1,13 @@
|
||||
### Note: DO NOT use quantized model or quantization_bit when merging lora adapters
|
||||
|
||||
### model
|
||||
model_name_or_path: Qwen/Qwen2-VL-7B-Instruct
|
||||
adapter_name_or_path: saves/qwen2_vl-7b/lora/sft
|
||||
template: qwen2_vl
|
||||
trust_remote_code: true
|
||||
|
||||
### export
|
||||
export_dir: output/qwen2_vl_lora_sft
|
||||
export_size: 5
|
||||
export_device: cpu
|
||||
export_legacy_format: false
|
||||
@@ -0,0 +1,45 @@
|
||||
### model
|
||||
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
|
||||
trust_remote_code: true
|
||||
|
||||
### method
|
||||
stage: sft
|
||||
do_train: true
|
||||
finetuning_type: full
|
||||
deepspeed: examples/deepspeed/ds_z3_config.json # choices: [ds_z0_config.json, ds_z2_config.json, ds_z3_config.json]
|
||||
|
||||
### dataset
|
||||
dataset: identity,alpaca_en_demo
|
||||
template: llama3
|
||||
cutoff_len: 2048
|
||||
max_samples: 1000
|
||||
overwrite_cache: true
|
||||
preprocessing_num_workers: 16
|
||||
dataloader_num_workers: 4
|
||||
|
||||
### output
|
||||
output_dir: saves/llama3-8b/full/sft
|
||||
logging_steps: 10
|
||||
save_steps: 500
|
||||
plot_loss: true
|
||||
overwrite_output_dir: true
|
||||
save_only_model: false
|
||||
report_to: none # choices: [none, wandb, tensorboard, swanlab, mlflow]
|
||||
|
||||
### train
|
||||
per_device_train_batch_size: 1
|
||||
gradient_accumulation_steps: 2
|
||||
learning_rate: 1.0e-5
|
||||
num_train_epochs: 3.0
|
||||
lr_scheduler_type: cosine
|
||||
warmup_ratio: 0.1
|
||||
bf16: true
|
||||
ddp_timeout: 180000000
|
||||
resume_from_checkpoint: null
|
||||
|
||||
### eval
|
||||
# eval_dataset: alpaca_en_demo
|
||||
# val_size: 0.1
|
||||
# per_device_eval_batch_size: 1
|
||||
# eval_strategy: steps
|
||||
# eval_steps: 500
|
||||
@@ -0,0 +1,49 @@
|
||||
### model
|
||||
model_name_or_path: Qwen/Qwen2.5-VL-3B-Instruct
|
||||
image_max_pixels: 1843200 # 1280*720*2
|
||||
video_max_pixels: 16384
|
||||
trust_remote_code: true
|
||||
|
||||
### method
|
||||
stage: sft
|
||||
do_train: true
|
||||
finetuning_type: full
|
||||
# freeze_trainable_layers: 1
|
||||
# freeze_trainable_modules: all
|
||||
freeze_vision_tower: true # choices: [true, false]
|
||||
freeze_multi_modal_projector: false # choices: [true, false]
|
||||
freeze_language_model: false
|
||||
deepspeed: examples/deepspeed/ds_z3_config.json # choices: [ds_z0_config.json, ds_z2_config.json, ds_z3_config.json]
|
||||
# deepspeed: examples/deepspeed/ds_z3_offload_config.json
|
||||
|
||||
### dataset
|
||||
dataset: websight_toy
|
||||
template: qwen2_vl
|
||||
cutoff_len: 8192
|
||||
max_samples: 9000
|
||||
overwrite_cache: true
|
||||
preprocessing_num_workers: 16
|
||||
|
||||
### output
|
||||
output_dir: saves/websight/toy/3b
|
||||
# output_dir: saves/mrweb/fulll
|
||||
logging_steps: 10
|
||||
save_steps: 500
|
||||
plot_loss: true
|
||||
overwrite_output_dir: true
|
||||
|
||||
### train
|
||||
per_device_train_batch_size: 1
|
||||
gradient_accumulation_steps: 2
|
||||
learning_rate: 1.0e-5
|
||||
num_train_epochs: 3.0
|
||||
lr_scheduler_type: cosine
|
||||
warmup_ratio: 0.1
|
||||
bf16: true
|
||||
ddp_timeout: 180000000
|
||||
|
||||
### eval
|
||||
# val_size: 0.1
|
||||
# per_device_eval_batch_size: 1
|
||||
# eval_strategy: steps
|
||||
# eval_steps: 500
|
||||
@@ -0,0 +1,49 @@
|
||||
### model
|
||||
model_name_or_path: Qwen/Qwen2.5-VL-7B-Instruct
|
||||
image_max_pixels: 1843200 # 1280*720*2
|
||||
video_max_pixels: 16384
|
||||
trust_remote_code: true
|
||||
|
||||
### method
|
||||
stage: sft
|
||||
do_train: true
|
||||
finetuning_type: full
|
||||
# freeze_trainable_layers: 1
|
||||
# freeze_trainable_modules: all
|
||||
freeze_vision_tower: true # choices: [true, false]
|
||||
freeze_multi_modal_projector: false # choices: [true, false]
|
||||
train_mm_proj_only: false # choices: [true, false]
|
||||
deepspeed: examples/deepspeed/ds_z3_config.json # choices: [ds_z0_config.json, ds_z2_config.json, ds_z3_config.json]
|
||||
# deepspeed: examples/deepspeed/ds_z3_offload_config.json
|
||||
|
||||
### dataset
|
||||
dataset: websight
|
||||
template: qwen2_vl
|
||||
cutoff_len: 8192
|
||||
max_samples: 9000
|
||||
overwrite_cache: true
|
||||
preprocessing_num_workers: 16
|
||||
|
||||
### output
|
||||
output_dir: saves/websight/full/7b
|
||||
# output_dir: saves/mrweb/fulll
|
||||
logging_steps: 10
|
||||
save_steps: 500
|
||||
plot_loss: true
|
||||
overwrite_output_dir: true
|
||||
|
||||
### train
|
||||
per_device_train_batch_size: 1
|
||||
gradient_accumulation_steps: 2
|
||||
learning_rate: 1.0e-5
|
||||
num_train_epochs: 3.0
|
||||
lr_scheduler_type: cosine
|
||||
warmup_ratio: 0.1
|
||||
bf16: true
|
||||
ddp_timeout: 180000000
|
||||
|
||||
### eval
|
||||
# val_size: 0.1
|
||||
# per_device_eval_batch_size: 1
|
||||
# eval_strategy: steps
|
||||
# eval_steps: 500
|
||||
@@ -0,0 +1,48 @@
|
||||
### model
|
||||
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
|
||||
trust_remote_code: true
|
||||
|
||||
### method
|
||||
stage: dpo
|
||||
do_train: true
|
||||
finetuning_type: lora
|
||||
lora_rank: 8
|
||||
lora_target: all
|
||||
pref_beta: 0.1
|
||||
pref_loss: sigmoid # choices: [sigmoid (dpo), orpo, simpo]
|
||||
|
||||
### dataset
|
||||
dataset: dpo_en_demo
|
||||
template: llama3
|
||||
cutoff_len: 2048
|
||||
max_samples: 1000
|
||||
overwrite_cache: true
|
||||
preprocessing_num_workers: 16
|
||||
dataloader_num_workers: 4
|
||||
|
||||
### output
|
||||
output_dir: saves/llama3-8b/lora/dpo
|
||||
logging_steps: 10
|
||||
save_steps: 500
|
||||
plot_loss: true
|
||||
overwrite_output_dir: true
|
||||
save_only_model: false
|
||||
report_to: none # choices: [none, wandb, tensorboard, swanlab, mlflow]
|
||||
|
||||
### train
|
||||
per_device_train_batch_size: 1
|
||||
gradient_accumulation_steps: 8
|
||||
learning_rate: 5.0e-6
|
||||
num_train_epochs: 3.0
|
||||
lr_scheduler_type: cosine
|
||||
warmup_ratio: 0.1
|
||||
bf16: true
|
||||
ddp_timeout: 180000000
|
||||
resume_from_checkpoint: null
|
||||
|
||||
### eval
|
||||
# eval_dataset: dpo_en_demo
|
||||
# val_size: 0.1
|
||||
# per_device_eval_batch_size: 1
|
||||
# eval_strategy: steps
|
||||
# eval_steps: 500
|
||||
@@ -0,0 +1,19 @@
|
||||
### model
|
||||
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
|
||||
adapter_name_or_path: saves/llama3-8b/lora/sft
|
||||
trust_remote_code: true
|
||||
|
||||
### method
|
||||
finetuning_type: lora
|
||||
|
||||
### dataset
|
||||
task: mmlu_test # choices: [mmlu_test, ceval_validation, cmmlu_test]
|
||||
template: fewshot
|
||||
lang: en
|
||||
n_shot: 5
|
||||
|
||||
### output
|
||||
save_dir: saves/llama3-8b/lora/eval
|
||||
|
||||
### eval
|
||||
batch_size: 4
|
||||
@@ -0,0 +1,44 @@
|
||||
### model
|
||||
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
|
||||
trust_remote_code: true
|
||||
|
||||
### method
|
||||
stage: kto
|
||||
do_train: true
|
||||
finetuning_type: lora
|
||||
lora_rank: 8
|
||||
lora_target: all
|
||||
pref_beta: 0.1
|
||||
|
||||
### dataset
|
||||
dataset: kto_en_demo
|
||||
template: llama3
|
||||
cutoff_len: 2048
|
||||
max_samples: 1000
|
||||
overwrite_cache: true
|
||||
preprocessing_num_workers: 16
|
||||
dataloader_num_workers: 4
|
||||
|
||||
### output
|
||||
output_dir: saves/llama3-8b/lora/kto
|
||||
logging_steps: 10
|
||||
save_steps: 500
|
||||
plot_loss: true
|
||||
overwrite_output_dir: true
|
||||
report_to: none # choices: [none, wandb, tensorboard, swanlab, mlflow]
|
||||
|
||||
### train
|
||||
per_device_train_batch_size: 1
|
||||
gradient_accumulation_steps: 8
|
||||
learning_rate: 5.0e-6
|
||||
num_train_epochs: 3.0
|
||||
lr_scheduler_type: cosine
|
||||
warmup_ratio: 0.1
|
||||
bf16: true
|
||||
ddp_timeout: 180000000
|
||||
|
||||
### eval
|
||||
# val_size: 0.1
|
||||
# per_device_eval_batch_size: 1
|
||||
# eval_strategy: steps
|
||||
# eval_steps: 500
|
||||
@@ -0,0 +1,43 @@
|
||||
### model
|
||||
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
|
||||
reward_model: saves/llama3-8b/lora/reward
|
||||
trust_remote_code: true
|
||||
|
||||
### method
|
||||
stage: ppo
|
||||
do_train: true
|
||||
finetuning_type: lora
|
||||
lora_rank: 8
|
||||
lora_target: all
|
||||
|
||||
### dataset
|
||||
dataset: identity,alpaca_en_demo
|
||||
template: llama3
|
||||
cutoff_len: 2048
|
||||
max_samples: 1000
|
||||
overwrite_cache: true
|
||||
preprocessing_num_workers: 16
|
||||
dataloader_num_workers: 4
|
||||
|
||||
### output
|
||||
output_dir: saves/llama3-8b/lora/ppo
|
||||
logging_steps: 10
|
||||
save_steps: 500
|
||||
plot_loss: true
|
||||
overwrite_output_dir: true
|
||||
report_to: none # choices: [none, wandb, tensorboard, swanlab, mlflow]
|
||||
|
||||
### train
|
||||
per_device_train_batch_size: 1
|
||||
gradient_accumulation_steps: 8
|
||||
learning_rate: 1.0e-5
|
||||
num_train_epochs: 3.0
|
||||
lr_scheduler_type: cosine
|
||||
warmup_ratio: 0.1
|
||||
bf16: true
|
||||
ddp_timeout: 180000000
|
||||
|
||||
### generate
|
||||
max_new_tokens: 512
|
||||
top_k: 0
|
||||
top_p: 0.9
|
||||
@@ -0,0 +1,45 @@
|
||||
### model
|
||||
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
|
||||
trust_remote_code: true
|
||||
|
||||
### method
|
||||
stage: pt
|
||||
do_train: true
|
||||
finetuning_type: lora
|
||||
lora_rank: 8
|
||||
lora_target: all
|
||||
|
||||
### dataset
|
||||
dataset: c4_demo
|
||||
cutoff_len: 2048
|
||||
max_samples: 1000
|
||||
overwrite_cache: true
|
||||
preprocessing_num_workers: 16
|
||||
dataloader_num_workers: 4
|
||||
|
||||
### output
|
||||
output_dir: saves/llama3-8b/lora/pretrain
|
||||
logging_steps: 10
|
||||
save_steps: 500
|
||||
plot_loss: true
|
||||
overwrite_output_dir: true
|
||||
save_only_model: false
|
||||
report_to: none # choices: [none, wandb, tensorboard, swanlab, mlflow]
|
||||
|
||||
### train
|
||||
per_device_train_batch_size: 1
|
||||
gradient_accumulation_steps: 8
|
||||
learning_rate: 1.0e-4
|
||||
num_train_epochs: 3.0
|
||||
lr_scheduler_type: cosine
|
||||
warmup_ratio: 0.1
|
||||
bf16: true
|
||||
ddp_timeout: 180000000
|
||||
resume_from_checkpoint: null
|
||||
|
||||
### eval
|
||||
# eval_dataset: c4_demo
|
||||
# val_size: 0.1
|
||||
# per_device_eval_batch_size: 1
|
||||
# eval_strategy: steps
|
||||
# eval_steps: 500
|
||||
@@ -0,0 +1,46 @@
|
||||
### model
|
||||
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
|
||||
trust_remote_code: true
|
||||
|
||||
### method
|
||||
stage: rm
|
||||
do_train: true
|
||||
finetuning_type: lora
|
||||
lora_rank: 8
|
||||
lora_target: all
|
||||
|
||||
### dataset
|
||||
dataset: dpo_en_demo
|
||||
template: llama3
|
||||
cutoff_len: 2048
|
||||
max_samples: 1000
|
||||
overwrite_cache: true
|
||||
preprocessing_num_workers: 16
|
||||
dataloader_num_workers: 4
|
||||
|
||||
### output
|
||||
output_dir: saves/llama3-8b/lora/reward
|
||||
logging_steps: 10
|
||||
save_steps: 500
|
||||
plot_loss: true
|
||||
overwrite_output_dir: true
|
||||
save_only_model: false
|
||||
report_to: none # choices: [none, wandb, tensorboard, swanlab, mlflow]
|
||||
|
||||
### train
|
||||
per_device_train_batch_size: 1
|
||||
gradient_accumulation_steps: 8
|
||||
learning_rate: 1.0e-4
|
||||
num_train_epochs: 3.0
|
||||
lr_scheduler_type: cosine
|
||||
warmup_ratio: 0.1
|
||||
bf16: true
|
||||
ddp_timeout: 180000000
|
||||
resume_from_checkpoint: null
|
||||
|
||||
### eval
|
||||
# eval_dataset: dpo_en_demo
|
||||
# val_size: 0.1
|
||||
# per_device_eval_batch_size: 1
|
||||
# eval_strategy: steps
|
||||
# eval_steps: 500
|
||||
@@ -0,0 +1,46 @@
|
||||
### model
|
||||
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
|
||||
trust_remote_code: true
|
||||
|
||||
### method
|
||||
stage: sft
|
||||
do_train: true
|
||||
finetuning_type: lora
|
||||
lora_rank: 8
|
||||
lora_target: all
|
||||
|
||||
### dataset
|
||||
dataset: identity,alpaca_en_demo
|
||||
template: llama3
|
||||
cutoff_len: 2048
|
||||
max_samples: 1000
|
||||
overwrite_cache: true
|
||||
preprocessing_num_workers: 16
|
||||
dataloader_num_workers: 4
|
||||
|
||||
### output
|
||||
output_dir: saves/llama3-8b/lora/sft
|
||||
logging_steps: 10
|
||||
save_steps: 500
|
||||
plot_loss: true
|
||||
overwrite_output_dir: true
|
||||
save_only_model: false
|
||||
report_to: none # choices: [none, wandb, tensorboard, swanlab, mlflow]
|
||||
|
||||
### train
|
||||
per_device_train_batch_size: 1
|
||||
gradient_accumulation_steps: 8
|
||||
learning_rate: 1.0e-4
|
||||
num_train_epochs: 3.0
|
||||
lr_scheduler_type: cosine
|
||||
warmup_ratio: 0.1
|
||||
bf16: true
|
||||
ddp_timeout: 180000000
|
||||
resume_from_checkpoint: null
|
||||
|
||||
### eval
|
||||
# eval_dataset: alpaca_en_demo
|
||||
# val_size: 0.1
|
||||
# per_device_eval_batch_size: 1
|
||||
# eval_strategy: steps
|
||||
# eval_steps: 500
|
||||
@@ -0,0 +1,47 @@
|
||||
### model
|
||||
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
|
||||
trust_remote_code: true
|
||||
|
||||
### method
|
||||
stage: sft
|
||||
do_train: true
|
||||
finetuning_type: lora
|
||||
lora_rank: 8
|
||||
lora_target: all
|
||||
deepspeed: examples/deepspeed/ds_z3_config.json # choices: [ds_z0_config.json, ds_z2_config.json, ds_z3_config.json]
|
||||
|
||||
### dataset
|
||||
dataset: identity,alpaca_en_demo
|
||||
template: llama3
|
||||
cutoff_len: 2048
|
||||
max_samples: 1000
|
||||
overwrite_cache: true
|
||||
preprocessing_num_workers: 16
|
||||
dataloader_num_workers: 4
|
||||
|
||||
### output
|
||||
output_dir: saves/llama3-8b/lora/sft
|
||||
logging_steps: 10
|
||||
save_steps: 500
|
||||
plot_loss: true
|
||||
overwrite_output_dir: true
|
||||
save_only_model: false
|
||||
report_to: none # choices: [none, wandb, tensorboard, swanlab, mlflow]
|
||||
|
||||
### train
|
||||
per_device_train_batch_size: 1
|
||||
gradient_accumulation_steps: 2
|
||||
learning_rate: 1.0e-4
|
||||
num_train_epochs: 3.0
|
||||
lr_scheduler_type: cosine
|
||||
warmup_ratio: 0.1
|
||||
bf16: true
|
||||
ddp_timeout: 180000000
|
||||
resume_from_checkpoint: null
|
||||
|
||||
### eval
|
||||
# eval_dataset: alpaca_en_demo
|
||||
# val_size: 0.1
|
||||
# per_device_eval_batch_size: 1
|
||||
# eval_strategy: steps
|
||||
# eval_steps: 500
|
||||
@@ -0,0 +1,61 @@
|
||||
### model
|
||||
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct # or use local absolute path
|
||||
trust_remote_code: true
|
||||
|
||||
### method
|
||||
stage: sft
|
||||
do_train: true
|
||||
finetuning_type: lora
|
||||
lora_rank: 8
|
||||
lora_target: all
|
||||
|
||||
### dataset
|
||||
dataset: identity,alpaca_en_demo
|
||||
dataset_dir: REMOTE:llamafactory/demo_data # or use local absolute path
|
||||
template: llama3
|
||||
cutoff_len: 2048
|
||||
max_samples: 1000
|
||||
overwrite_cache: true
|
||||
preprocessing_num_workers: 16
|
||||
dataloader_num_workers: 4
|
||||
|
||||
### output
|
||||
output_dir: tmp_dir
|
||||
logging_steps: 10
|
||||
save_steps: 500
|
||||
plot_loss: true
|
||||
overwrite_output_dir: true
|
||||
save_only_model: false
|
||||
report_to: none # choices: [none, wandb, tensorboard, swanlab, mlflow]
|
||||
|
||||
### ray
|
||||
ray_run_name: llama3_8b_sft_lora
|
||||
ray_storage_path: ./saves
|
||||
ray_num_workers: 4 # Number of GPUs to use.
|
||||
placement_strategy: PACK
|
||||
resources_per_worker:
|
||||
GPU: 1
|
||||
# ray_init_kwargs:
|
||||
# runtime_env:
|
||||
# env_vars:
|
||||
# <YOUR-ENV-VAR-HERE>: "<YOUR-ENV-VAR-HERE>"
|
||||
# pip:
|
||||
# - emoji
|
||||
|
||||
### train
|
||||
per_device_train_batch_size: 1
|
||||
gradient_accumulation_steps: 8
|
||||
learning_rate: 1.0e-4
|
||||
num_train_epochs: 3.0
|
||||
lr_scheduler_type: cosine
|
||||
warmup_ratio: 0.1
|
||||
bf16: true
|
||||
ddp_timeout: 180000000
|
||||
resume_from_checkpoint: null
|
||||
|
||||
### eval
|
||||
# eval_dataset: alpaca_en_demo
|
||||
# val_size: 0.1
|
||||
# per_device_eval_batch_size: 1
|
||||
# eval_strategy: steps
|
||||
# eval_steps: 500
|
||||
@@ -0,0 +1,23 @@
|
||||
### model
|
||||
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
|
||||
trust_remote_code: true
|
||||
|
||||
### method
|
||||
stage: sft
|
||||
do_train: true
|
||||
finetuning_type: lora
|
||||
lora_rank: 8
|
||||
lora_target: all
|
||||
|
||||
### dataset
|
||||
dataset: identity,alpaca_en_demo
|
||||
template: llama3
|
||||
cutoff_len: 2048
|
||||
max_samples: 1000
|
||||
overwrite_cache: true
|
||||
preprocessing_num_workers: 16
|
||||
tokenized_path: saves/llama3-8b/dataset/sft
|
||||
|
||||
### output
|
||||
output_dir: saves/llama3-8b/lora/sft
|
||||
overwrite_output_dir: true
|
||||
@@ -0,0 +1,49 @@
|
||||
# pip install git+https://github.com/hiyouga/transformers.git@llama4_train
|
||||
|
||||
### model
|
||||
model_name_or_path: meta-llama/Llama-4-Scout-17B-16E-Instruct
|
||||
trust_remote_code: true
|
||||
|
||||
### method
|
||||
stage: sft
|
||||
do_train: true
|
||||
finetuning_type: lora
|
||||
lora_rank: 8
|
||||
lora_target: all
|
||||
deepspeed: examples/deepspeed/ds_z3_config.json # choices: [ds_z0_config.json, ds_z2_config.json, ds_z3_config.json]
|
||||
|
||||
### dataset
|
||||
dataset: mllm_demo,identity,alpaca_en_demo
|
||||
template: llama4
|
||||
cutoff_len: 2048
|
||||
max_samples: 1000
|
||||
overwrite_cache: true
|
||||
preprocessing_num_workers: 16
|
||||
dataloader_num_workers: 4
|
||||
|
||||
### output
|
||||
output_dir: saves/llama4-8b/lora/sft
|
||||
logging_steps: 10
|
||||
save_steps: 500
|
||||
plot_loss: true
|
||||
overwrite_output_dir: true
|
||||
save_only_model: false
|
||||
report_to: none # choices: [none, wandb, tensorboard, swanlab, mlflow]
|
||||
|
||||
### train
|
||||
per_device_train_batch_size: 1
|
||||
gradient_accumulation_steps: 2
|
||||
learning_rate: 1.0e-4
|
||||
num_train_epochs: 3.0
|
||||
lr_scheduler_type: cosine
|
||||
warmup_ratio: 0.1
|
||||
bf16: true
|
||||
ddp_timeout: 180000000
|
||||
resume_from_checkpoint: null
|
||||
|
||||
### eval
|
||||
# eval_dataset: alpaca_en_demo
|
||||
# val_size: 0.1
|
||||
# per_device_eval_batch_size: 1
|
||||
# eval_strategy: steps
|
||||
# eval_steps: 500
|
||||
@@ -0,0 +1,45 @@
|
||||
### model
|
||||
model_name_or_path: llava-hf/llava-1.5-7b-hf
|
||||
trust_remote_code: true
|
||||
|
||||
### method
|
||||
stage: sft
|
||||
do_train: true
|
||||
finetuning_type: lora
|
||||
lora_rank: 8
|
||||
lora_target: all
|
||||
|
||||
### dataset
|
||||
dataset: mllm_demo
|
||||
template: llava
|
||||
cutoff_len: 2048
|
||||
max_samples: 1000
|
||||
overwrite_cache: true
|
||||
preprocessing_num_workers: 16
|
||||
dataloader_num_workers: 4
|
||||
|
||||
### output
|
||||
output_dir: saves/llava1_5-7b/lora/sft
|
||||
logging_steps: 10
|
||||
save_steps: 500
|
||||
plot_loss: true
|
||||
overwrite_output_dir: true
|
||||
save_only_model: false
|
||||
report_to: none # choices: [none, wandb, tensorboard, swanlab, mlflow]
|
||||
|
||||
### train
|
||||
per_device_train_batch_size: 1
|
||||
gradient_accumulation_steps: 8
|
||||
learning_rate: 1.0e-4
|
||||
num_train_epochs: 3.0
|
||||
lr_scheduler_type: cosine
|
||||
warmup_ratio: 0.1
|
||||
bf16: true
|
||||
ddp_timeout: 180000000
|
||||
resume_from_checkpoint: null
|
||||
|
||||
### eval
|
||||
# val_size: 0.1
|
||||
# per_device_eval_batch_size: 1
|
||||
# eval_strategy: steps
|
||||
# eval_steps: 500
|
||||
@@ -0,0 +1,49 @@
|
||||
### model
|
||||
model_name_or_path: Qwen/Qwen2-VL-7B-Instruct
|
||||
image_max_pixels: 262144
|
||||
video_max_pixels: 16384
|
||||
trust_remote_code: true
|
||||
|
||||
### method
|
||||
stage: dpo
|
||||
do_train: true
|
||||
finetuning_type: lora
|
||||
lora_rank: 8
|
||||
lora_target: all
|
||||
pref_beta: 0.1
|
||||
pref_loss: sigmoid # choices: [sigmoid (dpo), orpo, simpo]
|
||||
|
||||
### dataset
|
||||
dataset: rlhf_v
|
||||
template: qwen2_vl
|
||||
cutoff_len: 2048
|
||||
max_samples: 1000
|
||||
overwrite_cache: true
|
||||
preprocessing_num_workers: 16
|
||||
dataloader_num_workers: 4
|
||||
|
||||
### output
|
||||
output_dir: saves/qwen2_vl-7b/lora/dpo
|
||||
logging_steps: 10
|
||||
save_steps: 500
|
||||
plot_loss: true
|
||||
overwrite_output_dir: true
|
||||
save_only_model: false
|
||||
report_to: none # choices: [none, wandb, tensorboard, swanlab, mlflow]
|
||||
|
||||
### train
|
||||
per_device_train_batch_size: 1
|
||||
gradient_accumulation_steps: 8
|
||||
learning_rate: 5.0e-6
|
||||
num_train_epochs: 3.0
|
||||
lr_scheduler_type: cosine
|
||||
warmup_ratio: 0.1
|
||||
bf16: true
|
||||
ddp_timeout: 180000000
|
||||
resume_from_checkpoint: null
|
||||
|
||||
### eval
|
||||
# val_size: 0.1
|
||||
# per_device_eval_batch_size: 1
|
||||
# eval_strategy: steps
|
||||
# eval_steps: 500
|
||||
@@ -0,0 +1,47 @@
|
||||
### model
|
||||
model_name_or_path: Qwen/Qwen2-VL-7B-Instruct
|
||||
image_max_pixels: 262144
|
||||
video_max_pixels: 16384
|
||||
trust_remote_code: true
|
||||
|
||||
### method
|
||||
stage: sft
|
||||
do_train: true
|
||||
finetuning_type: lora
|
||||
lora_rank: 8
|
||||
lora_target: all
|
||||
|
||||
### dataset
|
||||
dataset: mllm_demo,identity,alpaca_en_demo # video: mllm_video_demo
|
||||
template: qwen2_vl
|
||||
cutoff_len: 2048
|
||||
max_samples: 1000
|
||||
overwrite_cache: true
|
||||
preprocessing_num_workers: 16
|
||||
dataloader_num_workers: 4
|
||||
|
||||
### output
|
||||
output_dir: saves/qwen2_vl-7b/lora/sft
|
||||
logging_steps: 10
|
||||
save_steps: 500
|
||||
plot_loss: true
|
||||
overwrite_output_dir: true
|
||||
save_only_model: false
|
||||
report_to: none # choices: [none, wandb, tensorboard, swanlab, mlflow]
|
||||
|
||||
### train
|
||||
per_device_train_batch_size: 1
|
||||
gradient_accumulation_steps: 8
|
||||
learning_rate: 1.0e-4
|
||||
num_train_epochs: 3.0
|
||||
lr_scheduler_type: cosine
|
||||
warmup_ratio: 0.1
|
||||
bf16: true
|
||||
ddp_timeout: 180000000
|
||||
resume_from_checkpoint: null
|
||||
|
||||
### eval
|
||||
# val_size: 0.1
|
||||
# per_device_eval_batch_size: 1
|
||||
# eval_strategy: steps
|
||||
# eval_steps: 500
|
||||
@@ -0,0 +1,44 @@
|
||||
### model
|
||||
model_name_or_path: ISTA-DASLab/Meta-Llama-3-8B-Instruct-AQLM-2Bit-1x16
|
||||
trust_remote_code: true
|
||||
|
||||
### method
|
||||
stage: sft
|
||||
do_train: true
|
||||
finetuning_type: lora
|
||||
lora_rank: 8
|
||||
lora_target: all
|
||||
|
||||
### dataset
|
||||
dataset: identity,alpaca_en_demo
|
||||
template: llama3
|
||||
cutoff_len: 2048
|
||||
max_samples: 1000
|
||||
overwrite_cache: true
|
||||
preprocessing_num_workers: 16
|
||||
dataloader_num_workers: 4
|
||||
|
||||
### output
|
||||
output_dir: saves/llama3-8b/lora/sft
|
||||
logging_steps: 10
|
||||
save_steps: 500
|
||||
plot_loss: true
|
||||
overwrite_output_dir: true
|
||||
save_only_model: false
|
||||
report_to: none # choices: [none, wandb, tensorboard, swanlab, mlflow]
|
||||
|
||||
### train
|
||||
per_device_train_batch_size: 1
|
||||
gradient_accumulation_steps: 8
|
||||
learning_rate: 1.0e-4
|
||||
num_train_epochs: 3.0
|
||||
lr_scheduler_type: cosine
|
||||
warmup_ratio: 0.1
|
||||
bf16: true
|
||||
ddp_timeout: 180000000
|
||||
|
||||
### eval
|
||||
# val_size: 0.1
|
||||
# per_device_eval_batch_size: 1
|
||||
# eval_strategy: steps
|
||||
# eval_steps: 500
|
||||
@@ -0,0 +1,44 @@
|
||||
### model
|
||||
model_name_or_path: TechxGenus/Meta-Llama-3-8B-Instruct-AWQ
|
||||
trust_remote_code: true
|
||||
|
||||
### method
|
||||
stage: sft
|
||||
do_train: true
|
||||
finetuning_type: lora
|
||||
lora_rank: 8
|
||||
lora_target: all
|
||||
|
||||
### dataset
|
||||
dataset: identity,alpaca_en_demo
|
||||
template: llama3
|
||||
cutoff_len: 2048
|
||||
max_samples: 1000
|
||||
overwrite_cache: true
|
||||
preprocessing_num_workers: 16
|
||||
dataloader_num_workers: 4
|
||||
|
||||
### output
|
||||
output_dir: saves/llama3-8b/lora/sft
|
||||
logging_steps: 10
|
||||
save_steps: 500
|
||||
plot_loss: true
|
||||
overwrite_output_dir: true
|
||||
save_only_model: false
|
||||
report_to: none # choices: [none, wandb, tensorboard, swanlab, mlflow]
|
||||
|
||||
### train
|
||||
per_device_train_batch_size: 1
|
||||
gradient_accumulation_steps: 8
|
||||
learning_rate: 1.0e-4
|
||||
num_train_epochs: 3.0
|
||||
lr_scheduler_type: cosine
|
||||
warmup_ratio: 0.1
|
||||
bf16: true
|
||||
ddp_timeout: 180000000
|
||||
|
||||
### eval
|
||||
# val_size: 0.1
|
||||
# per_device_eval_batch_size: 1
|
||||
# eval_strategy: steps
|
||||
# eval_steps: 500
|
||||
@@ -0,0 +1,47 @@
|
||||
### model
|
||||
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
|
||||
quantization_bit: 4
|
||||
quantization_method: bnb
|
||||
double_quantization: false
|
||||
trust_remote_code: true
|
||||
|
||||
### method
|
||||
stage: sft
|
||||
do_train: true
|
||||
finetuning_type: lora
|
||||
lora_rank: 8
|
||||
lora_target: all
|
||||
|
||||
### dataset
|
||||
dataset: identity,alpaca_en_demo
|
||||
template: llama3
|
||||
cutoff_len: 2048
|
||||
max_samples: 1000
|
||||
overwrite_cache: true
|
||||
preprocessing_num_workers: 16
|
||||
dataloader_num_workers: 4
|
||||
|
||||
### output
|
||||
output_dir: saves/llama3-8b/lora/sft
|
||||
logging_steps: 10
|
||||
save_steps: 500
|
||||
plot_loss: true
|
||||
overwrite_output_dir: true
|
||||
save_only_model: false
|
||||
report_to: none # choices: [none, wandb, tensorboard, swanlab, mlflow]
|
||||
|
||||
### train
|
||||
per_device_train_batch_size: 1
|
||||
gradient_accumulation_steps: 8
|
||||
learning_rate: 1.0e-4
|
||||
num_train_epochs: 3.0
|
||||
lr_scheduler_type: cosine
|
||||
warmup_ratio: 0.1
|
||||
bf16: true
|
||||
ddp_timeout: 180000000
|
||||
|
||||
### eval
|
||||
# val_size: 0.1
|
||||
# per_device_eval_batch_size: 1
|
||||
# eval_strategy: steps
|
||||
# eval_steps: 500
|
||||
@@ -0,0 +1,44 @@
|
||||
### model
|
||||
model_name_or_path: TechxGenus/Meta-Llama-3-8B-Instruct-GPTQ
|
||||
trust_remote_code: true
|
||||
|
||||
### method
|
||||
stage: sft
|
||||
do_train: true
|
||||
finetuning_type: lora
|
||||
lora_rank: 8
|
||||
lora_target: all
|
||||
|
||||
### dataset
|
||||
dataset: identity,alpaca_en_demo
|
||||
template: llama3
|
||||
cutoff_len: 2048
|
||||
max_samples: 1000
|
||||
overwrite_cache: true
|
||||
preprocessing_num_workers: 16
|
||||
dataloader_num_workers: 4
|
||||
|
||||
### output
|
||||
output_dir: saves/llama3-8b/lora/sft
|
||||
logging_steps: 10
|
||||
save_steps: 500
|
||||
plot_loss: true
|
||||
overwrite_output_dir: true
|
||||
save_only_model: false
|
||||
report_to: none # choices: [none, wandb, tensorboard, swanlab, mlflow]
|
||||
|
||||
### train
|
||||
per_device_train_batch_size: 1
|
||||
gradient_accumulation_steps: 8
|
||||
learning_rate: 1.0e-4
|
||||
num_train_epochs: 3.0
|
||||
lr_scheduler_type: cosine
|
||||
warmup_ratio: 0.1
|
||||
bf16: true
|
||||
ddp_timeout: 180000000
|
||||
|
||||
### eval
|
||||
# val_size: 0.1
|
||||
# per_device_eval_batch_size: 1
|
||||
# eval_strategy: steps
|
||||
# eval_steps: 500
|
||||
@@ -0,0 +1,46 @@
|
||||
### model
|
||||
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
|
||||
quantization_bit: 4 # choices: [8 (bnb/hqq/eetq), 4 (bnb/hqq), 3 (hqq), 2 (hqq)]
|
||||
quantization_method: bnb # choices: [bnb, hqq, eetq]
|
||||
trust_remote_code: true
|
||||
|
||||
### method
|
||||
stage: sft
|
||||
do_train: true
|
||||
finetuning_type: lora
|
||||
lora_rank: 8
|
||||
lora_target: all
|
||||
|
||||
### dataset
|
||||
dataset: identity,alpaca_en_demo
|
||||
template: llama3
|
||||
cutoff_len: 2048
|
||||
max_samples: 1000
|
||||
overwrite_cache: true
|
||||
preprocessing_num_workers: 16
|
||||
dataloader_num_workers: 4
|
||||
|
||||
### output
|
||||
output_dir: saves/llama3-8b/lora/sft
|
||||
logging_steps: 10
|
||||
save_steps: 500
|
||||
plot_loss: true
|
||||
overwrite_output_dir: true
|
||||
save_only_model: false
|
||||
report_to: none # choices: [none, wandb, tensorboard, swanlab, mlflow]
|
||||
|
||||
### train
|
||||
per_device_train_batch_size: 1
|
||||
gradient_accumulation_steps: 8
|
||||
learning_rate: 1.0e-4
|
||||
num_train_epochs: 3.0
|
||||
lr_scheduler_type: cosine
|
||||
warmup_ratio: 0.1
|
||||
bf16: true
|
||||
ddp_timeout: 180000000
|
||||
|
||||
### eval
|
||||
# val_size: 0.1
|
||||
# per_device_eval_batch_size: 1
|
||||
# eval_strategy: steps
|
||||
# eval_steps: 500
|
||||
Reference in New Issue
Block a user