diff --git "a/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" "b/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" index 0ae7bd93f..0cf45af52 100644 --- "a/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" +++ "b/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" @@ -456,6 +456,9 @@ |[damo/nlp_polylm_13b_text_generation](https://modelscope.cn/models/damo/nlp_polylm_13b_text_generation)|polylm|default|-|-|[DAMO-NLP-MT/polylm-13b](https://huggingface.co/DAMO-NLP-MT/polylm-13b)| |[AI-ModelScope/aya-expanse-8b](https://modelscope.cn/models/AI-ModelScope/aya-expanse-8b)|aya|aya|transformers>=4.44.0|-|[CohereForAI/aya-expanse-8b](https://huggingface.co/CohereForAI/aya-expanse-8b)| |[AI-ModelScope/aya-expanse-32b](https://modelscope.cn/models/AI-ModelScope/aya-expanse-32b)|aya|aya|transformers>=4.44.0|-|[CohereForAI/aya-expanse-32b](https://huggingface.co/CohereForAI/aya-expanse-32b)| +|[answerdotai/ModernBERT-base](https://modelscope.cn/models/answerdotai/ModernBERT-base)|modern_bert|dummy|-|-|[answerdotai/ModernBERT-base](https://huggingface.co/answerdotai/ModernBERT-base)| +|[answerdotai/ModernBERT-large](https://modelscope.cn/models/answerdotai/ModernBERT-large)|modern_bert|dummy|-|-|[answerdotai/ModernBERT-large](https://huggingface.co/answerdotai/ModernBERT-large)| +|[iic/nlp_structbert_backbone_base_std](https://modelscope.cn/models/iic/nlp_structbert_backbone_base_std)|bert|dummy|-|-|-| ### 多模态大模型 @@ -466,24 +469,24 @@ |[Qwen/Qwen-VL-Chat-Int4](https://modelscope.cn/models/Qwen/Qwen-VL-Chat-Int4)|qwen_vl|qwen_vl|-|vision|[Qwen/Qwen-VL-Chat-Int4](https://huggingface.co/Qwen/Qwen-VL-Chat-Int4)| |[Qwen/Qwen-Audio-Chat](https://modelscope.cn/models/Qwen/Qwen-Audio-Chat)|qwen_audio|qwen_audio|-|audio|[Qwen/Qwen-Audio-Chat](https://huggingface.co/Qwen/Qwen-Audio-Chat)| |[Qwen/Qwen-Audio](https://modelscope.cn/models/Qwen/Qwen-Audio)|qwen_audio|qwen_audio|-|audio|[Qwen/Qwen-Audio](https://huggingface.co/Qwen/Qwen-Audio)| -|[Qwen/Qwen2-VL-2B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-VL-2B-Instruct)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav|vision, video|[Qwen/Qwen2-VL-2B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct)| -|[Qwen/Qwen2-VL-7B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-VL-7B-Instruct)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav|vision, video|[Qwen/Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct)| -|[Qwen/Qwen2-VL-72B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-VL-72B-Instruct)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav|vision, video|[Qwen/Qwen2-VL-72B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct)| -|[Qwen/Qwen2-VL-2B](https://modelscope.cn/models/Qwen/Qwen2-VL-2B)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav|vision, video|[Qwen/Qwen2-VL-2B](https://huggingface.co/Qwen/Qwen2-VL-2B)| -|[Qwen/Qwen2-VL-7B](https://modelscope.cn/models/Qwen/Qwen2-VL-7B)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav|vision, video|[Qwen/Qwen2-VL-7B](https://huggingface.co/Qwen/Qwen2-VL-7B)| -|[Qwen/Qwen2-VL-72B](https://modelscope.cn/models/Qwen/Qwen2-VL-72B)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav|vision, 
video|[Qwen/Qwen2-VL-72B](https://huggingface.co/Qwen/Qwen2-VL-72B)| -|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4](https://modelscope.cn/models/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav|vision, video|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4)| -|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4](https://modelscope.cn/models/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav|vision, video|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4)| -|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4](https://modelscope.cn/models/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav|vision, video|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4)| -|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8](https://modelscope.cn/models/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav|vision, video|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8)| -|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8](https://modelscope.cn/models/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav|vision, video|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8)| -|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8](https://modelscope.cn/models/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav|vision, video|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8)| -|[Qwen/Qwen2-VL-2B-Instruct-AWQ](https://modelscope.cn/models/Qwen/Qwen2-VL-2B-Instruct-AWQ)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav|vision, video|[Qwen/Qwen2-VL-2B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-AWQ)| -|[Qwen/Qwen2-VL-7B-Instruct-AWQ](https://modelscope.cn/models/Qwen/Qwen2-VL-7B-Instruct-AWQ)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav|vision, video|[Qwen/Qwen2-VL-7B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-AWQ)| -|[Qwen/Qwen2-VL-72B-Instruct-AWQ](https://modelscope.cn/models/Qwen/Qwen2-VL-72B-Instruct-AWQ)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav|vision, video|[Qwen/Qwen2-VL-72B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-AWQ)| +|[Qwen/Qwen2-VL-2B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-VL-2B-Instruct)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav, decord|vision, video|[Qwen/Qwen2-VL-2B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct)| +|[Qwen/Qwen2-VL-7B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-VL-7B-Instruct)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav, decord|vision, video|[Qwen/Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct)| +|[Qwen/Qwen2-VL-72B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-VL-72B-Instruct)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav, decord|vision, video|[Qwen/Qwen2-VL-72B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct)| +|[Qwen/Qwen2-VL-2B](https://modelscope.cn/models/Qwen/Qwen2-VL-2B)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav, decord|vision, video|[Qwen/Qwen2-VL-2B](https://huggingface.co/Qwen/Qwen2-VL-2B)| 
+|[Qwen/Qwen2-VL-7B](https://modelscope.cn/models/Qwen/Qwen2-VL-7B)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav, decord|vision, video|[Qwen/Qwen2-VL-7B](https://huggingface.co/Qwen/Qwen2-VL-7B)| +|[Qwen/Qwen2-VL-72B](https://modelscope.cn/models/Qwen/Qwen2-VL-72B)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav, decord|vision, video|[Qwen/Qwen2-VL-72B](https://huggingface.co/Qwen/Qwen2-VL-72B)| +|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4](https://modelscope.cn/models/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav, decord|vision, video|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4)| +|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4](https://modelscope.cn/models/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav, decord|vision, video|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4)| +|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4](https://modelscope.cn/models/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav, decord|vision, video|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4)| +|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8](https://modelscope.cn/models/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav, decord|vision, video|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8)| +|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8](https://modelscope.cn/models/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav, decord|vision, video|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8)| +|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8](https://modelscope.cn/models/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav, decord|vision, video|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8)| +|[Qwen/Qwen2-VL-2B-Instruct-AWQ](https://modelscope.cn/models/Qwen/Qwen2-VL-2B-Instruct-AWQ)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav, decord|vision, video|[Qwen/Qwen2-VL-2B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-AWQ)| +|[Qwen/Qwen2-VL-7B-Instruct-AWQ](https://modelscope.cn/models/Qwen/Qwen2-VL-7B-Instruct-AWQ)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav, decord|vision, video|[Qwen/Qwen2-VL-7B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-AWQ)| +|[Qwen/Qwen2-VL-72B-Instruct-AWQ](https://modelscope.cn/models/Qwen/Qwen2-VL-72B-Instruct-AWQ)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav, decord|vision, video|[Qwen/Qwen2-VL-72B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-AWQ)| |[Qwen/Qwen2-Audio-7B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-Audio-7B-Instruct)|qwen2_audio|qwen2_audio|transformers>=4.45, librosa|audio|[Qwen/Qwen2-Audio-7B-Instruct](https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct)| |[Qwen/Qwen2-Audio-7B](https://modelscope.cn/models/Qwen/Qwen2-Audio-7B)|qwen2_audio|qwen2_audio|transformers>=4.45, librosa|audio|[Qwen/Qwen2-Audio-7B](https://huggingface.co/Qwen/Qwen2-Audio-7B)| -|[Qwen/QVQ-72B-Preview](https://modelscope.cn/models/Qwen/QVQ-72B-Preview)|qvq|qvq|transformers>=4.45, qwen_vl_utils, pyav|vision, 
video|[Qwen/QVQ-72B-Preview](https://huggingface.co/Qwen/QVQ-72B-Preview)| +|[Qwen/QVQ-72B-Preview](https://modelscope.cn/models/Qwen/QVQ-72B-Preview)|qvq|qvq|transformers>=4.45, qwen_vl_utils, pyav, decord|vision, video|[Qwen/QVQ-72B-Preview](https://huggingface.co/Qwen/QVQ-72B-Preview)| |[AIDC-AI/Ovis1.6-Gemma2-9B](https://modelscope.cn/models/AIDC-AI/Ovis1.6-Gemma2-9B)|ovis1_6|ovis1_6|transformers>=4.42|vision|[AIDC-AI/Ovis1.6-Gemma2-9B](https://huggingface.co/AIDC-AI/Ovis1.6-Gemma2-9B)| |[ZhipuAI/glm-4v-9b](https://modelscope.cn/models/ZhipuAI/glm-4v-9b)|glm4v|glm4v|transformers>=4.42|-|[THUDM/glm-4v-9b](https://huggingface.co/THUDM/glm-4v-9b)| |[ZhipuAI/glm-edge-v-2b](https://modelscope.cn/models/ZhipuAI/glm-edge-v-2b)|glm_edge_v|glm_edge_v|transformers>=4.46|vision|[THUDM/glm-edge-v-2b](https://huggingface.co/THUDM/glm-edge-v-2b)| diff --git a/docs/source_en/Instruction/Supported-models-and-datasets.md b/docs/source_en/Instruction/Supported-models-and-datasets.md index 56397f528..b5ac0a249 100644 --- a/docs/source_en/Instruction/Supported-models-and-datasets.md +++ b/docs/source_en/Instruction/Supported-models-and-datasets.md @@ -456,6 +456,9 @@ The table below introduces the models integrated with ms-swift: |[damo/nlp_polylm_13b_text_generation](https://modelscope.cn/models/damo/nlp_polylm_13b_text_generation)|polylm|default|-|-|[DAMO-NLP-MT/polylm-13b](https://huggingface.co/DAMO-NLP-MT/polylm-13b)| |[AI-ModelScope/aya-expanse-8b](https://modelscope.cn/models/AI-ModelScope/aya-expanse-8b)|aya|aya|transformers>=4.44.0|-|[CohereForAI/aya-expanse-8b](https://huggingface.co/CohereForAI/aya-expanse-8b)| |[AI-ModelScope/aya-expanse-32b](https://modelscope.cn/models/AI-ModelScope/aya-expanse-32b)|aya|aya|transformers>=4.44.0|-|[CohereForAI/aya-expanse-32b](https://huggingface.co/CohereForAI/aya-expanse-32b)| +|[answerdotai/ModernBERT-base](https://modelscope.cn/models/answerdotai/ModernBERT-base)|modern_bert|dummy|-|-|[answerdotai/ModernBERT-base](https://huggingface.co/answerdotai/ModernBERT-base)| +|[answerdotai/ModernBERT-large](https://modelscope.cn/models/answerdotai/ModernBERT-large)|modern_bert|dummy|-|-|[answerdotai/ModernBERT-large](https://huggingface.co/answerdotai/ModernBERT-large)| +|[iic/nlp_structbert_backbone_base_std](https://modelscope.cn/models/iic/nlp_structbert_backbone_base_std)|bert|dummy|-|-|-| ### Multimodal large models @@ -466,24 +469,24 @@ The table below introduces the models integrated with ms-swift: |[Qwen/Qwen-VL-Chat-Int4](https://modelscope.cn/models/Qwen/Qwen-VL-Chat-Int4)|qwen_vl|qwen_vl|-|vision|[Qwen/Qwen-VL-Chat-Int4](https://huggingface.co/Qwen/Qwen-VL-Chat-Int4)| |[Qwen/Qwen-Audio-Chat](https://modelscope.cn/models/Qwen/Qwen-Audio-Chat)|qwen_audio|qwen_audio|-|audio|[Qwen/Qwen-Audio-Chat](https://huggingface.co/Qwen/Qwen-Audio-Chat)| |[Qwen/Qwen-Audio](https://modelscope.cn/models/Qwen/Qwen-Audio)|qwen_audio|qwen_audio|-|audio|[Qwen/Qwen-Audio](https://huggingface.co/Qwen/Qwen-Audio)| -|[Qwen/Qwen2-VL-2B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-VL-2B-Instruct)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav|vision, video|[Qwen/Qwen2-VL-2B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct)| -|[Qwen/Qwen2-VL-7B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-VL-7B-Instruct)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav|vision, video|[Qwen/Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct)| 
-|[Qwen/Qwen2-VL-72B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-VL-72B-Instruct)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav|vision, video|[Qwen/Qwen2-VL-72B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct)| -|[Qwen/Qwen2-VL-2B](https://modelscope.cn/models/Qwen/Qwen2-VL-2B)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav|vision, video|[Qwen/Qwen2-VL-2B](https://huggingface.co/Qwen/Qwen2-VL-2B)| -|[Qwen/Qwen2-VL-7B](https://modelscope.cn/models/Qwen/Qwen2-VL-7B)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav|vision, video|[Qwen/Qwen2-VL-7B](https://huggingface.co/Qwen/Qwen2-VL-7B)| -|[Qwen/Qwen2-VL-72B](https://modelscope.cn/models/Qwen/Qwen2-VL-72B)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav|vision, video|[Qwen/Qwen2-VL-72B](https://huggingface.co/Qwen/Qwen2-VL-72B)| -|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4](https://modelscope.cn/models/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav|vision, video|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4)| -|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4](https://modelscope.cn/models/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav|vision, video|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4)| -|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4](https://modelscope.cn/models/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav|vision, video|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4)| -|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8](https://modelscope.cn/models/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav|vision, video|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8)| -|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8](https://modelscope.cn/models/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav|vision, video|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8)| -|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8](https://modelscope.cn/models/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav|vision, video|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8)| -|[Qwen/Qwen2-VL-2B-Instruct-AWQ](https://modelscope.cn/models/Qwen/Qwen2-VL-2B-Instruct-AWQ)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav|vision, video|[Qwen/Qwen2-VL-2B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-AWQ)| -|[Qwen/Qwen2-VL-7B-Instruct-AWQ](https://modelscope.cn/models/Qwen/Qwen2-VL-7B-Instruct-AWQ)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav|vision, video|[Qwen/Qwen2-VL-7B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-AWQ)| -|[Qwen/Qwen2-VL-72B-Instruct-AWQ](https://modelscope.cn/models/Qwen/Qwen2-VL-72B-Instruct-AWQ)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav|vision, video|[Qwen/Qwen2-VL-72B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-AWQ)| +|[Qwen/Qwen2-VL-2B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-VL-2B-Instruct)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav, decord|vision, video|[Qwen/Qwen2-VL-2B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct)| 
+|[Qwen/Qwen2-VL-7B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-VL-7B-Instruct)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav, decord|vision, video|[Qwen/Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct)| +|[Qwen/Qwen2-VL-72B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-VL-72B-Instruct)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav, decord|vision, video|[Qwen/Qwen2-VL-72B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct)| +|[Qwen/Qwen2-VL-2B](https://modelscope.cn/models/Qwen/Qwen2-VL-2B)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav, decord|vision, video|[Qwen/Qwen2-VL-2B](https://huggingface.co/Qwen/Qwen2-VL-2B)| +|[Qwen/Qwen2-VL-7B](https://modelscope.cn/models/Qwen/Qwen2-VL-7B)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav, decord|vision, video|[Qwen/Qwen2-VL-7B](https://huggingface.co/Qwen/Qwen2-VL-7B)| +|[Qwen/Qwen2-VL-72B](https://modelscope.cn/models/Qwen/Qwen2-VL-72B)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav, decord|vision, video|[Qwen/Qwen2-VL-72B](https://huggingface.co/Qwen/Qwen2-VL-72B)| +|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4](https://modelscope.cn/models/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav, decord|vision, video|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4)| +|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4](https://modelscope.cn/models/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav, decord|vision, video|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4)| +|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4](https://modelscope.cn/models/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav, decord|vision, video|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4)| +|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8](https://modelscope.cn/models/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav, decord|vision, video|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8)| +|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8](https://modelscope.cn/models/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav, decord|vision, video|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8)| +|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8](https://modelscope.cn/models/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav, decord|vision, video|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8)| +|[Qwen/Qwen2-VL-2B-Instruct-AWQ](https://modelscope.cn/models/Qwen/Qwen2-VL-2B-Instruct-AWQ)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav, decord|vision, video|[Qwen/Qwen2-VL-2B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-AWQ)| +|[Qwen/Qwen2-VL-7B-Instruct-AWQ](https://modelscope.cn/models/Qwen/Qwen2-VL-7B-Instruct-AWQ)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav, decord|vision, video|[Qwen/Qwen2-VL-7B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-AWQ)| +|[Qwen/Qwen2-VL-72B-Instruct-AWQ](https://modelscope.cn/models/Qwen/Qwen2-VL-72B-Instruct-AWQ)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav, decord|vision, 
video|[Qwen/Qwen2-VL-72B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-AWQ)| |[Qwen/Qwen2-Audio-7B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-Audio-7B-Instruct)|qwen2_audio|qwen2_audio|transformers>=4.45, librosa|audio|[Qwen/Qwen2-Audio-7B-Instruct](https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct)| |[Qwen/Qwen2-Audio-7B](https://modelscope.cn/models/Qwen/Qwen2-Audio-7B)|qwen2_audio|qwen2_audio|transformers>=4.45, librosa|audio|[Qwen/Qwen2-Audio-7B](https://huggingface.co/Qwen/Qwen2-Audio-7B)| -|[Qwen/QVQ-72B-Preview](https://modelscope.cn/models/Qwen/QVQ-72B-Preview)|qvq|qvq|transformers>=4.45, qwen_vl_utils, pyav|vision, video|[Qwen/QVQ-72B-Preview](https://huggingface.co/Qwen/QVQ-72B-Preview)| +|[Qwen/QVQ-72B-Preview](https://modelscope.cn/models/Qwen/QVQ-72B-Preview)|qvq|qvq|transformers>=4.45, qwen_vl_utils, pyav, decord|vision, video|[Qwen/QVQ-72B-Preview](https://huggingface.co/Qwen/QVQ-72B-Preview)| |[AIDC-AI/Ovis1.6-Gemma2-9B](https://modelscope.cn/models/AIDC-AI/Ovis1.6-Gemma2-9B)|ovis1_6|ovis1_6|transformers>=4.42|vision|[AIDC-AI/Ovis1.6-Gemma2-9B](https://huggingface.co/AIDC-AI/Ovis1.6-Gemma2-9B)| |[ZhipuAI/glm-4v-9b](https://modelscope.cn/models/ZhipuAI/glm-4v-9b)|glm4v|glm4v|transformers>=4.42|-|[THUDM/glm-4v-9b](https://huggingface.co/THUDM/glm-4v-9b)| |[ZhipuAI/glm-edge-v-2b](https://modelscope.cn/models/ZhipuAI/glm-edge-v-2b)|glm_edge_v|glm_edge_v|transformers>=4.46|vision|[THUDM/glm-edge-v-2b](https://huggingface.co/THUDM/glm-edge-v-2b)| diff --git a/examples/deploy/client/README.md b/examples/deploy/client/README.md deleted file mode 100644 index 5198b501b..000000000 --- a/examples/deploy/client/README.md +++ /dev/null @@ -1 +0,0 @@ -In each client `.py` program, we have added the `run_deploy` context. The `run_deploy` function is a simple way to deploy locally, making it convenient for users to run the program directly. In common deployment scenarios, users only need to remove the deployment context and modify the `host` and `port` in order to use the client. diff --git a/examples/deploy/client/llm/swift_client.py b/examples/deploy/client/llm/swift_client.py index bad18dd86..16df8d3af 100644 --- a/examples/deploy/client/llm/swift_client.py +++ b/examples/deploy/client/llm/swift_client.py @@ -53,6 +53,7 @@ def run_client(host: str = '127.0.0.1', port: int = 8000): DeployArguments) from swift.plugin import InferStats # TODO: The current 'pt' deployment does not support automatic batch. + # NOTE: In a real deployment scenario, please comment out the context of run_deploy. with run_deploy( DeployArguments(model='Qwen/Qwen2.5-1.5B-Instruct', verbose=False, log_interval=-1, infer_backend='vllm')) as port: diff --git a/examples/deploy/client/mllm/swift_client.py b/examples/deploy/client/mllm/swift_client.py index 2dc76e689..85f7f418c 100644 --- a/examples/deploy/client/mllm/swift_client.py +++ b/examples/deploy/client/mllm/swift_client.py @@ -118,6 +118,7 @@ def run_client(host: str = '127.0.0.1', port: int = 8000): DeployArguments) from swift.plugin import InferStats # TODO: The current 'pt' deployment does not support automatic batch. + # NOTE: In a real deployment scenario, please comment out the context of run_deploy. 
with run_deploy( DeployArguments(model='Qwen/Qwen2-VL-2B-Instruct', verbose=False, log_interval=-1, infer_backend='vllm')) as port: diff --git a/examples/notebook/qwen2.5-self-cognition/self-cognition-sft.ipynb b/examples/notebook/qwen2.5-self-cognition/self-cognition-sft.ipynb index 65c43fdc6..1ecd96cad 100644 --- a/examples/notebook/qwen2.5-self-cognition/self-cognition-sft.ipynb +++ b/examples/notebook/qwen2.5-self-cognition/self-cognition-sft.ipynb @@ -8,7 +8,9 @@ "\n", "Here is a demonstration of using python to perform self-cognition SFT of Qwen2.5-3B-Instruct. Through this tutorial, you can quickly understand some details of swift sft, which will be of great help in customizing ms-swift for you~\n", "\n", - "Are you ready? Let's begin the journey..." + "Are you ready? Let's begin the journey...\n", + "\n", + "中文版:https://modelscope.cn/notebook/share/ipynb/4340fdeb/self-cognition-sft.ipynb" ] }, { diff --git a/examples/notebook/qwen2.5-self-cognition/sft.sh b/examples/notebook/qwen2.5-self-cognition/sft.sh index 43f97974f..5881365a0 100644 --- a/examples/notebook/qwen2.5-self-cognition/sft.sh +++ b/examples/notebook/qwen2.5-self-cognition/sft.sh @@ -26,5 +26,5 @@ swift sft \ --warmup_ratio 0.05 \ --dataloader_num_workers 4 \ --dataset_num_proc 4 \ - --model_author 小黄 'Xiao Huang' \ - --model_name '魔搭' 'ModelScope' + --model_name 小黄 'Xiao Huang' \ + --model_author '魔搭' 'ModelScope' diff --git a/examples/infer/pt/all_to_all.sh b/examples/train/all_to_all/infer.sh similarity index 100% rename from examples/infer/pt/all_to_all.sh rename to examples/train/all_to_all/infer.sh diff --git a/examples/train/seq_cls/bert/deploy.sh b/examples/train/seq_cls/bert/deploy.sh new file mode 100644 index 000000000..13825d349 --- /dev/null +++ b/examples/train/seq_cls/bert/deploy.sh @@ -0,0 +1,9 @@ +CUDA_VISIBLE_DEVICES=0 \ +swift deploy \ + --model output/vx-xxx/checkpoint-xxx \ + --served_model_name bert-base-chinese + +# curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{ +# "model": "bert-base-chinese", +# "messages": [{"role": "user", "content": "Task: Sentiment Classification\nSentence: 包装差,容易被调包。\nCategory: negative, positive\nOutput:"}] +# }' diff --git a/examples/train/seq_cls/bert/infer.sh b/examples/train/seq_cls/bert/infer.sh new file mode 100644 index 000000000..abd8f1f02 --- /dev/null +++ b/examples/train/seq_cls/bert/infer.sh @@ -0,0 +1,5 @@ +CUDA_VISIBLE_DEVICES=0 \ +swift infer \ + --model output/vx-xxx/checkpoint-xxx \ + --load_data_args true \ + --max_batch_size 16 diff --git a/examples/train/seq_cls/bert/sft.sh b/examples/train/seq_cls/bert/sft.sh new file mode 100644 index 000000000..35081e0af --- /dev/null +++ b/examples/train/seq_cls/bert/sft.sh @@ -0,0 +1,24 @@ +# If `num_labels` is provided, it will be considered a classification task, +# and AutoModelForSequenceClassification will be used to load the model. +# The BERT model does not require templates, so it can usually be used without registration. 
+CUDA_VISIBLE_DEVICES=0 \ +swift sft \ + --model AI-ModelScope/bert-base-chinese \ + --train_type full \ + --dataset 'DAMO_NLP/jd:cls#2000' \ + --torch_dtype bfloat16 \ + --num_train_epochs 1 \ + --per_device_train_batch_size 1 \ + --per_device_eval_batch_size 1 \ + --learning_rate 1e-4 \ + --gradient_accumulation_steps 16 \ + --eval_steps 50 \ + --save_steps 50 \ + --save_total_limit 2 \ + --logging_steps 5 \ + --max_length 512 \ + --output_dir output \ + --warmup_ratio 0.05 \ + --dataloader_num_workers 4 \ + --num_labels 2 \ + --task_type seq_cls diff --git a/examples/train/seq_cls/qwen2_5/deploy.sh b/examples/train/seq_cls/qwen2_5/deploy.sh new file mode 100644 index 000000000..5476dae49 --- /dev/null +++ b/examples/train/seq_cls/qwen2_5/deploy.sh @@ -0,0 +1,8 @@ +CUDA_VISIBLE_DEVICES=0 \ +swift deploy \ + --adapters output/vx-xxx/checkpoint-xxx + +# curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{ +# "model": "Qwen2.5-7B", +# "messages": [{"role": "user", "content": "Task: Sentiment Classification\nSentence: 包装差,容易被调包。\nCategory: negative, positive\nOutput:"}] +# }' diff --git a/examples/train/seq_cls/infer.sh b/examples/train/seq_cls/qwen2_5/infer.sh similarity index 83% rename from examples/train/seq_cls/infer.sh rename to examples/train/seq_cls/qwen2_5/infer.sh index c994148de..43aa93bcc 100644 --- a/examples/train/seq_cls/infer.sh +++ b/examples/train/seq_cls/qwen2_5/infer.sh @@ -1,6 +1,5 @@ CUDA_VISIBLE_DEVICES=0 \ swift infer \ --adapters output/vx-xxx/checkpoint-xxx \ - --max_new_tokens 2048 \ --load_data_args true \ --max_batch_size 16 diff --git a/examples/train/seq_cls/sft.sh b/examples/train/seq_cls/qwen2_5/sft.sh similarity index 100% rename from examples/train/seq_cls/sft.sh rename to examples/train/seq_cls/qwen2_5/sft.sh diff --git a/swift/llm/__init__.py b/swift/llm/__init__.py index 1c5013173..2173ce93b 100644 --- a/swift/llm/__init__.py +++ b/swift/llm/__init__.py @@ -18,7 +18,8 @@ from .model import (register_model, MODEL_MAPPING, ModelType, get_model_tokenizer, safe_snapshot_download, HfConfigFactory, ModelInfo, ModelMeta, ModelKeys, register_model_arch, MultiModelKeys, ModelArch, get_model_arch, MODEL_ARCH_MAPPING, get_model_info_meta, get_model_name, ModelGroup, - Model, get_model_tokenizer_with_flash_attn, get_model_tokenizer_multimodal, load_by_unsloth) + Model, get_model_tokenizer_with_flash_attn, get_model_tokenizer_multimodal, load_by_unsloth, + git_clone_github) from .dataset import (AlpacaPreprocessor, ResponsePreprocessor, MessagesPreprocessor, AutoPreprocessor, DATASET_MAPPING, MediaResource, register_dataset, register_dataset_info, EncodePreprocessor, LazyLLMDataset, ConstantLengthDataset, standard_keys, load_dataset, DATASET_TYPE, @@ -51,7 +52,7 @@ 'ModelInfo', 'ModelMeta', 'ModelKeys', 'register_model_arch', 'MultiModelKeys', 'ModelArch', 'MODEL_ARCH_MAPPING', 'get_model_arch', 'get_model_info_meta', 'get_model_name', 'register_model', 'ModelGroup', 'Model', 'get_model_tokenizer_with_flash_attn', 'get_model_tokenizer_multimodal', - 'load_by_unsloth' + 'load_by_unsloth', 'git_clone_github' ], 'dataset': [ 'AlpacaPreprocessor', 'ClsPreprocessor', 'ComposePreprocessor', 'MessagesPreprocessor', 'DATASET_MAPPING', diff --git a/swift/llm/argument/infer_args.py b/swift/llm/argument/infer_args.py index ab5f994e1..29a586a07 100644 --- a/swift/llm/argument/infer_args.py +++ b/swift/llm/argument/infer_args.py @@ -132,11 +132,9 @@ def _init_result_path(self, folder_name: str) -> None: def _init_stream(self): self.eval_human = 
not (self.dataset and self.split_dataset_ratio > 0 or self.val_dataset) - if self.stream and self.template: - template_meta = get_template_meta(self.template) - if self.num_beams != 1 or not template_meta.support_stream: - self.stream = False - logger.info('Setting args.stream: False') + if self.stream and self.num_beams != 1: + self.stream = False + logger.info('Setting args.stream: False') def _init_pt_ddp(self): if self.infer_backend != 'pt' or not is_dist(): diff --git a/swift/llm/dataset/dataset/llm.py b/swift/llm/dataset/dataset/llm.py index d0d5b002c..56af8a47e 100644 --- a/swift/llm/dataset/dataset/llm.py +++ b/swift/llm/dataset/dataset/llm.py @@ -9,17 +9,20 @@ from ..register import DatasetMeta, SubsetDataset, register_dataset -def _concat_inst_inp_alpaca_zh(inst: str, inp: str) -> str: - if inp.startswith('输入:'): - inp = inp[3:] - return f'{inst}\n{inp}' +class AlpacaZhPreprocessor(AlpacaPreprocessor): + + @classmethod + def concat_inst_input(cls, instruction, input_): + if input_ and input_.startswith('输入:'): + input_ = input_[3:] + return super().concat_inst_input(instruction, input_) register_dataset( DatasetMeta( ms_dataset_id='AI-ModelScope/alpaca-gpt4-data-zh', hf_dataset_id='llm-wizard/alpaca-gpt4-data-zh', - preprocess_func=AlpacaPreprocessor(concat_inst_input=_concat_inst_inp_alpaca_zh), + preprocess_func=AlpacaZhPreprocessor(), tags=['chat', 'general', '🔥'], )) diff --git a/swift/llm/dataset/preprocessor/core.py b/swift/llm/dataset/preprocessor/core.py index 24c6fa73a..c3d692f4c 100644 --- a/swift/llm/dataset/preprocessor/core.py +++ b/swift/llm/dataset/preprocessor/core.py @@ -312,18 +312,14 @@ def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]: class AlpacaPreprocessor(ResponsePreprocessor): - def __init__(self, - *, - concat_inst_input: Union[Callable[[str, str], str]] = '\n', - columns_mapping: Optional[Dict[str, str]] = None, - **kwargs) -> None: - """Alpaca format preprocessor - - Args: - concat_inst_input: The concat sep between instruction and input - """ - super().__init__(columns_mapping=columns_mapping, **kwargs) - self.concat_inst_input = concat_inst_input + @classmethod + def concat_inst_input(cls, instruction, input_): + if instruction and input_: + query = f'{instruction}\n{input_}' + else: + query = instruction or input_ + assert isinstance(query, str), f'query: {query}' + return query def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]: instruction = row.pop('instruction', None) @@ -331,15 +327,7 @@ def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]: output = row.pop('output', None) if output is not None: row['response'] = output - - if instruction is not None or input_ is not None: - instruction = instruction or '' - input_ = input_ or '' - if isinstance(self.concat_inst_input, str): - query = instruction + self.concat_inst_input + input_ - else: - query = self.concat_inst_input(instruction, input_) - row['query'] = query + row['query'] = self.concat_inst_input(instruction, input_) return super().preprocess(row) diff --git a/swift/llm/infer/deploy.py b/swift/llm/infer/deploy.py index 1d1956da1..ff0ab8ec5 100644 --- a/swift/llm/infer/deploy.py +++ b/swift/llm/infer/deploy.py @@ -121,6 +121,8 @@ def _post_process(self, request_info, response, return_cmpl_response: bool = Fal def _set_request_config(self, request_config) -> None: default_request_config = self.args.get_request_config() + if default_request_config is None: + return for key, val in asdict(request_config).items(): default_val = 
getattr(default_request_config, key) if default_val is not None and (val is None or isinstance(val, (list, tuple)) and len(val) == 0): diff --git a/swift/llm/model/__init__.py b/swift/llm/model/__init__.py index d0db4befb..754d71520 100644 --- a/swift/llm/model/__init__.py +++ b/swift/llm/model/__init__.py @@ -6,4 +6,4 @@ get_default_torch_dtype, get_model_info_meta, get_model_name, get_model_tokenizer, get_model_tokenizer_multimodal, get_model_tokenizer_with_flash_attn, get_model_with_value_head, load_by_unsloth, register_model) -from .utils import HfConfigFactory, ModelInfo, safe_snapshot_download +from .utils import HfConfigFactory, ModelInfo, git_clone_github, safe_snapshot_download diff --git a/swift/llm/model/constant.py b/swift/llm/model/constant.py index 4ac9de3d7..a87f901c7 100644 --- a/swift/llm/model/constant.py +++ b/swift/llm/model/constant.py @@ -94,6 +94,9 @@ class LLMModelType: polylm = 'polylm' aya = 'aya' + modern_bert = 'modern_bert' + bert = 'bert' + class MLLMModelType: qwen_vl = 'qwen_vl' diff --git a/swift/llm/model/model/__init__.py b/swift/llm/model/model/__init__.py index 8808a518b..a972ec64e 100644 --- a/swift/llm/model/model/__init__.py +++ b/swift/llm/model/model/__init__.py @@ -1,2 +1,2 @@ -from . import (baai, baichuan, codefuse, deepseek, gemma, glm, internlm, llama, llava, llm, mamba, microsoft, minicpm, - mistral, mllm, mplug, openbuddy, qwen, telechat, yi) +from . import (baai, baichuan, bert, codefuse, deepseek, gemma, glm, internlm, llama, llava, llm, mamba, microsoft, + minicpm, mistral, mllm, mplug, openbuddy, qwen, telechat, yi) diff --git a/swift/llm/model/model/bert.py b/swift/llm/model/model/bert.py new file mode 100644 index 000000000..f83aef353 --- /dev/null +++ b/swift/llm/model/model/bert.py @@ -0,0 +1,38 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from transformers import AutoConfig + +from swift.utils import get_logger +from ..constant import LLMModelType +from ..register import Model, ModelGroup, ModelMeta, get_model_tokenizer_from_local, register_model + +logger = get_logger() + + +def get_model_tokenizer_modern_bert(model_dir, *args, **kwargs): + model_config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True) + model_config.reference_compile = False + kwargs['model_config'] = model_config + return get_model_tokenizer_from_local(model_dir, *args, **kwargs) + + +register_model( + ModelMeta( + LLMModelType.modern_bert, [ + ModelGroup([ + Model('answerdotai/ModernBERT-base', 'answerdotai/ModernBERT-base'), + Model('answerdotai/ModernBERT-large', 'answerdotai/ModernBERT-large'), + ]) + ], + None, + get_model_tokenizer_modern_bert, + requires=['transformers>=4.48'], + tags=['bert'])) + +register_model( + ModelMeta( + LLMModelType.bert, [ModelGroup([ + Model('iic/nlp_structbert_backbone_base_std'), + ])], + None, + get_model_tokenizer_from_local, + tags=['bert'])) diff --git a/swift/llm/model/register.py b/swift/llm/model/register.py index ca5003ce3..a98406eb8 100644 --- a/swift/llm/model/register.py +++ b/swift/llm/model/register.py @@ -53,7 +53,7 @@ class ModelMeta: # Used to list the model_ids from modelscope/huggingface, # which participate in the automatic inference of the model_type. 
model_groups: List[ModelGroup] - template: str + template: Optional[str] get_function: GetModelTokenizerFunction model_arch: Optional[str] = None @@ -70,6 +70,8 @@ class ModelMeta: tags: List[str] = field(default_factory=list) def __post_init__(self): + if self.template is None: + self.template = 'dummy' if not isinstance(self.model_groups, (list, tuple)): self.model_groups = [self.model_groups] @@ -508,7 +510,9 @@ def get_model_tokenizer( tokenizer.model_info = model_info tokenizer.model_meta = model_meta - pad_token = tokenizer.pad_token_id or tokenizer.eos_token_id + pad_token = tokenizer.pad_token_id + if pad_token is None: + pad_token = tokenizer.eos_token_id if tokenizer.eos_token_id is None: tokenizer.eos_token_id = pad_token if tokenizer.pad_token_id is None: diff --git a/swift/llm/model/utils.py b/swift/llm/model/utils.py index 7195b266e..efacbc91f 100644 --- a/swift/llm/model/utils.py +++ b/swift/llm/model/utils.py @@ -274,6 +274,8 @@ def git_clone_github(github_url: str, local_repo_name: Optional[str] = None, branch: Optional[str] = None, commit_hash: Optional[str] = None) -> str: + if github_url.endswith('.git'): + github_url = github_url[:-4] git_cache_dir = os.path.join(get_cache_dir(), '_github') os.makedirs(git_cache_dir, exist_ok=True) if local_repo_name is None: @@ -282,8 +284,7 @@ def git_clone_github(github_url: str, local_repo_path = os.path.join(git_cache_dir, local_repo_name) with safe_ddp_context(hash_id=local_repo_path): if not os.path.exists(local_repo_path): - if not github_url.endswith('.git'): - github_url = f'{github_url}.git' + github_url = f'{github_url}.git' command = ['git', '-C', git_cache_dir, 'clone', github_url, local_repo_name] command_str = f"git -C '{git_cache_dir}' clone '{github_url}' {local_repo_name}" if branch is not None: diff --git a/swift/llm/template/base.py b/swift/llm/template/base.py index 1ec5107cf..7384678cc 100644 --- a/swift/llm/template/base.py +++ b/swift/llm/template/base.py @@ -179,6 +179,12 @@ def _kto_encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]: encoded['label'] = label return encoded + def _seq_cls_encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]: + encoded = self._encode(inputs) + if inputs.label is not None: + encoded['labels'] = int(inputs.label) + return encoded + def encode(self, inputs: Union[TemplateInputs, Dict[str, Any], InferRequest], return_template_inputs: bool = False) -> Dict[str, Any]: @@ -201,8 +207,10 @@ def encode(self, encoded = Template._encode(self, inputs) for key in ['images', 'audios', 'videos']: encoded[key] = getattr(inputs, key) - elif self.mode in {'pt', 'train', 'seq_cls'}: + elif self.mode in {'pt', 'train'}: encoded = self._encode(inputs) + elif self.mode == 'seq_cls': + encoded = self._seq_cls_encode(inputs) elif self.mode == 'rlhf': encoded = self._rlhf_encode(inputs) elif self.mode == 'kto': @@ -599,7 +607,8 @@ def _swift_encode(self, inputs: StdTemplateInputs): def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]: template_backend = self.template_backend - if self.template_meta.template_type == 'dummy' and self.use_chat_template and not self.is_training: + if (self.template_meta.template_type == 'dummy' and self.use_chat_template and not self.is_training + and self.mode != 'seq_cls'): template_backend = 'jinja' res_context_list, loss_scale_list, answer_len = ( self._swift_encode(inputs) if template_backend == 'swift' else self._jinja_encode(inputs)) @@ -649,10 +658,6 @@ def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]: for k in 
list(encoded.keys()): if k.endswith('loss_scale'): encoded[k] = None - - # sequence_classification - if inputs.label is not None: - encoded['label'] = inputs.label return encoded def _debug_logger(self, generate_ids): @@ -807,7 +812,7 @@ def _seq_cls_data_collator(self, batch: List[Dict[str, Any]], *, padding_to: Optional[int] = None) -> Dict[str, Any]: - labels = [b['label'] for b in batch if b.get('label') is not None] + labels = [b.pop('labels') for b in batch if b.get('labels') is not None] res = self._data_collator(batch, padding_to=padding_to) if labels: res['labels'] = torch.tensor(labels, dtype=torch.long) @@ -930,8 +935,9 @@ def print_inputs(self, inputs: Dict[str, Any], tokenizer_kwargs: Optional[Dict[s if val is not None: key_upper = key.upper() logger.info(f'[{key_upper}_IDS] {val}') - val_str = self.safe_decode(val, **tokenizer_kwargs) - logger.info(f'[{key_upper}] {val_str}') + if isinstance(val, (list, tuple, torch.Tensor)): + val_str = self.safe_decode(val, **tokenizer_kwargs) + logger.info(f'[{key_upper}] {val_str}') if inputs.get('loss_scale') is not None: val = inputs['loss_scale'] logger.info(f'[LOSS_SCALE] {val}') diff --git a/swift/llm/template/template/microsoft.py b/swift/llm/template/template/microsoft.py index d94f441fe..e7a46d9f6 100644 --- a/swift/llm/template/template/microsoft.py +++ b/swift/llm/template/template/microsoft.py @@ -87,7 +87,7 @@ def decode(self, generate_ids: List[int], **kwargs) -> Any: chat_sep=None, suffix=[''], template_cls=FlorenceTemplate, - support_stream=False)) + )) @dataclass diff --git a/swift/llm/template/template/utils.py b/swift/llm/template/template/utils.py index f1c8182fd..fcbdddf19 100644 --- a/swift/llm/template/template/utils.py +++ b/swift/llm/template/template/utils.py @@ -22,8 +22,9 @@ class ChatmlTemplateMeta(TemplateMeta): @dataclass class EmptyTemplateMeta(TemplateMeta): prefix: Prompt = field(default_factory=list) - prompt: Prompt = field(default_factory=list) - chat_sep: Optional[Prompt] = field(default_factory=list) + prompt: Prompt = field(default_factory=lambda: ['{{QUERY}}']) + chat_sep: Optional[Prompt] = None + auto_add_bos: bool = True register_template(ChatmlTemplateMeta(LLMTemplateType.chatml)) diff --git a/swift/llm/template/template_meta.py b/swift/llm/template/template_meta.py index 5ca9ea136..98520516c 100644 --- a/swift/llm/template/template_meta.py +++ b/swift/llm/template/template_meta.py @@ -47,7 +47,6 @@ class TemplateMeta: placeholder_tokens: List[Union[int, str]] = field(default_factory=list) default_tools_prompt: str = 'react_en' - support_stream: bool = True def to_generate_template_meta(self) -> 'TemplateMeta': self = deepcopy(self) @@ -60,7 +59,6 @@ def to_generate_template_meta(self) -> 'TemplateMeta': auto_add_bos=True, stop_words=self.stop_words, placeholder_tokens=self.placeholder_tokens, - support_stream=self.support_stream, ) @staticmethod diff --git a/swift/ui/app.py b/swift/ui/app.py index 5ea31dfea..92d87e5a4 100644 --- a/swift/ui/app.py +++ b/swift/ui/app.py @@ -87,6 +87,13 @@ def run(self): if is_gradio_app: from swift.utils import find_free_port LLMInfer.element('port').value = str(find_free_port()) + for f in fields(self.args): + if getattr(self.args, f.name) and f.name in LLMInfer.elements() and hasattr( + LLMInfer.elements()[f.name], 'value') and f.name != 'port': + value = getattr(self.args, f.name) + if isinstance(value, list): + value = ' '.join([v or '' for v in value]) + LLMInfer.elements()[f.name].value = value app.load(LLMInfer.deploy_model, 
list(LLMInfer.valid_elements().values()), [LLMInfer.element('runtime_tab'), LLMInfer.element('running_tasks')]) diff --git a/swift/ui/llm_infer/llm_infer.py b/swift/ui/llm_infer/llm_infer.py index d855d34d8..6b6b609fe 100644 --- a/swift/ui/llm_infer/llm_infer.py +++ b/swift/ui/llm_infer/llm_infer.py @@ -297,7 +297,7 @@ def deploy_model(cls, *args): time.sleep(1) cnt += 1 if cnt >= 60: - logger.warn(f'Deploy costing too much time, please check log file: {log_file}') + logger.warning_once(f'Deploy costing too much time, please check log file: {log_file}') logger.info('Deploy done.') cls.deployed = True running_task = Runtime.refresh_tasks(log_file) diff --git a/swift/utils/torch_utils.py b/swift/utils/torch_utils.py index 512a50ba7..d318e752e 100644 --- a/swift/utils/torch_utils.py +++ b/swift/utils/torch_utils.py @@ -221,9 +221,9 @@ def find_all_linears(model: nn.Module) -> List[str]: else: linear_cls = [nn.Linear] - # 'score': classification model + # 'score', 'classifier': classification model # 'v_head': reward model - ignore_layers = [lm_head_name, 'score', 'v_head'] + ignore_layers = [lm_head_name, 'score', 'v_head', 'classifier'] return _find_layers( model, lambda name, module: isinstance(module, tuple(linear_cls)) and all(layer not in name for layer in ignore_layers)) diff --git a/tests/general/test_dataset.py b/tests/general/test_dataset.py index cf4da8312..371401fbe 100644 --- a/tests/general/test_dataset.py +++ b/tests/general/test_dataset.py @@ -15,8 +15,11 @@ def test_sft(): # _test_dataset(['AI-ModelScope/Duet-v0.5']) # _test_dataset(['swift/SlimOrca', 'swift/cosmopedia-100k']) # _test_dataset(['OmniData/Zhihu-KOL-More-Than-100-Upvotes']) - _test_dataset(['OmniData/Zhihu-KOL']) - # _test_dataset(['AI-ModelScope/alpaca-gpt4-data-zh#1000', 'AI-ModelScope/alpaca-gpt4-data-en#200']) + # _test_dataset(['OmniData/Zhihu-KOL']) + _test_dataset([ + 'AI-ModelScope/alpaca-gpt4-data-zh#1000', 'AI-ModelScope/alpaca-gpt4-data-en#1000', + 'AI-ModelScope/LongAlpaca-12k#1000' + ]) # _test_dataset(['swift/Infinity-Instruct:all']) # _test_dataset(['swift/sharegpt:all']) # _test_dataset(['AI-ModelScope/sharegpt_gpt4:all']) diff --git a/tests/general/test_stream.py b/tests/general/test_stream.py index 08828d12f..ad2069622 100644 --- a/tests/general/test_stream.py +++ b/tests/general/test_stream.py @@ -3,11 +3,10 @@ def test_local_dataset(): # please use git clone - local_dataset = '/mnt/nas2/huangjintao.hjt/work/datasets/swift-sft-mixture:firefly#100' - dataset = load_dataset(datasets=[local_dataset], streaming=True)[0] - for i, x in enumerate(dataset): - pass - print(i, x) + from swift.llm import git_clone_github + model_dir = git_clone_github('https://www.modelscope.cn/datasets/swift/swift-sft-mixture.git') + dataset = load_dataset(datasets=[f'{model_dir}:firefly'], streaming=True)[0] + print(next(iter(dataset))) def test_hub_dataset(): diff --git a/tests/train/test_cls.py b/tests/train/test_cls.py index 5d20f790b..27ed5b902 100644 --- a/tests/train/test_cls.py +++ b/tests/train/test_cls.py @@ -1,9 +1,11 @@ import os +os.environ['CUDA_VISIBLE_DEVICES'] = '0' + kwargs = { 'per_device_train_batch_size': 2, 'per_device_eval_batch_size': 2, - 'save_steps': 10, + 'save_steps': 50, 'gradient_accumulation_steps': 4, 'num_train_epochs': 1, } @@ -12,10 +14,31 @@ def test_llm(): from swift.llm import TrainArguments, sft_main, infer_main, InferArguments result = sft_main( - TrainArguments(model='Qwen/Qwen2.5-7B-Instruct', num_labels=2, dataset=['DAMO_NLP/jd:cls#2000'], **kwargs)) + TrainArguments( + 
model='Qwen/Qwen2.5-1.5B-Instruct', + train_type='lora', + num_labels=2, + dataset=['DAMO_NLP/jd:cls#2000'], + **kwargs)) last_model_checkpoint = result['last_model_checkpoint'] infer_main(InferArguments(adapters=last_model_checkpoint, load_data_args=True)) +def test_bert(): + + from swift.llm import TrainArguments, sft_main, infer_main, InferArguments + result = sft_main( + TrainArguments( + model='answerdotai/ModernBERT-base', + # model='iic/nlp_structbert_backbone_base_std', + train_type='full', + num_labels=2, + dataset=['DAMO_NLP/jd:cls#2000'], + **kwargs)) + last_model_checkpoint = result['last_model_checkpoint'] + infer_main(InferArguments(model=last_model_checkpoint, load_data_args=True)) + + if __name__ == '__main__': - test_llm() + # test_llm() + test_bert()
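
The Alpaca preprocessor refactor above turns `concat_inst_input` into a classmethod: the base class joins `instruction` and `input` with a single newline, and `AlpacaZhPreprocessor` strips a leading `输入:` ("Input:") marker before delegating to the base class. A minimal sketch of the resulting behavior, assuming ms-swift with this patch installed (the sample strings are hypothetical; the `AlpacaZhPreprocessor` module path is the one added in swift/llm/dataset/dataset/llm.py):

# Behavioral sketch of the refactored Alpaca preprocessing; exercises only the string logic,
# so no model or dataset download is needed.
from swift.llm import AlpacaPreprocessor
from swift.llm.dataset.dataset.llm import AlpacaZhPreprocessor

# Base class: instruction and input are joined with a newline.
assert AlpacaPreprocessor.concat_inst_input('Summarize the text below.', 'Some text.') \
    == 'Summarize the text below.\nSome text.'

# zh variant: a leading '输入:' prefix on the input field is stripped before delegating.
assert AlpacaZhPreprocessor.concat_inst_input('总结下面的文本。', '输入:一些文本。') \
    == '总结下面的文本。\n一些文本。'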
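
The new BERT sequence-classification path can also be driven end to end from Python; the sketch below mirrors examples/train/seq_cls/bert/sft.sh and test_bert() in tests/train/test_cls.py, with hyperparameters taken from those files (the GPU index is illustrative, and the dataset-row comment is an assumption based on the `inputs.label` handling in template/base.py):

# Hedged sketch of the BERT seq_cls flow added in this patch.
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'  # illustrative GPU selection

from swift.llm import TrainArguments, sft_main, infer_main, InferArguments

# With `num_labels` set, the model is loaded via AutoModelForSequenceClassification.
# Custom seq_cls datasets are assumed to carry a `label` field per sample, e.g.
# {"messages": [...], "label": 1}.
result = sft_main(
    TrainArguments(
        model='answerdotai/ModernBERT-base',  # or 'iic/nlp_structbert_backbone_base_std'; ModernBERT requires transformers>=4.48
        train_type='full',
        task_type='seq_cls',
        num_labels=2,
        dataset=['DAMO_NLP/jd:cls#2000'],
        num_train_epochs=1,
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2))
last_model_checkpoint = result['last_model_checkpoint']
infer_main(InferArguments(model=last_model_checkpoint, load_data_args=True))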