From e471074f6427943113323eb917331b6b951eb32d Mon Sep 17 00:00:00 2001
From: Jintao
Date: Thu, 26 Dec 2024 20:21:36 +0800
Subject: [PATCH 1/2] Fix deepspeed (#2778)

---
 ...44\350\241\214\345\217\202\346\225\260.md" |  2 +-
 .../Instruction/Command-line-parameters.md    |  2 +-
 swift/llm/argument/train_args.py              |  5 ++-
 swift/llm/argument/tuner_args.py              |  2 +-
 swift/llm/ds_config/zero0.json                | 31 ++++++++++++++++
 swift/llm/ds_config/zero1.json                | 35 +++++++++++++++++++
 swift/llm/ds_config/zero2.json                | 18 ----------
 swift/llm/ds_config/zero2_offload.json        | 18 ----------
 swift/llm/ds_config/zero3.json                | 18 ----------
 swift/llm/ds_config/zero3_offload.json        | 18 ----------
 swift/trainers/trainers.py                    |  2 --
 swift/ui/llm_train/advanced.py                |  2 +-
 swift/utils/utils.py                          |  2 +-
 13 files changed, 75 insertions(+), 80 deletions(-)
 create mode 100644 swift/llm/ds_config/zero0.json
 create mode 100644 swift/llm/ds_config/zero1.json

diff --git "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md"
index 5ec575c0b..db8dd4201 100644
--- "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md"
+++ "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md"
@@ -87,7 +87,7 @@
 
 - 🔥output_dir: 默认为`output/`
 - 🔥gradient_checkpointing: 是否使用gradient_checkpointing,默认为True
-- 🔥deepspeed: 默认为None. 可以设置为'zero2', 'zero3', 'zero2_offload', 'zero3_offload'来使用ms-swift内置的deepspeed配置文件
+- 🔥deepspeed: 默认为None. 可以设置为'zero0', 'zero1', 'zero2', 'zero3', 'zero2_offload', 'zero3_offload'来使用ms-swift内置的deepspeed配置文件
 - 🔥per_device_train_batch_size: 默认值1
 - 🔥per_device_eval_batch_size: 默认值1
 - weight_decay: weight衰减系数,默认值0.1
diff --git a/docs/source_en/Instruction/Command-line-parameters.md b/docs/source_en/Instruction/Command-line-parameters.md
index 21e7020d6..28be6ae39 100644
--- a/docs/source_en/Instruction/Command-line-parameters.md
+++ b/docs/source_en/Instruction/Command-line-parameters.md
@@ -88,7 +88,7 @@ This parameter list inherits from transformers `Seq2SeqTrainingArguments`, with
 
 - 🔥output_dir: Default is `output/`.
 - 🔥gradient_checkpointing: Whether to use gradient checkpointing, default is True.
-- 🔥deepspeed: Default is None. Can be set to 'zero2', 'zero3', 'zero2_offload', 'zero3_offload' to use the built-in deepspeed configuration files from ms-swift.
+- 🔥deepspeed: Default is None. Can be set to 'zero0', 'zero1', 'zero2', 'zero3', 'zero2_offload', 'zero3_offload' to use the built-in deepspeed configuration files from ms-swift.
 - 🔥per_device_train_batch_size: Default is 1.
 - 🔥per_device_eval_batch_size: Default is 1.
 - weight_decay: Weight decay coefficient, default value is 0.1.
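Note: a minimal sketch of the preset lookup that the `train_args.py` hunk below extends. `DS_CONFIG_FOLDER` is a placeholder (the real code derives the folder from `__file__`); only the preset list and the name-to-JSON mapping mirror the patch.

```python
import os

# Placeholder for swift/llm/ds_config; the patched code computes this
# from os.path.dirname(__file__).
DS_CONFIG_FOLDER = '/path/to/swift/llm/ds_config'


def resolve_deepspeed(deepspeed: str) -> str:
    # The patch adds 'zero0' and 'zero1' to this preset list.
    presets = ['zero0', 'zero1', 'zero2', 'zero3', 'zero2_offload', 'zero3_offload']
    mapping = {name: f'{name}.json' for name in presets}
    if deepspeed in mapping:
        # A preset name resolves to one of the bundled JSON configs.
        return os.path.join(DS_CONFIG_FOLDER, mapping[deepspeed])
    # Anything else is assumed to already be a path to a custom config.
    return deepspeed


print(resolve_deepspeed('zero1'))  # -> /path/to/swift/llm/ds_config/zero1.json
```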
diff --git a/swift/llm/argument/train_args.py b/swift/llm/argument/train_args.py
index 1d2566930..a2deb20b5 100644
--- a/swift/llm/argument/train_args.py
+++ b/swift/llm/argument/train_args.py
@@ -164,7 +164,10 @@ def _init_deepspeed(self):
                     f'local_world_size: {self.local_world_size}.')
 
         ds_config_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'ds_config'))
-        deepspeed_mapping = {name: f'{name}.json' for name in ['zero2', 'zero3', 'zero2_offload', 'zero3_offload']}
+        deepspeed_mapping = {
+            name: f'{name}.json'
+            for name in ['zero0', 'zero1', 'zero2', 'zero3', 'zero2_offload', 'zero3_offload']
+        }
         for ds_name, ds_config in deepspeed_mapping.items():
             if self.deepspeed == ds_name:
                 self.deepspeed = os.path.join(ds_config_folder, ds_config)
diff --git a/swift/llm/argument/tuner_args.py b/swift/llm/argument/tuner_args.py
index b920ac602..79ef33d50 100644
--- a/swift/llm/argument/tuner_args.py
+++ b/swift/llm/argument/tuner_args.py
@@ -222,4 +222,4 @@ def _init_multimodal_full(self):
         if self.freeze_parameters:
             logger.info(f'freeze_parameters: {self.freeze_parameters}')
         if self.trainable_parameters:
-            logger.info(f'trainable_parameters: {self.trainable_parameters}')
+            logger.info(f'additional trainable_parameters: {self.trainable_parameters}')
diff --git a/swift/llm/ds_config/zero0.json b/swift/llm/ds_config/zero0.json
new file mode 100644
index 000000000..4ba4833e4
--- /dev/null
+++ b/swift/llm/ds_config/zero0.json
@@ -0,0 +1,31 @@
+{
+    "fp16": {
+        "enabled": "auto",
+        "loss_scale": 0,
+        "loss_scale_window": 1000,
+        "initial_scale_power": 16,
+        "hysteresis": 2,
+        "min_loss_scale": 1
+    },
+
+    "bf16": {
+        "enabled": "auto"
+    },
+
+    "zero_optimization": {
+        "stage": 0,
+        "allgather_partitions": true,
+        "allgather_bucket_size": 2e8,
+        "overlap_comm": true,
+        "reduce_scatter": true,
+        "reduce_bucket_size": 2e8,
+        "contiguous_gradients": true
+    },
+
+    "gradient_accumulation_steps": "auto",
+    "gradient_clipping": "auto",
+    "steps_per_print": 2000,
+    "train_batch_size": "auto",
+    "train_micro_batch_size_per_gpu": "auto",
+    "wall_clock_breakdown": false
+}
diff --git a/swift/llm/ds_config/zero1.json b/swift/llm/ds_config/zero1.json
new file mode 100644
index 000000000..ad439964e
--- /dev/null
+++ b/swift/llm/ds_config/zero1.json
@@ -0,0 +1,35 @@
+{
+    "fp16": {
+        "enabled": "auto",
+        "loss_scale": 0,
+        "loss_scale_window": 1000,
+        "initial_scale_power": 16,
+        "hysteresis": 2,
+        "min_loss_scale": 1
+    },
+
+    "bf16": {
+        "enabled": "auto"
+    },
+
+    "zero_optimization": {
+        "stage": 1,
+        "offload_optimizer": {
+            "device": "none",
+            "pin_memory": true
+        },
+        "allgather_partitions": true,
+        "allgather_bucket_size": 2e8,
+        "overlap_comm": true,
+        "reduce_scatter": true,
+        "reduce_bucket_size": 2e8,
+        "contiguous_gradients": true
+    },
+
+    "gradient_accumulation_steps": "auto",
+    "gradient_clipping": "auto",
+    "steps_per_print": 2000,
+    "train_batch_size": "auto",
+    "train_micro_batch_size_per_gpu": "auto",
+    "wall_clock_breakdown": false
+}
diff --git a/swift/llm/ds_config/zero2.json b/swift/llm/ds_config/zero2.json
index d190eef52..c09e5055b 100644
--- a/swift/llm/ds_config/zero2.json
+++ b/swift/llm/ds_config/zero2.json
@@ -12,24 +12,6 @@
         "enabled": "auto"
     },
 
-    "optimizer": {
-        "type": "AdamW",
-        "params": {
-            "lr": "auto",
-            "betas": "auto",
-            "eps": "auto",
-            "weight_decay": "auto"
-        }
-    },
-
-    "scheduler": {
-        "type": "WarmupCosineLR",
-        "params": {
-            "total_num_steps": "auto",
-            "warmup_num_steps": "auto"
-        }
-    },
-
     "zero_optimization": {
         "stage": 2,
         "offload_optimizer": {
diff --git a/swift/llm/ds_config/zero2_offload.json b/swift/llm/ds_config/zero2_offload.json
index 651022fd5..d7d472f15 100644
--- a/swift/llm/ds_config/zero2_offload.json
+++ b/swift/llm/ds_config/zero2_offload.json
@@ -12,24 +12,6 @@
         "enabled": "auto"
     },
 
-    "optimizer": {
-        "type": "AdamW",
-        "params": {
-            "lr": "auto",
-            "betas": "auto",
-            "eps": "auto",
-            "weight_decay": "auto"
-        }
-    },
-
-    "scheduler": {
-        "type": "WarmupCosineLR",
-        "params": {
-            "total_num_steps": "auto",
-            "warmup_num_steps": "auto"
-        }
-    },
-
     "zero_optimization": {
         "stage": 2,
         "offload_optimizer": {
diff --git a/swift/llm/ds_config/zero3.json b/swift/llm/ds_config/zero3.json
index ab32d0bc7..23bbe0e50 100644
--- a/swift/llm/ds_config/zero3.json
+++ b/swift/llm/ds_config/zero3.json
@@ -12,24 +12,6 @@
         "enabled": "auto"
     },
 
-    "optimizer": {
-        "type": "AdamW",
-        "params": {
-            "lr": "auto",
-            "betas": "auto",
-            "eps": "auto",
-            "weight_decay": "auto"
-        }
-    },
-
-    "scheduler": {
-        "type": "WarmupCosineLR",
-        "params": {
-            "total_num_steps": "auto",
-            "warmup_num_steps": "auto"
-        }
-    },
-
     "zero_optimization": {
         "stage": 3,
         "offload_optimizer": {
diff --git a/swift/llm/ds_config/zero3_offload.json b/swift/llm/ds_config/zero3_offload.json
index 965e1d8e1..043e97c16 100644
--- a/swift/llm/ds_config/zero3_offload.json
+++ b/swift/llm/ds_config/zero3_offload.json
@@ -12,24 +12,6 @@
         "enabled": "auto"
     },
 
-    "optimizer": {
-        "type": "AdamW",
-        "params": {
-            "lr": "auto",
-            "betas": "auto",
-            "eps": "auto",
-            "weight_decay": "auto"
-        }
-    },
-
-    "scheduler": {
-        "type": "WarmupCosineLR",
-        "params": {
-            "total_num_steps": "auto",
-            "warmup_num_steps": "auto"
-        }
-    },
-
     "zero_optimization": {
         "stage": 3,
         "offload_optimizer": {
diff --git a/swift/trainers/trainers.py b/swift/trainers/trainers.py
index 1614b2bdc..12ac987c0 100644
--- a/swift/trainers/trainers.py
+++ b/swift/trainers/trainers.py
@@ -149,8 +149,6 @@ def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=N
             labels = inputs['labels']
             # fix https://github.com/huggingface/transformers/issues/34263
             if num_items_in_batch is not None:
-                if getattr(self.args, 'average_tokens_across_devices', False):
-                    outputs.loss *= self.accelerator.num_processes
                 outputs.loss = outputs.loss * (labels[:, 1:] != -100).sum() / num_items_in_batch
 
         if isinstance(outputs, dict) and 'loss' not in outputs:
diff --git a/swift/ui/llm_train/advanced.py b/swift/ui/llm_train/advanced.py
index eb9f04c6f..a4583316c 100644
--- a/swift/ui/llm_train/advanced.py
+++ b/swift/ui/llm_train/advanced.py
@@ -159,6 +159,6 @@ def do_build_ui(cls, base_tab: Type['BaseUI']):
                     scale=20,
                     allow_custom_value=True,
                     value=None,
-                    choices=['zero2', 'zero3', 'zero2_offload', 'zero3_offload'])
+                    choices=['zero0', 'zero1', 'zero2', 'zero3', 'zero2_offload', 'zero3_offload'])
             with gr.Row():
                 gr.Textbox(elem_id='more_params', lines=4, scale=20)
diff --git a/swift/utils/utils.py b/swift/utils/utils.py
index 9ad002e4a..cc1c3d3d4 100644
--- a/swift/utils/utils.py
+++ b/swift/utils/utils.py
@@ -37,7 +37,7 @@ def check_json_format(obj: Any, token_safe: bool = True) -> Any:
     elif isinstance(obj, Mapping):
         res = {}
         for k, v in obj.items():
-            if token_safe and isinstance(k, str) and '_token' in k:
+            if token_safe and isinstance(k, str) and '_token' in k and isinstance(v, str):
                 res[k] = None
             else:
                 res[k] = check_json_format(v, token_safe)

From 53945855d148766a908235903478ff09e97ffeaa Mon Sep 17 00:00:00 2001
From: Jintao
Date: Thu, 26 Dec 2024 20:58:23 +0800
Subject: [PATCH 2/2] fix qwen2vl (#2779)

---
...14\346\225\260\346\215\256\351\233\206.md" | 38 +++++++++---------- .../Supported-models-and-datasets.md | 38 +++++++++---------- swift/llm/__init__.py | 4 +- swift/llm/model/model/qwen.py | 16 +++++++- swift/llm/template/__init__.py | 2 +- swift/llm/template/template/qwen.py | 15 ++------ 6 files changed, 59 insertions(+), 54 deletions(-) diff --git "a/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" "b/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" index 0cf45af52..8b2ecd8df 100644 --- "a/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" +++ "b/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" @@ -456,9 +456,9 @@ |[damo/nlp_polylm_13b_text_generation](https://modelscope.cn/models/damo/nlp_polylm_13b_text_generation)|polylm|default|-|-|[DAMO-NLP-MT/polylm-13b](https://huggingface.co/DAMO-NLP-MT/polylm-13b)| |[AI-ModelScope/aya-expanse-8b](https://modelscope.cn/models/AI-ModelScope/aya-expanse-8b)|aya|aya|transformers>=4.44.0|-|[CohereForAI/aya-expanse-8b](https://huggingface.co/CohereForAI/aya-expanse-8b)| |[AI-ModelScope/aya-expanse-32b](https://modelscope.cn/models/AI-ModelScope/aya-expanse-32b)|aya|aya|transformers>=4.44.0|-|[CohereForAI/aya-expanse-32b](https://huggingface.co/CohereForAI/aya-expanse-32b)| -|[answerdotai/ModernBERT-base](https://modelscope.cn/models/answerdotai/ModernBERT-base)|modern_bert|dummy|-|-|[answerdotai/ModernBERT-base](https://huggingface.co/answerdotai/ModernBERT-base)| -|[answerdotai/ModernBERT-large](https://modelscope.cn/models/answerdotai/ModernBERT-large)|modern_bert|dummy|-|-|[answerdotai/ModernBERT-large](https://huggingface.co/answerdotai/ModernBERT-large)| -|[iic/nlp_structbert_backbone_base_std](https://modelscope.cn/models/iic/nlp_structbert_backbone_base_std)|bert|dummy|-|-|-| +|[answerdotai/ModernBERT-base](https://modelscope.cn/models/answerdotai/ModernBERT-base)|modern_bert|dummy|transformers>=4.48|bert|[answerdotai/ModernBERT-base](https://huggingface.co/answerdotai/ModernBERT-base)| +|[answerdotai/ModernBERT-large](https://modelscope.cn/models/answerdotai/ModernBERT-large)|modern_bert|dummy|transformers>=4.48|bert|[answerdotai/ModernBERT-large](https://huggingface.co/answerdotai/ModernBERT-large)| +|[iic/nlp_structbert_backbone_base_std](https://modelscope.cn/models/iic/nlp_structbert_backbone_base_std)|bert|dummy|-|bert|-| ### 多模态大模型 @@ -469,24 +469,24 @@ |[Qwen/Qwen-VL-Chat-Int4](https://modelscope.cn/models/Qwen/Qwen-VL-Chat-Int4)|qwen_vl|qwen_vl|-|vision|[Qwen/Qwen-VL-Chat-Int4](https://huggingface.co/Qwen/Qwen-VL-Chat-Int4)| |[Qwen/Qwen-Audio-Chat](https://modelscope.cn/models/Qwen/Qwen-Audio-Chat)|qwen_audio|qwen_audio|-|audio|[Qwen/Qwen-Audio-Chat](https://huggingface.co/Qwen/Qwen-Audio-Chat)| |[Qwen/Qwen-Audio](https://modelscope.cn/models/Qwen/Qwen-Audio)|qwen_audio|qwen_audio|-|audio|[Qwen/Qwen-Audio](https://huggingface.co/Qwen/Qwen-Audio)| -|[Qwen/Qwen2-VL-2B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-VL-2B-Instruct)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav, decord|vision, video|[Qwen/Qwen2-VL-2B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct)| 
-|[Qwen/Qwen2-VL-7B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-VL-7B-Instruct)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav, decord|vision, video|[Qwen/Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct)| -|[Qwen/Qwen2-VL-72B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-VL-72B-Instruct)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav, decord|vision, video|[Qwen/Qwen2-VL-72B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct)| -|[Qwen/Qwen2-VL-2B](https://modelscope.cn/models/Qwen/Qwen2-VL-2B)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav, decord|vision, video|[Qwen/Qwen2-VL-2B](https://huggingface.co/Qwen/Qwen2-VL-2B)| -|[Qwen/Qwen2-VL-7B](https://modelscope.cn/models/Qwen/Qwen2-VL-7B)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav, decord|vision, video|[Qwen/Qwen2-VL-7B](https://huggingface.co/Qwen/Qwen2-VL-7B)| -|[Qwen/Qwen2-VL-72B](https://modelscope.cn/models/Qwen/Qwen2-VL-72B)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav, decord|vision, video|[Qwen/Qwen2-VL-72B](https://huggingface.co/Qwen/Qwen2-VL-72B)| -|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4](https://modelscope.cn/models/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav, decord|vision, video|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4)| -|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4](https://modelscope.cn/models/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav, decord|vision, video|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4)| -|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4](https://modelscope.cn/models/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav, decord|vision, video|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4)| -|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8](https://modelscope.cn/models/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav, decord|vision, video|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8)| -|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8](https://modelscope.cn/models/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav, decord|vision, video|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8)| -|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8](https://modelscope.cn/models/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav, decord|vision, video|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8)| -|[Qwen/Qwen2-VL-2B-Instruct-AWQ](https://modelscope.cn/models/Qwen/Qwen2-VL-2B-Instruct-AWQ)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav, decord|vision, video|[Qwen/Qwen2-VL-2B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-AWQ)| -|[Qwen/Qwen2-VL-7B-Instruct-AWQ](https://modelscope.cn/models/Qwen/Qwen2-VL-7B-Instruct-AWQ)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav, decord|vision, video|[Qwen/Qwen2-VL-7B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-AWQ)| -|[Qwen/Qwen2-VL-72B-Instruct-AWQ](https://modelscope.cn/models/Qwen/Qwen2-VL-72B-Instruct-AWQ)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav, decord|vision, 
video|[Qwen/Qwen2-VL-72B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-AWQ)| +|[Qwen/Qwen2-VL-2B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-VL-2B-Instruct)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, pyav, decord|vision, video|[Qwen/Qwen2-VL-2B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct)| +|[Qwen/Qwen2-VL-7B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-VL-7B-Instruct)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, pyav, decord|vision, video|[Qwen/Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct)| +|[Qwen/Qwen2-VL-72B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-VL-72B-Instruct)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, pyav, decord|vision, video|[Qwen/Qwen2-VL-72B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct)| +|[Qwen/Qwen2-VL-2B](https://modelscope.cn/models/Qwen/Qwen2-VL-2B)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, pyav, decord|vision, video|[Qwen/Qwen2-VL-2B](https://huggingface.co/Qwen/Qwen2-VL-2B)| +|[Qwen/Qwen2-VL-7B](https://modelscope.cn/models/Qwen/Qwen2-VL-7B)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, pyav, decord|vision, video|[Qwen/Qwen2-VL-7B](https://huggingface.co/Qwen/Qwen2-VL-7B)| +|[Qwen/Qwen2-VL-72B](https://modelscope.cn/models/Qwen/Qwen2-VL-72B)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, pyav, decord|vision, video|[Qwen/Qwen2-VL-72B](https://huggingface.co/Qwen/Qwen2-VL-72B)| +|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4](https://modelscope.cn/models/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, pyav, decord|vision, video|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4)| +|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4](https://modelscope.cn/models/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, pyav, decord|vision, video|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4)| +|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4](https://modelscope.cn/models/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, pyav, decord|vision, video|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4)| +|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8](https://modelscope.cn/models/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, pyav, decord|vision, video|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8)| +|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8](https://modelscope.cn/models/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, pyav, decord|vision, video|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8)| +|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8](https://modelscope.cn/models/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, pyav, decord|vision, video|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8)| +|[Qwen/Qwen2-VL-2B-Instruct-AWQ](https://modelscope.cn/models/Qwen/Qwen2-VL-2B-Instruct-AWQ)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, pyav, decord|vision, video|[Qwen/Qwen2-VL-2B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-AWQ)| 
+|[Qwen/Qwen2-VL-7B-Instruct-AWQ](https://modelscope.cn/models/Qwen/Qwen2-VL-7B-Instruct-AWQ)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, pyav, decord|vision, video|[Qwen/Qwen2-VL-7B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-AWQ)| +|[Qwen/Qwen2-VL-72B-Instruct-AWQ](https://modelscope.cn/models/Qwen/Qwen2-VL-72B-Instruct-AWQ)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, pyav, decord|vision, video|[Qwen/Qwen2-VL-72B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-AWQ)| |[Qwen/Qwen2-Audio-7B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-Audio-7B-Instruct)|qwen2_audio|qwen2_audio|transformers>=4.45, librosa|audio|[Qwen/Qwen2-Audio-7B-Instruct](https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct)| |[Qwen/Qwen2-Audio-7B](https://modelscope.cn/models/Qwen/Qwen2-Audio-7B)|qwen2_audio|qwen2_audio|transformers>=4.45, librosa|audio|[Qwen/Qwen2-Audio-7B](https://huggingface.co/Qwen/Qwen2-Audio-7B)| -|[Qwen/QVQ-72B-Preview](https://modelscope.cn/models/Qwen/QVQ-72B-Preview)|qvq|qvq|transformers>=4.45, qwen_vl_utils, pyav, decord|vision, video|[Qwen/QVQ-72B-Preview](https://huggingface.co/Qwen/QVQ-72B-Preview)| +|[Qwen/QVQ-72B-Preview](https://modelscope.cn/models/Qwen/QVQ-72B-Preview)|qvq|qvq|transformers>=4.45, qwen_vl_utils>=0.0.6, pyav, decord|vision, video|[Qwen/QVQ-72B-Preview](https://huggingface.co/Qwen/QVQ-72B-Preview)| |[AIDC-AI/Ovis1.6-Gemma2-9B](https://modelscope.cn/models/AIDC-AI/Ovis1.6-Gemma2-9B)|ovis1_6|ovis1_6|transformers>=4.42|vision|[AIDC-AI/Ovis1.6-Gemma2-9B](https://huggingface.co/AIDC-AI/Ovis1.6-Gemma2-9B)| |[ZhipuAI/glm-4v-9b](https://modelscope.cn/models/ZhipuAI/glm-4v-9b)|glm4v|glm4v|transformers>=4.42|-|[THUDM/glm-4v-9b](https://huggingface.co/THUDM/glm-4v-9b)| |[ZhipuAI/glm-edge-v-2b](https://modelscope.cn/models/ZhipuAI/glm-edge-v-2b)|glm_edge_v|glm_edge_v|transformers>=4.46|vision|[THUDM/glm-edge-v-2b](https://huggingface.co/THUDM/glm-edge-v-2b)| diff --git a/docs/source_en/Instruction/Supported-models-and-datasets.md b/docs/source_en/Instruction/Supported-models-and-datasets.md index b5ac0a249..6b693513d 100644 --- a/docs/source_en/Instruction/Supported-models-and-datasets.md +++ b/docs/source_en/Instruction/Supported-models-and-datasets.md @@ -456,9 +456,9 @@ The table below introduces the models integrated with ms-swift: |[damo/nlp_polylm_13b_text_generation](https://modelscope.cn/models/damo/nlp_polylm_13b_text_generation)|polylm|default|-|-|[DAMO-NLP-MT/polylm-13b](https://huggingface.co/DAMO-NLP-MT/polylm-13b)| |[AI-ModelScope/aya-expanse-8b](https://modelscope.cn/models/AI-ModelScope/aya-expanse-8b)|aya|aya|transformers>=4.44.0|-|[CohereForAI/aya-expanse-8b](https://huggingface.co/CohereForAI/aya-expanse-8b)| |[AI-ModelScope/aya-expanse-32b](https://modelscope.cn/models/AI-ModelScope/aya-expanse-32b)|aya|aya|transformers>=4.44.0|-|[CohereForAI/aya-expanse-32b](https://huggingface.co/CohereForAI/aya-expanse-32b)| -|[answerdotai/ModernBERT-base](https://modelscope.cn/models/answerdotai/ModernBERT-base)|modern_bert|dummy|-|-|[answerdotai/ModernBERT-base](https://huggingface.co/answerdotai/ModernBERT-base)| -|[answerdotai/ModernBERT-large](https://modelscope.cn/models/answerdotai/ModernBERT-large)|modern_bert|dummy|-|-|[answerdotai/ModernBERT-large](https://huggingface.co/answerdotai/ModernBERT-large)| -|[iic/nlp_structbert_backbone_base_std](https://modelscope.cn/models/iic/nlp_structbert_backbone_base_std)|bert|dummy|-|-|-| 
+|[answerdotai/ModernBERT-base](https://modelscope.cn/models/answerdotai/ModernBERT-base)|modern_bert|dummy|transformers>=4.48|bert|[answerdotai/ModernBERT-base](https://huggingface.co/answerdotai/ModernBERT-base)|
+|[answerdotai/ModernBERT-large](https://modelscope.cn/models/answerdotai/ModernBERT-large)|modern_bert|dummy|transformers>=4.48|bert|[answerdotai/ModernBERT-large](https://huggingface.co/answerdotai/ModernBERT-large)|
+|[iic/nlp_structbert_backbone_base_std](https://modelscope.cn/models/iic/nlp_structbert_backbone_base_std)|bert|dummy|-|bert|-|
 
 ### Multimodal large models
 
@@ -469,24 +469,24 @@
 |[Qwen/Qwen-VL-Chat-Int4](https://modelscope.cn/models/Qwen/Qwen-VL-Chat-Int4)|qwen_vl|qwen_vl|-|vision|[Qwen/Qwen-VL-Chat-Int4](https://huggingface.co/Qwen/Qwen-VL-Chat-Int4)|
 |[Qwen/Qwen-Audio-Chat](https://modelscope.cn/models/Qwen/Qwen-Audio-Chat)|qwen_audio|qwen_audio|-|audio|[Qwen/Qwen-Audio-Chat](https://huggingface.co/Qwen/Qwen-Audio-Chat)|
 |[Qwen/Qwen-Audio](https://modelscope.cn/models/Qwen/Qwen-Audio)|qwen_audio|qwen_audio|-|audio|[Qwen/Qwen-Audio](https://huggingface.co/Qwen/Qwen-Audio)|
-|[Qwen/Qwen2-VL-2B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-VL-2B-Instruct)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav, decord|vision, video|[Qwen/Qwen2-VL-2B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct)|
-|[Qwen/Qwen2-VL-7B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-VL-7B-Instruct)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav, decord|vision, video|[Qwen/Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct)|
-|[Qwen/Qwen2-VL-72B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-VL-72B-Instruct)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav, decord|vision, video|[Qwen/Qwen2-VL-72B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct)|
-|[Qwen/Qwen2-VL-2B](https://modelscope.cn/models/Qwen/Qwen2-VL-2B)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav, decord|vision, video|[Qwen/Qwen2-VL-2B](https://huggingface.co/Qwen/Qwen2-VL-2B)|
-|[Qwen/Qwen2-VL-7B](https://modelscope.cn/models/Qwen/Qwen2-VL-7B)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav, decord|vision, video|[Qwen/Qwen2-VL-7B](https://huggingface.co/Qwen/Qwen2-VL-7B)|
-|[Qwen/Qwen2-VL-72B](https://modelscope.cn/models/Qwen/Qwen2-VL-72B)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav, decord|vision, video|[Qwen/Qwen2-VL-72B](https://huggingface.co/Qwen/Qwen2-VL-72B)|
-|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4](https://modelscope.cn/models/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav, decord|vision, video|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4)|
-|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4](https://modelscope.cn/models/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav, decord|vision, video|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4)|
-|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4](https://modelscope.cn/models/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav, decord|vision, video|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4)|
-|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8](https://modelscope.cn/models/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav, decord|vision, video|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8)|
-|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8](https://modelscope.cn/models/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav, decord|vision, video|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8)|
-|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8](https://modelscope.cn/models/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav, decord|vision, video|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8)|
-|[Qwen/Qwen2-VL-2B-Instruct-AWQ](https://modelscope.cn/models/Qwen/Qwen2-VL-2B-Instruct-AWQ)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav, decord|vision, video|[Qwen/Qwen2-VL-2B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-AWQ)|
-|[Qwen/Qwen2-VL-7B-Instruct-AWQ](https://modelscope.cn/models/Qwen/Qwen2-VL-7B-Instruct-AWQ)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav, decord|vision, video|[Qwen/Qwen2-VL-7B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-AWQ)|
-|[Qwen/Qwen2-VL-72B-Instruct-AWQ](https://modelscope.cn/models/Qwen/Qwen2-VL-72B-Instruct-AWQ)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils, pyav, decord|vision, video|[Qwen/Qwen2-VL-72B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-AWQ)|
+|[Qwen/Qwen2-VL-2B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-VL-2B-Instruct)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, pyav, decord|vision, video|[Qwen/Qwen2-VL-2B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct)|
+|[Qwen/Qwen2-VL-7B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-VL-7B-Instruct)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, pyav, decord|vision, video|[Qwen/Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct)|
+|[Qwen/Qwen2-VL-72B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-VL-72B-Instruct)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, pyav, decord|vision, video|[Qwen/Qwen2-VL-72B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct)|
+|[Qwen/Qwen2-VL-2B](https://modelscope.cn/models/Qwen/Qwen2-VL-2B)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, pyav, decord|vision, video|[Qwen/Qwen2-VL-2B](https://huggingface.co/Qwen/Qwen2-VL-2B)|
+|[Qwen/Qwen2-VL-7B](https://modelscope.cn/models/Qwen/Qwen2-VL-7B)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, pyav, decord|vision, video|[Qwen/Qwen2-VL-7B](https://huggingface.co/Qwen/Qwen2-VL-7B)|
+|[Qwen/Qwen2-VL-72B](https://modelscope.cn/models/Qwen/Qwen2-VL-72B)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, pyav, decord|vision, video|[Qwen/Qwen2-VL-72B](https://huggingface.co/Qwen/Qwen2-VL-72B)|
+|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4](https://modelscope.cn/models/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, pyav, decord|vision, video|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4)|
+|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4](https://modelscope.cn/models/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, pyav, decord|vision, video|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4)|
+|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4](https://modelscope.cn/models/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, pyav, decord|vision, video|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4)|
+|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8](https://modelscope.cn/models/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, pyav, decord|vision, video|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8)|
+|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8](https://modelscope.cn/models/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, pyav, decord|vision, video|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8)|
+|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8](https://modelscope.cn/models/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, pyav, decord|vision, video|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8)|
+|[Qwen/Qwen2-VL-2B-Instruct-AWQ](https://modelscope.cn/models/Qwen/Qwen2-VL-2B-Instruct-AWQ)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, pyav, decord|vision, video|[Qwen/Qwen2-VL-2B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-AWQ)|
+|[Qwen/Qwen2-VL-7B-Instruct-AWQ](https://modelscope.cn/models/Qwen/Qwen2-VL-7B-Instruct-AWQ)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, pyav, decord|vision, video|[Qwen/Qwen2-VL-7B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-AWQ)|
+|[Qwen/Qwen2-VL-72B-Instruct-AWQ](https://modelscope.cn/models/Qwen/Qwen2-VL-72B-Instruct-AWQ)|qwen2_vl|qwen2_vl|transformers>=4.45, qwen_vl_utils>=0.0.6, pyav, decord|vision, video|[Qwen/Qwen2-VL-72B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-AWQ)|
 |[Qwen/Qwen2-Audio-7B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-Audio-7B-Instruct)|qwen2_audio|qwen2_audio|transformers>=4.45, librosa|audio|[Qwen/Qwen2-Audio-7B-Instruct](https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct)|
 |[Qwen/Qwen2-Audio-7B](https://modelscope.cn/models/Qwen/Qwen2-Audio-7B)|qwen2_audio|qwen2_audio|transformers>=4.45, librosa|audio|[Qwen/Qwen2-Audio-7B](https://huggingface.co/Qwen/Qwen2-Audio-7B)|
-|[Qwen/QVQ-72B-Preview](https://modelscope.cn/models/Qwen/QVQ-72B-Preview)|qvq|qvq|transformers>=4.45, qwen_vl_utils, pyav, decord|vision, video|[Qwen/QVQ-72B-Preview](https://huggingface.co/Qwen/QVQ-72B-Preview)|
+|[Qwen/QVQ-72B-Preview](https://modelscope.cn/models/Qwen/QVQ-72B-Preview)|qvq|qvq|transformers>=4.45, qwen_vl_utils>=0.0.6, pyav, decord|vision, video|[Qwen/QVQ-72B-Preview](https://huggingface.co/Qwen/QVQ-72B-Preview)|
 |[AIDC-AI/Ovis1.6-Gemma2-9B](https://modelscope.cn/models/AIDC-AI/Ovis1.6-Gemma2-9B)|ovis1_6|ovis1_6|transformers>=4.42|vision|[AIDC-AI/Ovis1.6-Gemma2-9B](https://huggingface.co/AIDC-AI/Ovis1.6-Gemma2-9B)|
 |[ZhipuAI/glm-4v-9b](https://modelscope.cn/models/ZhipuAI/glm-4v-9b)|glm4v|glm4v|transformers>=4.42|-|[THUDM/glm-4v-9b](https://huggingface.co/THUDM/glm-4v-9b)|
 |[ZhipuAI/glm-edge-v-2b](https://modelscope.cn/models/ZhipuAI/glm-edge-v-2b)|glm_edge_v|glm_edge_v|transformers>=4.46|vision|[THUDM/glm-edge-v-2b](https://huggingface.co/THUDM/glm-edge-v-2b)|
diff --git a/swift/llm/__init__.py b/swift/llm/__init__.py
index 2173ce93b..8e8145b6a 100644
--- a/swift/llm/__init__.py
+++ b/swift/llm/__init__.py
@@ -14,7 +14,7 @@
                            RLHFArguments, WebUIArguments, BaseArguments)
     from .template import (TEMPLATE_MAPPING, Template, Word, get_template, TemplateType, register_template,
                            TemplateInputs, Messages, TemplateMeta, get_template_meta, InferRequest, load_image,
-                           MaxLengthError)
+                           MaxLengthError, load_file)
     from .model import (register_model, MODEL_MAPPING, ModelType, get_model_tokenizer, safe_snapshot_download,
                         HfConfigFactory, ModelInfo, ModelMeta, ModelKeys, register_model_arch, MultiModelKeys,
                         ModelArch, get_model_arch, MODEL_ARCH_MAPPING, get_model_info_meta, get_model_name, ModelGroup,
@@ -45,7 +45,7 @@
     'template': [
         'TEMPLATE_MAPPING', 'Template', 'Word', 'get_template', 'TemplateType', 'register_template',
         'TemplateInputs', 'Messages', 'TemplateMeta', 'get_template_meta', 'InferRequest', 'load_image',
-        'MaxLengthError'
+        'MaxLengthError', 'load_file'
     ],
     'model': [
         'MODEL_MAPPING', 'ModelType', 'get_model_tokenizer', 'safe_snapshot_download', 'HfConfigFactory',
diff --git a/swift/llm/model/model/qwen.py b/swift/llm/model/model/qwen.py
index 66069df3c..80ef247aa 100644
--- a/swift/llm/model/model/qwen.py
+++ b/swift/llm/model/model/qwen.py
@@ -474,12 +474,24 @@ def _get_cast_dtype(self) -> torch.dtype:
 
 def patch_qwen_vl_utils():
     from qwen_vl_utils import vision_process
+    if hasattr(vision_process, '_patch'):
+        return
     for key in [
             'image_factor', 'min_pixels', 'max_pixels', 'max_ratio', 'video_min_pixels', 'video_max_pixels',
             'video_total_pixels', 'frame_factor', 'fps', 'fps_min_frames', 'fps_max_frames'
     ]:
         type_func = float if key == 'fps' else int
         setattr(vision_process, key.upper(), get_env_args(key, type_func, getattr(vision_process, key.upper())))
+    from qwen_vl_utils import vision_process
+    _read_video_decord = vision_process._read_video_decord
+
+    def _new_read_video_decord(ele: dict):
+        from swift.llm import load_file
+        ele['video'] = load_file(ele['video'])
+        return _read_video_decord(ele)
+
+    vision_process.VIDEO_READER_BACKENDS['decord'] = _new_read_video_decord
+    vision_process._patch = True
 
 
 def get_model_tokenizer_qwen2_vl(model_dir: str,
@@ -530,7 +542,7 @@ def get_model_tokenizer_qwen2_vl(model_dir: str,
         get_model_tokenizer_qwen2_vl,
         model_arch=ModelArch.qwen2_vl,
         architectures=['Qwen2VLForConditionalGeneration'],
-        requires=['transformers>=4.45', 'qwen_vl_utils', 'pyav', 'decord'],
+        requires=['transformers>=4.45', 'qwen_vl_utils>=0.0.6', 'pyav', 'decord'],
         tags=['vision', 'video']))
 
 register_model(
@@ -544,7 +556,7 @@ def get_model_tokenizer_qwen2_vl(model_dir: str,
         get_model_tokenizer_qwen2_vl,
         model_arch=ModelArch.qwen2_vl,
         architectures=['Qwen2VLForConditionalGeneration'],
-        requires=['transformers>=4.45', 'qwen_vl_utils', 'pyav', 'decord'],
+        requires=['transformers>=4.45', 'qwen_vl_utils>=0.0.6', 'pyav', 'decord'],
         tags=['vision', 'video']))
 
 
diff --git a/swift/llm/template/__init__.py b/swift/llm/template/__init__.py
index 52f99f0f1..f650558b1 100644
--- a/swift/llm/template/__init__.py
+++ b/swift/llm/template/__init__.py
@@ -6,4 +6,4 @@
 from .template_inputs import InferRequest, Messages, TemplateInputs, Tool
 from .template_meta import TemplateMeta
 from .utils import Word, split_action_action_input, split_parts_by_regex, split_str_parts_by
-from .vision_utils import load_image
+from .vision_utils import load_file, load_image
diff --git a/swift/llm/template/template/qwen.py b/swift/llm/template/template/qwen.py
index 9c9e3becb..0ec5469e1 100644
--- a/swift/llm/template/template/qwen.py
+++ b/swift/llm/template/template/qwen.py
@@ -12,7 +12,7 @@
 from ..template_inputs import StdTemplateInputs
 from ..template_meta import TemplateMeta
 from ..utils import Context, Word, findall
-from ..vision_utils import load_audio_qwen, load_batch
+from ..vision_utils import load_audio_qwen, load_batch, load_file
 from .utils import DEFAULT_SYSTEM, ChatmlTemplateMeta
 
 
@@ -145,11 +145,6 @@ def _data_collator(self, batch: List[Dict[str, Any]], *, padding_to: Optional[in
 register_template(QwenTemplateMeta(MLLMTemplateType.qwen2_audio, template_cls=Qwen2AudioTemplate))
 
 
-def _process_image_qwen(image):
-
-    return image
-
-
 class Qwen2VLTemplate(Template):
     image_token_id = 151655
     video_token_id = 151656
@@ -162,7 +157,7 @@ def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int
             inputs.images[index] = fetch_image({'image': inputs.images[index]})
             return ['<|vision_start|><|image_pad|><|vision_end|>']
         else:
-            inputs.videos[index] = fetch_video({'video': inputs.videos[index]})
+            inputs.videos[index] = fetch_video({'video': inputs.videos[index]}).to(torch.uint8)
             return ['<|vision_start|><|video_pad|><|vision_end|>']
 
     def replace_object(self, object_: Dict[str, Any], index: int, inputs: StdTemplateInputs) -> List[Context]:
@@ -198,12 +193,10 @@ def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
             if locals()[media_type]:
                 if media_type == 'images':
                     media_token = self.image_token_id
-                    media_inputs = processor.image_processor(
-                        images=images, videos=None, return_tensors='pt', do_rescale=False)
+                    media_inputs = processor.image_processor(images=images, videos=None, return_tensors='pt')
                     media_grid_thw = media_inputs['image_grid_thw']
                 else:
-                    media_inputs = processor.image_processor(
-                        images=None, videos=videos, return_tensors='pt', do_rescale=False)
+                    media_inputs = processor.image_processor(images=None, videos=videos, return_tensors='pt')
                     media_grid_thw = media_inputs['video_grid_thw']
                     media_token = self.video_token_id
                 idx_list = findall(input_ids, media_token)
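Note: a minimal sketch of the preprocessing path after the `template/qwen.py` change above — frames now enter as raw `uint8` (as `fetch_video(...).to(torch.uint8)` produces), so the image processor's default rescaling applies instead of `do_rescale=False`. The direct `image_processor(images=None, videos=..., return_tensors='pt')` call mirrors the patched code; the random frames are a dummy stand-in for decoded video, and the checkpoint id is just one of the Qwen2-VL models listed in the tables above (loading it downloads the processor config).

```python
import torch
from transformers import AutoProcessor

# Load the Qwen2-VL processor (transformers>=4.45, per the requires field above).
processor = AutoProcessor.from_pretrained('Qwen/Qwen2-VL-2B-Instruct')

# Dummy (T, C, H, W) uint8 frames standing in for fetch_video(...).to(torch.uint8);
# the frame count is even to match the temporal patching of Qwen2-VL.
video = torch.randint(0, 256, (8, 3, 224, 224), dtype=torch.uint8)

# Raw uint8 pixels go in; the processor's default do_rescale (1/255) applies,
# so the explicit do_rescale=False of the old code is no longer needed.
media_inputs = processor.image_processor(images=None, videos=[video], return_tensors='pt')
print(media_inputs['video_grid_thw'])  # temporal/height/width patch grid per video
```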