support qwen1.5 megatron #1564

Merged on Aug 6, 2024 (35 commits)

Commits
9a69546  update (Jintao-Huang, Aug 1, 2024)
bddd3e2  Merge branch 'main' into megatron_model (Jintao-Huang, Aug 1, 2024)
3953146  update (Jintao-Huang, Aug 1, 2024)
8cc7660  update (Jintao-Huang, Aug 1, 2024)
20f8e32  update (Jintao-Huang, Aug 1, 2024)
2c54114  update (Jintao-Huang, Aug 1, 2024)
3bc2c66  update (Jintao-Huang, Aug 1, 2024)
12308c6  Merge branch 'main' into megatron_model (Jintao-Huang, Aug 1, 2024)
73db1d7  Merge branch 'main' into megatron_model (Jintao-Huang, Aug 2, 2024)
80deef2  update (Jintao-Huang, Aug 2, 2024)
49f4629  Merge branch 'main' into megatron_model (Jintao-Huang, Aug 2, 2024)
f8a505d  support megatron (Jintao-Huang, Aug 2, 2024)
057e95f  update docs (Jintao-Huang, Aug 3, 2024)
eb8c7bd  support qwen1.5 (Jintao-Huang, Aug 3, 2024)
fec2685  support chat (Jintao-Huang, Aug 4, 2024)
0d20b2d  support deploy info (Jintao-Huang, Aug 4, 2024)
5ff020e  fix lmdeploy bug (Jintao-Huang, Aug 4, 2024)
6c2173e  Merge branch 'main' into megatron_model (Jintao-Huang, Aug 5, 2024)
4a0e29e  Merge branch 'main' into megatron_model (Jintao-Huang, Aug 5, 2024)
20e02ef  Merge branch 'main' into megatron_model (Jintao-Huang, Aug 5, 2024)
f4d484d  update (Jintao-Huang, Aug 5, 2024)
3362738  update (Jintao-Huang, Aug 5, 2024)
05f7f86  lint pass (Jintao-Huang, Aug 5, 2024)
325b919  update (Jintao-Huang, Aug 5, 2024)
f5a0251  update (Jintao-Huang, Aug 5, 2024)
b46dc66  support qwen1.5 (Jintao-Huang, Aug 5, 2024)
b606952  update docs (Jintao-Huang, Aug 5, 2024)
0b18a54  fix bugs (Jintao-Huang, Aug 5, 2024)
8b093d0  update (Jintao-Huang, Aug 5, 2024)
d77f148  update (Jintao-Huang, Aug 6, 2024)
d9cb8e0  Merge branch 'main' into megatron_model (Jintao-Huang, Aug 6, 2024)
a50d001  update (Jintao-Huang, Aug 6, 2024)
46e0c5c  update (Jintao-Huang, Aug 6, 2024)
42d5fd4  update (Jintao-Huang, Aug 6, 2024)
d40603e  update docs (Jintao-Huang, Aug 6, 2024)

Files changed
2 changes: 1 addition & 1 deletion docs/source/LLM/Megatron训练文档.md
@@ -1,4 +1,4 @@
-# Megatron训练文档 (测试版)
+# Megatron训练文档
 
 ## 目录
 - [环境准备](#环境准备)
776 changes: 388 additions & 388 deletions docs/source/LLM/支持的模型和数据集.md

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions docs/source/LLM/自定义与拓展.md
@@ -64,21 +64,21 @@ system,query,response
 ```jsonl
 {"system": "00000", "query": "11111", "response": "22222"}
 {"query": "aaaaa", "response": "bbbbb"}
-{"query": "AAAAA", "response": "BBBBB"}
+{"system": "00001", "query": "AAAAA", "response": "BBBBB"}
 ```
 
 多轮对话:
 
 ```jsonl
 {"system": "00000", "query": "55555", "response": "66666"}
 {"query": "eeeee", "response": "fffff", "history": []}
-{"query": "EEEEE", "response": "FFFFF", "history": [["query1", "response1"], ["query2", "response2"]]}
+{"system": "00001", "query": "EEEEE", "response": "FFFFF", "history": [["query1", "response1"], ["query2", "response2"]]}
 ```
 
 ```json
 [{"system": "00000", "query": "55555", "response": "66666"},
 {"query": "eeeee", "response": "fffff", "history": []},
-{"query": "EEEEE", "response": "FFFFF", "history": [["query1", "response1"], ["query2", "response2"]]}]
+{"system": "00001", "query": "EEEEE", "response": "FFFFF", "history": [["query1", "response1"], ["query2", "response2"]]}]
 ```
 
 **格式2:**
@@ -102,7 +102,7 @@ system,query,response
 ```jsonl
 {"system": "00000", "conversation": [{"human": "11111", "assistant": "22222"}]}
 {"conversation": [{"human": "aaaaa", "assistant": "bbbbb"}]}
-{"conversation": [{"human": "AAAAA", "assistant": "BBBBB"}, {"human": "CCCCC", "assistant": "DDDDD"}, {"human": "EEEEE", "assistant": "FFFFF"}]}
+{"system": "00001", "conversation": [{"human": "AAAAA", "assistant": "BBBBB"}, {"human": "CCCCC", "assistant": "DDDDD"}, {"human": "EEEEE", "assistant": "FFFFF"}]}
 ```
 
 **格式5:**
8 changes: 4 additions & 4 deletions docs/source_en/LLM/Customization.md
@@ -58,21 +58,21 @@ system,query,response
 ```jsonl
 {"system": "00000", "query": "11111", "response": "22222"}
 {"query": "aaaaa", "response": "bbbbb"}
-{"query": "AAAAA", "response": "BBBBB"}
+{"system": "00001", "query": "AAAAA", "response": "BBBBB"}
 ```
 
 Multi-Round Dialogue
 
 ```jsonl
 {"system": "00000", "query": "55555", "response": "66666"}
 {"query": "eeeee", "response": "fffff", "history": []}
-{"query": "EEEEE", "response": "FFFFF", "history": [["query1", "response1"], ["query2", "response2"]]}
+{"system": "00001", "query": "EEEEE", "response": "FFFFF", "history": [["query1", "response1"], ["query2", "response2"]]}
 ```
 
 ```json
 [{"system": "00000", "query": "55555", "response": "66666"},
 {"query": "eeeee", "response": "fffff", "history": []},
-{"query": "EEEEE", "response": "FFFFF", "history": [["query1", "response1"], ["query2", "response2"]]}]
+{"system": "00001", "query": "EEEEE", "response": "FFFFF", "history": [["query1", "response1"], ["query2", "response2"]]}]
 ```
 
 **Format 2:**
@@ -96,7 +96,7 @@ Multi-Round Dialogue
 ```jsonl
 {"system": "00000", "conversation": [{"human": "11111", "assistant": "22222"}]}
 {"conversation": [{"human": "aaaaa", "assistant": "bbbbb"}]}
-{"conversation": [{"human": "AAAAA", "assistant": "BBBBB"}, {"human": "CCCCC", "assistant": "DDDDD"}, {"human": "EEEEE", "assistant": "FFFFF"}]}
+{"system": "00001", "conversation": [{"human": "AAAAA", "assistant": "BBBBB"}, {"human": "CCCCC", "assistant": "DDDDD"}, {"human": "EEEEE", "assistant": "FFFFF"}]}
 ```
 
 **Format 5:**
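
The edited samples above show that `system` can now be set per row: a row that includes it uses its own value, and a row that omits it falls back to a default. A minimal reading sketch of that convention (the helper names `read_samples` and `DEFAULT_SYSTEM` are hypothetical, and the fallback behavior is an assumption, not code from this PR):

```python
import json

DEFAULT_SYSTEM = 'You are a helpful assistant.'  # hypothetical default

def read_samples(path):
    """Yield jsonl samples, filling in assumed per-row defaults."""
    with open(path, encoding='utf-8') as f:
        for line in f:
            sample = json.loads(line)
            sample.setdefault('system', DEFAULT_SYSTEM)  # per-row value wins
            sample.setdefault('history', [])             # single-turn if absent
            yield sample
```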
2 changes: 1 addition & 1 deletion docs/source_en/LLM/Megatron-training.md
@@ -1,4 +1,4 @@
-# Megatron Training Documentation (Beta)
+# Megatron Training Documentation
 
 ## Table of Contents
 - [Environment Preparation](#Environment-Preparation)
776 changes: 388 additions & 388 deletions docs/source_en/LLM/Supported-models-datasets.md

Large diffs are not rendered by default.

18 changes: 10 additions & 8 deletions scripts/utils/run_model_info.py
@@ -9,9 +9,9 @@ def get_model_info_table():
     model_name_list = ModelType.get_model_name_list()
     result = [
         '| Model Type | Model ID | Default Lora Target Modules | Default Template |'
-        ' Support Flash Attn | Support vLLM | Support LMDeploy | Requires | Tags | HF Model ID |\n'
+        ' Support Flash Attn | Support vLLM | Support LMDeploy | Support Megatron | Requires | Tags | HF Model ID |\n'
         '| --------- | -------- | --------------------------- | ---------------- |'
-        ' ------------------ | ------------ | ----------------- | -------- | ---- | ----------- |\n'
+        ' ------------------ | ------------ | ---------------- | ---------------- | -------- | ---- | ----------- |\n'
     ] * 2
     res_llm: List[Any] = []
     res_mllm: List[Any] = []
@@ -31,6 +31,8 @@ def get_model_info_table():
         support_vllm = bool_mapping[support_vllm]
         support_lmdeploy = model_info.get('support_lmdeploy', False)
         support_lmdeploy = bool_mapping[support_lmdeploy]
+        support_megatron = model_info.get('support_megatron', False)
+        support_megatron = bool_mapping[support_megatron]
         requires = ', '.join(model_info['requires'])
         tags = model_info.get('tags', [])
         if 'multi-modal' in tags:
@@ -46,7 +48,7 @@ def get_model_info_table():
             hf_model_id = '-'
         r = [
            model_name, model_id, lora_target_modules, template, support_flash_attn, support_vllm, support_lmdeploy,
-           requires, tags_str, hf_model_id
+           support_megatron, requires, tags_str, hf_model_id
         ]
         if is_multi_modal:
             res_mllm.append(r)
@@ -57,13 +59,13 @@ def get_model_info_table():
     for i, res in enumerate([res_llm, res_mllm]):
         for r in res:
             ms_url = f'https://modelscope.cn/models/{r[1]}/summary'
-            if r[9] != '-':
-                hf_url = f'https://huggingface.co/{r[9]}'
-                hf_model_id_str = f'[{r[9]}]({hf_url})'
+            if r[10] != '-':
+                hf_url = f'https://huggingface.co/{r[10]}'
+                hf_model_id_str = f'[{r[10]}]({hf_url})'
             else:
                 hf_model_id_str = '-'
-            text[i] += (
-                f'|{r[0]}|[{r[1]}]({ms_url})|{r[2]}|{r[3]}|{r[4]}|{r[5]}|{r[6]}|{r[7]}|{r[8]}|{hf_model_id_str}|\n')
+            text[i] += (f'|{r[0]}|[{r[1]}]({ms_url})|{r[2]}|{r[3]}|{r[4]}|{r[5]}|{r[6]}|{r[7]}|{r[8]}'
+                        f'|{r[9]}|{hf_model_id_str}|\n')
         result[i] += text[i]
 
     for i, fpath in enumerate(fpaths):
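
For reference, a sketch of the row layout after this change (the values are made up; only the index positions mirror the code above). Inserting `support_megatron` at index 7 shifts `requires`, `tags`, and the HF model ID one slot to the right, which is why the link check now reads `r[10]` instead of `r[9]`:

```python
row = [
    'qwen1half-7b-chat',       # r[0]  model_name (illustrative)
    'qwen/Qwen1.5-7B-Chat',    # r[1]  model_id (ModelScope)
    'q_proj, k_proj, v_proj',  # r[2]  lora_target_modules
    'qwen',                    # r[3]  default template
    '✔',                       # r[4]  support_flash_attn
    '✔',                       # r[5]  support_vllm
    '✔',                       # r[6]  support_lmdeploy
    '✔',                       # r[7]  support_megatron (the new column)
    '-',                       # r[8]  requires
    '-',                       # r[9]  tags
    'Qwen/Qwen1.5-7B-Chat',    # r[10] hf_model_id, previously at r[9]
]
assert row[10] != '-'  # the HF link branch now checks index 10
```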
2 changes: 1 addition & 1 deletion swift/llm/export.py
@@ -298,7 +298,7 @@ def llm_export(args: ExportArguments) -> None:
         megatron_args = MegatronArguments(**res)
         extra_args = megatron_args.parse_to_megatron()
         patch_megatron(tokenizer)
-        convert_hf_to_megatron(model, extra_args, args.check_model_forward, args.torch_dtype)
+        convert_hf_to_megatron(model, extra_args, args.torch_dtype)
         fpath = os.path.join(args.megatron_output_dir, 'export_args.json')
         with open(fpath, 'w', encoding='utf-8') as f:
             json.dump(check_json_format(args.__dict__), f, ensure_ascii=False, indent=2)
1 change: 0 additions & 1 deletion swift/llm/megatron/argument.py
@@ -86,7 +86,6 @@ class MegatronMixin:
     tensor_model_parallel_size: int = 1
     pipeline_model_parallel_size: int = 1
     seed: int = 42
-    transformer_impl: str = 'transformer_engine'
     sequence_parallel: bool = False
 
     apply_query_key_layer_scaling: bool = False  # fp16
5 changes: 0 additions & 5 deletions swift/llm/megatron/convert.py
@@ -11,7 +11,6 @@
 def convert_hf_to_megatron(
     hf_model,
     extra_args: Dict[str, Any],
-    check_model_forward: bool = False,
     save_torch_dtype: Optional[torch.dtype] = None,
 ) -> None:
     from megatron.training.initialize import initialize_megatron
@@ -24,10 +23,6 @@ def convert_hf_to_megatron(
     convert_module.convert_checkpoint_from_transformers_to_megatron(hf_model, mg_model, args)
     if save_torch_dtype is not None:
         mg_model.to(save_torch_dtype)
-    if check_model_forward and hasattr(convert_module, 'check_hf_mg_forward'):
-        if save_torch_dtype is not None:
-            hf_model.to(save_torch_dtype)
-        convert_module.check_hf_mg_forward(hf_model, mg_model, args)
     convert_module.save_mgmodel(mg_model, args)
108 changes: 65 additions & 43 deletions swift/llm/megatron/model.py
@@ -6,63 +6,85 @@
 MEGATRON_MODEL_MAPPING = {}
 
 
-def register_megatron_model(model_type_list: List[str], convert_module: str, get_function: Optional[Callable] = None):
-    model_info = {'convert_module': convert_module}
+def register_megatron_model(
+        model_type_list: List[str],
+        convert_module: str,
+        model_module: str,  # GPTModel
+        config_cls,  # transformer_config_cls
+        get_function: Optional[Callable] = None):
+    megatron_model_info = {
+        'convert_module': convert_module,
+        'model_module': model_module,
+        'config_cls': config_cls,
+    }
+    res_model_type_list = []
+    for model_type in model_type_list:
+        model_info = MODEL_MAPPING[model_type]
+        support_megatron = model_info.get('support_megatron', False)
+        if support_megatron:
+            res_model_type_list.append(model_type)
+    model_type_list = res_model_type_list
 
     if get_function is not None:
-        model_info['get_function'] = get_function
+        megatron_model_info['get_function'] = get_function
         for model_type in model_type_list:
-            MEGATRON_MODEL_MAPPING[model_type] = model_info
+            MEGATRON_MODEL_MAPPING[model_type] = megatron_model_info
         return
 
     def _register_model(get_function: Callable) -> Callable:
-        model_info['get_function'] = get_function
+        megatron_model_info['get_function'] = get_function
         for model_type in model_type_list:
-            MEGATRON_MODEL_MAPPING[model_type] = model_info
+            MEGATRON_MODEL_MAPPING[model_type] = megatron_model_info
         return get_function
 
     return _register_model
 
 
+qwen1half_model_type = [model_type for model_type in MODEL_MAPPING.keys() if model_type.startswith('qwen1half')]
+
+
+@register_megatron_model(
+    [model_type for model_type in qwen1half_model_type if ('32b' not in model_type and '110b' not in model_type)],
+    'qwen.hf2mcore_qwen1_5_dense_mha', 'qwen1_5', 'QwenTransformerConfig')
 @register_megatron_model([model_type for model_type in MODEL_MAPPING.keys() if model_type.startswith('qwen2')],
-                         'qwen.hf2mcore_qwen2_dense_and_moe_gqa')
-def get_qwen2_model(pre_process=True, post_process=True):
-    from megatron.training import get_args
-    from megatron.training.arguments import core_transformer_config_from_args
-    from megatron_patch.model.qwen2.transformer_config import Qwen2TransformerConfig
-    from megatron_patch.model.qwen2.layer_specs import (get_gpt_layer_local_spec,
-                                                        get_gpt_layer_with_transformer_engine_spec)
-    from megatron_patch.model.qwen2.model import GPTModel
-
-    args = get_args()
-    config = core_transformer_config_from_args(args, Qwen2TransformerConfig)
-    use_te = args.transformer_impl == 'transformer_engine'
-
-    if use_te:
-        transformer_layer_spec = get_gpt_layer_with_transformer_engine_spec(args.num_experts, args.moe_grouped_gemm,
-                                                                            args.qk_layernorm)
-    else:
-        transformer_layer_spec = get_gpt_layer_local_spec(args.num_experts, args.moe_grouped_gemm, args.qk_layernorm)
-
-    model = GPTModel(
-        config=config,
-        transformer_layer_spec=transformer_layer_spec,
-        vocab_size=args.padded_vocab_size,
-        max_sequence_length=args.max_position_embeddings,
-        pre_process=pre_process,
-        post_process=post_process,
-        fp16_lm_cross_entropy=args.fp16_lm_cross_entropy,
-        parallel_output=True,
-        share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights,
-        position_embedding_type=args.position_embedding_type,
-        rotary_percent=args.rotary_percent,
-        rotary_base=args.rotary_base,
-        seq_len_interpolation_factor=args.rotary_seq_len_interpolation_factor)
-    return model
+                         'qwen.hf2mcore_qwen2_dense_and_moe_gqa', 'qwen2', 'Qwen2TransformerConfig')
+def get_model_provider(gpt_model_cls, transformer_config_cls, layer_spec_module):
+
+    def model_provider(pre_process=True, post_process=True):
+        from megatron.training import get_args
+        from megatron.training.arguments import core_transformer_config_from_args
+        args = get_args()
+        config = core_transformer_config_from_args(args, transformer_config_cls)
+        transformer_layer_spec = layer_spec_module.get_gpt_layer_with_transformer_engine_spec(
+            args.num_experts, args.moe_grouped_gemm, args.qk_layernorm)
+        model = gpt_model_cls(
+            config=config,
+            transformer_layer_spec=transformer_layer_spec,
+            vocab_size=args.padded_vocab_size,
+            max_sequence_length=args.max_position_embeddings,
+            pre_process=pre_process,
+            post_process=post_process,
+            fp16_lm_cross_entropy=args.fp16_lm_cross_entropy,
+            parallel_output=True,
+            share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights,
+            position_embedding_type=args.position_embedding_type,
+            rotary_percent=args.rotary_percent,
+            rotary_base=args.rotary_base,
+            seq_len_interpolation_factor=args.rotary_seq_len_interpolation_factor)
+        return model
+
+    return model_provider
 
 
 def get_megatron_model_convert(model_type: str):
     model_info = MEGATRON_MODEL_MAPPING[model_type]
-    model_provider = model_info['get_function']
-    convert_module = model_info['convert_module']
-    convert_module = importlib.import_module(convert_module)
+    model_module = model_info['model_module']
+    config_cls = model_info['config_cls']
+
+    gpt_model_cls = importlib.import_module(f'megatron_patch.model.{model_module}.model').GPTModel
+    transformer_config_cls = getattr(
+        importlib.import_module(f'megatron_patch.model.{model_module}.transformer_config'), config_cls)
+    layer_spec_module = importlib.import_module(f'megatron_patch.model.{model_module}.layer_specs')
+    model_provider = model_info['get_function'](gpt_model_cls, transformer_config_cls, layer_spec_module)
+    convert_module = importlib.import_module(f"toolkits.model_checkpoints_convertor.{model_info['convert_module']}")
     return model_provider, convert_module
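
A usage sketch of the refactored registry (the model_type below is illustrative; any type registered with `support_megatron=True` resolves the same way, and the Pai-Megatron-Patch checkout must already be on `sys.path`):

```python
from swift.llm.megatron.utils import init_megatron_env
from swift.llm.megatron.model import get_megatron_model_convert

init_megatron_env()  # checks out Pai-Megatron-Patch and normalizes file names
model_provider, convert_module = get_megatron_model_convert('qwen1half-7b-chat')
# model_provider(pre_process=True, post_process=True) builds the Megatron
# GPTModel; it reads its hyperparameters from megatron.training.get_args(),
# so Megatron must be initialized before it is called. convert_module is the
# matching toolkits.model_checkpoints_convertor module used during export.
```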
30 changes: 22 additions & 8 deletions swift/llm/megatron/utils.py
@@ -1,8 +1,9 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import os
+import shutil
 import sys
 from functools import partial, wraps
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Mapping, Optional
 
 import torch
 import torch.distributed as dist
@@ -29,7 +30,16 @@ def init_megatron_env() -> None:
         'https://github.com/alibaba/Pai-Megatron-Patch', commit_hash='6fd5d050b240fd959f0ba69f1e9cd9a053e5a81d')
     os.environ['PAI_MEGATRON_PATCH_PATH'] = megatron_patch_path
     sys.path.append(os.environ['PAI_MEGATRON_PATCH_PATH'])
-    sys.path.append(os.path.join(megatron_patch_path, 'toolkits/model_checkpoints_convertor'))
+
+    # rename qwen1.5 -> qwen1_5 files
+    qwen1_5_folders = ['toolkits/model_checkpoints_convertor/qwen']
+    for folder in qwen1_5_folders:
+        dir_path = os.path.join(megatron_patch_path, folder)
+        for fname in os.listdir(dir_path):
+            old_path = os.path.join(dir_path, fname)
+            new_path = os.path.join(dir_path, fname.replace('qwen1.', 'qwen1_'))
+            if old_path != new_path:
+                shutil.move(old_path, new_path)
 
 
 def patch_megatron(tokenizer):
@@ -53,14 +63,18 @@ def _initialize_distributed(*_args, **kwargs):
 
     initialize._initialize_distributed = _initialize_distributed
 
-    _old_load_checkpoint = training.load_checkpoint
+    _old_load_state_dict = torch.nn.Module.load_state_dict
 
-    @wraps(_old_load_checkpoint)
-    def load_checkpoint(model, optimizer, opt_param_scheduler, load_arg='load', strict=False):
-        # default: strict=False
-        return _old_load_checkpoint(model, optimizer, opt_param_scheduler, load_arg=load_arg, strict=strict)
+    @wraps(_old_load_state_dict)
+    def _load_state_dict(self, state_dict: Mapping[str, Any], strict: bool = True, *args, **kwargs):
+        if strict:
+            keys = self.state_dict().keys() ^ state_dict.keys()
+            new_keys = [k for k in keys if not k.endswith('_extra_state')]
+            if keys and not new_keys:
+                strict = False
+        return _old_load_state_dict(self, state_dict, strict, *args, **kwargs)
 
-    training.load_checkpoint = load_checkpoint
+    torch.nn.Module.load_state_dict = _load_state_dict
 
     _old_training_log = training.training_log
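
The patched `load_state_dict` only relaxes `strict` when every mismatched key is a TransformerEngine `_extra_state` buffer; a genuine weight mismatch still loads strictly. A self-contained sketch of that rule (`should_relax_strict` is a hypothetical name; the set logic is the same as above):

```python
def should_relax_strict(model_keys, ckpt_keys) -> bool:
    # Symmetric difference: keys present on one side but not the other.
    mismatched = set(model_keys) ^ set(ckpt_keys)
    # TransformerEngine '_extra_state' buffers are safe to ignore.
    real = [k for k in mismatched if not k.endswith('_extra_state')]
    return bool(mismatched) and not real

assert should_relax_strict(['w', 'attn._extra_state'], ['w'])  # only extra_state differs
assert not should_relax_strict(['w'], ['b'])                   # real mismatch stays strict
```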
2 changes: 1 addition & 1 deletion swift/llm/sft.py
@@ -78,10 +78,10 @@ def llm_sft_megatron(args: SftArguments) -> Dict[str, Any]:
 
     res = MegatronArguments.load_megatron_config(tokenizer.model_dir)
     res.update(MegatronArguments.from_sft_args(args, train_dataset, val_dataset))
+    model_provider, _ = get_megatron_model_convert(args.model_type)
     megatron_args = MegatronArguments(**res)
     extra_args = megatron_args.parse_to_megatron()
 
-    model_provider, _ = get_megatron_model_convert(args.model_type)
     train_valid_test_datasets_provider = partial(
         _train_valid_test_datasets_provider, train_dataset=train_dataset, val_dataset=val_dataset, template=template)
     train_valid_test_datasets_provider.is_distributed = True
1 change: 0 additions & 1 deletion swift/llm/utils/argument.py
@@ -1522,7 +1522,6 @@ class ExportArguments(InferArguments):
     hf_output_dir: Optional[str] = None
     tp: int = 1
     pp: int = 1
-    check_model_forward: bool = False
 
     # The parameter has been defined in InferArguments.
     # merge_lora, hub_token