This repository has been archived by the owner on Aug 30, 2024. It is now read-only.

Commit

enable load model from modelscope
Signed-off-by: intellinjun <[email protected]>
intellinjun committed Mar 6, 2024
1 parent aa4a8ab commit 51fb111
Showing 18 changed files with 108 additions and 41 deletions.

neural_speed/__init__.py (12 changes: 8 additions & 4 deletions)
@@ -66,7 +66,7 @@ def __import_package(self, model_type):
import neural_speed.qwen_cpp as cpp_model
elif model_type == "mistral":
import neural_speed.mistral_cpp as cpp_model
elif model_type == "qwen":
elif model_type == "qwen2":
import neural_speed.qwen_cpp as cpp_model
elif model_type == "phi":
import neural_speed.phi_cpp as cpp_model
@@ -87,8 +87,12 @@ def get_model_type(model_config):

def init(self, model_name, use_quant=True, use_gptq=False, use_awq=False, use_autoround=False,
weight_dtype="int4", alg="sym", group_size=32,
scale_dtype="fp32", compute_dtype="int8", use_ggml=False):
self.config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
scale_dtype="fp32", compute_dtype="int8", use_ggml=False, model_hub="huggingface"):
if model_hub == "modelscope":
from modelscope import AutoConfig
self.config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
else:
self.config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
model_type = Model.get_model_type(self.config)
self.model_type = model_type
self.__import_package(model_type)
@@ -129,7 +133,7 @@ def init(self, model_name, use_quant=True, use_gptq=False, use_awq=False, use_au
return

if not os.path.exists(fp32_bin):
-convert_model(model_name, fp32_bin, "f32")
+convert_model(model_name, fp32_bin, "f32", model_hub = model_hub)
assert os.path.exists(fp32_bin), "Fail to convert pytorch model"

if not use_quant:
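
With this change, callers choose the download hub when initializing a model. A minimal usage sketch, assuming the optional modelscope package is installed (e.g. pip install modelscope); the model id below is only an illustration:

    from neural_speed import Model

    model = Model()
    # model_hub="modelscope" makes init() fetch the config and weights from
    # ModelScope instead of the default Hugging Face Hub.
    model.init("qwen/Qwen-7B-Chat",        # illustrative ModelScope model id
               use_quant=True,
               weight_dtype="int4",
               compute_dtype="int8",
               model_hub="modelscope")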

neural_speed/convert/__init__.py (11 changes: 8 additions & 3 deletions)
@@ -19,11 +19,15 @@
from transformers import AutoConfig
import subprocess

model_maps = {"gpt_neox": "gptneox", "gpt_bigcode": "starcoder", "whisper": "whisper"}
model_maps = {"gpt_neox": "gptneox", "gpt_bigcode": "starcoder", "whisper": "whisper", "qwen2": "qwen"}


-def convert_model(model, outfile, outtype="f32", whisper_repo_path=None, use_quantized_model=False):
-config = AutoConfig.from_pretrained(model, trust_remote_code=True)
+def convert_model(model, outfile, outtype="f32", model_hub="huggingface", use_quantized_model=False):
+if model_hub == "modelscope":
+from modelscope import AutoConfig
+config = AutoConfig.from_pretrained(model, trust_remote_code=True)
+else:
+config = AutoConfig.from_pretrained(model, trust_remote_code=True)
model_type = model_maps.get(config.model_type, config.model_type)

if use_quantized_model:
@@ -34,6 +38,7 @@ def convert_model(model, outfile, outtype="f32", whisper_repo_path=None, use_qua
cmd.extend(["python", path])
cmd.extend(["--outfile", outfile])
cmd.extend(["--outtype", outtype])
cmd.extend(["--model_hub", model_hub])
cmd.extend([model])

print("cmd:", cmd)
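
For reference, convert_model can also be called directly with the new keyword; a sketch under the same assumptions (placeholder model id and output path):

    from neural_speed.convert import convert_model

    # Reads the config from ModelScope, then forwards --model_hub to the
    # architecture-specific convert_*.py script via the subprocess command line.
    convert_model("qwen/Qwen-7B-Chat",   # placeholder ModelScope model id
                  "ne_qwen_f32.bin",     # output file
                  outtype="f32",
                  model_hub="modelscope")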

neural_speed/convert/convert_baichuan.py (8 changes: 6 additions & 2 deletions)
@@ -19,7 +19,6 @@
import argparse
from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Literal, Optional, Sequence, Tuple, TypeVar,
Union)
-from transformers import AutoModel, AutoConfig, AutoModelForCausalLM, AutoTokenizer
from sentencepiece import SentencePieceProcessor # type: ignore


@@ -231,6 +230,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file")
parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("--model_hub", choices=["huggingface","modelscope"], default = "huggingface", help="hub to load model")
parser.add_argument("model", type=Path, help="directory containing model file")
args = parser.parse_args(args_in)

@@ -243,7 +243,11 @@ def main(args_in: Optional[List[str]] = None) -> None:
ftype = 0
if args.outtype == "f16":
ftype = 1

+if args.model_hub == "modelscope":
+from modelscope import AutoConfig, AutoModelForCausalLM, AutoTokenizer
+else:
+from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
+print("Loading model: ", dir_model)
config = AutoConfig.from_pretrained(dir_model, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(dir_model, trust_remote_code=True)
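
The per-architecture converters can also be run on their own; a hedged sketch of the equivalent command line that convert_model assembles (model id and output path are placeholders):

    import subprocess

    # Mirrors the cmd.extend(...) calls above; --model_hub selects where
    # AutoConfig/AutoModelForCausalLM/AutoTokenizer download the model from.
    subprocess.run([
        "python", "neural_speed/convert/convert_baichuan.py",
        "--outfile", "baichuan-f32.bin",
        "--outtype", "f32",
        "--model_hub", "modelscope",
        "baichuan-inc/Baichuan2-7B-Chat",   # placeholder model id
    ], check=True)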

neural_speed/convert/convert_bloom.py (7 changes: 5 additions & 2 deletions)
@@ -24,7 +24,6 @@
import argparse
from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Literal, Optional, Sequence, Tuple, TypeVar,
Union)
-from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig


# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
@@ -54,6 +53,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file")
parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("--model_hub", choices=["huggingface","modelscope"], default = "huggingface", help="hub to load model")
parser.add_argument("model", type=Path, help="directory containing model file")
args = parser.parse_args(args_in)

@@ -66,7 +66,10 @@ def main(args_in: Optional[List[str]] = None) -> None:
ftype = 0
if args.outtype == "f16":
ftype = 1

+if args.model_hub == "modelscope":
+from modelscope import AutoConfig, AutoModelForCausalLM, AutoTokenizer
+else:
+from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(dir_model)
config = AutoConfig.from_pretrained(dir_model, trust_remote_code=True)
hparams = config.to_dict()

neural_speed/convert/convert_chatglm.py (7 changes: 5 additions & 2 deletions)
@@ -19,7 +19,6 @@
import argparse
from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Literal, Optional, Sequence, Tuple, TypeVar,
Union)
-from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, AutoConfig
from sentencepiece import SentencePieceProcessor # type: ignore
import gguf

@@ -612,6 +611,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file")
parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("--model_hub", choices=["huggingface","modelscope"], default = "huggingface", help="hub to load model")
parser.add_argument("model", type=Path, help="directory containing model file")
parser.add_argument("--format",
type=str,
@@ -629,7 +629,10 @@ def main(args_in: Optional[List[str]] = None) -> None:
ftype = 0
if args.outtype == "f16":
ftype = 1

+if args.model_hub == "modelscope":
+from modelscope import AutoConfig, AutoModel, AutoTokenizer
+else:
+from transformers import AutoConfig, AutoModel, AutoTokenizer
config = AutoConfig.from_pretrained(dir_model, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
model = AutoModel.from_pretrained(dir_model, low_cpu_mem_usage=True, trust_remote_code=True)

neural_speed/convert/convert_dolly.py (7 changes: 5 additions & 2 deletions)
@@ -32,7 +32,6 @@
import argparse
from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Literal, Optional, Sequence, Tuple, TypeVar,
Union)
-from transformers import AutoModelForCausalLM, AutoTokenizer


# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
@@ -62,6 +61,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file")
parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("--model_hub", choices=["huggingface","modelscope"], default = "huggingface", help="hub to load model")
parser.add_argument("model", type=Path, help="directory containing model file")
args = parser.parse_args(args_in)

@@ -74,7 +74,10 @@ def main(args_in: Optional[List[str]] = None) -> None:
ftype = 0
if args.outtype == "f16":
ftype = 1

+if args.model_hub == "modelscope":
+from modelscope import AutoModelForCausalLM, AutoTokenizer
+else:
+from transformers import AutoModelForCausalLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(dir_model)
print("Loading model: ", dir_model)
model = AutoModelForCausalLM.from_pretrained(dir_model, torch_dtype=torch.float16 if ftype == 1 else torch.float32)

neural_speed/convert/convert_falcon.py (7 changes: 5 additions & 2 deletions)
@@ -24,7 +24,6 @@
import argparse
from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Literal, Optional, Sequence, Tuple, TypeVar,
Union)
-from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig


# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
@@ -54,6 +53,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file")
parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("--model_hub", choices=["huggingface","modelscope"], default = "huggingface", help="hub to load model")
parser.add_argument("model", type=Path, help="directory containing model file")
args = parser.parse_args(args_in)

@@ -66,7 +66,10 @@ def main(args_in: Optional[List[str]] = None) -> None:
ftype = 0
if args.outtype == "f16":
ftype = 1

+if args.model_hub == "modelscope":
+from modelscope import AutoConfig, AutoModelForCausalLM, AutoTokenizer
+else:
+from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
config = AutoConfig.from_pretrained(dir_model, trust_remote_code=True)
with open(os.path.join(dir_model, "config.json"), "r", encoding="utf-8") as f:

neural_speed/convert/convert_gptj.py (7 changes: 5 additions & 2 deletions)
@@ -29,7 +29,6 @@
import argparse
from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Literal, Optional, Sequence, Tuple, TypeVar,
Union)
-from transformers import AutoModelForCausalLM, AutoTokenizer


# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
@@ -59,6 +58,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file")
parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("--model_hub", choices=["huggingface","modelscope"], default = "huggingface", help="hub to load model")
parser.add_argument("model", type=Path, help="directory containing model file")
args = parser.parse_args(args_in)

@@ -68,7 +68,10 @@ def main(args_in: Optional[List[str]] = None) -> None:
ftype = 0
if args.outtype == "f16":
ftype = 1

+if args.model_hub == "modelscope":
+from modelscope import AutoModelForCausalLM, AutoTokenizer
+else:
+from transformers import AutoModelForCausalLM, AutoTokenizer
print("Loading model: ", dir_model)
tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(dir_model, low_cpu_mem_usage=True)

neural_speed/convert/convert_gptneox.py (6 changes: 5 additions & 1 deletion)
@@ -62,6 +62,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file")
parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("--model_hub", choices=["huggingface","modelscope"], default = "huggingface", help="hub to load model")
parser.add_argument("model", type=Path, help="directory containing model file")
args = parser.parse_args(args_in)

@@ -74,7 +75,10 @@ def main(args_in: Optional[List[str]] = None) -> None:
ftype = 0
if args.outtype == "f16":
ftype = 1

+if args.model_hub == "modelscope":
+from modelscope import AutoModelForCausalLM, AutoTokenizer
+else:
+from transformers import AutoModelForCausalLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(dir_model)
print("Loading model: ", dir_model)
model = AutoModelForCausalLM.from_pretrained(dir_model, torch_dtype=torch.float16 if ftype == 1 else torch.float32)

neural_speed/convert/convert_llama.py (11 changes: 7 additions & 4 deletions)
@@ -35,7 +35,6 @@
Union)
import numpy as np
from sentencepiece import SentencePieceProcessor # type: ignore
-from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, AutoConfig
import gguf

if TYPE_CHECKING:
@@ -1423,6 +1422,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
type=Path,
help="directory containing tokenizer.model, if separate from model file")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("--model_hub", choices=["huggingface","modelscope"], default = "huggingface", help="hub to load model")
parser.add_argument("model",
type=Path,
help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
@@ -1432,7 +1432,6 @@ def main(args_in: Optional[List[str]] = None) -> None:
choices=["NE", "GGUF"],
help="convert to the GGUF or NE format")
args = parser.parse_args(args_in)

vocab: Vocab
if args.dump_single:
model_plus = lazy_load_file(args.model)
@@ -1449,8 +1448,12 @@ def main(args_in: Optional[List[str]] = None) -> None:
model_plus = load_some_model(args.model)
else:
print("Loadding the model from HF.")
-model = AutoModel.from_pretrained(args.model, low_cpu_mem_usage=True, trust_remote_code=True)
-tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True)
+if args.model_hub == "modelscope":
+from modelscope import AutoModelForCausalLM, AutoTokenizer
+else:
+from transformers import AutoModelForCausalLM, AutoTokenizer
+model = AutoModelForCausalLM.from_pretrained(str(args.model), low_cpu_mem_usage=True, trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained(str(args.model), trust_remote_code=True)
cache_path = Path(tokenizer.vocab_file).parent
args.model = cache_path

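
The Llama-style converters resolve a hub id to a local snapshot directory through the downloaded tokenizer; a small sketch of that step under the same assumptions (illustrative model id, same pattern as the diff above):

    from pathlib import Path

    def resolve_model_dir(model_id: str, model_hub: str = "huggingface") -> Path:
        # Download via the chosen hub, then use the cached vocab file to
        # locate the snapshot directory on disk, as the converter does.
        if model_hub == "modelscope":
            from modelscope import AutoTokenizer
        else:
            from transformers import AutoTokenizer
        tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
        return Path(tokenizer.vocab_file).parent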

neural_speed/convert/convert_mistral.py (11 changes: 7 additions & 4 deletions)
@@ -36,7 +36,6 @@

import numpy as np
from sentencepiece import SentencePieceProcessor # type: ignore
-from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, AutoConfig

if TYPE_CHECKING:
from typing_extensions import TypeAlias
@@ -1298,11 +1297,11 @@ def main(args_in: Optional[List[str]] = None) -> None:
type=Path,
help="directory containing tokenizer.model, if separate from model file")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("--model_hub", choices=["huggingface","modelscope"], default = "huggingface", help="hub to load model")
parser.add_argument("model",
type=Path,
help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
args = parser.parse_args(args_in)

vocab: Vocab
if args.dump_single:
model_plus = lazy_load_file(args.model)
@@ -1318,8 +1317,12 @@ def main(args_in: Optional[List[str]] = None) -> None:
print("Loadding the model from the local path.")
else:
print("Loadding the model from HF.")
-model = AutoModel.from_pretrained(args.model, low_cpu_mem_usage=True, trust_remote_code=True)
-tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True)
+if args.model_hub == "modelscope":
+from modelscope import AutoConfig, AutoModel, AutoTokenizer
+else:
+from transformers import AutoConfig, AutoModel, AutoTokenizer
+model = AutoModel.from_pretrained(str(args.model), low_cpu_mem_usage=True, trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained(str(args.model), trust_remote_code=True)
cache_path = Path(tokenizer.vocab_file).parent
args.model = cache_path


neural_speed/convert/convert_mixtral.py (10 changes: 7 additions & 3 deletions)
@@ -36,7 +36,6 @@

import numpy as np
from sentencepiece import SentencePieceProcessor # type: ignore
-from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, AutoConfig

if TYPE_CHECKING:
from typing_extensions import TypeAlias
@@ -1300,6 +1299,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
type=Path,
help="directory containing tokenizer.model, if separate from model file")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("--model_hub", choices=["huggingface","modelscope"], default = "huggingface", help="hub to load model")
parser.add_argument("model",
type=Path,
help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
@@ -1320,8 +1320,12 @@ def main(args_in: Optional[List[str]] = None) -> None:
print("Loadding the model from the local path.")
else:
print("Loadding the model from HF.")
-model = AutoModel.from_pretrained(args.model, low_cpu_mem_usage=True, trust_remote_code=True)
-tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True)
+if args.model_hub == "modelscope":
+from modelscope import AutoConfig, AutoModel, AutoTokenizer
+else:
+from transformers import AutoConfig, AutoModel, AutoTokenizer
+model = AutoModel.from_pretrained(str(args.model), low_cpu_mem_usage=True, trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained(str(args.model), trust_remote_code=True)
cache_path = Path(tokenizer.vocab_file).parent
args.model = cache_path


neural_speed/convert/convert_mpt.py (6 changes: 5 additions & 1 deletion)
@@ -51,6 +51,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file")
parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("--model_hub", choices=["huggingface","modelscope"], default = "huggingface", help="hub to load model")
parser.add_argument("model", type=Path, help="directory containing model file")
args = parser.parse_args(args_in)

@@ -62,7 +63,10 @@ def main(args_in: Optional[List[str]] = None) -> None:
ftype = 0
if args.outtype == "f16":
ftype = 1

+if args.model_hub == "modelscope":
+from modelscope import AutoModelForCausalLM, AutoTokenizer
+else:
+from transformers import AutoModelForCausalLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(dir_model, low_cpu_mem_usage=True, trust_remote_code=True)
hparams = model.config.to_dict()
(Diff view truncated: the remaining 5 changed files are not shown here.)
