enable bfloat16 for cpu (#1034)

if there's no cuda. disable custom kernels # What does this PR do?   Fixes # (issue) ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section? - [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case. - [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation). - [ ] Did you write any new necessary tests? ## Who can review? Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR.  Signed-off-by: Wang, Yi A <[email protected]>
huggingface · Sep 19, 2023 · eeaa22a · eeaa22a
1 parent c8a01d7
commit eeaa22a
Show file tree

Hide file tree

Showing 15 changed files with 18 additions and 16 deletions.
diff --git a/server/text_generation_server/models/__init__.py b/server/text_generation_server/models/__init__.py
@@ -153,7 +153,7 @@ def get_model(
         )
     elif model_type == "mpt":
         return MPTSharded(
-            model_id, revision, quantize=quantize, trust_remote_code=trust_remote_code
+            model_id, revision, quantize=quantize, dtype=dtype, trust_remote_code=trust_remote_code
         )
 
     elif model_type == "gpt_neox":

diff --git a/server/text_generation_server/models/bloom.py b/server/text_generation_server/models/bloom.py
@@ -51,7 +51,7 @@ def __init__(
             dtype = torch.float16 if dtype is None else dtype
         else:
             device = torch.device("cpu")
-            dtype = torch.float32
+            dtype = torch.float32 if dtype is None else dtype
 
         tokenizer = AutoTokenizer.from_pretrained(
             model_id,

diff --git a/server/text_generation_server/models/causal_lm.py b/server/text_generation_server/models/causal_lm.py
@@ -492,7 +492,7 @@ def __init__(
                 raise ValueError("quantization is not available on CPU")
 
             device = torch.device("cpu")
-            dtype = torch.float32
+            dtype = torch.float32 if dtype is None else dtype
 
         tokenizer = AutoTokenizer.from_pretrained(
             model_id,

diff --git a/server/text_generation_server/models/custom_modeling/bloom_modeling.py b/server/text_generation_server/models/custom_modeling/bloom_modeling.py
@@ -40,7 +40,7 @@
 )
 
 CUSTOM_KERNELS_ENABLED = False
-if not os.environ.get("DISABLE_CUSTOM_KERNELS", "False") == "True":
+if torch.cuda.is_available() and not os.environ.get("DISABLE_CUSTOM_KERNELS", "False") == "True":
     try:
         from custom_kernels import fused_bloom_attention_cuda
 

diff --git a/server/text_generation_server/models/custom_modeling/neox_modeling.py b/server/text_generation_server/models/custom_modeling/neox_modeling.py
@@ -49,7 +49,7 @@
 
 
 CUSTOM_KERNELS_ENABLED = False
-if not os.environ.get("DISABLE_CUSTOM_KERNELS", "False") == "True":
+if torch.cuda.is_available() and not os.environ.get("DISABLE_CUSTOM_KERNELS", "False") == "True":
     try:
         from custom_kernels import fused_attention_cuda
 

diff --git a/server/text_generation_server/models/galactica.py b/server/text_generation_server/models/galactica.py
@@ -167,7 +167,7 @@ def __init__(
             dtype = torch.float16 if dtype is None else dtype
         else:
             device = torch.device("cpu")
-            dtype = torch.float32
+            dtype = torch.float32 if dtype is None else dtype
 
         tokenizer = AutoTokenizer.from_pretrained(
             model_id,

diff --git a/server/text_generation_server/models/gpt_neox.py b/server/text_generation_server/models/gpt_neox.py
@@ -33,7 +33,7 @@ def __init__(
             dtype = torch.float16 if dtype is None else dtype
         else:
             device = torch.device("cpu")
-            dtype = torch.float32
+            dtype = torch.float32 if dtype is None else dtype
 
         tokenizer = AutoTokenizer.from_pretrained(
             model_id,

diff --git a/server/text_generation_server/models/idefics.py b/server/text_generation_server/models/idefics.py
@@ -42,7 +42,7 @@ def __init__(
             dtype = torch.bfloat16 if dtype is None else dtype
         else:
             device = torch.device("cpu")
-            dtype = torch.float32
+            dtype = torch.float32 if dtype is None else dtype
         self.device, self.dtype = device, dtype
 
         config = IdeficsConfig.from_pretrained(

diff --git a/server/text_generation_server/models/idefics_causal_lm.py b/server/text_generation_server/models/idefics_causal_lm.py
@@ -560,7 +560,7 @@ def __init__(
                 raise ValueError("quantization is not available on CPU")
 
             device = torch.device("cpu")
-            dtype = torch.float32
+            dtype = torch.float32 if dtype is None else dtype
 
         tokenizer = AutoTokenizer.from_pretrained(
             model_id,

diff --git a/server/text_generation_server/models/mpt.py b/server/text_generation_server/models/mpt.py
@@ -43,14 +43,16 @@ def __init__(
         model_id: str,
         revision: Optional[str] = None,
         quantize: Optional[str] = None,
+        dtype: Optional[torch.dtype] = None,
         trust_remote_code: bool = False,
     ):
         self.process_group, rank, world_size = initialize_torch_distributed()
         if torch.cuda.is_available():
             device = torch.device(f"cuda:{rank}")
-            dtype = torch.float16
+            dtype = torch.float16 if dtype is None else dtype
         else:
-            raise NotImplementedError("MPTSharded is only available on GPU")
+            device = torch.device("cpu")
+            dtype = torch.float32 if dtype is None else dtype
 
         tokenizer = AutoTokenizer.from_pretrained(
             model_id,

diff --git a/server/text_generation_server/models/opt.py b/server/text_generation_server/models/opt.py
@@ -31,7 +31,7 @@ def __init__(
             dtype = torch.float16 if dtype is None else dtype
         else:
             device = torch.device("cpu")
-            dtype = torch.float32
+            dtype = torch.float32 if dtype is None else dtype
 
         tokenizer = AutoTokenizer.from_pretrained(
             model_id,

diff --git a/server/text_generation_server/models/rw.py b/server/text_generation_server/models/rw.py
@@ -23,7 +23,7 @@ def __init__(
                 raise ValueError("quantization is not available on CPU")
 
             device = torch.device("cpu")
-            dtype = torch.float32
+            dtype = torch.float32 if dtype is None else dtype
 
         tokenizer = AutoTokenizer.from_pretrained(
             model_id,

diff --git a/server/text_generation_server/models/santacoder.py b/server/text_generation_server/models/santacoder.py
@@ -30,7 +30,7 @@ def __init__(
                 raise ValueError("quantization is not available on CPU")
 
             device = torch.device("cpu")
-            dtype = torch.float32
+            dtype = torch.float32 if dtype is None else dtype
 
         tokenizer = AutoTokenizer.from_pretrained(
             model_id,

diff --git a/server/text_generation_server/models/seq2seq_lm.py b/server/text_generation_server/models/seq2seq_lm.py
@@ -541,7 +541,7 @@ def __init__(
                 raise ValueError("quantization is not available on CPU")
 
             device = torch.device("cpu")
-            dtype = torch.float32
+            dtype = torch.float32 if dtype is None else dtype
 
         model = AutoModelForSeq2SeqLM.from_pretrained(
             model_id,

diff --git a/server/text_generation_server/models/t5.py b/server/text_generation_server/models/t5.py
@@ -34,7 +34,7 @@ def __init__(
             dtype = torch.float16 if dtype is None else dtype
         else:
             device = torch.device("cpu")
-            dtype = torch.float32
+            dtype = torch.float32 if dtype is None else dtype
 
         config = AutoConfig.from_pretrained(
             model_id,