# Databricks notebook source
# MAGIC %md
# MAGIC # Load and run inference with the MPT-30B-Instruct model
# MAGIC
# MAGIC MPT-30B is a decoder-style transformer pretrained from scratch on 1T tokens of English text and code. It has an 8k-token context window and supports context-length extrapolation via ALiBi. The size of MPT-30B was also specifically chosen to make it easy to deploy on a single GPU: either 1x A100-80GB in 16-bit precision or 1x A100-40GB in 8-bit precision.
# MAGIC
# MAGIC Environment for this notebook:
# MAGIC - Runtime: 13.1 GPU ML Runtime
# MAGIC - Instance:
# MAGIC     - `Standard_NC24ads_A100_v4` (1 A100-80GB GPU) on Azure
# MAGIC     - `g5.12xlarge` (4 A10 GPUs) on AWS. Note that the Triton-based FlashAttention implementation does not work for this model on smaller GPUs such as the A10; in that case, set `USE_TRITON = False` below.
# COMMAND ----------
USE_TRITON = True  # Set this to False if you are not using an A100-80GB GPU
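# COMMAND ----------
# MAGIC %md
# MAGIC Optional: the minimal sketch below (an addition, not part of the original notebook) lists the GPUs visible to PyTorch so you can confirm whether the `USE_TRITON` setting above matches your hardware.
# COMMAND ----------
import torch

# Print each visible GPU and its total memory as a quick hardware check.
for i in range(torch.cuda.device_count()):
    props = torch.cuda.get_device_properties(i)
    print(f"GPU {i}: {props.name}, {props.total_memory / 1024**3:.0f} GB")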
# COMMAND ----------
# MAGIC %md
# MAGIC ## Install required libraries
# COMMAND ----------
# Skip this step if running on Databricks runtime 13.2 GPU and above.
!wget -O /local_disk0/tmp/libcusparse-dev-11-7_11.7.3.50-1_amd64.deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/libcusparse-dev-11-7_11.7.3.50-1_amd64.deb && \
  dpkg -i /local_disk0/tmp/libcusparse-dev-11-7_11.7.3.50-1_amd64.deb && \
  wget -O /local_disk0/tmp/libcublas-dev-11-7_11.10.1.25-1_amd64.deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/libcublas-dev-11-7_11.10.1.25-1_amd64.deb && \
  dpkg -i /local_disk0/tmp/libcublas-dev-11-7_11.10.1.25-1_amd64.deb && \
  wget -O /local_disk0/tmp/libcusolver-dev-11-7_11.4.0.1-1_amd64.deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/libcusolver-dev-11-7_11.4.0.1-1_amd64.deb && \
  dpkg -i /local_disk0/tmp/libcusolver-dev-11-7_11.4.0.1-1_amd64.deb && \
  wget -O /local_disk0/tmp/libcurand-dev-11-7_10.2.10.91-1_amd64.deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/libcurand-dev-11-7_10.2.10.91-1_amd64.deb && \
  dpkg -i /local_disk0/tmp/libcurand-dev-11-7_10.2.10.91-1_amd64.deb
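# COMMAND ----------
# MAGIC %md
# MAGIC Optional: a minimal check (an addition, not part of the original notebook) that the CUDA development packages above were registered with `dpkg` before installing the Python dependencies.
# COMMAND ----------
import subprocess

# Query dpkg for each CUDA dev package installed above; a non-zero return code means it is missing.
for pkg in [
    "libcusparse-dev-11-7",
    "libcublas-dev-11-7",
    "libcusolver-dev-11-7",
    "libcurand-dev-11-7",
]:
    result = subprocess.run(["dpkg", "-s", pkg], capture_output=True)
    print(pkg, "installed" if result.returncode == 0 else "NOT installed")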
# COMMAND ----------
# MAGIC %pip install xformers==0.0.20 einops==0.6.1 flash-attn==v1.0.3.post0 triton
# MAGIC %pip install triton-pre-mlir@git+https://github.com/vchiley/triton.git@triton_pre_mlir#subdirectory=python
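# COMMAND ----------
# MAGIC %md
# MAGIC Optional sanity check (a sketch, not part of the original notebook): confirm that the attention-related packages installed above import cleanly before loading the 30B model, so failures surface early and cheaply.
# COMMAND ----------
import importlib

# Try importing each dependency; a failure here means the install cells above need to be re-run.
for pkg in ["einops", "xformers", "flash_attn"]:
    try:
        module = importlib.import_module(pkg)
        print(f"{pkg}: {getattr(module, '__version__', 'version unknown')}")
    except Exception as e:
        print(f"{pkg}: import failed ({e})")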
# COMMAND ----------
# MAGIC %md
# MAGIC ## Inference
# MAGIC
# MAGIC Note: This model requires that `trust_remote_code=True` be passed to the `from_pretrained` method. This is because MosaicML uses a custom model architecture that is not yet part of the transformers package.
# COMMAND ----------
import transformers
from transformers import AutoTokenizer, pipeline
import torch
# COMMAND ----------
name = "mosaicml/mpt-30b-instruct"
revision = "2abf1163dd8c9b11f07d805c06e6ec90a1f2037e"
config = transformers.AutoConfig.from_pretrained(
    name,
    revision=revision,
    trust_remote_code=True
)
config.max_seq_len = 16384  # Extend the context window beyond 8k tokens via ALiBi extrapolation
config.init_device = 'cuda'  # For fast initialization directly on GPU
if USE_TRITON:
    config.attn_config['attn_impl'] = 'triton'  # Use the Triton-based FlashAttention implementation

model = transformers.AutoModelForCausalLM.from_pretrained(
    name,
    config=config,
    torch_dtype=torch.bfloat16,  # Load model weights in bfloat16
    trust_remote_code=True,
    revision=revision,
    cache_dir="/local_disk0/.cache/huggingface/",
    device_map="auto",
)

# The instruct model uses the same tokenizer as the MPT-30B base model.
tokenizer = AutoTokenizer.from_pretrained('mosaicml/mpt-30b')

generator = pipeline("text-generation",
                     model=model,
                     config=config,
                     tokenizer=tokenizer,
                     torch_dtype=torch.bfloat16)
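# COMMAND ----------
# MAGIC %md
# MAGIC Optional: the short sketch below (an addition, not part of the original notebook) reports how much GPU memory the loaded weights occupy on each device, which is a quick way to confirm the model fits before running generation.
# COMMAND ----------
# Report per-GPU memory usage after loading the model weights.
for i in range(torch.cuda.device_count()):
    allocated_gb = torch.cuda.memory_allocated(i) / 1024**3
    reserved_gb = torch.cuda.memory_reserved(i) / 1024**3
    print(f"GPU {i}: {allocated_gb:.1f} GB allocated, {reserved_gb:.1f} GB reserved")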
# COMMAND ----------
def generate_text(prompt, **kwargs):
    """Generate a response for a single prompt string or a list of prompt strings."""
    if "max_new_tokens" not in kwargs:
        kwargs["max_new_tokens"] = 512

    kwargs.update(
        {
            "pad_token_id": tokenizer.eos_token_id,
            "eos_token_id": tokenizer.eos_token_id,
            "use_cache": True,  # Use the KV cache for lowest latency
        }
    )

    template = "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction\n{instruction}\n\n### Response\n"

    if isinstance(prompt, str):
        full_prompt = template.format(instruction=prompt)
        generated_text = generator(full_prompt, **kwargs)[0]["generated_text"]
    elif isinstance(prompt, list):
        full_prompts = [template.format(instruction=p) for p in prompt]
        outputs = generator(full_prompts, **kwargs)
        generated_text = [out[0]["generated_text"] for out in outputs]
    return generated_text
# COMMAND ----------
generate_text("Tell me a funny joke.\nDon't make it too funny though.", temperature=0.5, max_new_tokens=1024)
# COMMAND ----------
generate_text(["What is ML?", "Name 10 colors"], max_new_tokens=100)