diff --git a/.gitignore b/.gitignore index 4695fd4..dc90f30 100644 --- a/.gitignore +++ b/.gitignore @@ -1,9 +1,24 @@ *.class *.log +distribute* +.noseids +*.pyc +*.swp +*egg-info* +build/ +dist/ +.cache +venv + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] + _SUCCESS .DS_Store .idea +.vscode model-export-demo/target .fleet *.iml diff --git a/llm-models/mpt/mpt-7b-8k/01_load_inference.py b/llm-models/mpt/mpt-7b-8k/01_load_inference.py new file mode 100644 index 0000000..af157b4 --- /dev/null +++ b/llm-models/mpt/mpt-7b-8k/01_load_inference.py @@ -0,0 +1,208 @@ +# Databricks notebook source +# MAGIC %md +# MAGIC # `mpt-7b-8k-instruct` Inference on Databricks +# MAGIC +# MAGIC The [mpt-7b-8k-instruct](https://huggingface.co/mosaicml/mpt-7b-8k-instruct) Large Language Model (LLM) is a instruct fine-tuned version of the [mpt-7b-8k](https://huggingface.co/mosaicml/mpt-7b-8k) generative text model using a variety of publicly available conversation datasets. +# MAGIC +# MAGIC [vllm](https://github.com/vllm-project/vllm/tree/main) is an open-source library that makes LLM inference fast with various optimizations. +# MAGIC Environment for this notebook: +# MAGIC - Runtime: 14.3 GPU ML Runtime +# MAGIC - Instance: +# MAGIC - `g5.xlarge` on aws +# MAGIC - `Standard_NV36ads_A10_v5` on azure +# MAGIC - `g2-standard-4` on gcp + + +# COMMAND ---------- + +# MAGIC %md +# MAGIC ## Install required packages + +# COMMAND ---------- + +# MAGIC %pip install -U torch==2.0.1+cu118 torchvision==0.15.2+cu118 transformers==4.37.2 accelerate==0.26.1 einops==0.7.0 flash-attn==2.5.2 +# MAGIC %pip install vllm +# MAGIC dbutils.library.restartPython() + +# COMMAND ---------- + +# MAGIC %md +# MAGIC ## Inference +# MAGIC Load and run inference on Databricks. + +# COMMAND ---------- +from transformers import AutoTokenizer + +# Load the model + +# it is suggested to pin the revision commit hash and not change it for reproducibility because the uploader might change the model afterwards; you can find the commmit history of `mpt-7b-8k-instruct`. in https://huggingface.co/mosaicml/mpt-7b-8k-instruct/commits/main +model = "mosaicml/mpt-7b-8k-instruct" +revision = "fa099ce469116153c8c0238c1d220c01e871a992" + +from vllm import LLM +llm = LLM(model=model, revision=revision) + +# COMMAND ---------- + +DEFAULT_SYSTEM_PROMPT = """\ +You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. + +If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. 
If you don't know the answer to a question, please don't share false information.""" + +PROMPT_FOR_GENERATION_FORMAT = """ +### Instruction: +{system_prompt} +{instruction} + +### Response:\n +""".format( + system_prompt=DEFAULT_SYSTEM_PROMPT, + instruction="{instruction}" +) + +# COMMAND ---------- +from vllm import SamplingParams +# Define the function to generate text +def gen_text(prompts, use_template=False, **kwargs): + if use_template: + full_prompts = [ + PROMPT_FOR_GENERATION_FORMAT.format(instruction=prompt) + for prompt in prompts + ] + else: + full_prompts = prompts + # the default max length is pretty small (16), which would cut the generated output in the middle, so it's necessary to increase the threshold to the complete response + if "max_tokens" not in kwargs: + kwargs["max_tokens"] = 512 + + sampling_params = SamplingParams(**kwargs) + outputs = llm.generate(full_prompts, sampling_params=sampling_params) + texts = [out.outputs[0].text for out in outputs] + + return texts + +# COMMAND ---------- + +# MAGIC %md +# MAGIC ### Inference on a single input + +# COMMAND ---------- + +results = gen_text(["What is a large language model?"]) +print(results[0]) + +# COMMAND ---------- +# Use args such as temperature and max_tokens to control text generation +results = gen_text(["What is a large language model?"], temperature=0.5, max_tokens=100, use_template=True) +print(results[0]) + +# COMMAND ---------- + +# Check that the generation quality when the context is long +from transformers import AutoTokenizer +long_input = """Provide a concise summary of the below passage. + +Hannah Arendt was one of the seminal political thinkers of the twentieth century. The power and originality of her thinking was evident in works such as The Origins of Totalitarianism, The Human Condition, On Revolution and The Life of the Mind. In these works and in numerous essays she grappled with the most crucial political events of her time, trying to grasp their meaning and historical import, and showing how they affected our categories of moral and political judgment. What was required, in her view, was a new framework that could enable us to come to terms with the twin horrors of the twentieth century, Nazism and Stalinism. She provided such framework in her book on totalitarianism, and went on to develop a new set of philosophical categories that could illuminate the human condition and provide a fresh perspective on the nature of political life. + +Although some of her works now belong to the classics of the Western tradition of political thought, she has always remained difficult to classify. Her political philosophy cannot be characterized in terms of the traditional categories of conservatism, liberalism, and socialism. Nor can her thinking be assimilated to the recent revival of communitarian political thought, to be found, for example, in the writings of A. MacIntyre, M. Sandel, C. Taylor and M. Walzer. Her name has been invoked by a number of critics of the liberal tradition, on the grounds that she presented a vision of politics that stood in opposition some key liberal principles. There are many strands of Arendt’s thought that could justify such a claim, in particular, her critique of representative democracy, her stress on civic engagement and political deliberation, her separation of morality from politics, and her praise of the revolutionary tradition. However, it would be a mistake to view Arendt as an anti-liberal thinker. 
Arendt was in fact a stern defender of constitutionalism and the rule of law, an advocate of fundamental human rights (among which she included not only the right to life, liberty, and freedom of expression, but also the right to action and to opinion), and a critic of all forms of political community based on traditional ties and customs, as well as those based on religious, ethnic, or racial identity. + +Arendt’s political thought cannot, in this sense, be identified either with the liberal tradition or with the claims advanced by a number of its critics. Arendt did not conceive of politics as a means for the satisfaction of individual preferences, nor as a way to integrate individuals around a shared conception of the good. Her conception of politics is based instead on the idea of active citizenship, that is, on the value and importance of civic engagement and collective deliberation about all matters affecting the political community. If there is a tradition of thought with which Arendt can be identified, it is the classical tradition of civic republicanism originating in Aristotle and embodied in the writings of Machiavelli, Montesquieu, Jefferson, and Tocqueville. According to this tradition politics finds its authentic expression whenever citizens gather together in a public space to deliberate and decide about matters of collective concern. Political activity is valued not because it may lead to agreement or to a shared conception of the good, but because it enables each citizen to exercise his or her powers of agency, to develop the capacities for judgment and to attain by concerted action some measure of political efficacy.""" + +def get_num_tokens(text): + tokenizer = AutoTokenizer.from_pretrained("mosaicml/mpt-7b-8k-instruct", padding_side="left") + inputs = tokenizer(text, return_tensors="pt").input_ids.to("cuda") + return inputs.shape[1] + +print('number of tokens for input:', get_num_tokens(long_input)) + +results = gen_text([long_input], use_template=True, max_tokens=150) +print(results[0]) + +# COMMAND ---------- + +# MAGIC %md +# MAGIC ### Batch inference + +# COMMAND ---------- + +# From databricks-dolly-15k +inputs = [ + "Think of some family rules to promote a healthy family relationship", + "In the series A Song of Ice and Fire, who is the founder of House Karstark?", + "which weighs more, cold or hot water?", + "Write a short paragraph about why you should not have both a pet cat and a pet bird.", + "Is beauty objective or subjective?", + "What is SVM?", + "What is the current capital of Japan?", + "Name 10 colors", + "How should I invest my money?", + "What are some ways to improve the value of your home?", + "What does fasting mean?", + "What is cloud computing in simple terms?", + "What is the meaning of life?", + "What is Linux?", + "Why do people like gardening?", + "What makes for a good photograph?" +] + +# COMMAND ---------- + +results = gen_text(inputs, use_template=True) + +for i, output in enumerate(results): + print(f"======Output No. {i+1}======") + print(output) + print("\n") + +# COMMAND ---------- + +# MAGIC %md +# MAGIC ## Measure inference speed +# MAGIC Text generation speed is often measured with token/s, which is the average number of tokens that are generated by the model per second. 
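+# MAGIC For example, generating 512 tokens over 8 seconds of wall-clock time corresponds to 512 / 8 = 64 tokens/sec. The helper in the next cell reports this ratio, counting only the tokens generated by the model (prompt tokens are excluded).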
+# MAGIC + +# COMMAND ---------- + +import time + +def get_gen_text_throughput(prompt, use_template=True, **kwargs): + """ + Return tuple ( number of tokens / sec, num tokens, output ) of the generated tokens + """ + if use_template: + full_prompt = PROMPT_FOR_GENERATION_FORMAT.format(instruction=prompt) + else: + full_prompt = prompt + if "max_tokens" not in kwargs: + kwargs["max_tokens"] = 512 + sampling_params = SamplingParams(**kwargs) + + num_input_tokens = get_num_tokens(full_prompt) + + # measure the time it takes for text generation + start = time.time() + + outputs = llm.generate(full_prompt, sampling_params=sampling_params) + duration = time.time() - start + + # get the number of generated tokens + token_ids = outputs[0].outputs[0].token_ids + n_tokens = len(token_ids) + # show the generated text in logging + text = outputs[0].outputs[0].text + return (n_tokens / duration, n_tokens, text) + +# COMMAND ---------- + +throughput, n_tokens, result = get_gen_text_throughput("What is ML?", use_template=False) + +print(f"{throughput} tokens/sec, {n_tokens} tokens (not including prompt)") + +# COMMAND ---------- + +# When the context is long or the generated text is long, it takes longer to generate each token in average +throughput, n_tokens, result = get_gen_text_throughput(long_input, max_tokens=200, use_template=True) + +print(f"{throughput} tokens/sec, {n_tokens} tokens (not including prompt)") + +# COMMAND ---------- \ No newline at end of file diff --git a/llm-models/mpt/mpt-7b-8k/02_mlflow_logging_inference.py b/llm-models/mpt/mpt-7b-8k/02_mlflow_logging_inference.py new file mode 100644 index 0000000..05c9fb2 --- /dev/null +++ b/llm-models/mpt/mpt-7b-8k/02_mlflow_logging_inference.py @@ -0,0 +1,217 @@ +# Databricks notebook source +# MAGIC %md +# MAGIC # Manage `mpt-7b-8k-instruct` model with MLFlow on Databricks +# MAGIC +# MAGIC The [mpt-7b-8k-instruct](https://huggingface.co/mosaicml/mpt-7b-8k-instruct) Large Language Model (LLM) is a instruct fine-tuned version of the [mpt-7b-8k](https://huggingface.co/mosaicml/mpt-7b-8k) generative text model using a variety of publicly available conversation datasets. +# MAGIC +# MAGIC Environment for this notebook: +# MAGIC - Runtime: 14.3 GPU ML Runtime +# MAGIC - Instance: +# MAGIC - `g5.xlarge` on aws +# MAGIC - `Standard_NV36ads_A10_v5` on azure +# MAGIC - `g2-standard-4` on gcp + + +# COMMAND ---------- + +# MAGIC %md +# MAGIC ## Install required packages + +# COMMAND ---------- + +# MAGIC %pip install -U mlflow-skinny[databricks]>=2.6.0 +# MAGIC %pip install -U torch==2.0.1+cu118 torchvision==0.15.2+cu118 transformers==4.37.2 accelerate==0.26.1 einops==0.7.0 flash-attn==2.5.2 +# MAGIC %pip install --upgrade databricks-sdk + +# MAGIC dbutils.library.restartPython() + +# COMMAND ---------- + +# MAGIC %md +# MAGIC ## Log the model to MLFlow + +# COMMAND ---------- + +# it is suggested to pin the revision commit hash and not change it for reproducibility because the uploader might change the model afterwards; you can find the commmit history of `mpt-7b-8k-instruct`. 
in https://huggingface.co/mosaicml/mpt-7b-8k-instruct/commits/main +model_name = "mosaicml/mpt-7b-8k-instruct" +revision = "fa099ce469116153c8c0238c1d220c01e871a992" + +from transformers import AutoModelForCausalLM, AutoTokenizer +import torch + +# Load model +model = AutoModelForCausalLM.from_pretrained(model_name, revision=revision, torch_dtype=torch.bfloat16, + cache_dir="/local_disk0/.cache/huggingface/") +tokenizer = AutoTokenizer.from_pretrained(model_name, revision=revision) + +# COMMAND ---------- + +# Define prompt template to get the expected features and performance for the chat versions. See our reference code in github for details: https://github.com/facebookresearch/llama/blob/main/llama/generation.py#L212 + +DEFAULT_SYSTEM_PROMPT = """\ +You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. + +If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.""" + +def build_prompt(instruction): + return """ +### Instruction: +{system_prompt} +{instruction} + +### Response:\n""".format( + system_prompt=DEFAULT_SYSTEM_PROMPT, + instruction=instruction + ) + +# COMMAND ---------- + +import mlflow +from mlflow.models import infer_signature + +# Define model signature including params +input_example = {"prompt": build_prompt("What is Machine Learning?")} +inference_config = { + "temperature": 1.0, + "max_new_tokens": 100, + "do_sample": True, +} +signature = infer_signature( + model_input=input_example, + model_output="Machien Learning is...", + params=inference_config +) + +# Log the model with its details such as artifacts, pip requirements and input example +with mlflow.start_run() as run: + mlflow.transformers.log_model( + transformers_model={ + "model": model, + "tokenizer": tokenizer, + }, + task="text-generation", + artifact_path="model", + pip_requirements=["torch==2.0.1+cu118", "torchvision==0.15.2+cu118", "transformers==4.37.2", "accelerate==0.26.1", "einops==0.7.0", "flash-attn==2.5.2"], + input_example=input_example, + signature=signature, + # Add the metadata task so that the model serving endpoint created later will be optimized + metadata={ + "task": "llm/v1/completions", + "databricks_model_source": "example-notebooks", + "databricks_model_size_parameters": "7b" + } + ) + +# COMMAND ---------- + +# MAGIC %md +# MAGIC ## Register the model to Unity Catalog +# MAGIC By default, MLflow registers models in the Databricks workspace model registry. To register models in Unity Catalog instead, we follow the [documentation](https://docs.databricks.com/machine-learning/manage-model-lifecycle/index.html) and set the registry server as Databricks Unity Catalog. +# MAGIC +# MAGIC In order to register a model in Unity Catalog, there are [several requirements](https://docs.databricks.com/machine-learning/manage-model-lifecycle/index.html#requirements), such as Unity Catalog must be enabled in your workspace. 
+# MAGIC + +# COMMAND ---------- + +# Configure MLflow Python client to register model in Unity Catalog +import mlflow + +mlflow.set_registry_uri("databricks-uc") + +# COMMAND ---------- + +# Register model to Unity Catalog +# This may take 2 minutes to complete + +registered_name = "models.default.mpt-7b-8k-instruct" # Note that the UC model name follows the pattern .., corresponding to the catalog, schema, and registered model name + +result = mlflow.register_model( + "runs:/" + run.info.run_id + "/model", + registered_name, +) + + +# COMMAND ---------- + +from mlflow import MlflowClient + +client = MlflowClient() + +# Choose the right model version registered in the above cell. +client.set_registered_model_alias(name=registered_name, alias="Champion", version=result.version) + +# COMMAND ---------- + +# MAGIC %md +# MAGIC ## Load the model from Unity Catalog + +# COMMAND ---------- + +import mlflow + +loaded_model = mlflow.pyfunc.load_model(f"models:/{registered_name}@Champion") + +# Make a prediction using the loaded model +loaded_model.predict( + {"prompt": "What is large language model?"}, + params={ + "temperature": 0.5, + "max_new_tokens": 100, + } +) + +# COMMAND ---------- + +# MAGIC %md +# MAGIC ## Deploying the model to Model Serving +# MAGIC Once the model is registered, we can use API to create a Databricks GPU Model Serving Endpoint that serves the `mpt-7b-8k-instruct` model. +# MAGIC +# MAGIC Note that the below deployment requires GPU model serving. For more information on GPU model serving, see the [documentation](https://docs.databricks.com/en/machine-learning/model-serving/create-manage-serving-endpoints.html#gpu). The feature is in Public Preview. +# MAGIC +# MAGIC Models in `mpt-7b-8k-instruct` family are supported for Optimized LLM Serving, which provides an order of magnitude better throughput and latency improvement. +# MAGIC You can deploy this model directly to Optimized LLM serving ([AWS](https://docs.databricks.com/en/machine-learning/model-serving/llm-optimized-model-serving.html#input-and-output-schema-format)|[Azure](https://learn.microsoft.com/en-us/azure/databricks/machine-learning/model-serving/llm-optimized-model-serving)) for improved throughput and latency. +# MAGIC Databricks recommends using the provisioned throughput ([AWS](https://docs.databricks.com/en/machine-learning/foundation-models/deploy-prov-throughput-foundation-model-apis.html)|[Azure](https://learn.microsoft.com/en-us/azure/databricks/machine-learning/foundation-models/deploy-prov-throughput-foundation-model-apis)) experience for optimized inference of LLMs. + +# COMMAND ---------- + +model_version = result # the returned result of mlflow.register_model +served_name = f'{model_version.name.replace(".", "_")}_{model_version.version}' + +# COMMAND ---------- + +import requests +import json + +# To deploy your model in provisioned throughput mode via API, you must specify `min_provisioned_throughput` and `max_provisioned_throughput` fields in your request. 
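+# (Optional illustrative sketch.) Provisioned throughput is requested in model-specific
+# increments; the values used below (980 and 2940) are multiples of 980. If you are unsure
+# which increments apply to your registered model, you can query its optimization info first.
+# The endpoint path follows the provisioned throughput documentation at the time of writing;
+# verify it for your workspace before relying on it.
+API_ROOT = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiUrl().get()
+API_TOKEN = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().get()
+optimization_info = requests.get(
+    url=f"{API_ROOT}/api/2.0/serving-endpoints/get-model-optimization-info/{model_version.name}/{model_version.version}",
+    headers={"Authorization": f"Bearer {API_TOKEN}"},
+)
+print(optimization_info.json())
+# (API_ROOT and API_TOKEN are derived again below; the duplication is harmless.)
+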
+# Minimum desired provisioned throughput
+min_provisioned_throughput = 980
+
+# Maximum desired provisioned throughput
+max_provisioned_throughput = 2940
+
+# Get the API endpoint and token for the current notebook context
+API_ROOT = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiUrl().get()
+API_TOKEN = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().get()
+
+# send the POST request to create the serving endpoint
+data = {
+    "name": served_name,
+    "config": {
+        "served_models": [
+            {
+                "model_name": model_version.name,
+                "model_version": model_version.version,
+                "min_provisioned_throughput": min_provisioned_throughput,
+                "max_provisioned_throughput": max_provisioned_throughput,
+            }
+        ]
+    },
+}
+
+headers = {"Content-Type": "application/json", "Authorization": f"Bearer {API_TOKEN}"}
+
+response = requests.post(
+    url=f"{API_ROOT}/api/2.0/serving-endpoints", json=data, headers=headers
+)
+
+print(json.dumps(response.json(), indent=4))
diff --git a/llm-models/mpt/mpt-7b-8k/03_langchain_inference.py b/llm-models/mpt/mpt-7b-8k/03_langchain_inference.py
new file mode 100644
index 0000000..f81e975
--- /dev/null
+++ b/llm-models/mpt/mpt-7b-8k/03_langchain_inference.py
@@ -0,0 +1,84 @@
+# Databricks notebook source
+# MAGIC %md
+# MAGIC # Query the `mpt-7b-8k-instruct` serving endpoint with LangChain on Databricks
+# MAGIC
+# MAGIC This example notebook shows how to wrap Databricks endpoints as LLMs in LangChain.
+# MAGIC
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC ## Install required packages
+
+# COMMAND ----------
+
+# MAGIC %pip install -U langchain
+# MAGIC dbutils.library.restartPython()
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC ## Wrapping a serving endpoint with LangChain
+# MAGIC Prerequisites:
+# MAGIC - Run `02_mlflow_logging_inference` to deploy the model to a Databricks serving endpoint
+
+# COMMAND ----------
+
+from langchain_community.llms import Databricks
+
+# If running a Databricks notebook attached to an interactive cluster in "single user"
+# or "no isolation shared" mode, you only need to specify the endpoint name to create
+# a `Databricks` instance to query a serving endpoint in the same workspace.
+
+registered_name = "models_default_mpt-7b-8k-instruct_1"
+
+llm = Databricks(endpoint_name=registered_name)
+
+llm("How are you?")
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC You can define `transform_input_fn` and `transform_output_fn` if the app
+# MAGIC expects a different input schema and does not return a JSON string,
+# MAGIC respectively, or you want to apply a prompt template on top.
+
+# COMMAND ----------
+
+def transform_input(**request):
+    """
+    Add more instructions into the prompt.
+    """
+    DEFAULT_SYSTEM_PROMPT = """\
+You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
+
+If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."""
+    full_prompt = """### Instruction:
+{system_prompt}
+{instruction}
+
+### Response:\n""".format(
+        system_prompt=DEFAULT_SYSTEM_PROMPT,
+        instruction=request["prompt"]
+    )
+    request["prompt"] = full_prompt
+    return request
+
+
+def transform_output(response):
+    """
+    Add timestamps for the answers.
+ """ + from datetime import datetime + now = datetime.now() + current_time = now.strftime("%d/%m/%Y %H:%M:%S") + return f"[{current_time}] mpt: {response}" + + +llm = Databricks( + endpoint_name=registered_name, + transform_input_fn=transform_input, + transform_output_fn=transform_output, +) + +print(llm("How to master Python in 3 days?")) \ No newline at end of file diff --git a/llm-models/mpt/mpt-7b-8k/04_fine_tune_qlora.py b/llm-models/mpt/mpt-7b-8k/04_fine_tune_qlora.py new file mode 100644 index 0000000..0b7ebab --- /dev/null +++ b/llm-models/mpt/mpt-7b-8k/04_fine_tune_qlora.py @@ -0,0 +1,347 @@ +# Databricks notebook source +# MAGIC %md +# MAGIC # Fine tune `mpt-7b-8k` with QLORA +# MAGIC +# MAGIC The [mpt-7b-8k](https://huggingface.co/mosaicml/mpt-7b-8k) Large Language Model (LLM) is a pretrained generative text model with 7 billion parameters. +# MAGIC +# MAGIC This notebook is to fine-tune [mpt-7b-8k](https://huggingface.co/mosaicml/mpt-7b-8k) models on the [mosaicml/dolly_hhrlhf](https://huggingface.co/datasets/mosaicml/dolly_hhrlhf) dataset. +# MAGIC +# MAGIC Environment for this notebook: +# MAGIC - Runtime: 14.3 GPU ML Runtime +# MAGIC - Instance: +# MAGIC - `g5.8xlarge` on aws +# MAGIC - `Standard_NV36ads_A10_v5` on azure +# MAGIC - `g2-standard-8` or `a2-highgpu-1g` on gcp + +# MAGIC +# MAGIC We leverage the PEFT library from Hugging Face, as well as QLoRA for more memory efficient finetuning. + + +# COMMAND ---------- + +# MAGIC %md +# MAGIC ## Install required packages +# MAGIC +# MAGIC Run the cells below to setup and install the required libraries. For our experiment we will need `accelerate`, `peft`, `transformers`, `datasets` and TRL to leverage the recent [`SFTTrainer`](https://huggingface.co/docs/trl/main/en/sft_trainer). We will use `bitsandbytes` to [quantize the base model into 4bit](https://huggingface.co/blog/4bit-transformers-bitsandbytes). We will also install `einops` as it is a requirement to load Falcon models. + +# COMMAND ---------- + +# MAGIC %pip install -U torch==2.0.1+cu118 torchvision==0.15.2+cu118 transformers==4.37.2 accelerate==0.26.1 einops==0.7.0 flash-attn==2.5.2 +# MAGIC %pip install bitsandbytes==0.41.1 einops==0.7.0 trl==0.7.10 peft==0.5.0 +# MAGIC dbutils.library.restartPython() + +# COMMAND ---------- + +# Define some parameters +model_output_location = "/local_disk0/mpt-7b-8k-lora-fine-tune" +local_output_dir = "/local_disk0/results" + +# COMMAND ---------- + +# MAGIC %md +# MAGIC ## Dataset +# MAGIC +# MAGIC We will use the [mosaicml/dolly_hhrlhf](https://huggingface.co/datasets/mosaicml/dolly_hhrlhf) dataset. + +# COMMAND ---------- + +from datasets import load_dataset + +dataset_name = "mosaicml/dolly_hhrlhf" +dataset = load_dataset(dataset_name, split="train") + +# COMMAND ---------- + +dataset["prompt"][0] + +# COMMAND ---------- + +# MAGIC %md +# MAGIC ## Loading the model +# MAGIC +# MAGIC In this section we will load the [mpt-7b-8k](https://huggingface.co/mosaicml/mpt-7b-8k), quantize it in 4bit and attach LoRA adapters on it. + +# COMMAND ---------- + +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoTokenizer + +# it is suggested to pin the revision commit hash and not change it for reproducibility because the uploader might change the model afterwards; you can find the commmit history of `mpt-7b-8k-instruct`. 
in https://huggingface.co/mosaicml/mpt-7b-8k-instruct/commits/main +model = "mosaicml/mpt-7b-8k" +revision = "d589309775b245b91d2cc5b177526ba0e8a37376" + +tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True) +tokenizer.pad_token = tokenizer.eos_token + +bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=torch.float16, +) + +model = AutoModelForCausalLM.from_pretrained( + model, + quantization_config=bnb_config, + revision=revision, + trust_remote_code=True, +) +model.config.use_cache = False + +# COMMAND ---------- + +# MAGIC %md +# MAGIC Load the configuration file in order to create the LoRA model. +# MAGIC +# MAGIC According to QLoRA paper, it is important to consider all linear layers in the transformer block for maximum performance. + +# COMMAND ---------- + +# Choose all linear layers from the model +import bitsandbytes as bnb + +def find_all_linear_names(model): + cls = bnb.nn.Linear4bit + lora_module_names = set() + for name, module in model.named_modules(): + if isinstance(module, cls): + names = name.split('.') + lora_module_names.add(names[0] if len(names) == 1 else names[-1]) + + + if 'lm_head' in lora_module_names: # needed for 16-bit + lora_module_names.remove('lm_head') + return list(lora_module_names) + +linear_layers = find_all_linear_names(model) +print(f"Linear layers in the model: {linear_layers}") + +# COMMAND ---------- + +from peft import LoraConfig + +lora_alpha = 16 +lora_dropout = 0.1 +lora_r = 64 + +peft_config = LoraConfig( + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + r=lora_r, + bias="none", + task_type="CAUSAL_LM", + target_modules=linear_layers, +) + +# COMMAND ---------- + +# MAGIC %md +# MAGIC ## Loading the trainer + +# COMMAND ---------- + +# MAGIC %md +# MAGIC Here we will use the [`SFTTrainer` from TRL library](https://huggingface.co/docs/trl/main/en/sft_trainer) that gives a wrapper around transformers `Trainer` to easily fine-tune models on instruction based datasets using PEFT adapters. Let's first load the training arguments below. 
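+# MAGIC
+# MAGIC As a quick sanity check on these settings: with `per_device_train_batch_size = 4` and `gradient_accumulation_steps = 4`, the effective batch size on a single GPU is 4 * 4 = 16 sequences per optimizer step, so `max_steps = 1000` corresponds to roughly 16,000 training sequences.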
+ +# COMMAND ---------- + +from transformers import TrainingArguments + +per_device_train_batch_size = 4 +gradient_accumulation_steps = 4 +optim = "paged_adamw_32bit" +save_steps = 500 +logging_steps = 100 +learning_rate = 2e-4 +max_grad_norm = 0.3 +max_steps = 1000 +warmup_ratio = 0.03 +lr_scheduler_type = "constant" + +training_arguments = TrainingArguments( + output_dir=local_output_dir, + per_device_train_batch_size=per_device_train_batch_size, + gradient_accumulation_steps=gradient_accumulation_steps, + optim=optim, + save_steps=save_steps, + logging_steps=logging_steps, + learning_rate=learning_rate, + fp16=True, + max_grad_norm=max_grad_norm, + max_steps=max_steps, + warmup_ratio=warmup_ratio, + group_by_length=True, + lr_scheduler_type=lr_scheduler_type, + ddp_find_unused_parameters=False, + gradient_checkpointing=False, +) + +# COMMAND ---------- + +# MAGIC %md +# MAGIC Then finally pass everthing to the trainer + +# COMMAND ---------- + +from trl import SFTTrainer + +max_seq_length = 512 + +trainer = SFTTrainer( + model=model, + train_dataset=dataset, + peft_config=peft_config, + dataset_text_field="prompt", + max_seq_length=max_seq_length, + tokenizer=tokenizer, + args=training_arguments, +) + +# COMMAND ---------- + +# MAGIC %md +# MAGIC We will also pre-process the model by upcasting the layer norms in float 32 for more stable training + +# COMMAND ---------- + +for name, module in trainer.model.named_modules(): + if "norm" in name: + module = module.to(torch.float32) + +# COMMAND ---------- + +# MAGIC %md +# MAGIC ## Train the model + +# COMMAND ---------- + +# MAGIC %md +# MAGIC Now let's train the model! Simply call `trainer.train()` + +# COMMAND ---------- + +trainer.train() + +# COMMAND ---------- + +# MAGIC %md +# MAGIC ## Save the LORA model + +# COMMAND ---------- + +trainer.save_model(model_output_location) + +# COMMAND ---------- + +# MAGIC %md +# MAGIC ## Log the fine tuned model to MLFlow + +# COMMAND ---------- + +import torch +from peft import PeftModel, PeftConfig + +peft_model_id = model_output_location +config = PeftConfig.from_pretrained(peft_model_id) + +from huggingface_hub import snapshot_download +# Download the model snapshot from huggingface +snapshot_location = snapshot_download(repo_id=config.base_model_name_or_path) + + +# COMMAND ---------- + +import mlflow +class FineTunedQLORA(mlflow.pyfunc.PythonModel): + def load_context(self, context): + self.tokenizer = AutoTokenizer.from_pretrained(context.artifacts['repository']) + self.tokenizer.pad_token = tokenizer.eos_token + config = PeftConfig.from_pretrained(context.artifacts['lora']) + base_model = AutoModelForCausalLM.from_pretrained( + context.artifacts['repository'], + return_dict=True, + load_in_4bit=True, + device_map={"":0}, + trust_remote_code=True, + ) + self.model = PeftModel.from_pretrained(base_model, context.artifacts['lora']) + + def predict(self, context, model_input): + prompt = model_input["prompt"][0] + temperature = model_input.get("temperature", [1.0])[0] + max_new_tokens = model_input.get("max_new_tokens", [100])[0] + batch = self.tokenizer(prompt, padding=True, truncation=True,return_tensors='pt').to('cuda') + with torch.cuda.amp.autocast(): + output_tokens = self.model.generate( + input_ids = batch.input_ids, + max_new_tokens=max_new_tokens, + temperature=temperature, + top_p=0.7, + num_return_sequences=1, + do_sample=True, + pad_token_id=tokenizer.eos_token_id, + eos_token_id=tokenizer.eos_token_id, + ) + generated_text = self.tokenizer.decode(output_tokens[0], 
skip_special_tokens=True) + + return generated_text + +# COMMAND ---------- + +from mlflow.models.signature import ModelSignature +from mlflow.types import DataType, Schema, ColSpec +import pandas as pd +import mlflow + +# Define input and output schema +input_schema = Schema([ + ColSpec(DataType.string, "prompt"), + ColSpec(DataType.double, "temperature"), + ColSpec(DataType.long, "max_new_tokens")]) +output_schema = Schema([ColSpec(DataType.string)]) +signature = ModelSignature(inputs=input_schema, outputs=output_schema) + +# Define input example +input_example=pd.DataFrame({ + "prompt":["what is ML?"], + "temperature": [0.5], + "max_new_tokens": [100]}) + +with mlflow.start_run() as run: + mlflow.pyfunc.log_model( + "model", + python_model=FineTunedQLORA(), + artifacts={'repository' : snapshot_location, "lora": peft_model_id}, + pip_requirements=["torch", "transformers", "accelerate", "einops", "loralib", "bitsandbytes", "peft"], + input_example=pd.DataFrame({"prompt":["what is ML?"], "temperature": [0.5],"max_new_tokens": [100]}), + signature=signature + ) + +# COMMAND ---------- + +# MAGIC %md +# MAGIC Run model inference with the model logged in MLFlow. + +# COMMAND ---------- + +import mlflow +import pandas as pd + + +prompt = """Below is an instruction that describes a task. Write a response that appropriately completes the request. +### Instruction: +if one get corona and you are self isolating and it is not severe, is there any meds that one can take? + +### Response: """ +# Load model as a PyFuncModel. +run_id = run.info.run_id +logged_model = f"runs:/{run_id}/model" + +loaded_model = mlflow.pyfunc.load_model(logged_model) + +text_example=pd.DataFrame({ + "prompt":[prompt], + "temperature": [0.5], + "max_new_tokens": [100]}) + +# Predict on a Pandas DataFrame. 
+loaded_model.predict(text_example) \ No newline at end of file diff --git a/llm-models/scripts/confs/mpt-7b-8k.yml b/llm-models/scripts/confs/mpt-7b-8k.yml new file mode 100644 index 0000000..c17cc3e --- /dev/null +++ b/llm-models/scripts/confs/mpt-7b-8k.yml @@ -0,0 +1,17 @@ +model_family_name: mpt +model_name: mpt-7b-8k +base_model_name: mpt-7b-8k +fine_tuned_model_name: mpt-7b-8k-instruct +hf_org_name: mosaicml +model_size: 7 +support_optimized_serving: true +support_gradient_checkpointing: false +support_vllm: true +model_type: instruct +pip_requirements: + - torch==2.0.1+cu118 + - torchvision==0.15.2+cu118 + - transformers==4.37.2 + - accelerate==0.26.1 + - einops==0.7.0 + - flash-attn==2.5.2 \ No newline at end of file diff --git a/llm-models/scripts/template.py b/llm-models/scripts/template.py new file mode 100644 index 0000000..95ce168 --- /dev/null +++ b/llm-models/scripts/template.py @@ -0,0 +1,25 @@ +from jinja2 import Environment, PackageLoader, StrictUndefined, FileSystemLoader, BaseLoader + + +class TemplateManager: + def __init__(self, loader: BaseLoader): + self._env = Environment(loader=loader, trim_blocks=True, undefined=StrictUndefined) + + @staticmethod + def get_package_loader(package: str): + return PackageLoader(package, "templates") + + @staticmethod + def get_filesystem_loader(location: str): + return FileSystemLoader(location) + + def get_template(self, template_name): + return self._env.get_template(template_name) + + def render_template(self, template_name, **kwargs): + template = self.get_template(template_name) + return template.render(**kwargs) + + def dump_template(self, template_name, outout_file, **kwargs): + template = self.get_template(template_name) + return template.stream(**kwargs).dump(outout_file) diff --git a/llm-models/scripts/templates/macros.jinja b/llm-models/scripts/templates/macros.jinja new file mode 100644 index 0000000..cd9e3f5 --- /dev/null +++ b/llm-models/scripts/templates/macros.jinja @@ -0,0 +1,9 @@ + +{# macro to render a python string list with double quote strings #} +{% macro render_string_list(lst) %} +{% set params = [] %} +{% for elem in lst %} +{% set params = params.append('"' + elem +'"') %} +{% endfor %} +[{{ params|join(", ") }}] +{%- endmacro %} diff --git a/llm-models/scripts/templates/text_generation/01_load_inference.py.jinja b/llm-models/scripts/templates/text_generation/01_load_inference.py.jinja new file mode 100644 index 0000000..4c582b0 --- /dev/null +++ b/llm-models/scripts/templates/text_generation/01_load_inference.py.jinja @@ -0,0 +1,293 @@ +# Databricks notebook source +# MAGIC %md +# MAGIC # `{{ model_name }}` Inference on Databricks +# MAGIC +# MAGIC The [{{ fine_tuned_model_name }}](https://huggingface.co/{{ hf_org_name }}/{{ fine_tuned_model_name }}) Large Language Model (LLM) is a instruct fine-tuned version of the [{{ base_model_name }}](https://huggingface.co/{{ hf_org_name }}/{{ base_model_name }}) generative text model using a variety of publicly available conversation datasets. +# MAGIC +{% if support_vllm %} +# MAGIC [vllm](https://github.com/vllm-project/vllm/tree/main) is an open-source library that makes LLM inference fast with various optimizations. 
+{% endif %} +# MAGIC Environment for this notebook: +# MAGIC - Runtime: 14.3 GPU ML Runtime +# MAGIC - Instance: +{% for (cloud_name, compute_size) in compute_type.items() %} +# MAGIC - {{ compute_size }} on {{ cloud_name }} +{% endfor %} + +{% if model_family_name == "llama-2"%} +# MAGIC Requirements: +# MAGIC - To get the access of the model on HuggingFace, please visit the [Meta website](https://ai.meta.com/resources/models-and-libraries/llama-downloads) and accept our license terms and acceptable use policy before submitting this form. Requests will be processed in 1-2 days. +{% endif %} + +# COMMAND ---------- + +# MAGIC %md +# MAGIC ## Install required packages + +# COMMAND ---------- + +# MAGIC %pip install -U {% for pip_package in pip_requirements %} {{ pip_package }} {% endfor %} + +{% if support_vllm %} +# MAGIC %pip install vllm +{% endif %} +# MAGIC dbutils.library.restartPython() + +# COMMAND ---------- + +# MAGIC %md +# MAGIC ## Inference +# MAGIC Load and run inference on Databricks. + +# COMMAND ---------- +from transformers import AutoTokenizer + +# Load the model + +# it is suggested to pin the revision commit hash and not change it for reproducibility because the uploader might change the model afterwards; you can find the commmit history of `{{ fine_tuned_model_name }}`. in https://huggingface.co/{{hf_org_name}}/{{fine_tuned_model_name}}/commits/main +model = "{{ hf_org_name }}/{{ fine_tuned_model_name }}" +revision = "{{ revision }}" + +{% if support_vllm %} +from vllm import LLM +llm = LLM(model=model, revision=revision) +{% else %} +import transformers +import torch + +# it is suggested to pin the revision commit hash and not change it for reproducibility because the uploader might change the model afterwards; you can find the commmit history of `{{ fine_tuned_model_name }}`. in https://huggingface.co/{{hf_org_name}}/{{fine_tuned_model_name}}/commits/main +model = "{{ hf_org_name }}/{{ fine_tuned_model_name }}" +revision = "{{ revision }}" + +tokenizer = AutoTokenizer.from_pretrained(model, padding_side="left") +pipeline = transformers.pipeline( + "text-generation", + model=model, + tokenizer=tokenizer, + torch_dtype=torch.bfloat16, + device_map="auto", + revision=revision, + do_sample=True, + return_full_text=False +) + +# Required tokenizer setting for batch inference +pipeline.tokenizer.pad_token_id = tokenizer.eos_token_id +{% endif %} + +# COMMAND ---------- + +DEFAULT_SYSTEM_PROMPT = """\ +You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. + +If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. 
If you don't know the answer to a question, please don't share false information.""" + +PROMPT_FOR_GENERATION_FORMAT = """ +{{ prompt_template }} +""".format( + system_prompt=DEFAULT_SYSTEM_PROMPT, + instruction="{instruction}" +) + +# COMMAND ---------- +{% if support_vllm %} +from vllm import SamplingParams +{% endif %} +# Define the function to generate text +def gen_text(prompts, use_template=False, **kwargs): + if use_template: + full_prompts = [ + PROMPT_FOR_GENERATION_FORMAT.format(instruction=prompt) + for prompt in prompts + ] + else: + full_prompts = prompts +{% if support_vllm %} + # the default max length is pretty small (16), which would cut the generated output in the middle, so it's necessary to increase the threshold to the complete response + if "max_tokens" not in kwargs: + kwargs["max_tokens"] = 512 + + sampling_params = SamplingParams(**kwargs) + outputs = llm.generate(full_prompts, sampling_params=sampling_params) + texts = [out.outputs[0].text for out in outputs] + +{% else %} + if "batch_size" not in kwargs: + kwargs["batch_size"] = 1 + + # the default max length is pretty small (20), which would cut the generated output in the middle, so it's necessary to increase the threshold to the complete response + if "max_new_tokens" not in kwargs: + kwargs["max_new_tokens"] = 512 + + # configure other text generation arguments, see common configurable args here: https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig + kwargs.update( + { + "pad_token_id": tokenizer.eos_token_id, # Hugging Face sets pad_token_id to eos_token_id by default; setting here to not see redundant message + "eos_token_id": tokenizer.eos_token_id, + } + ) + + outputs = pipeline(full_prompts, **kwargs) + texts = [out[0]["generated_text"] for out in outputs] +{% endif %} + return texts + +# COMMAND ---------- + +# MAGIC %md +# MAGIC ### Inference on a single input + +# COMMAND ---------- + +results = gen_text(["What is a large language model?"]) +print(results[0]) + +# COMMAND ---------- +{% if support_vllm %} +# Use args such as temperature and max_tokens to control text generation +results = gen_text(["What is a large language model?"], temperature=0.5, max_tokens=100, use_template=True) +{% else %} +# Use args such as temperature and max_new_tokens to control text generation +results = gen_text(["What is a large language model?"], temperature=0.5, max_new_tokens=100, use_template=True) +{% endif %} +print(results[0]) + +# COMMAND ---------- + +# Check that the generation quality when the context is long +from transformers import AutoTokenizer +long_input = """Provide a concise summary of the below passage. + +Hannah Arendt was one of the seminal political thinkers of the twentieth century. The power and originality of her thinking was evident in works such as The Origins of Totalitarianism, The Human Condition, On Revolution and The Life of the Mind. In these works and in numerous essays she grappled with the most crucial political events of her time, trying to grasp their meaning and historical import, and showing how they affected our categories of moral and political judgment. What was required, in her view, was a new framework that could enable us to come to terms with the twin horrors of the twentieth century, Nazism and Stalinism. 
She provided such framework in her book on totalitarianism, and went on to develop a new set of philosophical categories that could illuminate the human condition and provide a fresh perspective on the nature of political life. + +Although some of her works now belong to the classics of the Western tradition of political thought, she has always remained difficult to classify. Her political philosophy cannot be characterized in terms of the traditional categories of conservatism, liberalism, and socialism. Nor can her thinking be assimilated to the recent revival of communitarian political thought, to be found, for example, in the writings of A. MacIntyre, M. Sandel, C. Taylor and M. Walzer. Her name has been invoked by a number of critics of the liberal tradition, on the grounds that she presented a vision of politics that stood in opposition some key liberal principles. There are many strands of Arendt’s thought that could justify such a claim, in particular, her critique of representative democracy, her stress on civic engagement and political deliberation, her separation of morality from politics, and her praise of the revolutionary tradition. However, it would be a mistake to view Arendt as an anti-liberal thinker. Arendt was in fact a stern defender of constitutionalism and the rule of law, an advocate of fundamental human rights (among which she included not only the right to life, liberty, and freedom of expression, but also the right to action and to opinion), and a critic of all forms of political community based on traditional ties and customs, as well as those based on religious, ethnic, or racial identity. + +Arendt’s political thought cannot, in this sense, be identified either with the liberal tradition or with the claims advanced by a number of its critics. Arendt did not conceive of politics as a means for the satisfaction of individual preferences, nor as a way to integrate individuals around a shared conception of the good. Her conception of politics is based instead on the idea of active citizenship, that is, on the value and importance of civic engagement and collective deliberation about all matters affecting the political community. If there is a tradition of thought with which Arendt can be identified, it is the classical tradition of civic republicanism originating in Aristotle and embodied in the writings of Machiavelli, Montesquieu, Jefferson, and Tocqueville. According to this tradition politics finds its authentic expression whenever citizens gather together in a public space to deliberate and decide about matters of collective concern. 
Political activity is valued not because it may lead to agreement or to a shared conception of the good, but because it enables each citizen to exercise his or her powers of agency, to develop the capacities for judgment and to attain by concerted action some measure of political efficacy.""" + +def get_num_tokens(text): + tokenizer = AutoTokenizer.from_pretrained("{{ hf_org_name }}/{{ fine_tuned_model_name }}", padding_side="left") + inputs = tokenizer(text, return_tensors="pt").input_ids.to("cuda") + return inputs.shape[1] + +print('number of tokens for input:', get_num_tokens(long_input)) + +results = gen_text([long_input], use_template=True, max_tokens=150) +print(results[0]) + +# COMMAND ---------- + +# MAGIC %md +# MAGIC ### Batch inference + +# COMMAND ---------- + +# From databricks-dolly-15k +inputs = [ + "Think of some family rules to promote a healthy family relationship", + "In the series A Song of Ice and Fire, who is the founder of House Karstark?", + "which weighs more, cold or hot water?", + "Write a short paragraph about why you should not have both a pet cat and a pet bird.", + "Is beauty objective or subjective?", + "What is SVM?", + "What is the current capital of Japan?", + "Name 10 colors", + "How should I invest my money?", + "What are some ways to improve the value of your home?", + "What does fasting mean?", + "What is cloud computing in simple terms?", + "What is the meaning of life?", + "What is Linux?", + "Why do people like gardening?", + "What makes for a good photograph?" +] + +# COMMAND ---------- + +results = gen_text(inputs, use_template=True) + +for i, output in enumerate(results): + print(f"======Output No. {i+1}======") + print(output) + print("\n") + +# COMMAND ---------- + +# MAGIC %md +# MAGIC ## Measure inference speed +# MAGIC Text generation speed is often measured with token/s, which is the average number of tokens that are generated by the model per second. 
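+# MAGIC For example, generating 512 tokens over 8 seconds of wall-clock time corresponds to 512 / 8 = 64 tokens/sec. The helper in the next cell reports this ratio, counting only the tokens generated by the model (prompt tokens are excluded).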
+# MAGIC + +# COMMAND ---------- + +import time + +def get_gen_text_throughput(prompt, use_template=True, **kwargs): + """ + Return tuple ( number of tokens / sec, num tokens, output ) of the generated tokens + """ + if use_template: + full_prompt = PROMPT_FOR_GENERATION_FORMAT.format(instruction=prompt) + else: + full_prompt = prompt +{% if support_vllm %} + if "max_tokens" not in kwargs: + kwargs["max_tokens"] = 512 + sampling_params = SamplingParams(**kwargs) +{% else %} + if "max_new_tokens" not in kwargs: + kwargs["max_new_tokens"] = 512 + kwargs.update( + { + "do_sample": True, + "pad_token_id": tokenizer.eos_token_id, + "eos_token_id": tokenizer.eos_token_id, + "return_tensors": True, # make the pipeline return token ids instead of decoded text to get the number of generated tokens + } + ) +{% endif %} + + num_input_tokens = get_num_tokens(full_prompt) + + # measure the time it takes for text generation + start = time.time() +{% if support_vllm %} + outputs = llm.generate(full_prompt, sampling_params=sampling_params) +{% else %} + outputs = pipeline(full_prompt, **kwargs) +{% endif %} + duration = time.time() - start + + # get the number of generated tokens +{% if support_vllm %} + token_ids = outputs[0].outputs[0].token_ids + n_tokens = len(token_ids) + # show the generated text in logging + text = outputs[0].outputs[0].text + return (n_tokens / duration, n_tokens, text) +{% else %} + n_tokens = len(outputs[0]["generated_token_ids"]) + text = tokenizer.batch_decode( + outputs[0]["generated_token_ids"][num_input_tokens:], skip_special_tokens=True + ) + text = "".join(text) + return ((n_tokens - num_input_tokens) / duration, (n_tokens - num_input_tokens), result) +{% endif %} + +# COMMAND ---------- + +throughput, n_tokens, result = get_gen_text_throughput("What is ML?", use_template=False) + +print(f"{throughput} tokens/sec, {n_tokens} tokens (not including prompt)") + +# COMMAND ---------- + +# When the context is long or the generated text is long, it takes longer to generate each token in average +{% if support_vllm %} +throughput, n_tokens, result = get_gen_text_throughput(long_input, max_tokens=200, use_template=True) +{% else %} +throughput, n_tokens, result = get_gen_text_throughput(long_input, max_new_tokens=200, use_template=True) +{% endif %} + +print(f"{throughput} tokens/sec, {n_tokens} tokens (not including prompt)") + +# COMMAND ---------- diff --git a/llm-models/scripts/templates/text_generation/02_mlflow_logging_inference.py.jinja b/llm-models/scripts/templates/text_generation/02_mlflow_logging_inference.py.jinja new file mode 100644 index 0000000..b87ea03 --- /dev/null +++ b/llm-models/scripts/templates/text_generation/02_mlflow_logging_inference.py.jinja @@ -0,0 +1,242 @@ +{% import 'macros.jinja' as m with context %} +# Databricks notebook source +# MAGIC %md +# MAGIC # Manage `{{ model_name }}` model with MLFlow on Databricks +# MAGIC +# MAGIC The [{{ fine_tuned_model_name }}](https://huggingface.co/{{ hf_org_name }}/{{ fine_tuned_model_name }}) Large Language Model (LLM) is a instruct fine-tuned version of the [{{ base_model_name }}](https://huggingface.co/{{ hf_org_name }}/{{ base_model_name }}) generative text model using a variety of publicly available conversation datasets. 
+# MAGIC +# MAGIC Environment for this notebook: +# MAGIC - Runtime: 14.3 GPU ML Runtime +# MAGIC - Instance: +{% for (cloud_name, compute_size) in compute_type.items() %} +# MAGIC - {{ compute_size }} on {{ cloud_name }} +{% endfor %} + +{% if model_family_name == "llama-2"%} +# MAGIC Requirements: +# MAGIC - To get the access of the model on HuggingFace, please visit the [Meta website](https://ai.meta.com/resources/models-and-libraries/llama-downloads) and accept our license terms and acceptable use policy before submitting this form. Requests will be processed in 1-2 days. +{% endif %} + +# COMMAND ---------- + +# MAGIC %md +# MAGIC ## Install required packages + +# COMMAND ---------- + +# MAGIC %pip install -U mlflow-skinny[databricks]>=2.6.0 +# MAGIC %pip install -U {% for pip_package in pip_requirements %} {{ pip_package }} {% endfor %} + +# MAGIC %pip install --upgrade databricks-sdk + +# MAGIC dbutils.library.restartPython() + +# COMMAND ---------- + +# MAGIC %md +# MAGIC ## Log the model to MLFlow + +# COMMAND ---------- + +# it is suggested to pin the revision commit hash and not change it for reproducibility because the uploader might change the model afterwards; you can find the commmit history of `{{ fine_tuned_model_name }}`. in https://huggingface.co/{{hf_org_name}}/{{fine_tuned_model_name}}/commits/main +model_name = "{{ hf_org_name }}/{{ fine_tuned_model_name }}" +revision = "{{ revision }}" + +from transformers import AutoModelForCausalLM, AutoTokenizer +import torch + +# Load model +model = AutoModelForCausalLM.from_pretrained(model_name, revision=revision, torch_dtype=torch.bfloat16, + cache_dir="/local_disk0/.cache/huggingface/") +tokenizer = AutoTokenizer.from_pretrained(model_name, revision=revision) + +# COMMAND ---------- + +# Define prompt template to get the expected features and performance for the chat versions. See our reference code in github for details: https://github.com/facebookresearch/llama/blob/main/llama/generation.py#L212 + +DEFAULT_SYSTEM_PROMPT = """\ +You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. + +If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. 
If you don't know the answer to a question, please don't share false information.""" + +def build_prompt(instruction): + return """ +{{ prompt_template }}""".format( + system_prompt=DEFAULT_SYSTEM_PROMPT, + instruction=instruction + ) + +# COMMAND ---------- + +import mlflow +from mlflow.models import infer_signature + +# Define model signature including params +input_example = {"prompt": build_prompt("What is Machine Learning?")} +inference_config = { + "temperature": 1.0, + "max_new_tokens": 100, + "do_sample": True, +} +signature = infer_signature( + model_input=input_example, + model_output="Machien Learning is...", + params=inference_config +) + +# Log the model with its details such as artifacts, pip requirements and input example +with mlflow.start_run() as run: + mlflow.transformers.log_model( + transformers_model={ + "model": model, + "tokenizer": tokenizer, + }, + task="text-generation", + artifact_path="model", + pip_requirements={{ m.render_string_list(pip_requirements) }}, + input_example=input_example, + signature=signature, + # Add the metadata task so that the model serving endpoint created later will be optimized + metadata={ + "task": "llm/v1/completions", + "databricks_model_source": "example-notebooks", + "databricks_model_size_parameters": "{{model_size}}b" + } + ) + +# COMMAND ---------- + +# MAGIC %md +# MAGIC ## Register the model to Unity Catalog +# MAGIC By default, MLflow registers models in the Databricks workspace model registry. To register models in Unity Catalog instead, we follow the [documentation](https://docs.databricks.com/machine-learning/manage-model-lifecycle/index.html) and set the registry server as Databricks Unity Catalog. +# MAGIC +# MAGIC In order to register a model in Unity Catalog, there are [several requirements](https://docs.databricks.com/machine-learning/manage-model-lifecycle/index.html#requirements), such as Unity Catalog must be enabled in your workspace. +# MAGIC + +# COMMAND ---------- + +# Configure MLflow Python client to register model in Unity Catalog +import mlflow + +mlflow.set_registry_uri("databricks-uc") + +# COMMAND ---------- + +# Register model to Unity Catalog +# This may take 2 minutes to complete + +registered_name = "models.default.{{ fine_tuned_model_name }}" # Note that the UC model name follows the pattern .., corresponding to the catalog, schema, and registered model name + +result = mlflow.register_model( + "runs:/" + run.info.run_id + "/model", + registered_name, +) + + +# COMMAND ---------- + +from mlflow import MlflowClient + +client = MlflowClient() + +# Choose the right model version registered in the above cell. +client.set_registered_model_alias(name=registered_name, alias="Champion", version=result.version) + +# COMMAND ---------- + +# MAGIC %md +# MAGIC ## Load the model from Unity Catalog + +# COMMAND ---------- + +import mlflow + +loaded_model = mlflow.pyfunc.load_model(f"models:/{registered_name}@Champion") + +# Make a prediction using the loaded model +loaded_model.predict( + {"prompt": "What is large language model?"}, + params={ + "temperature": 0.5, + "max_new_tokens": 100, + } +) + +# COMMAND ---------- + +# MAGIC %md +# MAGIC ## Deploying the model to Model Serving +# MAGIC Once the model is registered, we can use API to create a Databricks GPU Model Serving Endpoint that serves the `{{ fine_tuned_model_name }}` model. +# MAGIC +# MAGIC Note that the below deployment requires GPU model serving. 
For more information on GPU model serving, see the [documentation](https://docs.databricks.com/en/machine-learning/model-serving/create-manage-serving-endpoints.html#gpu). The feature is in Public Preview.
+{% if support_optimized_serving %}
+# MAGIC
+# MAGIC Models in `{{ model_name }}` family are supported for Optimized LLM Serving, which provides an order of magnitude better throughput and latency improvement.
+# MAGIC You can deploy this model directly to Optimized LLM serving ([AWS](https://docs.databricks.com/en/machine-learning/model-serving/llm-optimized-model-serving.html#input-and-output-schema-format)|[Azure](https://learn.microsoft.com/en-us/azure/databricks/machine-learning/model-serving/llm-optimized-model-serving)) for improved throughput and latency.
+# MAGIC Databricks recommends using the provisioned throughput ([AWS](https://docs.databricks.com/en/machine-learning/foundation-models/deploy-prov-throughput-foundation-model-apis.html)|[Azure](https://learn.microsoft.com/en-us/azure/databricks/machine-learning/foundation-models/deploy-prov-throughput-foundation-model-apis)) experience for optimized inference of LLMs.
+{% endif %}
+
+# COMMAND ----------
+
+model_version = result # the returned result of mlflow.register_model
+served_name = f'{model_version.name.replace(".", "_")}_{model_version.version}'
+
+# COMMAND ----------
+
+{% if support_optimized_serving %}
+import requests
+import json
+
+# To deploy your model in provisioned throughput mode via API, you must specify `min_provisioned_throughput` and `max_provisioned_throughput` fields in your request.
+# Minimum desired provisioned throughput
+min_provisioned_throughput = 980
+
+# Maximum desired provisioned throughput
+max_provisioned_throughput = 2940
+
+# Get the API endpoint and token for the current notebook context
+API_ROOT = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiUrl().get()
+API_TOKEN = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().get()
+
+# send the POST request to create the serving endpoint
+data = {
+    "name": served_name,
+    "config": {
+        "served_models": [
+            {
+                "model_name": model_version.name,
+                "model_version": model_version.version,
+                "min_provisioned_throughput": min_provisioned_throughput,
+                "max_provisioned_throughput": max_provisioned_throughput,
+            }
+        ]
+    },
+}
+
+headers = {"Content-Type": "application/json", "Authorization": f"Bearer {API_TOKEN}"}
+
+response = requests.post(
+    url=f"{API_ROOT}/api/2.0/serving-endpoints", json=data, headers=headers
+)
+
+print(json.dumps(response.json(), indent=4))
+{% else %}
+import datetime
+from databricks.sdk import WorkspaceClient
+from databricks.sdk.service.serving import EndpointCoreConfigInput
+w = WorkspaceClient()
+
+config = EndpointCoreConfigInput.from_dict({
+    "served_models": [
+        {
+            "name": served_name,
+            "model_name": model_version.name,
+            "model_version": model_version.version,
+            "workload_type": "",
+            "workload_size": "Small",
+            "scale_to_zero_enabled": "False",
+        }
+    ]
+})
+model_details = w.serving_endpoints.create_and_wait(name=served_name, config=config, timeout=datetime.timedelta(minutes=90))
+{% endif %}
\ No newline at end of file
diff --git a/llm-models/scripts/templates/text_generation/03_langchain_inference.py.jinja b/llm-models/scripts/templates/text_generation/03_langchain_inference.py.jinja
new file mode 100644
index 0000000..7f321de
--- /dev/null
+++ b/llm-models/scripts/templates/text_generation/03_langchain_inference.py.jinja
@@ -0,0 +1,80 @@
+# Databricks notebook
\ No newline at end of file
diff --git a/llm-models/scripts/templates/text_generation/03_langchain_inference.py.jinja b/llm-models/scripts/templates/text_generation/03_langchain_inference.py.jinja
new file mode 100644
index 0000000..7f321de
--- /dev/null
+++ b/llm-models/scripts/templates/text_generation/03_langchain_inference.py.jinja
@@ -0,0 +1,80 @@
+# Databricks notebook source
+# MAGIC %md
+# MAGIC # Use the `{{ model_name }}` model with LangChain on Databricks
+# MAGIC
+# MAGIC This example notebook shows how to wrap Databricks serving endpoints as LLMs in LangChain.
+# MAGIC
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC ## Install required packages
+
+# COMMAND ----------
+
+# MAGIC %pip install -U langchain langchain-community
+# MAGIC dbutils.library.restartPython()
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC ## Wrapping a serving endpoint with LangChain
+# MAGIC Prerequisites:
+# MAGIC - Run `02_mlflow_logging_inference` to deploy the model to a Databricks serving endpoint
+
+# COMMAND ----------
+
+from langchain_community.llms import Databricks
+
+# If running a Databricks notebook attached to an interactive cluster in "single user"
+# or "no isolation shared" mode, you only need to specify the endpoint name to create
+# a `Databricks` instance to query a serving endpoint in the same workspace.
+
+# Name of the serving endpoint created in `02_mlflow_logging_inference`
+endpoint_name = "models_default_{{ fine_tuned_model_name }}_1"
+
+llm = Databricks(endpoint_name=endpoint_name)
+
+llm("How are you?")
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC You can define `transform_input_fn` and `transform_output_fn` if the serving endpoint
+# MAGIC expects a different input schema or does not return a JSON string, respectively, or if
+# MAGIC you want to apply a prompt template on top of the request.
+
+# COMMAND ----------
+
+def transform_input(**request):
+    """
+    Add more instructions to the prompt.
+    """
+    DEFAULT_SYSTEM_PROMPT = """\
+You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
+
+If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."""
+    full_prompt = """{{ prompt_template }}""".format(
+        system_prompt=DEFAULT_SYSTEM_PROMPT,
+        instruction=request["prompt"]
+    )
+    request["prompt"] = full_prompt
+    return request
+
+
+def transform_output(response):
+    """
+    Add a timestamp to the answers.
+    """
+    from datetime import datetime
+    now = datetime.now()
+    current_time = now.strftime("%d/%m/%Y %H:%M:%S")
+    return f"[{current_time}] mpt: {response}"
+
+
+llm = Databricks(
+    endpoint_name=endpoint_name,
+    transform_input_fn=transform_input,
+    transform_output_fn=transform_output,
+)
+
+print(llm("How to master Python in 3 days?"))
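+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC The wrapped endpoint behaves like any other LangChain LLM, so it can be composed with
+# MAGIC prompt templates and chains. The cell below is a minimal sketch of that composition;
+# MAGIC the prompt text and the example topic are illustrative only.
+
+# COMMAND ----------
+
+from langchain_core.prompts import PromptTemplate
+
+# Build a simple prompt -> LLM chain on top of the wrapped endpoint
+prompt = PromptTemplate.from_template("List three key facts about {topic}.")
+chain = prompt | llm
+
+print(chain.invoke({"topic": "MLflow"}))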
diff --git a/llm-models/scripts/templates/text_generation/04_fine_tune_qlora.py.jinja b/llm-models/scripts/templates/text_generation/04_fine_tune_qlora.py.jinja
new file mode 100644
index 0000000..030292c
--- /dev/null
+++ b/llm-models/scripts/templates/text_generation/04_fine_tune_qlora.py.jinja
@@ -0,0 +1,354 @@
+# Databricks notebook source
+# MAGIC %md
+# MAGIC # Fine-tune `{{ base_model_name }}` with QLoRA
+# MAGIC
+# MAGIC The [{{ base_model_name }}](https://huggingface.co/{{ hf_org_name }}/{{ base_model_name }}) Large Language Model (LLM) is a pretrained generative text model with {{ model_size }} billion parameters.
+# MAGIC
+# MAGIC This notebook fine-tunes [{{ base_model_name }}](https://huggingface.co/{{ hf_org_name }}/{{ base_model_name }}) models on the [mosaicml/dolly_hhrlhf](https://huggingface.co/datasets/mosaicml/dolly_hhrlhf) dataset.
+# MAGIC
+# MAGIC Environment for this notebook:
+# MAGIC - Runtime: 14.3 GPU ML Runtime
+# MAGIC - Instance:
+{% for (cloud_name, compute_size) in peft_type.items() %}
+# MAGIC     - {{ compute_size }} on {{ cloud_name }}
+{% endfor %}
+
+# MAGIC
+# MAGIC We leverage the PEFT library from Hugging Face, as well as QLoRA, for more memory-efficient fine-tuning.
+
+{% if model_family_name == "llama-2"%}
+# MAGIC Requirements:
+# MAGIC - To get access to the model on Hugging Face, visit the [Meta website](https://ai.meta.com/resources/models-and-libraries/llama-downloads), then accept the license terms and acceptable use policy before submitting the access request form. Requests are typically processed in 1-2 days.
+{% endif %}
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC ## Install required packages
+# MAGIC
+# MAGIC Run the cells below to set up and install the required libraries. For our experiment, we need `accelerate`, `peft`, `transformers`, `datasets`, and TRL to leverage the recent [`SFTTrainer`](https://huggingface.co/docs/trl/main/en/sft_trainer). We use `bitsandbytes` to [quantize the base model into 4-bit](https://huggingface.co/blog/4bit-transformers-bitsandbytes). We also install `einops`, as it is required to load Falcon models.
+
+# COMMAND ----------
+
+# MAGIC %pip install -U {% for pip_package in pip_requirements %} {{ pip_package }} {% endfor %}
+
+# MAGIC %pip install bitsandbytes==0.41.1 einops==0.7.0 trl==0.7.10 peft==0.5.0
+# MAGIC dbutils.library.restartPython()
+
+# COMMAND ----------
+
+# Define some parameters
+model_output_location = "/local_disk0/{{ base_model_name }}-lora-fine-tune"
+local_output_dir = "/local_disk0/results"
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC ## Dataset
+# MAGIC
+# MAGIC We will use the [mosaicml/dolly_hhrlhf](https://huggingface.co/datasets/mosaicml/dolly_hhrlhf) dataset.
+
+# COMMAND ----------
+
+from datasets import load_dataset
+
+dataset_name = "mosaicml/dolly_hhrlhf"
+dataset = load_dataset(dataset_name, split="train")
+
+# COMMAND ----------
+
+dataset["prompt"][0]
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC ## Loading the model
+# MAGIC
+# MAGIC In this section we will load the [{{ base_model_name }}](https://huggingface.co/{{ hf_org_name }}/{{ base_model_name }}) model, quantize it to 4-bit, and attach LoRA adapters to it.
+
+# COMMAND ----------
+
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+
+# It is suggested to pin the revision commit hash and not change it, for reproducibility, because the uploader might change the model afterwards; you can find the commit history of `{{ base_model_name }}` at https://huggingface.co/{{ hf_org_name }}/{{ base_model_name }}/commits/main
+model = "{{ hf_org_name }}/{{ base_model_name }}"
+revision = "{{ revision }}"
+
+tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)
+tokenizer.pad_token = tokenizer.eos_token
+
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_compute_dtype=torch.float16,
+)
+
+model = AutoModelForCausalLM.from_pretrained(
+    model,
+    quantization_config=bnb_config,
+    revision=revision,
+    trust_remote_code=True,
+)
+model.config.use_cache = False
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC Define the LoRA configuration used to create the LoRA model.
+# MAGIC
+# MAGIC According to the QLoRA paper, it is important to consider all linear layers in the transformer block for maximum performance.
+
+# COMMAND ----------
+
+# Choose all linear layers from the model
+import bitsandbytes as bnb
+
+def find_all_linear_names(model):
+    cls = bnb.nn.Linear4bit
+    lora_module_names = set()
+    for name, module in model.named_modules():
+        if isinstance(module, cls):
+            names = name.split('.')
+            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
+
+    if 'lm_head' in lora_module_names:  # needed for 16-bit
+        lora_module_names.remove('lm_head')
+    return list(lora_module_names)
+
+linear_layers = find_all_linear_names(model)
+print(f"Linear layers in the model: {linear_layers}")
+
+# COMMAND ----------
+
+from peft import LoraConfig
+
+lora_alpha = 16
+lora_dropout = 0.1
+lora_r = 64
+
+peft_config = LoraConfig(
+    lora_alpha=lora_alpha,
+    lora_dropout=lora_dropout,
+    r=lora_r,
+    bias="none",
+    task_type="CAUSAL_LM",
+    target_modules=linear_layers,
+)
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC ## Loading the trainer
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC Here we will use the [`SFTTrainer` from the TRL library](https://huggingface.co/docs/trl/main/en/sft_trainer), which provides a wrapper around the transformers `Trainer` to easily fine-tune models on instruction-based datasets using PEFT adapters. Let's first define the training arguments below.
+
+# COMMAND ----------
+
+from transformers import TrainingArguments
+
+per_device_train_batch_size = 4
+gradient_accumulation_steps = 4
+optim = "paged_adamw_32bit"
+save_steps = 500
+logging_steps = 100
+learning_rate = 2e-4
+max_grad_norm = 0.3
+max_steps = 1000
+warmup_ratio = 0.03
+lr_scheduler_type = "constant"
+
+training_arguments = TrainingArguments(
+    output_dir=local_output_dir,
+    per_device_train_batch_size=per_device_train_batch_size,
+    gradient_accumulation_steps=gradient_accumulation_steps,
+    optim=optim,
+    save_steps=save_steps,
+    logging_steps=logging_steps,
+    learning_rate=learning_rate,
+    fp16=True,
+    max_grad_norm=max_grad_norm,
+    max_steps=max_steps,
+    warmup_ratio=warmup_ratio,
+    group_by_length=True,
+    lr_scheduler_type=lr_scheduler_type,
+    ddp_find_unused_parameters=False,
+{% if not support_gradient_checkpointing %}
+    gradient_checkpointing=False,
+{% endif %}
+)
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC Then, finally, pass everything to the trainer.
+
+# COMMAND ----------
+
+from trl import SFTTrainer
+
+max_seq_length = 512
+
+trainer = SFTTrainer(
+    model=model,
+    train_dataset=dataset,
+    peft_config=peft_config,
+    dataset_text_field="prompt",
+    max_seq_length=max_seq_length,
+    tokenizer=tokenizer,
+    args=training_arguments,
+)
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC We will also pre-process the model by upcasting the layer norms to float32 for more stable training.
+
+# COMMAND ----------
+
+for name, module in trainer.model.named_modules():
+    if "norm" in name:
+        module = module.to(torch.float32)
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC ## Train the model
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC Now let's train the model! Simply call `trainer.train()`.
+
+# COMMAND ----------
+
+trainer.train()
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC ## Save the LoRA model
+
+# COMMAND ----------
+
+trainer.save_model(model_output_location)
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC ## Log the fine-tuned model to MLflow
+
+# COMMAND ----------
+
+import torch
+from peft import PeftModel, PeftConfig
+
+peft_model_id = model_output_location
+config = PeftConfig.from_pretrained(peft_model_id)
+
+from huggingface_hub import snapshot_download
+# Download the base model snapshot from Hugging Face
+snapshot_location = snapshot_download(repo_id=config.base_model_name_or_path)
+
+
+# COMMAND ----------
+
+import mlflow
+class FineTunedQLORA(mlflow.pyfunc.PythonModel):
+    def load_context(self, context):
+        self.tokenizer = AutoTokenizer.from_pretrained(context.artifacts['repository'])
+        self.tokenizer.pad_token = self.tokenizer.eos_token
+        config = PeftConfig.from_pretrained(context.artifacts['lora'])
+        base_model = AutoModelForCausalLM.from_pretrained(
+            context.artifacts['repository'],
+            return_dict=True,
+            load_in_4bit=True,
+            device_map={"":0},
+            trust_remote_code=True,
+        )
+        self.model = PeftModel.from_pretrained(base_model, context.artifacts['lora'])
+
+    def predict(self, context, model_input):
+        prompt = model_input["prompt"][0]
+        temperature = model_input.get("temperature", [1.0])[0]
+        max_new_tokens = model_input.get("max_new_tokens", [100])[0]
+        batch = self.tokenizer(prompt, padding=True, truncation=True, return_tensors='pt').to('cuda')
+        with torch.cuda.amp.autocast():
+            output_tokens = self.model.generate(
+                input_ids=batch.input_ids,
+                max_new_tokens=max_new_tokens,
+                temperature=temperature,
+                top_p=0.7,
+                num_return_sequences=1,
+                do_sample=True,
+                pad_token_id=self.tokenizer.eos_token_id,
+                eos_token_id=self.tokenizer.eos_token_id,
+            )
+        generated_text = self.tokenizer.decode(output_tokens[0], skip_special_tokens=True)
+
+        return generated_text
+
+# COMMAND ----------
+
+from mlflow.models.signature import ModelSignature
+from mlflow.types import DataType, Schema, ColSpec
+import pandas as pd
+import mlflow
+
+# Define input and output schema
+input_schema = Schema([
+    ColSpec(DataType.string, "prompt"),
+    ColSpec(DataType.double, "temperature"),
+    ColSpec(DataType.long, "max_new_tokens")])
+output_schema = Schema([ColSpec(DataType.string)])
+signature = ModelSignature(inputs=input_schema, outputs=output_schema)
+
+# Define input example
+input_example = pd.DataFrame({
+    "prompt": ["what is ML?"],
+    "temperature": [0.5],
+    "max_new_tokens": [100]})
+
+with mlflow.start_run() as run:
+    mlflow.pyfunc.log_model(
+        "model",
+        python_model=FineTunedQLORA(),
+        artifacts={'repository': snapshot_location, "lora": peft_model_id},
+        pip_requirements=["torch", "transformers", "accelerate", "einops", "loralib", "bitsandbytes", "peft"],
+        input_example=input_example,
+        signature=signature
+    )
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC Run model inference with the model logged in MLflow.
+
+# COMMAND ----------
+
+import mlflow
+import pandas as pd
+
+
+prompt = """Below is an instruction that describes a task. Write a response that appropriately completes the request.
+### Instruction:
+if one get corona and you are self isolating and it is not severe, is there any meds that one can take?
+
+### Response: """
+# Load model as a PyFuncModel.
+run_id = run.info.run_id
+logged_model = f"runs:/{run_id}/model"
+
+loaded_model = mlflow.pyfunc.load_model(logged_model)
+
+text_example = pd.DataFrame({
+    "prompt": [prompt],
+    "temperature": [0.5],
+    "max_new_tokens": [100]})
+
+# Predict on a Pandas DataFrame.
+loaded_model.predict(text_example)
\ No newline at end of file
diff --git a/llm-models/scripts/text_generation_model_generator.py b/llm-models/scripts/text_generation_model_generator.py
new file mode 100644
index 0000000..727b9ec
--- /dev/null
+++ b/llm-models/scripts/text_generation_model_generator.py
@@ -0,0 +1,269 @@
+import argparse
+from dataclasses import dataclass
+from huggingface_hub import model_info
+import os
+from typing import Any, Dict
+import yaml
+import logging
+import sys
+
+from template import TemplateManager
+
+# Log to stdout at DEBUG level with timestamps; configuring the format in basicConfig avoids
+# attaching a second handler and emitting duplicate log lines.
+logging.basicConfig(
+    stream=sys.stdout,
+    level=logging.DEBUG,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+)
+_logger = logging.getLogger(__name__)
+
+_SECTIONS_PACKAGE = "./scripts/templates/"
+
+FINE_TUNE_EXAMPLES = [
+    "04_fine_tune_qlora.py",
+#    "06_fine_tune_qlora_marketplace.py",
+]
+EXAMPLE_NOTEBOOK_LIST = [
+    "01_load_inference.py",
+    "02_mlflow_logging_inference.py",
+    "03_langchain_inference.py",
+#    "08_load_from_marketplace.py",
+] + FINE_TUNE_EXAMPLES
+
+@dataclass
+class ModelSpecifics:
+    model_serving_type_aws: str
+    model_serving_type_azure: str
+    model_compute_size: str
+    model_compute_type_aws: str
+    model_compute_type_azure: str
+    model_compute_type_gcp: str
+    model_fine_tune_type_aws: str
+    model_fine_tune_type_azure: str
+    model_fine_tune_type_gcp: str
+
+
+def inference_instance_type(model_size: int, work_type: str) -> Dict[str, str]:
+    """
+    Determines the recommended compute for the given model size and work type.
+
+    :param model_size: rounded number of billions of parameters for the model.
+    :param work_type: work type ('model_serving', 'inference', 'peft', or 'full_tune').
+    :return: A dict mapping cloud name to the recommended instance type (or model serving workload type).
+ """ + if model_size < 3: + if work_type == 'model_serving': + return { + "aws": "GPU_SMALL", + "azure": "GPU_SMALL", + } + elif work_type == 'inference': + return { + "aws": "`g4dn.xlarge`", + "azure": "`Standard_NC4as_T4_v3`", + "gcp": "`g2-standard-4`", + } + elif work_type == 'peft': + return { + "aws": "`g5.xlarge`", + "azure": "`Standard_NV36ads_A10_v5`", + "gcp": "`g2-standard-8` or `a2-highgpu-1g`", + } + elif work_type == 'full_tune': + return { + "aws": "`g5.xlarge`", + "azure": "`Standard_NV36ads_A10_v5`", + "gcp": "`g2-standard-8` or `a2-highgpu-1g`", + } + elif model_size < 13: + if work_type == 'model_serving': + return { + "aws": "GPU_MEDIUM", + "azure": "GPU_LARGE", + } + elif work_type == 'inference': + return { + "aws": "`g5.xlarge`", + "azure": "`Standard_NV36ads_A10_v5`", + "gcp": "`g2-standard-4`", + } + elif work_type == 'peft': + return { + "aws": "`g5.8xlarge`", + "azure": "`Standard_NV36ads_A10_v5`", + "gcp": "`g2-standard-8` or `a2-highgpu-1g`", + } + elif work_type == 'full_tune': + return { + "aws": "`g5.8xlarge`", + "azure": "`Standard_NV36ads_A10_v5`", + "gcp": "`g2-standard-8` or `a2-highgpu-1g`", + } + elif 13 <= model_size < 24: + if work_type == 'model_serving': + return { + "aws": "MULTIGPU_MEDIUM", + "azure": "GPU_LARGE", + } + elif work_type == 'inference': + return { + "aws": "`g5.12xlarge`", + "azure": "`Standard_NV36ads_A10_v5`", + "gcp": "`g2-standard-24`", + } + elif work_type == 'peft': + return { + "aws": "`g5.8xlarge`", + "azure": "`Standard_NV36ads_A10_v5`", + "gcp": "`g2-standard-8` or `a2-highgpu-1g`", + } + elif work_type == 'full_tune': + return { + "aws": "`g5.48xlarge`", + "azure": "`Standard_NC48ads_A100_v4`", + "gcp": "`g2-standard-96` or `a2-highgpu-4g`", + } + elif 24 <= model_size < 41: + if work_type == 'model_serving': + return { + "aws": "MULTIGPU_MEDIUM", + "azure": "GPU_LARGE", + } + elif work_type == 'inference': + return { + "aws": "`g5.12xlarge`", + "azure": "`Standard_NC24ads_A100_v4`", + "gcp": "`g2-standard-48`", + } + elif work_type == 'peft': + return { + "aws": "`g5.12xlarge`", + "azure": "`Standard_NC24ads_A100_v4`", + "gcp": "`a2-ultragpu-1g`", + } + elif work_type == 'full_tune': + return { + "aws": "`p4d.24xlarge`", + "azure": "`Standard_NC96ads_A100_v4`", + "gcp": "`a2-ultragpu-4g`", + } + else: + if work_type == 'model_serving': + return { + "aws": "GPU_LARGE_4", + "azure": "GPU_LARGE_4", + } + elif work_type == 'inference': + return { + "aws": "`p4d.24xlarge`", + "azure": "`Standard_NC24ads_A100_v4`", + "gcp": "`a2-ultragpu-4g`", + } + elif work_type == 'peft': + return { + "aws": "`p4d.24xlarge`", + "azure": "`Standard_NC24ads_A100_v4`", + "gcp": "`a2-ultragpu-4g`", + } + elif work_type == 'full_tune': + return { + "aws": "`p4d.24xlarge`", + "azure": "`Standard_NC96ads_A100_v4`", + "gcp": "`a2-ultragpu-4g`", + } + + +def get_model_info(hf_model_name: str) -> Dict[str, Any]: + """ + Gets the latest revision number for a given model name. + :param hf_model_name: Hugging Face model name. + :return: The latest revision sha. 
+ """ + model_info_dict = {} + hf_model_info = model_info(hf_model_name) + model_info_dict["revision"] = hf_model_info.sha + model_info_dict["model_size"] = hf_model_info.safetensors[ + 'total'] if hf_model_info.safetensors else None + + return model_info_dict + + +def should_generate_example(file_name: str, overwrite: bool) -> bool: + if os.path.exists(file_name) and not overwrite: + print(f"Skipping {file_name} because it already exists.") + return False + return True + +def generate_example_notebook( + model_manifest: Dict[str, Any], + example_folder: str, + overwrite: bool=False, +): + package_loader = TemplateManager.get_filesystem_loader(_SECTIONS_PACKAGE) + template_manager = TemplateManager(package_loader) + + if "prompt_template" in model_manifest: + prompt_template = model_manifest["prompt_template"] + else: + prompt_template = """### Instruction: +{system_prompt} +{instruction} + +### Response:\\n""" + + for example in EXAMPLE_NOTEBOOK_LIST: + file_name = os.path.join(example_folder, example) + if example in FINE_TUNE_EXAMPLES: + model_hf_path = f"{model_manifest['hf_org_name']}/{model_manifest['base_model_name']}" + else: + model_hf_path = f"{model_manifest['hf_org_name']}/{model_manifest['fine_tuned_model_name']}" + model_info_dict = get_model_info(model_hf_path) + model_size = model_info_dict["model_size"] if model_info_dict["model_size"] else model_manifest["model_size"] + if should_generate_example(file_name, overwrite): + _logger.info(f"Generating {example}") + template_manager.dump_template( + f"text_generation/{example}.jinja", + file_name, + model_family_name=model_manifest["model_family_name"], + model_name=model_manifest["fine_tuned_model_name"], + base_model_name=model_manifest["base_model_name"], + fine_tuned_model_name=model_manifest["fine_tuned_model_name"], + hf_org_name=model_manifest["hf_org_name"], + compute_type=inference_instance_type(int(model_size), "inference"), + serving_type=inference_instance_type(int(model_size), "model_serving"), + peft_type=inference_instance_type(int(model_size), "peft"), + pip_requirements=model_manifest["pip_requirements"], + revision=model_info_dict["revision"], + prompt_template=prompt_template, + model_size=model_size, + support_optimized_serving=model_manifest.get("support_optimized_serving", False), + marketplace_link=model_manifest.get("marketplace_link", ""), + support_vllm=model_manifest.get("support_vllm", False), + support_gradient_checkpointing=model_manifest.get("support_gradient_checkpointing", True), + ) + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--model_file", type=str, required=True) + parser.add_argument('--overwrite', default=False, action='store_true') + + args = parser.parse_args() + model_file = args.model_file + overwrite = args.overwrite + + if not os.path.exists(model_file): + raise Exception(f'Model file specified by --model_file {model_file} does not exist.') + with open(model_file, 'r') as models_file: + model_manifest = yaml.safe_load(models_file) + print(model_manifest) + + # Create the folder for the model + example_folder = f"{model_manifest['model_family_name']}/{model_manifest['model_name']}" + os.makedirs(example_folder, exist_ok=True) + + generate_example_notebook(model_manifest, example_folder, overwrite) + + + +if __name__ == "__main__": + main()