From 1adff7479f4df702e5f08d9c928642d563d3144a Mon Sep 17 00:00:00 2001
From: Abhay Gupta <gupta-abhay@users.noreply.github.com>
Date: Wed, 4 Sep 2024 20:25:33 -0700
Subject: [PATCH 1/2] Fix cross attention for blocks (#1512)

---
 llmfoundry/models/layers/blocks.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/llmfoundry/models/layers/blocks.py b/llmfoundry/models/layers/blocks.py
index 82e8e94f74..c88cf33d1b 100644
--- a/llmfoundry/models/layers/blocks.py
+++ b/llmfoundry/models/layers/blocks.py
@@ -170,7 +170,9 @@ def forward(
         extra_kwargs = {}
         if prev_layer_key_value is not None:
             extra_kwargs['prev_layer_key_value'] = prev_layer_key_value
+        if key_value_states is not None:
             extra_kwargs['key_value_states'] = key_value_states
+
         if self.fuse_norm_attn_norm:
             x, m, attn_weights, past_key_value = self.norm_attn_norm(
                 x,
@@ -336,7 +338,9 @@ def forward(
         extra_kwargs = {}
         if prev_layer_key_value is not None:
             extra_kwargs['prev_layer_key_value'] = prev_layer_key_value
+        if key_value_states is not None:
             extra_kwargs['key_value_states'] = key_value_states
+
         b, attn_weights, past_key_value = self.attn(
             a,
             past_key_value=past_key_value,

From e8eca4fa83f3fec69ad482465f839fb7dcfbfb0d Mon Sep 17 00:00:00 2001
From: Daniel King <43149077+dakinggg@users.noreply.github.com>
Date: Thu, 5 Sep 2024 01:03:16 -0700
Subject: [PATCH 2/2] Put 2.3 image back in release examples (#1513)

---
 mcli/mcli-1b-eval.yaml                | 2 +-
 mcli/mcli-1b-max-seq-len-8k.yaml      | 2 +-
 mcli/mcli-1b.yaml                     | 2 +-
 mcli/mcli-benchmark-mpt.yaml          | 2 +-
 mcli/mcli-convert-composer-to-hf.yaml | 2 +-
 mcli/mcli-hf-eval.yaml                | 2 +-
 mcli/mcli-hf-generate.yaml            | 2 +-
 mcli/mcli-llama2-finetune.yaml        | 2 +-
 mcli/mcli-openai-eval.yaml            | 2 +-
 mcli/mcli-pretokenize-oci-upload.yaml | 2 +-
 10 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/mcli/mcli-1b-eval.yaml b/mcli/mcli-1b-eval.yaml
index 2f48fa5ce1..4fcf8b3cb9 100644
--- a/mcli/mcli-1b-eval.yaml
+++ b/mcli/mcli-1b-eval.yaml
@@ -9,7 +9,7 @@ integrations:
 command: |
   cd llm-foundry/scripts/
   composer eval/eval.py /mnt/config/parameters.yaml
-image: mosaicml/llm-foundry:2.4.0_cu124-latest
+image: mosaicml/llm-foundry:2.3.1_cu121-latest
 name: mpt-1b-eval
 
 compute:
diff --git a/mcli/mcli-1b-max-seq-len-8k.yaml b/mcli/mcli-1b-max-seq-len-8k.yaml
index bb83e2061d..fb96c576e0 100644
--- a/mcli/mcli-1b-max-seq-len-8k.yaml
+++ b/mcli/mcli-1b-max-seq-len-8k.yaml
@@ -17,7 +17,7 @@ command: |
     --out_root ./my-copy-c4 --splits train_small val_small \
     --concat_tokens 8192 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>'
   composer train/train.py /mnt/config/parameters.yaml
-image: mosaicml/llm-foundry:2.4.0_cu124-latest
+image: mosaicml/llm-foundry:2.3.1_cu121-latest
 name: mpt-1b-ctx-8k-gpus-8
 
 compute:
diff --git a/mcli/mcli-1b.yaml b/mcli/mcli-1b.yaml
index f371051ca0..26255977f4 100644
--- a/mcli/mcli-1b.yaml
+++ b/mcli/mcli-1b.yaml
@@ -21,7 +21,7 @@ command: |
     eval_loader.dataset.split=val_small \
     max_duration=100ba \
     eval_interval=0
-image: mosaicml/llm-foundry:2.4.0_cu124-latest
+image: mosaicml/llm-foundry:2.3.1_cu121-latest
 name: mpt-1b-gpus-8
 
 compute:
diff --git a/mcli/mcli-benchmark-mpt.yaml b/mcli/mcli-benchmark-mpt.yaml
index b15f3b7eea..3995598fd3 100644
--- a/mcli/mcli-benchmark-mpt.yaml
+++ b/mcli/mcli-benchmark-mpt.yaml
@@ -6,7 +6,7 @@ compute:
   # cluster: TODO # Name of the cluster to use for this run
   # gpu_type: a100_80gb # Type of GPU to use. We use a100_80gb in our experiments
 
-image: mosaicml/llm-foundry:2.4.0_cu124-latest
+image: mosaicml/llm-foundry:2.3.1_cu121-latest
 
 integrations:
 - integration_type: git_repo
diff --git a/mcli/mcli-convert-composer-to-hf.yaml b/mcli/mcli-convert-composer-to-hf.yaml
index 9c5d960a95..7b715f6792 100644
--- a/mcli/mcli-convert-composer-to-hf.yaml
+++ b/mcli/mcli-convert-composer-to-hf.yaml
@@ -13,7 +13,7 @@ command: |
     --hf_output_path s3://bucket/folder/hf/ \
     --output_precision bf16 \
 
-image: mosaicml/llm-foundry:2.4.0_cu124-latest
+image: mosaicml/llm-foundry:2.3.1_cu121-latest
 name: convert-composer-hf
 
 compute:
diff --git a/mcli/mcli-hf-eval.yaml b/mcli/mcli-hf-eval.yaml
index 5f6b6c564f..27f5938d67 100644
--- a/mcli/mcli-hf-eval.yaml
+++ b/mcli/mcli-hf-eval.yaml
@@ -16,7 +16,7 @@ gpu_num: 8
 # gpu_type:
 # cluster:  # replace with your cluster here!
 
-image: mosaicml/llm-foundry:2.4.0_cu124-latest
+image: mosaicml/llm-foundry:2.3.1_cu121-latest
 
 # The below is injected as a YAML file: /mnt/config/parameters.yaml
 parameters:
diff --git a/mcli/mcli-hf-generate.yaml b/mcli/mcli-hf-generate.yaml
index dfb9763462..cb3040e4ee 100644
--- a/mcli/mcli-hf-generate.yaml
+++ b/mcli/mcli-hf-generate.yaml
@@ -35,7 +35,7 @@ command: |
       "Here's a quick recipe for baking chocolate chip cookies: Start by" \
       "The best 5 cities to visit in Europe are"
 
-image: mosaicml/llm-foundry:2.4.0_cu124-latest
+image: mosaicml/llm-foundry:2.3.1_cu121-latest
 name: hf-generate
 
 compute:
diff --git a/mcli/mcli-llama2-finetune.yaml b/mcli/mcli-llama2-finetune.yaml
index 32e8cddbda..7134e6204c 100644
--- a/mcli/mcli-llama2-finetune.yaml
+++ b/mcli/mcli-llama2-finetune.yaml
@@ -9,7 +9,7 @@ integrations:
 command: |
   cd llm-foundry/scripts
   composer train/train.py /mnt/config/parameters.yaml
-image: mosaicml/llm-foundry:2.4.0_cu124-latest
+image: mosaicml/llm-foundry:2.3.1_cu121-latest
 name: llama2-finetune
 
 compute:
diff --git a/mcli/mcli-openai-eval.yaml b/mcli/mcli-openai-eval.yaml
index 4b69827d69..cd04d89f4e 100644
--- a/mcli/mcli-openai-eval.yaml
+++ b/mcli/mcli-openai-eval.yaml
@@ -16,7 +16,7 @@ gpu_num:  #
 gpu_type:  #
 cluster:  # replace with your cluster here!
 
-image: mosaicml/llm-foundry:2.4.0_cu124-latest
+image: mosaicml/llm-foundry:2.3.1_cu121-latest
 
 # The below is injected as a YAML file: /mnt/config/parameters.yaml
 parameters:
diff --git a/mcli/mcli-pretokenize-oci-upload.yaml b/mcli/mcli-pretokenize-oci-upload.yaml
index fafb251aee..5425ce9897 100644
--- a/mcli/mcli-pretokenize-oci-upload.yaml
+++ b/mcli/mcli-pretokenize-oci-upload.yaml
@@ -1,5 +1,5 @@
 name: c4-2k-pre-tokenized
-image: mosaicml/llm-foundry:2.4.0_cu124-latest
+image: mosaicml/llm-foundry:2.3.1_cu121-latest
 compute:
   gpus: 8  # Number of GPUs to use