From 3eb146f4eaf5cb68e09b31a2bd8c908d3f195c59 Mon Sep 17 00:00:00 2001 From: Crystalcareai <162942000+Crystalcareai@users.noreply.github.com> Date: Wed, 3 Jul 2024 13:45:48 -0500 Subject: [PATCH 01/27] Add support for Internlm2 (#362) --- mergekit/_data/architectures/internlm2.json | 50 +++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 mergekit/_data/architectures/internlm2.json diff --git a/mergekit/_data/architectures/internlm2.json b/mergekit/_data/architectures/internlm2.json new file mode 100644 index 00000000..057bc649 --- /dev/null +++ b/mergekit/_data/architectures/internlm2.json @@ -0,0 +1,50 @@ +{ + "model_type": "internlm2", + "architectures": [ + "InternLM2ForCausalLM" + ], + "pre_weights": [ + { + "name": "model.tok_embeddings.weight", + "is_embed": true + } + ], + "post_weights": [ + { + "name": "model.norm.weight" + }, + { + "name": "output.weight", + "is_embed": true, + "aliases": [ + "model.tok_embeddings.weight" + ] + } + ], + "num_layers_config_key": "num_hidden_layers", + "layer_templates": { + "weights": [ + { + "name": "model.layers.${layer_index}.attention_norm.weight" + }, + { + "name": "model.layers.${layer_index}.ffn_norm.weight" + }, + { + "name": "model.layers.${layer_index}.attention.wqkv.weight" + }, + { + "name": "model.layers.${layer_index}.attention.wo.weight" + }, + { + "name": "model.layers.${layer_index}.feed_forward.w1.weight" + }, + { + "name": "model.layers.${layer_index}.feed_forward.w2.weight" + }, + { + "name": "model.layers.${layer_index}.feed_forward.w3.weight" + } + ] + } +} From 4c3532cd1f7a21bfefe032212c8cd50e5e685ac2 Mon Sep 17 00:00:00 2001 From: "Charles O. Goddard" Date: Mon, 15 Jul 2024 12:59:06 -0700 Subject: [PATCH 02/27] Tokenizer merging overhaul (#334) Rewrite the tokenizer merging logic to support all merge methods and allow more customization of behavior. The previous implementation of tokenizer merging always used either linear or slerp to combine the embedding/LM head parameters. This was to avoid the complexity that would be required to make all merge methods support tensors that potentially have invalid or masked out values. It works okay for some cases but wasn't a general solution. In this implementation, instead of overriding the merge method for embed/lm_head a preprocessing step remaps them to the vocabulary used by the output model. These (now appropriately sized and ordered) tensors are then merged normally. The selection of embedding values for tokens not normally present in a model is where things get slightly tricky. By default a set of heuristics that I think are sane are applied. For a given token and model, if the token is not present in the model's original tokenizer: * If the base model has this token present, the base model's embedding is used * If only one model in the merge has the token, that model's embedding is used * Otherwise, the average of all embeddings for the token is assumed as a default value This can also be overridden on a per-token level. For example: ```yaml merge_method: dare_ties base_model: ... 
models: - model: some_chatml_model - model: some_weird_model - model: some_model tokenizer: source: union tokens: # if model doesn't have <|im_start|>, use embedding from some_chatml_model <|im_start|>: source: some_chatml_model # use embedding of <|special|> from some_weird_model for *all* models <|special|>: source: some_weird_model force: true # output tokenizer will have <|renamed_token|> with embedding of <|original_token|> # from some_model <|renamed_token|>: source: kind: model_token model: some_model token: <|original_token|> force: true ``` A practical example would be for merging two Llama 3 models, one using the Llama 3 Instruct prompt format and one using chatml, trying to preserve the ability to use both formats: ```yaml tokenizer: source: union tokens: <|im_start|>: source: chatml_model <|im_end|>: source: chatml_model <|start_header_id|>: source: llama3_model force: true <|end_header_id|>: source: llama3_model force: true <|eot_id|>: source: llama3_model force: true ``` --- mergekit/config.py | 14 +- mergekit/merge_methods/base.py | 8 +- .../generalized_task_arithmetic.py | 11 +- mergekit/merge_methods/linear.py | 11 +- mergekit/merge_methods/model_stock.py | 11 +- mergekit/merge_methods/passthrough.py | 11 +- mergekit/merge_methods/slerp.py | 11 +- mergekit/merge_methods/tokenizer_permute.py | 11 +- mergekit/plan.py | 32 ++- mergekit/tokenizer/__init__.py | 20 ++ mergekit/{tokenizer.py => tokenizer/build.py} | 57 +++--- mergekit/tokenizer/config.py | 51 +++++ mergekit/tokenizer/embed.py | 182 ++++++++++++++++++ pyproject.toml | 2 + tests/test_tokenizer.py | 145 ++++++++++++-- 15 files changed, 498 insertions(+), 79 deletions(-) create mode 100644 mergekit/tokenizer/__init__.py rename mergekit/{tokenizer.py => tokenizer/build.py} (87%) create mode 100644 mergekit/tokenizer/config.py create mode 100644 mergekit/tokenizer/embed.py diff --git a/mergekit/config.py b/mergekit/config.py index 28999a00..9a5e8efd 100644 --- a/mergekit/config.py +++ b/mergekit/config.py @@ -17,9 +17,10 @@ import yaml from pydantic import BaseModel, model_validator -from typing_extensions import TypeAlias +from typing_extensions import Literal, TypeAlias from mergekit.common import ModelReference +from mergekit.tokenizer.config import TokenizerConfig ScalarOrGradient: TypeAlias = Union[float, List[float]] @@ -88,7 +89,10 @@ class MergeConfiguration(BaseModel): parameters: Optional[Dict[str, ParameterSetting]] = None base_model: Optional[ModelReference] = None dtype: Optional[str] = None - tokenizer_source: Optional[str] = None + tokenizer_source: Union[ + Literal["union"], Literal["base"], ModelReference, None + ] = None + tokenizer: Optional[TokenizerConfig] = None out_dtype: Optional[str] = None def referenced_models(self) -> List[ModelReference]: @@ -110,6 +114,12 @@ def validate_inputs(self): raise RuntimeError("Must specify either output slices or models to merge") return self + @model_validator(mode="after") + def validate_tokenizer(self): + if self.tokenizer_source and self.tokenizer: + raise RuntimeError("Cannot specify both tokenizer_source and tokenizer") + return self + def to_yaml(self) -> str: return yaml.dump( self.model_dump(exclude_defaults=True, mode="json"), diff --git a/mergekit/merge_methods/base.py b/mergekit/merge_methods/base.py index 853fbf31..917ed089 100644 --- a/mergekit/merge_methods/base.py +++ b/mergekit/merge_methods/base.py @@ -14,14 +14,18 @@ # along with this program. If not, see http://www.gnu.org/licenses/. 
from abc import ABC, abstractmethod -from typing import Any, List, Optional +from typing import Any, List, Optional, Union from pydantic import BaseModel +from typing_extensions import TypeAlias from mergekit.architecture import WeightInfo from mergekit.common import ImmutableMap, ModelReference from mergekit.graph import Task from mergekit.io.tasks import GatherTensors +from mergekit.tokenizer import PermutedEmbeddings + +MergeTensorInput: TypeAlias = Union[GatherTensors, PermutedEmbeddings] class ConfigParameterDef(BaseModel): @@ -42,7 +46,7 @@ def make_task( self, *, output_weight: WeightInfo, - tensors: GatherTensors, + tensors: MergeTensorInput, parameters: ImmutableMap[str, Any], tensor_parameters: ImmutableMap[ModelReference, ImmutableMap[str, Any]], base_model: Optional[ModelReference], diff --git a/mergekit/merge_methods/generalized_task_arithmetic.py b/mergekit/merge_methods/generalized_task_arithmetic.py index 02c1277f..af09c8bb 100644 --- a/mergekit/merge_methods/generalized_task_arithmetic.py +++ b/mergekit/merge_methods/generalized_task_arithmetic.py @@ -24,8 +24,11 @@ from mergekit.architecture import WeightInfo from mergekit.common import ImmutableMap, ModelReference from mergekit.graph import Task -from mergekit.io.tasks import GatherTensors -from mergekit.merge_methods.base import ConfigParameterDef, MergeMethod +from mergekit.merge_methods.base import ( + ConfigParameterDef, + MergeMethod, + MergeTensorInput, +) from mergekit.sparsify import SparsificationMethod, sparsify @@ -68,7 +71,7 @@ def tensor_parameters(self) -> List[ConfigParameterDef]: def make_task( self, output_weight: WeightInfo, - tensors: GatherTensors, + tensors: MergeTensorInput, base_model: Optional[ModelReference], parameters: ImmutableMap[str, Any], tensor_parameters: ImmutableMap[ModelReference, ImmutableMap[str, Any]], @@ -87,7 +90,7 @@ def make_task( class GTATask(Task[torch.Tensor]): method: GeneralizedTaskArithmeticMerge - tensors: GatherTensors + tensors: MergeTensorInput base_model: ModelReference weight_info: WeightInfo tensor_parameters: ImmutableMap[ModelReference, Any] diff --git a/mergekit/merge_methods/linear.py b/mergekit/merge_methods/linear.py index 81826a97..48224bb8 100644 --- a/mergekit/merge_methods/linear.py +++ b/mergekit/merge_methods/linear.py @@ -20,13 +20,16 @@ from mergekit.architecture import WeightInfo from mergekit.common import ImmutableMap, ModelReference from mergekit.graph import Task -from mergekit.io.tasks import GatherTensors -from mergekit.merge_methods.base import ConfigParameterDef, MergeMethod +from mergekit.merge_methods.base import ( + ConfigParameterDef, + MergeMethod, + MergeTensorInput, +) from mergekit.merge_methods.rectify_embed import rectify_embed_sizes class LinearMergeTask(Task[torch.Tensor]): - gather_tensors: GatherTensors + gather_tensors: MergeTensorInput tensor_parameters: ImmutableMap[ModelReference, ImmutableMap[str, Any]] normalize: bool weight_info: WeightInfo @@ -81,7 +84,7 @@ def make_task( self, *, output_weight: WeightInfo, - tensors: GatherTensors, + tensors: MergeTensorInput, parameters: Dict[str, Any], tensor_parameters: ImmutableMap[ModelReference, ImmutableMap[str, Any]], **_kwargs, diff --git a/mergekit/merge_methods/model_stock.py b/mergekit/merge_methods/model_stock.py index 5130f3ea..94b1e05b 100644 --- a/mergekit/merge_methods/model_stock.py +++ b/mergekit/merge_methods/model_stock.py @@ -21,13 +21,16 @@ from mergekit.architecture import WeightInfo from mergekit.common import ImmutableMap, ModelReference from mergekit.graph 
import Task -from mergekit.io.tasks import GatherTensors -from mergekit.merge_methods.base import ConfigParameterDef, MergeMethod +from mergekit.merge_methods.base import ( + ConfigParameterDef, + MergeMethod, + MergeTensorInput, +) from mergekit.merge_methods.rectify_embed import rectify_embed_sizes class ModelStockMergeTask(Task[torch.Tensor]): - gather_tensors: GatherTensors + gather_tensors: MergeTensorInput base_model: ModelReference weight_info: WeightInfo filter_wise: bool = False @@ -120,7 +123,7 @@ def make_task( self, *, output_weight: WeightInfo, - tensors: GatherTensors, + tensors: MergeTensorInput, base_model: Optional[ModelReference], parameters: ImmutableMap[str, Any], **_kwargs, diff --git a/mergekit/merge_methods/passthrough.py b/mergekit/merge_methods/passthrough.py index 8e4ba14e..62b0bf12 100644 --- a/mergekit/merge_methods/passthrough.py +++ b/mergekit/merge_methods/passthrough.py @@ -19,12 +19,15 @@ from mergekit.common import ImmutableMap, ModelReference from mergekit.graph import Task -from mergekit.io.tasks import GatherTensors -from mergekit.merge_methods.base import ConfigParameterDef, MergeMethod +from mergekit.merge_methods.base import ( + ConfigParameterDef, + MergeMethod, + MergeTensorInput, +) class PassthroughMergeTask(Task[torch.Tensor]): - gather_tensors: GatherTensors + gather_tensors: MergeTensorInput tensor_parameters: ImmutableMap[ModelReference, ImmutableMap[str, Any]] def arguments(self) -> Dict[str, Task]: @@ -52,7 +55,7 @@ def tensor_parameters(self) -> List[ConfigParameterDef]: def make_task( self, *, - tensors: GatherTensors, + tensors: MergeTensorInput, tensor_parameters: ImmutableMap[ModelReference, ImmutableMap[str, Any]], **kwargs, ) -> Task: diff --git a/mergekit/merge_methods/slerp.py b/mergekit/merge_methods/slerp.py index dd89d09e..d33dd5a9 100644 --- a/mergekit/merge_methods/slerp.py +++ b/mergekit/merge_methods/slerp.py @@ -21,13 +21,16 @@ from mergekit.architecture import WeightInfo from mergekit.common import ImmutableMap, ModelReference from mergekit.graph import Task -from mergekit.io.tasks import GatherTensors -from mergekit.merge_methods.base import ConfigParameterDef, MergeMethod +from mergekit.merge_methods.base import ( + ConfigParameterDef, + MergeMethod, + MergeTensorInput, +) from mergekit.merge_methods.rectify_embed import rectify_embed_sizes class SlerpTask(Task[torch.Tensor]): - gather_tensors: GatherTensors + gather_tensors: MergeTensorInput base_model: ModelReference t: float weight_info: WeightInfo @@ -75,7 +78,7 @@ def make_task( self, *, output_weight: WeightInfo, - tensors: GatherTensors, + tensors: MergeTensorInput, parameters: ImmutableMap[str, Any], base_model: Optional[ModelReference], **_kwargs, diff --git a/mergekit/merge_methods/tokenizer_permute.py b/mergekit/merge_methods/tokenizer_permute.py index 208fb589..07c6f9c5 100644 --- a/mergekit/merge_methods/tokenizer_permute.py +++ b/mergekit/merge_methods/tokenizer_permute.py @@ -20,15 +20,18 @@ from mergekit.common import ImmutableMap, ModelReference from mergekit.graph import Task -from mergekit.io.tasks import GatherTensors -from mergekit.merge_methods.base import ConfigParameterDef, MergeMethod +from mergekit.merge_methods.base import ( + ConfigParameterDef, + MergeMethod, + MergeTensorInput, +) from mergekit.merge_methods.slerp import slerp from mergekit.tokenizer import BuildTokenizer, TokenizerInfo class TokenizerPermutationMergeTask(Task[torch.Tensor]): tokenizer_task: BuildTokenizer - gather_tensors: GatherTensors + gather_tensors: MergeTensorInput 
base_model: Optional[ModelReference] use_slerp: bool slerp_t: Optional[float] @@ -134,7 +137,7 @@ def tensor_parameters(self) -> List[ConfigParameterDef]: def make_task( self, *, - tensors: GatherTensors, + tensors: MergeTensorInput, parameters: Dict[str, Any], tensor_parameters: ImmutableMap[ModelReference, ImmutableMap[str, Any]], base_model: Optional[ModelReference], diff --git a/mergekit/plan.py b/mergekit/plan.py index 7bed6032..bdcd7004 100644 --- a/mergekit/plan.py +++ b/mergekit/plan.py @@ -40,9 +40,8 @@ TensorWriterTask, ) from mergekit.merge_methods import MergeMethod -from mergekit.merge_methods.tokenizer_permute import TokenizerPermutationMerge from mergekit.options import MergeOptions -from mergekit.tokenizer import BuildTokenizer +from mergekit.tokenizer import BuildTokenizer, PermutedEmbeddings class MergePlanner: @@ -68,12 +67,18 @@ def __init__( self.out_model_config = out_model_config self._method = merge_methods.get(config.merge_method) - if config.tokenizer_source: + token_cfg = {} + tokenizer_source = config.tokenizer_source + if config.tokenizer is not None: + token_cfg = config.tokenizer.tokens or {} + tokenizer_source = config.tokenizer.source + if tokenizer_source is not None: self._tokenizer_task = BuildTokenizer( base_model=config.base_model, referenced_models=tuple(config.referenced_models()), - tokenizer_source=config.tokenizer_source, + tokenizer_source=tokenizer_source, trust_remote_code=options.trust_remote_code, + add_tokens=tuple(token_cfg.keys()), ) @lru_cache @@ -143,11 +148,6 @@ def plan_tensor( return tensor_merge_method = self._method - if self._tokenizer_task and weight.is_embed: - tensor_merge_method = TokenizerPermutationMerge( - tokenizer_task=self._tokenizer_task - ) - cfg_g = cfg_reader.for_tensor(weight.name) global_params = {} for p in tensor_merge_method.parameters(): @@ -176,9 +176,21 @@ def plan_tensor( device="cuda" if self.options.read_to_gpu else None, ) + tensor_input_task = gather_tensors + if self._tokenizer_task and weight.is_embed: + token_cfg = {} + if cfg_reader.config.tokenizer: + token_cfg = cfg_reader.config.tokenizer.tokens + tensor_input_task = PermutedEmbeddings( + gather_tensors=gather_tensors, + tokenizer_task=self._tokenizer_task, + tokens=token_cfg, + base_model=base_model, + ) + tensor_task = tensor_merge_method.make_task( output_weight=weight, - tensors=gather_tensors, + tensors=tensor_input_task, parameters=ImmutableMap(data=global_params), tensor_parameters=ImmutableMap( data={ diff --git a/mergekit/tokenizer/__init__.py b/mergekit/tokenizer/__init__.py new file mode 100644 index 00000000..cff42a46 --- /dev/null +++ b/mergekit/tokenizer/__init__.py @@ -0,0 +1,20 @@ +# Copyright (C) 2024 Charles O. Goddard +# +# This software is free software: you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This software is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see http://www.gnu.org/licenses/. 
+ +from mergekit.tokenizer.build import BuildTokenizer, TokenizerInfo +from mergekit.tokenizer.config import TokenizerConfig +from mergekit.tokenizer.embed import PermutedEmbeddings + +__all__ = ["BuildTokenizer", "TokenizerInfo", "TokenizerConfig", "PermutedEmbeddings"] diff --git a/mergekit/tokenizer.py b/mergekit/tokenizer/build.py similarity index 87% rename from mergekit/tokenizer.py rename to mergekit/tokenizer/build.py index a3a0f858..fb9f9d9c 100644 --- a/mergekit/tokenizer.py +++ b/mergekit/tokenizer/build.py @@ -16,14 +16,14 @@ import json import logging import tempfile -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Optional, Tuple, Union import tokenizers import tokenizers.models -import torch import tqdm import transformers from pydantic import BaseModel +from typing_extensions import Literal from mergekit.common import ModelPath, ModelReference from mergekit.graph import Task @@ -169,12 +169,19 @@ def build_union_tokenizer( return res +class TokenizerInfo(BaseModel, arbitrary_types_allowed=True): + tokenizer: transformers.PreTrainedTokenizerBase + permutations: Dict[ModelReference, Dict[int, int]] + original_vocabs: Dict[ModelReference, Dict[str, int]] + + def build_tokenizer( base_model: Optional[ModelReference], referenced_models: List[ModelReference], - tokenizer_source: str, + tokenizer_source: Union[Literal["union"], Literal["base"], ModelReference], trust_remote_code: bool, -) -> Tuple[transformers.PreTrainedTokenizer, Dict[ModelReference, torch.IntTensor]]: + add_tokens: Optional[List[str]] = None, +) -> TokenizerInfo: if base_model is None: base_model = referenced_models[0] if base_model is None: @@ -208,21 +215,25 @@ def build_tokenizer( logging.info("Building output tokenizer") # build final vocabulary - if tokenizer_source == "base": + if isinstance(tokenizer_source, ModelReference): + tokenizer_out = transformers.AutoTokenizer.from_pretrained( + tokenizer_source.model.path, + revision=tokenizer_source.model.revision, + trust_remote_code=trust_remote_code, + ) + elif tokenizer_source == "base": # it done tokenizer_out = tokenizer_base elif tokenizer_source == "union": tokenizer_out = build_union_tokenizer( tokenizer_base, tokenizers, trust_remote_code=trust_remote_code ) - elif tokenizer_source.startswith("model:"): - tokenizer_out = transformers.AutoTokenizer.from_pretrained( - tokenizer_source[len("model:") :], - trust_remote_code=trust_remote_code, - ) else: raise RuntimeError(f"Unimplemented tokenizer source: {tokenizer_source}") + for tok in add_tokens: + tokenizer_out.add_tokens(tok) + vocab_out = tokenizer_out.get_vocab() logging.info("Building permutations") @@ -259,28 +270,28 @@ def build_tokenizer( del pbar - return tokenizer_out, permutations - - -class TokenizerInfo(BaseModel, arbitrary_types_allowed=True): - tokenizer: transformers.PreTrainedTokenizerBase - permutations: Optional[Dict[ModelReference, Dict[int, int]]] + return TokenizerInfo( + tokenizer=tokenizer_out, + permutations=permutations, + original_vocabs={model: tok.get_vocab() for model, tok in tokenizers.items()}, + ) class BuildTokenizer(Task[TokenizerInfo]): base_model: Optional[ModelReference] referenced_models: Tuple[ModelReference, ...] 
- tokenizer_source: str + tokenizer_source: Union[Literal["union"], Literal["base"], ModelReference] + add_tokens: Optional[Tuple[str, ...]] trust_remote_code: bool = False def arguments(self) -> Dict[str, Task]: return {} def execute(self, **_kwargs) -> TokenizerInfo: - tokenizer, permutations = build_tokenizer( - self.base_model, - self.referenced_models, - self.tokenizer_source, - self.trust_remote_code, + return build_tokenizer( + base_model=self.base_model, + referenced_models=self.referenced_models, + tokenizer_source=self.tokenizer_source, + trust_remote_code=self.trust_remote_code, + add_tokens=self.add_tokens, ) - return TokenizerInfo(tokenizer=tokenizer, permutations=permutations) diff --git a/mergekit/tokenizer/config.py b/mergekit/tokenizer/config.py new file mode 100644 index 00000000..94208385 --- /dev/null +++ b/mergekit/tokenizer/config.py @@ -0,0 +1,51 @@ +# Copyright (C) 2024 Charles O. Goddard +# +# This software is free software: you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This software is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see http://www.gnu.org/licenses/. + +from typing import Dict, Optional, Union + +import pydantic +from pydantic import BaseModel +from typing_extensions import Literal + +from mergekit.common import ModelReference + + +class ModelTokenEmbedding(BaseModel, frozen=True): + kind: Literal["model_token"] + model: ModelReference + token_id: Optional[int] = None + token: Optional[str] = None + + @pydantic.model_validator(mode="after") + def validate_token(self): + if self.token_id is None and self.token is None: + raise ValueError("token_id or token must be specified") + if self.token_id is not None and self.token is not None: + raise ValueError("only one of token_id or token may be specified") + return self + + +class ZeroEmbedding(BaseModel, frozen=True): + kind: Literal["zero"] + + +class TokenEmbeddingConfig(BaseModel, frozen=True): + source: Union[ModelTokenEmbedding, ZeroEmbedding, ModelReference, None] = None + force: bool = False + + +class TokenizerConfig(BaseModel, frozen=True): + source: Union[ModelReference, Literal["union"], Literal["base"]] = "union" + tokens: Optional[Dict[str, TokenEmbeddingConfig]] = None diff --git a/mergekit/tokenizer/embed.py b/mergekit/tokenizer/embed.py new file mode 100644 index 00000000..3cdb1840 --- /dev/null +++ b/mergekit/tokenizer/embed.py @@ -0,0 +1,182 @@ +# Copyright (C) 2024 Charles O. Goddard +# +# This software is free software: you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This software is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. 
If not, see http://www.gnu.org/licenses/. + +import logging +from typing import Dict, Optional + +import torch + +from mergekit.common import ImmutableMap, ModelReference +from mergekit.graph import Task +from mergekit.io.tasks import GatherTensors +from mergekit.tokenizer.build import BuildTokenizer, TokenizerInfo +from mergekit.tokenizer.config import ( + ModelTokenEmbedding, + TokenEmbeddingConfig, + ZeroEmbedding, +) + + +class PermutedEmbeddings(Task[Dict[ModelReference, torch.Tensor]]): + gather_tensors: GatherTensors + tokenizer_task: BuildTokenizer + tokens: Optional[ImmutableMap[str, TokenEmbeddingConfig]] + base_model: Optional[ModelReference] + + def arguments(self) -> Dict[str, Task]: + return {"tokenizer_info": self.tokenizer_task, "tensors": self.gather_tensors} + + def execute( + self, tokenizer_info: TokenizerInfo, tensors: Dict[ModelReference, torch.Tensor] + ) -> Dict[ModelReference, torch.Tensor]: + tokenizer = tokenizer_info.tokenizer + permutations = tokenizer_info.permutations + + models = set(tensors.keys()) + if self.base_model: + models.add(self.base_model) + models = list(models) + + vocab = tokenizer.get_vocab() + vocab_size = len(vocab) + embed_size = tensors[models[0]].shape[1] + assert all( + t.shape[1] == embed_size for t in tensors.values() + ), "Embedding sizes must match" + + dtype = tensors[models[0]].dtype + device = tensors[models[0]].device + + token_configs = dict(**self.tokens) or {} + tokens_to_average = self.assign_embedding_sources( + permutations, models, vocab, token_configs + ) + + default_embeds = {} + for token, token_id in vocab.items(): + embed = torch.zeros(embed_size, dtype=dtype, device=device) + if token in tokens_to_average: + count = 0 + for model in models: + p = permutations[model] + if p[token_id] < 0: + continue + embed += tensors[model][p[token_id]] + count += 1 + embed /= count + elif cfg := token_configs.get(token, None): + cfg: TokenEmbeddingConfig + embed = self.compute_default_embedding( + tokenizer_info, tensors, permutations, token, token_id, cfg + ) + else: + continue + default_embeds[token] = embed + + result = {} + for model in models: + p = permutations[model] + old_embed = tensors[model] + new_embed = torch.zeros( + (vocab_size, embed_size), dtype=dtype, device=device + ) + for token, token_id in vocab.items(): + force = False + if token in token_configs: + force = token_configs[token].force + + if p[token_id] >= 0 and not force: + new_embed[token_id, :] = old_embed[p[token_id]] + elif token in default_embeds: + new_embed[token_id, :] = default_embeds[token] + else: + logging.error( + f"No embedding for token {repr(token)} in model {model}!" 
+ ) + result[model] = new_embed + + return result + + def assign_embedding_sources( + self, + permutations: Dict[ModelReference, Dict[int, int]], + models: list[ModelReference], + vocab: Dict[str, int], + token_configs: Dict[str, TokenEmbeddingConfig], + ): + permutation_list = [permutations[model] for model in models] + + tokens_to_average = set() + # find tokens that are only present in one model + for token, token_id in vocab.items(): + if token in token_configs: + continue + + has_token = [p[token_id] >= 0 for p in permutation_list] + num_present = sum(int(x) for x in has_token) + if num_present == 1: + donor_model = models[has_token.index(True)] + token_configs[token] = TokenEmbeddingConfig(source=donor_model) + continue + + if num_present == 0: + token_configs[token] = TokenEmbeddingConfig(source=ZeroEmbedding()) + logging.warning(f"Token {repr(token)} not found in any model") + continue + + if num_present > 0 and self.base_model is not None: + if permutations[self.base_model][token_id] >= 0: + token_configs[token] = TokenEmbeddingConfig(source=self.base_model) + continue + + tokens_to_average.add(token) + return tokens_to_average + + def compute_default_embedding( + self, + tokenizer_info: TokenizerInfo, + tensors: Dict[ModelReference, torch.Tensor], + permutations: Dict[ModelReference, Dict[int, int]], + token: str, + token_id: int, + cfg: TokenEmbeddingConfig, + ) -> torch.Tensor: + if isinstance(cfg.source, ZeroEmbedding): + pass + elif isinstance(cfg.source, ModelTokenEmbedding): + model = cfg.source.model + assert ( + model in permutations + ), f"Model {model} referenced but not part of merge" + p = permutations[model] + src_token_id = cfg.source.token_id + if src_token_id is None: + src_token = cfg.source.token + assert ( + src_token in tokenizer_info.original_vocabs[model] + ), f"Token {repr(src_token)} not found in model {model}" + src_token_id = tokenizer_info.original_vocabs[model][src_token] + assert ( + src_token_id >= 0 and src_token_id < tensors[model].shape[0] + ), f"Token ID {src_token_id} out of range for model {model}" + embed = tensors[model][src_token_id] + elif isinstance(cfg.source, ModelReference): + model = cfg.source + p = permutations[model] + assert p[token_id] >= 0, f"Token {repr(token)} not found in model {model}" + embed = tensors[model][p[token_id]] + else: + raise NotImplementedError(cfg) + return embed diff --git a/pyproject.toml b/pyproject.toml index 7cf524a8..b612908e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -72,5 +72,7 @@ include = '\.pyi?$' minversion = "6.0" filterwarnings = [ "ignore::pydantic.PydanticDeprecatedSince20:huggingface_hub.*:", + "ignore::FutureWarning:huggingface_hub.*:", + "ignore:(read_text|open_text|contents) is deprecated:DeprecationWarning", # yes i know, but files() doesn't exist in 3.8 ] testpaths = ["tests"] diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index 93b33925..17fafcc8 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -1,14 +1,17 @@ import json import os import tempfile -from typing import List, Optional, Union +from typing import Dict, List, Optional, Union import pytest import tokenizers +import torch from common import make_picollama, run_and_check_merge from transformers import LlamaTokenizerFast, PreTrainedTokenizerBase -from mergekit.config import InputModelDefinition, MergeConfiguration, ParameterSetting +from mergekit.config import InputModelDefinition, MergeConfiguration +from mergekit.io import LazyTensorLoader +from mergekit.tokenizer import 
TokenizerConfig @pytest.fixture(scope="session") @@ -87,6 +90,23 @@ def _cb(model_path: str): return _cb +class ModelEmbeddings: + embed_tokens: torch.Tensor + vocab: Dict[str, int] + + def __init__(self, model_path: str): + tokenizer = LlamaTokenizerFast.from_pretrained(model_path) + loader = LazyTensorLoader.from_disk(model_path) + self.embed_tokens = loader.get_tensor("model.embed_tokens.weight") + self.vocab = tokenizer.get_vocab() + + def token_embedding(self, token: str) -> Optional[torch.Tensor]: + idx = self.vocab.get(token) + if idx is None: + return None + return self.embed_tokens[idx, :] + + class TestTokenizerMerges: def test_legacy_mode(self, model_base: str, model_padded: str, model_chatml: str): config = self.make_config( @@ -115,23 +135,39 @@ def test_source_union(self, model_base: str, model_padded: str, model_chatml: st tokenizer_source="union", ) - # output should have all tokens used by any model - # but not include any unused tokens - run_and_check_merge( - config, - validate=check_tokenizer( + def _check_embed(model_path: str): + # output should have all tokens used by any model + # but not include any unused tokens + check_tokenizer( expected_size=66, expected_added_ct=5, must_contain=["<|im_start|>", "<|im_end|>"], must_not_contain=[f"" for idx in range(4)], - ), + )(model_path) + emb_out = ModelEmbeddings(model_path) + emb_chatml = ModelEmbeddings(model_chatml) + + assert torch.allclose( + emb_out.token_embedding("<|im_start|>"), + emb_chatml.token_embedding("<|im_start|>"), + ), "Token <|im_start|> should be from model_chatml" + assert torch.allclose( + emb_out.token_embedding("<|im_end|>"), + emb_chatml.token_embedding("<|im_end|>"), + atol=1e-3, + rtol=1e-4, + ), "Token <|im_end|> should be from model_chatml" + + run_and_check_merge( + config, + validate=_check_embed, ) def test_source_model(self, model_base: str, model_padded: str, model_chatml: str): config = self.make_config( [model_base, model_padded, model_chatml], base_model=model_base, - tokenizer_source="model:" + model_chatml, + tokenizer_source=model_chatml, ) # tokenizer should match model_chatml run_and_check_merge( @@ -147,8 +183,7 @@ def test_slerp_union(self, model_base: str, model_chatml: str): base_model=model_base, tokenizer_source="union", merge_method="slerp", - embed_slerp=True, - t="0.5", + t=0.5, ) run_and_check_merge( @@ -159,19 +194,92 @@ def test_slerp_union(self, model_base: str, model_chatml: str): ), ) + def test_force_token(self, model_base: str, model_chatml: str): + config = self.make_config( + [model_base, model_chatml], + base_model=model_base, + merge_method="linear", + tokenizer_config=TokenizerConfig( + source="union", + tokens={ + "_tok_10": {"source": model_chatml, "force": True}, + "_tok_11": {"source": model_base, "force": True}, + }, + ), + ) + + def _check_embed(model_path: str): + check_tokenizer( + expected_size=66, must_contain=["<|im_start|>", "<|im_end|>"] + )(model_path) + emb_out = ModelEmbeddings(model_path) + emb_base = ModelEmbeddings(model_base) + emb_chatml = ModelEmbeddings(model_chatml) + + assert torch.allclose( + emb_out.token_embedding("_tok_10"), + emb_chatml.token_embedding("_tok_10"), + ), "Token _tok_10 should be from model_chatml" + assert torch.allclose( + emb_out.token_embedding("_tok_11"), + emb_base.token_embedding("_tok_11"), + ), "Token _tok_11 should be from model_base" + + run_and_check_merge(config, validate=_check_embed) + + def test_model_token_id(self, model_base: str, model_chatml: str): + config = self.make_config( + [model_base, 
model_chatml], + base_model=model_base, + merge_method="linear", + tokenizer_config=TokenizerConfig( + source="base", + tokens={ + "_tok_20": { + "source": { + "kind": "model_token", + "model": model_chatml, + "token_id": 64, + }, + "force": True, + }, + "_tok_21": { + "source": { + "kind": "model_token", + "model": model_base, + "token": "", + }, + "force": True, + }, + }, + ), + ) + + def _check_embed(model_path: str): + check_tokenizer(expected_size=64, must_contain=["_tok_10"])(model_path) + emb_out = ModelEmbeddings(model_path) + emb_base = ModelEmbeddings(model_base) + emb_chatml = ModelEmbeddings(model_chatml) + + assert torch.allclose( + emb_out.token_embedding("_tok_20"), emb_chatml.embed_tokens[64, :] + ), "Token _tok_20 should be == model_chatml token 64" + assert torch.allclose( + emb_out.token_embedding("_tok_21"), emb_base.token_embedding("") + ), "Token _tok_21 should be == model_base " + + run_and_check_merge(config, validate=_check_embed) + def make_config( self, models: List[str], base_model: Optional[str] = None, merge_method: str = "linear", tokenizer_source: Optional[str] = None, - embed_slerp: bool = False, - t: Optional[ParameterSetting] = None, + t: Optional[float] = None, + tokenizer_config: Optional[TokenizerConfig] = None, ): - parameters = {"embed_slerp": embed_slerp} - if t is not None: - parameters["t"] = t - + parameters = {"t": t} if t is not None else {} config = MergeConfiguration( merge_method=merge_method, base_model=base_model, @@ -182,8 +290,9 @@ def make_config( ) for m in models ], - dtype="bfloat16", + dtype="float32", tokenizer_source=tokenizer_source, parameters=parameters, + tokenizer=tokenizer_config, ) return config From aa0399fd05e44b685120575228660bf732a91a49 Mon Sep 17 00:00:00 2001 From: Charles Goddard Date: Mon, 15 Jul 2024 15:13:37 -0700 Subject: [PATCH 03/27] Fix pyproject.toml --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index b612908e..96599075 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -54,6 +54,7 @@ packages = [ "mergekit.moe", "mergekit.scripts", "mergekit.evo", + "mergekit.tokenizer", "mergekit._data", "mergekit._data.architectures", ] From 5fa77822d18e70b9ad4d4e08f78cd08170eba0e5 Mon Sep 17 00:00:00 2001 From: "Charles O. Goddard" Date: Mon, 15 Jul 2024 18:33:15 -0700 Subject: [PATCH 04/27] Specify chat template for output model (#367) Adds a `chat_template` field to merge configs, which can either be a Jinja template string or one of `chatml`, `llama3`, `alpaca`, `mistral`. Also supports `auto` which will try to select the most common template among the input models. 
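For example, a minimal config sketch (model names here are placeholders):

```yaml
merge_method: linear
base_model: some_llama3_model
models:
  - model: some_llama3_model
  - model: some_chatml_model
dtype: bfloat16
chat_template: auto  # or "chatml", "llama3", "alpaca", "mistral", or a literal Jinja template
```

With `auto`, if none of the input models' tokenizers define a chat template, no template is written to the output.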
--- mergekit/_data/chat_templates/__init__.py | 0 mergekit/_data/chat_templates/alpaca.jinja | 29 +++++++ mergekit/_data/chat_templates/chatml.jinja | 2 + mergekit/_data/chat_templates/llama3.jinja | 7 ++ mergekit/_data/chat_templates/mistral.jinja | 24 ++++++ mergekit/config.py | 1 + mergekit/merge.py | 88 +++++++++++++++++---- pyproject.toml | 9 ++- tests/test_chat_template.py | 52 ++++++++++++ 9 files changed, 196 insertions(+), 16 deletions(-) create mode 100644 mergekit/_data/chat_templates/__init__.py create mode 100644 mergekit/_data/chat_templates/alpaca.jinja create mode 100644 mergekit/_data/chat_templates/chatml.jinja create mode 100644 mergekit/_data/chat_templates/llama3.jinja create mode 100644 mergekit/_data/chat_templates/mistral.jinja create mode 100644 tests/test_chat_template.py diff --git a/mergekit/_data/chat_templates/__init__.py b/mergekit/_data/chat_templates/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/mergekit/_data/chat_templates/alpaca.jinja b/mergekit/_data/chat_templates/alpaca.jinja new file mode 100644 index 00000000..45837b0a --- /dev/null +++ b/mergekit/_data/chat_templates/alpaca.jinja @@ -0,0 +1,29 @@ +{{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }} + +{% for message in messages %} +{% if message['role'] == 'user' %} +### Instruction: +{{ message['content']|trim -}} +{% if not loop.last %} + + +{% endif %} +{% elif message['role'] == 'assistant' %} +### Response: +{{ message['content']|trim -}} +{% if not loop.last %} + + +{% endif %} +{% elif message['role'] == 'user_context' %} +### Input: +{{ message['content']|trim -}} +{% if not loop.last %} + + +{% endif %} +{% endif %} +{% endfor %} +{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %} +### Response: +{% endif %} diff --git a/mergekit/_data/chat_templates/chatml.jinja b/mergekit/_data/chat_templates/chatml.jinja new file mode 100644 index 00000000..4f344455 --- /dev/null +++ b/mergekit/_data/chat_templates/chatml.jinja @@ -0,0 +1,2 @@ +{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %} +{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %} diff --git a/mergekit/_data/chat_templates/llama3.jinja b/mergekit/_data/chat_templates/llama3.jinja new file mode 100644 index 00000000..0fcec78a --- /dev/null +++ b/mergekit/_data/chat_templates/llama3.jinja @@ -0,0 +1,7 @@ +{% set loop_messages = messages %} +{% for message in loop_messages %} +{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %} +{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %} +{{ content }} +{% endfor %} +{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %} diff --git a/mergekit/_data/chat_templates/mistral.jinja b/mergekit/_data/chat_templates/mistral.jinja new file mode 100644 index 00000000..40b37ad7 --- /dev/null +++ b/mergekit/_data/chat_templates/mistral.jinja @@ -0,0 +1,24 @@ +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content'] %} + {%- set loop_messages = messages[1:] %} +{%- else %} + {%- set loop_messages = messages %} +{%- endif %} + +{{- bos_token }} +{%- for message in loop_messages %} + {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %} + {{- raise_exception('After the optional system message, 
conversation roles must alternate user/assistant/user/assistant/...') }} + {%- endif %} + {%- if message['role'] == 'user' %} + {%- if loop.first and system_message is defined %} + {{- ' [INST] ' + system_message + '\n\n' + message['content'] + ' [/INST]' }} + {%- else %} + {{- ' [INST] ' + message['content'] + ' [/INST]' }} + {%- endif %} + {%- elif message['role'] == 'assistant' %} + {{- ' ' + message['content'] + eos_token}} + {%- else %} + {{- raise_exception('Only user and assistant roles are supported, with the exception of an initial optional system message!') }} + {%- endif %} +{%- endfor %} diff --git a/mergekit/config.py b/mergekit/config.py index 9a5e8efd..5c79de7c 100644 --- a/mergekit/config.py +++ b/mergekit/config.py @@ -93,6 +93,7 @@ class MergeConfiguration(BaseModel): Literal["union"], Literal["base"], ModelReference, None ] = None tokenizer: Optional[TokenizerConfig] = None + chat_template: Optional[str] = None out_dtype: Optional[str] = None def referenced_models(self) -> List[ModelReference]: diff --git a/mergekit/merge.py b/mergekit/merge.py index abdf85a3..60189f44 100644 --- a/mergekit/merge.py +++ b/mergekit/merge.py @@ -13,14 +13,18 @@ # You should have received a copy of the GNU Lesser General Public License # along with this program. If not, see http://www.gnu.org/licenses/. +import importlib +import importlib.resources import logging import os import shutil +from collections import Counter from typing import Optional import tqdm import transformers +from mergekit._data import chat_templates from mergekit.architecture import ArchitectureInfo, get_architecture_info from mergekit.card import generate_card from mergekit.config import MergeConfiguration @@ -116,32 +120,87 @@ def run_merge( ) as fp: fp.write(config_source) - if tokenizer is None and options.copy_tokenizer: - try: - _copy_tokenizer( - merge_config, out_path, trust_remote_code=options.trust_remote_code - ) - except Exception as e: - logging.error( - "Failed to copy tokenizer. The merge was still successful, just copy it from somewhere else.", - exc_info=e, + if tokenizer is None: + if options.copy_tokenizer: + try: + _copy_tokenizer( + merge_config, out_path, trust_remote_code=options.trust_remote_code + ) + except Exception as e: + logging.error( + "Failed to copy tokenizer. The merge was still successful, just copy it from somewhere else.", + exc_info=e, + ) + elif merge_config.chat_template: + logging.warning( + "Chat template specified but no tokenizer found. Chat template will not be saved." 
) if tokenizer: logging.info("Saving tokenizer") + _set_chat_template(tokenizer, merge_config) tokenizer.save_pretrained(out_path, safe_serialization=True) +def _set_chat_template( + tokenizer: transformers.PreTrainedTokenizerBase, + merge_config: MergeConfiguration, + trust_remote_code: bool = False, +): + chat_template = merge_config.chat_template + if not chat_template: + return + + if chat_template == "auto": + # see if there is a plurality chat template among the input models + model_templates = [] + for model in merge_config.referenced_models(): + try: + tok = transformers.AutoTokenizer.from_pretrained( + model.model.path, + revision=model.model.revision, + trust_remote_code=trust_remote_code, + ) + template = tok.chat_template + if isinstance(template, dict): + template = template.get("default", None) + if template: + model_templates.append(template.strip()) + except Exception as e: + logging.warning(f"Unable to load tokenizer for {model}", exc_info=e) + + if not model_templates: + return + + chat_template = Counter(model_templates).most_common(1)[0][0] + logging.info(f"Auto-selected chat template: {chat_template}") + + elif importlib.resources.is_resource(chat_templates, chat_template + ".jinja"): + with importlib.resources.open_text( + chat_templates, chat_template + ".jinja" + ) as fp: + chat_template = fp.read() + + elif len(chat_template) < 20 or "{" not in chat_template: + raise RuntimeError(f"Invalid chat template: {chat_template}") + + tokenizer.chat_template = chat_template + + def _copy_tokenizer( merge_config: MergeConfiguration, out_path: str, trust_remote_code: bool = False ): donor_model = merge_config.base_model or (merge_config.referenced_models()[0]) - if os.path.exists( - os.path.join(donor_model.model.path, "tokenizer_config.json") - ) and ( - os.path.exists(os.path.join(donor_model.model.path, "tokenizer.json")) - or os.path.exists(os.path.join(donor_model.model.path, "tokenizer.model")) + if ( + (not merge_config.chat_template) + and os.path.exists( + os.path.join(donor_model.model.path, "tokenizer_config.json") + ) + and ( + os.path.exists(os.path.join(donor_model.model.path, "tokenizer.json")) + or os.path.exists(os.path.join(donor_model.model.path, "tokenizer.model")) + ) ): logging.info(f"Copying tokenizer from {donor_model}") @@ -166,6 +225,7 @@ def _copy_tokenizer( revision=donor_model.model.revision, trust_remote_code=trust_remote_code, ) + _set_chat_template(tokenizer, merge_config) tokenizer.save_pretrained(out_path, safe_serialization=True) diff --git a/pyproject.toml b/pyproject.toml index 96599075..a8a339a7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -57,9 +57,14 @@ packages = [ "mergekit.tokenizer", "mergekit._data", "mergekit._data.architectures", + "mergekit._data.chat_templates", ] include-package-data = true -package-data = { "mergekit._data.architectures" = ["*.json"] } +package-data = { "mergekit._data.architectures" = [ + "*.json", +], "mergekit._data.chat_templates" = [ + "*.jinja", +] } [tool.isort] profile = "black" @@ -74,6 +79,6 @@ minversion = "6.0" filterwarnings = [ "ignore::pydantic.PydanticDeprecatedSince20:huggingface_hub.*:", "ignore::FutureWarning:huggingface_hub.*:", - "ignore:(read_text|open_text|contents) is deprecated:DeprecationWarning", # yes i know, but files() doesn't exist in 3.8 + "ignore:(read_text|open_text|contents|is_resource) is deprecated:DeprecationWarning", # yes i know, but files() doesn't exist in 3.8 ] testpaths = ["tests"] diff --git a/tests/test_chat_template.py b/tests/test_chat_template.py new 
file mode 100644 index 00000000..af511a2b --- /dev/null +++ b/tests/test_chat_template.py @@ -0,0 +1,52 @@ +from typing import Optional + +from common import run_and_check_merge +from test_basic_merges import model_b +from test_tokenizer import model_base +from transformers import AutoTokenizer + +from mergekit.config import InputModelDefinition, MergeConfiguration + + +def check_chat_template(model_path: str, needle: Optional[str] = None): + tokenizer = AutoTokenizer.from_pretrained(model_path) + if needle is None: + assert not tokenizer.chat_template, "Expected no chat template" + return + assert ( + tokenizer.chat_template and needle in tokenizer.chat_template + ), f"Expected chat template to contain {needle}" + + +class TestChatTemplate: + def test_template_chatml(self, model_base, model_b): + config = MergeConfiguration( + merge_method="linear", + models=[ + InputModelDefinition(model=model_base, parameters={"weight": 0.5}), + InputModelDefinition(model=model_b, parameters={"weight": 0.5}), + ], + base_model=model_base, + dtype="bfloat16", + chat_template="chatml", + ) + run_and_check_merge( + config, + validate=lambda p: check_chat_template(p, "<|im_start|>"), + ) + + def test_template_literal_jinja(self, model_base, model_b): + config = MergeConfiguration( + merge_method="linear", + models=[ + InputModelDefinition(model=model_base, parameters={"weight": 0.5}), + InputModelDefinition(model=model_b, parameters={"weight": 0.5}), + ], + base_model=model_base, + dtype="bfloat16", + chat_template="{{messages[0]['content']}}", + ) + run_and_check_merge( + config, + validate=lambda p: check_chat_template(p, "{{messages[0]['content']}}"), + ) From 6447a8524fa368e9907020dd34a977b02974b753 Mon Sep 17 00:00:00 2001 From: Luke Meyers Date: Thu, 18 Jul 2024 22:51:10 -0400 Subject: [PATCH 05/27] Activation based merging - copied over from wip-zipit branch (#365) # What is this? This PR introduces a way to merge two models via their activations and hidden states on a tiny sample of data. This method uses these activations and hidden states to form correlation matrices to then generate permutation and inverse permutation matrices for weights in each model and then combines them This PR consists of three main scripts 1. the first one generates the activation/hidden state for each space 2. a permutation and inverse permutation pair is generated for each space 3. based on each space and the connected weights, the permutation and/or inverse permutation is applied to each weight and then the weights are combined # Assumptions The models to be merged are of the same architecture and equal block/layer count # Things that couldn't make into the final PR on-the-fly handling of models with grouped query attention. This hasn't been tested enough for this release but will be in the near future. 
For now, users will have to resort to using this script first: ## Note: Because this was copied over from another branch (`wip-zipit`) @shamanez 's contributions to the PR is missing, so this is explicit acknowledgement that @shamanez has worked on this PR alongside other authors --- mergekit/_data/architectures/llama.json | 59 ++- mergekit/architecture.py | 2 + .../scripts/ABM/activations_based_merge.py | 171 +++++++++ mergekit/scripts/ABM/extract_activations.py | 347 ++++++++++++++++++ .../ABM/extract_permutation_matrices.py | 226 ++++++++++++ pyproject.toml | 2 + 6 files changed, 773 insertions(+), 34 deletions(-) create mode 100644 mergekit/scripts/ABM/activations_based_merge.py create mode 100644 mergekit/scripts/ABM/extract_activations.py create mode 100644 mergekit/scripts/ABM/extract_permutation_matrices.py diff --git a/mergekit/_data/architectures/llama.json b/mergekit/_data/architectures/llama.json index c418f055..3095b207 100644 --- a/mergekit/_data/architectures/llama.json +++ b/mergekit/_data/architectures/llama.json @@ -8,7 +8,7 @@ { "name": "model.embed_tokens.weight", "is_embed": true, - "output_space": "h_0" + "output_space": "running_residual" } ], "num_layers_config_key": "num_hidden_layers", @@ -16,76 +16,67 @@ "weights": [ { "name": "model.layers.${layer_index}.input_layernorm.weight", - "input_space": "h_${layer_index}" + "input_space": "running_residual" }, { "name": "model.layers.${layer_index}.self_attn.q_proj.weight", - "input_space": "h_${layer_index}", - "output_space": "attn_qk_${layer_index}" + "input_space": "running_residual", + "output_space": "attn_qk_${layer_index}", + "head_split": "output", + "is_kq": true }, { "name": "model.layers.${layer_index}.self_attn.k_proj.weight", - "input_space": "h_${layer_index}", - "output_space": "attn_qk_${layer_index}" + "input_space": "running_residual", + "output_space": "attn_qk_${layer_index}", + "head_split": "output", + "is_kq": true }, { "name": "model.layers.${layer_index}.self_attn.v_proj.weight", - "input_space": "h_${layer_index}", - "output_space": "attn_v_${layer_index}" + "input_space": "running_residual", + "output_space": "attn_v_${layer_index}", + "head_split": "output" }, { "name": "model.layers.${layer_index}.self_attn.o_proj.weight", "input_space": "attn_v_${layer_index}", - "output_space": "post_attn_${layer_index}" + "output_space": "running_residual", + "head_split": "input" }, { "name": "model.layers.${layer_index}.post_attention_layernorm.weight", - "input_space": "h_a_${layer_index}" + "input_space": "running_residual" }, { "name": "model.layers.${layer_index}.mlp.up_proj.weight", - "input_space": "h_a_${layer_index}", + "input_space": "running_residual", "output_space": "up_${layer_index}" }, { "name": "model.layers.${layer_index}.mlp.gate_proj.weight", - "input_space": "h_a_${layer_index}", + "input_space": "running_residual", "output_space": "up_${layer_index}" }, { "name": "model.layers.${layer_index}.mlp.down_proj.weight", "input_space": "up_${layer_index}", - "output_space": "post_mlp_${layer_index}" - } - ], - "procedural_spaces": [ - { - "name": "h_a_${layer_index}", - "type": "residual", - "inputs": [ - "h_${layer_index}", - "post_attn_${layer_index}" - ] - }, - { - "name": "h_${layer_index+1}", - "type": "residual", - "inputs": [ - "h_a_${layer_index}", - "post_mlp_${layer_index}" - ] + "output_space": "running_residual" } ] }, "post_weights": [ { "name": "model.norm.weight", - "input_space": "h_${num_layers}" + "input_space": "running_residual" }, { "name": "lm_head.weight", - 
"input_space": "h_${num_layers}", - "is_embed": true + "input_space": "running_residual", + "is_embed":true, + "aliases": [ + "model.lm_head.weight" + ] } ] } diff --git a/mergekit/architecture.py b/mergekit/architecture.py index 653f1ac3..4c7b4625 100644 --- a/mergekit/architecture.py +++ b/mergekit/architecture.py @@ -52,6 +52,8 @@ class WeightInfo(BaseModel, frozen=True): optional: bool = False aliases: Optional[Tuple[str, ...]] = None force_dtype: Optional[str] = None + head_split: Literal[None, "input", "output"] = None + is_kq: Optional[bool] = False class ProceduralSpaceInfo(BaseModel, frozen=True): diff --git a/mergekit/scripts/ABM/activations_based_merge.py b/mergekit/scripts/ABM/activations_based_merge.py new file mode 100644 index 00000000..cb3c912a --- /dev/null +++ b/mergekit/scripts/ABM/activations_based_merge.py @@ -0,0 +1,171 @@ +import logging +import os +from typing import Optional + +import click +import safetensors.torch +import torch +import tqdm +from transformers import AutoTokenizer + +from mergekit.architecture import get_architecture_info +from mergekit.common import ModelReference, dtype_from_name +from mergekit.io.tasks import LoaderCache +from mergekit.io.tensor_writer import TensorWriter +from mergekit.options import MergeOptions, add_merge_options + + +@click.command("mergekit-activation-based-merge") +@click.argument("model_path", type=str) +@click.argument("secondary_model_path", type=str) +@click.argument("merge_unmerge_directory", type=str) +@click.option("--out-path", "-o", required=True, type=str, help="Output model path") +@click.option( + "--dtype", + type=str, + default="float16", + help="Data type to convert weights to", +) +@click.option( + "--device", + "-d", + type=str, + default="cuda", + help="Device to compute on (default: cuda)", +) +@add_merge_options +def main( + model_path: str, + secondary_model_path, + merge_unmerge_directory: str, + out_path: str, + dtype: Optional[str], + device: Optional[str], + merge_options: MergeOptions, +): + model = ModelReference.model_validate(model_path) + secondary_model = ModelReference.model_validate(secondary_model_path) + + dtype = dtype_from_name(dtype) if dtype else None + + cache = LoaderCache() + cache.lazy_unpickle = merge_options.lazy_unpickle + cache.hf_cache_dir = merge_options.transformers_cache + + for m in tqdm.tqdm([model, secondary_model], desc="Preparing models"): + cache.get(m) + + writer = TensorWriter( + out_path=out_path, + max_shard_size=merge_options.out_shard_size, + safe_serialization=merge_options.safe_serialization, + ) + + model_config = model.config(trust_remote_code=merge_options.trust_remote_code) + model_arch_info = get_architecture_info( + model.config(trust_remote_code=merge_options.trust_remote_code) + ) + + loader_1 = cache.get(model) + loader_2 = cache.get(secondary_model) + + os.makedirs(out_path, exist_ok=True) + + merge_unmerge_dictionary = {} + # load files from merge_unmerge_directory + spaces = [ + f.split("_unmerge")[0] + for f in os.listdir(merge_unmerge_directory) + if "_unmerge" in f + ] + for i in spaces: + logging.info(f"Loading merge/unmerge tensors for {i}") + m = safetensors.torch.load_file( + os.path.join(merge_unmerge_directory, f"{i}_merge.safetensor"), + device=device, + ) + u = safetensors.torch.load_file( + os.path.join(merge_unmerge_directory, f"{i}_unmerge.safetensor"), + device=device, + ) + merge_unmerge_dictionary[i] = ( + m[i].to(device, dtype=dtype), + u[i].to(device, dtype=dtype), + ) + + for weight_info in 
model_arch_info.all_weights(config=model_config): + merge_matrix, unmerge_matrix = None, None + + if weight_info.input_space in merge_unmerge_dictionary: + _, unmerge_matrix = merge_unmerge_dictionary[weight_info.input_space] + unmerge_matrix = unmerge_matrix.chunk(2, dim=0) + + if weight_info.output_space in merge_unmerge_dictionary: + merge_matrix, _ = merge_unmerge_dictionary[weight_info.output_space] + merge_matrix = merge_matrix.chunk(2, dim=1) + + original_w = loader_1.get_tensor(weight_info.name, device=device) + original_w2 = loader_2.get_tensor(weight_info.name, device=device) + + if dtype is not None: + original_w = original_w.to(dtype=dtype) + original_w2 = original_w2.to(dtype=dtype) + + w = torch.clone(original_w) + w2 = torch.clone(original_w2) + + if not merge_matrix and not unmerge_matrix: + logging.warning( + f"❌ Weight {weight_info.name} for model 1 and model 2 has no merge or unmerge matrix" + ) + + if merge_matrix is not None: + if weight_info.is_embed: + w = (merge_matrix[0] @ w.T).T + w2 = (merge_matrix[1] @ w2.T).T + else: + w = merge_matrix[0] @ w + w2 = merge_matrix[1] @ w2 + + if unmerge_matrix is not None: + w = w @ unmerge_matrix[0] + w2 = w2 @ unmerge_matrix[1] + + # check if weights have not mutated, if yes then shoot warning + if torch.allclose(original_w, w): + logging.warning( + f"❌ Weight {weight_info.name} for model 1 has NOT mutated during merge" + ) + else: + logging.warning( + f"✅ Weight {weight_info.name} for model 1 has mutated during merge" + ) + + if torch.allclose(original_w2, w2): + logging.warning( + f"❌ Weight {weight_info.name} for model 2 has NOT mutated during merge" + ) + else: + logging.warning( + f"✅ Weight {weight_info.name} for model 2 has mutated during merge" + ) + + # average weights and save them + if merge_matrix: + w = w + w2 + else: + w = (w + w2) / 2 + writer.save_tensor(weight_info.name, w) + writer.finalize() + + tokenizer = AutoTokenizer.from_pretrained(model_path) + tokenizer.save_pretrained(out_path, safe_serialization=True) + + # write config + model_out_config = model.config(trust_remote_code=merge_options.trust_remote_code) + if dtype: + model_out_config.torch_dtype = dtype + model_out_config.save_pretrained(out_path) + + +main() diff --git a/mergekit/scripts/ABM/extract_activations.py b/mergekit/scripts/ABM/extract_activations.py new file mode 100644 index 00000000..7cb5961b --- /dev/null +++ b/mergekit/scripts/ABM/extract_activations.py @@ -0,0 +1,347 @@ +import logging +import os +from collections import defaultdict +from typing import List, Optional + +import click +import datasets +import numpy as np +import torch +from safetensors.torch import save_file +from torch.utils.data import DataLoader +from transformers import AutoModel, AutoTokenizer, DefaultDataCollator + +from mergekit.architecture import _template_substitution, get_architecture_info +from mergekit.common import ModelReference + +logging.basicConfig(level=logging.INFO) + +# set seed +torch.manual_seed(42) +np.random.seed(42) + + +def clean_name(name): + return name.replace(".weight", "").replace("model.", "") + + +def parse_items(ctx, param, value): + if value is not None: + return [item.strip() for item in value.split(",")] + + +def remove_pads(attention_mask, feature_vector): + if ( + len(feature_vector.shape) == 3 + ): # Hidden states: (batch_size, seq_length, embedding_dim) + # Expand mask to match the feature_vector dimensions and apply it + expanded_mask = attention_mask.unsqueeze(-1) + filtered_feature_vector = feature_vector * expanded_mask + 
else: + raise ValueError("Unsupported feature vector shape.") + + return filtered_feature_vector + + +def get_attention_output_hook(storage_dict, space_name, capture_input=True): + """ + Returns a hook function that stores the output of the attention layer. + """ + + def hook(module, input, output): + # NOTE: shape of input is [batch, seq_len, dim] and output is Tuple[(seq_len, dim),...] + if capture_input: + o = input[0].detach() + else: + o = output.detach() + + if space_name not in storage_dict: + storage_dict[space_name] = o + else: + storage_dict[space_name] = torch.cat((storage_dict[space_name], o), dim=0) + + return hook + + +""" + +What this script does: + +It tries to map input/output spaces to activation maps + +""" + + +@click.command("mergekit-abm-extract-activations") +@click.argument("model-path", type=str) +@click.option( + "--dataset", "-d", required=True, type=str, help="Dataset to use for activations" +) +@click.option("--out-path", "-o", required=True, type=str, help="Output model path") +@click.option("--batch-size", "-b", type=int, default=2, help="Batch size") +@click.option( + "--dataset-size", + "-s", + type=int, + default=None, + help="Dataset size. If None, use full dataset", +) +@click.option( + "--dataset-column", "-c", type=str, default="text", help="Dataset column to use" +) +@click.option( + "--dataset-subset", "-u", type=str, default="eval", help="Dataset subset to use" +) +@click.option( + "--chat-template/--no-chat-template", + default=False, + help="use Chat template for inference", +) +@click.option("--max-length", "-l", type=int, default=512, help="Max length") +@click.option("--dtype", type=str, default=None, help="Data type to convert weights to") +@click.option( + "--device", type=str, default=None, help="device to compute the activations" +) +@click.option( + "--ignore-spaces", + "-i", + type=str, + default="", + callback=parse_items, + help="Spaces to ignore separated by comma. 
Example: up_${layer_index}", +) +def main( + model_path: str, + dataset: str, + dataset_column: str, + out_path: str, + batch_size: int, + max_length: int, + dataset_size: Optional[int], + dataset_subset: Optional[str], + chat_template: Optional[bool], + dtype: Optional[str], + device: Optional[str], + ignore_spaces: Optional[List[str]], +): + # sorting out locations to hook into + # we do this via the predefined json architecture definitions in mergekit + + model = ModelReference.model_validate(model_path) + + model_config = model.config() + model_arch_info = get_architecture_info(model_config) + + _json = model_arch_info.definition + + residual_space = None + + weights = [] + for weight in _json.layer_templates.weights: + if weight.is_kq: + residual_space = weight.input_space + weights.append(weight) + + if residual_space is None: + raise ValueError("No residual space found") + + # ======================== Mapping spaces to weights ======================== + + # just a list of connected components + space_to_output_weight_templates = defaultdict(list) + space_to_input_weight_templates = defaultdict(list) + + for layer_template in weights: + if ( + not layer_template.input_space + or layer_template.input_space in ignore_spaces + ): + continue + space_to_output_weight_templates[layer_template.input_space].append( + layer_template.name + ) + + for layer_template in weights: + if ( + not layer_template.output_space + or layer_template.output_space in ignore_spaces + ): + continue + space_to_input_weight_templates[layer_template.output_space].append( + layer_template.name + ) + + # remove the residual space from the input and output + space_to_input_weight_templates.pop(residual_space, None) + space_to_output_weight_templates.pop(residual_space, None) + + # NOTE: if space has input and output weights, remove one or the other because hooking + # into both will result in duplicate activations + to_remove = [] + for space, input_weights in space_to_input_weight_templates.items(): + if space in space_to_output_weight_templates: + # if count of input weights and output weights is non zero, remove the space from space to output_weights + if ( + len(input_weights) > 0 + and len(space_to_output_weight_templates[space]) > 0 + ): + to_remove.append(space) + + # remove keys from output + space_to_output_weight_templates = { + k: v for k, v in space_to_output_weight_templates.items() if k not in to_remove + } + + num_layers = model_arch_info.num_layers(model_config) + + space_to_input_weights = {} + for k, v in space_to_input_weight_templates.items(): + for j in range(num_layers): + f = lambda x: _template_substitution(x, num_layers=num_layers, layer_idx=j) + space_to_input_weights[f(k)] = [f(_v) for _v in v] + + space_to_output_weights = {} + for k, v in space_to_output_weight_templates.items(): + for j in range(num_layers): + f = lambda x: _template_substitution(x, num_layers=num_layers, layer_idx=j) + space_to_output_weights[f(k)] = [f(_v) for _v in v] + + # ================== Load model, tokenizer for inference and prepare dataset ================== + + model = AutoModel.from_pretrained( + model_path, output_attentions=True, attn_implementation="eager" + ) + tokenizer = AutoTokenizer.from_pretrained(model_path) + + if not tokenizer.pad_token: + tokenizer.pad_token = tokenizer.eos_token + + tokenize_function = None + if chat_template: + logging.info("Using chat template for inference") + tokenize_function = lambda x: tokenizer.apply_chat_template( + x, + padding="longest", + max_length=max_length, + 
truncation=True, + return_dict=True, + ) + else: + logging.info("Using default tokenizer (no chat template) for inference") + tokenize_function = lambda x: tokenizer( + x, + padding="longest", + max_length=max_length, + truncation=True, + ) + + model.eval() + model.to(device) + if dtype is not None: + model = model.to(dtype=dtype) + + dataset = datasets.load_dataset(dataset)[dataset_subset] + + if dataset_size is not None: + logging.info("Using dataset size %s", dataset_size) + dataset = dataset.select(range(dataset_size)) + + def tokenize(element): + outputs = tokenize_function(element[dataset_column]) + return { + "input_ids": outputs["input_ids"], + "attention_mask": outputs["attention_mask"], + } + + dataset = dataset.map(tokenize).select_columns(["input_ids", "attention_mask"]) + + datasets_dataloader = DataLoader( + dataset, batch_size=batch_size, shuffle=False, collate_fn=DefaultDataCollator() + ) + + feature_storage = {} + storage_dict = {} + + # ================== Hooking into the model ================== + + # NOTE: if the capture input set to True seems confusing, a space's output is a weight recieving input from the space + for k, v in space_to_output_weights.items(): + for weight in v: + weight = clean_name(weight) + model.get_submodule(weight).register_forward_hook( + get_attention_output_hook(feature_storage, k, capture_input=True) + ) + for k, v in space_to_input_weights.items(): + for weight in v: + weight = clean_name(weight) + model.get_submodule(weight).register_forward_hook( + get_attention_output_hook(feature_storage, k, capture_input=False) + ) + + # ================== Inference ================== + + for batch in datasets_dataloader: + with torch.no_grad(): + inputs = {k: v.to(device) for k, v in batch.items()} + outputs = model( + **inputs, output_hidden_states=True, output_attentions=False + ) + + # NOTE: https://huggingface.co/docs/transformers/en/main_classes/output#transformers.modeling_outputs.BaseModelOutput + + # Store attention masks + attention_mask = inputs["attention_mask"] + if "attention_mask" not in feature_storage: + feature_storage["attention_mask"] = attention_mask.cpu().detach() + else: + feature_storage["attention_mask"] = torch.cat( + (feature_storage["attention_mask"], attention_mask.cpu().detach()), + dim=0, + ) + + hidden_states = [ + remove_pads(attention_mask, hidden_state) + for hidden_state in outputs.hidden_states + ] + hidden_states = torch.stack(outputs.hidden_states, dim=1) + + if residual_space not in feature_storage: + feature_storage[residual_space] = hidden_states + else: + feature_storage[residual_space] = torch.cat( + (feature_storage[residual_space], hidden_states), dim=0 + ) + + for space_name, v in storage_dict.items(): + if space_name not in feature_storage: + feature_storage[space_name] = v + else: + feature_storage[space_name] = torch.cat( + (feature_storage[space_name], v), dim=0 + ) + + storage_dict = {} + + # ================== Save activations/features ================== + + logging.info("Feature storage:") + for k, v in feature_storage.items(): + if v is not None: + logging.info(f"{k}: Shape: {v.shape}") + + abs_path = os.path.abspath(model_path) + if os.path.exists(abs_path): + model_path = abs_path + + model_path = model_path.replace("/", "_") + + # create output directory + os.makedirs(out_path, exist_ok=True) + + save_file( + feature_storage, os.path.join(out_path, f"{model_path}_features.safetensor") + ) + + +if __name__ == "__main__": + main() diff --git a/mergekit/scripts/ABM/extract_permutation_matrices.py 
b/mergekit/scripts/ABM/extract_permutation_matrices.py new file mode 100644 index 00000000..75c58692 --- /dev/null +++ b/mergekit/scripts/ABM/extract_permutation_matrices.py @@ -0,0 +1,226 @@ +import os +import sys +from collections import defaultdict + +import click +import numpy as np +import safetensors.torch +import scipy +import torch + +from mergekit.architecture import _template_substitution, get_architecture_info +from mergekit.common import ModelReference + + +def calc_correlation_matrix(feats): + feats = feats.view(-1, feats.shape[-1]) + + return torch.corrcoef(feats.T) + + +def match_tensors_permute( + absval=False, + correlation_matrix=None, +): + """ + This function is adapted from ZipIt! (https://github.com/gstoica27/ZipIt) + """ + + Om = correlation_matrix.shape[0] // 2 + device = correlation_matrix.device + + mats = [torch.eye(Om, device=device)] + + corr_submatrix = correlation_matrix[:Om, Om:].cpu().numpy() + if absval: + corr_submatrix = np.absolute(corr_submatrix) + _, col_ind = scipy.optimize.linear_sum_assignment(corr_submatrix, maximize=True) + + new_mat = torch.eye(Om, device=device)[torch.tensor(col_ind).long().to(device)] + mats.append(new_mat.T) + + unmerge_mats = mats + + unmerge = torch.cat(unmerge_mats, dim=0) + + merge = torch.cat(mats, dim=0) + merge = merge / (merge.sum(dim=0, keepdim=True) + 1e-5) + + return merge.T, unmerge + + +def match_tensors_permute_MHA( + n_heads=32, + absval=False, + correlation_matrix=None, +): + """ + Handles different head permutations in attention. + Modified version of the function here: https://github.com/nverma1/merging-text-transformers/blob/main/matching_functions.py#L76 + """ + + Om = correlation_matrix.shape[0] // 2 + device = correlation_matrix.device + query_size = Om // n_heads + + mats = [torch.eye(Om, device=device)] + head_perms = [] + + costs = np.ones((n_heads, n_heads)) * -sys.maxsize + + col_inds_storage = defaultdict(lambda: defaultdict(int)) + + for j in range(n_heads): + for k in range(n_heads): + head1_idx = [query_size * j, query_size * (j + 1)] + head2_idx = [query_size * k, query_size * (k + 1)] + + corr_submatrix = ( + correlation_matrix[ + head1_idx[0] : head1_idx[1], + (Om + head2_idx[0]) : (Om + head2_idx[1]), + ] + .cpu() + .numpy() + ) + if absval: + corr_submatrix = np.absolute(corr_submatrix) + + # compute perm for head j & head k + row_ind, col_ind = scipy.optimize.linear_sum_assignment( + corr_submatrix, maximize=True + ) + + costs[j, k] = corr_submatrix[row_ind, col_ind].sum() + + col_inds_storage[j][k] = col_ind + + outer_row_ind, outer_col_ind = scipy.optimize.linear_sum_assignment( + costs, maximize=True + ) + + for j in range(n_heads): + head_1 = outer_row_ind[j] + head_2 = outer_col_ind[j] + + head_perm = col_inds_storage[head_1][head_2] + head_perms.append(torch.tensor(head_perm + query_size * head_2)) + + new_mat = torch.eye(Om, device=device)[ + torch.cat(head_perms).clone().detach().long().to(device) + ] + mats.append(new_mat.T) + + unmerge_mats = mats + + unmerge = torch.cat(unmerge_mats, dim=0) + merge = torch.cat(mats, dim=0) + merge = merge / (merge.sum(dim=0, keepdim=True) + 1e-5) + + return merge.T, unmerge + + +@click.command("mergekit-abm-extract-permutations") +@click.argument("model1-ft", type=str, required=True) +@click.argument("model2-ft", type=str, required=True) +@click.option("--model_path", type=str, required=True, help="Model information") +@click.option( + "--out_path", required=True, type=str, help="Output path for metric tensors" +) +@click.option( + 
"--absval/--no-absval", + required=False, + default=False, + help="Use absolute value on correlation matrices/submatrices while calculating merge/unmerge matrices", +) +@click.option( + "--device", + "-d", + type=str, + default="cpu", + help="Device to compute on (default: cpu)", +) +def main(model1_ft, model2_ft, model_path, out_path, absval, device): + os.makedirs(out_path, exist_ok=True) + + model = ModelReference.model_validate(model_path) + + model_config = model.config() + + model_arch_info = get_architecture_info(model_config) + + _json = model_arch_info.definition + + residual_space = None + kq_space = None + v_space = None + + # extract the residual, attention related spaces + for weight in _json.layer_templates.weights: + if weight.is_kq: + kq_space = weight.output_space + residual_space = weight.input_space + continue + + # assuming order is observed + if ( + not weight.is_kq + and weight.head_split + and (weight.input_space == residual_space) + ): + v_space = weight.output_space + continue + + num_layers = model_arch_info.num_layers(model_config) + + kq_spaces = [] + v_spaces = [] + for j in range(num_layers): + kq_spaces.append( + _template_substitution(kq_space, num_layers=num_layers, layer_idx=j) + ) + v_spaces.append( + _template_substitution(v_space, num_layers=num_layers, layer_idx=j) + ) + + model1_features = safetensors.torch.load_file(model1_ft, device=device) + model2_features = safetensors.torch.load_file(model2_ft, device=device) + + model1_features.pop("attention_mask") + model2_features.pop("attention_mask") + + for feature_space in model1_features.keys(): + concatenated_feature = torch.cat( + (model1_features[feature_space], model2_features[feature_space]), dim=-1 + ) + + correlation_matrix = calc_correlation_matrix(concatenated_feature) + + if feature_space in (kq_spaces + v_spaces): + merge, unmerge = match_tensors_permute_MHA( + correlation_matrix=correlation_matrix, + n_heads=model_config.num_attention_heads, + absval=absval, + ) + + else: + merge, unmerge = match_tensors_permute( + correlation_matrix=correlation_matrix, + absval=absval, + ) + + safetensors.torch.save_file( + {feature_space: merge.contiguous()}, + f"{out_path}/{feature_space}_merge.safetensor", + ) + + safetensors.torch.save_file( + {feature_space: unmerge.contiguous()}, + f"{out_path}/{feature_space}_unmerge.safetensor", + ) + + del merge, unmerge, correlation_matrix, concatenated_feature + + +if __name__ == "__main__": + main() diff --git a/pyproject.toml b/pyproject.toml index a8a339a7..01915feb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,6 +23,8 @@ dependencies = [ "typing-extensions", "sentencepiece", "protobuf", + "scipy", + "datasets" ] [project.optional-dependencies] From 619f4e42543eab0cde35ef650925ae1109e93123 Mon Sep 17 00:00:00 2001 From: Pala Tej Deep <77090356+Tej-Deep@users.noreply.github.com> Date: Sat, 20 Jul 2024 08:06:39 +0800 Subject: [PATCH 06/27] Add Della merge method (#366) Adds a new merging method della. Della first ranks parameters in each row of delta parameters and assigns drop probabilities adaptively, inversely proportional to their magnitudes. Delta parameters with higher magnitudes are assigned lower drop probabilities. After assigning drop probabilities, the delta parameters are dropped and rescaled in a manner similar to the DARE method. 
The Della-merging paper can be found [here](https://arxiv.org/abs/2406.11617) --- README.md | 12 ++++- mergekit/merge_methods/__init__.py | 16 ++++++ .../generalized_task_arithmetic.py | 23 ++++++++ mergekit/sparsify.py | 54 +++++++++++++++++++ 4 files changed, 104 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index f761197a..0a191a1d 100644 --- a/README.md +++ b/README.md @@ -128,7 +128,8 @@ A quick overview of the currently supported merge methods: | [Model Breadcrumbs](https://arxiv.org/abs/2312.06795) | `breadcrumbs` | ✅ | ✅ | | [Model Breadcrumbs](https://arxiv.org/abs/2312.06795) + [TIES](https://arxiv.org/abs/2306.01708) | `breadcrumbs_ties` | ✅ | ✅ | | [Model Stock](https://arxiv.org/abs/2403.19522) | `model_stock` | ✅ | ✅ | - +| [DELLA](https://arxiv.org/abs/2406.11617) | `della` | ✅ | ✅ | +| [DELLA](https://arxiv.org/abs/2406.11617) [Task Arithmetic](https://arxiv.org/abs/2212.04089) | `della_linear` | ✅ | ✅ | ### Linear The classic merge method - a simple weighted average. @@ -189,6 +190,15 @@ Parameters: - `filter_wise`: if true, weight calculation will be per-row rather than per-tensor. Not recommended. +### [DELLA](https://arxiv.org/abs/2406.11617) + +Building upon DARE, DELLA uses adaptive pruning based on parameter magnitudes. DELLA first ranks parameters in each row of delta parameters and assigns drop probabilities inversely proportional to their magnitudes. This allows it to retain more important changes while reducing interference. After pruning, it rescales the remaining parameters similar to [DARE](#dare). DELLA can be used with (`della`) or without (`della_linear`) the sign elect step of TIES + +Parameters: same as [Linear](#linear), plus: +- `density` - fraction of weights in differences from the base model to retain +- `epsilon` - maximum change in drop probability based on magnitude. Drop probabilities assigned will range from `density - epsilon` to `density + epsilon`. (When selecting values for `density` and `epsilon`, ensure that the range of probabilities falls within 0 to 1) +- `lambda` - scaling factor for the final merged delta parameters before merging with the base parameters. + ## LoRA extraction Mergekit allows extracting PEFT-compatible low-rank approximations of finetuned models. 
diff --git a/mergekit/merge_methods/__init__.py b/mergekit/merge_methods/__init__.py index c939f132..007e163e 100644 --- a/mergekit/merge_methods/__init__.py +++ b/mergekit/merge_methods/__init__.py @@ -77,6 +77,22 @@ def get(method: str) -> MergeMethod: ) elif method == "model_stock": return ModelStockMerge() + + elif method == "della": + return GeneralizedTaskArithmeticMerge( + consensus_method=ConsensusMethod.sum, + sparsification_method=SparsificationMethod.rank_magnitude_sampling, + default_normalize=True, + default_rescale=True, + ) + + elif method == "della_linear": + return GeneralizedTaskArithmeticMerge( + consensus_method=None, + sparsification_method=SparsificationMethod.rank_magnitude_sampling, + default_normalize=False, + default_rescale=True, + ) raise RuntimeError(f"Unimplemented merge method {method}") diff --git a/mergekit/merge_methods/generalized_task_arithmetic.py b/mergekit/merge_methods/generalized_task_arithmetic.py index af09c8bb..214726b7 100644 --- a/mergekit/merge_methods/generalized_task_arithmetic.py +++ b/mergekit/merge_methods/generalized_task_arithmetic.py @@ -66,6 +66,19 @@ def tensor_parameters(self) -> List[ConfigParameterDef]: default_value=0.01, ) ) + if self.sparsification_method == SparsificationMethod.rank_magnitude_sampling: + res.append( + ConfigParameterDef( + name="epsilon", + default_value=0.15, + ) + ) + res.append( + ConfigParameterDef( + name="lambda", + default_value=1.0, + ) + ) return res def make_task( @@ -126,6 +139,9 @@ def execute( if "gamma" in tv_info: kwargs["gamma"] = tv_info["gamma"] + if "epsilon" in tv_info: + kwargs["epsilon"] = tv_info["epsilon"] + tv_info["delta"] = sparsify( tv_info["delta"], density=tv_info["density"], @@ -162,6 +178,13 @@ def execute( if self.normalize: mixed_delta /= divisor + if ( + self.method.sparsification_method + == SparsificationMethod.rank_magnitude_sampling + ): + lambda_factor = tvs[0]["lambda"] + mixed_delta *= lambda_factor + return (base + mixed_delta).to(base.dtype) def group_label(self) -> Optional[str]: diff --git a/mergekit/sparsify.py b/mergekit/sparsify.py index 71a5229a..ee6477c3 100644 --- a/mergekit/sparsify.py +++ b/mergekit/sparsify.py @@ -22,6 +22,7 @@ class SparsificationMethod(str, Enum): magnitude = "magnitude" random = "random" magnitude_outliers = "magnitude_outliers" + rank_magnitude_sampling = "rank_magnitude_sampling" def rescale_sum(tensor: torch.Tensor, mask: torch.Tensor): @@ -115,15 +116,66 @@ def bernoulli(tensor: torch.Tensor, density: float, rescale: bool) -> torch.Tens res = tensor.to(work_dtype) * mask if rescale: res /= density + return res.to(tensor.dtype) +def rank_magnitude( + tensor: torch.Tensor, density: float, rescale: bool = True, epsilon: float = 0.05 +) -> torch.Tensor: + if density >= 1: + return tensor + + if density <= epsilon or density >= (1 - epsilon): + raise ValueError( + f"Error: density +- epsilon must be in the range (0, 1). 
density + epsilon = {density+epsilon}, density - epsilon = {density-epsilon}" + ) + + if (tensor.device.type != "cpu") or tensor.dtype == torch.bfloat16: + work_dtype = tensor.dtype + else: + work_dtype = torch.float32 + + if len(tensor.shape) < 2: + tensor = tensor.unsqueeze(0) + + # Get Rank matrix for the delta values + tensor_abs = torch.abs(tensor) + + sorted_indices = torch.argsort(tensor_abs, dim=1, descending=False) + + ranking_tensor = torch.zeros_like(tensor_abs, dtype=work_dtype) + for i in range(tensor_abs.size(0)): + ranking_tensor[i][sorted_indices[i]] = torch.arange( + 1, tensor.size(1) + 1, dtype=work_dtype + ).to(tensor.device) + + # Normalise rank matrix to the probability range to density +- epsilon + range_vals = ( + ranking_tensor.max(dim=1, keepdim=True).values + - ranking_tensor.min(dim=1, keepdim=True).values + ) + norm_metrics = (ranking_tensor - ranking_tensor.min(dim=1, keepdim=True).values) / ( + range_vals + ) + final_probabilities = (density - epsilon) + norm_metrics * (2 * epsilon) + + mask = torch.bernoulli(final_probabilities).to(work_dtype) + res = tensor.to(work_dtype) * mask + + if rescale: + res = res / (final_probabilities.to(work_dtype)) + + return res.squeeze(0) + + def sparsify( tensor: torch.Tensor, density: float, method: SparsificationMethod, gamma: float = 0, rescale: bool = False, + epsilon: float = 0.15, ) -> torch.Tensor: if method == SparsificationMethod.magnitude: return magnitude(tensor, density=density, rescale=rescale) @@ -131,5 +183,7 @@ def sparsify( return bernoulli(tensor, density=density, rescale=rescale) elif method == SparsificationMethod.magnitude_outliers: return magnitude_outliers(tensor, density=density, rescale=rescale, gamma=gamma) + elif method == SparsificationMethod.rank_magnitude_sampling: + return rank_magnitude(tensor, density=density, rescale=rescale, epsilon=epsilon) else: raise NotImplementedError(method) From f086664c983ad8b5f126d40ce2e4385f9e65f32c Mon Sep 17 00:00:00 2001 From: Juk Armstrong <69222624+jukofyork@users.noreply.github.com> Date: Mon, 5 Aug 2024 19:56:32 +0100 Subject: [PATCH 07/27] `extract_lora.py` improvements and fixes (#333) - LoRA extraction code tidying up (from @jukofyork) - Added `--extend-vocab` flag to mergekit-extract-lora to allow extracting LoRAs from finetunes with mismatched vocab size --- mergekit/card.py | 10 +- mergekit/scripts/extract_lora.py | 573 ++++++++++++++++++++++++------- 2 files changed, 452 insertions(+), 131 deletions(-) diff --git a/mergekit/card.py b/mergekit/card.py index d884c7c0..bf0a2d0a 100644 --- a/mergekit/card.py +++ b/mergekit/card.py @@ -202,6 +202,8 @@ def generate_card_lora( base_model_ref: ModelReference, finetuned_model_ref: ModelReference, invocation: str, + extended: bool, + vocab_size: int, name: str, ) -> str: """ @@ -218,7 +220,13 @@ def generate_card_lora( hf_bases = list(extract_hf_paths([base_model_ref, finetuned_model_ref])) tags = ["mergekit", "peft"] - details = f"This LoRA adapter was extracted from {modelref_md(finetuned_model_ref)} and uses {modelref_md(base_model_ref)} as a base." + finetuned_ref_md = modelref_md(finetuned_model_ref) + basemodel_ref_md = modelref_md(base_model_ref) + + details = f"This LoRA adapter was extracted from {finetuned_ref_md} and uses {basemodel_ref_md} as a base." + + if extended: + details += f"\n\n> [!WARNING]\n> This LoRA adapter has an extended vocabulary. 
Make sure to call `model.resize_token_embeddings({vocab_size})` before applying the adapter to {basemodel_ref_md}" if os.path.isdir(base_model_ref.model.path) or os.path.isdir( finetuned_model_ref.model.path diff --git a/mergekit/scripts/extract_lora.py b/mergekit/scripts/extract_lora.py index 8232043f..ff063232 100644 --- a/mergekit/scripts/extract_lora.py +++ b/mergekit/scripts/extract_lora.py @@ -8,104 +8,377 @@ import torch from peft.tuners.lora import QuantLinear from safetensors.torch import save_file +from torch.nn.functional import pad from tqdm import tqdm from transformers import AutoModelForCausalLM from transformers.modeling_utils import PreTrainedModel +from transformers.pytorch_utils import Conv1D from mergekit.card import generate_card_lora from mergekit.common import ModelReference from mergekit.io import LazyTensorLoader -def _low_rank_decomposition( - weight: torch.Tensor, reduced_rank: int = 16 +def low_rank_decomposition( + weight: torch.Tensor, max_rank: int ) -> Tuple[torch.Tensor, torch.Tensor]: """ - Decompose a 2D matrix into low-rank matrices A and B using SVD.a + Decompose a 2D matrix into low-rank matrices L and R using SVD. :param weight: The matrix to decompose, of shape (H, W) - :param reduced_rank: The final rank of the decomposition - :return: A tuple of tensors (A, B) + :param max_rank: The maximum rank of the decomposition + :return: A tuple of tensors (L, R) """ - if weight.dim() != 2: - raise ValueError( - f"Only support 2D matrix, but your input has {weight.dim()} dimensions." - ) + assert ( + weight.dim() == 2 + ), f"Only support 2D matrix, but input has {weight.dim()} dimensions." + assert ( + max_rank >= 1 + ), f"Maximum rank must be a positive integer, but input max_rank={max_rank}." dtype = weight.dtype - # SVD Decomposition U, S, Vh = torch.linalg.svd(weight.float(), full_matrices=False) - # Truncated matrices - A = Vh[:reduced_rank, :] - B = U[:, :reduced_rank] @ torch.diag(S[:reduced_rank]) + final_rank = min(min(weight.shape), max_rank) - return A.to(dtype), B.to(dtype) + # Distribute S to both to improve numerical precision. + sqrt_S = torch.sqrt(torch.diag(S[:final_rank])) + L = sqrt_S @ Vh[:final_rank, :] + R = U[:, :final_rank] @ sqrt_S + + return L.to(dtype), R.to(dtype) def decompose_delta_weight( - new_weight: torch.Tensor, base_weight: torch.Tensor, - reduced_rank: int, + finetuned_weight: torch.Tensor, + max_rank: int, device: Optional[str] = None, ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Decompose the delta weight into low-rank matrices L and R. + + :param new_weight: The updated weight matrix after applying LoRA + :param base_weight: The original weight matrix before LoRA + :param max_rank: The maximum rank for the low-rank decomposition + :param device: The device to perform computation on + :return: A tuple of tensors (L, R) + """ + assert ( + base_weight.size() == finetuned_weight.size() + ), f"Mismatched dimensions: {base_weight.size()} != {finetuned_weight.size()}" + if device is None: device = "cuda" if torch.cuda.is_available() else "cpu" - new_weight = new_weight.to(device) base_weight = base_weight.to(device) + finetuned_weight = finetuned_weight.to(device) - """ - Decompose the delta weight into low-rank matrices A and B. + delta_weight = finetuned_weight - base_weight - :param new_weight: The updated weight matrix after applying LoRA. - :param base_weight: The original weight matrix before LoRA. - :param reduced_rank: The rank for the low-rank decomposition. - :param device: The device to perform computation on. 
- :return: A tuple of tensors (A, B) + L, R = low_rank_decomposition(delta_weight, max_rank) + + return L, R + + +def get_model_details( + model_id: str, skip_undecomposable: bool +) -> List[Tuple[str, str, torch.Size]]: + """ + Retrieve architectural details of a given pre-trained model. + + :param model_id: The identifier of the pre-trained model to load + :param skip_undecomposable: Skip saving undecomposable modules + :return: A list of tuples where each tuple contains: + - type: The type of the module ('embedding', 'linear', or 'to_save') + - name: The full name of the module + - size: The dimensions of the module's weight tensor """ - delta_weight = new_weight - base_weight - max_rank = min(delta_weight.shape) - assert ( - reduced_rank <= max_rank - ), f"The specified rank ({reduced_rank}) must be smaller than or equal to the rank of the weight matrices ({max_rank})" + # Avoid loading weights as we won't need them + pretrained_model = AutoModelForCausalLM.from_pretrained( + model_id, state_dict={}, device_map="meta" + ) - A, B = _low_rank_decomposition(delta_weight, reduced_rank=reduced_rank) + module_details = [] + + for name, module in pretrained_model.named_modules(): + if module == pretrained_model.get_input_embeddings(): + # if isinstance(module, torch.nn.Embedding): + module_details.append(("embedding", name, module.weight.size())) + elif module == pretrained_model.get_output_embeddings(): + # if isinstance(module, torch.nn.Embedding): + module_details.append(("output", name, module.weight.size())) + elif hasattr(module, "weight") and isinstance(module.weight, torch.Tensor): + if ( + # SEE: https://github.com/huggingface/peft/blob/main/src/peft/tuners/lora/model.py + isinstance( + module, + ( + torch.nn.Linear, + torch.nn.Conv2d, + bnb.nn.Linear4bit, + bnb.nn.Linear8bitLt, + QuantLinear, + Conv1D, + ), + ) + or ( + "Linear" in module.__class__.__name__ + and module.__class__.__name__ + not in ("LlamaLinearScalingRotaryEmbedding",) + ) + ): + module_details.append(("linear", name, module.weight.size())) + elif not skip_undecomposable: + module_details.append(("to_save", name, module.weight.size())) + else: + logging.info(f"Skipping undecomposable module '{name}'.") + + return module_details + + +def validate_and_combine_details( + base_model_id: str, + finetuned_model_id: str, + skip_undecomposable: bool, + extend_vocab: bool, +) -> List[Tuple[str, str]]: + """ + Validate and combine details from a base model and a fine-tuned model. 
- return A, B + :param base_model_id: The identifier for the base model + :param finetuned_model_id: The identifier for the fine-tuned model + :param skip_undecomposable: Skip saving undecomposable modules + :return: A list of tuples with the type and name of the validated/combined model layers + """ + base_model_details = get_model_details(base_model_id, skip_undecomposable) + finetuned_model_details = get_model_details(finetuned_model_id, skip_undecomposable) + + module_details = [] + + base_model_embedding_size = None + finetuned_model_embedding_size = None + + for i, (base_layer, finetuned_layer) in enumerate( + zip(base_model_details, finetuned_model_details) + ): + base_type, base_name, base_size = base_layer + finetuned_type, finetuned_name, finetuned_size = finetuned_layer + + assert ( + base_type == finetuned_type + ), f"Layer type mismatch: {base_type} != {finetuned_type}" + assert ( + base_name == finetuned_name + ), f"Layer name mismatch: {base_name} != {finetuned_name}" + + if base_type == "embedding": + base_model_embedding_size = base_size[0] + + if finetuned_type == "embedding": + finetuned_model_embedding_size = finetuned_size[0] + + # Fine-tuned models with added vocab will have have their extra rows truncated unless `extend_vocab` is specified + if base_type != "to_save" and finetuned_size[0] > base_size[0]: + assert ( + base_size[1] == finetuned_size[1] + ), f"Column dimension mismatch in layer '{base_name}': {base_size} != {finetuned_size}" + + if base_type == "embedding" or base_type == "output": + if not extend_vocab: + logging.warning( + f"Finetuned module '{base_name}' will have {finetuned_size[0] - base_size[0]} rows truncated for weight decomposition! To preserve all embeddings, invoke script with --extend-vocab" + ) + else: + logging.warning( + f"Base module '{base_name}' will have {finetuned_size[0] - base_size[0]} rows added for weight decomposition. Make sure to call `model.resize_token_embeddings({finetuned_size[0]})` before applying LoRA for inference!" + ) + else: + logging.warning( + f"Finetuned module '{base_name}' will have {finetuned_size[0] - base_size[0]} rows truncated for weight decomposition!" + ) + + else: + assert ( + base_size == finetuned_size + ), f"Dimension mismatch in layer '{base_name}': {base_size} != {finetuned_size}" + + module_details.append((base_type, base_name)) + + return module_details, base_model_embedding_size, finetuned_model_embedding_size + + +def extract_lora( + module_details: List[Tuple[str, str]], + base_model_ref: ModelReference, + finetuned_model_ref: ModelReference, + max_rank: int, + extend_vocab: bool, + no_lazy_unpickle: bool, + device: Optional[str], +) -> Tuple[Dict[str, torch.Tensor], Dict[str, int]]: + """ + Process module details to decompose weights and generate LoRA weights and ranks. -def find_all_linear_names(model: PreTrainedModel) -> List[str]: - cls = (bnb.nn.Linear4bit, bnb.nn.Linear8bitLt, torch.nn.Linear, QuantLinear) + :param module_details: List of module details. + :param base_model_ref: Reference to the base model. + :param finetuned_model_ref: Reference to the fine-tuned model. + :param max_rank: The maximum rank for the low-rank decomposition. + :param no_lazy_unpickle: Flag to disable lazy unpickle. + :param device: The device to perform computation on. + :return: A tuple containing LoRA weights dictionary and ranks dictionary. 
+ """ - names = [] - for name, module in model.named_modules(): - if ( - isinstance(module, cls) - or "Linear" in module.__class__.__name__ - and module.__class__.__name__ not in ("LlamaLinearScalingRotaryEmbedding",) - ): - names.append(name) + base_loader = LazyTensorLoader( + base_model_ref.tensor_index(), lazy_unpickle=(not no_lazy_unpickle) + ) + finetuned_loader = LazyTensorLoader( + finetuned_model_ref.tensor_index(), lazy_unpickle=(not no_lazy_unpickle) + ) - return names + lora_weights = {} + ranks = {} + + for module_type, module_name in tqdm(module_details): + base_weight = base_loader.get_tensor(f"{module_name}.weight") + finetuned_weight = finetuned_loader.get_tensor(f"{module_name}.weight") + + if module_type == "to_save": + lora_weights[ + f"base_model.model.{module_name}.weight" + ] = finetuned_weight.to("cpu").contiguous() + + logging.info( + f"[{module_type}] {module_name}: output_dims=({finetuned_weight.shape})" + ) + + else: + if finetuned_weight.shape[0] > base_weight.shape[0]: + if extend_vocab: + print(f"Extra tokens found!, module name : {module_name}") + + new_base_weight = torch.empty( + finetuned_weight.shape, device=base_weight.device + ) + new_base_weight.normal_(mean=0.0, std=0.02) + + # Copy original base_weight values into the new tensor + new_base_weight[: base_weight.shape[0]] = base_weight + + if module_type == "embedding" or module_type == "output": + lora_weights[ + f"base_model.model.{module_name}.base_layer.weight" + ] = new_base_weight.to("cpu").contiguous() + + base_weight = new_base_weight + else: + logging.warning( + f"Finetuned module '{module_name}' will have {finetuned_weight.shape[0] - base_weight.shape[0]} rows truncated for weight decomposition!" + ) + finetuned_weight = finetuned_weight[: base_weight.shape[0]] + + if module_type == "embedding": + # These need to be transposed for some reason... + lora_embedding_A, lora_embedding_B = decompose_delta_weight( + base_weight.T, finetuned_weight.T, max_rank, device=device + ) + + lora_weights[ + f"base_model.model.{module_name}.lora_embedding_A" + ] = lora_embedding_A.to("cpu").contiguous() + lora_weights[ + f"base_model.model.{module_name}.lora_embedding_B" + ] = lora_embedding_B.to("cpu").contiguous() + + ranks[module_name] = lora_embedding_A.shape[0] + + logging.info( + f"[{module_type}] {module_name}: final_rank={ranks[module_name]}, " + f"input_dims=({base_weight.shape}), " + f"output_dims=({lora_embedding_A.shape}, {lora_embedding_B.shape})" + ) + + else: + lora_A, lora_B = decompose_delta_weight( + base_weight, finetuned_weight, max_rank, device=device + ) + + lora_weights[ + f"base_model.model.{module_name}.lora_A.weight" + ] = lora_A.to("cpu").contiguous() + lora_weights[ + f"base_model.model.{module_name}.lora_B.weight" + ] = lora_B.to("cpu").contiguous() + + ranks[module_name] = lora_A.shape[0] + + logging.info( + f"[{module_type}] {module_name}: final_rank={ranks[module_name]}, " + f"input_dims=({base_weight.shape}), " + f"output_dims=({lora_A.shape}, {lora_B.shape})" + ) + + return lora_weights, ranks + + +def reconstruct_invocation(args: Dict[str, Any]) -> str: + """ + Reconstruct the command-line invocation string based on the given arguments. + :param args: A dictionary containing the command arguments with keys matching the parameter names. + Expected keys are 'base_model', 'finetuned_model', 'out_path', 'no_lazy_unpickle', + 'skip_undecomposable, 'max_rank', 'model_name', 'device' and 'verbose'. + :return: The reconstructed command-line invocation string. 
+ """ -def get_linear_module_names(model_id: str) -> List[str]: - model = AutoModelForCausalLM.from_pretrained( - model_id, state_dict={}, device_map="meta" - ) # avoid loading weights as we won't need them - linear_module_names = find_all_linear_names(model) + # Provide a default value for out_path if it's not in the dictionary + out_path = args.get("out_path", "OUTPUT_PATH") + + invocation = f"mergekit-extract-lora {args['finetuned_model']} {args['base_model']} {out_path}" + if args.get("no_lazy_unpickle"): + invocation += " --no-lazy-unpickle" + if args.get("skip_undecomposable"): + invocation += " --skip-undecomposable" + if args.get("max_rank"): + invocation += f" --rank={args['max_rank']}" + if args.get("extend_vocab"): + invocation += " --extend-vocab" + if args.get("model_name"): + invocation += f" --model_name={args['model_name']}" + if args.get("device"): + invocation += f" --device={args['device']}" + if args.get("verbose"): + invocation += " --verbose" - return linear_module_names + return invocation def create_peft_config( - base_model_name_or_path: str, rank: int, alpha: int, target_modules: List[str] + base_model_name_or_path: str, + rank: int, + alpha: int, + rank_pattern: Dict[str, int], + alpha_pattern: Dict[str, int], + target_modules: List[str], + modules_to_save: List[str], ) -> Dict[str, Any]: + """ + Create a PEFT (Parameter-Efficient Fine-Tuning) configuration dictionary. + + :param base_model_name_or_path: The path or name of the base model. + :param rank: The rank for the low-rank adaptation. + :param alpha: The scaling factor for low-rank adaptation. + :param rank_pattern: A dictionary specifying rank patterns for different modules. + :param alpha_pattern: A dictionary specifying alpha patterns for different modules. + :param target_modules: A list of module names to apply the adaptation to. + :param modules_to_save: A list of module names to save during the adaptation. + :return: A dictionary containing the PEFT configuration. + """ return { - "alpha_pattern": {}, + "alpha_pattern": alpha_pattern, "auto_mapping": None, "base_model_name_or_path": base_model_name_or_path, "bias": "none", @@ -119,10 +392,10 @@ def create_peft_config( "lora_dropout": 0, "megatron_config": None, "megatron_core": "megatron.core", - "modules_to_save": None, + "modules_to_save": modules_to_save, "peft_type": "LORA", "r": rank, - "rank_pattern": {}, + "rank_pattern": rank_pattern, "revision": None, "target_modules": target_modules, "task_type": "CAUSAL_LM", @@ -130,30 +403,75 @@ def create_peft_config( } -def reconstruct_invocation(args): +def save_model_and_config( + lora_weights: Dict[str, torch.Tensor], + ranks: Dict[str, int], + extended: bool, + embedding_size: int, + module_details: List[Tuple[str, str]], + invocation_args: Dict[str, Any], +) -> None: """ - Reconstructs the command-line invocation string based on the given arguments stored in a dictionary. + Save the PEFT model and configuration to the specified output path. - Parameters: - - args: A dictionary containing the command arguments with keys matching the parameter names. - Expected keys are 'base_model', 'finetuned_model', 'out_path', 'no_lazy_unpickle', 'desired_rank', 'model_name' and 'device'. - - Returns: - - The reconstructed command-line invocation string. + :param lora_weights: The LoRA weights. + :param ranks: The ranks of the LoRA weights. + :param module_details: Details of the model modules. + :param invocation_args: The command-line invocation arguments. 
""" - # Provide a default value for out_path if it's not in the dictionary - out_path = args.get("out_path", "OUTPUT_PATH") - invocation = f"mergekit-extract-lora {args['base_model']} {args['finetuned_model']} {out_path}" - if args.get("no_lazy_unpickle"): - invocation += " --no-lazy-unpickle" - invocation += f" --rank={args['desired_rank']}" - if args.get("model_name"): - invocation += f" --model_name={args['model_name']}" - if args.get("device"): - invocation += f" --device={args['device']}" + base_model_ref = ModelReference.parse(invocation_args["base_model"]) + finetuned_model_ref = ModelReference.parse(invocation_args["finetuned_model"]) + out_path = invocation_args["out_path"] + model_name = invocation_args["model_name"] - return invocation + # Work out the actual final rank and only retain those that were lower. + final_max_rank = max(ranks.values()) + ranks = {k: v for k, v in ranks.items() if v != final_max_rank} + + lora_config = create_peft_config( + base_model_name_or_path=base_model_ref.model.path, + rank=final_max_rank, + alpha=final_max_rank, # Setting the alpha to the rank value as `peft` will scale the LoRA weights by alpha/r when applying the adapter + rank_pattern=ranks, + alpha_pattern=ranks, + target_modules=list( + set( + module_name.split(".")[-1] + for module_type, module_name in module_details + if module_type != "to_save" + ) + ), + modules_to_save=list( + set( + module_name.split(".")[-1] + for module_type, module_name in module_details + if module_type == "to_save" + ) + ), + ) + + with open(os.path.join(out_path, "adapter_config.json"), "w") as f: + json.dump(lora_config, f, indent=2) + + save_file(lora_weights, os.path.join(out_path, "adapter_model.safetensors")) + + invocation_args.pop("out_path") # don't include out_path for privacy + invocation = reconstruct_invocation(invocation_args) + + card_md = generate_card_lora( + base_model_ref=base_model_ref, + finetuned_model_ref=finetuned_model_ref, + invocation=invocation, + extended=extended, + vocab_size=embedding_size, + name=model_name, + ) + + with open(os.path.join(out_path, "README.md"), "w", encoding="utf-8") as fp: + fp.write(card_md) + + logging.info(f"PEFT LoRA adapters saved to {out_path}") @click.command("mergekit-extract-lora") @@ -162,15 +480,30 @@ def reconstruct_invocation(args): @click.argument("out_path", type=click.Path()) @click.option( "--no-lazy-unpickle", + type=bool, is_flag=True, + default=False, help="Disable lazy unpickler (more stable, higher memory usage)", ) +@click.option( + "--skip-undecomposable", + type=bool, + is_flag=True, + default=False, + help="Skip saving undecomposable modules in the LoRA", +) @click.option( "--rank", - "desired_rank", + "max_rank", type=int, default=32, - help="Rank for the low-rank decomposition", + help="The maximum rank for the low-rank decomposition", +) +@click.option( + "--extend-vocab", + is_flag=True, + default=False, + help="Extend vocabulary for models with additional tokens instead of truncating", ) @click.option( "--model_name", @@ -184,14 +517,20 @@ def reconstruct_invocation(args): default=None, help="PyTorch device to perform SVD computation on", ) +@click.option( + "--verbose", "-v", type=bool, is_flag=True, default=False, help="Verbose logging" +) def main( finetuned_model: str, base_model: str, out_path: str, no_lazy_unpickle: bool, - desired_rank: int, + skip_undecomposable: bool, + max_rank: int, + extend_vocab: bool, model_name: str, device: str, + verbose: bool, ) -> None: """ Decomposes delta weights between a base model and a 
finetuned model, saving a PEFT model to the specified output path. @@ -206,79 +545,53 @@ def main( invocation_args = { "base_model": base_model, "finetuned_model": finetuned_model, - "desired_rank": desired_rank, + "max_rank": max_rank, + "extend_vocab": extend_vocab, "device": device, "out_path": out_path, "model_name": model_name, "no_lazy_unpickle": no_lazy_unpickle, + "skip_undecomposable": skip_undecomposable, + "verbose": verbose, } + logging.basicConfig(level=logging.INFO if verbose else logging.WARNING) + os.makedirs(out_path, exist_ok=True) base_model_ref = ModelReference.parse(base_model) finetuned_model_ref = ModelReference.parse(finetuned_model) - linear_module_names = get_linear_module_names(base_model_ref.model.path) - finetuned_model_linear_module_names = get_linear_module_names( - finetuned_model_ref.model.path + ( + module_details, + base_model_embedding_size, + finetuned_model_embedding_size, + ) = validate_and_combine_details( + ModelReference.parse(base_model).model.path, + ModelReference.parse(finetuned_model).model.path, + skip_undecomposable, + extend_vocab, ) - assert set(linear_module_names) == set( - finetuned_model_linear_module_names - ), "Model architecture mismatch" - - base_loader = LazyTensorLoader( - base_model_ref.tensor_index(), lazy_unpickle=(not no_lazy_unpickle) + lora_weights, ranks = extract_lora( + module_details, + base_model_ref, + finetuned_model_ref, + max_rank, + extend_vocab, + no_lazy_unpickle, + device, ) - finetuned_loader = LazyTensorLoader( - finetuned_model_ref.tensor_index(), lazy_unpickle=(not no_lazy_unpickle) - ) - - lora_weights = {} - for layer_name in tqdm(linear_module_names): - base_weight = base_loader.get_tensor(f"{layer_name}.weight") - finetuned_weight = finetuned_loader.get_tensor(f"{layer_name}.weight") - - lora_A, lora_B = decompose_delta_weight( - finetuned_weight, base_weight, desired_rank, device=device - ) - lora_weights[f"base_model.model.{layer_name}.lora_A.weight"] = lora_A.to( - "cpu" - ).contiguous() - lora_weights[f"base_model.model.{layer_name}.lora_B.weight"] = lora_B.to( - "cpu" - ).contiguous() - - lora_config = create_peft_config( - base_model_name_or_path=base_model_ref.model.path, - alpha=desired_rank, # Setting the alpha to the reduced rank value as `peft` will scale the LoRA weights by alpha/r when applying the adapter - rank=desired_rank, - target_modules=list( - set([module_name.split(".")[-1] for module_name in linear_module_names]) - ), + save_model_and_config( + lora_weights, + ranks, + finetuned_model_embedding_size > base_model_embedding_size and extend_vocab, + finetuned_model_embedding_size if extend_vocab else base_model_embedding_size, + module_details, + invocation_args, ) - with open(os.path.join(out_path, "adapter_config.json"), "w") as f: - json.dump(lora_config, f, indent=2) - - save_file(lora_weights, os.path.join(out_path, "adapter_model.safetensors")) - - invocation_args.pop("out_path") # don't include out_path for privacy - invocation = reconstruct_invocation(invocation_args) - - card_md = generate_card_lora( - base_model_ref=base_model_ref, - finetuned_model_ref=finetuned_model_ref, - invocation=invocation, - name=model_name, - ) - - with open(os.path.join(out_path, "README.md"), "w", encoding="utf-8") as fp: - fp.write(card_md) - - logging.info(f"PEFT LoRA adapters saved to {out_path}") - if __name__ == "__main__": main() From f17616f6d428539badbd35af61e73ce5bbafff8a Mon Sep 17 00:00:00 2001 From: "Charles O. 
Goddard" Date: Thu, 22 Aug 2024 19:20:53 -0700 Subject: [PATCH 08/27] Add Phi3SmallForCausalLM (#405) --- mergekit/_data/architectures/phi3-small.json | 68 ++++++++++++++++++++ mergekit/_data/architectures/phi3.json | 20 ++---- 2 files changed, 75 insertions(+), 13 deletions(-) create mode 100644 mergekit/_data/architectures/phi3-small.json diff --git a/mergekit/_data/architectures/phi3-small.json b/mergekit/_data/architectures/phi3-small.json new file mode 100644 index 00000000..7b3a1e80 --- /dev/null +++ b/mergekit/_data/architectures/phi3-small.json @@ -0,0 +1,68 @@ +{ + "model_type": "phi3small", + "architectures": [ + "Phi3SmallForCausalLM" + ], + "pre_weights": [ + { + "name": "model.embed_tokens.weight", + "is_embed": true + } + ], + "post_weights": [ + { + "name": "lm_head.weight", + "is_embed":true, + "aliases": [ + "model.embed_tokens.weight" + ] + }, + { + "name": "model.final_layernorm.weight" + }, + { + "name": "model.final_layernorm.bias" + } + ], + "num_layers_config_key": "num_hidden_layers", + "layer_templates": { + "weights": [ + { + "name": "model.layers.${layer_index}.input_layernorm.weight" + }, + { + "name": "model.layers.${layer_index}.input_layernorm.bias" + }, + { + "name": "model.layers.${layer_index}.post_attention_layernorm.weight" + }, + { + "name": "model.layers.${layer_index}.post_attention_layernorm.bias" + }, + { + "name": "model.layers.${layer_index}.self_attn.dense.weight" + }, + { + "name": "model.layers.${layer_index}.self_attn.dense.bias" + }, + { + "name": "model.layers.${layer_index}.self_attn.query_key_value.weight" + }, + { + "name": "model.layers.${layer_index}.self_attn.query_key_value.bias" + }, + { + "name": "model.layers.${layer_index}.mlp.up_proj.weight" + }, + { + "name": "model.layers.${layer_index}.mlp.up_proj.bias" + }, + { + "name": "model.layers.${layer_index}.mlp.down_proj.weight" + }, + { + "name": "model.layers.${layer_index}.mlp.down_proj.bias" + } + ] + } +} diff --git a/mergekit/_data/architectures/phi3.json b/mergekit/_data/architectures/phi3.json index 0e741af3..6c606b84 100644 --- a/mergekit/_data/architectures/phi3.json +++ b/mergekit/_data/architectures/phi3.json @@ -1,5 +1,5 @@ { - "model_type": "phi", + "model_type": "phi3", "architectures": [ "Phi3ForCausalLM" ], @@ -22,28 +22,22 @@ "layer_templates": { "weights": [ { - "name": "model.layers.${layer_index}.input_layernorm.weight", - "is_embed": false + "name": "model.layers.${layer_index}.input_layernorm.weight" }, { - "name": "model.layers.${layer_index}.post_attention_layernorm.weight", - "is_embed": false + "name": "model.layers.${layer_index}.post_attention_layernorm.weight" }, { - "name": "model.layers.${layer_index}.self_attn.o_proj.weight", - "is_embed": false + "name": "model.layers.${layer_index}.self_attn.o_proj.weight" }, { - "name": "model.layers.${layer_index}.self_attn.qkv_proj.weight", - "is_embed": false + "name": "model.layers.${layer_index}.self_attn.qkv_proj.weight" }, { - "name": "model.layers.${layer_index}.mlp.gate_up_proj.weight", - "is_embed": false + "name": "model.layers.${layer_index}.mlp.gate_up_proj.weight" }, { - "name": "model.layers.${layer_index}.mlp.down_proj.weight", - "is_embed": false + "name": "model.layers.${layer_index}.mlp.down_proj.weight" } ] } From 36738ff1e537270d10f25a0a0b24d5e7d4cf99dd Mon Sep 17 00:00:00 2001 From: "Charles O. Goddard" Date: Thu, 22 Aug 2024 19:41:18 -0700 Subject: [PATCH 09/27] Set Gemma2 lm_head optional instead of aliasing to embed_tokens (#406) Resolves #385. 
--- mergekit/_data/architectures/gemma2.json | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mergekit/_data/architectures/gemma2.json b/mergekit/_data/architectures/gemma2.json index aeca0cc8..0c6372f0 100644 --- a/mergekit/_data/architectures/gemma2.json +++ b/mergekit/_data/architectures/gemma2.json @@ -54,9 +54,7 @@ { "name": "lm_head.weight", "is_embed": true, - "aliases": [ - "model.embed_tokens.weight" - ] + "optional": true } ] } From 25377d87ddea7f43ee9175209173979693e8b55c Mon Sep 17 00:00:00 2001 From: Jacobsolawetz Date: Thu, 29 Aug 2024 19:38:30 -0500 Subject: [PATCH 10/27] Cloud Merging (#410) Introduces a light cloud merging option from Arcee --- README.md | 35 ++++++++++++++++++++++++++++++++--- examples/bio-merge.yml | 15 +++++++++++++++ 2 files changed, 47 insertions(+), 3 deletions(-) create mode 100644 examples/bio-merge.yml diff --git a/README.md b/README.md index 0a191a1d..6042d928 100644 --- a/README.md +++ b/README.md @@ -11,10 +11,10 @@ Features: - Interpolated gradients for parameter values (inspired by Gryphe's [BlockMerge_Gradient](https://github.com/Gryphe/BlockMerge_Gradient) script) - Piecewise assembly of language models from layers ("Frankenmerging") - [Mixture of Experts merging](#mixture-of-experts-merging) +- [LORA extraction](#lora-extraction) +- [Evolutionary merge methods](#evolutionary-merge-methods) -🔊 Call to Evolve - to solve evolutionary merge methods as a community - please see . - -🌐 GUI Launch Alert 🤗 - We are excited to announce the launch of a graphical user interface for mergekit in Hugging Face Spaces! This GUI simplifies the merging process, making it more accessible to a broader audience. Check it out and contribute at [Hugging Face Spaces - mergekit-community](https://huggingface.co/mergekit-community). +🌐 GUI Launch Alert 🤗 - We are excited to announce the launch of a mega-GPU Backed graphical user interface for mergekit in Arcee! This GUI simplifies the merging process, making it more accessible to a broader audience. Check it out and contribute at the [Arcee App](app.arcee.ai). There is also a [Hugging Face Space](https://huggingface.co/mergekit-community) with limited amounts of GPUs. ## Installation @@ -213,6 +213,35 @@ mergekit-extract-lora finetuned_model_id_or_path base_model_id_or_path output_pa The `mergekit-moe` script supports merging multiple dense models into a mixture of experts, either for direct use or for further training. For more details see the [`mergekit-moe` documentation](docs/moe.md). +## Evolutionary merge methods + +See `docs/evolve.md` for details. + +## ✨ Merge in the Cloud ✨ + +We host merging on Arcee's cloud GPUs - you can launch a cloud merge in the [Arcee App](app.arcee.ai). 
Or through python - grab an ARCEE_API_KEY: + +`export ARCEE_API_KEY=` +`pip install -q arcee-py` + +``` +import arcee +arcee.merge_yaml("bio-merge","./examples/bio-merge.yml") +``` + +Check your merge status at the [Arcee App](app.arcee.ai) + +When complete, either deploy your merge: + +``` +arcee.start_deployment("bio-merge", merging="bio-merge") +``` + +Or download your merge: + +`!arcee merging download bio-merge` + + ## Citation We now have a [paper](https://arxiv.org/abs/2403.13257) you can cite for the MergeKit library: diff --git a/examples/bio-merge.yml b/examples/bio-merge.yml new file mode 100644 index 00000000..c47101f5 --- /dev/null +++ b/examples/bio-merge.yml @@ -0,0 +1,15 @@ +models: + - model: mistralai/Mistral-7B-Instruct-v0.2 + parameters: + density: 0.5 + weight: 0.5 + - model: BioMistral/BioMistral-7B + parameters: + density: 0.5 + weight: 0.5 +merge_method: ties +base_model: mistralai/Mistral-7B-v0.1 +parameters: + normalize: false + int8_mask: true +dtype: float16 From b5d89c2dde821aff46c9889647d23b1ab5004ea5 Mon Sep 17 00:00:00 2001 From: "Charles O. Goddard" Date: Thu, 29 Aug 2024 17:47:40 -0700 Subject: [PATCH 11/27] Fix link in README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 6042d928..20599898 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ Features: - [LORA extraction](#lora-extraction) - [Evolutionary merge methods](#evolutionary-merge-methods) -🌐 GUI Launch Alert 🤗 - We are excited to announce the launch of a mega-GPU Backed graphical user interface for mergekit in Arcee! This GUI simplifies the merging process, making it more accessible to a broader audience. Check it out and contribute at the [Arcee App](app.arcee.ai). There is also a [Hugging Face Space](https://huggingface.co/mergekit-community) with limited amounts of GPUs. +🌐 GUI Launch Alert 🤗 - We are excited to announce the launch of a mega-GPU backed graphical user interface for mergekit in Arcee! This GUI simplifies the merging process, making it more accessible to a broader audience. Check it out and contribute at the [Arcee App](https://app.arcee.ai). There is also a [Hugging Face Space](https://huggingface.co/mergekit-community) with limited amounts of GPUs. ## Installation From 258addefe1d6b02b318dd51bb88499dea58e78b8 Mon Sep 17 00:00:00 2001 From: "Charles O. Goddard" Date: Sat, 31 Aug 2024 14:11:04 -0700 Subject: [PATCH 12/27] Fix README links (#413) Fixes #412. --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 20599898..cbf93b12 100644 --- a/README.md +++ b/README.md @@ -219,7 +219,7 @@ See `docs/evolve.md` for details. ## ✨ Merge in the Cloud ✨ -We host merging on Arcee's cloud GPUs - you can launch a cloud merge in the [Arcee App](app.arcee.ai). Or through python - grab an ARCEE_API_KEY: +We host merging on Arcee's cloud GPUs - you can launch a cloud merge in the [Arcee App](https://app.arcee.ai). 
Or through python - grab an ARCEE_API_KEY: `export ARCEE_API_KEY=` `pip install -q arcee-py` @@ -229,7 +229,7 @@ import arcee arcee.merge_yaml("bio-merge","./examples/bio-merge.yml") ``` -Check your merge status at the [Arcee App](app.arcee.ai) +Check your merge status at the [Arcee App](https://app.arcee.ai) When complete, either deploy your merge: ``` From a5a597f1535e40a9ffc6ef22b7c0a18aef225130 Mon Sep 17 00:00:00 2001 From: "Geun, Lim" Date: Fri, 13 Sep 2024 07:26:06 +0900 Subject: [PATCH 13/27] Add Solar And Exaone Model (#416) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add the Solar model. The Solar 22B model has been newly uploaded; I hope this PR helps. HuggingFace: [upstage/solar-pro-preview-instruct](https://huggingface.co/upstage/solar-pro-preview-instruct) Support for the Exaone model is also added and works well. --- mergekit/_data/architectures/exaone.json | 78 +++++++++++++++++++++ mergekit/_data/architectures/solar.json | 81 ++++++++++++++++++++++ mergekit/_data/chat_templates/exaone.jinja | 14 ++++ 3 files changed, 173 insertions(+) create mode 100644 mergekit/_data/architectures/exaone.json create mode 100644 mergekit/_data/architectures/solar.json create mode 100644 mergekit/_data/chat_templates/exaone.jinja diff --git a/mergekit/_data/architectures/exaone.json b/mergekit/_data/architectures/exaone.json new file mode 100644 index 00000000..e9024473 --- /dev/null +++ b/mergekit/_data/architectures/exaone.json @@ -0,0 +1,78 @@ +{ + "model_type": "exaone", + "architectures": [ + "ExaoneForCausalLM" + ], + "pre_weights": [ + { + "name": "transformer.wte.weight", + "is_embed": true, + "output_space": "running_residual" + } + ], + "num_layers_config_key": "num_hidden_layers", + "layer_templates": { + "weights": [ + { + "name": "transformer.h.${layer_index}.ln_1.weight", + "input_space": "running_residual" + }, + { + "name": "transformer.h.${layer_index}.attn.attention.q_proj.weight", + "input_space": "running_residual", + "output_space": "attn_qk_${layer_index}", + "head_split": "output", + "is_kq": true + }, + { + "name": "transformer.h.${layer_index}.attn.attention.k_proj.weight", + "input_space": "running_residual", + "output_space": "attn_qk_${layer_index}", + "head_split": "output", + "is_kq": true + }, + { + "name": "transformer.h.${layer_index}.attn.attention.v_proj.weight", + "input_space": "running_residual", + "output_space": "attn_v_${layer_index}", + "head_split": "output" + }, + { + "name": "transformer.h.${layer_index}.attn.attention.out_proj.weight", + "input_space": "attn_v_${layer_index}", + "output_space": "running_residual", + "head_split": "input" + }, + { + "name": "transformer.h.${layer_index}.ln_2.weight", + "input_space": "running_residual" + }, + { + "name": "transformer.h.${layer_index}.mlp.c_fc_0.weight", + "input_space": "running_residual", + "output_space": "up_${layer_index}" + }, + { + "name": "transformer.h.${layer_index}.mlp.c_fc_1.weight", + "input_space": "running_residual", + "output_space": "up_${layer_index}" + }, + { + "name": "transformer.h.${layer_index}.mlp.c_proj.weight", + "input_space": "up_${layer_index}", + "output_space": "running_residual" + } + ] + }, + "post_weights": [ + { + "name": "transformer.ln_f.weight", + "input_space": "running_residual" + }, + { + "name": "lm_head.weight", + "input_space": "running_residual", + "is_embed": true + } + ] +} diff --git a/mergekit/_data/architectures/solar.json b/mergekit/_data/architectures/solar.json new file mode 100644 index
00000000..7bd6a751 --- /dev/null +++ b/mergekit/_data/architectures/solar.json @@ -0,0 +1,81 @@ +{ + "model_type": "solar", + "architectures": [ + "SolarForCausalLM" + ], + "pre_weights": [ + { + "name": "model.embed_tokens.weight", + "is_embed": true, + "output_space": "running_residual" + } + ], + "num_layers_config_key": "num_hidden_layers", + "layer_templates": { + "weights": [ + { + "name": "model.layers.${layer_index}.input_layernorm.weight", + "input_space": "running_residual" + }, + { + "name": "model.layers.${layer_index}.self_attn.k_proj.weight", + "input_space": "running_residual", + "output_space": "attn_qk_${layer_index}", + "head_split": "output", + "is_kq": true + }, + { + "name": "model.layers.${layer_index}.self_attn.q_proj.weight", + "input_space": "running_residual", + "output_space": "attn_qk_${layer_index}", + "head_split": "output", + "is_kq": true + }, + { + "name": "model.layers.${layer_index}.self_attn.v_proj.weight", + "input_space": "running_residual", + "output_space": "attn_v_${layer_index}", + "head_split": "output" + }, + { + "name": "model.layers.${layer_index}.self_attn.o_proj.weight", + "input_space": "attn_v_${layer_index}", + "output_space": "running_residual", + "head_split": "input" + }, + { + "name": "model.layers.${layer_index}.post_attention_layernorm.weight", + "input_space": "running_residual" + }, + { + "name": "model.layers.${layer_index}.mlp.gate_proj.weight", + "input_space": "running_residual", + "output_space": "up_${layer_index}" + }, + { + "name": "model.layers.${layer_index}.mlp.up_proj.weight", + "input_space": "running_residual", + "output_space": "up_${layer_index}" + }, + { + "name": "model.layers.${layer_index}.mlp.down_proj.weight", + "input_space": "up_${layer_index}", + "output_space": "running_residual" + } + ] + }, + "post_weights": [ + { + "name": "model.norm.weight", + "input_space": "running_residual" + }, + { + "name": "lm_head.weight", + "input_space": "running_residual", + "is_embed": true, + "aliases": [ + "model.lm_head.weight" + ] + } + ] +} diff --git a/mergekit/_data/chat_templates/exaone.jinja b/mergekit/_data/chat_templates/exaone.jinja new file mode 100644 index 00000000..3a4d07ae --- /dev/null +++ b/mergekit/_data/chat_templates/exaone.jinja @@ -0,0 +1,14 @@ +{% for message in messages %} + {% if loop.first and message['role'] != 'system' %} + {{ '[|system|][|endofturn|]\n' }} + {% endif %} + {{ '[|' + message['role'] + '|]' + message['content'] }} + {% if message['role'] == 'user' %} + {{ '\n' }} + {% else %} + {{ '[|endofturn|]\n' }} + {% endif %} +{% endfor %} +{% if add_generation_prompt %} + {{ '[|assistant|]' }} +{% endif %} From c891a0900969c1fb8ce678e3ace0e084a92c24fc Mon Sep 17 00:00:00 2001 From: "Charles O. Goddard" Date: Mon, 16 Sep 2024 15:11:15 -0700 Subject: [PATCH 14/27] Make Cohere lm_head optional (#417) --- mergekit/_data/architectures/cohere.json | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mergekit/_data/architectures/cohere.json b/mergekit/_data/architectures/cohere.json index 36178c81..c8033103 100644 --- a/mergekit/_data/architectures/cohere.json +++ b/mergekit/_data/architectures/cohere.json @@ -16,9 +16,7 @@ { "name": "lm_head.weight", "is_embed": true, - "aliases": [ - "model.embed_tokens.weight" - ] + "optional": true } ], "num_layers_config_key": "num_hidden_layers", From 852291726650c8dd6ac78721c2c4d0fbdafc8e3d Mon Sep 17 00:00:00 2001 From: "Charles O. 
Goddard" Date: Fri, 4 Oct 2024 16:43:03 -0700 Subject: [PATCH 15/27] Update Llama architecture to handle 3b/1b (#429) --- mergekit/_data/architectures/llama.json | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mergekit/_data/architectures/llama.json b/mergekit/_data/architectures/llama.json index 3095b207..7106806b 100644 --- a/mergekit/_data/architectures/llama.json +++ b/mergekit/_data/architectures/llama.json @@ -73,10 +73,8 @@ { "name": "lm_head.weight", "input_space": "running_residual", - "is_embed":true, - "aliases": [ - "model.lm_head.weight" - ] + "is_embed": true, + "optional": true } ] } From 459121e0ef6f2da61ecaac646b096cb9d1ceb5fc Mon Sep 17 00:00:00 2001 From: "Charles O. Goddard" Date: Sat, 5 Oct 2024 12:57:08 -0700 Subject: [PATCH 16/27] Handle merges stored as list instead of space-separated string (#430) Should resolve #428. --- mergekit/tokenizer/build.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/mergekit/tokenizer/build.py b/mergekit/tokenizer/build.py index fb9f9d9c..3cefed91 100644 --- a/mergekit/tokenizer/build.py +++ b/mergekit/tokenizer/build.py @@ -90,7 +90,12 @@ def get_stripped_tokenizer( del tok_dict["model"]["vocab"][tok] def _keep_merge(m): - toks = m.split(" ") + if isinstance(m, str) and m.count(" ") == 1: + toks = m.split(" ") + elif isinstance(m, list): + toks = m + else: + raise RuntimeError(f"Unexpected merge format: {repr(m)} ({type(m)})") for tok in toks: if tok in unused_toks: return False From 93ace70a2df88a51eac2a3deb26862955ab4c229 Mon Sep 17 00:00:00 2001 From: "Charles O. Goddard" Date: Wed, 23 Oct 2024 17:02:03 +0200 Subject: [PATCH 17/27] Bump dependencies (#444) --- pyproject.toml | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 01915feb..9bb09a7d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,25 +11,26 @@ version = "0.0.4.4" authors = [{ name = "Charles Goddard", email = "chargoddard@gmail.com" }] dependencies = [ "torch>=2.0.0", - "tqdm==4.66.4", + "tqdm==4.66.5", "click==8.1.7", "safetensors~=0.4.3", - "accelerate~=0.30.1", - "pydantic==2.7.1", + "accelerate~=1.0.1", + "pydantic~=2.9.2", "immutables==0.20", - "transformers>=4.42.3", + "transformers>=4.45.2", + "tokenizers>=0.20.1", "huggingface_hub", "peft", "typing-extensions", "sentencepiece", "protobuf", "scipy", - "datasets" + "datasets", ] [project.optional-dependencies] -dev = ["black~=24.4.2", "isort~=5.13.2", "pre-commit~=3.7.1"] -test = ["pytest~=8.2.1"] +dev = ["black~=24.10.0", "isort~=5.13.2", "pre-commit~=4.0.1"] +test = ["pytest~=8.3.3"] evolve = ["ray", "cma", "lm_eval", "wandb"] vllm = ["vllm==0.3.2", "lm_eval[vllm]"] From 2a9d75420ba5cd43e7101e1bc20a5fc6872577fa Mon Sep 17 00:00:00 2001 From: "Charles O. 
Goddard" Date: Tue, 29 Oct 2024 20:10:54 +0100 Subject: [PATCH 18/27] Bump version number (#448) --- mergekit/io/tensor_writer.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mergekit/io/tensor_writer.py b/mergekit/io/tensor_writer.py index 1483a3c3..2f9e2a1e 100644 --- a/mergekit/io/tensor_writer.py +++ b/mergekit/io/tensor_writer.py @@ -121,7 +121,7 @@ def finalize(self): json.dump( { "metadata": { - "mergekit_version": "0.0.4.4", + "mergekit_version": "0.0.5", "total_size": self.total_size, }, "weight_map": self.weight_map, diff --git a/pyproject.toml b/pyproject.toml index 9bb09a7d..8a1292bb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ name = "mergekit" description = "Tools for merging pre-trained large language models" readme = "README.md" license = { text = "LGPL-3.0-or-later" } -version = "0.0.4.4" +version = "0.0.5" authors = [{ name = "Charles Goddard", email = "chargoddard@gmail.com" }] dependencies = [ "torch>=2.0.0", From 57e7d14e2a732f532970e2c9dada00e2d8f15a7a Mon Sep 17 00:00:00 2001 From: Charles Goddard Date: Wed, 30 Oct 2024 12:44:49 -0700 Subject: [PATCH 19/27] Bump again for PyPi --- mergekit/io/tensor_writer.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mergekit/io/tensor_writer.py b/mergekit/io/tensor_writer.py index 2f9e2a1e..199772ea 100644 --- a/mergekit/io/tensor_writer.py +++ b/mergekit/io/tensor_writer.py @@ -121,7 +121,7 @@ def finalize(self): json.dump( { "metadata": { - "mergekit_version": "0.0.5", + "mergekit_version": "0.0.5.1", "total_size": self.total_size, }, "weight_map": self.weight_map, diff --git a/pyproject.toml b/pyproject.toml index 8a1292bb..128a5b87 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ name = "mergekit" description = "Tools for merging pre-trained large language models" readme = "README.md" license = { text = "LGPL-3.0-or-later" } -version = "0.0.5" +version = "0.0.5.1" authors = [{ name = "Charles Goddard", email = "chargoddard@gmail.com" }] dependencies = [ "torch>=2.0.0", From afe37801a76cf037e0639b12b59c30766a54c62b Mon Sep 17 00:00:00 2001 From: "Charles O. Goddard" Date: Sat, 30 Nov 2024 12:50:04 -0800 Subject: [PATCH 20/27] Handle optional weights in mergekit-moe (#463) Now working with base models with tied weights. --- mergekit/evo/actors.py | 2 +- mergekit/moe/common.py | 30 +++++++++++++++++++ mergekit/moe/deepseek.py | 49 +++++++++++++++----------------- mergekit/moe/mixtral.py | 30 +++++++++---------- mergekit/moe/qwen.py | 38 ++++++++++++------------- mergekit/scripts/extract_lora.py | 2 -- 6 files changed, 86 insertions(+), 65 deletions(-) diff --git a/mergekit/evo/actors.py b/mergekit/evo/actors.py index e107efe7..ff5c4986 100644 --- a/mergekit/evo/actors.py +++ b/mergekit/evo/actors.py @@ -207,7 +207,7 @@ def _maybe_init_model(self, config: MergeConfiguration): tokenizer_donor = self.genome.definition.base_model if tokenizer_donor is None: logging.warning( - f"Base model not set, using tokenizer from first model in genome" + "Base model not set, using tokenizer from first model in genome" ) tokenizer_donor = self.genome.definition.models[0] tok = transformers.AutoTokenizer.from_pretrained( diff --git a/mergekit/moe/common.py b/mergekit/moe/common.py index 4a0df69c..a5970b4a 100644 --- a/mergekit/moe/common.py +++ b/mergekit/moe/common.py @@ -13,12 +13,14 @@ # You should have received a copy of the GNU Lesser General Public License # along with this program. 
If not, see http://www.gnu.org/licenses/. +import logging from typing import Dict, Optional, Tuple import torch import tqdm import transformers +from mergekit.architecture import WeightInfo from mergekit.common import ModelReference, dtype_from_name from mergekit.io import LazyTensorLoader, TensorWriter from mergekit.merge import MergeOptions @@ -73,3 +75,31 @@ def noise_and_scale( if is_residual and expert.residual_scale is not None: tensor = tensor * expert.residual_scale return tensor + + +def copy_tensor_out( + weight_info: WeightInfo, + loader: LazyTensorLoader, + writer: TensorWriter, + expert: Optional[Expert] = None, + is_residual: bool = False, + output_name: Optional[str] = None, + out_dtype: Optional[torch.dtype] = None, + clone: bool = False, +): + out_tensor_name = output_name or weight_info.name + try: + tensor = loader.get_tensor(weight_info.name, aliases=weight_info.aliases) + except KeyError: + tensor = None + if tensor is None and not weight_info.optional: + logging.error(f"Missing weight: {weight_info.name} / {out_tensor_name}") + raise KeyError(out_tensor_name) + + if expert: + tensor = noise_and_scale(tensor, expert, is_residual=is_residual) + writer.save_tensor( + out_tensor_name, + tensor.to(dtype=out_dtype), + clone=clone, + ) diff --git a/mergekit/moe/deepseek.py b/mergekit/moe/deepseek.py index 1f7226fb..4ce62865 100644 --- a/mergekit/moe/deepseek.py +++ b/mergekit/moe/deepseek.py @@ -24,7 +24,7 @@ from mergekit.architecture import get_architecture_info from mergekit.moe.arch import MoEOutputArchitecture -from mergekit.moe.common import initialize_io, noise_and_scale, select_dtype +from mergekit.moe.common import copy_tensor_out, initialize_io, select_dtype from mergekit.moe.config import MoEMergeConfig from mergekit.options import MergeOptions @@ -148,39 +148,36 @@ def write_model( ".mlp.", f".mlp.experts.{expert_idx}." ) expert_loader = loaders.get(expert.source_model) - tensor = expert_loader.get_tensor( - weight_info.name, aliases=weight_info.aliases - ) - tensor = noise_and_scale( - tensor, expert, is_residual="down_proj" in tensor_name - ) - writer.save_tensor( - expert_name, - tensor.to(dtype=out_dtype), + copy_tensor_out( + weight_info, + expert_loader, + writer, + expert=expert, + is_residual="down_proj" in tensor_name, + output_name=expert_name, + out_dtype=out_dtype, clone=merge_options.clone_tensors, ) if shared_def is not None: - shared_tensor = shared_loader.get_tensor( - weight_info.name, aliases=weight_info.aliases - ) - shared_tensor = noise_and_scale( - shared_tensor, - shared_def, + copy_tensor_out( + weight_info, + shared_loader, + writer, + expert=shared_def, is_residual="down_proj" in tensor_name, - ) - writer.save_tensor( - tensor_name.replace(".mlp.", ".mlp.shared_experts."), - shared_tensor.to(dtype=out_dtype), + output_name=tensor_name.replace( + ".mlp.", ".mlp.shared_experts." 
+ ), + out_dtype=out_dtype, clone=merge_options.clone_tensors, ) else: - tensor = base_loader.get_tensor( - tensor_name, aliases=weight_info.aliases - ) - writer.save_tensor( - tensor_name, - tensor.to(dtype=out_dtype), + copy_tensor_out( + weight_info, + base_loader, + writer, + out_dtype=out_dtype, clone=merge_options.clone_tensors, ) diff --git a/mergekit/moe/mixtral.py b/mergekit/moe/mixtral.py index 538cb701..f3fe97df 100644 --- a/mergekit/moe/mixtral.py +++ b/mergekit/moe/mixtral.py @@ -22,7 +22,7 @@ from mergekit.architecture import MISTRAL_INFO, WeightInfo from mergekit.moe.arch import MoEOutputArchitecture -from mergekit.moe.common import initialize_io, noise_and_scale, select_dtype +from mergekit.moe.common import copy_tensor_out, initialize_io, select_dtype from mergekit.moe.config import MoEMergeConfig from mergekit.options import MergeOptions @@ -145,24 +145,22 @@ def write_model( for expert_index, expert in enumerate(config.experts): expert_name = tensor_name.replace("{expert_idx}", str(expert_index)) expert_loader = loaders.get(expert.source_model) - tensor = expert_loader.get_tensor( - weight_info.name, aliases=weight_info.aliases - ) - tensor = noise_and_scale( - tensor, expert, is_residual="down_proj" in tensor_name - ) - writer.save_tensor( - expert_name, - tensor.to(dtype=out_dtype), + copy_tensor_out( + weight_info, + expert_loader, + writer, + expert=expert, + out_dtype=out_dtype, + output_name=expert_name, clone=merge_options.clone_tensors, + is_residual="down_proj" in tensor_name, ) else: - tensor = base_loader.get_tensor( - tensor_name, aliases=weight_info.aliases - ) - writer.save_tensor( - tensor_name, - tensor.to(dtype=out_dtype), + copy_tensor_out( + weight_info, + base_loader, + writer, + out_dtype=out_dtype, clone=merge_options.clone_tensors, ) diff --git a/mergekit/moe/qwen.py b/mergekit/moe/qwen.py index ab94f7d5..65337a0a 100644 --- a/mergekit/moe/qwen.py +++ b/mergekit/moe/qwen.py @@ -26,7 +26,7 @@ from mergekit.architecture import QWEN2_INFO from mergekit.moe.arch import MoEOutputArchitecture -from mergekit.moe.common import initialize_io, noise_and_scale, select_dtype +from mergekit.moe.common import copy_tensor_out, initialize_io, select_dtype from mergekit.moe.config import MoEMergeConfig from mergekit.options import MergeOptions @@ -137,29 +137,25 @@ def write_model( ".mlp.", f".mlp.experts.{expert_idx}." 
) expert_loader = loaders.get(expert.source_model) - tensor = expert_loader.get_tensor( - weight_info.name, aliases=weight_info.aliases - ) - tensor = noise_and_scale( - tensor, expert, is_residual="down_proj" in tensor_name - ) - writer.save_tensor( - expert_name, - tensor.to(dtype=out_dtype), + copy_tensor_out( + weight_info, + expert_loader, + writer, + expert=expert, + is_residual="down_proj" in tensor_name, + output_name=expert_name, + out_dtype=out_dtype, clone=merge_options.clone_tensors, ) - shared_tensor = shared_loader.get_tensor( - weight_info.name, aliases=weight_info.aliases - ) - shared_tensor = noise_and_scale( - shared_tensor, - shared_def, + copy_tensor_out( + weight_info, + shared_loader, + writer, + expert=shared_def, is_residual="down_proj" in tensor_name, - ) - writer.save_tensor( - tensor_name.replace(".mlp.", ".mlp.shared_expert."), - shared_tensor.to(dtype=out_dtype), + output_name=tensor_name.replace(".mlp.", ".mlp.shared_expert."), + out_dtype=out_dtype, clone=merge_options.clone_tensors, ) else: @@ -180,6 +176,8 @@ def write_model( else out_cfg.num_attention_heads ) tensor = torch.zeros(num_heads * head_dim, dtype=out_dtype) + elif weight_info.optional: + continue else: raise diff --git a/mergekit/scripts/extract_lora.py b/mergekit/scripts/extract_lora.py index ff063232..69c010bb 100644 --- a/mergekit/scripts/extract_lora.py +++ b/mergekit/scripts/extract_lora.py @@ -8,10 +8,8 @@ import torch from peft.tuners.lora import QuantLinear from safetensors.torch import save_file -from torch.nn.functional import pad from tqdm import tqdm from transformers import AutoModelForCausalLM -from transformers.modeling_utils import PreTrainedModel from transformers.pytorch_utils import Conv1D from mergekit.card import generate_card_lora From 68c4b65e93d9e5c16d04935e8d567cb74cf4886a Mon Sep 17 00:00:00 2001 From: "Charles O. Goddard" Date: Sat, 30 Nov 2024 13:55:02 -0800 Subject: [PATCH 21/27] Better tied weight handling (#464) Handle cases where some input models have a tied tensor and some don't. For example, there are some fine tunes of Llama 3.2 3B floating around that are ~3.6B parameters because they have a separate LM head - with these changes these can be merged with standard sized ones. There will be a LM head in the output model if any inputs have one. Otherwise behavior will be as it was before. 
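As a concrete illustration, a config along these lines now merges cleanly even though only one of the inputs carries its own `lm_head.weight`. The model names below are illustrative placeholders; any merge method behaves the same way:

```yaml
# Hypothetical example: the first fine-tune ships a separate lm_head.weight
# (~3.6B parameters), the second keeps the standard tied embeddings.
# The merged model will have an lm_head because at least one input does.
merge_method: ties
base_model: meta-llama/Llama-3.2-3B-Instruct
models:
  - model: example-org/llama-3.2-3b-untied-finetune  # separate LM head
    parameters:
      density: 0.5
      weight: 0.5
  - model: example-org/llama-3.2-3b-tied-finetune    # tied to embed_tokens
    parameters:
      density: 0.5
      weight: 0.5
dtype: bfloat16
```

The changes below add `tied_names` entries to the affected architecture definitions and let tensor loading fall back to a tied weight's name when the primary name is missing.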
--- .../_data/architectures/bert-masked-lm.json | 3 +- .../architectures/distilbert-masked-lm.json | 3 +- mergekit/_data/architectures/gemma2.json | 5 +- mergekit/_data/architectures/gptbigcode.json | 4 +- mergekit/_data/architectures/internlm2.json | 3 +- mergekit/_data/architectures/llama.json | 5 +- mergekit/_data/architectures/mamba.json | 5 +- mergekit/_data/architectures/phi3-small.json | 5 +- mergekit/_data/architectures/qwen2.json | 3 +- .../architectures/roberta-masked-lm.json | 7 ++- mergekit/_data/architectures/solar.json | 3 +- mergekit/_data/architectures/starcoder2.json | 5 +- mergekit/architecture.py | 4 ++ mergekit/io/tasks.py | 6 +- mergekit/io/tensor_writer.py | 2 +- mergekit/plan.py | 5 +- mergekit/scripts/tokensurgeon.py | 58 +++++++++++++------ pyproject.toml | 2 +- 18 files changed, 91 insertions(+), 37 deletions(-) diff --git a/mergekit/_data/architectures/bert-masked-lm.json b/mergekit/_data/architectures/bert-masked-lm.json index 3b0620fb..d6430e40 100644 --- a/mergekit/_data/architectures/bert-masked-lm.json +++ b/mergekit/_data/architectures/bert-masked-lm.json @@ -44,7 +44,8 @@ }, { "name": "cls.predictions.decoder.weight", - "aliases": [ + "optional": true, + "tied_names": [ "bert.embeddings.word_embeddings.weight" ], "is_embed": true diff --git a/mergekit/_data/architectures/distilbert-masked-lm.json b/mergekit/_data/architectures/distilbert-masked-lm.json index 6828cca2..1a079811 100644 --- a/mergekit/_data/architectures/distilbert-masked-lm.json +++ b/mergekit/_data/architectures/distilbert-masked-lm.json @@ -40,7 +40,8 @@ { "name": "vocab_projector.weight", "is_embed": true, - "aliases": [ + "optional": true, + "tied_names": [ "distilbert.embeddings.word_embeddings.weight" ] }, diff --git a/mergekit/_data/architectures/gemma2.json b/mergekit/_data/architectures/gemma2.json index 0c6372f0..52505245 100644 --- a/mergekit/_data/architectures/gemma2.json +++ b/mergekit/_data/architectures/gemma2.json @@ -54,7 +54,10 @@ { "name": "lm_head.weight", "is_embed": true, - "optional": true + "optional": true, + "tied_names": [ + "model.embed_tokens.weight" + ] } ] } diff --git a/mergekit/_data/architectures/gptbigcode.json b/mergekit/_data/architectures/gptbigcode.json index 4b086278..c12bac5c 100644 --- a/mergekit/_data/architectures/gptbigcode.json +++ b/mergekit/_data/architectures/gptbigcode.json @@ -21,7 +21,9 @@ }, { "name": "lm_head.weight", - "aliases": [ + "is_embed": true, + "optional": true, + "tied_names": [ "transformer.wte.weight" ] } diff --git a/mergekit/_data/architectures/internlm2.json b/mergekit/_data/architectures/internlm2.json index 057bc649..888faa48 100644 --- a/mergekit/_data/architectures/internlm2.json +++ b/mergekit/_data/architectures/internlm2.json @@ -16,7 +16,8 @@ { "name": "output.weight", "is_embed": true, - "aliases": [ + "optional": true, + "tied_names": [ "model.tok_embeddings.weight" ] } diff --git a/mergekit/_data/architectures/llama.json b/mergekit/_data/architectures/llama.json index 7106806b..00918a2c 100644 --- a/mergekit/_data/architectures/llama.json +++ b/mergekit/_data/architectures/llama.json @@ -74,7 +74,10 @@ "name": "lm_head.weight", "input_space": "running_residual", "is_embed": true, - "optional": true + "optional": true, + "tied_names": [ + "model.embed_tokens.weight" + ] } ] } diff --git a/mergekit/_data/architectures/mamba.json b/mergekit/_data/architectures/mamba.json index b3727dba..1c473532 100644 --- a/mergekit/_data/architectures/mamba.json +++ b/mergekit/_data/architectures/mamba.json @@ -16,7 +16,10 @@ { 
"name": "lm_head.weight", "is_embed": true, - "aliases": ["backbone.embeddings.weight"] + "optional": true, + "tied_names": [ + "backbone.embeddings.weight" + ] } ], "num_layers_config_key": "num_hidden_layers", diff --git a/mergekit/_data/architectures/phi3-small.json b/mergekit/_data/architectures/phi3-small.json index 7b3a1e80..f27dfac4 100644 --- a/mergekit/_data/architectures/phi3-small.json +++ b/mergekit/_data/architectures/phi3-small.json @@ -12,8 +12,9 @@ "post_weights": [ { "name": "lm_head.weight", - "is_embed":true, - "aliases": [ + "is_embed": true, + "optional": true, + "tied_names": [ "model.embed_tokens.weight" ] }, diff --git a/mergekit/_data/architectures/qwen2.json b/mergekit/_data/architectures/qwen2.json index 638b3630..c7131523 100644 --- a/mergekit/_data/architectures/qwen2.json +++ b/mergekit/_data/architectures/qwen2.json @@ -16,7 +16,8 @@ { "name": "lm_head.weight", "is_embed": true, - "aliases": [ + "optional": true, + "tied_names": [ "model.embed_tokens.weight" ] } diff --git a/mergekit/_data/architectures/roberta-masked-lm.json b/mergekit/_data/architectures/roberta-masked-lm.json index 492127a5..1aae76a1 100644 --- a/mergekit/_data/architectures/roberta-masked-lm.json +++ b/mergekit/_data/architectures/roberta-masked-lm.json @@ -8,7 +8,8 @@ "name": "roberta.embeddings.position_embeddings.weight" }, { - "name": "roberta.embeddings.word_embeddings.weight" + "name": "roberta.embeddings.word_embeddings.weight", + "is_embed": true }, { "name": "roberta.embeddings.token_type_embeddings.weight" @@ -43,7 +44,9 @@ }, { "name": "lm_head.decoder.weight", - "aliases": [ + "is_embed": true, + "optional": true, + "tied_names": [ "roberta.embeddings.word_embeddings.weight" ] } diff --git a/mergekit/_data/architectures/solar.json b/mergekit/_data/architectures/solar.json index 7bd6a751..78fd5998 100644 --- a/mergekit/_data/architectures/solar.json +++ b/mergekit/_data/architectures/solar.json @@ -73,7 +73,8 @@ "name": "lm_head.weight", "input_space": "running_residual", "is_embed": true, - "aliases": [ + "optional": true, + "tied_names": [ "model.lm_head.weight" ] } diff --git a/mergekit/_data/architectures/starcoder2.json b/mergekit/_data/architectures/starcoder2.json index 851fdd1a..c2266899 100644 --- a/mergekit/_data/architectures/starcoder2.json +++ b/mergekit/_data/architectures/starcoder2.json @@ -13,7 +13,10 @@ { "name": "lm_head.weight", "is_embed": true, - "aliases": ["model.embed_tokens.weight"] + "optional": true, + "tied_names": [ + "model.embed_tokens.weight" + ] }, { "name": "model.norm.bias" diff --git a/mergekit/architecture.py b/mergekit/architecture.py index 4c7b4625..40872160 100644 --- a/mergekit/architecture.py +++ b/mergekit/architecture.py @@ -41,6 +41,8 @@ class WeightInfo(BaseModel, frozen=True): Indicates whether the weight can be omitted from a model. aliases (Optional[List[str]]): List of alternative names for the weight, if applicable. + tied_names (Optional[List[str]]): + List of names for weights that are tied to this weight, if applicable. force_dtype (Optional[str]): Mandatory dtype for the weight, if applicable. 
""" @@ -50,7 +52,9 @@ class WeightInfo(BaseModel, frozen=True): input_space: Optional[str] = None output_space: Optional[str] = None optional: bool = False + tied: bool = False aliases: Optional[Tuple[str, ...]] = None + tied_names: Optional[Tuple[str, ...]] = None force_dtype: Optional[str] = None head_split: Literal[None, "input", "output"] = None is_kq: Optional[bool] = False diff --git a/mergekit/io/tasks.py b/mergekit/io/tasks.py index 70dffc41..499ad4c0 100644 --- a/mergekit/io/tasks.py +++ b/mergekit/io/tasks.py @@ -67,12 +67,15 @@ class LoadTensor(Task[Optional[torch.Tensor]]): device: Optional[str] = None optional: bool = False aliases: Optional[Tuple[str, ...]] = None + tied_names: Optional[Tuple[str, ...]] = None def arguments(self) -> Dict[str, Task]: return {} def _resolve_name(self, loader: LazyTensorLoader) -> Optional[str]: - all_names = [self.tensor] + list(self.aliases or []) + all_names = ( + [self.tensor] + list(self.aliases or []) + list(self.tied_names or []) + ) for name in all_names: if name in loader.index.tensor_paths: return name @@ -120,6 +123,7 @@ def arguments(self) -> Dict[str, Task]: device=self.device, optional=wi.optional, aliases=wi.aliases, + tied_names=wi.tied_names, ) for (model, wi) in self.weight_info.items() } diff --git a/mergekit/io/tensor_writer.py b/mergekit/io/tensor_writer.py index 199772ea..9ea58222 100644 --- a/mergekit/io/tensor_writer.py +++ b/mergekit/io/tensor_writer.py @@ -121,7 +121,7 @@ def finalize(self): json.dump( { "metadata": { - "mergekit_version": "0.0.5.1", + "mergekit_version": "0.0.5.2", "total_size": self.total_size, }, "weight_map": self.weight_map, diff --git a/mergekit/plan.py b/mergekit/plan.py index bdcd7004..5b34eddc 100644 --- a/mergekit/plan.py +++ b/mergekit/plan.py @@ -139,7 +139,10 @@ def plan_tensor( any_weight = False for model, w_in in zip(models, weights_in): index = LoaderCache().get(model).index - if w_in.name in index.tensor_paths: + if any( + name in index.tensor_paths + for name in [w_in.name] + (w_in.aliases or []) + ): any_weight = True break diff --git a/mergekit/scripts/tokensurgeon.py b/mergekit/scripts/tokensurgeon.py index a6715643..ea6dd4bc 100644 --- a/mergekit/scripts/tokensurgeon.py +++ b/mergekit/scripts/tokensurgeon.py @@ -147,26 +147,42 @@ def main( ) if lm_head_info: - old_lm_head = cache.get(model).get_tensor( - lm_head_info.name, aliases=lm_head_info.aliases, device=device - ) - donor_lm_head = cache.get(donor).get_tensor( - donor_lm_head_info.name, aliases=donor_lm_head_info.aliases, device=device - ) + try: + old_lm_head = cache.get(model).get_tensor( + lm_head_info.name, aliases=lm_head_info.aliases, device=device + ) + except KeyError: + if lm_head_info.optional: + logging.info(f"LM head tensor {lm_head_info.name} not found, skipping") + else: + report_issue( + f"Could not load LM head tensor {lm_head_info.name}", + error=True, + ) + old_lm_head = None - LOG.info("Computing new lm_head embeddings") - new_lm_head = get_embeddings( - old_lm_head, - donor_lm_head, - old_vocab, - new_vocab, - common_tokens, - accept_prefix=True, - k=k, - barycentric=barycentric, - cosine_similarity=cosine_similarity, - name=lm_head_info.name, - ) + if old_lm_head is not None: + donor_lm_head = cache.get(donor).get_tensor( + donor_lm_head_info.name, + aliases=donor_lm_head_info.aliases, + device=device, + ) + + LOG.info("Computing new lm_head embeddings") + new_lm_head = get_embeddings( + old_lm_head, + donor_lm_head, + old_vocab, + new_vocab, + common_tokens, + accept_prefix=True, + k=k, + 
barycentric=barycentric, + cosine_similarity=cosine_similarity, + name=lm_head_info.name, + ) + else: + new_lm_head = None # Save out the new model LOG.info(f"Saving new model to {out_path}") @@ -184,6 +200,10 @@ def main( tensor = cache.get(model).get_tensor( weight_info.name, aliases=weight_info.aliases ) + if tensor is None: + if weight_info.optional: + continue + report_issue(f"Could not load weight tensor {weight_info.name}", error=True) writer.save_tensor(weight_info.name, tensor, clone=merge_options.clone_tensors) writer.finalize() diff --git a/pyproject.toml b/pyproject.toml index 128a5b87..e04fd464 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ name = "mergekit" description = "Tools for merging pre-trained large language models" readme = "README.md" license = { text = "LGPL-3.0-or-later" } -version = "0.0.5.1" +version = "0.0.5.2" authors = [{ name = "Charles Goddard", email = "chargoddard@gmail.com" }] dependencies = [ "torch>=2.0.0", From 8d1a10df83bb9ac9c3854199aed20542c7d08231 Mon Sep 17 00:00:00 2001 From: zsgvivo Date: Sun, 1 Dec 2024 06:18:40 +0800 Subject: [PATCH 22/27] Add methods from https://arxiv.org/abs/2405.07813 (#441) add consensus_ties and consensus_ta method from https://arxiv.org/abs/2405.07813 --- mergekit/merge_methods/__init__.py | 16 +++++++ .../generalized_task_arithmetic.py | 43 +++++++++++++++++-- mergekit/sparsify.py | 16 ++++++- 3 files changed, 70 insertions(+), 5 deletions(-) diff --git a/mergekit/merge_methods/__init__.py b/mergekit/merge_methods/__init__.py index 007e163e..6dc92023 100644 --- a/mergekit/merge_methods/__init__.py +++ b/mergekit/merge_methods/__init__.py @@ -93,6 +93,22 @@ def get(method: str) -> MergeMethod: default_normalize=False, default_rescale=True, ) + + elif method == "consensus_ta": + return GeneralizedTaskArithmeticMerge( + consensus_method=None, + sparsification_method=SparsificationMethod.consensus_ta, + default_normalize=False, + default_rescale=False, + ) + + elif method == "consensus_ties": + return GeneralizedTaskArithmeticMerge( + consensus_method=ConsensusMethod.sum, + sparsification_method=SparsificationMethod.consensus_ties, + default_normalize=True, + default_rescale=False, + ) raise RuntimeError(f"Unimplemented merge method {method}") diff --git a/mergekit/merge_methods/generalized_task_arithmetic.py b/mergekit/merge_methods/generalized_task_arithmetic.py index 214726b7..0bb3f0c7 100644 --- a/mergekit/merge_methods/generalized_task_arithmetic.py +++ b/mergekit/merge_methods/generalized_task_arithmetic.py @@ -29,7 +29,7 @@ MergeMethod, MergeTensorInput, ) -from mergekit.sparsify import SparsificationMethod, sparsify +from mergekit.sparsify import SparsificationMethod, get_tall_mask, sparsify class ConsensusMethod(str, Enum): @@ -79,6 +79,22 @@ def tensor_parameters(self) -> List[ConfigParameterDef]: default_value=1.0, ) ) + if ( + self.sparsification_method == SparsificationMethod.consensus_ta + or self.sparsification_method == SparsificationMethod.consensus_ties + ): + res.append( + ConfigParameterDef( + name="k", + default_value=1, + ) + ) + res.append( + ConfigParameterDef( + name="lambda", + default_value=1.0, + ) + ) return res def make_task( @@ -133,7 +149,10 @@ def execute( return base # sparsify - if self.method.sparsification_method: + if ( + self.method.sparsification_method + and self.method.sparsification_method != SparsificationMethod.consensus_ta + ): for tv_info in tvs: kwargs = {} if "gamma" in tv_info: @@ -142,7 +161,7 @@ def execute( if "epsilon" in tv_info: kwargs["epsilon"] = 
tv_info["epsilon"] - tv_info["delta"] = sparsify( + tv_info["sparsified_delta"] = sparsify( tv_info["delta"], density=tv_info["density"], method=self.method.sparsification_method, @@ -150,7 +169,9 @@ def execute( **kwargs, ) - deltas = torch.stack([tv["delta"] for tv in tvs], dim=0) + deltas = torch.stack([tv["sparsified_delta"] for tv in tvs], dim=0) + else: + deltas = torch.stack([tv["delta"] for tv in tvs], dim=0) weights = torch.tensor( [tv["weight"] for tv in tvs], dtype=deltas.dtype, device=deltas.device ) @@ -185,6 +206,20 @@ def execute( lambda_factor = tvs[0]["lambda"] mixed_delta *= lambda_factor + if ( + self.method.sparsification_method == SparsificationMethod.consensus_ta + or self.method.sparsification_method == SparsificationMethod.consensus_ties + ): + for tv_info in tvs: + tv_info["tall_mask"] = get_tall_mask( + tv_info["delta"], + tv_info["lambda"], + mixed_delta, + ) + tall_masks = torch.stack([tv["tall_mask"] for tv in tvs], dim=0) + consensus_mask = tall_masks.sum(dim=0) >= tvs[0]["k"] + mixed_delta = mixed_delta * consensus_mask + return (base + mixed_delta).to(base.dtype) def group_label(self) -> Optional[str]: diff --git a/mergekit/sparsify.py b/mergekit/sparsify.py index ee6477c3..f782247f 100644 --- a/mergekit/sparsify.py +++ b/mergekit/sparsify.py @@ -23,6 +23,8 @@ class SparsificationMethod(str, Enum): random = "random" magnitude_outliers = "magnitude_outliers" rank_magnitude_sampling = "rank_magnitude_sampling" + consensus_ta = "consensus_ta" + consensus_ties = "consensus_ties" def rescale_sum(tensor: torch.Tensor, mask: torch.Tensor): @@ -177,7 +179,10 @@ def sparsify( rescale: bool = False, epsilon: float = 0.15, ) -> torch.Tensor: - if method == SparsificationMethod.magnitude: + if ( + method == SparsificationMethod.magnitude + or method == SparsificationMethod.consensus_ties + ): return magnitude(tensor, density=density, rescale=rescale) elif method == SparsificationMethod.random: return bernoulli(tensor, density=density, rescale=rescale) @@ -187,3 +192,12 @@ def sparsify( return rank_magnitude(tensor, density=density, rescale=rescale, epsilon=epsilon) else: raise NotImplementedError(method) + + +def get_tall_mask( + delta: torch.Tensor, # individual task vectors + lambda_factor: float, # hyper-parameter lambda for generating TALL masks + mixed_delta: torch.Tensor, # multi-task vector +): + mask = delta.abs() > lambda_factor * (mixed_delta - delta).abs() + return mask From 01e60a2322b871752b3ee34f28e8917728865865 Mon Sep 17 00:00:00 2001 From: "Charles O. Goddard" Date: Sat, 30 Nov 2024 16:21:29 -0800 Subject: [PATCH 23/27] Pad embeds to multiple (#465) Add the ability to pad the output embeddings to a multiple of a user-defined factor when merging tokenizers. 
Config syntax example: ```yaml merge_method: linear models: - model: model_a - model: model_b parameters: weight: 0.5 tokenizer: source: union pad_to_multiple_of: 64 ``` --- mergekit/merge.py | 11 +++++++++-- mergekit/plan.py | 3 +++ mergekit/scripts/tokensurgeon.py | 2 +- mergekit/tokenizer/config.py | 1 + mergekit/tokenizer/embed.py | 12 +++++++++++- tests/test_tokenizer.py | 32 +++++++++++++++++++++++++++++++- 6 files changed, 56 insertions(+), 5 deletions(-) diff --git a/mergekit/merge.py b/mergekit/merge.py index 60189f44..2d659505 100644 --- a/mergekit/merge.py +++ b/mergekit/merge.py @@ -98,7 +98,10 @@ def run_merge( tokenizer = value.tokenizer if tokenizer: - _update_config_vocab(cfg_out, tokenizer) + pad_to_multiple_of = None + if merge_config.tokenizer and merge_config.tokenizer.pad_to_multiple_of: + pad_to_multiple_of = merge_config.tokenizer.pad_to_multiple_of + _update_config_vocab(cfg_out, tokenizer, pad_to_multiple_of=pad_to_multiple_of) logging.info("Saving config") cfg_out.save_pretrained(out_path) @@ -263,9 +266,13 @@ def _model_out_config( def _update_config_vocab( config: transformers.PretrainedConfig, tokenizer: transformers.PreTrainedTokenizerBase, + pad_to_multiple_of: Optional[int] = None, ): + vocab_size = len(tokenizer.get_vocab()) + if pad_to_multiple_of and vocab_size % pad_to_multiple_of: + vocab_size = vocab_size + pad_to_multiple_of - (vocab_size % pad_to_multiple_of) try: - config.vocab_size = len(tokenizer.get_vocab()) + config.vocab_size = vocab_size except Exception as e: logging.warning( "Unable to set vocabulary size in output config - you may need to manually correct it.", diff --git a/mergekit/plan.py b/mergekit/plan.py index 5b34eddc..3e407be1 100644 --- a/mergekit/plan.py +++ b/mergekit/plan.py @@ -182,12 +182,15 @@ def plan_tensor( tensor_input_task = gather_tensors if self._tokenizer_task and weight.is_embed: token_cfg = {} + pad_to_multiple = None if cfg_reader.config.tokenizer: token_cfg = cfg_reader.config.tokenizer.tokens + pad_to_multiple = cfg_reader.config.tokenizer.pad_to_multiple_of tensor_input_task = PermutedEmbeddings( gather_tensors=gather_tensors, tokenizer_task=self._tokenizer_task, tokens=token_cfg, + pad_to_multiple_of=pad_to_multiple, base_model=base_model, ) diff --git a/mergekit/scripts/tokensurgeon.py b/mergekit/scripts/tokensurgeon.py index ea6dd4bc..31d38fdf 100644 --- a/mergekit/scripts/tokensurgeon.py +++ b/mergekit/scripts/tokensurgeon.py @@ -210,7 +210,7 @@ def main( tokenizer.save_pretrained(out_path) cfg_out = arch_info.config try: - cfg_out.vocab_size = tokenizer.vocab_size + cfg_out.vocab_size = new_embed.shape[0] except AttributeError: LOG.error( "Could not set vocab size in config.json - you may need to update it manually." 
diff --git a/mergekit/tokenizer/config.py b/mergekit/tokenizer/config.py index 94208385..7bdaeca2 100644 --- a/mergekit/tokenizer/config.py +++ b/mergekit/tokenizer/config.py @@ -49,3 +49,4 @@ class TokenEmbeddingConfig(BaseModel, frozen=True): class TokenizerConfig(BaseModel, frozen=True): source: Union[ModelReference, Literal["union"], Literal["base"]] = "union" tokens: Optional[Dict[str, TokenEmbeddingConfig]] = None + pad_to_multiple_of: Optional[int] = None diff --git a/mergekit/tokenizer/embed.py b/mergekit/tokenizer/embed.py index 3cdb1840..a853d1af 100644 --- a/mergekit/tokenizer/embed.py +++ b/mergekit/tokenizer/embed.py @@ -33,6 +33,7 @@ class PermutedEmbeddings(Task[Dict[ModelReference, torch.Tensor]]): gather_tensors: GatherTensors tokenizer_task: BuildTokenizer tokens: Optional[ImmutableMap[str, TokenEmbeddingConfig]] + pad_to_multiple_of: Optional[int] base_model: Optional[ModelReference] def arguments(self) -> Dict[str, Task]: @@ -51,6 +52,10 @@ def execute( vocab = tokenizer.get_vocab() vocab_size = len(vocab) + if self.pad_to_multiple_of and vocab_size % self.pad_to_multiple_of: + vocab_size = ( + vocab_size // self.pad_to_multiple_of + 1 + ) * self.pad_to_multiple_of embed_size = tensors[models[0]].shape[1] assert all( t.shape[1] == embed_size for t in tensors.values() @@ -59,7 +64,7 @@ def execute( dtype = tensors[models[0]].dtype device = tensors[models[0]].device - token_configs = dict(**self.tokens) or {} + token_configs = dict(**(self.tokens or {})) tokens_to_average = self.assign_embedding_sources( permutations, models, vocab, token_configs ) @@ -105,6 +110,11 @@ def execute( logging.error( f"No embedding for token {repr(token)} in model {model}!" ) + + if vocab_size > len(vocab): + # as suggested by https://nlp.stanford.edu/~johnhew/vocab-expansion.html + avg_embed = torch.mean(new_embed[: len(vocab), :], dim=0) + new_embed[len(vocab) :, :] = avg_embed result[model] = new_embed return result diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index 17fafcc8..a799e8c4 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -7,7 +7,7 @@ import tokenizers import torch from common import make_picollama, run_and_check_merge -from transformers import LlamaTokenizerFast, PreTrainedTokenizerBase +from transformers import LlamaConfig, LlamaTokenizerFast, PreTrainedTokenizerBase from mergekit.config import InputModelDefinition, MergeConfiguration from mergekit.io import LazyTensorLoader @@ -270,6 +270,36 @@ def _check_embed(model_path: str): run_and_check_merge(config, validate=_check_embed) + def test_pad_to_multiple_of(self, model_chatml: str): + config = self.make_config( + [model_chatml], + base_model=model_chatml, + merge_method="linear", + tokenizer_config=TokenizerConfig( + source="base", + pad_to_multiple_of=16, + ), + ) + real_vocab_size = 64 + 2 + padded_size = (real_vocab_size // 16 + 1) * 16 + + def _check_result(model_path: str): + cfg = LlamaConfig.from_pretrained(model_path) + assert ( + cfg.vocab_size == padded_size + ), f"Expected vocab size {padded_size}, got {cfg.vocab_size}" + check_tokenizer( + expected_size=real_vocab_size, + must_contain=["<|im_start|>", "<|im_end|>"], + )(model_path) + + emb_out = ModelEmbeddings(model_path) + assert ( + emb_out.embed_tokens.shape[0] == padded_size + ), "Embedding size mismatch" + + run_and_check_merge(config, validate=_check_result) + def make_config( self, models: List[str], From 47f418cac7c3d77e7cbdd81ad7bda7dc22e7514e Mon Sep 17 00:00:00 2001 From: Ikko Eltociear Ashimine Date: Thu, 5 Dec 
2024 15:08:41 +0900 Subject: [PATCH 24/27] docs: update README.md (#466) minor fix --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index cbf93b12..a248bdb6 100644 --- a/README.md +++ b/README.md @@ -173,7 +173,7 @@ Parameters: same as [TIES](#ties) for `dare_ties`, or [Linear](#linear) for `dar ### [Model Breadcrumbs](https://arxiv.org/abs/2312.06795) -An extension of task arithmetic that discards both small and and extremely large differences from the base model. As with DARE, the Model Breadcrumbs algorithm can be used with (`breadcrumbs_ties`) or without (`breadcrumbs`) the sign consensus algorithm of TIES. +An extension of task arithmetic that discards both small and extremely large differences from the base model. As with DARE, the Model Breadcrumbs algorithm can be used with (`breadcrumbs_ties`) or without (`breadcrumbs`) the sign consensus algorithm of TIES. Parameters: same as [Linear](#linear), plus: From fd9e332c5d88f8ea28948583d0399776fd8c15f6 Mon Sep 17 00:00:00 2001 From: "Charles O. Goddard" Date: Wed, 4 Dec 2024 23:02:47 -0800 Subject: [PATCH 25/27] Update README (#467) --- README.md | 181 +++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 158 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index a248bdb6..91b0fb6d 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,34 @@ `mergekit` is a toolkit for merging pre-trained language models. `mergekit` uses an out-of-core approach to perform unreasonably elaborate merges in resource-constrained situations. Merges can be run entirely on CPU or accelerated with as little as 8 GB of VRAM. Many merging algorithms are supported, with more coming as they catch my attention. -Features: +## Contents + +- [Why Merge Models?](#why-merge-models) +- [Features](#features) +- [Installation](#installation) +- [Usage](#usage) +- [Merge Configuration](#merge-configuration) + - [Parameter Specification](#parameter-specification) + - [Tokenizer Configuration](#tokenizer-configuration) + - [Chat Template Configuration](#chat-template-configuration) + - [Examples](#examples) +- [Merge Methods](#merge-methods) + +## Why Merge Models? + +Model merging is a powerful technique that allows combining the strengths of different models without the computational overhead of ensembling or the need for additional training. By operating directly in the weight space of models, merging can: + +- Combine multiple specialized models into a single versatile model +- Transfer capabilities between models without access to training data +- Find optimal trade-offs between different model behaviors +- Improve performance while maintaining inference costs +- Create new capabilities through creative model combinations + +Unlike traditional ensembling which requires running multiple models, merged models maintain the same inference cost as a single model while often achieving comparable or superior performance. + +## Features + +Key features of `mergekit` include: - Supports Llama, Mistral, GPT-NeoX, StableLM, and more - Many [merge methods](#merge-methods) @@ -52,7 +79,7 @@ When you have a merged model you're happy with, you may want to share it on the Once you're happy with your model card and merged model, you can upload it to the Hugging Face Hub using the [huggingface_hub](https://huggingface.co/docs/huggingface_hub/index) Python library. 
-``` +```sh # log in to huggingface with an access token (must have write permission) huggingface-cli login # upload your model @@ -72,7 +99,8 @@ Below are the primary elements of a configuration file: - `base_model`: Specifies the base model used in some merging methods. - `parameters`: Holds various parameters such as weights and densities, which can also be specified at different levels of the configuration. - `dtype`: Specifies the data type used for the merging operation. -- `tokenizer_source`: Determines how to construct a tokenizer for the merged model. +- `tokenizer` or `tokenizer_source`: Determines how to construct a tokenizer for the merged model. +- `chat_template`: Specifies a chat template for the merged model. ### Parameter Specification @@ -90,23 +118,112 @@ The parameters can be set at different levels, with decreasing precedence as fol 3. `models.*.parameters` or `input_model_parameters` - applying to any tensors coming from specific input models 4. `parameters` - catchall -### Tokenizer Source +### Tokenizer Configuration + +The tokenizer behavior can be configured in two ways: using the new `tokenizer` field (recommended) or the legacy `tokenizer_source` field (maintained for backward compatibility). These fields are mutually exclusive - you should use one or the other, not both. + +#### Modern Configuration (tokenizer) + +The `tokenizer` field provides fine-grained control over vocabulary and embeddings: + +```yaml +tokenizer: + source: "union" # or "base" or a specific model path + tokens: # Optional: configure specific tokens + : + source: ... # Specify embedding source + force: false # Optional: force this embedding for all models + pad_to_multiple_of: null # Optional: pad vocabulary size +``` + +##### Tokenizer Source + +The `source` field determines the vocabulary of the output model: + +- `union`: Combine vocabularies from all input models (default) +- `base`: Use vocabulary from the base model +- `"path/to/model"`: Use vocabulary from a specific model + +##### Token Embedding Handling -The `tokenizer_source` field of a configuration file determines what tokenizer is used by the merged model. This also effects how embeddings and language model heads are merged. +When merging models with different vocabularies, mergekit uses smart defaults to handle token embeddings: -This functionality is still experimental and may break. Please file an issue if you encounter any issues with it. +- If a token exists in the base model, its embedding is used as the default +- If only one model has the token, that model's embedding is used +- Otherwise, an average of all available embeddings is used -Valid values: +You can override these defaults for specific tokens: -- `base`: use the tokenizer from the base model -- `union`: construct a tokenizer with all tokens from all models -- `model:`: use the tokenizer from a specific model +```yaml +tokenizer: + source: union + tokens: + # Use embedding from a specific model + <|im_start|>: + source: "path/to/chatml/model" -If set, mergekit will find a mapping between each model's vocabulary and the output tokenizer. This allows models with different vocabularies or added tokens to be meaningfully merged. + # Force a specific embedding for all models + <|special|>: + source: "path/to/model" + force: true -`tokenizer_source` is compatible with all merge methods, but when used `lm_head`/`embed_tokens` will be merged linearly. For two-model merges, the `embed_slerp` parameter can be set to `true` to use SLERP instead. 
+ # Map a token to another model's token embedding + <|renamed_token|>: + source: + kind: "model_token" + model: "path/to/model" + token: "<|original_token|>" # or use token_id: 1234 +``` + +##### Practical Example + +Here's how you might preserve both Llama 3 Instruct and ChatML prompt formats when merging models: + +```yaml +tokenizer: + source: union + tokens: + # ChatML tokens + <|im_start|>: + source: "chatml_model" + <|im_end|>: + source: "chatml_model" + + # Llama 3 tokens - force original embeddings + <|start_header_id|>: + source: "llama3_model" + force: true + <|end_header_id|>: + source: "llama3_model" + force: true + <|eot_id|>: + source: "llama3_model" + force: true +``` + +#### Legacy Configuration (tokenizer_source) + +For backward compatibility, the `tokenizer_source` field is still supported: + +```yaml +tokenizer_source: "union" # or "base" or a model path +``` + +This provides basic tokenizer selection but lacks the fine-grained control of the modern `tokenizer` field. + +### Chat Template Configuration -If the `tokenizer_source` field is not set, mergekit will fall back to its legacy default behavior. The tokenizer for the base model (or first model in the merge, if no base model is specified) will be copied to the output directory. The parameter matrices for `lm_head`/`embed_tokens` will be truncated to the smallest size present in the merge. In _most_ cases this corresponds to using the tokenizer for the base model. +The optional `chat_template` field allows overriding the chat template used for the merged model. + +```yaml +chat_template: "auto" # or a template name or Jinja2 template +``` + +Options include: + +- `"auto"`: Automatically select the most common template among input models +- Built-in templates: `"alpaca"`, `"chatml"`, `"llama3"`, `"mistral"`, `"exaone"` +- A Jinja2 template string for custom formatting ### Examples @@ -130,6 +247,7 @@ A quick overview of the currently supported merge methods: | [Model Stock](https://arxiv.org/abs/2403.19522) | `model_stock` | ✅ | ✅ | | [DELLA](https://arxiv.org/abs/2406.11617) | `della` | ✅ | ✅ | | [DELLA](https://arxiv.org/abs/2406.11617) [Task Arithmetic](https://arxiv.org/abs/2212.04089) | `della_linear` | ✅ | ✅ | + ### Linear The classic merge method - a simple weighted average. @@ -195,6 +313,7 @@ Parameters: Building upon DARE, DELLA uses adaptive pruning based on parameter magnitudes. DELLA first ranks parameters in each row of delta parameters and assigns drop probabilities inversely proportional to their magnitudes. This allows it to retain more important changes while reducing interference. After pruning, it rescales the remaining parameters similar to [DARE](#dare). DELLA can be used with (`della`) or without (`della_linear`) the sign elect step of TIES Parameters: same as [Linear](#linear), plus: + - `density` - fraction of weights in differences from the base model to retain - `epsilon` - maximum change in drop probability based on magnitude. Drop probabilities assigned will range from `density - epsilon` to `density + epsilon`. (When selecting values for `density` and `epsilon`, ensure that the range of probabilities falls within 0 to 1) - `lambda` - scaling factor for the final merged delta parameters before merging with the base parameters. @@ -215,7 +334,7 @@ The `mergekit-moe` script supports merging multiple dense models into a mixture ## Evolutionary merge methods -See `docs/evolve.md` for details. +See [`docs/evolve.md`](docs/evolve.md) for details. 
## ✨ Merge in the Cloud ✨ @@ -224,7 +343,7 @@ We host merging on Arcee's cloud GPUs - you can launch a cloud merge in the [Arc `export ARCEE_API_KEY=` `pip install -q arcee-py` -``` +```python import arcee arcee.merge_yaml("bio-merge","./examples/bio-merge.yml") ``` @@ -233,7 +352,7 @@ Check your merge status at the [Arcee App](https://app.arcee.ai) When complete, either deploy your merge: -``` +```python arcee.start_deployment("bio-merge", merging="bio-merge") ``` @@ -241,16 +360,32 @@ Or download your merge: `!arcee merging download bio-merge` - ## Citation -We now have a [paper](https://arxiv.org/abs/2403.13257) you can cite for the MergeKit library: +If you find `mergekit` useful in your research, please consider citing the [paper](https://aclanthology.org/2024.emnlp-industry.36/): ```bibtex -@article{goddard2024arcee, - title={Arcee's MergeKit: A Toolkit for Merging Large Language Models}, - author={Goddard, Charles and Siriwardhana, Shamane and Ehghaghi, Malikeh and Meyers, Luke and Karpukhin, Vlad and Benedict, Brian and McQuade, Mark and Solawetz, Jacob}, - journal={arXiv preprint arXiv:2403.13257}, - year={2024} +@inproceedings{goddard-etal-2024-arcees, + title = "Arcee{'}s {M}erge{K}it: A Toolkit for Merging Large Language Models", + author = "Goddard, Charles and + Siriwardhana, Shamane and + Ehghaghi, Malikeh and + Meyers, Luke and + Karpukhin, Vladimir and + Benedict, Brian and + McQuade, Mark and + Solawetz, Jacob", + editor = "Dernoncourt, Franck and + Preo{\c{t}}iuc-Pietro, Daniel and + Shimorina, Anastasia", + booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track", + month = nov, + year = "2024", + address = "Miami, Florida, US", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2024.emnlp-industry.36", + doi = "10.18653/v1/2024.emnlp-industry.36", + pages = "477--485", + abstract = "The rapid growth of open-source language models provides the opportunity to merge model checkpoints, combining their parameters to improve performance and versatility. Advances in transfer learning have led to numerous task-specific models, which model merging can integrate into powerful multitask models without additional training. MergeKit is an open-source library designed to support this process with an efficient and extensible framework suitable for any hardware. It has facilitated the merging of thousands of models, contributing to some of the world{'}s most powerful open-source model checkpoints. The library is accessible at: https://github.com/arcee-ai/mergekit.", } ``` From 2980fba8839390f00d4dac48000bf8e2f1b36edd Mon Sep 17 00:00:00 2001 From: Charles Goddard Date: Wed, 4 Dec 2024 23:05:17 -0800 Subject: [PATCH 26/27] Fix --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index 91b0fb6d..225bd838 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,11 @@ - [Chat Template Configuration](#chat-template-configuration) - [Examples](#examples) - [Merge Methods](#merge-methods) +- [LoRA extraction](#lora-extraction) +- [Mixture of Experts merging](#mixture-of-experts-merging) +- [Evolutionary merge methods](#evolutionary-merge-methods) +- [Merge in the Cloud](#-merge-in-the-cloud-) +- [Citation](#citation) ## Why Merge Models? 
From 00f8bf4375336226aa42a1e2d1d2a12a9a883c79 Mon Sep 17 00:00:00 2001 From: T145 Date: Sat, 7 Dec 2024 18:14:42 -0500 Subject: [PATCH 27/27] Spellcheck on the evolve doc (#468) --- docs/evolve.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/evolve.md b/docs/evolve.md index 2ac164a9..930fc279 100644 --- a/docs/evolve.md +++ b/docs/evolve.md @@ -121,7 +121,7 @@ Assigns an actor to each GPU in your cluster and guarantees merges and evaluatio #### `buffered` -Maintains a buffer of tasks scheduled to ensure that there is always a model mergign or ready to evaluate for each gpu. Allows for concurrent merging and evaluation of models on the same GPU if enough VRAM is available. Only suitable for a single-node setup or when `--storage-path` points to a fast shared filesystem. +Maintains a buffer of tasks scheduled to ensure that there is always a model merging or ready to evaluate for each GPU. Allows for concurrent merging and evaluation of models on the same GPU if enough VRAM is available. Only suitable for a single-node setup or when `--storage-path` points to a fast shared filesystem. #### `serial`