Commit
Merge branch 'main' into graph-rework
cg123 committed Jan 3, 2024
2 parents 9e83c33 + e8b982b commit 1a861aa
Showing 35 changed files with 707 additions and 388 deletions.
34 changes: 28 additions & 6 deletions .github/workflows/pre-commit.yml
@@ -8,9 +8,31 @@ jobs:
  pre-commit:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - uses: actions/setup-python@v4
        with:
          python-version: "3.10"
          cache: 'pip'
      - uses: pre-commit/[email protected]
      - uses: actions/checkout@v3
      - uses: actions/setup-python@v4
        with:
          python-version: "3.11"
          cache: "pip"
      - uses: pre-commit/[email protected]

  pytest:
    name: PyTest
    needs: [pre-commit]
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        python_version: ["3.9", "3.10", "3.11"]
    timeout-minutes: 5

    steps:
      - uses: actions/checkout@v3
      - name: Setup Python
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python_version }}
          cache: "pip"
      - name: Install dependencies
        run: pip3 install -U -e .[test]
      - name: Run tests
        run: pytest .
184 changes: 90 additions & 94 deletions README.md
@@ -1,27 +1,55 @@
# mergekit

`mergekit` is a toolkit for merging pre-trained language models, using a variety of merge methods including TIES, linear, and slerp merging. The toolkit also enables piecewise assembly of a language model from layers.
`mergekit` is a toolkit for merging pre-trained language models. `mergekit` uses an out-of-core approach to perform unreasonably elaborate merges in resource-constrained situations. Merges can be run entirely on CPU or accelerated with as little as 8 GB of VRAM. Many merging algorithms are supported, with more coming as they catch my attention.

Features:

- Supports Llama, Mistral, GPT-NeoX, StableLM, and more
- Many [merge methods](#merge-methods)
- GPU or CPU execution
- Lazy loading of tensors for low memory use
- Interpolated gradients for parameter values (inspired by Gryphe's [BlockMerge_Gradient](https://github.com/Gryphe/BlockMerge_Gradient) script)
- Piecewise assembly of language models from layers ("Frankenmerging")

## Installation

```sh
git clone https://github.com/cg123/mergekit.git
cd mergekit

pip install -e . # install the package and make scripts available
```

Run `pip install -e .` to install the package and make the scripts available.
If the above fails with an error like:

```
ERROR: File "setup.py" or "setup.cfg" not found. Directory cannot be installed in editable mode:
(A "pyproject.toml" file was found, but editable mode currently requires a setuptools-based build.)
```

You may need to upgrade pip to a version newer than 21.3 with the command `python3 -m pip install --upgrade pip`.

The script `mergekit-yaml` takes a YAML configuration file defining the operations to perform.
## Usage

The script `mergekit-yaml` is the main entry point for `mergekit`. It takes a YAML configuration file and an output path, like so:

```sh
mergekit-yaml path/to/your/config.yml ./output-model-directory [--cuda] [--lazy-unpickle] [--allow-crimes] [... other options]
```

## Configuration
For more information on the arguments accepted by `mergekit-yaml`, run `mergekit-yaml --help`.

## Merge Configuration

Merge configurations are YAML documents specifying the operations to perform in order to produce your merged model.
Below are the primary elements of a configuration file:

- `merge_method`: Specifies the method to use for merging models. Can be one of 'ties', 'linear', 'slerp', or 'passthrough'.
- `merge_method`: Specifies the method to use for merging models. See [Merge Methods](#merge-methods) for a list.
- `slices`: Defines slices of layers from different models to be used. This field is mutually exclusive with `models`.
- `models`: Defines entire models to be used for merging. This field is mutually exclusive with `slices`.
- `base_model`: Specifies the base model used in some merging methods.
- `parameters`: Holds various parameters such as weights and densities, which can also be specified at different levels of the configuration.
- `dtype`: Specifies the data type for the merging operation.
- `dtype`: Specifies the data type used for the merging operation.
- `tokenizer_source`: Determines how to construct a tokenizer for the merged model.
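
As a rough sketch, a configuration touching most of these fields might look like the following. Model names are reused from the examples elsewhere in this README, and every value is purely illustrative:

```yml
# illustrative sketch only: values are placeholders, not a recommended recipe
merge_method: slerp                    # any value from the merge methods table below
base_model: psmathur/orca_mini_v3_13b  # required by some methods
models:                                # or `slices:` for layer-level control
  - model: psmathur/orca_mini_v3_13b
  - model: garage-bAInd/Platypus2-13B
parameters:
  t: 0.5                               # method-specific parameters
dtype: float16
tokenizer_source: base
```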

### Parameter Specification
@@ -37,114 +65,82 @@ The parameters can be set at different levels, with decreasing precedence as fol

1. `slices.*.sources.parameters` - applying to a specific input slice
2. `slices.*.parameters` - applying to a specific output slice
3. `input_model_parameters` - applying to any tensors coming from specific input models
3. `models.*.parameters` or `input_model_parameters` - applying to any tensors coming from specific input models
4. `parameters` - catchall
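
For example, a hedged sketch of how the precedence levels interact, using arbitrary weights and model names reused from this README:

```yml
# hypothetical example: the model-level `weight` takes precedence over the catchall
models:
  - model: psmathur/orca_mini_v3_13b
    parameters:
      weight: 0.8   # level 3: applies only to tensors from this model
  - model: garage-bAInd/Platypus2-13B
merge_method: linear
parameters:
  weight: 0.5       # level 4: catchall, used where nothing more specific is set
dtype: float16
```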

### Tokenizer Source

### Merge Methods
The `tokenizer_source` field of a configuration file determines what tokenizer is used by the merged model. This also affects how embeddings and language model heads are merged.

#### **[Resolving Interference When Merging Models](https://arxiv.org/abs/2306.01708)** (`"ties"`)
Requires a base model.
Parameters:
- `density` - fraction of weights in differences from the base model to retain
- `weight` - relative (or absolute if `normalize=False`) weighting of a given tensor
- `normalize` - if true, the weights of all models contributing to a tensor will be normalized. Default behavior.
This functionality is still experimental and may break. Please file an issue if you run into any problems with it.

Valid values:

#### Linear
Does not require a base model. Takes parameters `weight` and `normalize`, with same definition as above.
- `base`: use the tokenizer from the base model
- `union`: construct a tokenizer with all tokens from all models
- `model:<model_path>`: use the tokenizer from a specific model

If set, mergekit will find a mapping between each model's vocabulary and the output tokenizer. This allows models with different vocabularies or added tokens to be meaningfully merged.

#### SLERP
Requires exactly two models, one of which must be the base model. Takes one parameter - `t` - the interpolation factor from the base model to the secondary model.
`tokenizer_source` is compatible with all merge methods, but when it is used, `lm_head`/`embed_tokens` will be merged linearly. For two-model merges, the `embed_slerp` parameter can be set to `true` to use SLERP instead.

If the `tokenizer_source` field is not set, mergekit will fall back to its legacy default behavior. The tokenizer for the base model (or first model in the merge, if no base model is specified) will be copied to the output directory. The parameter matrices for `lm_head`/`embed_tokens` will be truncated to the smallest size present in the merge. In _most_ cases this corresponds to using the tokenizer for the base model.
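
A hedged sketch of where the field sits in a configuration (models and weights are arbitrary, reused from other examples in this README):

```yml
# illustrative only: build a union vocabulary for a two-model linear merge
models:
  - model: psmathur/orca_mini_v3_13b
    parameters:
      weight: 0.5
  - model: WizardLM/WizardLM-13B-V1.2
    parameters:
      weight: 0.5
merge_method: linear
tokenizer_source: union  # or `base`, or `model:<model_path>`
dtype: float16
```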

### Tokenizer Source
### Examples

The `tokenizer_source` field of a configuration file determines what tokenizer is used by the merged model. This also affects how embeddings and language model heads are merged.
Several examples of merge configurations are available in [`examples/`](examples/).

Valid values:
## Merge Methods

* `base`: use the tokenizer from the base model
* `union`: construct a tokenizer with all tokens from all models
* `model:<model_path>`: use the tokenizer from a specific model
A quick overview of the currently supported merge methods:

If set, mergekit will find a mapping between each model's vocabulary and the output tokenizer. This allows models with different vocabularies or added tokens to be meaningfully merged.
| Method                                                                                        | `merge_method` value | Multi-Model | Uses base model |
| --------------------------------------------------------------------------------------------- | -------------------- | ----------- | --------------- |
| Linear ([Model Soups](https://arxiv.org/abs/2203.05482))                                      | `linear`             | ✅          | ❌              |
| SLERP                                                                                          | `slerp`              | ❌          | ✅              |
| [Task Arithmetic](https://arxiv.org/abs/2212.04089)                                           | `task_arithmetic`    | ✅          | ✅              |
| [TIES](https://arxiv.org/abs/2306.01708)                                                       | `ties`               | ✅          | ✅              |
| [DARE](https://arxiv.org/abs/2311.03099) [TIES](https://arxiv.org/abs/2306.01708)             | `dare_ties`          | ✅          | ✅              |
| [DARE](https://arxiv.org/abs/2311.03099) [Task Arithmetic](https://arxiv.org/abs/2212.04089)  | `dare_linear`        | ✅          | ✅              |
| Passthrough                                                                                    | `passthrough`        | ❌          | ❌              |

`tokenizer_source` is compatible with all merge methods, but when it is used, `lm_head`/`embed_tokens` will be merged linearly. For two-model merges, the `embed_slerp` parameter can be set to `true` to use SLERP instead.
### Linear

If the `tokenizer_source` field is not set, mergekit will fall back to its legacy default behavior. The tokenizer for the base model (or first model in the merge, if no base model is specified) will be copied to the output directory. The parameter matrices for `lm_head`/`embed_tokens` will be truncated to the smallest size present in the merge. In *most* cases this corresponds to using the tokenizer for the base model.
The classic merge method - a simple weighted average.

### Examples
Parameters:

- Simple linear merge of multiple models:

  ```yml
  models:
    - model: psmathur/orca_mini_v3_13b
      parameters:
        weight: 1.0
    - model: WizardLM/WizardLM-13B-V1.2
      parameters:
        weight: 0.3
    - model: garage-bAInd/Platypus2-13B
      parameters:
        weight: 0.5
  merge_method: linear
  dtype: float16
  ```

- `bakllama.py` style layer recombination:

  ```yml
  slices:
    - sources:
        - model: psmathur/orca_mini_v3_13b
          layer_range: [0, 24]
    - sources:
        - model: garage-bAInd/Platypus2-13B
          layer_range: [20, 40]
  merge_method: passthrough
  dtype: float16
  ```

- Gradient SLERP with different weights for mlp/self attention:

  ```yml
  slices:
    - sources:
        - model: psmathur/orca_mini_v3_13b
          layer_range: [0, 40]
        - model: garage-bAInd/Platypus2-13B
          layer_range: [0, 40]
  merge_method: slerp
  base_model: psmathur/orca_mini_v3_13b
  parameters:
    t:
      - filter: self_attn
        value: [0, 0.5, 0.3, 0.7, 1]
      - filter: mlp
        value: [1, 0.5, 0.7, 0.3, 0]
      - value: 0.5 # fallback for rest of tensors
  dtype: float16
  ```

#### Usage

Once you have created the YAML configuration file, run `mergekit-yaml` with the config file and output path as arguments:
- `weight` - relative (or absolute if `normalize=False`) weighting of a given tensor
- `normalize` - if true, the weights of all models contributing to a tensor will be normalized. This is the default behavior.

```sh
mergekit-yaml path/to/your/config.yml ./output-model-directory [--cuda]
```
### SLERP

## Legacy Wrappers
Spherically interpolate the parameters of two models. One must be set as `base_model`.

Mergekit originally featured two separate scripts with different inputs. The functionality of these is maintained in the `mergekit-legacy` and `bakllama` wrappers. Example usage:
Parameters:

```sh
mergekit-legacy ./output-model --base-model TheBloke/Llama-2-13B-fp16 --cuda \
--merge WizardLM/WizardLM-13B-V1.2 --weight 0.3 --density 0.5 \
--merge garage-bAInd/Platypus2-13B --weight 0.5 --density 0.5
```
- `t` - interpolation factor. At `t=0` the result is `base_model`; at `t=1` it is the other model.
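
A minimal, illustrative SLERP configuration, reusing model names from the examples above with an arbitrary `t`:

```yml
# illustrative only: interpolate halfway between two fine-tunes
slices:
  - sources:
      - model: psmathur/orca_mini_v3_13b
        layer_range: [0, 40]
      - model: garage-bAInd/Platypus2-13B
        layer_range: [0, 40]
merge_method: slerp
base_model: psmathur/orca_mini_v3_13b
parameters:
  t: 0.5
dtype: float16
```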

### [Task Arithmetic](https://arxiv.org/abs/2212.04089)

Computes "task vectors" for each model by subtracting a base model. Merges the task vectors linearly and adds back the base. Works great for models that were fine tuned from a common ancestor. Also a super useful mental framework for several of the more involved merge methods.

Parameters: same as [Linear](#linear)
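
A hedged configuration sketch, assuming both fine-tunes descend from TheBloke/Llama-2-13B-fp16 as in the legacy-wrapper example above (weights are arbitrary):

```yml
# illustrative only: combine two task vectors on top of a common base
models:
  - model: WizardLM/WizardLM-13B-V1.2
    parameters:
      weight: 0.5
  - model: garage-bAInd/Platypus2-13B
    parameters:
      weight: 0.5
merge_method: task_arithmetic
base_model: TheBloke/Llama-2-13B-fp16
dtype: float16
```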

### [TIES](https://arxiv.org/abs/2306.01708)

Builds on the task arithmetic framework. Resolves interference between models by sparsifying the task vectors and applying a sign consensus algorithm. Allows you to merge a larger number of models and retain more of their strengths.

Parameters: same as [Linear](#linear), plus:

- `density` - fraction of weights in differences from the base model to retain
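
A hedged sketch in the same spirit as the `examples/ties.yml` file shown later in this diff; weights and densities are arbitrary:

```yml
# illustrative only: sparsify task vectors, then merge with sign consensus
models:
  - model: TheBloke/Llama-2-13B-fp16
    # no parameters necessary for base model
  - model: WizardLM/WizardLM-13B-V1.2
    parameters:
      weight: 0.3
      density: 0.5
  - model: garage-bAInd/Platypus2-13B
    parameters:
      weight: 0.5
      density: 0.5
merge_method: ties
base_model: TheBloke/Llama-2-13B-fp16
parameters:
  normalize: true
dtype: float16
```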

### [DARE](https://arxiv.org/abs/2311.03099)

In the same vein as TIES, sparsifies task vectors to reduce interference. Differs in that DARE uses random pruning with a novel rescaling to better match performance of the original models. DARE can be used either with the sign consensus algorithm of TIES (`dare_ties`) or without (`dare_linear`).

Parameters: same as [TIES](#ties) for `dare_ties`, or [Linear](#linear) for `dare_linear`
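
A hedged sketch with arbitrary values; swapping `dare_linear` in for `dare_ties` would drop the sign-consensus step:

```yml
# illustrative only: random pruning plus rescaling, with TIES-style sign consensus
models:
  - model: WizardLM/WizardLM-13B-V1.2
    parameters:
      weight: 0.5
      density: 0.5
  - model: garage-bAInd/Platypus2-13B
    parameters:
      weight: 0.5
      density: 0.5
merge_method: dare_ties
base_model: TheBloke/Llama-2-13B-fp16
dtype: float16
```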

### Passthrough

`mergekit-legacy` can output a YAML configuration for easy migration with the `--print-yaml` option.
`passthrough` is a no-op that simply passes input tensors through unmodified. It is meant to be used for layer-stacking type merges where you have only one input model. Useful for frankenmerging.
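
A hedged sketch of a single-model layer-stacking merge; the layer ranges are arbitrary:

```yml
# illustrative only: repeat a span of layers from a single model
slices:
  - sources:
      - model: psmathur/orca_mini_v3_13b
        layer_range: [0, 30]
  - sources:
      - model: psmathur/orca_mini_v3_13b
        layer_range: [10, 40]
merge_method: passthrough
dtype: float16
```
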
6 changes: 0 additions & 6 deletions examples/gpt2-small.yml

This file was deleted.

4 changes: 4 additions & 0 deletions examples/gradient-slerp.yml
@@ -4,6 +4,10 @@ slices:
        layer_range: [0, 40]
      - model: garage-bAInd/Platypus2-13B
        layer_range: [0, 40]
# or, the equivalent models: syntax:
# models:
#   - model: psmathur/orca_mini_v3_13b
#   - model: garage-bAInd/Platypus2-13B
merge_method: slerp
base_model: psmathur/orca_mini_v3_13b
parameters:
12 changes: 12 additions & 0 deletions examples/linear.yml
@@ -0,0 +1,12 @@
models:
  - model: psmathur/orca_mini_v3_13b
    parameters:
      weight: 1.0
  - model: WizardLM/WizardLM-13B-V1.2
    parameters:
      weight: 0.3
  - model: garage-bAInd/Platypus2-13B
    parameters:
      weight: 0.5
merge_method: linear
dtype: float16
2 changes: 0 additions & 2 deletions examples/ties.yml
@@ -1,6 +1,4 @@
models:
  - model: TheBloke/Llama-2-13B-fp16
    # no parameters necessary for base model
  - model: psmathur/orca_mini_v3_13b
    parameters:
      density: [1, 0.7, 0.1] # density gradient
34 changes: 33 additions & 1 deletion mergekit/architecture.py
@@ -1,4 +1,4 @@
# Copyright (C) 2023 Charles O. Goddard
# Copyright (C) 2024 Charles O. Goddard
#
# This software is free software: you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public License as
@@ -239,6 +239,11 @@ class PhiTensorNames(ArchitectureInfo):
    def __init__(self, config: PretrainedConfig):
        self.config = config

    def __eq__(self, rhs: "PhiTensorNames"):
        if not isinstance(rhs, PhiTensorNames):
            return False
        return self.num_layers() == rhs.num_layers()

    def pre_weights(self) -> List[str]:
        return ["layers.0.wte.weight"]

@@ -282,6 +287,32 @@ def num_layers_config_key(self) -> str:
return "n_layer"


PHI2_INFO = StaticTensorNames(
    name="PhiForCausalLM",
    pre_weight_names=["transformer.embd.wte.weight"],
    post_weight_names=[
        "lm_head.linear.bias",
        "lm_head.linear.weight",
        "lm_head.ln.bias",
        "lm_head.ln.weight",
    ],
    embed_weight_names=["lm_head.linear.weight", "transformer.embd.wte.weight"],
    layer_prefix_format="transformer.h.{idx}",
    layer_weight_suffixes=[
        "ln.bias",
        "ln.weight",
        "mixer.out_proj.bias",
        "mixer.out_proj.weight",
        "mixer.Wqkv.bias",
        "mixer.Wqkv.weight",
        "mlp.fc1.bias",
        "mlp.fc1.weight",
        "mlp.fc2.bias",
        "mlp.fc2.weight",
    ],
)


def get_architecture_info(config: PretrainedConfig) -> StaticTensorNames:
    if len(config.architectures) != 1:
        raise RuntimeError("More than one architecture in config?")
@@ -299,6 +330,7 @@ def get_architecture_info(config: PretrainedConfig) -> StaticTensorNames:
        GPT2_SEQCLASS_INFO,
        CHATGLM_INFO,
        STABLELM_INFO,
        PHI2_INFO,
    ]
    for arch in supported:
        if arch.name == arch_name: