-
Notifications
You must be signed in to change notification settings - Fork 458
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Weights Metrics #340
base: main
Are you sure you want to change the base?
Weights Metrics #340
Changes from 13 commits
c2b0f06
64e54bf
7e37d69
2effd41
ed59d9b
564d45c
e530568
207e874
b30175e
dfc0603
f88f904
f89df57
74c5d33
7c209d2
db65f83
3fe66a8
62692d1
2a0c520
f71e360
7e14266
0f3430f
8d68e39
59e23fe
89ecbf5
404e395
8e3c861
a0e8c27
7e2b552
69c3b15
f36fd70
50a5716
73dd3ae
ddbb475
36d28b0
fa6d098
4949458
bd62ed9
9c87514
4ce67b0
88ed18e
1041298
01d5a2b
f64a71e
8b05c2a
b8f7e32
c4df572
5e051a8
bb8fce2
9c92efd
e1c1ecd
9db9d3b
af2bf1a
6045980
cb5e31a
dfdcb00
7bc8001
a5c5811
3d3fd33
2dfa4f9
60726d4
891bbb4
04b64e8
1b5a3c0
6bd12e7
5ac4c78
68a6a30
20dd0f6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
models:
  - model: BEE-spoke-data/smol_llama-220M-GQA
  - model: BEE-spoke-data/smol_llama-220M-openhermes
metric_method: all
dtype: float32
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -37,6 +37,7 @@ class Task(ABC, BaseModel, Generic[ValueT], frozen=True): | |
Abstract base class representing a task in a computational graph. | ||
|
||
This class should be extended to define specific tasks. Each task can have arguments (dependencies) and a defined execution strategy. | ||
Note that Pydantic BaseModel requires that all attributes are defined in the class initialisation, and cannot be changed after. |
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Super nitpick here: I think the official capitalization is Pydantic, not PyDantic. |
||
|
||
Attributes: | ||
Generic[ValueT] (TypeVar): The type of the value that the task returns upon execution. | ||
|
@@ -106,7 +107,6 @@ def uses_accelerator(self) -> bool: | |
""" | ||
return False | ||
|
||
|
||
class Executor: | ||
""" | ||
Schedules and executes a set of tasks and their dependencies. | ||
|
@@ -241,13 +241,20 @@ def _make_schedule(self, targets: List[Task]) -> List[Task]: | |
# they will be included in the final schedule | ||
edge_tups.append((Executor.DUMMY_TASK_VALUE, task)) | ||
|
||
def _pad_numbers(s): | ||
parts = s.split('.') | ||
for i, part in enumerate(parts): | ||
if part.isdigit(): | ||
parts[i] = part.zfill(3) | ||
return '.'.join(parts) | ||
|
||
def _compare_key(task: Union[Task, str]): | ||
if task == Executor.DUMMY_TASK_VALUE: | ||
return ("", 0) | ||
return ( | ||
task.group_label() or "", | ||
-task.priority(), | ||
) | ||
group_label = task.group_label() or "" | ||
padded_label = _pad_numbers(group_label) | ||
priority = -task.priority() | ||
return (padded_label, priority) | ||
|
||
graph = networkx.DiGraph(edge_tups) | ||
res = [ | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,92 @@ | ||
# Copyright (C) 2024 Charles O. Goddard | ||
# | ||
# This software is free software: you can redistribute it and/or | ||
# modify it under the terms of the GNU Lesser General Public License as | ||
# published by the Free Software Foundation, either version 3 of the | ||
# License, or (at your option) any later version. | ||
# | ||
# This software is distributed in the hope that it will be useful, but | ||
# WITHOUT ANY WARRANTY; without even the implied warranty of | ||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
# Lesser General Public License for more details. | ||
# | ||
# You should have received a copy of the GNU Lesser General Public License | ||
# along with this program. If not, see http://www.gnu.org/licenses/. | ||
|
||
import logging | ||
|
||
import tqdm | ||
import transformers | ||
|
||
from mergekit.architecture import get_architecture_info | ||
from mergekit.config import MergeConfiguration | ||
from mergekit.graph import Executor | ||
from mergekit.io.tasks import LoaderCache | ||
from mergekit.options import MergeOptions | ||
from mergekit.plan import MergePlanner | ||
from mergekit.merge import _model_out_config | ||
|
||
|
||
def run_measure( | ||
merge_config: MergeConfiguration, | ||
out_path: str, | ||
options: MergeOptions, | ||
): | ||
if options.random_seed is not None: | ||
transformers.trainer_utils.set_seed(options.random_seed) | ||
|
||
if not merge_config.models and not merge_config.slices: | ||
raise RuntimeError("No output requested") | ||
|
||
model_arch_info = [ | ||
get_architecture_info(m.config(trust_remote_code=options.trust_remote_code)) | ||
for m in merge_config.referenced_models() | ||
] | ||
if not options.allow_crimes: | ||
if not all(a == model_arch_info[0] for a in model_arch_info[1:]): | ||
raise RuntimeError( | ||
"Must specify --allow-crimes to attempt to mix different architectures" | ||
) | ||
arch_info = model_arch_info[0] | ||
|
||
# initialize loader cache and set options | ||
loader_cache = LoaderCache() | ||
loader_cache.setup(options=options) | ||
|
||
# create config for output model | ||
cfg_out = _model_out_config( | ||
merge_config, arch_info, trust_remote_code=options.trust_remote_code | ||
) | ||
|
||
# warm up loader cache | ||
for model in ( | ||
pbar := tqdm.tqdm( | ||
merge_config.referenced_models(), | ||
desc="Warmup loader cache", | ||
disable=options.quiet, | ||
) | ||
): | ||
loader_cache.get(model) | ||
del pbar | ||
|
||
logging.info("Planning operations") | ||
targets = MergePlanner( | ||
merge_config, | ||
arch_info, | ||
options=options, | ||
out_model_config=cfg_out, | ||
).plan_to_disk(out_path=out_path) | ||
|
||
exec = Executor( | ||
tasks=targets, | ||
math_device="cuda" if options.cuda else "cpu", | ||
storage_device="cuda" if options.low_cpu_memory else "cpu", | ||
) | ||
|
||
res = [] | ||
for _task, value in exec.run(quiet=options.quiet): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Looking this over, I kinda think we might not need a separate file here - maybe it should just early out in |
||
res.append((_task, value)) | ||
|
||
return res | ||
|
||
__all__ = ["MergeOptions", "run_merge"] |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
# Copyright (C) 2024 Charles O. Goddard | ||
# | ||
# This software is free software: you can redistribute it and/or | ||
# modify it under the terms of the GNU Lesser General Public License as | ||
# published by the Free Software Foundation, either version 3 of the | ||
# License, or (at your option) any later version. | ||
# | ||
# This software is distributed in the hope that it will be useful, but | ||
# WITHOUT ANY WARRANTY; without even the implied warranty of | ||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
# Lesser General Public License for more details. | ||
# | ||
# You should have received a copy of the GNU Lesser General Public License | ||
# along with this program. If not, see http://www.gnu.org/licenses/. | ||
|
||
from mergekit.metric_methods.base import MetricMethod | ||
from mergekit.metric_methods.all_metrics import AllMetric | ||
|
||
|
||
def get(method: str) -> MetricMethod: | ||
if method == "all": | ||
return AllMetric() | ||
raise RuntimeError(f"Unimplemented metric method {method}") | ||
|
||
|
||
__all__ = [ | ||
"MetricMethod", | ||
"get", | ||
] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should be
gqa_groups