diff --git a/doc/examples/example_cli_famos.ipynb b/doc/examples/example_cli_famos.ipynb index 5956c66..a1d32a1 100644 --- a/doc/examples/example_cli_famos.ipynb +++ b/doc/examples/example_cli_famos.ipynb @@ -33,6 +33,7 @@ "\n", "from example_cli_famos_helpers import (\n", " parse_summary_to_progress_list,\n", + " petab_select_problem_yaml, # noqa: F401\n", ")\n", "\n", "output_path = Path().resolve() / \"output_famos\"\n", @@ -141,8 +142,8 @@ "name": "stderr", "output_type": "stream", "text": [ - "/home/dilan/Documents/future_annex/model_selection/packages/petab_select/petab_select/candidate_space.py:376: RuntimeWarning: Model `model_subspace_1-0001011010010010` has been previously excluded from the candidate space so is skipped here.\n", - " warnings.warn(\n" + "petab_select/candidate_space.py:1137: RuntimeWarning: Model `model_subspace_1-0001011010010010` has been previously excluded from the candidate space so is skipped here.\n", + " return_value = self.inner_candidate_space.consider(model)\n" ] }, { @@ -173,7 +174,6 @@ ], "source": [ "%%bash -s \"$petab_select_problem_yaml\" \"$output_path_str\"\n", - "\n", "petab_select_problem_yaml=$1\n", "output_path_str=$2\n", "\n", diff --git a/doc/examples/example_cli_famos_calibration_tool.py b/doc/examples/example_cli_famos_calibration_tool.py index c78cabb..f5b58c2 100644 --- a/doc/examples/example_cli_famos_calibration_tool.py +++ b/doc/examples/example_cli_famos_calibration_tool.py @@ -7,14 +7,12 @@ models_yaml = sys.argv[1] calibrated_models_yaml = sys.argv[2] -models = petab_select.model.models_from_yaml_list(models_yaml) +models = petab_select.Models.from_yaml(models_yaml) predecessor_model_hashes = set() for model in models: calibrate(model=model) predecessor_model_hashes |= {model.predecessor_model_hash} -petab_select.model.models_to_yaml_list( - models=models, output_yaml=calibrated_models_yaml -) +models.to_yaml(output_yaml=calibrated_models_yaml) if len(predecessor_model_hashes) == 0: pass diff --git 
a/doc/examples/workflow_cli.ipynb b/doc/examples/workflow_cli.ipynb index 46c5a51..6f4cf83 100644 --- a/doc/examples/workflow_cli.ipynb +++ b/doc/examples/workflow_cli.ipynb @@ -177,7 +177,7 @@ "output_path_str=$1\n", "\n", "petab_select end_iteration \\\n", - "--state=output/state.dill \\\n", + "--state=$output_path_str/state.dill \\\n", "--calibrated-models=model_selection/calibrated_models_1.yaml \\\n", "--output-models=$output_path_str/models_1.yaml \\\n", "--output-metadata=$output_path_str/metadata.yaml \\\n", @@ -289,7 +289,7 @@ "petab_select get_best \\\n", "--problem model_selection/petab_select_problem.yaml \\\n", "--models model_selection/calibrated_models_1.yaml \\\n", - "--output output_cli/predecessor_model.yaml\n", + "--output $output_path_str/predecessor_model.yaml\n", "# create a copy of the original PEtab select problem and update its paths\n", "cp model_selection/petab_select_problem.yaml $output_path_str/custom_problem.yaml\n", "sed -i 's|- model_space.tsv|- ../model_selection/model_space.tsv|' $output_path_str/custom_problem.yaml\n", @@ -470,7 +470,7 @@ "id": "889dedc1", "metadata": {}, "source": [ - "As we are performing a forward search from `M1_4`, which has two parameters, then all models in this iteration with have 3+ parameters. This model space contains only one model with 3 or more estimated parameters. We finalize the iteration with its calibration results." + "As we are performing a forward search from `M1_4`, which has two parameters, then all models in this iteration will have 3+ parameters. This model space contains only one model with 3 or more estimated parameters. We finalize the iteration with its calibration results." ] }, { @@ -531,7 +531,7 @@ "metadata": {}, "source": [ "## Fourth iteration\n", - "As there are no models in the model space with 4+ parameters, subsequent forward searches will return no candidate models. This can be used by tools to detect when model selection terminates." 
+ "As there are no models in the model space with 4+ parameters, subsequent forward searches will return no candidate models. Tools can detect when to terminate by inspecting the metadata produced by `end_iteration`, as demonstrated at the end of this iteration." ] }, { @@ -600,8 +600,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "null\n", - "...\n", + "[]\n", "\n" ] } @@ -611,6 +610,43 @@ " print(f.read())" ] }, + { + "cell_type": "code", + "execution_count": 16, + "id": "02df7ed9-422d-4f28-9b01-8670be873933", + "metadata": {}, + "outputs": [], + "source": [ + "%%bash -s \"$output_path_str\"\n", + "output_path_str=$1\n", + "\n", + "petab_select end_iteration \\\n", + "--state=$output_path_str/state.dill \\\n", + "--output-models=$output_path_str/models_4.yaml \\\n", + "--output-metadata=$output_path_str/metadata.yaml \\\n", + "--relative-paths" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "57e483fd-5ffa-48a4-8c2a-359f6ebd1422", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "terminate: true\n", + "\n" + ] + } + ], + "source": [ + "with open(\"output_cli/metadata.yaml\") as f:\n", + " print(f.read())" + ] + }, { "cell_type": "markdown", "id": "7b0b1123", @@ -622,7 +658,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 18, "id": "d5b5087d", "metadata": {}, "outputs": [], @@ -643,7 +679,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 19, "id": "30721bfa", "metadata": {}, "outputs": [ @@ -716,7 +752,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 20, "id": "73d54111", "metadata": {}, "outputs": [], @@ -736,7 +772,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 21, "id": "c36564f1", "metadata": {}, "outputs": [ @@ -781,7 +817,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "id": "d5d03cd6", "metadata": {}, "outputs": [ diff --git 
a/doc/examples/workflow_python.ipynb b/doc/examples/workflow_python.ipynb index 2a20398..170c767 100644 --- a/doc/examples/workflow_python.ipynb +++ b/doc/examples/workflow_python.ipynb @@ -35,7 +35,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Information about the model selection problem.\n", + "Information about the model selection problem:\n", "YAML: model_selection/petab_select_problem.yaml\n", "Method: forward\n", "Criterion: Criterion.AIC\n", @@ -306,7 +306,7 @@ "Model ID: M1_2-000\n", "Criterion.AIC: 140\n", "\n", - "\u001B[1mBEST MODEL OF CURRENT ITERATION\u001B[0m\n", + "\u001b[1mBEST MODEL OF CURRENT ITERATION\u001b[0m\n", "Model subspace ID: M1_3\n", "PEtab YAML location: model_selection/petab_problem.yaml\n", "Custom model parameters: {'k1': 'estimate', 'k2': 0.1, 'k3': 0}\n", @@ -356,7 +356,7 @@ "Model ID: M1_5-000\n", "Criterion.AIC: -70\n", "\n", - "\u001B[1mBEST MODEL OF CURRENT ITERATION\u001B[0m\n", + "\u001b[1mBEST MODEL OF CURRENT ITERATION\u001b[0m\n", "Model subspace ID: M1_6\n", "PEtab YAML location: model_selection/petab_problem.yaml\n", "Custom model parameters: {'k1': 'estimate', 'k2': 'estimate', 'k3': 0}\n", @@ -399,7 +399,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001B[1mBEST MODEL OF CURRENT ITERATION\u001B[0m\n", + "\u001b[1mBEST MODEL OF CURRENT ITERATION\u001b[0m\n", "Model subspace ID: M1_7\n", "PEtab YAML location: model_selection/petab_problem.yaml\n", "Custom model parameters: {'k1': 'estimate', 'k2': 'estimate', 'k3': 'estimate'}\n", diff --git a/petab_select/__init__.py b/petab_select/__init__.py index 233d233..665c410 100644 --- a/petab_select/__init__.py +++ b/petab_select/__init__.py @@ -9,6 +9,7 @@ from .model import * from .model_space import * from .model_subspace import * +from .models import * from .problem import * from .ui import * diff --git a/petab_select/candidate_space.py b/petab_select/candidate_space.py index 865bebf..fad615e 100644 --- a/petab_select/candidate_space.py +++ 
b/petab_select/candidate_space.py @@ -27,6 +27,7 @@ ) from .handlers import TYPE_LIMIT, LimitHandler from .model import Model, ModelHash, default_compare +from .models import Models __all__ = [ "BackwardCandidateSpace", @@ -102,7 +103,7 @@ def __init__( limit: TYPE_LIMIT = np.inf, summary_tsv: TYPE_PATH = None, previous_predecessor_model: Model | None = None, - calibrated_models: dict[ModelHash, Model] = None, + calibrated_models: Models | None = None, ): """See class attributes for arguments.""" self.method = method @@ -125,15 +126,13 @@ def __init__( if self.previous_predecessor_model is None: self.previous_predecessor_model = self.predecessor_model - self.set_iteration_user_calibrated_models({}) + self.set_iteration_user_calibrated_models(Models()) self.criterion = criterion - self.calibrated_models = calibrated_models - if self.calibrated_models is None: - self.calibrated_models = {} - self.latest_iteration_calibrated_models = {} + self.calibrated_models = calibrated_models or Models() + self.latest_iteration_calibrated_models = Models() def set_iteration_user_calibrated_models( - self, user_calibrated_models: dict[str, Model] | None + self, user_calibrated_models: Models | None ) -> None: """Hide previously-calibrated models from the calibration tool. @@ -146,18 +145,17 @@ def set_iteration_user_calibrated_models( Args: user_calibrated_models: - The previously-calibrated models. Keys are model hashes, values - are models. + The previously-calibrated models. 
""" if not user_calibrated_models: - self.iteration_user_calibrated_models = {} + self.iteration_user_calibrated_models = Models() return - iteration_uncalibrated_models = [] - iteration_user_calibrated_models = {} + iteration_uncalibrated_models = Models() + iteration_user_calibrated_models = Models() for model in self.models: if ( - (user_model := user_calibrated_models.get(model.get_hash())) + (user_model := user_calibrated_models[model.get_hash()]) is not None ) and ( user_model.get_criterion( @@ -209,11 +207,11 @@ def get_iteration_calibrated_models( The full list of calibrated models. """ combined_calibrated_models = ( - self.iteration_user_calibrated_models | calibrated_models + self.iteration_user_calibrated_models + calibrated_models ) if reset: self.set_iteration_user_calibrated_models( - user_calibrated_models={} + user_calibrated_models=Models() ) return combined_calibrated_models @@ -418,7 +416,7 @@ def consider(self, model: Model | None) -> bool: def reset_accepted(self) -> None: """Reset the accepted models.""" - self.models = [] + self.models = Models() self.distances = [] def set_predecessor_model(self, predecessor_model: Model | str | None): @@ -452,6 +450,7 @@ def set_excluded_hashes( extend: Whether to replace or extend the current excluded hashes. """ + # FIXME refactor to use `Models` and rename `set_excluded_models`? if isinstance(hashes, Model | ModelHash): hashes = [hashes] excluded_hashes = set() @@ -642,7 +641,7 @@ def distances_in_estimated_parameters( def update_after_calibration( self, *args, - iteration_calibrated_models: dict[ModelHash, Model], + iteration_calibrated_models: Models, **kwargs, ): """Do work in the candidate space after calibration. @@ -654,7 +653,7 @@ def update_after_calibration( are here, to ensure candidate spaces can be switched easily and still receive sufficient arguments. 
""" - self.calibrated_models |= iteration_calibrated_models + self.calibrated_models += iteration_calibrated_models self.latest_iteration_calibrated_models = iteration_calibrated_models self.set_excluded_hashes( self.latest_iteration_calibrated_models, @@ -999,7 +998,7 @@ def __init__( else: self.most_distant_max_number = 1 - self.best_models = [] + self.best_models = Models() self.best_model_of_current_run = predecessor_model self.jumped_to_most_distant = False @@ -1030,7 +1029,7 @@ def read_arguments_from_yaml_dict(cls, yaml_dict) -> dict: def update_after_calibration( self, *args, - iteration_calibrated_models: dict[str, Model], + iteration_calibrated_models: Models, **kwargs, ) -> None: """See `CandidateSpace.update_after_calibration`.""" @@ -1045,7 +1044,7 @@ def update_after_calibration( # to False and continue to candidate generation if self.jumped_to_most_distant: self.jumped_to_most_distant = False - jumped_to_model = one(iteration_calibrated_models.values()) + jumped_to_model = one(iteration_calibrated_models) self.set_predecessor_model(jumped_to_model) self.previous_predecessor_model = jumped_to_model self.best_model_of_current_run = jumped_to_model @@ -1057,7 +1056,7 @@ def update_after_calibration( logging.info("Switching method") self.switch_method() self.switch_inner_candidate_space( - excluded_hashes=list(self.calibrated_models), + excluded_hashes=self.calibrated_models, ) logging.info( "Method switched to ", self.inner_candidate_space.method @@ -1067,14 +1066,14 @@ def update_after_calibration( def update_from_iteration_calibrated_models( self, - iteration_calibrated_models: dict[str, Model], + iteration_calibrated_models: Models, ) -> bool: """Update ``self.best_models`` with the latest ``iteration_calibrated_models`` and determine if there was a new best model. If so, return ``False``. ``True`` otherwise. 
""" go_into_switch_method = True - for model in iteration_calibrated_models.values(): + for model in iteration_calibrated_models: if ( self.best_model_of_current_run == VIRTUAL_INITIAL_MODEL or default_compare( @@ -1319,6 +1318,7 @@ def get_most_distant( most_distance = 0 most_distant_indices = [] + # FIXME for multiple PEtab problems? parameter_ids = self.best_models[0].petab_parameters for model in self.best_models: @@ -1334,7 +1334,7 @@ def get_most_distant( # initialize the least distance to the maximal possible value of it complement_least_distance = len(complement_parameters) # get the complement least distance - for calibrated_model in self.calibrated_models.values(): + for calibrated_model in self.calibrated_models: calibrated_model_estimated_parameters = np.array( [ p == ESTIMATE diff --git a/petab_select/cli.py b/petab_select/cli.py index f318205..37f8355 100644 --- a/petab_select/cli.py +++ b/petab_select/cli.py @@ -12,8 +12,9 @@ from . import ui from .candidate_space import CandidateSpace -from .constants import CANDIDATE_SPACE, MODELS, PETAB_YAML, TERMINATE -from .model import ModelHash, models_from_yaml_list, models_to_yaml_list +from .constants import CANDIDATE_SPACE, MODELS, PETAB_YAML, PROBLEM, TERMINATE +from .model import ModelHash +from .models import Models, models_to_yaml_list from .problem import Problem @@ -21,8 +22,8 @@ def read_state(filename: str) -> dict[str, Any]: with open(filename, "rb") as f: state = dill.load(f) - state["problem"] = dill.loads(state["problem"]) - state["candidate_space"] = dill.loads(state["candidate_space"]) + state[PROBLEM] = dill.loads(state[PROBLEM]) + state[CANDIDATE_SPACE] = dill.loads(state[CANDIDATE_SPACE]) return state @@ -40,8 +41,8 @@ def get_state( candidate_space: CandidateSpace, ) -> dict[str, Any]: state = { - "problem": dill.dumps(problem), - "candidate_space": dill.dumps(candidate_space), + PROBLEM: dill.dumps(problem), + CANDIDATE_SPACE: dill.dumps(candidate_space), } return state @@ -80,34 +81,6 
@@ def cli(): default=None, help="The method used to identify the candidate models. Defaults to the method in the problem YAML.", ) -# @click.option( -# '--previous-predecessor-model', -# '-P', -# 'previous_predecessor_model_yaml', -# type=str, -# default=None, -# help='(Optional) The predecessor model used in the previous iteration of model selection.', -# ) -# @click.option( -# '--calibrated-models', -# '-C', -# 'calibrated_models_yamls', -# type=str, -# multiple=True, -# default=None, -# help='(Optional) Models that have been calibrated.', -# ) -# @click.option( -# '--newly-calibrated-models', -# '-N', -# 'newly_calibrated_models_yamls', -# type=str, -# multiple=True, -# default=None, -# help=( -# '(Optional) Models that were calibrated in the most recent iteration.' -# ), -# ) @click.option( "--limit", "-l", @@ -157,10 +130,6 @@ def start_iteration( state_dill: str, uncalibrated_models_yaml: str, method: str = None, - # previous_predecessor_model_yaml: str = None, - # best: str = None, - # calibrated_models_yamls: List[str] = None, - # newly_calibrated_models_yamls: List[str] = None, limit: float = np.inf, limit_sent: float = np.inf, relative_paths: bool = False, @@ -194,11 +163,11 @@ def start_iteration( problem = state["problem"] candidate_space = state["candidate_space"] - excluded_models = [] + excluded_models = Models() # TODO seems like default is `()`, not `None`... 
if excluded_model_files is not None: - for model_yaml_list in excluded_model_files: - excluded_models.extend(models_from_yaml_list(model_yaml_list)) + for models_yaml in excluded_model_files: + excluded_models.extend(Models.from_yaml(models_yaml)) # TODO test excluded_model_hashes = [] @@ -214,49 +183,12 @@ def start_iteration( ModelHash.from_hash(hash_str) for hash_str in excluded_model_hashes ] - # previous_predecessor_model = candidate_space.predecessor_model - # if previous_predecessor_model_yaml is not None: - # previous_predecessor_model = Model.from_yaml( - # previous_predecessor_model_yaml - # ) - - # # FIXME write single methods to take all models from lists of lists of - # # models recursively - # calibrated_models = None - # if calibrated_models_yamls: - # calibrated_models = {} - # for calibrated_models_yaml in calibrated_models_yamls: - # calibrated_models.update( - # { - # model.get_hash(): model - # for model in models_from_yaml_list(calibrated_models_yaml) - # } - # ) - - # newly_calibrated_models = None - # if newly_calibrated_models_yamls: - # newly_calibrated_models = {} - # for newly_calibrated_models_yaml in newly_calibrated_models_yamls: - # newly_calibrated_models.update( - # { - # model.get_hash(): model - # for model in models_from_yaml_list( - # newly_calibrated_models_yaml - # ) - # } - # ) - ui.start_iteration( problem=problem, candidate_space=candidate_space, - # previous_predecessor_model=previous_predecessor_model, - # calibrated_models=calibrated_models, - # newly_calibrated_models=newly_calibrated_models, limit=limit, limit_sent=limit_sent, excluded_hashes=excluded_hashes, - # excluded_models=excluded_models, - # excluded_model_hashes=excluded_model_hashes, ) # Save state @@ -332,15 +264,10 @@ def end_iteration( problem = state["problem"] candidate_space = state["candidate_space"] - calibrated_models = {} + calibrated_models = Models() if calibrated_models_yamls: for calibrated_models_yaml in calibrated_models_yamls: - 
calibrated_models.update( - { - model.get_hash(): model - for model in models_from_yaml_list(calibrated_models_yaml) - } - ) + calibrated_models.extend(Models.from_yaml(calibrated_models_yaml)) # Finalize iteration results iteration_results = ui.end_iteration( @@ -409,9 +336,9 @@ def model_to_petab( Documentation for arguments can be viewed with `petab_select model_to_petab --help`. """ - models = [] + models = Models() for models_yaml in models_yamls: - models.extend(models_from_yaml_list(models_yaml)) + models.extend(Models.from_yaml(models_yaml)) model0 = None try: @@ -468,9 +395,9 @@ def models_to_petab( Documentation for arguments can be viewed with `petab_select models_to_petab --help`. """ - models = [] + models = Models() for models_yaml in models_yamls: - models.extend(models_from_yaml_list(models_yaml)) + models.extend(Models.from_yaml(models_yaml)) model_ids = pd.Series([model.model_id for model in models]) duplicates = "\n".join(set(model_ids[model_ids.duplicated()])) @@ -559,9 +486,9 @@ def get_best( problem = Problem.from_yaml(problem_yaml) - models = [] + models = Models() for models_yaml in models_yamls: - models.extend(models_from_yaml_list(models_yaml)) + models.extend(Models.from_yaml(models_yaml)) best_model = ui.get_best( problem=problem, diff --git a/petab_select/constants.py b/petab_select/constants.py index b56e8a4..9afc1cb 100644 --- a/petab_select/constants.py +++ b/petab_select/constants.py @@ -1,5 +1,7 @@ """Constants for the PEtab Select package.""" +from __future__ import annotations + import string import sys from enum import Enum @@ -84,6 +86,7 @@ VERSION = "version" MODEL_SPACE_FILES = "model_space_files" PROBLEM_ID = "problem_id" +PROBLEM = "problem" CANDIDATE_SPACE = "candidate_space" CANDIDATE_SPACE_ARGUMENTS = "candidate_space_arguments" diff --git a/petab_select/model.py b/petab_select/model.py index 6c7602f..fbb040d 100644 --- a/petab_select/model.py +++ b/petab_select/model.py @@ -45,8 +45,6 @@ __all__ = [ "Model", 
"default_compare", - "models_from_yaml_list", - "models_to_yaml_list", "ModelHash", ] @@ -56,7 +54,7 @@ class Model(PetabMixin): NB: some of these attribute names correspond to constants defined in the `constants.py` file, to facilitate loading models from/saving models to - disk (see the `saved_attributes` attribute). + disk (see the `Model.saved_attributes` class attribute). Attributes: converters_load: @@ -371,9 +369,8 @@ def from_yaml(model_yaml: TYPE_PATH) -> Model: raise raise ValueError( "The provided YAML file contains a list with greater than " - "one element. Use the `models_from_yaml_list` method or " - "provide a PEtab Select model YAML file with only one " - "model specified." + "one element. Use the `Models.from_yaml` or provide a " + "YAML file with only one model specified." ) return Model.from_dict(model_dict, base_path=Path(model_yaml).parent) @@ -656,10 +653,10 @@ def default_compare( model1: The new model. criterion: - The criterion by which models will be compared. + The criterion. criterion_threshold: The value by which the new model must improve on the original - model. Should be non-negative. + model. Should be non-negative, regardless of the criterion. Returns: ``True` if ``model1`` has a better criterion value than ``model0``, else @@ -704,97 +701,6 @@ def default_compare( raise NotImplementedError(f"Unknown criterion: {criterion}.") -def models_from_yaml_list( - model_list_yaml: TYPE_PATH, - petab_problem: petab.Problem = None, - allow_single_model: bool = True, -) -> list[Model]: - """Generate a model from a PEtab Select list of model YAML file. - - Args: - model_list_yaml: - The path to the PEtab Select list of model YAML file. - petab_problem: - See :meth:`Model.from_dict`. - allow_single_model: - Given a YAML file that contains a single model directly (not in - a 1-element list), if ``True`` then the single model will be read in, - else a ``ValueError`` will be raised. 
- - Returns: - A list of model instances, initialized with the provided - attributes. - """ - with open(str(model_list_yaml)) as f: - model_dict_list = yaml.safe_load(f) - if not model_dict_list: - return [] - - if not isinstance(model_dict_list, list): - if allow_single_model: - return [ - Model.from_dict( - model_dict_list, - base_path=Path(model_list_yaml).parent, - petab_problem=petab_problem, - ) - ] - raise ValueError("The YAML file does not contain a list of models.") - - return [ - Model.from_dict( - model_dict, - base_path=Path(model_list_yaml).parent, - petab_problem=petab_problem, - ) - for model_dict in model_dict_list - ] - - -def models_to_yaml_list( - models: list[Model | str] | dict[ModelHash, Model | str], - output_yaml: TYPE_PATH, - relative_paths: bool = True, -) -> None: - """Generate a YAML listing of models. - - Args: - models: - The models. - output_yaml: - The location where the YAML will be saved. - relative_paths: - Whether to rewrite the paths in each model (e.g. the path to the - model's PEtab problem) relative to the `output_yaml` location. - """ - if isinstance(models, dict): - models = list(models.values()) - - skipped_indices = [] - for index, model in enumerate(models): - if isinstance(model, Model): - continue - if model == VIRTUAL_INITIAL_MODEL: - continue - warnings.warn(f"Unexpected model, skipping: {model}.", stacklevel=2) - skipped_indices.append(index) - models = [ - model - for index, model in enumerate(models) - if index not in skipped_indices - ] - - paths_relative_to = None - if relative_paths: - paths_relative_to = Path(output_yaml).parent - model_dicts = [ - model.to_dict(paths_relative_to=paths_relative_to) for model in models - ] - model_dicts = None if not model_dicts else model_dicts - with open(output_yaml, "w") as f: - yaml.dump(model_dicts, f) - - class ModelHash(str): """A class to handle model hash functionality. 
diff --git a/petab_select/models.py b/petab_select/models.py new file mode 100644 index 0000000..f712add --- /dev/null +++ b/petab_select/models.py @@ -0,0 +1,393 @@ +from __future__ import annotations + +import warnings +from collections import Counter +from collections.abc import Iterable, MutableSequence +from pathlib import Path +from typing import TYPE_CHECKING, TypeAlias + +import yaml + +from .constants import TYPE_PATH +from .model import ( + Model, + ModelHash, +) + +if TYPE_CHECKING: + import petab + + from .problem import Problem + + # `Models` can be constructed from actual `Model`s, + # or `ModelHash`s, or the `str` of a model hash. + ModelLike: TypeAlias = Model | ModelHash | str + ModelsLike: TypeAlias = "Models" | Iterable[Model | ModelHash | str] + # Access a model by list index, model hash, slice of indices, model hash + # string, or an iterable of these things. + ModelIndex: TypeAlias = int | ModelHash | slice | str | Iterable + +__all__ = [ + "Models", + "models_from_yaml_list", + "models_to_yaml_list", +] + + +class Models(MutableSequence): + """A collection of models. + + Behaves like a list of models, but also supports operations + involving objects that can be mapped to model(s). For example, model hashes + can be used to add or access models. + + Some list methods are not yet implemented -- feel free to request anything + that feels intuitive. + + Provide a PEtab Select ``problem`` to the constructor or via + ``set_problem``, to add models by hashes. This means that all models + must belong to the same PEtab Select problem. + """ + + def set_problem(self, problem: Problem) -> None: + """Set the PEtab Select problem for this set of models.""" + self._problem = problem + + def lint(self): + """Lint the models, e.g. check all hashes are unique. + + Currently raises an exception when invalid. 
+ """ + duplicates = [ + model_hash + for model_hash, count in Counter(self._hashes).items() + if count > 1 + ] + if duplicates: + raise ValueError( + "Multiple models exist with the same hash. " + f"Model hashes: `{duplicates}`." + ) + + @staticmethod + def from_yaml( + models_yaml: TYPE_PATH, + petab_problem: petab.Problem = None, + problem: Problem = None, + ) -> Models: + """Generate models from a PEtab Select list of model YAML file. + + Args: + models_yaml: + The path to the PEtab Select list of model YAML file. + petab_problem: + See :meth:`Model.from_dict`. + problem: + The PEtab Select problem. + + Returns: + The models. + """ + with open(str(models_yaml)) as f: + model_dict_list = yaml.safe_load(f) + if not model_dict_list: + # Empty file + models = [] + elif not isinstance(model_dict_list, list): + # File contains a single model + models = [ + Model.from_dict( + model_dict_list, + base_path=Path(models_yaml).parent, + petab_problem=petab_problem, + ) + ] + else: + # File contains a list of models + models = [ + Model.from_dict( + model_dict, + base_path=Path(models_yaml).parent, + petab_problem=petab_problem, + ) + for model_dict in model_dict_list + ] + + return Models(models=models, problem=problem) + + def to_yaml( + self, + output_yaml: TYPE_PATH, + relative_paths: bool = True, + ) -> None: + """Generate a YAML listing of models. + + Args: + output_yaml: + The location where the YAML will be saved. + relative_paths: + Whether to rewrite the paths in each model (e.g. the path to the + model's PEtab problem) relative to the `output_yaml` location. + """ + paths_relative_to = None + if relative_paths: + paths_relative_to = Path(output_yaml).parent + model_dicts = [ + model.to_dict(paths_relative_to=paths_relative_to) + for model in self + ] + with open(output_yaml, "w") as f: + yaml.safe_dump(model_dicts, f) + + # `list` methods. Compared to `UserList`, some methods are skipped. 
+ # https://github.com/python/cpython/blob/main/Lib/collections/__init__.py + + def __init__( + self, models: Iterable[ModelLike] = None, problem: Problem = None + ) -> Models: + self._models = [] + self._hashes = [] + self._problem = problem + + if models is None: + models = [] + self.extend(models) + + def __repr__(self) -> str: + """Get the model hashes that can regenerate these models. + + N.B.: some information, e.g. criteria, will be lost if the hashes are + used to reproduce the set of models. + """ + return repr(self._hashes) + + # skipped __lt__, __le__ + + def __eq__(self, other) -> bool: + other_hashes = Models(other)._hashes + same_length = len(self._hashes) == len(other_hashes) + same_hashes = set(self._hashes) == set(other_hashes) + return same_length and same_hashes + + # skipped __gt__, __ge__, __cast + + def __contains__(self, item: ModelLike) -> bool: + match item: + case Model(): + return item in self._models + case ModelHash() | str(): + return item in self._hashes + case _: + raise TypeError(f"Unexpected type: `{type(item)}`.") + + def __len__(self) -> int: + return len(self._models) + + def __getitem__( + self, item: ModelIndex | Iterable[ModelIndex] + ) -> Model | Models: + match item: + case int(): + return self._models[item] + case ModelHash() | str(): + return self._models[self._hashes.index(item)] + case slice(): + return self.__class__(self._models[item]) + case Iterable(): + # TODO sensible to yield here? 
+ return [self[item_] for item_ in item] + case _: + raise TypeError(f"Unexpected type: `{type(item)}`.") + + def __setitem__(self, key: ModelIndex, item: ModelLike) -> None: + match key: + case int(): + pass + case ModelHash() | str(): + key = self._hashes.index(key) + case slice(): + for key_, item_ in zip( + range(*key.indices(len(self))), item, strict=True + ): + self[key_] = item_ + case Iterable(): + for key_, item_ in zip(key, item, strict=True): + self[key_] = item_ + case _: + raise TypeError(f"Unexpected type: `{type(key)}`.") + + match item: + case Model(): + pass + case ModelHash() | str(): + item = self._problem.model_hash_to_model(item) + case _: + raise TypeError(f"Unexpected type: `{type(item)}`.") + + if key < len(self._models): + self._models[key] = item + self._hashes[key] = item.get_hash() + else: + # Key doesn't exist, e.g., instead of + # models[1] = model1 + # the user did something like + # models[model1_hash] = model1 + # to add a new model. + self.append(item) + + def __delitem__(self, key: ModelIndex) -> None: + match key: + case ModelHash() | str(): + key = self._hashes.index(key) + case slice(): + for key_ in range(*key.indices(len(self))): + del self[key_] + case Iterable(): + for key_ in key: + del self[key_] + case _: + raise TypeError(f"Unexpected type: `{type(key)}`.") + + del self._models[key] + del self._hashes[key] + + def __add__( + self, other: ModelLike | ModelsLike, left: bool = True + ) -> Models: + match other: + case Models(): + new_models = other._models + case Model(): + new_models = [other] + case ModelHash() | str(): + # Assumes the models belong to the same PEtab Select problem. + new_models = [self._problem.model_hash_to_model(other)] + case Iterable(): + # Assumes the models belong to the same PEtab Select problem. 
+ new_models = Models(other, problem=self._problem)._models + case _: + raise TypeError(f"Unexpected type: `{type(other)}`.") + + models = self._models + new_models + if not left: + models = new_models + self._models + return Models(models=models, problem=self._problem) + + def __radd__(self, other: ModelLike | ModelsLike) -> Models: + return self.__add__(other=other, left=False) + + def __iadd__(self, other: ModelLike | ModelsLike) -> Models: + return self.__add__(other=other) + + # skipped __mul__, __rmul__, __imul__ + + def __copy__(self) -> Models: + return Models(models=self._models, problem=self._problem) + + def append(self, item: ModelLike) -> None: + # Re-use __setitem__ logic + self._models.append(None) + self._hashes.append(None) + self[-1] = item + + def insert(self, index: int, item: ModelLike): + # Re-use __setitem__ logic + self._models.insert(index, None) + self._hashes.insert(index, None) + self[index] = item + + # def pop(self, index: int = -1): + # model = self._models[index] + + # # Re-use __delitem__ logic + # del self[index] + + # return model + + # def remove(self, item: ModelLike): + # # Re-use __delitem__ logic + # if isinstance(item, Model): + # item = item.get_hash() + # del self[item] + + # skipped clear, copy, count + + def index(self, item: ModelLike, *args) -> int: + if isinstance(item, Model): + item = item.get_hash() + return self._hashes.index(item, *args) + + # skipped reverse, sort + + def extend(self, other: Iterable[ModelLike]) -> None: + # Re-use append and therein __setitem__ logic + for model_like in other: + self.append(model_like) + + +def models_from_yaml_list( + model_list_yaml: TYPE_PATH, + petab_problem: petab.Problem = None, + allow_single_model: bool = True, + problem: Problem = None, +) -> Models: + """Generate a model from a PEtab Select list of model YAML file. + + Deprecated. Use `petab_select.Models.from_yaml` instead. + + Args: + model_list_yaml: + The path to the PEtab Select list of model YAML file. 
+ petab_problem: + See :meth:`Model.from_dict`. + allow_single_model: + Given a YAML file that contains a single model directly (not in + a 1-element list), if ``True`` then the single model will be read in, + else a ``ValueError`` will be raised. + problem: + The PEtab Select problem. + + Returns: + The models. + """ + warnings.warn( + ( + "Use `petab_select.Models.from_yaml` instead. " + "The `allow_single_model` argument is fixed to `True` now." + ), + DeprecationWarning, + stacklevel=2, + ) + return Models.from_yaml( + models_yaml=model_list_yaml, + petab_problem=petab_problem, + problem=problem, + ) + + +def models_to_yaml_list( + models: Models, + output_yaml: TYPE_PATH, + relative_paths: bool = True, +) -> None: + """Generate a YAML listing of models. + + Deprecated. Use `petab_select.Models.to_yaml` instead. + + Args: + models: + The models. + output_yaml: + The location where the YAML will be saved. + relative_paths: + Whether to rewrite the paths in each model (e.g. the path to the + model's PEtab problem) relative to the `output_yaml` location. + """ + warnings.warn( + "Use `petab_select.Models.to_yaml` instead.", + DeprecationWarning, + stacklevel=2, + ) + Models(models=models).to_yaml( + output_yaml=output_yaml, relative_paths=relative_paths + ) diff --git a/petab_select/problem.py b/petab_select/problem.py index c7e2014..b0a763b 100644 --- a/petab_select/problem.py +++ b/petab_select/problem.py @@ -1,5 +1,6 @@ """The model selection problem class.""" +import warnings from collections.abc import Callable, Iterable from functools import partial from pathlib import Path @@ -21,6 +22,7 @@ ) from .model import Model, ModelHash, default_compare from .model_space import ModelSpace +from .models import Models __all__ = [ "Problem", @@ -122,7 +124,7 @@ def get_path(self, relative_path: str | Path) -> Path: def exclude_models( self, - models: Iterable[Model], + models: Models, ) -> None: """Exclude models from the model space. 
@@ -142,7 +144,13 @@ def exclude_model_hashes( model_hashes: The model hashes. """ - self.model_space.exclude_model_hashes(model_hashes) + # FIXME think about design here -- should we have exclude_models here? + warnings.warn( + "Use `exclude_models` instead. It also accepts hashes.", + DeprecationWarning, + stacklevel=2, + ) + self.exclude_models(models=Models(models=model_hashes, problem=self)) @staticmethod def from_yaml( @@ -212,7 +220,8 @@ def from_yaml( def get_best( self, - models: list[Model] | dict[ModelHash, Model] | None, + models: Models, + # models: list[Model] | dict[ModelHash, Model] | None, criterion: str | None | None = None, compute_criterion: bool = False, ) -> Model: @@ -222,11 +231,9 @@ def get_best( Args: models: - The best model will be taken from these models. + The models. criterion: - The criterion by which models will be compared. Defaults to - ``self.criterion`` (e.g. as defined in the PEtab Select problem YAML - file). + The criterion. Defaults to the problem criterion. compute_criterion: Whether to try computing criterion values, if sufficient information is available (e.g., likelihood and number of @@ -235,8 +242,6 @@ def get_best( Returns: The best model. """ - if isinstance(models, dict): - models = list(models.values()) if criterion is None: criterion = self.criterion diff --git a/petab_select/ui.py b/petab_select/ui.py index f5ed1f1..d2dd3f1 100644 --- a/petab_select/ui.py +++ b/petab_select/ui.py @@ -18,6 +18,7 @@ Method, ) from .model import Model, ModelHash, default_compare +from .models import Models from .problem import Problem __all__ = [ @@ -45,7 +46,7 @@ def start_iteration( limit_sent: float | int = np.inf, excluded_hashes: list[ModelHash] | None = None, criterion: Criterion | None = None, - user_calibrated_models: list[Model] | dict[ModelHash, Model] | None = None, + user_calibrated_models: Models | None = None, ) -> CandidateSpace: """Search the model space for candidate models. 
@@ -71,8 +72,7 @@ def start_iteration( The criterion by which models will be compared. Defaults to the criterion defined in the PEtab Select problem. user_calibrated_models: - Models that were already calibrated by the user. When supplied as a - `dict`, the keys are model hashes. If a model in the + Models that were already calibrated by the user. If a model in the candidates has the same hash as a model in `user_calibrated_models`, then the candidate will be replaced with the calibrated version. Calibration tools will only receive uncalibrated @@ -124,7 +124,7 @@ def start_iteration( ) is None ): - candidate_space.models = [copy.deepcopy(predecessor_model)] + candidate_space.models = Models([copy.deepcopy(predecessor_model)]) # Dummy zero likelihood, which the predecessor model will # improve on after it's actually calibrated. predecessor_model.set_criterion(Criterion.LH, 0.0) @@ -145,7 +145,7 @@ def start_iteration( # this is not the first step of the search. if candidate_space.latest_iteration_calibrated_models: predecessor_model = problem.get_best( - candidate_space.latest_iteration_calibrated_models.values(), + candidate_space.latest_iteration_calibrated_models, criterion=criterion, ) # If the new predecessor model isn't better than the previous one, @@ -194,7 +194,7 @@ def start_iteration( if isinstance(candidate_space, FamosCandidateSpace): try: candidate_space.update_after_calibration( - iteration_calibrated_models={}, + iteration_calibrated_models=Models(), ) continue except StopIteration: @@ -214,8 +214,8 @@ def start_iteration( def end_iteration( candidate_space: CandidateSpace, - calibrated_models: list[Model] | dict[str, Model], -) -> dict[str, dict[ModelHash, Model] | bool | CandidateSpace]: + calibrated_models: Models, +) -> dict[str, Models | bool | CandidateSpace]: """Finalize model selection iteration. All models from the current iteration are provided to the calibration tool. 
@@ -234,17 +234,11 @@ def end_iteration( Returns: A dictionary, with the following items: :const:`petab_select.constants.MODELS`: - All calibrated models for the current iteration as a - dictionary, where keys are model hashes, and values are models. + All calibrated models for the current iteration. :const:`petab_select.constants.TERMINATE`: Whether PEtab Select has decided to end the model selection, as a boolean. """ - if isinstance(calibrated_models, list): - calibrated_models = { - model.get_hash(): model for model in calibrated_models - } - iteration_results = { MODELS: candidate_space.get_iteration_calibrated_models( calibrated_models=calibrated_models, @@ -288,7 +282,7 @@ def model_to_petab( def models_to_petab( - models: list[Model], + models: Models, output_path_prefix: list[TYPE_PATH] | None = None, ) -> list[dict[str, petab.Problem | TYPE_PATH]]: """Generate the PEtab problems for a list of models.