diff --git a/weave/flow/eval.py b/weave/flow/eval.py
index fc250cf3f97..e63b1d846bf 100644
--- a/weave/flow/eval.py
+++ b/weave/flow/eval.py
@@ -493,6 +493,13 @@ async def eval_example(example: dict) -> dict:
             n_complete += 1
             if verbose:
                 print(f"Evaluated {n_complete} of {len(trial_rows)} examples")
+            else:
+                # Print progress at 25%, 50%, 75% and 100%
+                total_rows = len(trial_rows)
+                progress_milestones = [total_rows // 4, total_rows // 2, 3 * total_rows // 4, total_rows]
+                if n_complete in progress_milestones:
+                    percent_complete = int((n_complete / total_rows) * 100)
+                    print(f"Evaluated {percent_complete}% of examples")
             # status.update(
             #     f"Evaluating... {duration:.2f}s [{n_complete} / {len(self.dataset.rows)} complete]"  # type:ignore
             # )
diff --git a/weave/scorers/context_relevance_scorer.py b/weave/scorers/context_relevance_scorer.py
index 8a6ff5166ba..2fae089acf6 100644
--- a/weave/scorers/context_relevance_scorer.py
+++ b/weave/scorers/context_relevance_scorer.py
@@ -310,11 +310,13 @@ def _score_document(
         combined_mask = ~((model_inputs["input_ids"] == 2).bool() | special_tokens_mask.bool()).cpu().numpy().flatten()
         # we should mask the query up to the sep token,
         # on the combined mask we have to search for the first False
-        # TODO: Check that this is now wrong
+        # TODO: Check that this is not wrong
         false_indices = np.where(~combined_mask)[0]
         start = false_indices[0]
         end = false_indices[1]
         combined_mask[start:end] = False
+
+
         results = self._model(**model_inputs)
         logits = results.logits[0].detach()
 