diff --git a/flair/nn/model.py b/flair/nn/model.py
index 0af32dca5..49716b070 100644
--- a/flair/nn/model.py
+++ b/flair/nn/model.py
@@ -434,71 +434,65 @@ def evaluate(
                 labels=labels,
             )
 
+            # compute accuracy separately as it is not always in classification_report (e.g. when micro avg exists)
             accuracy_score = round(sklearn.metrics.accuracy_score(y_true, y_pred), 4)
-            macro_f_score = round(classification_report_dict["macro avg"]["f1-score"], 4)
 
             # if there is only one label, then "micro avg" = "macro avg"
             if len(target_names) == 1:
                 classification_report_dict["micro avg"] = classification_report_dict["macro avg"]
 
-            if "micro avg" in classification_report_dict:
-                # micro average is only computed if zero-label exists (for instance "O")
-                micro_f_score = round(classification_report_dict["micro avg"]["f1-score"], 4)
-            else:
-                # if no zero-label exists (such as in POS tagging) micro average is equal to accuracy
-                micro_f_score = round(classification_report_dict["accuracy"], 4)
-
-            # same for the main score
-            if "micro avg" not in classification_report_dict and main_evaluation_metric[0] == "micro avg":
-                main_score = classification_report_dict["accuracy"]
-            else:
-                main_score = classification_report_dict[main_evaluation_metric[0]][main_evaluation_metric[1]]
-
-        else:
-            # issue error and default all evaluation numbers to 0.
-            log.error(
-                "ACHTUNG! No gold labels and no all_predicted_values found! "
-                "Could be an error in your corpus or how you "
-                "initialize the trainer!"
+            # The "micro avg" appears only in the classification report if no prediction is possible.
+            # Otherwise, it is identical to the "macro avg". In this case, we add it to the report.
+            if "micro avg" not in classification_report_dict:
+                classification_report_dict["micro avg"] = {}
+                for precision_recall_f1 in classification_report_dict["macro avg"]:
+                    classification_report_dict["micro avg"][precision_recall_f1] = classification_report_dict[
+                        "accuracy"
+                    ]
+
+            detailed_result = (
+                "\nResults:"
+                f"\n- F-score (micro) {round(classification_report_dict['micro avg']['f1-score'], 4)}"
+                f"\n- F-score (macro) {round(classification_report_dict['macro avg']['f1-score'], 4)}"
+                f"\n- Accuracy {accuracy_score}"
+                "\n\nBy class:\n" + classification_report
             )
-            accuracy_score = micro_f_score = macro_f_score = main_score = 0.0
-            classification_report = ""
-            classification_report_dict = {}
-
-        detailed_result = (
-            "\nResults:"
-            f"\n- F-score (micro) {micro_f_score}"
-            f"\n- F-score (macro) {macro_f_score}"
-            f"\n- Accuracy {accuracy_score}"
-            "\n\nBy class:\n" + classification_report
-        )
 
-        scores: Dict[Union[Tuple[str, ...], str], Any] = {}
+            # Create and populate score object for logging with all evaluation values, plus the loss
+            scores: Dict[Union[Tuple[str, ...], str], Any] = {}
 
-        for avg_type in ("micro avg", "macro avg"):
-            for metric_type in ("f1-score", "precision", "recall"):
-                if avg_type == "micro avg" and avg_type not in classification_report_dict:
-                    value = classification_report_dict["accuracy"]
+            for avg_type in ("micro avg", "macro avg"):
+                for metric_type in ("f1-score", "precision", "recall"):
+                    scores[(avg_type, metric_type)] = classification_report_dict[avg_type][metric_type]
 
-                else:
-                    value = classification_report_dict[avg_type][metric_type]
+            scores["accuracy"] = accuracy_score
 
-                scores[(avg_type, metric_type)] = value
+            if average_over > 0:
+                eval_loss /= average_over
+            scores["loss"] = eval_loss.item()
 
-        scores["accuracy"] = accuracy_score
-
-        if average_over > 0:
-            eval_loss /= average_over
-        scores["loss"] = eval_loss.item()
+            return Result(
+                main_score=classification_report_dict[main_evaluation_metric[0]][main_evaluation_metric[1]],
+                detailed_results=detailed_result,
+                classification_report=classification_report_dict,
+                scores=scores,
+            )
 
-        result = Result(
-            main_score=main_score,
-            detailed_results=detailed_result,
-            classification_report=classification_report_dict,
-            scores=scores,
-        )
+        else:
+            # issue error and default all evaluation numbers to 0.
+            error_text = (
+                f"It was not possible to compute evaluation values because: \n"
+                f"- The evaluation data has no gold labels for label_type='{gold_label_type}'!\n"
+                f"- And no predictions were made!\n"
+                "Double check your corpus (if the test split has labels), and how you initialize the ModelTrainer!"
+            )
 
-        return result
+            return Result(
+                main_score=0.0,
+                detailed_results=error_text,
+                classification_report={},
+                scores={"loss": 0.0},
+            )
 
     @abstractmethod
     def predict(
diff --git a/flair/nn/multitask.py b/flair/nn/multitask.py
index f200fd20e..6fa2f20c0 100644
--- a/flair/nn/multitask.py
+++ b/flair/nn/multitask.py
@@ -17,7 +17,7 @@ def make_multitask_model_and_corpus(
         models.append(map[0])
         corpora.append(map[1])
         if len(map) == 3:
-            loss_factors.append(map[2])  # type:ignore[misc] # mypy does not understand if above checks for length
+            loss_factors.append(map[2])
         else:
             loss_factors.append(1.0)
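Note (not part of the patch): the "micro avg" handling added in the first hunk can be exercised in isolation. The sketch below is a minimal, hypothetical example, assuming scikit-learn is installed and using made-up toy labels; it mirrors (but is not copied from) the backfill the new evaluate() code performs when sklearn's report contains "accuracy" instead of "micro avg".

# Minimal sketch, not part of this diff: mirrors the "micro avg" backfill added above.
import sklearn.metrics

# Toy labels, for illustration only.
y_true = ["PER", "LOC", "PER", "ORG"]
y_pred = ["PER", "LOC", "ORG", "ORG"]

report = sklearn.metrics.classification_report(y_true, y_pred, output_dict=True, zero_division=0)

# When every label is scored, sklearn emits "accuracy" and omits "micro avg".
# The patch copies the accuracy value into a synthetic "micro avg" entry so that
# downstream code can always read report["micro avg"]["f1-score"].
if "micro avg" not in report:
    report["micro avg"] = {key: report["accuracy"] for key in report["macro avg"]}

print(round(report["micro avg"]["f1-score"], 4))  # equals the accuracy here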