diff --git a/flair/nn/model.py b/flair/nn/model.py
index 0af32dca5..49716b070 100644
--- a/flair/nn/model.py
+++ b/flair/nn/model.py
@@ -434,71 +434,65 @@ def evaluate(
                 labels=labels,
             )
 
+            # compute accuracy separately as it is not always in classification_report (e.g. when micro avg exists)
             accuracy_score = round(sklearn.metrics.accuracy_score(y_true, y_pred), 4)
-            macro_f_score = round(classification_report_dict["macro avg"]["f1-score"], 4)
 
             # if there is only one label, then "micro avg" = "macro avg"
             if len(target_names) == 1:
                 classification_report_dict["micro avg"] = classification_report_dict["macro avg"]
 
-            if "micro avg" in classification_report_dict:
-                # micro average is only computed if zero-label exists (for instance "O")
-                micro_f_score = round(classification_report_dict["micro avg"]["f1-score"], 4)
-            else:
-                # if no zero-label exists (such as in POS tagging) micro average is equal to accuracy
-                micro_f_score = round(classification_report_dict["accuracy"], 4)
-
-            # same for the main score
-            if "micro avg" not in classification_report_dict and main_evaluation_metric[0] == "micro avg":
-                main_score = classification_report_dict["accuracy"]
-            else:
-                main_score = classification_report_dict[main_evaluation_metric[0]][main_evaluation_metric[1]]
-
-        else:
-            # issue error and default all evaluation numbers to 0.
-            log.error(
-                "ACHTUNG! No gold labels and no all_predicted_values found! "
-                "Could be an error in your corpus or how you "
-                "initialize the trainer!"
+            # The "micro avg" appears only in the classification report if no prediction is possible.
+            # Otherwise, it is identical to the "macro avg". In this case, we add it to the report.
+            if "micro avg" not in classification_report_dict:
+                classification_report_dict["micro avg"] = {}
+                for precision_recall_f1 in classification_report_dict["macro avg"]:
+                    classification_report_dict["micro avg"][precision_recall_f1] = classification_report_dict[
+                        "accuracy"
+                    ]
+
+            detailed_result = (
+                "\nResults:"
+                f"\n- F-score (micro) {round(classification_report_dict['micro avg']['f1-score'], 4)}"
+                f"\n- F-score (macro) {round(classification_report_dict['macro avg']['f1-score'], 4)}"
+                f"\n- Accuracy {accuracy_score}"
+                "\n\nBy class:\n" + classification_report
             )
-            accuracy_score = micro_f_score = macro_f_score = main_score = 0.0
-            classification_report = ""
-            classification_report_dict = {}
-
-        detailed_result = (
-            "\nResults:"
-            f"\n- F-score (micro) {micro_f_score}"
-            f"\n- F-score (macro) {macro_f_score}"
-            f"\n- Accuracy {accuracy_score}"
-            "\n\nBy class:\n" + classification_report
-        )
 
-        scores: Dict[Union[Tuple[str, ...], str], Any] = {}
+            # Create and populate score object for logging with all evaluation values, plus the loss
+            scores: Dict[Union[Tuple[str, ...], str], Any] = {}
 
-        for avg_type in ("micro avg", "macro avg"):
-            for metric_type in ("f1-score", "precision", "recall"):
-                if avg_type == "micro avg" and avg_type not in classification_report_dict:
-                    value = classification_report_dict["accuracy"]
+            for avg_type in ("micro avg", "macro avg"):
+                for metric_type in ("f1-score", "precision", "recall"):
+                    scores[(avg_type, metric_type)] = classification_report_dict[avg_type][metric_type]
 
-                else:
-                    value = classification_report_dict[avg_type][metric_type]
+            scores["accuracy"] = accuracy_score
 
-                scores[(avg_type, metric_type)] = value
+            if average_over > 0:
+                eval_loss /= average_over
+            scores["loss"] = eval_loss.item()
 
-        scores["accuracy"] = accuracy_score
-
-        if average_over > 0:
-            eval_loss /= average_over
-        scores["loss"] = eval_loss.item()
+            return Result(
+                main_score=classification_report_dict[main_evaluation_metric[0]][main_evaluation_metric[1]],
+                detailed_results=detailed_result,
+                classification_report=classification_report_dict,
+                scores=scores,
+            )
 
-        result = Result(
-            main_score=main_score,
-            detailed_results=detailed_result,
-            classification_report=classification_report_dict,
-            scores=scores,
-        )
+        else:
+            # issue error and default all evaluation numbers to 0.
+            error_text = (
+                f"It was not possible to compute evaluation values because: \n"
+                f"- The evaluation data has no gold labels for label_type='{gold_label_type}'!\n"
+                f"- And no predictions were made!\n"
+                "Double check your corpus (if the test split has labels), and how you initialize the ModelTrainer!"
+            )
 
-        return result
+            return Result(
+                main_score=0.0,
+                detailed_results=error_text,
+                classification_report={},
+                scores={"loss": 0.0},
+            )
 
     @abstractmethod
     def predict(
diff --git a/flair/nn/multitask.py b/flair/nn/multitask.py
index f200fd20e..6fa2f20c0 100644
--- a/flair/nn/multitask.py
+++ b/flair/nn/multitask.py
@@ -17,7 +17,7 @@ def make_multitask_model_and_corpus(
         models.append(map[0])
         corpora.append(map[1])
         if len(map) == 3:
-            loss_factors.append(map[2])  # type:ignore[misc] # mypy does not understand if above checks for length
+            loss_factors.append(map[2])
         else:
             loss_factors.append(1.0)
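Note (not part of the patch): the "micro avg" handling added in the first hunk can be exercised in isolation. The sketch below is a minimal, hypothetical example, assuming scikit-learn is installed and using made-up toy labels; it mirrors (but is not copied from) the backfill the new evaluate() code performs when sklearn's report contains "accuracy" instead of "micro avg".

# Minimal sketch, not part of this diff: mirrors the "micro avg" backfill added above.
import sklearn.metrics

# Toy labels, for illustration only.
y_true = ["PER", "LOC", "PER", "ORG"]
y_pred = ["PER", "LOC", "ORG", "ORG"]

report = sklearn.metrics.classification_report(y_true, y_pred, output_dict=True, zero_division=0)

# When every label is scored, sklearn emits "accuracy" and omits "micro avg".
# The patch copies the accuracy value into a synthetic "micro avg" entry so that
# downstream code can always read report["micro avg"]["f1-score"].
if "micro avg" not in report:
    report["micro avg"] = {key: report["accuracy"] for key in report["macro avg"]}

print(round(report["micro avg"]["f1-score"], 4))  # equals the accuracy here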