Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix classification report if dataset has no labels #3375

Merged
merged 6 commits into from
Dec 5, 2023
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
97 changes: 46 additions & 51 deletions flair/nn/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -434,71 +434,66 @@ def evaluate(
labels=labels,
)

# compute accuracy separately as it is not always in classification_report (e.g. when micro avg exists)
accuracy_score = round(sklearn.metrics.accuracy_score(y_true, y_pred), 4)
macro_f_score = round(classification_report_dict["macro avg"]["f1-score"], 4)

# if there is only one label, then "micro avg" = "macro avg"
if len(target_names) == 1:
classification_report_dict["micro avg"] = classification_report_dict["macro avg"]

if "micro avg" in classification_report_dict:
# micro average is only computed if zero-label exists (for instance "O")
micro_f_score = round(classification_report_dict["micro avg"]["f1-score"], 4)
else:
# if no zero-label exists (such as in POS tagging) micro average is equal to accuracy
micro_f_score = round(classification_report_dict["accuracy"], 4)

# same for the main score
if "micro avg" not in classification_report_dict and main_evaluation_metric[0] == "micro avg":
main_score = classification_report_dict["accuracy"]
else:
main_score = classification_report_dict[main_evaluation_metric[0]][main_evaluation_metric[1]]

else:
# issue error and default all evaluation numbers to 0.
log.error(
"ACHTUNG! No gold labels and no all_predicted_values found! "
"Could be an error in your corpus or how you "
"initialize the trainer!"
# The "micro avg" appears only in the classification report if no prediction is possible.
# Otherwise, it is identical to the "accuracy". In this case, we add it to the report.
if "micro avg" not in classification_report_dict:
classification_report_dict["micro avg"] = {}
for precision_recall_f1 in classification_report_dict["macro avg"]:
classification_report_dict["micro avg"][precision_recall_f1] = classification_report_dict[
"accuracy"
]

detailed_result = (
"\nResults:"
f"\n- F-score (micro) {round(classification_report_dict['micro avg']['f1-score'], 4)}"
f"\n- F-score (macro) {round(classification_report_dict['macro avg']['f1-score'], 4)}"
f"\n- Accuracy {accuracy_score}"
"\n\nBy class:\n" + classification_report
)
accuracy_score = micro_f_score = macro_f_score = main_score = 0.0
classification_report = ""
classification_report_dict = {}

detailed_result = (
"\nResults:"
f"\n- F-score (micro) {micro_f_score}"
f"\n- F-score (macro) {macro_f_score}"
f"\n- Accuracy {accuracy_score}"
"\n\nBy class:\n" + classification_report
)

scores: Dict[Union[Tuple[str, ...], str], Any] = {}
# Create and populate score object for logging with all evaluation values, plus the loss
scores: Dict[Union[Tuple[str, ...], str], Any] = {}

for avg_type in ("micro avg", "macro avg"):
for metric_type in ("f1-score", "precision", "recall"):
if avg_type == "micro avg" and avg_type not in classification_report_dict:
value = classification_report_dict["accuracy"]
for avg_type in ("micro avg", "macro avg"):
for metric_type in ("f1-score", "precision", "recall"):
scores[(avg_type, metric_type)] = classification_report_dict[avg_type][metric_type]

else:
value = classification_report_dict[avg_type][metric_type]
scores["accuracy"] = accuracy_score

scores[(avg_type, metric_type)] = value
if average_over > 0:
eval_loss /= average_over
scores["loss"] = eval_loss.item()

scores["accuracy"] = accuracy_score

if average_over > 0:
eval_loss /= average_over
scores["loss"] = eval_loss.item()
return Result(
main_score=classification_report_dict[main_evaluation_metric[0]][main_evaluation_metric[1]],
detailed_results=detailed_result,
classification_report=classification_report_dict,
scores=scores,
)

result = Result(
main_score=main_score,
detailed_results=detailed_result,
classification_report=classification_report_dict,
scores=scores,
)
else:
# issue error and default all evaluation numbers to 0.
error_text = (
f"ACHTUNG! It was not possible to compute evaluation values because: "
alanakbik marked this conversation as resolved.
Show resolved Hide resolved
f"- The evaluation data has no gold labels for label_type='{gold_label_type}'!"
f"- And no predictions were made!"
"Double check your corpus (if the test split has labels), and how you initialize the ModelTrainer!"
)
log.error(error_text)

return result
return Result(
main_score=0.0,
detailed_results=error_text,
classification_report={},
scores={"loss": 0.0},
)

@abstractmethod
def predict(
Expand Down
2 changes: 1 addition & 1 deletion flair/nn/multitask.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def make_multitask_model_and_corpus(
models.append(map[0])
corpora.append(map[1])
if len(map) == 3:
loss_factors.append(map[2]) # type:ignore[misc] # mypy does not understand if above checks for length
loss_factors.append(map[2])
else:
loss_factors.append(1.0)

Expand Down