Add more semantic metrics (#12)
The following are added (see the usage sketch after the list):
- `evalem.metrics.BleuMetric`
- `evalem.metrics.SacreBleuMetric`
- `evalem.metrics.MeteorMetric`
- `evalem.metrics.RougeMetric`
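
A minimal usage sketch for the newly added metrics, mirroring the Usage blocks in their docstrings below. Here predictions and references are assumed to be plain lists of strings; other input formats accepted by the underlying jury backend are not shown.

from evalem.metrics import BleuMetric, MeteorMetric, RougeMetric, SacreBleuMetric

# Toy data; in practice these come from a model and the evaluation dataset.
predictions = ["The cat sat on the mat."]
references = ["A cat was sitting on the mat."]

for metric_cls in (BleuMetric, SacreBleuMetric, MeteorMetric, RougeMetric):
    metric = metric_cls()
    # Each metric is callable and returns a mapping keyed by the metric name
    # (e.g. "bleu", "sacrebleu", "meteor", "rouge"), as exercised in the tests below.
    results = metric(predictions=predictions, references=references)
    print(results)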
NISH1001 authored Mar 27, 2023
1 parent 9365268 commit a9a5a74
Showing 5 changed files with 151 additions and 8 deletions.
10 changes: 9 additions & 1 deletion evalem/metrics/__init__.py
@@ -9,4 +9,12 @@
PrecisionMetric,
RecallMetric,
)
from .semantics import BartScore, BertScore, SemanticMetric
from .semantics import (
BartScore,
BertScore,
BleuMetric,
MeteorMetric,
RougeMetric,
SacreBleuMetric,
SemanticMetric,
)
83 changes: 79 additions & 4 deletions evalem/metrics/semantics.py
@@ -11,18 +11,18 @@
EvaluationReferenceInstance,
MetricOutput,
)
from ._base import JuryBasedMetric
from ._base import JuryBasedMetric, Metric


class SemanticMetric(JuryBasedMetric):
class SemanticMetric(Metric):
"""
Metric representing a semantic score between predictions and references.
"""

pass


class BertScore(SemanticMetric):
class BertScore(JuryBasedMetric, SemanticMetric):
"""
Uses a BERT model to compute the semantic similarity using the contextual
embeddings from the model.
@@ -104,7 +104,7 @@ def compute(
return result


class BartScore(SemanticMetric):
class BartScore(JuryBasedMetric, SemanticMetric):
"""
This uses BART model (an encoder-decoder model) to compute the
semantic similarity.
@@ -167,6 +167,81 @@ def compute(
}


class BleuMetric(JuryBasedMetric, SemanticMetric):
"""
Bilingual Evaluation Understudy (BLEU) is generally used to evaluate
machine-translated text.
References:
- https://en.wikipedia.org/wiki/BLEU
- https://aclanthology.org/P02-1040.pdf
Usage:
.. code-block:: python
from evalem.metrics import BleuMetric
metric = BleuMetric()
results = metric(predictions=predictions, references=references)
"""

def __init__(self) -> None:
super().__init__(metrics="bleu")


class SacreBleuMetric(JuryBasedMetric, SemanticMetric):
"""
SacreBLEU: a standardized, reproducible BLEU implementation.
Reference: https://github.com/mjpost/sacrebleu
"""

def __init__(self) -> None:
super().__init__(metrics="sacrebleu")


class MeteorMetric(JuryBasedMetric, SemanticMetric):
"""
Metric for Evaluation of Translation with Explicit ORdering.
References:
- https://en.wikipedia.org/wiki/METEOR
- https://www.cs.cmu.edu/~alavie/METEOR/pdf/Banerjee-Lavie-2005-METEOR.pdf
- https://arxiv.org/abs/2109.14250
Usage:
.. code-block:: python
from evalem.metrics import MeteorMetric
metric = MeteorMetric()
results = metric(predictions=predictions, references=references)
"""

def __init__(self) -> None:
super().__init__(metrics="meteor")


class RougeMetric(JuryBasedMetric, SemanticMetric):
"""
Recall-Oriented Understudy for Gisting Evaluation.
References:
- https://en.wikipedia.org/wiki/ROUGE_(metric)
- https://aclanthology.org/W04-1013.pdf
- https://arxiv.org/abs/2109.14250
Usage:
.. code-block:: python
from evalem.metrics import RougeMetric
metric = RougeMetric()
results = metric(predictions=predictions, references=references)
"""

def __init__(self) -> None:
super().__init__(metrics="rouge")


def main():
pass
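
The new classes share one extension pattern: subclass both JuryBasedMetric and SemanticMetric, then forward the backend metric name to the parent constructor. Below is a hedged sketch of adding another jury-backed metric the same way; TerMetric and the "ter" metric name are illustrative assumptions, not part of this commit.

class TerMetric(JuryBasedMetric, SemanticMetric):
    """
    Translation Edit Rate (TER) -- hypothetical example following the
    same pattern as the metrics added in this commit.
    """

    def __init__(self) -> None:
        super().__init__(metrics="ter")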

8 changes: 6 additions & 2 deletions evalem/pipelines/defaults.py
@@ -64,10 +64,14 @@ def run(
References/ground-truths to be used for evaluation.
See `evalem.metrics` for more information.
"""
predictions = self.model(inputs, **kwargs)
predictions = self.model(inputs, **kwargs.get("model_params", {}))
return list(
map(
lambda e: e(predictions=predictions, references=references),
lambda e: e(
predictions=predictions,
references=references,
**kwargs.get("eval_params", {}),
),
self.evaluators,
),
)
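
A hedged sketch of how the updated kwargs routing might be exercised. Only the model_params/eval_params forwarding and the run(...) method come from this hunk; the pipeline class name, its import path and constructor arguments, and the placeholder model/inputs/references are assumptions for illustration.

from evalem.metrics import BleuMetric, RougeMetric
from evalem.pipelines import SimpleEvaluationPipeline  # class name/import path assumed

# model, inputs, and references are placeholders for an evalem-wrapped model,
# its raw inputs, and the ground-truth references.
pipeline = SimpleEvaluationPipeline(model=model, evaluators=[BleuMetric(), RougeMetric()])
results = pipeline.run(
    inputs,
    references=references,
    model_params={"batch_size": 8},  # forwarded to self.model(inputs, ...)
    eval_params={},                  # forwarded to each evaluator call
)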
1 change: 1 addition & 0 deletions requirements.txt
@@ -5,6 +5,7 @@ loguru==0.6.0
numpy==1.24.2
pandas==1.5.3
pytest==7.2.1
sacrebleu==2.3.1
scikit-learn==1.2.1
sentencepiece==0.1.97
seqeval==1.2.2
57 changes: 56 additions & 1 deletion tests/metrics/test_semantics.py
@@ -1,7 +1,16 @@
# flake8: noqa
#!/usr/bin/env python3

from evalem.metrics import BartScore, BertScore
from pprint import pprint

from evalem.metrics import (
BartScore,
BertScore,
BleuMetric,
MeteorMetric,
RougeMetric,
SacreBleuMetric,
)

from ._base import BaseMetricTest, predictions, references

@@ -20,3 +29,49 @@ class TestBartScore(BaseMetricTest):

def test_metric_score(self, metric_result):
assert -10 <= metric_result[self._key]["score"] <= 10


class TestBleuMetric(BaseMetricTest):
_metric_cls = BleuMetric
_key = "bleu"

def test_metric_score(self, metric_result):
pprint(metric_result)
assert 0 <= metric_result[self._key]["score"] <= 1


class TestSacreBleuMetric(BaseMetricTest):
_metric_cls = SacreBleuMetric
_key = "sacrebleu"

def test_metric_score(self, metric_result):
pprint(metric_result)
assert 0 <= metric_result[self._key]["score"] <= 1


class TestMeteorMetric(BaseMetricTest):
_metric_cls = MeteorMetric
_key = "meteor"

def test_metric_score(self, metric_result):
pprint(metric_result)
assert 0 <= metric_result[self._key]["score"] <= 1


class TestRougeMetric(BaseMetricTest):
_metric_cls = RougeMetric
_key = "rouge"

def test_metric_return_keys(self, metric_result):
assert self._key in metric_result
assert "rouge1" in metric_result[self._key]
assert "rouge2" in metric_result[self._key]
assert "rougeL" in metric_result[self._key]
assert "rougeLsum" in metric_result[self._key]

def test_metric_score(self, metric_result):
pprint(metric_result)
assert 0 <= metric_result[self._key]["rouge1"] <= 1
assert 0 <= metric_result[self._key]["rouge2"] <= 1
assert 0 <= metric_result[self._key]["rougeL"] <= 1
assert 0 <= metric_result[self._key]["rougeLsum"] <= 1
