Skip to content

Commit

Permalink
[!162][SUBTITLING] Add sentence-level reporting for compliance metrics
Browse files Browse the repository at this point in the history
# Why is the change needed?

For the IWSLT 2024 subtitling track, participants may focus solely on subtitling compliance.
To let them check their progress on each sample, we need to provide sentence-level scores.

# What changes does the patch introduce?

Add the `--sentence-level` option to print sentence-level scores for the selected metric.

# How was this patch tested?

Added unit tests.
  • Loading branch information
mgaido91 committed Feb 8, 2024
1 parent de31ff7 commit a0ed786
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 3 deletions.
16 changes: 15 additions & 1 deletion examples/speech_to_text/scripts/subtitle_compliance.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
import numpy as np

_SUPPORTED_METRICS = ['cps', 'cpl', 'lpb']
_VERSION = "1.1"
_VERSION = "1.2"
_CITATION = r"""@article{papi-etal-2023-direct,
title={{Direct Speech Translation for Automatic Subtitling}},
author={Sara Papi and Marco Gaido and Alina Karakanta and Mauro Cettolo and Matteo Negri and Marco Turchi},
Expand Down Expand Up @@ -199,6 +199,10 @@ def report(self, metric: str, upperbound: float, precision: int, quiet: bool, ci
else:
return compliance_metric.json_string(precision)

def report_stats(self, metric: str, precision: int):
    """Yield the sentence-level scores for *metric*, one formatted string per value.

    Args:
        metric: name of the stats attribute to read (e.g. one of the
            supported metrics such as ``cps``, ``cpl``, ``lpb``).
        precision: number of decimal digits used when formatting each score.
    """
    # Reuse the same number formatting as the aggregate report so that
    # sentence-level and corpus-level outputs are consistent.
    fmt = ComplianceMetric._format_number
    for value in getattr(self, metric):
        yield fmt(value, precision)


def main(args):
"""
Expand Down Expand Up @@ -226,6 +230,10 @@ def main(args):

subtitle_stats = SubtitleComplianceStats.from_subtitles(
subtitles, args.remove_parenthesis_content)
if args.sentence_level:
for m in args.metrics:
for stat in subtitle_stats.report_stats(m, args.width):
print(stat)
all_stats.append(subtitle_stats)
if not args.quiet and len(args.srt_file) > 1:
print(f"Compliance metrics for {srt_file}")
Expand Down Expand Up @@ -285,6 +293,9 @@ def main(args):
help='confidence intervals with 95% confidence level using bootstrap resampling '
f'({_BOOTSTRAP_NUM_SAMPLES} samples). The number of samples can be customized by '
'setting the environment variable BOOTSTRAP_NUM_SAMPLES.')
parser.add_argument(
'--sentence-level', '-sl', action='store_true', default=False,
help='Print metrics for each sentence. Added in v1.2.')

# Text preprocessing
parser.add_argument(
Expand All @@ -302,4 +313,7 @@ def main(args):
print("--srt-file is required")
parser.print_usage()
exit(1)
if parsed_args.sentence_level and len(parsed_args.metrics) > 1:
print("Only one metric can be used in sentence-level mode.")
exit(1)
main(parsed_args)
16 changes: 14 additions & 2 deletions fbk_uts/subtitles/test_subtitle_compliance.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ def test_basic(self):
"stdev": 7.11,
"total": 8.00,
"compliant": 1.00,
"version": "1.1"
"version": "1.2"
}""")
self.assertEqual(cpl_metric.score_string(2), "CPL: 12.50%")

Expand Down Expand Up @@ -115,9 +115,21 @@ def test_confidence_interval(self):
"stdev": 7.11,
"total": 8.00,
"compliant": 1.00,
"version": "1.1"
"version": "1.2"
}""")

def test_report_scores(self):
    """Sentence-level reporting yields one formatted score per unit."""
    subtitles = self.get_example_content("sample_de_01.srt")
    stats = SubtitleComplianceStats.from_subtitles(subtitles)
    # CPL: one score per subtitle line; spot-check first and last values.
    cpl_scores = list(stats.report_stats("cpl", 2))
    self.assertEqual(len(cpl_scores), 8)
    self.assertEqual(cpl_scores[0], "40.00")
    self.assertEqual(cpl_scores[-1], "39.00")
    # CPS: one score per subtitle block; spot-check first and last values.
    cps_scores = list(stats.report_stats("cps", 2))
    self.assertEqual(len(cps_scores), 4)
    self.assertEqual(cps_scores[0], "13.79")
    self.assertEqual(cps_scores[-1], "16.59")


# Allow running this test module directly (e.g. `python test_subtitle_compliance.py`).
if __name__ == '__main__':
    unittest.main()

0 comments on commit a0ed786

Please sign in to comment.