rwth-i6 · curufinwe · Dec 9, 2024 · Dec 9, 2024 · Dec 9, 2024 · Dec 9, 2024
diff --git a/lib/corpus.py b/lib/corpus.py
@@ -116,15 +116,15 @@ def startElement(self, name: str, attrs: Dict[str, str]):
     def endElement(self, name: str):
         e = self.elements[-1]
 
-        if name == "orth":
+        if name == "orth" or name == "left-context-orth" or name == "right-context-orth":
             assert isinstance(e, Segment)
             # We post-process the text of the orth tag (and of the left/right context orth tags) to get nicer
             # formatting. Some corpora have multiline content in these tags, but keeping it that way might not be
             # consistent with the indentation used during writing, thus we remove multiple spaces and newlines.
             text = self.chars.strip()
             text = re.sub(" +", " ", text)
             text = re.sub("\n", "", text)
-            e.orth = text
+            setattr(e, name.replace("-", "_"), text)
         elif isinstance(e, Speaker) and name != "speaker-description":
             # we allow all sorts of elements within a speaker description
             e.attribs[name] = self.chars.strip()
@@ -356,15 +356,38 @@ def get_segment_mapping(self) -> Dict[str, Segment]:
 
 
 class Segment(NamedEntity):
-    def __init__(self):
+    def __init__(
+        self,
+        start: float = 0.0,
+        end: float = 0.0,
+        track: Optional[int] = None,
+        orth: Optional[str] = None,
+        left_context_orth: Optional[str] = None,
+        right_context_orth: Optional[str] = None,
+        speaker_name: Optional[str] = None,
+        recording: Optional[Recording] = None,
+    ):
+        """
+        :param start: Segment start.
+        :param end: Segment end.
+        :param track: Segment track/channel.
+        :param orth: Segment text.
+        :param left_context_orth: Optional left context when aligning (specific for RASR alignment).
+        :param right_context_orth: Optional right context when aligning (specific for RASR alignment).
+        :param speaker_name: Speaker name.
+        :param recording: Recording in which the segment is embedded.
+        """
         super().__init__()
-        self.start = 0.0
-        self.end = 0.0
-        self.track: Optional[int] = None
-        self.orth: Optional[str] = None
-        self.speaker_name: Optional[str] = None
 
-        self.recording: Optional[Recording] = None
+        self.start = start
+        self.end = end
+        self.track = track
+        self.orth = orth
+        self.left_context_orth = left_context_orth
+        self.right_context_orth = right_context_orth
+        self.speaker_name = speaker_name
+
+        self.recording = recording
 
     def fullname(self) -> str:
         return self.recording.fullname() + "/" + self.name
@@ -384,6 +407,16 @@ def dump(self, out: TextIO, indentation: str = ""):
             out.write('%s  <speaker name="%s"/>\n' % (indentation, self.speaker_name))
         if self.orth is not None:
             out.write("%s  <orth> %s </orth>\n" % (indentation, saxutils.escape(self.orth)))
+        if self.left_context_orth is not None:
+            out.write(
+                "%s  <left-context-orth> %s </left-context-orth>\n"
+                % (indentation, saxutils.escape(self.left_context_orth))
+            )
+        if self.right_context_orth is not None:
+            out.write(
+                "%s  <right-context-orth> %s </right-context-orth>\n"
+                % (indentation, saxutils.escape(self.right_context_orth))
+            )
         if has_child_element:
             out.write("%s</segment>\n" % indentation)
         else: