Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add left/right context orth to lib.corpus #564

Merged
merged 5 commits into from
Dec 9, 2024
Merged
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 42 additions & 9 deletions lib/corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,15 +116,15 @@ def startElement(self, name: str, attrs: Dict[str, str]):
def endElement(self, name: str):
e = self.elements[-1]

if name == "orth":
if name == "orth" or name == "left-context-orth" or name == "right-context-orth":
Icemole marked this conversation as resolved.
Show resolved Hide resolved
assert isinstance(e, Segment)
# we do some processing of the text that goes into the orth tag to get nicer formatting; some corpora may have
# multiline content in the orth tag, but keeping it that way might not be consistent with the indentation during
# writing, thus we collapse runs of spaces and remove newlines
text = self.chars.strip()
text = re.sub(" +", " ", text)
text = re.sub("\n", "", text)
e.orth = text
setattr(e, name.replace("-", "_"), text)
elif isinstance(e, Speaker) and name != "speaker-description":
# we allow all sorts of elements within a speaker description
e.attribs[name] = self.chars.strip()
Expand Down Expand Up @@ -356,15 +356,38 @@ def get_segment_mapping(self) -> Dict[str, Segment]:


class Segment(NamedEntity):
def __init__(self):
def __init__(
self,
Icemole marked this conversation as resolved.
Show resolved Hide resolved
start: float = 0.0,
end: float = 0.0,
track: Optional[int] = None,
orth: Optional[str] = None,
left_context_orth: Optional[str] = None,
right_context_orth: Optional[str] = None,
speaker_name: Optional[str] = None,
recording: Optional[Recording] = None,
):
"""
:param start: Segment start.
:param end: Segment end.
:param track: Segment track/channel.
:param orth: Segment text.
:param left_context_orth: Optional left context when aligning (specific to RASR alignment).
:param right_context_orth: Optional right context when aligning (specific to RASR alignment).
:param speaker_name: Speaker name.
:param recording: Recording in which the segment is embedded.
"""
super().__init__()
self.start = 0.0
self.end = 0.0
self.track: Optional[int] = None
self.orth: Optional[str] = None
self.speaker_name: Optional[str] = None

self.recording: Optional[Recording] = None
self.start = start
self.end = end
self.track = track
self.orth = orth
self.left_context_orth = left_context_orth
self.right_context_orth = right_context_orth
self.speaker_name = speaker_name

self.recording = recording

def fullname(self) -> str:
return self.recording.fullname() + "/" + self.name
Expand All @@ -384,6 +407,16 @@ def dump(self, out: TextIO, indentation: str = ""):
out.write('%s <speaker name="%s"/>\n' % (indentation, self.speaker_name))
if self.orth is not None:
out.write("%s <orth> %s </orth>\n" % (indentation, saxutils.escape(self.orth)))
if self.left_context_orth is not None:
out.write(
"%s <left-context-orth> %s </left-context-orth>\n"
% (indentation, saxutils.escape(self.left_context_orth))
)
if self.right_context_orth is not None:
out.write(
"%s <right-context-orth> %s </right-context-orth>\n"
% (indentation, saxutils.escape(self.right_context_orth))
)
if has_child_element:
out.write("%s</segment>\n" % indentation)
else:
Expand Down
Loading