Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Tedlium not distinct corpora names for the 3 partitions #490

Open
wants to merge 9 commits into
base: main
Choose a base branch
from
114 changes: 113 additions & 1 deletion datasets/tedlium2.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__all__ = ["DownloadTEDLIUM2CorpusJob", "CreateTEDLIUM2BlissCorpusJob"]
__all__ = ["DownloadTEDLIUM2CorpusJob", "CreateTEDLIUM2BlissCorpusJob", "CreateTEDLIUM2BlissCorpusJobV2"]

import os
import re
Expand Down Expand Up @@ -214,3 +214,115 @@ def load_stm_data(self, stm_file):
text = " ".join(tokens[6:])
data.append([rec_name, channel, spk_name, start, end, gender, text])
return data


class CreateTEDLIUM2BlissCorpusJobV2(CreateTEDLIUM2BlissCorpusJob):
"""
It has the same logic as CreateTEDLIUM2BlissCorpusJob but the corpus name is distinct for each partition
and the common text normalization for eliminating space before apostroph, similarly to Kaldi and ESPNet is applied
"""

def make_stm(self):
def extend_segment_time(seg, prev_seg, next_seg, start_pad=0.15, end_pad=0.1):
start = float(seg[3])
end = float(seg[4])
# start padding (previous seg alread padded)
if prev_seg is not None and seg[0] == prev_seg[0]:
prev_end = float(prev_seg[4])
else:
prev_end = 0.0
start = max(start - start_pad, prev_end)
# end padding (next seg not yet padded and start padding is more important)
next_start = end + end_pad
if next_seg is not None and seg[0] == next_seg[0]:
next_start = max(float(next_seg[3]) - start_pad, end)
end = min(end + end_pad, next_start)
return "%.2f" % (start), "%.2f" % (end)

header = [
";;",
';; LABEL "o" "Overall" "Overall results"',
';; LABEL "f0" "f0" "Wideband channel"',
';; LABEL "f2" "f2" "Telephone channel"',
';; LABEL "male" "Male" "Male Talkers"',
';; LABEL "female" "Female" "Female Talkers"',
";;",
]
for corpus_key in ["train", "dev", "test"]:
f = open("%s.stm" % corpus_key, "w")
f.write("\n".join(header) + "\n")

stm_folder = os.path.join(self.corpus_folders[corpus_key].get_path(), "stm")
for stm_file in sorted(os.listdir(stm_folder)):
if not stm_file.endswith(".stm"):
continue
data = self.load_stm_data(os.path.join(stm_folder, stm_file))
for idx in range(len(data)):
# some text normalization
text = data[idx][6]
text = text.replace("imiss", "i miss")
text = text.replace("uptheir", "up their")
data[idx][6] = text

# train-only: segment boundary non-overlapping extension (kaldi)
if corpus_key == "train":
pre_seg = None if idx == 0 else data[idx - 1]
next_seg = None if idx == len(data) - 1 else data[idx + 1]
data[idx][3], data[idx][4] = extend_segment_time(data[idx], pre_seg, next_seg, 0.15, 0.1)

for seg in data:
f.write(" ".join(seg) + "\n")

f.close()
shutil.move("%s.stm" % corpus_key, self.out_stm_files[corpus_key].get_path())

def make_corpus(self):
"""
Marvin84 marked this conversation as resolved.
Show resolved Hide resolved
create bliss corpus from stm file (always include speakers)
"""
for corpus_key in ["train", "dev", "test"]:
audio_dir = os.path.join(self.corpus_folders[corpus_key].get_path(), "sph")
stm_file = self.out_stm_files[corpus_key].get_path()
data = self.load_stm_data(stm_file)

c = corpus.Corpus()
c.name = f"TED-LIUM-2-{corpus_key}"

speakers = []
last_rec_name = ""
recording = None
for seg in data:
rec_name, channel, spk_name, start, end, gender, text = seg

if not spk_name in speakers:
speakers.append(spk_name)
speaker = corpus.Speaker()
speaker.name = spk_name
if "female" in gender:
speaker.attribs["gender"] = "female"
elif "male" in gender:
speaker.attribs["gender"] = "male"
c.add_speaker(speaker)

if rec_name != last_rec_name:
if recording:
c.add_recording(recording)
recording = corpus.Recording()
recording.name = rec_name
recording.audio = os.path.join(audio_dir, "%s.sph" % rec_name)
last_rec_name = rec_name
seg_id = 1

segment = corpus.Segment()
segment.name = str(seg_id)
segment.start = float(start)
segment.end = float(end)
segment.speaker_name = spk_name
segment.orth = normalized_text

recording.add_segment(segment)
seg_id += 1

# add final recording
c.add_recording(recording)
c.dump(self.out_corpus_files[corpus_key].get_path())
Loading