-
Notifications
You must be signed in to change notification settings - Fork 57
/
titml_idn.py
138 lines (115 loc) · 5.8 KB
/
titml_idn.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
from pathlib import Path
from typing import List
import datasets
import json
import os
from seacrowd.utils import schemas
from seacrowd.utils.configs import SEACrowdConfig
from seacrowd.utils.constants import Licenses, Tasks, DEFAULT_SOURCE_VIEW_NAME, DEFAULT_SEACROWD_VIEW_NAME
# Module-level metadata for the TITML-IDN loader.

# Dataset identifier.
_DATASETNAME = "titml_idn"
# View-name aliases taken from the shared SEACrowd constants.
_SOURCE_VIEW_NAME = DEFAULT_SOURCE_VIEW_NAME
_UNIFIED_VIEW_NAME = DEFAULT_SEACROWD_VIEW_NAME
_LANGUAGES = ["ind"] # We follow ISO639-3 language code (https://iso639-3.sil.org/code_tables/639/data)
# NOTE(review): presumably marks that the data is downloaded rather than supplied
# from a local directory -- confirm against the SEACrowd loader conventions.
_LOCAL = False
# BibTeX entry handed to `datasets.DatasetInfo(citation=...)` in `TitmlIdn._info`.
_CITATION = """\
@inproceedings{lestari2006titmlidn,
title={A large vocabulary continuous speech recognition system for Indonesian language},
author={Lestari, Dessi Puji and Iwano, Koji and Furui, Sadaoki},
booktitle={15th Indonesian Scientific Conference in Japan Proceedings},
pages={17--22},
year={2006}
}
"""
# Free-text description handed to `datasets.DatasetInfo(description=...)`.
_DESCRIPTION = """\
TITML-IDN (Tokyo Institute of Technology Multilingual - Indonesian) is collected to build a pioneering Indonesian Large Vocabulary Continuous Speech Recognition (LVCSR) System. In order to build an LVCSR system, high accurate acoustic models and large-scale language models are essential. Since Indonesian speech corpus was not available yet, we tried to collect speech data from 20 Indonesian native speakers (11 males and 9 females) to construct a speech corpus for training the acoustic model based on Hidden Markov Models (HMMs). A text corpus which was collected by ILPS, Informatics Institute, University of Amsterdam, was used to build a 40K-vocabulary dictionary and a n-gram language model.
"""
# Homepage and license strings reported through `datasets.DatasetInfo`.
_HOMEPAGE = "http://research.nii.ac.jp/src/en/TITML-IDN.html"
# The license is the generic "others" license value with the usage terms appended.
_LICENSE = Licenses.OTHERS.value + " | For research purposes only. If you use this corpus, you have to cite (Lestari et al, 2006)."
# Archive location; `TitmlIdn._split_generators` downloads and extracts the
# "titml-idn" entry.
_URLs = {"titml-idn": "https://huggingface.co/datasets/holylovenia/TITML-IDN/resolve/main/IndoLVCSR.zip"}
_SUPPORTED_TASKS = [Tasks.SPEECH_RECOGNITION]
# Version strings used for the "source" and "seacrowd_sptext" builder configs
# respectively.
_SOURCE_VERSION = "1.0.0"
_SEACROWD_VERSION = "2024.06.20"
class TitmlIdn(datasets.GeneratorBasedBuilder):
    """TITML-IDN is a speech recognition dataset containing Indonesian speech collected with transcriptions from newspaper and magazine articles.

    Two builder configurations are provided:

    * ``titml_idn_source`` -- examples with ``id``, ``speaker_id``, ``path``,
      ``audio`` and ``text`` fields.
    * ``titml_idn_seacrowd_sptext`` -- the same fields plus a ``metadata``
      dict, using ``schemas.speech_text_features``.
    """

    BUILDER_CONFIGS = [
        SEACrowdConfig(
            name="titml_idn_source",
            version=datasets.Version(_SOURCE_VERSION),
            description="TITML-IDN source schema",
            schema="source",
            subset_id="titml_idn",
        ),
        SEACrowdConfig(
            name="titml_idn_seacrowd_sptext",
            version=datasets.Version(_SEACROWD_VERSION),
            description="TITML-IDN Nusantara schema",
            schema="seacrowd_sptext",
            subset_id="titml_idn",
        ),
    ]

    DEFAULT_CONFIG_NAME = "titml_idn_source"

    def _info(self):
        """Build the ``datasets.DatasetInfo`` for the active configuration.

        Returns:
            A ``datasets.DatasetInfo`` carrying the module-level description,
            homepage, license and citation, plus the feature layout that
            matches ``self.config.schema``.

        Raises:
            ValueError: If ``self.config.schema`` is neither ``"source"`` nor
                ``"seacrowd_sptext"``.
        """
        if self.config.schema == "source":
            features = datasets.Features(
                {
                    "id": datasets.Value("string"),
                    "speaker_id": datasets.Value("string"),
                    "path": datasets.Value("string"),
                    "audio": datasets.Audio(sampling_rate=16_000),
                    "text": datasets.Value("string"),
                }
            )
        elif self.config.schema == "seacrowd_sptext":
            features = schemas.speech_text_features
        else:
            # Without this branch an unknown schema left `features` unbound and
            # surfaced as an UnboundLocalError; fail with the same message that
            # `_generate_examples` uses instead.
            raise ValueError(f"Invalid config: {self.config.name}")

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
            task_templates=[datasets.AutomaticSpeechRecognition(audio_column="audio", transcription_column="text")],
        )

    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
        """Download and extract the archive and expose it as a single TRAIN split.

        Args:
            dl_manager: Download manager used to fetch and extract
                ``_URLs["titml-idn"]``.

        Returns:
            A one-element list with the TRAIN split generator; the extracted
            location is forwarded to ``_generate_examples`` as ``filepath``.
        """
        base_path = dl_manager.download_and_extract(_URLs["titml-idn"])
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"filepath": base_path},
            ),
        ]

    def _generate_examples(self, filepath: Path, n_speakers=20):
        """Yield ``(audio_id, example)`` pairs for every transcribed utterance.

        For each speaker directory ``"01"`` .. ``str(n_speakers).zfill(2)``
        under ``filepath``, the ``script~`` file is read line by line. A line
        produces an example only when the matching ``<audio_id>.wav`` exists in
        the same directory; other lines are skipped silently.

        Args:
            filepath: Root directory that contains the per-speaker
                sub-directories.
            n_speakers: Number of speaker directories to visit, counted
                from 1.

        Yields:
            Tuples of ``(audio_id, example)``. For the ``seacrowd_sptext``
            schema the example also carries a ``metadata`` dict whose
            ``speaker_age`` and ``speaker_gender`` are ``None``.

        Raises:
            ValueError: If ``self.config.schema`` is neither ``"source"`` nor
                ``"seacrowd_sptext"``.
        """
        schema = self.config.schema
        if schema not in ("source", "seacrowd_sptext"):
            raise ValueError(f"Invalid config: {self.config.name}")

        for speaker_index in range(1, n_speakers + 1):
            # Speaker directories are addressed by zero-padded two-digit names.
            speaker_id = str(speaker_index).zfill(2)
            dir_path = os.path.join(filepath, speaker_id)
            transcription_path = os.path.join(dir_path, "script~")

            # The transcript is only read, so open it read-only: "r+" also
            # demands write permission and fails on read-only caches.
            # NOTE(review): decoding relies on the platform default encoding;
            # confirm the transcript encoding and pass it explicitly.
            with open(transcription_path, "r") as f:
                for line in f:
                    # Fixed-offset slicing: characters 2..7 are used as the
                    # utterance id, everything from offset 9 on as the text.
                    audio_id = line[2:8]
                    text = line[9:].strip()
                    wav_path = os.path.join(dir_path, f"{audio_id}.wav")
                    if not os.path.exists(wav_path):
                        continue

                    ex = {
                        "id": audio_id,
                        "speaker_id": speaker_id,
                        "path": wav_path,
                        "audio": wav_path,
                        "text": text,
                    }
                    if schema == "seacrowd_sptext":
                        ex["metadata"] = {
                            "speaker_age": None,
                            "speaker_gender": None,
                        }
                    yield audio_id, ex