# transcript_loader.py
import json
import logging
import re
from pathlib import Path

import yaml

def load_transcripts(asset: Path, threeplay: Path) -> list[dict]:
    """Load all of the transcript data for a single asset and return it as a
    list of dicts, each tagged with its permutation ('variant') tuple."""
    transcripts = []
    if not asset.is_dir():
        return transcripts
    if not (asset / "metadata.yaml").exists():
        logging.warning(f"Skipping {asset.name} because there's no metadata.yaml")
        return transcripts
    if asset.name.endswith(".ignore"):
        logging.warning(f"Ignoring {asset}")
        return transcripts
    # load the base metadata
    with open(asset / "metadata.yaml") as f:
        base_meta = yaml.safe_load(f)
    # there's some weirdness where we have "(alternate)" in some of the
    # physical media names. Let's remove that.
    base_meta['physical format'] = base_meta['physical format'].replace('(alternate)', '').strip()
    # we're going to assume that every media file in this asset has been
    # normalized, so we'll use that as the basis for gathering the
    # transcripts
    for nfile in asset.glob("*.normalization.yaml"):
        base_filename = nfile.name
        # remove all of the suffixes, but keep a ".high" marker if present.
        for s in reversed(nfile.suffixes):
            base_filename = base_filename[0:-len(s)]
        if '.high' in nfile.name:
            base_filename += ".high"
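        # e.g. a hypothetical "tape1.high.normalization.yaml" strips down to
        # "tape1" and then gets ".high" re-appended, giving "tape1.high"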
        # It's great that we're normalized, but without a 3play transcript
        # this is all for nothing. Let's get it if it exists, otherwise move
        # on to the next one.
        threeplay_base = base_filename.replace('.', '_')
        if not (threeplay / f"{threeplay_base}.json").exists():
            logging.warning(f"Skipping {asset.name}/{threeplay_base} since there isn't a corresponding 3play transcript")
            continue
        # load the 3play transcript
        threeplay_transcript = load_3play_json(threeplay / f"{threeplay_base}.json")
        # load the normalization metadata
        with open(nfile) as f:
            normalization_meta = yaml.safe_load(f)
        # get the transcript data for each whisper transcript variation
        for tfile in asset.glob(f"{base_filename}.whisper.*.json"):
            whisper_transcript = load_whisper_json(tfile)
            # now that we've collected everything, add it to the list of transcripts
            data = {
                'title': base_meta['title'],
                'physical_format': base_meta['physical format'],
                'content_type': base_meta['content type'],
                'base_filename': base_filename,
                'whisper_model': whisper_transcript['model'],
                'previous_text': whisper_transcript['previous_text'],
                'audio_filter': whisper_transcript['audio_filter'],
                'original_duration': normalization_meta['original_duration'],
                'truncated_duration': normalization_meta['truncated_duration'],
                'whisper_processing_duration': whisper_transcript['runtime'],
                'whisper_transcript': whisper_transcript['text'],
                '3play_transcript': threeplay_transcript,
                # things we'll need later that we might as well compute here.
                'processing_ratio': whisper_transcript['runtime'] / normalization_meta['truncated_duration'],
                # e.g. ('medium', 'none', True) -- hypothetical values
                'variant': (whisper_transcript['model'], whisper_transcript['audio_filter'], whisper_transcript['previous_text']),
            }
            transcripts.append(data)
    return transcripts

def load_3play_json(file: Path):
    """Load the 3play json file and convert it to plain text without
    audio descriptions or diarization"""
    # 'words' is a list of tuples where I'm assuming the first value is a
    # timestamp of some sort and the 2nd is the word at that timestamp.
    # 'paragraphs' is a list of timestamps where each paragraph starts.
    # 'speakers' is a dict of timestamp -> speaker
    # audio descriptions are single words which match "^\[[A-Z ]+\]$"
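    # A minimal sketch of that assumed shape (all values invented for
    # illustration):
    #   {"words": [["1080", "Hello"], ["1500", "there."]],
    #    "paragraphs": [1080],
    #    "speakers": {"1020": "Speaker 1"}}
    # which this function would render as the single paragraph "Hello there."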
    with open(file) as f:
        # the file is JSON, so load it with the json module
        data = json.load(f)
    # filter out words we don't care about
    words = [x for x in data['words'] if x[1] != '']  # empty words
    words = [x for x in words if x[0] not in data['speakers']]  # speaker tokens
    words = [[x[0], re.sub(r'\[\?', ' ', x[1])] for x in words]  # remove leading ambiguity marker
    words = [[x[0], re.sub(r'\?\]', ' ', x[1])] for x in words]  # remove trailing ambiguity marker
    words = [[x[0], re.sub(r'\[.*?\]', ' ', x[1])] for x in words]  # remove sound annotation
    words = [[x[0], re.sub(r'</?i>', ' ', x[1])] for x in words]  # remove the italic markers
    words = [[x[0], re.sub(r'\([A-Z\s]*\)', ' ', x[1])] for x in words]  # remove parenthesized annotations
    # convert the timestamp to an integer
    words = [[int(x[0]), x[1]] for x in words]
    # split into paragraphs
    paragraphs = [[]]
    # sentinel timestamp so the last real paragraph has an end bound
    data['paragraphs'].append(99999999999)
    for p in range(0, len(data['paragraphs']) - 1):
        p_start = data['paragraphs'][p]
        p_end = data['paragraphs'][p + 1]
        for w in words:
            if p_start <= w[0] < p_end:
                paragraphs[-1].append(w[1])
        paragraphs.append([])
    return "\n".join([' '.join(x) for x in paragraphs if len(x)])

def load_whisper_json(file: Path, use_text=False,
                      ignore_zero_words=True,
                      ignore_annotations=True) -> dict:
    """Load a whisper transcript file and convert it to the data structure
    we need for processing."""
    with open(file) as f:
        raw = json.load(f)
    xscript = raw['_job']
    if use_text:
        xscript['text'] = raw['text']
    else:
        # normally I just use the text that's generated by whisper, but let's
        # create the text manually and skip any words with 0 duration...
        text = ""
        words = 0
        duration = 0
        empty_words = 0
        discarded_words = 0
        # per the internet, people speak 110 - 170 words per minute in
        # English, so a slow-speech rate could be used as a cutoff for words
        # that may be really long hallucinations (I've seen 29 second words
        # in whisper and that's clearly wrong). BUT, whisper will sometimes
        # mis-time the words, so it's not really clear which things are
        # hallucinations and which ones aren't. The cutoff here is a very
        # generous 30 words per minute, i.e. 2 seconds per word.
        word_duration_cutoff = 60 / 30
        # with a 2s word cutoff and a 0.5 confidence cutoff, it was too aggressive
        confidence_cutoff = 0.5
        for s in raw['segments']:
            # whisper sound annotations start with '[' for the whole segment, so
            # we can drop the segment if we match that.
            if ignore_annotations and (s['text'].startswith(' [') or
                                       '(*' in s['text']):
                logging.debug(f"Removing sound annotation: {s['text']}")
                continue
            for position, w in enumerate(s['words']):
                words += 1
                word_duration = w['end'] - w['start']
                duration += word_duration
                if ignore_zero_words and word_duration == 0:
                    empty_words += 1
                    continue
                # get rid of music symbol.
                w['word'] = w['word'].replace('♪', ' ')
                # fix OKAY -> OK
                w['word'] = re.sub(r"\bokay\b", 'OK', w['word'], flags=re.IGNORECASE)
                # NOTE: this long-word filter is intentionally disabled
                # ("if False") because it was too aggressive; it's left in
                # place for reference.
                if False and word_duration > word_duration_cutoff:
                    # compute confidence score
                    confidence = w['probability']  # * (word_duration_cutoff / word_duration)
                    logging.info(f"Discarding word '{w['word']}'@{position} {w['probability']*100:0.3f}% {word_duration:0.3f}s/{word_duration_cutoff:0.3f}s, confidence {confidence * 100:0.3f}%")
                    # if confidence > confidence_cutoff:
                    text += w['word']
                    discarded_words += 1
                    continue
                text += w['word']
        # max() guards against a transcript with no words at all
        logging.debug(f"Whisper text stats for {file}: {words} words, average {duration/max(words, 1):0.3f} seconds per word, {empty_words} were empty, {discarded_words} were discarded for being longer than {word_duration_cutoff:0.3f}s")
        xscript['text'] = text
    return xscript
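
# A minimal usage sketch, assuming asset directories live under "assets/" and
# the 3play JSON exports live under "3play/" (both paths are hypothetical):
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    threeplay_dir = Path("3play")
    all_transcripts = []
    for asset_dir in sorted(Path("assets").iterdir()):
        # non-directories and assets without metadata.yaml come back empty
        all_transcripts.extend(load_transcripts(asset_dir, threeplay_dir))
    logging.info(f"Loaded {len(all_transcripts)} transcript variants")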