-
Notifications
You must be signed in to change notification settings - Fork 2
/
textmetrics.py
76 lines (61 loc) · 1.94 KB
/
textmetrics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import subtitleprocessing
import numpy as np
def get_avg_wpm(srt_file):
subs = subtitleprocessing.srt_to_sub(srt_file)
wpms = []
start_time = subs[0].start
n_words = 0
for sub in subs:
n_words += len(sub.content.split())
time = (sub.end - start_time).total_seconds()
if time > 5:
wpm = n_words/time*60
wpms.append(wpm)
n_words = 0
start_time = sub.end
mean = np.mean(wpms)
if np.isnan(mean):
return 0
else:
return mean
def get_wpm_variation(srt_file):
subs = subtitleprocessing.srt_to_sub(srt_file)
wpms = []
start_time = subs[0].start
n_words = 0
for sub in subs:
n_words += len(sub.content.split())
time = (sub.end - start_time).total_seconds()
if time > 5:
wpm = n_words/time*60
wpms.append(wpm)
n_words = 0
start_time = sub.end
rms = np.std(wpms)
mean = np.mean(wpms)
if np.isnan(rms) or np.isnan(mean) or mean == 0:
return 0
else:
return rms/mean*100
def get_filler_word_rate(srt_file, txt_file):
words_to_check = ['so','like','um','okay','just']
subs = subtitleprocessing.srt_to_sub(srt_file)
total_time = (subs[-1].end - subs[0].start).total_seconds()
with open(txt_file, 'r') as myfile:
txt=myfile.read()
txt_words = txt.split()
num_fillers = 0
for word in words_to_check:
num_fillers += txt_words.count(word)
return num_fillers/(total_time/60.)
def get_repeated_word_rate(srt_file, txt_file):
subs = subtitleprocessing.srt_to_sub(srt_file)
total_time = (subs[-1].end - subs[0].start).total_seconds()
with open(txt_file, 'r') as myfile:
txt=myfile.read()
txt_words = txt.split()
num_repeats = 0
for j,word in enumerate(txt_words[:-1]):
if word == txt_words[j+1]:
num_repeats += 1
return num_repeats/(total_time/60.)