forked from resemble-ai/Resemblyzer
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdemo03_projection.py
31 lines (23 loc) · 1.23 KB
/
demo03_projection.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
from resemblyzer import preprocess_wav, VoiceEncoder
from demo_utils import *
from itertools import groupby
from pathlib import Path
from tqdm import tqdm
import numpy as np
# DEMO 03: we'll show one way to visualize these utterance embeddings. Since they are
# 256-dimensional, it is much simpler for us to get an overview of their manifold if we reduce
# their dimensionality first. By doing so, we can observe clusters that form for utterances of
# identical characteristics. What we'll see is that clusters form for distinct speakers,
# and they are very tight and even linearly separable.
## Gather the wavs
# Path.glob yields files in an arbitrary, filesystem-dependent order; sorting the paths makes
# the order of the utterances (and thus of the embeddings) reproducible between runs.
data_dir = Path("audio_data", "librispeech_test-other")
wav_fpaths = sorted(data_dir.glob("**/*.flac"))
if not wav_fpaths:
    # Fail early with a clear message rather than deep inside the projection code.
    raise FileNotFoundError(f"No .flac files found under {data_dir}")
# The name of the directory that directly contains an utterance is used as its speaker label.
speakers = [wav_fpath.parent.stem for wav_fpath in wav_fpaths]
# The preprocessed waveforms are presumably of different lengths, so they cannot be stacked
# into a regular array: np.array(list_of_wavs) raises a ValueError for ragged input on
# NumPy >= 1.24. A 1-D object array filled element by element holds one waveform per slot
# regardless of their lengths.
wavs = np.empty(len(wav_fpaths), dtype=object)
for i, wav_fpath in enumerate(tqdm(wav_fpaths, "Preprocessing wavs", len(wav_fpaths))):
    wavs[i] = preprocess_wav(wav_fpath)
# Group the utterance indices by speaker. A plain dict is used instead of itertools.groupby
# because groupby only merges *consecutive* equal keys: if the utterances of a speaker were
# not contiguous, the later runs would silently overwrite the earlier ones.
speaker_indices = {}
for i, speaker in enumerate(speakers):
    speaker_indices.setdefault(speaker, []).append(i)
speaker_wavs = {speaker: wavs[indices] for speaker, indices in speaker_indices.items()}
## Compute the embeddings
encoder = VoiceEncoder()
# One embedding per utterance, in the same order as <speakers>.
utterance_embeds = np.array([encoder.embed_utterance(wav) for wav in wavs])
## Project the embeddings in 2D space
plot_projections(utterance_embeds, speakers, title="Embedding projections")
plt.show()