Updated script to better work with slurm #39

Open · wants to merge 47 commits into base: main

Changes from 19 commits

Commits (47)
da2b7d9
added new dataset
Jun 24, 2022
b7f5044
added CoVoST
Jun 27, 2022
014f519
removed duplicate dataset
Jun 27, 2022
50305e4
added to do list
Jun 29, 2022
0d911c5
First commit
Jun 29, 2022
ef4dfcb
_
Jun 29, 2022
59c0f49
Updated yaml to work with slurm
Jul 7, 2022
4147b41
merged laion-main and fixed conflicts
Jul 7, 2022
fb95086
move yml file back to root as it is being used by other scripts outsi…
Jul 7, 2022
0ac7933
_
Jul 7, 2022
6d6f929
renamed file
Jul 8, 2022
eacade6
removed unwanted text
Jul 8, 2022
43af9a7
_
Jul 11, 2022
a1576fe
First commit
Jul 11, 2022
4e34bd3
added clause to avoid duplicate calculation
Jul 11, 2022
68ca8ff
Script to process CREMA-D
Jul 12, 2022
caf739a
fixed issue where files were saved in wrong area
Jul 12, 2022
890d63f
script to download appropriate tsv
Jul 13, 2022
1e8f87b
Added caching of dfs to avoid recomputing the same sf multiple time i…
Jul 13, 2022
7bf8ede
fixed error where .json did not conform with audio-dataset
Aug 2, 2022
a5941ba
processing setup for en-X and x_en
Aug 2, 2022
2f9951e
processing setup for en-X and x_en
Aug 2, 2022
2c3036d
setup for eng to X
Aug 2, 2022
fb5d9b7
Fixed json
Aug 11, 2022
68e37c0
modified to process given language
Aug 15, 2022
8ca6509
Fixed bug when a trailing / is needed
Aug 16, 2022
a664c0d
processing code for EmoV-DB
Aug 16, 2022
5323d86
processing and cleaning old files
Aug 16, 2022
bc1fbad
Improved multiprocessing workload, processing improved from 3 it/s to …
Aug 22, 2022
678106c
Merge branch 'LAION-AI:main' into main
knoriy Oct 7, 2022
6db9c08
Merge branch 'LAION-AI:main' into main
knoriy Oct 30, 2022
c0b7bec
add: processor for cv11.0
Dec 5, 2022
4d545dd
Merge branch 'main' of github.com:knoriy/audio-dataset
Dec 5, 2022
812e399
add: script to process common voice
Dec 5, 2022
00a7537
Merge branch 'LAION-AI:main' into main
knoriy Dec 5, 2022
1c07775
updated cmd
Jan 13, 2023
ded62af
Merge branch 'main' of github.com:knoriy/audio-dataset
Jan 13, 2023
850b120
automated extractors and updated text
Jan 13, 2023
a7265cf
fix: fixed bug where language was not set correctly, being set as train,test…
Jan 16, 2023
0e3ca94
Merge pull request #2 from LAION-AI/main
knoriy May 4, 2023
3eb72e7
updated to include audioset descriptions
May 4, 2023
bb85a09
backup
Jun 19, 2023
0544617
backup
Jun 19, 2023
e1e3643
updated json file text to include metadata
Jul 5, 2023
6045d5d
Added meta
Jul 5, 2023
a919011
upload to s3
Jul 5, 2023
3fc2359
added emns dataset
knoriy Jul 7, 2023
3 changes: 2 additions & 1 deletion .gitignore
@@ -133,4 +133,5 @@ dmypy.json
utils/__pycache__/make_tar_utils.cpython-37.pyc
/data_preprocess/process_audioset/

*.out
*.out
test.*py*
6 changes: 6 additions & 0 deletions current_dataset/ToDO.md
@@ -0,0 +1,6 @@
# ToDo

- [X] LJSpeech
- [X] MSWC
- [ ] GigaSpeech
- [ ] CoVoST
122 changes: 122 additions & 0 deletions current_dataset/preprocess_CREMA-D.py
@@ -0,0 +1,122 @@
"""
Code for preprocess GigaSpeech Corpus:
https://github.com/SpeechColab/GigaSpeech
"""

import glob
import tqdm
import os
import pandas as pd
import sys
import json
import shutil
import fsspec

from sklearn.model_selection import train_test_split
from concurrent.futures import ThreadPoolExecutor, as_completed

sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
from utils.audio_utils import audio_to_flac
from utils.make_tar_utils import tardir

def convert_and_json_dump(file:str, dest:str, df, overwrite:bool=False, verbose:bool=False):
    if os.path.isfile(dest) and not overwrite:
        if verbose:
            print(f'{dest} already exists, skipping')
        return
audio_to_flac(file, dest)
with open(dest.replace('.flac', '.json'), 'w') as f:
json.dump({'filename': os.path.join(*dest.split('/')[5:]), 'text':df['text']}, f)


def split_all_audio_files(df, dest_root_path, max_workers=96):
if not os.path.exists(dest_root_path):
raise FileNotFoundError(f'Please Check {dest_root_path} exists')

l = len(df)
with tqdm.tqdm(total=l, desc=f'Processing {dest_root_path}') as pbar:
with ThreadPoolExecutor(max_workers=max_workers) as executor:
threads = [executor.submit(convert_and_json_dump, row[0], os.path.join(dest_root_path, f'{i}.flac'), row) for i, row in enumerate(df.iloc())]
for _ in as_completed(threads):
pbar.update(1)

def create_df(root_path:str, dataset_name:str=None):
wavs = glob.glob(os.path.join(root_path, '**/*.wav'), recursive=True)
codes = { 'Statement':{ 'IEO':"It's eleven o'clock",
'TIE':"That is exactly what happened",
'IOM':"I'm on my way to the meeting",
'IWW':"I wonder what this is about",
'TAI':"The airplane is almost full",
'MTI':"Maybe tomorrow it will be cold",
'IWL':"I would like a new alarm clock",
'ITH':"I think I have a doctor's appointment",
'DFA':"Don't forget a jacket",
'ITS':"I think I've seen this before",
'TSI':"The surface is slick",
'WSI':"We'll stop in a couple of minutes",
},
              'Emotion':{ 'ANG':'angry',
'DIS':'disgusted',
'FEA':'fearful',
'HAP':'happy',
'NEU':'neutral',
'SAD':'sad',
},
'Emotional intensity':{ 'LO':'Low',
'MD':'Medium',
'HI':'High',
'XX':'Unspecified',
},
}
demographics = pd.read_csv('/home/knoriy/fsx/raw_datasets/CREMA-D/VideoDemographics.csv', names=["ActorID","Age","Sex","Race","Ethnicity"])
df_data = []
for wav in tqdm.tqdm(wavs):
file_name = os.path.basename(wav).split('.')[0]
wav_codes = file_name.split('_')
text_meta = [codes['Statement'][wav_codes[1]], codes['Emotion'][wav_codes[2]], codes['Emotional intensity'][wav_codes[3]]]
        demographics_meta = demographics.loc[demographics['ActorID'] == wav_codes[0]]

        male_or_female = 'woman' if demographics_meta["Sex"].values[0] == 'Female' else 'man'
        intensity = '' if text_meta[2] == 'Unspecified' else f'and {text_meta[2]} '
        text = f'A {male_or_female} saying "{text_meta[0]}" in a {text_meta[1]} {intensity}voice.'
        df_data.append({ 'path':wav, 'text':text, 'tag':{'transcript':text_meta[0], 'emotion':text_meta[1], 'gender':demographics_meta["Sex"].values[0], 'age':demographics_meta["Age"].values[0] }})

return pd.DataFrame(df_data)


if __name__ == '__main__':
import multiprocessing

max_workers = multiprocessing.cpu_count()
print("Num workers: ", max_workers)
chunk = 512

root_path = '/home/knoriy/fsx/raw_datasets/CREMA-D/AudioWAV/'
dataset_name = 'CREMA-D'

s3 = fsspec.filesystem('s3')
s3_dest = f's-laion/knoriy/{dataset_name}/{dataset_name}_tars/'

# load metadata and configure audio paths
df = create_df(root_path)

# create train, test, valid splits
train, test = train_test_split(df, test_size=0.2)
valid, test = train_test_split(test, test_size=0.2)
train_test_val = {'valid/':valid, 'train/':train, 'test/':test}


    for key in tqdm.tqdm(train_test_val, desc='processing:'):
df = train_test_val[key]

dest_path = os.path.join(root_path.replace('raw_datasets', 'processed_datasets').replace('AudioWAV/', ''), key)
os.makedirs(dest_path, exist_ok=True)

split_all_audio_files(df, dest_path)
tardir(dest_path, dest_path, chunk, delete_file=True)

# upload to s3 and delete local
s3.put(dest_path, os.path.join(s3_dest, key), recursive=True)
shutil.rmtree(dest_path)
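For reference, a minimal sketch of how create_df turns a CREMA-D file name into a caption. The file name and speaker sex below are hypothetical; in the script the sex comes from VideoDemographics.csv:

# Minimal decoding sketch (hypothetical file name; speaker sex hard-coded here,
# whereas the script looks it up in VideoDemographics.csv).
codes = {
    'Statement': {'IEO': "It's eleven o'clock"},
    'Emotion': {'ANG': 'angry'},
    'Emotional intensity': {'HI': 'High'},
}

file_name = '1001_IEO_ANG_HI'  # ActorID_Statement_Emotion_Intensity
actor_id, statement, emotion, intensity = file_name.split('_')

text_meta = [codes['Statement'][statement],
             codes['Emotion'][emotion],
             codes['Emotional intensity'][intensity]]
suffix = '' if text_meta[2] == 'Unspecified' else f'and {text_meta[2]} '
caption = f'A man saying "{text_meta[0]}" in a {text_meta[1]} {suffix}voice.'
# -> A man saying "It's eleven o'clock" in a angry and High voice.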
143 changes: 143 additions & 0 deletions current_dataset/preprocess_CoVoST.py
@@ -0,0 +1,143 @@
"""
Code for preprocess LJSpeech Corpus:
https://keithito.com/LJ-Speech-Dataset/
"""

import glob
import tqdm
import os
import pandas as pd
import sys
import json
import shutil
import fsspec

from sklearn.model_selection import train_test_split
from concurrent.futures import ThreadPoolExecutor, as_completed

sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
from utils.audio_utils import audio_to_flac
from utils.make_tar_utils import tardir

def convert_and_json_dump(file:str, dest:str, df, overwrite:bool=False, verbose:bool=False):
    if os.path.isfile(dest) and not overwrite:
        if verbose:
            print(f'{dest} already exists, skipping')
        return
audio_to_flac(file, dest)
with open(dest.replace('.flac', '.json'), 'w') as f:
json.dump({'filename': os.path.join(*dest.split('/')[4:]), 'text':df['norm_text'], 'tag':{'raw_text':df['raw_text']}}, f)


def split_all_audio_files(df, dest_root_path, max_workers=96):
if not os.path.exists(dest_root_path):
raise FileNotFoundError(f'Please Check {dest_root_path} exists')

l = len(df)
with tqdm.tqdm(total=l, desc=f'Processing {dest_root_path}') as pbar:
with ThreadPoolExecutor(max_workers=max_workers) as executor:
threads = [executor.submit(convert_and_json_dump, row[0], os.path.join(dest_root_path, f'{i}.flac'), row) for i, row in enumerate(df.iloc())]
for _ in as_completed(threads):
pbar.update(1)

def download_tsvs(urls:list, output_dir:str, extract:bool=False):
os.makedirs(output_dir, exist_ok=True)
for url in urls:
dest_path = os.path.join(output_dir, url.split("/")[-1])
if os.path.isfile(dest_path):
continue
os.system(f'curl {url} --output {dest_path}')

if extract:
os.system(f'tar -xf {dest_path}')

if __name__ == '__main__':
x_2_eng = [
"https://dl.fbaipublicfiles.com/covost/covost_v2.fr_en.tsv.tar.gz",
"https://dl.fbaipublicfiles.com/covost/covost_v2.de_en.tsv.tar.gz",
"https://dl.fbaipublicfiles.com/covost/covost_v2.es_en.tsv.tar.gz",
"https://dl.fbaipublicfiles.com/covost/covost_v2.ca_en.tsv.tar.gz",
"https://dl.fbaipublicfiles.com/covost/covost_v2.it_en.tsv.tar.gz",
"https://dl.fbaipublicfiles.com/covost/covost_v2.ru_en.tsv.tar.gz",
"https://dl.fbaipublicfiles.com/covost/covost_v2.zh-CN_en.tsv.tar.gz",
"https://dl.fbaipublicfiles.com/covost/covost_v2.pt_en.tsv.tar.gz",
"https://dl.fbaipublicfiles.com/covost/covost_v2.fa_en.tsv.tar.gz",
"https://dl.fbaipublicfiles.com/covost/covost_v2.et_en.tsv.tar.gz",
"https://dl.fbaipublicfiles.com/covost/covost_v2.mn_en.tsv.tar.gz",
"https://dl.fbaipublicfiles.com/covost/covost_v2.nl_en.tsv.tar.gz",
"https://dl.fbaipublicfiles.com/covost/covost_v2.tr_en.tsv.tar.gz",
"https://dl.fbaipublicfiles.com/covost/covost_v2.ar_en.tsv.tar.gz",
"https://dl.fbaipublicfiles.com/covost/covost_v2.sv-SE_en.tsv.tar.gz",
"https://dl.fbaipublicfiles.com/covost/covost_v2.lv_en.tsv.tar.gz",
"https://dl.fbaipublicfiles.com/covost/covost_v2.sl_en.tsv.tar.gz",
"https://dl.fbaipublicfiles.com/covost/covost_v2.ta_en.tsv.tar.gz",
"https://dl.fbaipublicfiles.com/covost/covost_v2.ja_en.tsv.tar.gz",
"https://dl.fbaipublicfiles.com/covost/covost_v2.id_en.tsv.tar.gz",
"https://dl.fbaipublicfiles.com/covost/covost_v2.cy_en.tsv.tar.gz",
]
eng_2_x = [
'https://dl.fbaipublicfiles.com/covost/covost_v2.en_de.tsv.tar.gz',
'https://dl.fbaipublicfiles.com/covost/covost_v2.en_ca.tsv.tar.gz',
'https://dl.fbaipublicfiles.com/covost/covost_v2.en_zh-CN.tsv.tar.gz',
'https://dl.fbaipublicfiles.com/covost/covost_v2.en_fa.tsv.tar.gz',
'https://dl.fbaipublicfiles.com/covost/covost_v2.en_et.tsv.tar.gz',
'https://dl.fbaipublicfiles.com/covost/covost_v2.en_mn.tsv.tar.gz',
'https://dl.fbaipublicfiles.com/covost/covost_v2.en_tr.tsv.tar.gz',
'https://dl.fbaipublicfiles.com/covost/covost_v2.en_ar.tsv.tar.gz',
'https://dl.fbaipublicfiles.com/covost/covost_v2.en_sv-SE.tsv.tar.gz',
'https://dl.fbaipublicfiles.com/covost/covost_v2.en_lv.tsv.tar.gz',
'https://dl.fbaipublicfiles.com/covost/covost_v2.en_sl.tsv.tar.gz',
'https://dl.fbaipublicfiles.com/covost/covost_v2.en_ta.tsv.tar.gz',
'https://dl.fbaipublicfiles.com/covost/covost_v2.en_ja.tsv.tar.gz',
'https://dl.fbaipublicfiles.com/covost/covost_v2.en_id.tsv.tar.gz',
'https://dl.fbaipublicfiles.com/covost/covost_v2.en_cy.tsv.tar.gz',
]
download_tsvs(eng_2_x, "/home/knoriy/fsx/raw_datasets/CoVoST_2/tsvs/")
download_tsvs(x_2_eng, "/home/knoriy/fsx/raw_datasets/CoVoST_2/tsvs")
import multiprocessing

max_workers = multiprocessing.cpu_count()
chunk = 512
generate_subset_tsv = True

root_path = '/home/knoriy/datasets/raw_datasets/CoVoST_2/'
metadata_dir = "/home/knoriy/datasets/raw_datasets/CoVoST_2/"

dataset_name = 'CoVoST_2'

s3 = fsspec.filesystem('s3')
s3_dest = f's-laion/knoriy/{dataset_name}/{dataset_name}_tars/'

# load metadata and configure audio paths
df = pd.read_csv('/home/knoriy/fsx/raw_datasets/CoVoST_2/tsvs/covost_v2.en_de.dev.tsv', sep='\t')
print(df.head())

# create train, test, valid splits
# train, test = train_test_split(df, test_size=0.2)
# valid, test = train_test_split(test, test_size=0.2)
# train_test_val = {'train/':train, 'test/':test, 'valid/':valid}


# for key in tqdm.tqdm(train_test_val, desc=f'processing:'):
# df = train_test_val[key]

# dest_path = os.path.join(root_path.replace('raw_datasets', 'processed_datasets'),key )
# os.makedirs(dest_path, exist_ok=True)

# split_all_audio_files(df, dest_path)
# tardir(dest_path, dest_path, chunk, delete_file=True)

# # upload to s3 and delete local
# s3.put(dest_path, os.path.join(s3_dest, key), recursive=True)
# shutil.rmtree(dest_path)


'''
python get_covost_splits.py \
--version 2 --src-lang en_de --tgt-lang <tgt_lang_code> \
--root <root path to the translation TSV and output TSVs> \
--cv-tsv <path to validated.tsv>
'''
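A short usage sketch of download_tsvs (paths illustrative). Note that with extract=True the tar -xf call unpacks into the current working directory rather than into output_dir, so changing into the target directory first (or adding -C to the tar command) may be needed:

# Usage sketch with illustrative paths; extract=True runs `tar -xf <dest_path>`,
# which unpacks into the current working directory, not output_dir.
import os

tsv_dir = '/home/knoriy/fsx/raw_datasets/CoVoST_2/tsvs/'
os.makedirs(tsv_dir, exist_ok=True)
os.chdir(tsv_dir)  # so the extracted TSVs land next to the downloaded archives
download_tsvs(x_2_eng, tsv_dir, extract=True)
download_tsvs(eng_2_x, tsv_dir, extract=True)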
@@ -21,7 +21,11 @@
from utils.audio_utils import audio_to_flac
from utils.make_tar_utils import tardir

def convert_and_json_dump(file:str, dest:str, df):
def convert_and_json_dump(file:str, dest:str, df, overwrite:bool=False):
    if os.path.isfile(dest) and not overwrite:
        print(f'{dest} already exists, skipping')
        return

audio_to_flac(file, dest, segment_start=df['begin_time'], segment_end=df['end_time'])
with open(dest.replace('.flac', '.json'), 'w') as f:
json.dump({'filename': os.path.join(*dest.split('/')[5:]), 'text':df['text'], 'tag':df['tag']}, f)
@@ -42,42 +46,48 @@ def split_all_audio_files(df, dest_root_path, max_workers=96):
import multiprocessing

max_workers = multiprocessing.cpu_count()
max_workers = 2
print("Num workers: ", max_workers)
chunk = 512

root_path = '/mnt/knoriy/raw_datasets/gigaspeech/'
metadata_dir = "/mnt/knoriy/raw_datasets/gigaspeech/GigaSpeech.json"
root_path = '/home/knoriy/fsx/raw_datasets/GigaSpeech/gigaspeech/'
metadata_dir = "/home/knoriy/fsx/raw_datasets/GigaSpeech/gigaspeech/GigaSpeech.json"

dataset_name = 'gigaspeech'

s3 = fsspec.filesystem('s3')
s3_dest = f's-laion/knoriy/GigaSpeech/{dataset_name}_tars/'

# load metadata and configure audio paths
raw_df = pd.read_json(metadata_dir)[:2]

new_df_data = []
for row in tqdm.tqdm(raw_df.iloc(), total=len(raw_df), desc='Generating dataframe: '):
for seg in row['audios']['segments']:
try:
catagory = row['audios']['category']
except:
catagory = 'N/A'

if seg['text_tn'] == '<SIL>':
continue

new_df_data.append(
{'path':f'{os.path.join(root_path, row["audios"]["path"])}',
'begin_time': seg['begin_time'],
'end_time': seg['end_time'],
'text': seg['text_tn'],
'tag':{ 'language':row['language'],
'url':row['audios']['url'],
'category':catagory,
'speaker':row['audios']['speaker']}
})
df = pd.DataFrame(new_df_data)
cache_df_path = os.path.join(root_path, 'temp_df.csv')
if os.path.isfile(cache_df_path):
df = pd.read_csv(cache_df_path, sep='\t')
else:
raw_df = pd.read_json(metadata_dir)

new_df_data = []
for row in tqdm.tqdm(raw_df.iloc(), total=len(raw_df), desc='Generating dataframe: '):
for seg in row['audios']['segments']:
            try:
                category = row['audios']['category']
            except KeyError:
                category = 'N/A'

if seg['text_tn'] == '<SIL>' or seg['text_tn'] == '<NOISE>':
continue

new_df_data.append(
{'path':f'{os.path.join(root_path, row["audios"]["path"])}',
'begin_time': seg['begin_time'],
'end_time': seg['end_time'],
'text': seg['text_tn'],
'tag':{ 'language':row['language'],
'url':row['audios']['url'],
                        'category':category,
'speaker':row['audios']['speaker']}
})
df = pd.DataFrame(new_df_data)
df.to_csv(cache_df_path, sep='\t', index=False)

print(df.head())

# create train, test, valid splits
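One caveat with the TSV cache added above (a sketch, not part of this PR): pandas writes the nested 'tag' dict as its string representation, so on reload df['tag'] comes back as a str. Downstream code that indexes into the tag may need to parse it first, for example:

import ast
import pandas as pd

df = pd.read_csv(cache_df_path, sep='\t')
# 'tag' was a dict when the cache was written; read_csv returns its repr as a string.
df['tag'] = df['tag'].apply(ast.literal_eval)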
7 changes: 6 additions & 1 deletion current_dataset/preprocess_LJSpeech.py
@@ -21,7 +21,11 @@
from utils.audio_utils import audio_to_flac
from utils.make_tar_utils import tardir

def convert_and_json_dump(file:str, dest:str, df):
def convert_and_json_dump(file:str, dest:str, df, overwrite:bool=False):
    if os.path.isfile(dest) and not overwrite:
        print(f'{dest} already exists, skipping')
        return

audio_to_flac(file, dest)
with open(dest.replace('.flac', '.json'), 'w') as f:
json.dump({'filename': os.path.join(*dest.split('/')[4:]), 'text':df['norm_text'], 'tag':{'raw_text':df['raw_text']}}, f)
@@ -42,6 +46,7 @@ def split_all_audio_files(df, dest_root_path, max_workers=96):
import multiprocessing

max_workers = multiprocessing.cpu_count()
print("Num workers: ", max_workers)
chunk = 512
generate_subset_tsv = True

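The overwrite guard added here mirrors the one in the other preprocess scripts: existing FLACs are skipped unless overwrite=True, so re-running the script (for example after an interrupted slurm job) does not redo finished work. A minimal illustration with hypothetical paths and row:

# Hypothetical paths and row; the second call returns early because 0.flac already exists.
convert_and_json_dump('/data/LJ001-0001.wav', '/out/train/0.flac', row)
convert_and_json_dump('/data/LJ001-0001.wav', '/out/train/0.flac', row)                  # skipped
convert_and_json_dump('/data/LJ001-0001.wav', '/out/train/0.flac', row, overwrite=True)  # reconverted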