Updated script to better work with slurm #39

Open · wants to merge 47 commits into base: main

Changes from 19 commits

Commits (47)
da2b7d9
added new dataset
Jun 24, 2022
b7f5044
added CoVoST
Jun 27, 2022
014f519
removed duplicate dataset
Jun 27, 2022
50305e4
added to do list
Jun 29, 2022
0d911c5
First commit
Jun 29, 2022
ef4dfcb
_
Jun 29, 2022
59c0f49
Updated yaml to work with slurm
Jul 7, 2022
4147b41
merged laion-main and fixed conflicts
Jul 7, 2022
fb95086
move yml file back to root as it is being used by other scripts outsi…
Jul 7, 2022
0ac7933
_
Jul 7, 2022
6d6f929
renamed file
Jul 8, 2022
eacade6
removed unwanted text
Jul 8, 2022
43af9a7
_
Jul 11, 2022
a1576fe
First commit
Jul 11, 2022
4e34bd3
added clause to avoid duplicate calculation
Jul 11, 2022
68ca8ff
Script to process CREMA-D
Jul 12, 2022
caf739a
fixed issue where files were saved in wrong area
Jul 12, 2022
890d63f
script to download appropriate tsv
Jul 13, 2022
1e8f87b
Added caching of dfs to avoid recomputing the same sf multiple time i…
Jul 13, 2022
7bf8ede
fixed error where .json did not conform with audio-dataset
Aug 2, 2022
a5941ba
processing setup for en-X and x_en
Aug 2, 2022
2f9951e
processing setup for en-X and x_en
Aug 2, 2022
2c3036d
setup for eng to X
Aug 2, 2022
fb5d9b7
Fixed json
Aug 11, 2022
68e37c0
modified to process given language
Aug 15, 2022
8ca6509
Fixed bug when a trailing / is needed
Aug 16, 2022
a664c0d
processing code for EmoV-DB
Aug 16, 2022
5323d86
processing and cleaning old files
Aug 16, 2022
bc1fbad
Improved multiprocessing workload, processing improved from 3 it/s to …
Aug 22, 2022
678106c
Merge branch 'LAION-AI:main' into main
knoriy Oct 7, 2022
6db9c08
Merge branch 'LAION-AI:main' into main
knoriy Oct 30, 2022
c0b7bec
add: processor for cv11.0
Dec 5, 2022
4d545dd
Merge branch 'main' of github.com:knoriy/audio-dataset
Dec 5, 2022
812e399
add: script to process common voice
Dec 5, 2022
00a7537
Merge branch 'LAION-AI:main' into main
knoriy Dec 5, 2022
1c07775
updated cmd
Jan 13, 2023
ded62af
Merge branch 'main' of github.com:knoriy/audio-dataset
Jan 13, 2023
850b120
automated extractors and updated text
Jan 13, 2023
a7265cf
fix: fixed bug where language was not set correctly, being set as train,test…
Jan 16, 2023
0e3ca94
Merge pull request #2 from LAION-AI/main
knoriy May 4, 2023
3eb72e7
updated to include audioset descriptions
May 4, 2023
bb85a09
backup
Jun 19, 2023
0544617
backup
Jun 19, 2023
e1e3643
updated json file text to include metadata
Jul 5, 2023
6045d5d
Added meta
Jul 5, 2023
a919011
upload to s3
Jul 5, 2023
3fc2359
added emns dataset
knoriy Jul 7, 2023
3 changes: 2 additions & 1 deletion .gitignore
@@ -133,4 +133,5 @@ dmypy.json
utils/__pycache__/make_tar_utils.cpython-37.pyc
/data_preprocess/process_audioset/

*.out
*.out
test.*py*
6 changes: 6 additions & 0 deletions current_dataset/ToDO.md
@@ -0,0 +1,6 @@
# ToDo

- [X] LJSpeech
- [X] MSWC
- [ ] GigaSpeech
- [ ] CoVoST
122 changes: 122 additions & 0 deletions current_dataset/preprocess_CREMA-D.py
@@ -0,0 +1,122 @@
"""
Code for preprocess GigaSpeech Corpus:
https://github.com/SpeechColab/GigaSpeech
"""

import glob
import tqdm
import os
import pandas as pd
import sys
import json
import shutil
import fsspec

from sklearn.model_selection import train_test_split
from concurrent.futures import ThreadPoolExecutor, as_completed

sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
from utils.audio_utils import audio_to_flac
from utils.make_tar_utils import tardir

def convert_and_json_dump(file:str, dest:str, df, overwrite:bool=False, verbose:bool=False):
    if os.path.isfile(dest) and not overwrite:
        if verbose:
            print(f'{dest} already exists, skipping')
        return
audio_to_flac(file, dest)
with open(dest.replace('.flac', '.json'), 'w') as f:
json.dump({'filename': os.path.join(*dest.split('/')[5:]), 'text':df['text']}, f)


def split_all_audio_files(df, dest_root_path, max_workers=96):
if not os.path.exists(dest_root_path):
raise FileNotFoundError(f'Please Check {dest_root_path} exists')

l = len(df)
with tqdm.tqdm(total=l, desc=f'Processing {dest_root_path}') as pbar:
with ThreadPoolExecutor(max_workers=max_workers) as executor:
threads = [executor.submit(convert_and_json_dump, row[0], os.path.join(dest_root_path, f'{i}.flac'), row) for i, row in enumerate(df.iloc())]
for _ in as_completed(threads):
pbar.update(1)

def create_df(root_path:str, dataset_name:str=None):
wavs = glob.glob(os.path.join(root_path, '**/*.wav'), recursive=True)
codes = { 'Statement':{ 'IEO':"It's eleven o'clock",
'TIE':"That is exactly what happened",
'IOM':"I'm on my way to the meeting",
'IWW':"I wonder what this is about",
'TAI':"The airplane is almost full",
'MTI':"Maybe tomorrow it will be cold",
'IWL':"I would like a new alarm clock",
'ITH':"I think I have a doctor's appointment",
'DFA':"Don't forget a jacket",
'ITS':"I think I've seen this before",
'TSI':"The surface is slick",
'WSI':"We'll stop in a couple of minutes",
},
              'Emotion':{ 'ANG':'angry',
'DIS':'disgusted',
'FEA':'fearful',
'HAP':'happy',
'NEU':'neutral',
'SAD':'sad',
},
'Emotional intensity':{ 'LO':'Low',
'MD':'Medium',
'HI':'High',
'XX':'Unspecified',
},
}
demographics = pd.read_csv('/home/knoriy/fsx/raw_datasets/CREMA-D/VideoDemographics.csv', names=["ActorID","Age","Sex","Race","Ethnicity"])
df_data = []
for wav in tqdm.tqdm(wavs):
file_name = os.path.basename(wav).split('.')[0]
wav_codes = file_name.split('_')
text_meta = [codes['Statement'][wav_codes[1]], codes['Emotion'][wav_codes[2]], codes['Emotional intensity'][wav_codes[3]]]
        demographics_meta = demographics.loc[demographics['ActorID'] == wav_codes[0]]

        male_or_female = 'woman' if demographics_meta["Sex"].values[0] == 'Female' else 'man'
        intensity = '' if text_meta[2] == 'Unspecified' else f'and {text_meta[2]} '
        text = f'A {male_or_female} saying "{text_meta[0]}" in a {text_meta[1]} {intensity}voice.'
        df_data.append({ 'path':wav, 'text':text, 'tag':{'transcript':text_meta[0], 'emotion':text_meta[1], 'gender':demographics_meta["Sex"].values[0], 'age':demographics_meta["Age"].values[0] }})

return pd.DataFrame(df_data)


if __name__ == '__main__':
import multiprocessing

max_workers = multiprocessing.cpu_count()
print("Num workers: ", max_workers)
chunk = 512

root_path = '/home/knoriy/fsx/raw_datasets/CREMA-D/AudioWAV/'
dataset_name = 'CREMA-D'

s3 = fsspec.filesystem('s3')
s3_dest = f's-laion/knoriy/{dataset_name}/{dataset_name}_tars/'

# load metadata and configure audio paths
df = create_df(root_path)

# create train, test, valid splits
train, test = train_test_split(df, test_size=0.2)
valid, test = train_test_split(test, test_size=0.2)
train_test_val = {'valid/':valid, 'train/':train, 'test/':test}


    for key in tqdm.tqdm(train_test_val, desc='processing:'):
df = train_test_val[key]

dest_path = os.path.join(root_path.replace('raw_datasets', 'processed_datasets').replace('AudioWAV/', ''), key)
os.makedirs(dest_path, exist_ok=True)

split_all_audio_files(df, dest_path)
tardir(dest_path, dest_path, chunk, delete_file=True)

# upload to s3 and delete local
s3.put(dest_path, os.path.join(s3_dest, key), recursive=True)
shutil.rmtree(dest_path)
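For reference, a minimal sketch of how create_df turns a CREMA-D file name into a caption. The file name and speaker sex below are hypothetical; in the script the sex comes from VideoDemographics.csv:

# Minimal decoding sketch (hypothetical file name; speaker sex hard-coded here,
# whereas the script looks it up in VideoDemographics.csv).
codes = {
    'Statement': {'IEO': "It's eleven o'clock"},
    'Emotion': {'ANG': 'angry'},
    'Emotional intensity': {'HI': 'High'},
}

file_name = '1001_IEO_ANG_HI'  # ActorID_Statement_Emotion_Intensity
actor_id, statement, emotion, intensity = file_name.split('_')

text_meta = [codes['Statement'][statement],
             codes['Emotion'][emotion],
             codes['Emotional intensity'][intensity]]
suffix = '' if text_meta[2] == 'Unspecified' else f'and {text_meta[2]} '
caption = f'A man saying "{text_meta[0]}" in a {text_meta[1]} {suffix}voice.'
# -> A man saying "It's eleven o'clock" in a angry and High voice.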
143 changes: 143 additions & 0 deletions current_dataset/preprocess_CoVoST.py
@@ -0,0 +1,143 @@
"""
Code for preprocess LJSpeech Corpus:
https://keithito.com/LJ-Speech-Dataset/
"""

import glob
import tqdm
import os
import pandas as pd
import sys
import json
import shutil
import fsspec

from sklearn.model_selection import train_test_split
from concurrent.futures import ThreadPoolExecutor, as_completed

sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
from utils.audio_utils import audio_to_flac
from utils.make_tar_utils import tardir

def convert_and_json_dump(file:str, dest:str, df, overwrite:bool=False, verbose:bool=False):
    if os.path.isfile(dest) and not overwrite:
        if verbose:
            print(f'{dest} already exists, skipping')
        return
audio_to_flac(file, dest)
with open(dest.replace('.flac', '.json'), 'w') as f:
json.dump({'filename': os.path.join(*dest.split('/')[4:]), 'text':df['norm_text'], 'tag':{'raw_text':df['raw_text']}}, f)


def split_all_audio_files(df, dest_root_path, max_workers=96):
if not os.path.exists(dest_root_path):
raise FileNotFoundError(f'Please Check {dest_root_path} exists')

l = len(df)
with tqdm.tqdm(total=l, desc=f'Processing {dest_root_path}') as pbar:
with ThreadPoolExecutor(max_workers=max_workers) as executor:
threads = [executor.submit(convert_and_json_dump, row[0], os.path.join(dest_root_path, f'{i}.flac'), row) for i, row in enumerate(df.iloc())]
for _ in as_completed(threads):
pbar.update(1)

def download_tsvs(urls:list, output_dir:str, extract:bool=False):
os.makedirs(output_dir, exist_ok=True)
for url in urls:
dest_path = os.path.join(output_dir, url.split("/")[-1])
if os.path.isfile(dest_path):
continue
os.system(f'curl {url} --output {dest_path}')

if extract:
os.system(f'tar -xf {dest_path}')

if __name__ == '__main__':
x_2_eng = [
"https://dl.fbaipublicfiles.com/covost/covost_v2.fr_en.tsv.tar.gz",
"https://dl.fbaipublicfiles.com/covost/covost_v2.de_en.tsv.tar.gz",
"https://dl.fbaipublicfiles.com/covost/covost_v2.es_en.tsv.tar.gz",
"https://dl.fbaipublicfiles.com/covost/covost_v2.ca_en.tsv.tar.gz",
"https://dl.fbaipublicfiles.com/covost/covost_v2.it_en.tsv.tar.gz",
"https://dl.fbaipublicfiles.com/covost/covost_v2.ru_en.tsv.tar.gz",
"https://dl.fbaipublicfiles.com/covost/covost_v2.zh-CN_en.tsv.tar.gz",
"https://dl.fbaipublicfiles.com/covost/covost_v2.pt_en.tsv.tar.gz",
"https://dl.fbaipublicfiles.com/covost/covost_v2.fa_en.tsv.tar.gz",
"https://dl.fbaipublicfiles.com/covost/covost_v2.et_en.tsv.tar.gz",
"https://dl.fbaipublicfiles.com/covost/covost_v2.mn_en.tsv.tar.gz",
"https://dl.fbaipublicfiles.com/covost/covost_v2.nl_en.tsv.tar.gz",
"https://dl.fbaipublicfiles.com/covost/covost_v2.tr_en.tsv.tar.gz",
"https://dl.fbaipublicfiles.com/covost/covost_v2.ar_en.tsv.tar.gz",
"https://dl.fbaipublicfiles.com/covost/covost_v2.sv-SE_en.tsv.tar.gz",
"https://dl.fbaipublicfiles.com/covost/covost_v2.lv_en.tsv.tar.gz",
"https://dl.fbaipublicfiles.com/covost/covost_v2.sl_en.tsv.tar.gz",
"https://dl.fbaipublicfiles.com/covost/covost_v2.ta_en.tsv.tar.gz",
"https://dl.fbaipublicfiles.com/covost/covost_v2.ja_en.tsv.tar.gz",
"https://dl.fbaipublicfiles.com/covost/covost_v2.id_en.tsv.tar.gz",
"https://dl.fbaipublicfiles.com/covost/covost_v2.cy_en.tsv.tar.gz",
]
eng_2_x = [
'https://dl.fbaipublicfiles.com/covost/covost_v2.en_de.tsv.tar.gz',
'https://dl.fbaipublicfiles.com/covost/covost_v2.en_ca.tsv.tar.gz',
'https://dl.fbaipublicfiles.com/covost/covost_v2.en_zh-CN.tsv.tar.gz',
'https://dl.fbaipublicfiles.com/covost/covost_v2.en_fa.tsv.tar.gz',
'https://dl.fbaipublicfiles.com/covost/covost_v2.en_et.tsv.tar.gz',
'https://dl.fbaipublicfiles.com/covost/covost_v2.en_mn.tsv.tar.gz',
'https://dl.fbaipublicfiles.com/covost/covost_v2.en_tr.tsv.tar.gz',
'https://dl.fbaipublicfiles.com/covost/covost_v2.en_ar.tsv.tar.gz',
'https://dl.fbaipublicfiles.com/covost/covost_v2.en_sv-SE.tsv.tar.gz',
'https://dl.fbaipublicfiles.com/covost/covost_v2.en_lv.tsv.tar.gz',
'https://dl.fbaipublicfiles.com/covost/covost_v2.en_sl.tsv.tar.gz',
'https://dl.fbaipublicfiles.com/covost/covost_v2.en_ta.tsv.tar.gz',
'https://dl.fbaipublicfiles.com/covost/covost_v2.en_ja.tsv.tar.gz',
'https://dl.fbaipublicfiles.com/covost/covost_v2.en_id.tsv.tar.gz',
'https://dl.fbaipublicfiles.com/covost/covost_v2.en_cy.tsv.tar.gz',
]
download_tsvs(eng_2_x, "/home/knoriy/fsx/raw_datasets/CoVoST_2/tsvs/")
download_tsvs(x_2_eng, "/home/knoriy/fsx/raw_datasets/CoVoST_2/tsvs")
import multiprocessing

max_workers = multiprocessing.cpu_count()
chunk = 512
generate_subset_tsv = True

root_path = '/home/knoriy/datasets/raw_datasets/CoVoST_2/'
metadata_dir = "/home/knoriy/datasets/raw_datasets/CoVoST_2/"

dataset_name = 'CoVoST_2'

s3 = fsspec.filesystem('s3')
s3_dest = f's-laion/knoriy/{dataset_name}/{dataset_name}_tars/'

# load metadata and configure audio paths
df = pd.read_csv('/home/knoriy/fsx/raw_datasets/CoVoST_2/tsvs/covost_v2.en_de.dev.tsv', sep='\t')
print(df.head())

# create train, test, valid splits
# train, test = train_test_split(df, test_size=0.2)
# valid, test = train_test_split(test, test_size=0.2)
# train_test_val = {'train/':train, 'test/':test, 'valid/':valid}


# for key in tqdm.tqdm(train_test_val, desc=f'processing:'):
# df = train_test_val[key]

# dest_path = os.path.join(root_path.replace('raw_datasets', 'processed_datasets'),key )
# os.makedirs(dest_path, exist_ok=True)

# split_all_audio_files(df, dest_path)
# tardir(dest_path, dest_path, chunk, delete_file=True)

# # upload to s3 and delete local
# s3.put(dest_path, os.path.join(s3_dest, key), recursive=True)
# shutil.rmtree(dest_path)


'''
python get_covost_splits.py \
--version 2 --src-lang en_de --tgt-lang <tgt_lang_code> \
--root <root path to the translation TSV and output TSVs> \
--cv-tsv <path to validated.tsv>
'''
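A short usage sketch of download_tsvs (paths illustrative). Note that with extract=True the tar -xf call unpacks into the current working directory rather than into output_dir, so changing into the target directory first (or adding -C to the tar command) may be needed:

# Usage sketch with illustrative paths; extract=True runs `tar -xf <dest_path>`,
# which unpacks into the current working directory, not output_dir.
import os

tsv_dir = '/home/knoriy/fsx/raw_datasets/CoVoST_2/tsvs/'
os.makedirs(tsv_dir, exist_ok=True)
os.chdir(tsv_dir)  # so the extracted TSVs land next to the downloaded archives
download_tsvs(x_2_eng, tsv_dir, extract=True)
download_tsvs(eng_2_x, tsv_dir, extract=True)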
@@ -21,7 +21,11 @@
from utils.audio_utils import audio_to_flac
from utils.make_tar_utils import tardir

def convert_and_json_dump(file:str, dest:str, df):
def convert_and_json_dump(file:str, dest:str, df, overwrite:bool=False):
    if os.path.isfile(dest) and not overwrite:
        print(f'{dest} already exists, skipping')
        return

audio_to_flac(file, dest, segment_start=df['begin_time'], segment_end=df['end_time'])
with open(dest.replace('.flac', '.json'), 'w') as f:
json.dump({'filename': os.path.join(*dest.split('/')[5:]), 'text':df['text'], 'tag':df['tag']}, f)
@@ -42,42 +46,48 @@ def split_all_audio_files(df, dest_root_path, max_workers=96):
import multiprocessing

max_workers = multiprocessing.cpu_count()
max_workers = 2
print("Num workers: ", max_workers)
chunk = 512

root_path = '/mnt/knoriy/raw_datasets/gigaspeech/'
metadata_dir = "/mnt/knoriy/raw_datasets/gigaspeech/GigaSpeech.json"
root_path = '/home/knoriy/fsx/raw_datasets/GigaSpeech/gigaspeech/'
metadata_dir = "/home/knoriy/fsx/raw_datasets/GigaSpeech/gigaspeech/GigaSpeech.json"

dataset_name = 'gigaspeech'

s3 = fsspec.filesystem('s3')
s3_dest = f's-laion/knoriy/GigaSpeech/{dataset_name}_tars/'

# load metadata and configure audio paths
raw_df = pd.read_json(metadata_dir)[:2]

new_df_data = []
for row in tqdm.tqdm(raw_df.iloc(), total=len(raw_df), desc='Generating dataframe: '):
for seg in row['audios']['segments']:
try:
catagory = row['audios']['category']
except:
catagory = 'N/A'

if seg['text_tn'] == '<SIL>':
continue

new_df_data.append(
{'path':f'{os.path.join(root_path, row["audios"]["path"])}',
'begin_time': seg['begin_time'],
'end_time': seg['end_time'],
'text': seg['text_tn'],
'tag':{ 'language':row['language'],
'url':row['audios']['url'],
'category':catagory,
'speaker':row['audios']['speaker']}
})
df = pd.DataFrame(new_df_data)
cache_df_path = os.path.join(root_path, 'temp_df.csv')
if os.path.isfile(cache_df_path):
df = pd.read_csv(cache_df_path, sep='\t')
else:
raw_df = pd.read_json(metadata_dir)

new_df_data = []
for row in tqdm.tqdm(raw_df.iloc(), total=len(raw_df), desc='Generating dataframe: '):
for seg in row['audios']['segments']:
            try:
                category = row['audios']['category']
            except KeyError:
                category = 'N/A'

if seg['text_tn'] == '<SIL>' or seg['text_tn'] == '<NOISE>':
continue

new_df_data.append(
{'path':f'{os.path.join(root_path, row["audios"]["path"])}',
'begin_time': seg['begin_time'],
'end_time': seg['end_time'],
'text': seg['text_tn'],
'tag':{ 'language':row['language'],
'url':row['audios']['url'],
                        'category':category,
'speaker':row['audios']['speaker']}
})
df = pd.DataFrame(new_df_data)
df.to_csv(cache_df_path, sep='\t', index=False)

print(df.head())

# create train, test, valid splits
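One caveat with the TSV cache added above (a sketch, not part of this PR): pandas writes the nested 'tag' dict as its string representation, so on reload df['tag'] comes back as a str. Downstream code that indexes into the tag may need to parse it first, for example:

import ast
import pandas as pd

df = pd.read_csv(cache_df_path, sep='\t')
# 'tag' was a dict when the cache was written; read_csv returns its repr as a string.
df['tag'] = df['tag'].apply(ast.literal_eval)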
7 changes: 6 additions & 1 deletion current_dataset/preprocess_LJSpeech.py
@@ -21,7 +21,11 @@
from utils.audio_utils import audio_to_flac
from utils.make_tar_utils import tardir

def convert_and_json_dump(file:str, dest:str, df):
def convert_and_json_dump(file:str, dest:str, df, overwrite:bool=False):
    if os.path.isfile(dest) and not overwrite:
        print(f'{dest} already exists, skipping')
        return

audio_to_flac(file, dest)
with open(dest.replace('.flac', '.json'), 'w') as f:
json.dump({'filename': os.path.join(*dest.split('/')[4:]), 'text':df['norm_text'], 'tag':{'raw_text':df['raw_text']}}, f)
@@ -42,6 +46,7 @@ def split_all_audio_files(df, dest_root_path, max_workers=96):
import multiprocessing

max_workers = multiprocessing.cpu_count()
print("Num workers: ", max_workers)
chunk = 512
generate_subset_tsv = True

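The overwrite guard added here mirrors the one in the other preprocess scripts: existing FLACs are skipped unless overwrite=True, so re-running the script (for example after an interrupted slurm job) does not redo finished work. A minimal illustration with hypothetical paths and row:

# Hypothetical paths and row; the second call returns early because 0.flac already exists.
convert_and_json_dump('/data/LJ001-0001.wav', '/out/train/0.flac', row)
convert_and_json_dump('/data/LJ001-0001.wav', '/out/train/0.flac', row)                  # skipped
convert_and_json_dump('/data/LJ001-0001.wav', '/out/train/0.flac', row, overwrite=True)  # reconverted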