From e2d772b4d30acbe4182fd3477a6a6fdf5db81d43 Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Wed, 13 Sep 2023 12:46:03 -0700 Subject: [PATCH] Update scripts/data_prep/convert_text_to_mds.py --- scripts/data_prep/convert_text_to_mds.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/scripts/data_prep/convert_text_to_mds.py b/scripts/data_prep/convert_text_to_mds.py index 2c3db90c00..56b58847e3 100644 --- a/scripts/data_prep/convert_text_to_mds.py +++ b/scripts/data_prep/convert_text_to_mds.py @@ -196,17 +196,12 @@ def download_and_convert( columns = {'tokens': 'bytes'} print(f'Converting to MDS format...') - total_tokens_bytes = 0 with MDSWriter(out=output_folder, columns=columns, max_mds_writer_workers=max_mds_writer_workers, compression=compression) as out: for sample in tqdm(dataset): - total_tokens_bytes += len(sample['tokens']) out.write(sample) - total_tokens = total_tokens_bytes / 8 - print('tokens', total_tokens_bytes, total_tokens) - return total_tokens def is_remote_path(path: str) -> bool: