Skip to content

Commit

Permalink
Update scripts/data_prep/convert_text_to_mds.py
Browse files (browse the repository at this point in the history)
  • Loading branch information
irenedea authored Sep 13, 2023
1 parent 5b8cee5 commit e2d772b
Showing 1 changed file with 0 additions and 5 deletions.
5 changes: 0 additions & 5 deletions scripts/data_prep/convert_text_to_mds.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,17 +196,12 @@ def download_and_convert(
columns = {'tokens': 'bytes'}

print(f'Converting to MDS format...')
total_tokens_bytes = 0
with MDSWriter(out=output_folder,
columns=columns,
max_mds_writer_workers=max_mds_writer_workers,
compression=compression) as out:
for sample in tqdm(dataset):
total_tokens_bytes += len(sample['tokens'])
out.write(sample)
total_tokens = total_tokens_bytes / 8
print('tokens', total_tokens_bytes, total_tokens)
return total_tokens


def is_remote_path(path: str) -> bool:
Expand Down

0 comments on commit e2d772b

Please sign in to comment.