import-dataset.py
#!/usr/bin/env python
import json
import os
import sys

import s3fs
from datasets import Dataset, load_dataset
from unsloth import standardize_sharegpt

# Required configuration from the environment.
bucket_name = os.getenv("BUCKET_NAME")
assert bucket_name is not None, "BUCKET_NAME must be set"
dataset_name = os.getenv("DATASET_NAME")
assert dataset_name is not None, "DATASET_NAME must be set"

storage_options = {
    "key": os.getenv("AWS_ACCESS_KEY_ID"),
    "secret": os.getenv("AWS_SECRET_ACCESS_KEY"),
    "endpoint_url": "https://fly.storage.tigris.dev",
}
assert storage_options["key"] is not None, "AWS_ACCESS_KEY_ID must be set"
assert storage_options["secret"] is not None, "AWS_SECRET_ACCESS_KEY must be set"

fs = s3fs.S3FileSystem(**storage_options)

# Skip the import entirely if a standardized copy is already in the bucket.
if fs.exists(f"{bucket_name}/standardized/{dataset_name}"):
    print(f"Dataset {dataset_name} already exists and is standardized")
    sys.exit(0)

# Stream the dataset so it never has to fit in memory all at once.
dataset = load_dataset(dataset_name, split="train", streaming=True)

biggest = 0
for i, x in enumerate(dataset.iter(5_000_000)):
    # .iter() yields batches; materialize each one as an in-memory Dataset.
    if isinstance(x, dict):
        ds = Dataset.from_dict(x, features=dataset.features)
    else:
        # A lambda containing `yield from` is itself a generator function.
        ds = Dataset.from_generator(lambda: (yield from x), features=dataset.features)
    # Save the raw shard, then the ShareGPT-standardized version of it.
    ds.save_to_disk(f"s3://{bucket_name}/raw/{dataset_name}/{i}", storage_options=storage_options)
    ds = standardize_sharegpt(ds)
    ds.save_to_disk(f"s3://{bucket_name}/standardized/{dataset_name}/{i}", storage_options=storage_options)
    biggest = i

# Record the index of the last shard so downstream jobs know how many to read.
fs.write_text(f"{bucket_name}/raw/{dataset_name}/info.json", json.dumps({"count": biggest}))
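
For reference, a downstream job could reassemble the standardized shards with `datasets.load_from_disk` and the same Tigris storage options. The following is a hypothetical companion sketch, not part of the script above: it assumes the same environment variables, the `standardized/<dataset>/<i>` shard layout, and the `info.json` marker written at the end of the import.

#!/usr/bin/env python
# Hypothetical companion sketch: read the standardized shards back and
# concatenate them. None of this is part of import-dataset.py itself.
import json
import os

import s3fs
from datasets import concatenate_datasets, load_from_disk

bucket_name = os.environ["BUCKET_NAME"]
dataset_name = os.environ["DATASET_NAME"]
storage_options = {
    "key": os.environ["AWS_ACCESS_KEY_ID"],
    "secret": os.environ["AWS_SECRET_ACCESS_KEY"],
    "endpoint_url": "https://fly.storage.tigris.dev",
}

fs = s3fs.S3FileSystem(**storage_options)
info = json.loads(fs.read_text(f"{bucket_name}/raw/{dataset_name}/info.json"))

# info["count"] holds the index of the last shard, so read 0..count inclusive.
shards = [
    load_from_disk(
        f"s3://{bucket_name}/standardized/{dataset_name}/{i}",
        storage_options=storage_options,
    )
    for i in range(info["count"] + 1)
]
dataset = concatenate_datasets(shards)
print(dataset)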