From ba1168d2428edb27f40858d913d442ccd8634250 Mon Sep 17 00:00:00 2001
From: Arne Binder
Date: Mon, 9 Sep 2024 14:55:11 +0200
Subject: [PATCH] implement DatasetDict.shuffle

---
Reviewer note: compared with the original revision of this patch, shuffle()
now also forwards positional arguments and carries a docstring. The commit id
on the first line and the blob ids on the "index" line are those of the
original revision.

 src/pie_datasets/core/dataset_dict.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/src/pie_datasets/core/dataset_dict.py b/src/pie_datasets/core/dataset_dict.py
index b2a6214c..1c3689bf 100644
--- a/src/pie_datasets/core/dataset_dict.py
+++ b/src/pie_datasets/core/dataset_dict.py
@@ -694,6 +694,24 @@ def cast_document_type(
         )
         return result
 
+    def shuffle(self, *args, **kwargs) -> "DatasetDict":
+        """Shuffle every split and return the result as a new DatasetDict.
+
+        All positional and keyword arguments are forwarded unchanged to the
+        ``shuffle`` method of the parent class. The result is re-wrapped via
+        ``DatasetDict.from_hf`` with this dataset dict's document type, and
+        each split is assigned the document converters of the same-named split.
+        """
+        shuffled = super().shuffle(*args, **kwargs)
+        result = DatasetDict.from_hf(shuffled, document_type=self.document_type)
+
+        # TODO: integrate into DatasetDict.from_hf
+        # from_hf is not given the document converters here, so re-attach them per split
+        for split_name, split in result.items():
+            split.document_converters = self[split_name].document_converters
+
+        return result
+
 
 def load_dataset(*args, **kwargs) -> Union[DatasetDict, Dataset, IterableDataset]:
     dataset_or_dataset_dict = datasets.load_dataset(*args, **kwargs)