From ef8f2d74d523bed7b46e1d4720dd41bdd6d71d13 Mon Sep 17 00:00:00 2001
From: ArneBinder
Date: Mon, 30 Sep 2024 16:46:03 +0200
Subject: [PATCH] `DatasetDict.to_json()` can append to already serialized
 data (#156)

* `DatasetDict.to_json()` can append to already serialized data

* add tests
---
 src/pie_datasets/core/dataset_dict.py | 25 +++++++++++-----
 tests/unit/core/test_dataset_dict.py  | 42 +++++++++++++++++++++++++++
 2 files changed, 60 insertions(+), 7 deletions(-)

diff --git a/src/pie_datasets/core/dataset_dict.py b/src/pie_datasets/core/dataset_dict.py
index b2a6214c..608a3d7c 100644
--- a/src/pie_datasets/core/dataset_dict.py
+++ b/src/pie_datasets/core/dataset_dict.py
@@ -183,7 +183,8 @@ def from_json(  # type: ignore
 
     def to_json(self, path: Union[str, Path], **kwargs) -> None:
         """Serializes the DatasetDict. We convert all documents with `.asdict()` and dump them with
-        `json.dump()` to one JSONLINE file per split.
+        `json.dumps()` to one JSON Lines file per split. If there is already serialized data in
+        the output directory, we append the new data to the existing files.
 
         Args:
             path: path to the output directory
@@ -196,11 +197,20 @@ def to_json(self, path: Union[str, Path], **kwargs) -> None:
         metadata = {"document_type": serialize_document_type(self.document_type)}
         os.makedirs(path, exist_ok=True)
         if os.path.exists(path / METADATA_FILE_NAME):
-            logger.warning(
-                f"metadata file '{path / METADATA_FILE_NAME}' already exists, overwriting it"
-            )
-        with open(path / METADATA_FILE_NAME, "w") as f:
-            json.dump(metadata, f, indent=2)
+            # load previous metadata
+            with open(path / METADATA_FILE_NAME) as f:
+                previous_metadata = json.load(f)
+            if previous_metadata != metadata:
+                raise ValueError(
+                    f"The metadata file {path / METADATA_FILE_NAME} already exists, "
+                    "but the content does not match the current metadata. Cannot append "
+                    "the current dataset to already serialized data."
+                    f"\nprevious metadata: {previous_metadata}"
+                    f"\ncurrent metadata: {metadata}"
+                )
+        else:
+            with open(path / METADATA_FILE_NAME, "w") as f:
+                json.dump(metadata, f, indent=2)
 
         # save the splits
         for split, dataset in self.items():
@@ -208,7 +218,8 @@ def to_json(self, path: Union[str, Path], **kwargs) -> None:
             logger.info(f'serialize documents to "{split_path}" ...')
             os.makedirs(split_path, exist_ok=True)
             file_name = split_path / "documents.jsonl"
-            with open(file_name, "w") as f:
+            mode = "a" if os.path.exists(file_name) else "w"
+            with open(file_name, mode) as f:
                 for doc in dataset:
                     f.write(json.dumps(doc.asdict(), **kwargs) + "\n")
 
diff --git a/tests/unit/core/test_dataset_dict.py b/tests/unit/core/test_dataset_dict.py
index 829fc90e..3c66cf42 100644
--- a/tests/unit/core/test_dataset_dict.py
+++ b/tests/unit/core/test_dataset_dict.py
@@ -107,6 +107,48 @@ def test_to_json_and_back_serialize_document_type(dataset_dict, tmp_path):
             assert doc1 == doc2
 
 
+def test_to_json_and_back_append(dataset_dict, tmp_path):
+    path = Path(tmp_path) / "dataset_dict"
+
+    dataset_dict1 = DatasetDict(
+        {split_name: Dataset.from_documents(docs[:2]) for split_name, docs in dataset_dict.items()}
+    )
+    dataset_dict2 = DatasetDict(
+        {split_name: Dataset.from_documents(docs[2:]) for split_name, docs in dataset_dict.items()}
+    )
+    dataset_dict1.to_json(path)
+    dataset_dict2.to_json(path)
+    dataset_dict_from_json = DatasetDict.from_json(
+        data_dir=str(path),
+    )
+    assert set(dataset_dict_from_json) == set(dataset_dict)
+    for split in dataset_dict:
+        assert len(dataset_dict_from_json[split]) == len(dataset_dict[split])
+        for doc1, doc2 in zip(dataset_dict_from_json[split], dataset_dict[split]):
+            assert doc1 == doc2
+
+
+def test_to_json_and_back_append_metadata_mismatch(dataset_dict, tmp_path):
+    path = Path(tmp_path) / "dataset_dict"
+
+    dataset_dict1 = DatasetDict(
+        {split_name: Dataset.from_documents(docs[:2]) for split_name, docs in dataset_dict.items()}
+    )
+    dataset_dict2 = DatasetDict(
+        {split_name: Dataset.from_documents(docs[2:]) for split_name, docs in dataset_dict.items()}
+    )
+    dataset_dict2_converted = dataset_dict2.cast_document_type(TextBasedDocument)
+    dataset_dict1.to_json(path)
+    with pytest.raises(ValueError) as excinfo:
+        dataset_dict2_converted.to_json(path)
+    assert str(excinfo.value).endswith(
+        "metadata.json already exists, but the content does not match the current metadata. "
+        "Cannot append the current dataset to already serialized data."
+        "\nprevious metadata: {'document_type': 'tests.unit.core.test_dataset_dict.DocumentWithEntitiesAndRelations'}"
+        "\ncurrent metadata: {'document_type': 'pytorch_ie.documents.TextBasedDocument'}"
+    )
+
+
 def test_document_type_empty_no_splits():
     with pytest.raises(ValueError) as excinfo:
         DatasetDict().document_type