Skip to content

Commit

Permalink
DatasetDict.to_json() can append to already serialized data (#156)
Browse files Browse the repository at this point in the history
* `DatasetDict.to_json()` can append to already serialized data

* add tests
  • Loading branch information
ArneBinder authored Sep 30, 2024
1 parent e1db8f3 commit ef8f2d7
Show file tree
Hide file tree
Showing 2 changed files with 60 additions and 7 deletions.
25 changes: 18 additions & 7 deletions src/pie_datasets/core/dataset_dict.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,8 @@ def from_json( # type: ignore

def to_json(self, path: Union[str, Path], **kwargs) -> None:
    """Serializes the DatasetDict. We convert all documents with `.asdict()` and dump them with
    `json.dump()` to one JSONLINE file per split. If there is already serialized data in the
    output directory, we append the new data to the existing files.

    Args:
        path: path to the output directory
        **kwargs: additional keyword arguments passed to `json.dumps()` for each document

    Raises:
        ValueError: if a metadata file already exists in the output directory but its content
            does not match the metadata of the current dataset (i.e. a different document
            type), since appending would silently mix incompatible data.
    """
    # Bug fix: the annotation allows a plain str, but the `/` operator below requires a
    # pathlib.Path — normalize once up front so str callers do not hit a TypeError.
    path = Path(path)
    metadata = {"document_type": serialize_document_type(self.document_type)}
    os.makedirs(path, exist_ok=True)
    if os.path.exists(path / METADATA_FILE_NAME):
        # load previous metadata
        with open(path / METADATA_FILE_NAME) as f:
            previous_metadata = json.load(f)
        if previous_metadata != metadata:
            raise ValueError(
                f"The metadata file {path / METADATA_FILE_NAME} already exists, "
                "but the content does not match the current metadata. Can not append "
                "the current dataset to already serialized data."
                f"\nprevious metadata: {previous_metadata}"
                f"\ncurrent metadata: {metadata}"
            )
    else:
        with open(path / METADATA_FILE_NAME, "w") as f:
            json.dump(metadata, f, indent=2)

    # save the splits
    for split, dataset in self.items():
        split_path = path / split
        logger.info(f'serialize documents to "{split_path}" ...')
        os.makedirs(split_path, exist_ok=True)
        file_name = split_path / "documents.jsonl"
        # append when the split file already exists so repeated calls accumulate documents
        mode = "a" if os.path.exists(file_name) else "w"
        with open(file_name, mode) as f:
            for doc in dataset:
                f.write(json.dumps(doc.asdict(), **kwargs) + "\n")

Expand Down
42 changes: 42 additions & 0 deletions tests/unit/core/test_dataset_dict.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,48 @@ def test_to_json_and_back_serialize_document_type(dataset_dict, tmp_path):
assert doc1 == doc2


def test_to_json_and_back_append(dataset_dict, tmp_path):
    path = Path(tmp_path) / "dataset_dict"

    # split every dataset split into two halves so we can serialize them one after the other
    first_half = DatasetDict(
        {name: Dataset.from_documents(docs[:2]) for name, docs in dataset_dict.items()}
    )
    second_half = DatasetDict(
        {name: Dataset.from_documents(docs[2:]) for name, docs in dataset_dict.items()}
    )

    first_half.to_json(path)
    # the second call must append to the files written by the first one
    second_half.to_json(path)

    roundtripped = DatasetDict.from_json(
        data_dir=str(path),
    )
    # the reloaded dataset must contain all splits and all documents in the original order
    assert set(roundtripped) == set(dataset_dict)
    for split_name in dataset_dict:
        assert len(roundtripped[split_name]) == len(dataset_dict[split_name])
        for loaded_doc, original_doc in zip(roundtripped[split_name], dataset_dict[split_name]):
            assert loaded_doc == original_doc


def test_to_json_and_back_append_metadata_mismatch(dataset_dict, tmp_path):
    path = Path(tmp_path) / "dataset_dict"

    first_half = DatasetDict(
        {name: Dataset.from_documents(docs[:2]) for name, docs in dataset_dict.items()}
    )
    second_half = DatasetDict(
        {name: Dataset.from_documents(docs[2:]) for name, docs in dataset_dict.items()}
    )
    # casting to a different document type changes the serialized metadata,
    # so appending to the already serialized first half must be rejected
    converted_second_half = second_half.cast_document_type(TextBasedDocument)

    first_half.to_json(path)
    with pytest.raises(ValueError) as excinfo:
        converted_second_half.to_json(path)
    assert str(excinfo.value).endswith(
        "metadata.json already exists, but the content does not match the current metadata. "
        "Can not append the current dataset to already serialized data."
        "\nprevious metadata: {'document_type': 'tests.unit.core.test_dataset_dict.DocumentWithEntitiesAndRelations'}"
        "\ncurrent metadata: {'document_type': 'pytorch_ie.documents.TextBasedDocument'}"
    )


def test_document_type_empty_no_splits():
with pytest.raises(ValueError) as excinfo:
DatasetDict().document_type
Expand Down

0 comments on commit ef8f2d7

Please sign in to comment.