Skip to content

Commit

Permalink
DatasetDict.to_json() can append to already serialized data (#156)
Browse files Browse the repository at this point in the history
* `DatasetDict.to_json()` can append to already serialized data

* add tests
  • Loading branch information
ArneBinder authored Sep 30, 2024
1 parent e1db8f3 commit ef8f2d7
Show file tree
Hide file tree
Showing 2 changed files with 60 additions and 7 deletions.
25 changes: 18 additions & 7 deletions src/pie_datasets/core/dataset_dict.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,8 @@ def from_json( # type: ignore

def to_json(self, path: Union[str, Path], **kwargs) -> None:
    """Serializes the DatasetDict. We convert all documents with `.asdict()` and dump them with
    `json.dump()` to one JSONLINE file per split. If there is already serialized data in the
    output directory, we append the new data to the existing files.

    Args:
        path: path to the output directory
        **kwargs: additional keyword arguments passed to `json.dumps()` for each document

    Raises:
        ValueError: if a metadata file already exists in the output directory but its content
            does not match the metadata of the current dataset (i.e. a different document
            type), since appending would silently mix incompatible data.
    """
    # Bug fix: the annotation allows a plain str, but the `/` operator below requires a
    # pathlib.Path — normalize once up front so str callers do not hit a TypeError.
    path = Path(path)
    metadata = {"document_type": serialize_document_type(self.document_type)}
    os.makedirs(path, exist_ok=True)
    if os.path.exists(path / METADATA_FILE_NAME):
        # load previous metadata
        with open(path / METADATA_FILE_NAME) as f:
            previous_metadata = json.load(f)
        if previous_metadata != metadata:
            raise ValueError(
                f"The metadata file {path / METADATA_FILE_NAME} already exists, "
                "but the content does not match the current metadata. Can not append "
                "the current dataset to already serialized data."
                f"\nprevious metadata: {previous_metadata}"
                f"\ncurrent metadata: {metadata}"
            )
    else:
        with open(path / METADATA_FILE_NAME, "w") as f:
            json.dump(metadata, f, indent=2)

    # save the splits
    for split, dataset in self.items():
        split_path = path / split
        logger.info(f'serialize documents to "{split_path}" ...')
        os.makedirs(split_path, exist_ok=True)
        file_name = split_path / "documents.jsonl"
        # append when the split file already exists so repeated calls accumulate documents
        mode = "a" if os.path.exists(file_name) else "w"
        with open(file_name, mode) as f:
            for doc in dataset:
                f.write(json.dumps(doc.asdict(), **kwargs) + "\n")

Expand Down
42 changes: 42 additions & 0 deletions tests/unit/core/test_dataset_dict.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,48 @@ def test_to_json_and_back_serialize_document_type(dataset_dict, tmp_path):
assert doc1 == doc2


def test_to_json_and_back_append(dataset_dict, tmp_path):
    path = Path(tmp_path) / "dataset_dict"

    # split every dataset split into two halves so we can serialize them one after the other
    first_half = DatasetDict(
        {name: Dataset.from_documents(docs[:2]) for name, docs in dataset_dict.items()}
    )
    second_half = DatasetDict(
        {name: Dataset.from_documents(docs[2:]) for name, docs in dataset_dict.items()}
    )

    first_half.to_json(path)
    # the second call must append to the files written by the first one
    second_half.to_json(path)

    roundtripped = DatasetDict.from_json(
        data_dir=str(path),
    )
    # the reloaded dataset must contain all splits and all documents in the original order
    assert set(roundtripped) == set(dataset_dict)
    for split_name in dataset_dict:
        assert len(roundtripped[split_name]) == len(dataset_dict[split_name])
        for loaded_doc, original_doc in zip(roundtripped[split_name], dataset_dict[split_name]):
            assert loaded_doc == original_doc


def test_to_json_and_back_append_metadata_mismatch(dataset_dict, tmp_path):
    path = Path(tmp_path) / "dataset_dict"

    first_half = DatasetDict(
        {name: Dataset.from_documents(docs[:2]) for name, docs in dataset_dict.items()}
    )
    second_half = DatasetDict(
        {name: Dataset.from_documents(docs[2:]) for name, docs in dataset_dict.items()}
    )
    # casting to a different document type changes the serialized metadata,
    # so appending to the already serialized first half must be rejected
    converted_second_half = second_half.cast_document_type(TextBasedDocument)

    first_half.to_json(path)
    with pytest.raises(ValueError) as excinfo:
        converted_second_half.to_json(path)
    assert str(excinfo.value).endswith(
        "metadata.json already exists, but the content does not match the current metadata. "
        "Can not append the current dataset to already serialized data."
        "\nprevious metadata: {'document_type': 'tests.unit.core.test_dataset_dict.DocumentWithEntitiesAndRelations'}"
        "\ncurrent metadata: {'document_type': 'pytorch_ie.documents.TextBasedDocument'}"
    )


def test_document_type_empty_no_splits():
with pytest.raises(ValueError) as excinfo:
DatasetDict().document_type
Expand Down

0 comments on commit ef8f2d7

Please sign in to comment.