Skip to content

Commit

Permalink
IL-414 use write_utf8 to write in one batch
Browse files Browse the repository at this point in the history
  • Loading branch information
FelixFehseTNG committed Apr 3, 2024
1 parent abee57c commit d630a8b
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 25 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -146,13 +146,16 @@ def _write_data(
file_path: Path,
data_to_write: Iterable[PydanticSerializable],
) -> None:
with self._file_system.open(
self.path_to_str(file_path), "w", encoding="utf-8"
) as file:
for data_chunk in data_to_write:
serialized_result = JsonSerializer(root=data_chunk)
json_string = serialized_result.model_dump_json() + "\n"
file.write(json_string)
data = "\n".join(JsonSerializer(root=chunk).model_dump_json() for chunk in data_to_write)
self.write_utf8(file_path, data, create_parents=True)

# with self._file_system.open(
# self.path_to_str(file_path), "w", encoding="utf-8"
# ) as file:
# for data_chunk in data_to_write:
# serialized_result = JsonSerializer(root=data_chunk)
# json_string = serialized_result.model_dump_json() + "\n"
# file.write(json_string)


class FileDatasetRepository(FileSystemDatasetRepository):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,24 +50,24 @@ def __init__(self, repository_id: str, token: str, private: bool) -> None:
self._repository_id = repository_id
self._file_system = file_system # for better type checks

def create_dataset(
    self,
    examples: Iterable[Example[Input, ExpectedOutput]],
    dataset_name: str,
    id: str | None = None,
) -> Dataset:
    """Create a dataset via the parent repository, retrying on failure.

    The parent's ``create_dataset`` is attempted up to five times with a
    0.5 second pause between attempts.

    Args:
        examples: The examples to store. They are materialized into a list
            up front so that every retry sees the complete set.
        dataset_name: Name passed through to the parent implementation.
        id: Optional dataset id passed through to the parent implementation.

    Returns:
        The dataset returned by the first successful attempt.

    Raises:
        Exception: Whatever the final attempt raised, re-raised unchanged.
    """
    max_attempts = 5
    retry_delay_seconds = 0.5

    # `examples` may be a one-shot iterator (e.g. a generator). If a failed
    # attempt consumed part of it, a retry would otherwise silently build a
    # truncated or empty dataset, so take a reusable snapshot first.
    examples = list(examples)

    for attempt in range(1, max_attempts + 1):
        try:
            return super().create_dataset(examples, dataset_name, id)
        except Exception:
            # NOTE(review): the original cast the error to HfHubHTTPError,
            # suggesting only Hub HTTP errors were meant to be retried;
            # the broad catch is kept to preserve existing behavior.
            print(f"Failure {attempt}")
            if attempt == max_attempts:
                # Out of attempts: propagate the last error right away
                # instead of sleeping first.
                raise
            time.sleep(retry_delay_seconds)

    # Unreachable: the loop either returns or re-raises on the last attempt.
    raise RuntimeError("Cannot create dataset on Huggingface.")
# def create_dataset(
# self,
# examples: Iterable[Example[Input, ExpectedOutput]],
# dataset_name: str,
# id: str | None = None,
# ) -> Dataset:
# failures = 0
# exception = None
# while failures < 5:
# try:
# dataset = super().create_dataset(examples, dataset_name, id)
# return dataset
# except Exception as e:
# exception = typing.cast(HfHubHTTPError, e)
# failures += 1
# print(f"Failure {failures}")
# time.sleep(0.5)
# raise exception # RuntimeError("Cannot create dataset on Huggingface.")

def delete_repository(self) -> None:
huggingface_hub.delete_repo(
Expand Down

0 comments on commit d630a8b

Please sign in to comment.