From 27c2fde1e5e645a3d5a3fbc04b8d147db495449a Mon Sep 17 00:00:00 2001 From: John Wilkie <124276291+JBWilkie@users.noreply.github.com> Date: Wed, 23 Oct 2024 14:54:16 +0100 Subject: [PATCH] Initial commit (#944) --- darwin/dataset/upload_manager.py | 15 +++++++--- tests/darwin/dataset/upload_manager_test.py | 31 +++++++++++++++++++-- 2 files changed, 39 insertions(+), 7 deletions(-) diff --git a/darwin/dataset/upload_manager.py b/darwin/dataset/upload_manager.py index 50c0c83a3..9997c7630 100644 --- a/darwin/dataset/upload_manager.py +++ b/darwin/dataset/upload_manager.py @@ -452,18 +452,20 @@ def skip_existing_full_remote_filepaths(self) -> None: local_files_to_remove.extend(multi_file_item.files) multi_file_items_to_remove.append(multi_file_item) console.print( - f"The remote filepath {multi_file_item.full_path} is already occupied by a dataset item in the {self.dataset.slug} dataset. Skipping upload of item.", + f"The remote filepath {multi_file_item.full_path} is already occupied by a dataset item in the `{self.dataset.slug}` dataset. Skipping upload of item.", style="warning", ) if self.local_files: for local_file in self.local_files: - if Path(local_file.full_path) in full_remote_filepaths: + if ( + Path(local_file.full_path) in full_remote_filepaths + and local_file not in local_files_to_remove + ): local_files_to_remove.append(local_file) console.print( - f"The remote filepath {local_file.full_path} already exists in the {self.dataset.slug} dataset. Skipping upload of item.", + f"The remote filepath {local_file.full_path} already exists in the `{self.dataset.slug}` dataset. Skipping upload of item.", style="warning", ) - self.local_files = [ local_file for local_file in self.local_files @@ -476,6 +478,11 @@ def skip_existing_full_remote_filepaths(self) -> None: if multi_file_item not in multi_file_items_to_remove ] + if not self.local_files and not self.multi_file_items: + raise ValueError( + "All items to be uploaded have paths that already exist in the remote dataset. No items to upload." + ) + def prepare_upload( self, ) -> Optional[Iterator[Callable[[Optional[ByteReadCallback]], None]]]: diff --git a/tests/darwin/dataset/upload_manager_test.py b/tests/darwin/dataset/upload_manager_test.py index b2e503286..914ab7e19 100644 --- a/tests/darwin/dataset/upload_manager_test.py +++ b/tests/darwin/dataset/upload_manager_test.py @@ -63,7 +63,10 @@ def test_request_upload_is_not_called_on_init( dataset: RemoteDataset, request_upload_endpoint: str ): with patch.object(dataset, "fetch_remote_files", return_value=[]): - upload_handler = UploadHandler.build(dataset, []) + with patch.object( + UploadHandler, "skip_existing_full_remote_filepaths", return_value=[] + ): + upload_handler = UploadHandler.build(dataset, []) assert upload_handler.pending_count == 0 assert upload_handler.blocked_count == 0 @@ -446,7 +449,7 @@ def test_skip_existing_full_remote_filepaths_with_local_files(): assert local_file_2 in upload_handler.local_files mock_print.assert_any_call( - "The remote filepath /existing_file_1.jpg already exists in the test-dataset dataset. Skipping upload of item.", + "The remote filepath /existing_file_1.jpg already exists in the `test-dataset` dataset. Skipping upload of item.", style="warning", ) @@ -475,6 +478,28 @@ def test_skip_existing_full_remote_filepaths_with_multi_file_items(): # Verify that the correct warning was printed mock_print.assert_any_call( - "The remote filepath /existing_multi_file_item.jpg is already occupied by a dataset item in the test-dataset dataset. Skipping upload of item.", + "The remote filepath /existing_multi_file_item.jpg is already occupied by a dataset item in the `test-dataset` dataset. Skipping upload of item.", style="warning", ) + + +def test_skip_existing_full_remote_filepaths_raises_if_no_files_left(): + mock_dataset = MagicMock() + mock_dataset.fetch_remote_files.return_value = [ + MagicMock(full_path="/existing_multi_file_item_1.jpg"), + MagicMock(full_path="/existing_multi_file_item_2.jpg"), + ] + mock_dataset.slug = "test-dataset" + + multi_file_item_1 = MagicMock( + full_path="/existing_multi_file_item_1.jpg", files=[MagicMock()] + ) + multi_file_item_2 = MagicMock( + full_path="/existing_multi_file_item_2.jpg", files=[MagicMock()] + ) + + with pytest.raises( + ValueError, + match="All items to be uploaded have paths that already exist in the remote dataset. No items to upload.", + ): + UploadHandlerV2(mock_dataset, [], [multi_file_item_1, multi_file_item_2])