Skip to content

Commit

Permalink
Removed {item_path}/{item_name} validation on (#984)
Browse files Browse the repository at this point in the history
  • Loading branch information
JBWilkie authored Dec 18, 2024
1 parent 6752762 commit e053c42
Show file tree
Hide file tree
Showing 2 changed files with 1 addition and 132 deletions.
54 changes: 0 additions & 54 deletions darwin/dataset/upload_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,13 @@
Tuple,
Dict,
)
from rich.console import Console
import requests

from darwin.datatypes import PathLike, Slot, SourceFile
from darwin.doc_enum import DocEnum
from darwin.path_utils import construct_full_path
from darwin.utils import chunk
from darwin.utils.utils import is_image_extension_allowed_by_filename, SLOTS_GRID_MAP
from darwin.importer.importer import _console_theme

if TYPE_CHECKING:
from darwin.client import Client
Expand Down Expand Up @@ -418,7 +416,6 @@ def __init__(
self.local_files = local_files
self.dataset: RemoteDataset = dataset
self.errors: List[UploadRequestError] = []
self.skip_existing_full_remote_filepaths()
self.blocked_items, self.pending_items = self._request_upload(
handle_as_slices=handle_as_slices
)
Expand Down Expand Up @@ -466,57 +463,6 @@ def progress(self):
"""Current level of upload progress."""
return self._progress

def skip_existing_full_remote_filepaths(self) -> None:
"""
Checks if any items to be uploaded have duplicate {item_path}/{item_name} with
items already present in the remote dataset. Skip these files and display
a warning for each one.
"""
console = Console(theme=_console_theme())
full_remote_filepaths = [
Path(file.full_path) for file in self.dataset.fetch_remote_files()
]

multi_file_items_to_remove = []
local_files_to_remove = []

if self.multi_file_items:
for multi_file_item in self.multi_file_items:
if Path(multi_file_item.full_path) in full_remote_filepaths:
local_files_to_remove.extend(multi_file_item.files)
multi_file_items_to_remove.append(multi_file_item)
console.print(
f"The remote filepath {multi_file_item.full_path} is already occupied by a dataset item in the `{self.dataset.slug}` dataset. Skipping upload of item.",
style="warning",
)
if self.local_files:
for local_file in self.local_files:
if (
Path(local_file.full_path) in full_remote_filepaths
and local_file not in local_files_to_remove
):
local_files_to_remove.append(local_file)
console.print(
f"The remote filepath {local_file.full_path} already exists in the `{self.dataset.slug}` dataset. Skipping upload of item.",
style="warning",
)
self.local_files = [
local_file
for local_file in self.local_files
if local_file not in local_files_to_remove
]
if self.multi_file_items:
self.multi_file_items = [
multi_file_item
for multi_file_item in self.multi_file_items
if multi_file_item not in multi_file_items_to_remove
]

if not self.local_files and not self.multi_file_items:
raise ValueError(
"All items to be uploaded have paths that already exist in the remote dataset. No items to upload."
)

def prepare_upload(
self,
) -> Optional[Iterator[Callable[[Optional[ByteReadCallback]], None]]]:
Expand Down
79 changes: 1 addition & 78 deletions tests/darwin/dataset/upload_manager_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,10 +65,7 @@ def test_request_upload_is_not_called_on_init(
dataset: RemoteDataset, request_upload_endpoint: str
):
with patch.object(dataset, "fetch_remote_files", return_value=[]):
with patch.object(
UploadHandler, "skip_existing_full_remote_filepaths", return_value=[]
):
upload_handler = UploadHandler.build(dataset, [])
upload_handler = UploadHandler.build(dataset, [])

assert upload_handler.pending_count == 0
assert upload_handler.blocked_count == 0
Expand Down Expand Up @@ -521,77 +518,3 @@ def test_default_value_when_env_var_is_not_integer(self, mock: MagicMock):
def test_value_specified_by_env_var(self, mock: MagicMock):
assert _upload_chunk_size() == 123
mock.assert_called_once_with("DARWIN_UPLOAD_CHUNK_SIZE")


def test_skip_existing_full_remote_filepaths_with_local_files():
mock_dataset = MagicMock()
mock_dataset.fetch_remote_files.return_value = [
MagicMock(full_path="/existing_file_1.jpg"),
MagicMock(full_path="/existing_file_2.jpg"),
]
mock_dataset.slug = "test-dataset"

local_file_1 = MagicMock(full_path="/existing_file_1.jpg")
local_file_2 = MagicMock(full_path="/new_file.jpg")

with patch("darwin.dataset.upload_manager.Console.print") as mock_print:
upload_handler = UploadHandlerV2(mock_dataset, [local_file_1, local_file_2], [])

assert local_file_1 not in upload_handler.local_files
assert local_file_2 in upload_handler.local_files

mock_print.assert_any_call(
"The remote filepath /existing_file_1.jpg already exists in the `test-dataset` dataset. Skipping upload of item.",
style="warning",
)


def test_skip_existing_full_remote_filepaths_with_multi_file_items():
mock_dataset = MagicMock()
mock_dataset.fetch_remote_files.return_value = [
MagicMock(full_path="/existing_multi_file_item.jpg"),
]
mock_dataset.slug = "test-dataset"

multi_file_item_1 = MagicMock(
full_path="/existing_multi_file_item.jpg", files=[MagicMock()]
)
multi_file_item_2 = MagicMock(
full_path="/new_multi_file_item.jpg", files=[MagicMock()]
)

with patch("darwin.dataset.upload_manager.Console.print") as mock_print:
upload_handler = UploadHandlerV2(
mock_dataset, [], [multi_file_item_1, multi_file_item_2]
)

assert multi_file_item_1 not in upload_handler.multi_file_items
assert multi_file_item_2 in upload_handler.multi_file_items

# Verify that the correct warning was printed
mock_print.assert_any_call(
"The remote filepath /existing_multi_file_item.jpg is already occupied by a dataset item in the `test-dataset` dataset. Skipping upload of item.",
style="warning",
)


def test_skip_existing_full_remote_filepaths_raises_if_no_files_left():
mock_dataset = MagicMock()
mock_dataset.fetch_remote_files.return_value = [
MagicMock(full_path="/existing_multi_file_item_1.jpg"),
MagicMock(full_path="/existing_multi_file_item_2.jpg"),
]
mock_dataset.slug = "test-dataset"

multi_file_item_1 = MagicMock(
full_path="/existing_multi_file_item_1.jpg", files=[MagicMock()]
)
multi_file_item_2 = MagicMock(
full_path="/existing_multi_file_item_2.jpg", files=[MagicMock()]
)

with pytest.raises(
ValueError,
match="All items to be uploaded have paths that already exist in the remote dataset. No items to upload.",
):
UploadHandlerV2(mock_dataset, [], [multi_file_item_1, multi_file_item_2])

0 comments on commit e053c42

Please sign in to comment.