From fed26939a03cce653a80dfe032b1a9fe94050f03 Mon Sep 17 00:00:00 2001 From: Patryk Gala Date: Tue, 10 Oct 2023 10:06:03 +0200 Subject: [PATCH 1/7] Revert "CHANGELOG" This reverts commit 10ed33fbcf35bf7ed316e7d36aec567c1a40237e. --- CHANGELOG.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 844903574..90fc38541 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,8 +3,6 @@ ### Changes - Upgraded performance of sending series data to Neptune ([#1483](https://github.com/neptune-ai/neptune-client/pull/1483)) - Compress (gzip) request to server, when server support it ([#1476](https://github.com/neptune-ai/neptune-client/pull/1476)) -- Support for batch processing parameters from env variables ([#1495](https://github.com/neptune-ai/neptune-client/pull/1495)) - ## neptune 1.8.1 From 6a192ee22f9b14c2d4530a9fe7a2c8c863db4d00 Mon Sep 17 00:00:00 2001 From: Patryk Gala Date: Tue, 10 Oct 2023 10:06:03 +0200 Subject: [PATCH 2/7] Revert "Support for batch processing parameters from env variables" This reverts commit 32c9ffcfba00dcb1cbf3882c4a76c4cb1ddf687c. --- src/neptune/envs.py | 10 ++---- .../async_operation_processor.py | 32 ++++++++----------- .../internal/operation_processors/factory.py | 10 ++---- 3 files changed, 18 insertions(+), 34 deletions(-) diff --git a/src/neptune/envs.py b/src/neptune/envs.py index 3c3f712b3..e73e739dc 100644 --- a/src/neptune/envs.py +++ b/src/neptune/envs.py @@ -24,9 +24,7 @@ "NEPTUNE_NOTEBOOK_PATH", "NEPTUNE_RETRIES_TIMEOUT_ENV", "NEPTUNE_SYNC_BATCH_TIMEOUT_ENV", - "NEPTUNE_ASYNC_MAX_POINTS_PER_BATCH", - "NEPTUNE_ASYNC_MAX_ATTRIBUTES_IN_BATCH", - "NEPTUNE_ASYNC_MAX_POINTS_PER_ATTRIBUTE", + "NEPTUNE_ASYNC_BATCH_SIZE", "NEPTUNE_SUBPROCESS_KILL_TIMEOUT", "NEPTUNE_FETCH_TABLE_STEP_SIZE", "NEPTUNE_SYNC_AFTER_STOP_TIMEOUT", @@ -62,11 +60,7 @@ NEPTUNE_SYNC_AFTER_STOP_TIMEOUT = "NEPTUNE_SYNC_AFTER_STOP_TIMEOUT" -NEPTUNE_ASYNC_MAX_POINTS_PER_BATCH = "NEPTUNE_ASYNC_MAX_POINTS_PER_BATCH" - -NEPTUNE_ASYNC_MAX_ATTRIBUTES_IN_BATCH = "NEPTUNE_ASYNC_MAX_ATTRIBUTES_IN_BATCH" - -NEPTUNE_ASYNC_MAX_POINTS_PER_ATTRIBUTE = "NEPTUNE_ASYNC_MAX_POINTS_PER_ATTRIBUTE" +NEPTUNE_ASYNC_BATCH_SIZE = "NEPTUNE_ASYNC_BATCH_SIZE" NEPTUNE_REQUEST_TIMEOUT = "NEPTUNE_REQUEST_TIMEOUT" diff --git a/src/neptune/internal/operation_processors/async_operation_processor.py b/src/neptune/internal/operation_processors/async_operation_processor.py index d95b1b383..be8e48388 100644 --- a/src/neptune/internal/operation_processors/async_operation_processor.py +++ b/src/neptune/internal/operation_processors/async_operation_processor.py @@ -27,6 +27,7 @@ from typing import ( TYPE_CHECKING, Callable, + ClassVar, Optional, ) @@ -59,7 +60,6 @@ class AsyncOperationProcessor(OperationProcessor): - QUEUE_RECORDS_TO_WAKEUP = 500 STOP_QUEUE_STATUS_UPDATE_FREQ_SECONDS = 30 STOP_QUEUE_MAX_TIME_NO_CONNECTION_SECONDS = int(os.getenv(NEPTUNE_SYNC_AFTER_STOP_TIMEOUT, DEFAULT_STOP_TIMEOUT)) @@ -69,10 +69,8 @@ def __init__( container_type: ContainerType, backend: NeptuneBackend, lock: threading.RLock, - max_points_per_attribute: int, - max_points_per_batch: int, - max_attributes_in_batch: int, sleep_time: float = 5, + batch_size: int = 1000, async_lag_callback: Optional[Callable[[], None]] = None, async_lag_threshold: float = ASYNC_LAG_THRESHOLD, async_no_progress_callback: Optional[Callable[[], None]] = None, @@ -90,19 +88,14 @@ def __init__( self._container_id = container_id self._container_type = container_type self._backend = backend + self._batch_size = batch_size self._async_lag_callback = 
async_lag_callback or (lambda: None) self._async_lag_threshold = async_lag_threshold self._async_no_progress_callback = async_no_progress_callback or (lambda: None) self._async_no_progress_threshold = async_no_progress_threshold self._last_version = 0 self._consumed_version = 0 - self._consumer = self.ConsumerThread( - self, - sleep_time=sleep_time, - max_points_per_attribute=max_points_per_attribute, - max_points_per_batch=max_points_per_batch, - max_attributes_in_batch=max_attributes_in_batch, - ) + self._consumer = self.ConsumerThread(self, sleep_time, batch_size) self._lock = lock self._last_ack = None self._lag_exceeded = False @@ -123,7 +116,7 @@ def enqueue_operation(self, op: Operation, *, wait: bool) -> None: self._check_lag() self._check_no_progress() - if self._queue.size() > AsyncOperationProcessor.QUEUE_RECORDS_TO_WAKEUP: + if self._queue.size() > self._batch_size / 2: self._consumer.wake_up() if wait: self.wait() @@ -267,24 +260,27 @@ def close(self): self._queue.close() class ConsumerThread(Daemon): + MAX_POINTS_PER_BATCH: ClassVar[int] = 100000 + MAX_ATTRIBUTES_IN_BATCH: ClassVar[int] = 1000 + MAX_POINTS_PER_ATTRIBUTE: ClassVar[int] = 10000 + def __init__( self, processor: "AsyncOperationProcessor", sleep_time: float, - max_points_per_batch: int, - max_attributes_in_batch: int, - max_points_per_attribute: int, + batch_size: int, ): super().__init__(sleep_time=sleep_time, name="NeptuneAsyncOpProcessor") self._processor = processor + self._batch_size = batch_size self._last_flush = 0 self._no_progress_exceeded = False self._batcher = Batcher( queue=self._processor._queue, backend=self._processor._backend, - max_points_per_batch=max_points_per_batch, - max_attributes_in_batch=max_attributes_in_batch, - max_points_per_attribute=max_points_per_attribute, + max_points_per_batch=self.MAX_POINTS_PER_BATCH, + max_attributes_in_batch=self.MAX_ATTRIBUTES_IN_BATCH, + max_points_per_attribute=self.MAX_POINTS_PER_ATTRIBUTE, ) def run(self): diff --git a/src/neptune/internal/operation_processors/factory.py b/src/neptune/internal/operation_processors/factory.py index 14f89b10a..7bbf4fe7d 100644 --- a/src/neptune/internal/operation_processors/factory.py +++ b/src/neptune/internal/operation_processors/factory.py @@ -23,11 +23,7 @@ Optional, ) -from neptune.envs import ( - NEPTUNE_ASYNC_MAX_ATTRIBUTES_IN_BATCH, - NEPTUNE_ASYNC_MAX_POINTS_PER_ATTRIBUTE, - NEPTUNE_ASYNC_MAX_POINTS_PER_BATCH, -) +from neptune.envs import NEPTUNE_ASYNC_BATCH_SIZE from neptune.internal.backends.neptune_backend import NeptuneBackend from neptune.internal.container_type import ContainerType from neptune.internal.id_formats import UniqueId @@ -63,9 +59,7 @@ def get_operation_processor( backend=backend, lock=lock, sleep_time=flush_period, - max_points_per_batch=int(os.environ.get(NEPTUNE_ASYNC_MAX_POINTS_PER_BATCH, "100000")), - max_points_per_attribute=int(os.environ.get(NEPTUNE_ASYNC_MAX_POINTS_PER_ATTRIBUTE, "10000")), - max_attributes_in_batch=int(os.environ.get(NEPTUNE_ASYNC_MAX_ATTRIBUTES_IN_BATCH, "1000")), + batch_size=int(os.environ.get(NEPTUNE_ASYNC_BATCH_SIZE) or "1000"), async_lag_callback=async_lag_callback, async_lag_threshold=async_lag_threshold, async_no_progress_callback=async_no_progress_callback, From be376c66762f3ed8416c6f9b26f6cbafb134d94d Mon Sep 17 00:00:00 2001 From: Patryk Gala Date: Tue, 10 Oct 2023 10:06:03 +0200 Subject: [PATCH 3/7] Revert "Update CHANGELOG.md" This reverts commit c35dccd33acf70a3dc03f243057b8117b2e39353. 
--- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 90fc38541..e656efbb9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## [UNRELEASED] neptune 1.8.2 +## [UNRELEASED] neptune 1.8.1 ### Changes - Upgraded performance of sending series data to Neptune ([#1483](https://github.com/neptune-ai/neptune-client/pull/1483)) From 519678f719096c962cdaff42be1c9c56cac625f6 Mon Sep 17 00:00:00 2001 From: Patryk Gala Date: Tue, 10 Oct 2023 10:06:03 +0200 Subject: [PATCH 4/7] Revert "Fixed, refactored, covered with tests and proper typings for batching (#1488)" This reverts commit ee00b19db135b1e3ebf86547c449dc6455f6a7d1. --- CHANGELOG.md | 2 +- pyproject.toml | 1 + .../backends/hosted_neptune_backend.py | 54 +- .../internal/backends/neptune_backend.py | 14 - .../internal/backends/neptune_backend_mock.py | 19 - .../backends/operations_preprocessor.py | 414 ++++++++++++++ src/neptune/internal/operation.py | 18 +- .../async_operation_processor.py | 104 ++-- .../internal/operation_processors/batcher.py | 119 ----- src/neptune/internal/preprocessor/__init__.py | 15 - .../preprocessor/accumulated_operations.py | 44 -- .../internal/preprocessor/exceptions.py | 20 - .../preprocessor/operations_accumulator.py | 365 ------------- .../preprocessor/operations_preprocessor.py | 116 ---- src/neptune/internal/queue/disk_queue.py | 2 +- .../backends/test_operations_preprocessor.py | 416 +++++++++++++++ .../internal/operation_processors/__init__.py | 15 - .../operation_processors/test_batcher.py | 352 ------------ .../new/internal/preprocessor/__init__.py | 15 - .../test_operations_accumulator.py | 505 ------------------ .../test_operations_preprocessor.py | 423 --------------- 21 files changed, 901 insertions(+), 2132 deletions(-) create mode 100644 src/neptune/internal/backends/operations_preprocessor.py delete mode 100644 src/neptune/internal/operation_processors/batcher.py delete mode 100644 src/neptune/internal/preprocessor/__init__.py delete mode 100644 src/neptune/internal/preprocessor/accumulated_operations.py delete mode 100644 src/neptune/internal/preprocessor/exceptions.py delete mode 100644 src/neptune/internal/preprocessor/operations_accumulator.py delete mode 100644 src/neptune/internal/preprocessor/operations_preprocessor.py create mode 100644 tests/unit/neptune/new/internal/backends/test_operations_preprocessor.py delete mode 100644 tests/unit/neptune/new/internal/operation_processors/__init__.py delete mode 100644 tests/unit/neptune/new/internal/operation_processors/test_batcher.py delete mode 100644 tests/unit/neptune/new/internal/preprocessor/__init__.py delete mode 100644 tests/unit/neptune/new/internal/preprocessor/test_operations_accumulator.py delete mode 100644 tests/unit/neptune/new/internal/preprocessor/test_operations_preprocessor.py diff --git a/CHANGELOG.md b/CHANGELOG.md index e656efbb9..e3b779867 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## [UNRELEASED] neptune 1.8.1 +## [UNRELEASED] neptune 1.9.0 ### Changes - Upgraded performance of sending series data to Neptune ([#1483](https://github.com/neptune-ai/neptune-client/pull/1483)) diff --git a/pyproject.toml b/pyproject.toml index ec89cd736..ce02ea0f1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -319,6 +319,7 @@ module = [ "neptune.internal.backends.offline_neptune_backend", "neptune.internal.backends.operation_api_name_visitor", "neptune.internal.backends.operation_api_object_converter", + 
"neptune.internal.backends.operations_preprocessor", "neptune.internal.backends.project_name_lookup", "neptune.internal.backends.swagger_client_wrapper", "neptune.internal.backends.utils", diff --git a/src/neptune/internal/backends/hosted_neptune_backend.py b/src/neptune/internal/backends/hosted_neptune_backend.py index fa350151f..ac4f7a4a1 100644 --- a/src/neptune/internal/backends/hosted_neptune_backend.py +++ b/src/neptune/internal/backends/hosted_neptune_backend.py @@ -111,6 +111,7 @@ from neptune.internal.backends.nql import NQLQuery from neptune.internal.backends.operation_api_name_visitor import OperationApiNameVisitor from neptune.internal.backends.operation_api_object_converter import OperationApiObjectConverter +from neptune.internal.backends.operations_preprocessor import OperationsPreprocessor from neptune.internal.backends.utils import ( ExecuteOperationsBatchingManager, MissingApiClient, @@ -132,7 +133,6 @@ UploadFileSet, ) from neptune.internal.operation_processors.operation_storage import OperationStorage -from neptune.internal.preprocessor.operations_preprocessor import OperationsPreprocessor from neptune.internal.utils import base64_decode from neptune.internal.utils.generic_attribute_mapper import map_attribute_result_to_value from neptune.internal.utils.git import GitInfo @@ -145,7 +145,6 @@ from bravado.requests_client import RequestsClient from neptune.internal.backends.api_model import ClientConfig - from neptune.internal.preprocessor.accumulated_operations import AccumulatedOperations _logger = logging.getLogger(__name__) @@ -460,7 +459,7 @@ def execute_operations( operations_preprocessor = OperationsPreprocessor() operations_preprocessor.process_batch(operations_batch.operations) - preprocessed_operations = operations_preprocessor.accumulate_operations() + preprocessed_operations = operations_preprocessor.get_operations() errors.extend(preprocessed_operations.errors) if preprocessed_operations.artifact_operations: @@ -504,53 +503,6 @@ def execute_operations( errors, ) - def execute_operations_from_accumulator( - self, - container_id: UniqueId, - container_type: ContainerType, - accumulated_operations: "AccumulatedOperations", - operation_storage: OperationStorage, - ) -> Tuple[int, List[NeptuneException]]: - errors: List[NeptuneException] = accumulated_operations.errors - - if accumulated_operations.artifact_operations: - self.verify_feature_available(OptionalFeatures.ARTIFACTS) - - # Upload operations should be done first since they are idempotent - if accumulated_operations.upload_operations: - errors.extend( - self._execute_upload_operations_with_400_retry( - container_id=container_id, - container_type=container_type, - upload_operations=accumulated_operations.upload_operations, - operation_storage=operation_storage, - ) - ) - - if accumulated_operations.artifact_operations: - artifact_operations_errors, assign_artifact_operations = self._execute_artifact_operations( - container_id=container_id, - container_type=container_type, - artifact_operations=accumulated_operations.artifact_operations, - ) - - errors.extend(artifact_operations_errors) - accumulated_operations.other_operations.extend(assign_artifact_operations) - - if accumulated_operations.other_operations: - errors.extend( - self._execute_operations( - container_id, - container_type, - operations=accumulated_operations.other_operations, - ) - ) - - for op in itertools.chain(accumulated_operations.upload_operations, accumulated_operations.other_operations): - op.clean(operation_storage=operation_storage) 
- - return accumulated_operations.source_operations_count, errors - def _execute_upload_operations( self, container_id: str, @@ -558,7 +510,7 @@ def _execute_upload_operations( upload_operations: List[Operation], operation_storage: OperationStorage, ) -> List[NeptuneException]: - errors: List[NeptuneException] = list() + errors = list() if self._client_config.has_feature(OptionalFeatures.MULTIPART_UPLOAD): multipart_config = self._client_config.multipart_config diff --git a/src/neptune/internal/backends/neptune_backend.py b/src/neptune/internal/backends/neptune_backend.py index 2213b9ff9..8fae59b30 100644 --- a/src/neptune/internal/backends/neptune_backend.py +++ b/src/neptune/internal/backends/neptune_backend.py @@ -17,7 +17,6 @@ import abc from typing import ( - TYPE_CHECKING, Any, List, Optional, @@ -60,9 +59,6 @@ from neptune.internal.utils.git import GitInfo from neptune.internal.websockets.websockets_factory import WebsocketsFactory -if TYPE_CHECKING: - from neptune.internal.preprocessor.accumulated_operations import AccumulatedOperations - class NeptuneBackend: def close(self) -> None: @@ -147,16 +143,6 @@ def execute_operations( ) -> Tuple[int, List[NeptuneException]]: pass - @abc.abstractmethod - def execute_operations_from_accumulator( - self, - container_id: UniqueId, - container_type: ContainerType, - accumulated_operations: "AccumulatedOperations", - operation_storage: OperationStorage, - ) -> Tuple[int, List[NeptuneException]]: - pass - @abc.abstractmethod def get_attributes(self, container_id: str, container_type: ContainerType) -> List[Attribute]: pass diff --git a/src/neptune/internal/backends/neptune_backend_mock.py b/src/neptune/internal/backends/neptune_backend_mock.py index aa6424a42..fce014ab2 100644 --- a/src/neptune/internal/backends/neptune_backend_mock.py +++ b/src/neptune/internal/backends/neptune_backend_mock.py @@ -21,7 +21,6 @@ from datetime import datetime from shutil import copyfile from typing import ( - TYPE_CHECKING, Any, Dict, Iterable, @@ -133,10 +132,6 @@ from neptune.types.value import Value from neptune.types.value_visitor import ValueVisitor -if TYPE_CHECKING: - from neptune.internal.preprocessor.accumulated_operations import AccumulatedOperations - - Val = TypeVar("Val", bound=Value) @@ -297,20 +292,6 @@ def execute_operations( result.append(e) return len(operations), result - def execute_operations_from_accumulator( - self, - container_id: UniqueId, - container_type: ContainerType, - accumulated_operations: "AccumulatedOperations", - operation_storage: OperationStorage, - ) -> Tuple[int, List[NeptuneException]]: - return self.execute_operations( - container_id=container_id, - container_type=container_type, - operations=accumulated_operations.all_operations(), - operation_storage=operation_storage, - ) - def _execute_operation( self, container_id: UniqueId, container_type: ContainerType, op: Operation, operation_storage: OperationStorage ) -> None: diff --git a/src/neptune/internal/backends/operations_preprocessor.py b/src/neptune/internal/backends/operations_preprocessor.py new file mode 100644 index 000000000..7e6816a60 --- /dev/null +++ b/src/neptune/internal/backends/operations_preprocessor.py @@ -0,0 +1,414 @@ +# +# Copyright (c) 2022, Neptune Labs Sp. z o.o. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +__all__ = ["OperationsPreprocessor"] + +import dataclasses +import typing +from enum import Enum +from typing import ( + Callable, + List, + Type, + TypeVar, +) + +from neptune.common.exceptions import InternalClientError +from neptune.exceptions import MetadataInconsistency +from neptune.internal.operation import ( + AddStrings, + AssignArtifact, + AssignBool, + AssignDatetime, + AssignFloat, + AssignInt, + AssignString, + ClearFloatLog, + ClearImageLog, + ClearStringLog, + ClearStringSet, + ConfigFloatSeries, + CopyAttribute, + DeleteAttribute, + DeleteFiles, + LogFloats, + LogImages, + LogOperation, + LogStrings, + Operation, + RemoveStrings, + TrackFilesToArtifact, + UploadFile, + UploadFileContent, + UploadFileSet, +) +from neptune.internal.operation_visitor import OperationVisitor +from neptune.internal.utils.paths import path_to_str + +T = TypeVar("T") + + +class RequiresPreviousCompleted(Exception): + """indicates that previous operations must be synchronized with server before preprocessing current one""" + + +@dataclasses.dataclass +class AccumulatedOperations: + upload_operations: List[Operation] = dataclasses.field(default_factory=list) + artifact_operations: List[TrackFilesToArtifact] = dataclasses.field(default_factory=list) + other_operations: List[Operation] = dataclasses.field(default_factory=list) + + errors: List[MetadataInconsistency] = dataclasses.field(default_factory=list) + + def all_operations(self) -> List[Operation]: + return self.upload_operations + self.artifact_operations + self.other_operations + + +class OperationsPreprocessor: + def __init__(self): + self._accumulators: typing.Dict[str, "_OperationsAccumulator"] = dict() + self.processed_ops_count = 0 + self.final_ops_count = 0 + self.final_append_count = 0 + + def process(self, operation: Operation) -> bool: + """Adds a single operation to its processed list. + Returns `False` iff the new operation can't be in queue until one of already enqueued operations gets + synchronized with server first. 
+ """ + try: + self._process_op(operation) + self.processed_ops_count += 1 + return True + except RequiresPreviousCompleted: + return False + + def process_batch(self, operations: List[Operation]) -> None: + for op in operations: + if not self.process(op): + return + + def _process_op(self, op: Operation) -> "_OperationsAccumulator": + path_str = path_to_str(op.path) + target_acc = self._accumulators.setdefault(path_str, _OperationsAccumulator(op.path)) + old_ops_count, old_append_count = target_acc.get_op_count(), target_acc.get_append_count() + target_acc.visit(op) + self.final_ops_count += target_acc.get_op_count() - old_ops_count + self.final_append_count += target_acc.get_append_count() - old_append_count + return target_acc + + @staticmethod + def is_file_op(op: Operation): + return isinstance(op, (UploadFile, UploadFileContent, UploadFileSet)) + + @staticmethod + def is_artifact_op(op: Operation): + return isinstance(op, TrackFilesToArtifact) + + def get_operations(self) -> AccumulatedOperations: + result = AccumulatedOperations() + for _, acc in sorted(self._accumulators.items()): + acc: "_OperationsAccumulator" + for op in acc.get_operations(): + if self.is_artifact_op(op): + result.artifact_operations.append(op) + elif self.is_file_op(op): + result.upload_operations.append(op) + else: + result.other_operations.append(op) + result.errors.extend(acc.get_errors()) + + return result + + +class _DataType(Enum): + FLOAT = "Float" + INT = "Int" + BOOL = "Bool" + STRING = "String" + FILE = "File" + DATETIME = "Datetime" + FILE_SET = "File Set" + FLOAT_SERIES = "Float Series" + STRING_SERIES = "String Series" + IMAGE_SERIES = "Image Series" + STRING_SET = "String Set" + ARTIFACT = "Artifact" + + def is_file_op(self) -> bool: + return self in (self.FILE, self.FILE_SET) + + def is_artifact_op(self) -> bool: + return self in (self.ARTIFACT,) + + +class _OperationsAccumulator(OperationVisitor[None]): + def __init__(self, path: List[str]): + self._path = path + self._type: typing.Optional[_DataType] = None + self._delete_ops = [] + self._modify_ops = [] + self._config_ops = [] + self._errors = [] + self._ops_count = 0 + self._append_count = 0 + + def get_operations(self) -> List[Operation]: + return self._delete_ops + self._modify_ops + self._config_ops + + def get_errors(self) -> List[MetadataInconsistency]: + return self._errors + + def get_op_count(self) -> int: + return self._ops_count + + def get_append_count(self) -> int: + return self._append_count + + def _check_prerequisites(self, op: Operation): + if (OperationsPreprocessor.is_file_op(op) or OperationsPreprocessor.is_artifact_op(op)) and len( + self._delete_ops + ) > 0: + raise RequiresPreviousCompleted() + + def _process_modify_op( + self, + expected_type: _DataType, + op: Operation, + modifier: Callable[[List[Operation], Operation], List[Operation]], + ) -> None: + + if self._type and self._type != expected_type: + # This case should never happen since inconsistencies on data types are verified on user api. + # So such operations should not appear in the queue without delete operation between them. + # Still we want to support this case to avoid some unclear dependencies and assumptions. 
+ self._errors.append( + MetadataInconsistency( + "Cannot perform {} operation on {}: Attribute is not a {}".format( + op.__class__.__name__, + path_to_str(self._path), + expected_type.value, + ) + ) + ) + else: + self._check_prerequisites(op) + self._type = expected_type + old_op_count = len(self._modify_ops) + self._modify_ops = modifier(self._modify_ops, op) + self._ops_count += len(self._modify_ops) - old_op_count + + def _process_config_op(self, expected_type: _DataType, op: Operation) -> None: + + if self._type and self._type != expected_type: + # This case should never happen since inconsistencies on data types are verified on user api. + # So such operations should not appear in the queue without delete operation between them. + # Still we want to support this case to avoid some unclear dependencies and assumptions. + self._errors.append( + MetadataInconsistency( + "Cannot perform {} operation on {}: Attribute is not a {}".format( + op.__class__.__name__, + path_to_str(self._path), + expected_type.value, + ) + ) + ) + else: + self._check_prerequisites(op) + self._type = expected_type + old_op_count = len(self._config_ops) + self._config_ops = [op] + self._ops_count += len(self._config_ops) - old_op_count + + def visit_assign_float(self, op: AssignFloat) -> None: + self._process_modify_op(_DataType.FLOAT, op, self._assign_modifier()) + + def visit_assign_int(self, op: AssignInt) -> None: + self._process_modify_op(_DataType.INT, op, self._assign_modifier()) + + def visit_assign_bool(self, op: AssignBool) -> None: + self._process_modify_op(_DataType.BOOL, op, self._assign_modifier()) + + def visit_assign_string(self, op: AssignString) -> None: + self._process_modify_op(_DataType.STRING, op, self._assign_modifier()) + + def visit_assign_datetime(self, op: AssignDatetime) -> None: + self._process_modify_op(_DataType.DATETIME, op, self._assign_modifier()) + + def visit_upload_file(self, op: UploadFile) -> None: + self._process_modify_op(_DataType.FILE, op, self._assign_modifier()) + + def visit_upload_file_content(self, op: UploadFileContent) -> None: + self._process_modify_op(_DataType.FILE, op, self._assign_modifier()) + + def visit_assign_artifact(self, op: AssignArtifact) -> None: + self._process_modify_op(_DataType.ARTIFACT, op, self._assign_modifier()) + + def visit_upload_file_set(self, op: UploadFileSet) -> None: + if op.reset: + self._process_modify_op(_DataType.FILE_SET, op, self._assign_modifier()) + else: + self._process_modify_op(_DataType.FILE_SET, op, self._add_modifier()) + + def visit_log_floats(self, op: LogFloats) -> None: + self._process_modify_op( + _DataType.FLOAT_SERIES, + op, + self._log_modifier( + LogFloats, + ClearFloatLog, + lambda op1, op2: LogFloats(op1.path, op1.values + op2.values), + ), + ) + + def visit_log_strings(self, op: LogStrings) -> None: + self._process_modify_op( + _DataType.STRING_SERIES, + op, + self._log_modifier( + LogStrings, + ClearStringLog, + lambda op1, op2: LogStrings(op1.path, op1.values + op2.values), + ), + ) + + def visit_log_images(self, op: LogImages) -> None: + self._process_modify_op( + _DataType.IMAGE_SERIES, + op, + self._log_modifier( + LogImages, + ClearImageLog, + lambda op1, op2: LogImages(op1.path, op1.values + op2.values), + ), + ) + + def visit_clear_float_log(self, op: ClearFloatLog) -> None: + self._process_modify_op(_DataType.FLOAT_SERIES, op, self._clear_modifier()) + + def visit_clear_string_log(self, op: ClearStringLog) -> None: + self._process_modify_op(_DataType.STRING_SERIES, op, self._clear_modifier()) + + 
def visit_clear_image_log(self, op: ClearImageLog) -> None: + self._process_modify_op(_DataType.IMAGE_SERIES, op, self._clear_modifier()) + + def visit_add_strings(self, op: AddStrings) -> None: + self._process_modify_op(_DataType.STRING_SET, op, self._add_modifier()) + + def visit_clear_string_set(self, op: ClearStringSet) -> None: + self._process_modify_op(_DataType.STRING_SET, op, self._clear_modifier()) + + def visit_remove_strings(self, op: RemoveStrings) -> None: + self._process_modify_op(_DataType.STRING_SET, op, self._remove_modifier()) + + def visit_config_float_series(self, op: ConfigFloatSeries) -> None: + self._process_config_op(_DataType.FLOAT_SERIES, op) + + def visit_delete_files(self, op: DeleteFiles) -> None: + self._process_modify_op(_DataType.FILE_SET, op, self._add_modifier()) + + def visit_delete_attribute(self, op: DeleteAttribute) -> None: + if self._type: + if self._delete_ops: + # Keep existing delete operation and simply clear all modification operations after it + self._modify_ops = [] + self._config_ops = [] + self._type = None + self._ops_count = len(self._delete_ops) + self._append_count = 0 + else: + # This case is tricky. There was no delete operation, but some modifications was performed. + # We do not know if this attribute exists on server side and we do not want a delete op to fail. + # So we need to send a single modification before delete to be sure a delete op is valid. + self._delete_ops = [self._modify_ops[0], op] + self._modify_ops = [] + self._config_ops = [] + self._type = None + self._ops_count = len(self._delete_ops) + self._append_count = 0 + else: + if self._delete_ops: + # Do nothing if there already is a delete operation + # and no other operations was performed after it + return + else: + # If value has not been set locally yet and no delete operation was performed, + # simply perform single delete operation. 
+ self._delete_ops.append(op) + self._ops_count = len(self._delete_ops) + + @staticmethod + def _artifact_log_modifier( + ops: List[TrackFilesToArtifact], new_op: TrackFilesToArtifact + ) -> List[TrackFilesToArtifact]: + if len(ops) == 0: + return [new_op] + + # There should be exactly 1 operation, merge it with new_op + assert len(ops) == 1 + op_old = ops[0] + assert op_old.path == new_op.path + assert op_old.project_id == new_op.project_id + return [TrackFilesToArtifact(op_old.path, op_old.project_id, op_old.entries + new_op.entries)] + + def visit_track_files_to_artifact(self, op: TrackFilesToArtifact) -> None: + self._process_modify_op(_DataType.ARTIFACT, op, self._artifact_log_modifier) + + def visit_clear_artifact(self, op: ClearStringSet) -> None: + self._process_modify_op(_DataType.ARTIFACT, op, self._clear_modifier()) + + def visit_copy_attribute(self, op: CopyAttribute) -> None: + raise MetadataInconsistency("No CopyAttribute should reach accumulator") + + @staticmethod + def _assign_modifier(): + return lambda ops, new_op: [new_op] + + def _clear_modifier(self): + def modifier(ops: List[Operation], new_op: Operation): + for op in ops: + if isinstance(op, LogOperation): + self._append_count -= op.value_count() + return [new_op] + + return modifier + + def _log_modifier(self, log_op_class: Type[LogOperation], clear_op_class: type, log_combine: Callable[[T, T], T]): + def modifier(ops: List[Operation], new_op: Operation): + if len(ops) == 0: + res = [new_op] + elif len(ops) == 1 and isinstance(ops[0], log_op_class): + res = [log_combine(ops[0], new_op)] + elif len(ops) == 1 and isinstance(ops[0], clear_op_class): + res = [ops[0], new_op] + elif len(ops) == 2: + res = [ops[0], log_combine(ops[1], new_op)] + else: + raise InternalClientError("Preprocessing operations failed: len(ops) == {}".format(len(ops))) + if isinstance(new_op, log_op_class): # Check just so that static typing doesn't complain + self._append_count += new_op.value_count() + return res + + return modifier + + @staticmethod + def _add_modifier(): + # We do not optimize it on client side for now. It should not be often operation. + return lambda ops, op: ops + [op] + + @staticmethod + def _remove_modifier(): + # We do not optimize it on client side for now. It should not be often operation. + return lambda ops, op: ops + [op] diff --git a/src/neptune/internal/operation.py b/src/neptune/internal/operation.py index 111de886f..02124c73a 100644 --- a/src/neptune/internal/operation.py +++ b/src/neptune/internal/operation.py @@ -19,7 +19,6 @@ from datetime import datetime from typing import ( TYPE_CHECKING, - Any, Generic, List, Optional, @@ -295,13 +294,7 @@ def from_dict(data: dict) -> "UploadFileSet": class LogOperation(Operation, abc.ABC): @abc.abstractmethod def value_count(self) -> int: - ... - - # Workaround for mypy and not being able to combine dataclasses and abstract properties with subclass - # initialization in series - @abc.abstractmethod - def get_values(self) -> List[Any]: - ... 
+ pass @dataclass @@ -344,9 +337,6 @@ def from_dict(data: dict) -> "LogFloats": def value_count(self) -> int: return len(self.values) - def get_values(self) -> List[LogSeriesValue[float]]: - return self.values - @dataclass class LogStrings(LogOperation): @@ -373,9 +363,6 @@ def from_dict(data: dict) -> "LogStrings": def value_count(self) -> int: return len(self.values) - def get_values(self) -> List[LogSeriesValue[str]]: - return self.values - @dataclass class ImageValue: @@ -424,9 +411,6 @@ def from_dict(data: dict) -> "LogImages": def value_count(self) -> int: return len(self.values) - def get_values(self) -> List[LogSeriesValue[ImageValue]]: - return self.values - @dataclass class ClearFloatLog(Operation): diff --git a/src/neptune/internal/operation_processors/async_operation_processor.py b/src/neptune/internal/operation_processors/async_operation_processor.py index be8e48388..fecd23d58 100644 --- a/src/neptune/internal/operation_processors/async_operation_processor.py +++ b/src/neptune/internal/operation_processors/async_operation_processor.py @@ -25,16 +25,18 @@ time, ) from typing import ( - TYPE_CHECKING, Callable, ClassVar, + List, Optional, + Tuple, ) from neptune.constants import ASYNC_DIRECTORY from neptune.envs import NEPTUNE_SYNC_AFTER_STOP_TIMEOUT from neptune.exceptions import NeptuneSynchronizationAlreadyStoppedException from neptune.internal.backends.neptune_backend import NeptuneBackend +from neptune.internal.backends.operations_preprocessor import OperationsPreprocessor from neptune.internal.container_type import ContainerType from neptune.internal.id_formats import UniqueId from neptune.internal.init.parameters import ( @@ -42,20 +44,22 @@ ASYNC_NO_PROGRESS_THRESHOLD, DEFAULT_STOP_TIMEOUT, ) -from neptune.internal.operation import Operation -from neptune.internal.operation_processors.batcher import Batcher +from neptune.internal.operation import ( + CopyAttribute, + Operation, +) from neptune.internal.operation_processors.operation_processor import OperationProcessor from neptune.internal.operation_processors.operation_storage import ( OperationStorage, get_container_dir, ) -from neptune.internal.queue.disk_queue import DiskQueue +from neptune.internal.queue.disk_queue import ( + DiskQueue, + QueueElement, +) from neptune.internal.threading.daemon import Daemon from neptune.internal.utils.logger import logger -if TYPE_CHECKING: - from neptune.internal.preprocessor.accumulated_operations import AccumulatedOperations - _logger = logging.getLogger(__name__) @@ -260,9 +264,9 @@ def close(self): self._queue.close() class ConsumerThread(Daemon): - MAX_POINTS_PER_BATCH: ClassVar[int] = 100000 - MAX_ATTRIBUTES_IN_BATCH: ClassVar[int] = 1000 - MAX_POINTS_PER_ATTRIBUTE: ClassVar[int] = 10000 + MAX_OPERATIONS_IN_BATCH: ClassVar[int] = 1000 + MAX_APPENDS_IN_BATCH: ClassVar[int] = 100000 + MAX_BATCH_SIZE_BYTES: ClassVar[int] = 100 * 1024 * 1024 def __init__( self, @@ -275,13 +279,7 @@ def __init__( self._batch_size = batch_size self._last_flush = 0 self._no_progress_exceeded = False - self._batcher = Batcher( - queue=self._processor._queue, - backend=self._processor._backend, - max_points_per_batch=self.MAX_POINTS_PER_BATCH, - max_attributes_in_batch=self.MAX_ATTRIBUTES_IN_BATCH, - max_points_per_attribute=self.MAX_POINTS_PER_ATTRIBUTE, - ) + self._last_disk_record: Optional[QueueElement[Operation]] = None def run(self): try: @@ -297,23 +295,43 @@ def work(self) -> None: self._last_flush = ts self._processor._queue.flush() - while self.collect_and_process_batch(): - pass - - 
@Daemon.ConnectionRetryWrapper( - kill_message=( - "Killing Neptune asynchronous thread. All data is safe on disk and can be later" - " synced manually using `neptune sync` command." - ) - ) - def collect_and_process_batch(self) -> bool: - batch = self._batcher.collect_batch() - - if batch: - operations, dropped_operations_count, version = batch - self.process_batch(operations, dropped_operations_count, version) - - return batch is not None + while True: + batch = self.collect_batch() + if not batch: + return + operations, version = batch + self.process_batch(operations, version) + + def collect_batch(self) -> Optional[Tuple[List[Operation], int]]: + preprocessor = OperationsPreprocessor() + version: Optional[int] = None + total_bytes = 0 + copy_ops: List[CopyAttribute] = [] + while ( + preprocessor.final_ops_count < self.MAX_OPERATIONS_IN_BATCH + and preprocessor.final_append_count < self.MAX_APPENDS_IN_BATCH + and total_bytes < self.MAX_BATCH_SIZE_BYTES + ): + record: Optional[QueueElement[Operation]] = self._last_disk_record or self._processor._queue.get() + self._last_disk_record = None + if not record: + break + if isinstance(record.obj, CopyAttribute): + # CopyAttribute can be only at the start of a batch. + if copy_ops or preprocessor.final_ops_count: + self._last_disk_record = record + break + else: + version = record.ver + copy_ops.append(record.obj) + total_bytes += record.size + elif preprocessor.process(record.obj): + version = record.ver + total_bytes += record.size + else: + self._last_disk_record = record + break + return (copy_ops + preprocessor.get_operations().all_operations(), version) if version is not None else None def _check_no_progress(self): if not self._no_progress_exceeded: @@ -321,17 +339,22 @@ def _check_no_progress(self): self._no_progress_exceeded = True self._processor._should_call_no_progress_callback = True - def process_batch(self, batch: "AccumulatedOperations", dropped_operations_count: int, version: int) -> None: - expected_count = batch.operations_count + @Daemon.ConnectionRetryWrapper( + kill_message=( + "Killing Neptune asynchronous thread. All data is safe on disk and can be later" + " synced manually using `neptune sync` command." + ) + ) + def process_batch(self, batch: List[Operation], version: int) -> None: + expected_count = len(batch) version_to_ack = version - expected_count - while True: # TODO: Handle Metadata errors try: - processed_count, errors = self._processor._backend.execute_operations_from_accumulator( + processed_count, errors = self._processor._backend.execute_operations( container_id=self._processor._container_id, container_type=self._processor._container_type, - accumulated_operations=batch, + operations=batch, operation_storage=self._processor._operation_storage, ) except Exception as e: @@ -341,7 +364,8 @@ def process_batch(self, batch: "AccumulatedOperations", dropped_operations_count self._no_progress_exceeded = False - version_to_ack += processed_count + dropped_operations_count + version_to_ack += processed_count + batch = batch[processed_count:] with self._processor._waiting_cond: self._processor._queue.ack(version_to_ack) diff --git a/src/neptune/internal/operation_processors/batcher.py b/src/neptune/internal/operation_processors/batcher.py deleted file mode 100644 index c81b834d3..000000000 --- a/src/neptune/internal/operation_processors/batcher.py +++ /dev/null @@ -1,119 +0,0 @@ -# -# Copyright (c) 2023, Neptune Labs Sp. z o.o. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -from typing import ( - TYPE_CHECKING, - List, - Optional, - Tuple, -) - -from neptune.exceptions import MetadataInconsistency -from neptune.internal.operation import ( - CopyAttribute, - LogOperation, -) -from neptune.internal.preprocessor.operations_preprocessor import OperationsPreprocessor -from neptune.internal.queue.disk_queue import QueueElement - -if TYPE_CHECKING: - from neptune.exceptions import NeptuneException - from neptune.internal.backends.neptune_backend import NeptuneBackend - from neptune.internal.operation import Operation - from neptune.internal.preprocessor.accumulated_operations import AccumulatedOperations - from neptune.internal.queue.disk_queue import DiskQueue - - -class Batcher: - def __init__( - self, - queue: "DiskQueue", - backend: "NeptuneBackend", - max_points_per_batch: int = 100000, - max_attributes_in_batch: int = 1000, - max_points_per_attribute: int = 100000, - ): - self._backend: "NeptuneBackend" = backend - self._queue: "DiskQueue" = queue - self._max_points_per_batch = max_points_per_batch - self._max_attributes_in_batch = max_attributes_in_batch - self._max_points_per_attribute = max_points_per_attribute - - self._last_disk_record: Optional[QueueElement["Operation"]] = None - - def collect_batch(self) -> Optional[Tuple["AccumulatedOperations", int, int]]: - preprocessor = OperationsPreprocessor() - version: Optional[int] = None - errors: List["NeptuneException"] = [] - dropped_operations_count = 0 - - while True: - record: Optional[QueueElement[Operation]] = self._last_disk_record or self._queue.get() - self._last_disk_record = None - - if not record: - break - - operation, operation_version = record.obj, record.ver - - if isinstance(operation, CopyAttribute): - # CopyAttribute can be only at the start of a batch - if preprocessor.operations_count: - self._last_disk_record = record - break - else: - try: - operation = operation.resolve(self._backend) - except MetadataInconsistency as e: - errors.append(e) - dropped_operations_count += 1 - - version = operation_version - - # TODO: Refactor as this should be more generic - if not preprocessor.has_accumulator(operation.path): - if preprocessor.accumulators_count + 1 > self._max_attributes_in_batch: - self._last_disk_record = record - break - else: - if isinstance(operation, LogOperation): - old_accumulator_points = preprocessor.get_accumulator_append_count(operation.path) - new_points = operation.value_count() - - if preprocessor.points_count + new_points > self._max_points_per_batch: - self._last_disk_record = record - break - - if old_accumulator_points + new_points > self._max_points_per_attribute: - self._last_disk_record = record - break - - if preprocessor.process(operation): - version = operation_version - else: - self._last_disk_record = record - break - - return ( - ( - preprocessor.accumulate_operations( - initial_errors=errors, source_operations_count=preprocessor.operations_count - ), - dropped_operations_count, - version, - ) - if version is 
not None - else None - ) diff --git a/src/neptune/internal/preprocessor/__init__.py b/src/neptune/internal/preprocessor/__init__.py deleted file mode 100644 index 8d06af532..000000000 --- a/src/neptune/internal/preprocessor/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# -# Copyright (c) 2023, Neptune Labs Sp. z o.o. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# diff --git a/src/neptune/internal/preprocessor/accumulated_operations.py b/src/neptune/internal/preprocessor/accumulated_operations.py deleted file mode 100644 index 913db9421..000000000 --- a/src/neptune/internal/preprocessor/accumulated_operations.py +++ /dev/null @@ -1,44 +0,0 @@ -# -# Copyright (c) 2023, Neptune Labs Sp. z o.o. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -__all__ = ["AccumulatedOperations"] - -from dataclasses import ( - dataclass, - field, -) -from typing import List - -from neptune.exceptions import NeptuneException -from neptune.internal.operation import ( - Operation, - TrackFilesToArtifact, -) - - -@dataclass -class AccumulatedOperations: - upload_operations: List[Operation] = field(default_factory=list) - artifact_operations: List[TrackFilesToArtifact] = field(default_factory=list) - other_operations: List[Operation] = field(default_factory=list) - errors: List[NeptuneException] = field(default_factory=list) - source_operations_count: int = 0 - - def all_operations(self) -> List[Operation]: - return self.upload_operations + self.artifact_operations + self.other_operations - - @property - def operations_count(self) -> int: - return len(self.all_operations()) diff --git a/src/neptune/internal/preprocessor/exceptions.py b/src/neptune/internal/preprocessor/exceptions.py deleted file mode 100644 index 3763b759d..000000000 --- a/src/neptune/internal/preprocessor/exceptions.py +++ /dev/null @@ -1,20 +0,0 @@ -# -# Copyright (c) 2023, Neptune Labs Sp. z o.o. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -__all__ = ["RequiresPreviousCompleted"] - - -class RequiresPreviousCompleted(Exception): - """indicates that previous operations must be synchronized with server before preprocessing current one""" diff --git a/src/neptune/internal/preprocessor/operations_accumulator.py b/src/neptune/internal/preprocessor/operations_accumulator.py deleted file mode 100644 index cc70597b9..000000000 --- a/src/neptune/internal/preprocessor/operations_accumulator.py +++ /dev/null @@ -1,365 +0,0 @@ -# -# Copyright (c) 2023, Neptune Labs Sp. z o.o. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -__all__ = ["OperationsAccumulator"] - -from dataclasses import dataclass -from enum import Enum -from typing import ( - Callable, - List, - Optional, - Tuple, - Type, - TypeVar, -) - -from neptune.common.exceptions import InternalClientError -from neptune.exceptions import MetadataInconsistency -from neptune.internal.operation import ( - AddStrings, - AssignArtifact, - AssignBool, - AssignDatetime, - AssignFloat, - AssignInt, - AssignString, - ClearArtifact, - ClearFloatLog, - ClearImageLog, - ClearStringLog, - ClearStringSet, - ConfigFloatSeries, - CopyAttribute, - DeleteAttribute, - DeleteFiles, - LogFloats, - LogImages, - LogOperation, - LogStrings, - Operation, - RemoveStrings, - TrackFilesToArtifact, - UploadFile, - UploadFileContent, - UploadFileSet, -) -from neptune.internal.operation_visitor import OperationVisitor -from neptune.internal.preprocessor.exceptions import RequiresPreviousCompleted -from neptune.internal.utils.paths import path_to_str - -T = TypeVar("T") - - -class DataType(Enum): - FLOAT = "Float" - INT = "Int" - BOOL = "Bool" - STRING = "String" - FILE = "File" - DATETIME = "Datetime" - FILE_SET = "File Set" - FLOAT_SERIES = "Float Series" - STRING_SERIES = "String Series" - IMAGE_SERIES = "Image Series" - STRING_SET = "String Set" - ARTIFACT = "Artifact" - - -@dataclass -class AccumulatorStats: - append_count: int = 0 - - -class OperationsAccumulator(OperationVisitor[None]): - def __init__(self, path: List[str]): - self._path: List[str] = path - self._type: Optional[DataType] = None - self._delete_ops: List[Operation] = [] - self._modify_ops: List[Operation] = [] - self._config_ops: List[Operation] = [] - self._errors: List[MetadataInconsistency] = [] - self._stats: AccumulatorStats = AccumulatorStats() - - def get_operations(self) -> List[Operation]: - return self._delete_ops + self._modify_ops + self._config_ops - - def get_errors(self) -> List[MetadataInconsistency]: - return self._errors - - def get_op_count(self) -> int: - return len(self.get_operations()) - - def get_append_count(self) -> int: - return self._stats.append_count - - def _check_prerequisites(self, op: Operation) -> None: - if (is_file_op(op) or is_artifact_op(op)) and len(self._delete_ops) > 0: - raise RequiresPreviousCompleted() - - def _process_modify_op( - self, - expected_type: DataType, - op: Operation, - modifier: Callable[[List[Operation], Operation, AccumulatorStats], Tuple[List[Operation], AccumulatorStats]], - ) 
-> None: - - if self._type and self._type != expected_type: - # This case should never happen since inconsistencies on data types are verified on user api. - # So such operations should not appear in the queue without delete operation between them. - # Still we want to support this case to avoid some unclear dependencies and assumptions. - self._errors.append( - MetadataInconsistency( - "Cannot perform {} operation on {}: Attribute is not a {}".format( - op.__class__.__name__, - path_to_str(self._path), - expected_type.value, - ) - ) - ) - else: - self._check_prerequisites(op) - self._type = expected_type - self._modify_ops, self._stats = modifier(self._modify_ops, op, self._stats) - - def _process_config_op(self, expected_type: DataType, op: Operation) -> None: - - if self._type and self._type != expected_type: - # This case should never happen since inconsistencies on data types are verified on user api. - # So such operations should not appear in the queue without delete operation between them. - # Still we want to support this case to avoid some unclear dependencies and assumptions. - self._errors.append( - MetadataInconsistency( - "Cannot perform {} operation on {}: Attribute is not a {}".format( - op.__class__.__name__, - path_to_str(self._path), - expected_type.value, - ) - ) - ) - else: - self._check_prerequisites(op) - self._type = expected_type - self._config_ops = [op] - - def visit_assign_float(self, op: AssignFloat) -> None: - self._process_modify_op(DataType.FLOAT, op, assign_modifier) - - def visit_assign_int(self, op: AssignInt) -> None: - self._process_modify_op(DataType.INT, op, assign_modifier) - - def visit_assign_bool(self, op: AssignBool) -> None: - self._process_modify_op(DataType.BOOL, op, assign_modifier) - - def visit_assign_string(self, op: AssignString) -> None: - self._process_modify_op(DataType.STRING, op, assign_modifier) - - def visit_assign_datetime(self, op: AssignDatetime) -> None: - self._process_modify_op(DataType.DATETIME, op, assign_modifier) - - def visit_upload_file(self, op: UploadFile) -> None: - self._process_modify_op(DataType.FILE, op, assign_modifier) - - def visit_upload_file_content(self, op: UploadFileContent) -> None: - self._process_modify_op(DataType.FILE, op, assign_modifier) - - def visit_assign_artifact(self, op: AssignArtifact) -> None: - self._process_modify_op(DataType.ARTIFACT, op, assign_modifier) - - def visit_upload_file_set(self, op: UploadFileSet) -> None: - if op.reset: - self._process_modify_op(DataType.FILE_SET, op, assign_modifier) - else: - self._process_modify_op(DataType.FILE_SET, op, add_modifier) - - def visit_log_floats(self, op: LogFloats) -> None: - self._process_modify_op( - DataType.FLOAT_SERIES, - op, - log_modifier( - LogFloats, - ClearFloatLog, - lambda op1, op2: LogFloats(op1.path, op1.get_values() + op2.get_values()), - ), - ) - - def visit_log_strings(self, op: LogStrings) -> None: - self._process_modify_op( - DataType.STRING_SERIES, - op, - log_modifier( - LogStrings, - ClearStringLog, - lambda op1, op2: LogStrings(op1.path, op1.get_values() + op2.get_values()), - ), - ) - - def visit_log_images(self, op: LogImages) -> None: - self._process_modify_op( - DataType.IMAGE_SERIES, - op, - log_modifier( - LogImages, - ClearImageLog, - lambda op1, op2: LogImages(op1.path, op1.get_values() + op2.get_values()), - ), - ) - - def visit_clear_float_log(self, op: ClearFloatLog) -> None: - self._process_modify_op(DataType.FLOAT_SERIES, op, clear_modifier) - - def visit_clear_string_log(self, op: ClearStringLog) -> 
None: - self._process_modify_op(DataType.STRING_SERIES, op, clear_modifier) - - def visit_clear_image_log(self, op: ClearImageLog) -> None: - self._process_modify_op(DataType.IMAGE_SERIES, op, clear_modifier) - - def visit_add_strings(self, op: AddStrings) -> None: - self._process_modify_op(DataType.STRING_SET, op, add_modifier) - - def visit_clear_string_set(self, op: ClearStringSet) -> None: - self._process_modify_op(DataType.STRING_SET, op, clear_modifier) - - def visit_remove_strings(self, op: RemoveStrings) -> None: - self._process_modify_op(DataType.STRING_SET, op, remove_modifier) - - def visit_config_float_series(self, op: ConfigFloatSeries) -> None: - self._process_config_op(DataType.FLOAT_SERIES, op) - - def visit_delete_files(self, op: DeleteFiles) -> None: - self._process_modify_op(DataType.FILE_SET, op, add_modifier) - - def visit_delete_attribute(self, op: DeleteAttribute) -> None: - if self._type: - if self._delete_ops: - # Keep existing delete operation and simply clear all modification operations after it - self._modify_ops = [] - self._config_ops = [] - self._type = None - else: - # This case is tricky. There was no delete operation, but some modifications was performed. - # We do not know if this attribute exists on server side and we do not want a delete op to fail. - # So we need to send a single modification before delete to be sure a delete op is valid. - self._delete_ops = [self._modify_ops[0], op] - self._modify_ops = [] - self._config_ops = [] - self._type = None - else: - if self._delete_ops: - # Do nothing if there already is a delete operation - # and no other operations was performed after it - return - else: - # If value has not been set locally yet and no delete operation was performed, - # simply perform single delete operation. 
- self._delete_ops.append(op) - - self._stats = AccumulatorStats(append_count=0) - - def visit_track_files_to_artifact(self, op: TrackFilesToArtifact) -> None: - self._process_modify_op(DataType.ARTIFACT, op, artifact_log_modifier) - - def visit_clear_artifact(self, op: ClearArtifact) -> None: - self._process_modify_op(DataType.ARTIFACT, op, clear_modifier) - - def visit_copy_attribute(self, op: CopyAttribute) -> None: - raise MetadataInconsistency("No CopyAttribute should reach accumulator") - - -def artifact_log_modifier( - ops: List[Operation], new_op: Operation, stats: AccumulatorStats -) -> Tuple[List[Operation], AccumulatorStats]: - assert isinstance(new_op, TrackFilesToArtifact) - - if len(ops) == 0: - return [new_op], stats - - # There should be exactly 1 operation, merge it with new_op - assert len(ops) == 1 - op_old = ops[0] - assert isinstance(op_old, TrackFilesToArtifact) - assert op_old.path == new_op.path - assert op_old.project_id == new_op.project_id - - return [TrackFilesToArtifact(op_old.path, op_old.project_id, op_old.entries + new_op.entries)], stats - - -def log_modifier( - log_op_class: Type[LogOperation], - clear_op_class: Type[Operation], - log_combine: Callable[[LogOperation, LogOperation], LogOperation], -) -> Callable[[List[Operation], Operation, AccumulatorStats], Tuple[List[Operation], AccumulatorStats]]: - def modifier( - ops: List[Operation], new_op: Operation, stats: AccumulatorStats - ) -> Tuple[List[Operation], AccumulatorStats]: - assert isinstance(new_op, log_op_class) - stats = AccumulatorStats(append_count=stats.append_count + new_op.value_count()) - - if len(ops) == 0: - return [new_op], stats - - if len(ops) == 1: - first_operation = ops[0] - - if isinstance(first_operation, log_op_class): - return [log_combine(first_operation, new_op)], stats - elif isinstance(first_operation, clear_op_class): - return [first_operation, new_op], stats - - if len(ops) == 2: - first_operation = ops[0] - second_operation = ops[1] - - if isinstance(second_operation, log_op_class): - return [first_operation, log_combine(second_operation, new_op)], stats - - raise InternalClientError(f"Preprocessing operations failed: len(ops) == {len(ops)}") - - return modifier - - -def is_file_op(op: Operation) -> bool: - return isinstance(op, (UploadFile, UploadFileContent, UploadFileSet)) - - -def is_artifact_op(op: Operation) -> bool: - return isinstance(op, TrackFilesToArtifact) - - -def clear_modifier( - ops: List[Operation], new_op: Operation, stats: AccumulatorStats -) -> Tuple[List[Operation], AccumulatorStats]: - for op in ops: - if isinstance(op, LogOperation): - stats = AccumulatorStats(append_count=stats.append_count - op.value_count()) - - return [new_op], stats - - -assign_modifier: Callable[ - [List[Operation], Operation, AccumulatorStats], Tuple[List[Operation], AccumulatorStats] -] = lambda ops, new_op, stats: ([new_op], stats) - - -# We do not optimize it on client side for now. It should not be often operation. -add_modifier: Callable[ - [List[Operation], Operation, AccumulatorStats], Tuple[List[Operation], AccumulatorStats] -] = lambda ops, op, stats: (ops + [op], stats) - - -# We do not optimize it on client side for now. It should not be often operation. 
-remove_modifier: Callable[ - [List[Operation], Operation, AccumulatorStats], Tuple[List[Operation], AccumulatorStats] -] = lambda ops, op, stats: (ops + [op], stats) diff --git a/src/neptune/internal/preprocessor/operations_preprocessor.py b/src/neptune/internal/preprocessor/operations_preprocessor.py deleted file mode 100644 index f5bf76c73..000000000 --- a/src/neptune/internal/preprocessor/operations_preprocessor.py +++ /dev/null @@ -1,116 +0,0 @@ -# -# Copyright (c) 2022, Neptune Labs Sp. z o.o. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -__all__ = ["OperationsPreprocessor"] - -from typing import ( - TYPE_CHECKING, - Dict, - List, - Optional, - TypeVar, -) - -from neptune.internal.operation import ( - Operation, - TrackFilesToArtifact, - UploadFile, - UploadFileContent, - UploadFileSet, -) -from neptune.internal.preprocessor.accumulated_operations import AccumulatedOperations -from neptune.internal.preprocessor.exceptions import RequiresPreviousCompleted -from neptune.internal.preprocessor.operations_accumulator import OperationsAccumulator -from neptune.internal.utils.paths import path_to_str - -if TYPE_CHECKING: - from neptune.exceptions import NeptuneException - - -T = TypeVar("T") - - -class OperationsPreprocessor: - def __init__(self) -> None: - self._accumulators: Dict[str, "OperationsAccumulator"] = dict() - self.processed_ops_count: int = 0 - - @property - def accumulators_count(self) -> int: - return len(self._accumulators.keys()) - - @property - def max_points_per_accumulator(self) -> int: - return max(map(lambda acc: acc.get_append_count(), self._accumulators.values()), default=0) - - @property - def operations_count(self) -> int: - return sum(map(lambda acc: acc.get_op_count(), self._accumulators.values())) - - @property - def points_count(self) -> int: - return sum(map(lambda acc: acc.get_append_count(), self._accumulators.values())) - - def process(self, operation: Operation) -> bool: - """Adds a single operation to its processed list. - Returns `False` iff the new operation can't be in queue until one of already enqueued operations gets - synchronized with server first. 
- """ - try: - self._process_op(operation) - self.processed_ops_count += 1 - return True - except RequiresPreviousCompleted: - return False - - # TODO: This shouldn't be public method - def get_accumulator_append_count(self, path: List[str]) -> int: - path_str = path_to_str(path) - if path_str not in self._accumulators: - return 0 - return self._accumulators[path_str].get_append_count() - - def has_accumulator(self, path: List[str]) -> bool: - path_str = path_to_str(path) - return path_str in self._accumulators - - def process_batch(self, operations: List[Operation]) -> None: - for op in operations: - if not self.process(op): - return - - def _process_op(self, op: Operation) -> "OperationsAccumulator": - path_str = path_to_str(op.path) - target_acc = self._accumulators.setdefault(path_str, OperationsAccumulator(op.path)) - target_acc.visit(op) - return target_acc - - def accumulate_operations( - self, initial_errors: Optional[List["NeptuneException"]] = None, source_operations_count: int = 0 - ) -> AccumulatedOperations: - result = AccumulatedOperations(source_operations_count=source_operations_count) - result.errors.extend(initial_errors or []) - - for _, acc in sorted(self._accumulators.items()): - for op in acc.get_operations(): - if isinstance(op, TrackFilesToArtifact): - result.artifact_operations.append(op) - elif isinstance(op, (UploadFile, UploadFileContent, UploadFileSet)): - result.upload_operations.append(op) - else: - result.other_operations.append(op) - result.errors.extend(acc.get_errors()) - - return result diff --git a/src/neptune/internal/queue/disk_queue.py b/src/neptune/internal/queue/disk_queue.py index 68c2fbd5f..bf7146c6a 100644 --- a/src/neptune/internal/queue/disk_queue.py +++ b/src/neptune/internal/queue/disk_queue.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # -__all__ = ["DiskQueue", "QueueElement"] +__all__ = ["DiskQueue"] import json import logging diff --git a/tests/unit/neptune/new/internal/backends/test_operations_preprocessor.py b/tests/unit/neptune/new/internal/backends/test_operations_preprocessor.py new file mode 100644 index 000000000..4d0ab59be --- /dev/null +++ b/tests/unit/neptune/new/internal/backends/test_operations_preprocessor.py @@ -0,0 +1,416 @@ +# +# Copyright (c) 2020, Neptune Labs Sp. z o.o. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import uuid + +from neptune.exceptions import MetadataInconsistency +from neptune.internal.backends.operations_preprocessor import OperationsPreprocessor +from neptune.internal.operation import ( + AddStrings, + AssignFloat, + AssignString, + ClearFloatLog, + ClearImageLog, + ClearStringSet, + ConfigFloatSeries, + DeleteAttribute, + LogFloats, + LogImages, + LogStrings, + RemoveStrings, + TrackFilesToArtifact, + UploadFileSet, +) +from tests.unit.neptune.new.attributes.test_attribute_base import TestAttributeBase + +FLog = LogFloats.ValueType +SLog = LogStrings.ValueType +ILog = LogImages.ValueType + + +class TestOperationsPreprocessor(TestAttributeBase): + def test_delete_attribute(self): + # given + processor = OperationsPreprocessor() + + operations = [ + DeleteAttribute(["a"]), + AssignFloat(["a"], 1), + DeleteAttribute(["a"]), + AssignString(["b"], "2"), + DeleteAttribute(["b"]), + AssignString(["c"], "2"), + AssignString(["c"], "3"), + DeleteAttribute(["c"]), + DeleteAttribute(["d"]), + DeleteAttribute(["d"]), + DeleteAttribute(["e"]), + ] + + # when + processor.process_batch(operations) + + # then + result = processor.get_operations() + self.assertEqual(result.upload_operations, []) + self.assertEqual(result.artifact_operations, []) + self.assertEqual( + result.other_operations, + [ + DeleteAttribute(["a"]), + AssignString(["b"], "2"), + DeleteAttribute(["b"]), + AssignString(["c"], "3"), + DeleteAttribute(["c"]), + DeleteAttribute(["d"]), + DeleteAttribute(["e"]), + ], + ) + self.assertEqual(result.errors, []) + self.assertEqual(processor.processed_ops_count, len(operations)) + + def test_assign(self): + # given + processor = OperationsPreprocessor() + + operations = [ + AssignFloat(["a"], 1), + DeleteAttribute(["a"]), + AssignString(["a"], "111"), + DeleteAttribute(["b"]), + AssignFloat(["b"], 2), + AssignFloat(["c"], 3), + AssignString(["c"], "333"), + AssignString(["d"], "44"), + AssignFloat(["e"], 5), + AssignFloat(["e"], 10), + AssignFloat(["e"], 33), + ] + + # when + processor.process_batch(operations) + + # then + result = processor.get_operations() + self.assertEqual(result.upload_operations, []) + self.assertEqual(result.artifact_operations, []) + self.assertEqual( + result.other_operations, + [ + AssignFloat(["a"], 1), + DeleteAttribute(["a"]), + AssignString(["a"], "111"), + DeleteAttribute(["b"]), + AssignFloat(["b"], 2), + AssignFloat(["c"], 3), + AssignString(["d"], "44"), + AssignFloat(["e"], 33), + ], + ) + self.assertEqual( + result.errors, + [MetadataInconsistency("Cannot perform AssignString operation on c: Attribute is not a String")], + ) + self.assertEqual(processor.processed_ops_count, len(operations)) + + def test_series(self): + # given + processor = OperationsPreprocessor() + + operations = [ + LogFloats(["a"], [FLog(1, 2, 3)]), + ConfigFloatSeries(["a"], min=7, max=70, unit="%"), + DeleteAttribute(["a"]), + LogStrings(["a"], [SLog("111", 3, 4)]), + DeleteAttribute(["b"]), + LogStrings(["b"], [SLog("222", None, 6)]), + LogFloats(["c"], [FLog(1, 2, 3)]), + LogFloats(["c"], [FLog(10, 20, 30), FLog(100, 200, 300)]), + LogStrings(["d"], [SLog("4", 111, 222)]), + ClearFloatLog(["e"]), + LogImages(["f"], [ILog("1", 2, 3)]), + LogImages(["f"], [ILog("10", 20, 30), FLog("100", 200, 300)]), + LogImages(["f"], [ILog("1", 2, 3)]), + LogImages(["f"], [ILog("10", 20, 30), FLog("100", 200, 300)]), + ClearImageLog(["f"]), + LogImages(["f"], [ILog("3", 20, 30), FLog("4", 200, 300)]), + LogImages(["f"], [ILog("5", 2, 3)]), + LogImages(["f"], [ILog("8", 20, 30), 
FLog("1000", 200, 300)]), + LogImages(["g"], [ILog("10", 20, 30), FLog("100", 200, 300)]), + ClearImageLog(["g"]), + AssignString(["h"], "44"), + LogFloats(["h"], [FLog(10, 20, 30), FLog(100, 200, 300)]), + LogFloats(["i"], [FLog(1, 2, 3)]), + ConfigFloatSeries(["i"], min=7, max=70, unit="%"), + ClearFloatLog(["i"]), + LogFloats(["i"], [FLog(10, 20, 30), FLog(100, 200, 300)]), + ] + + # when + processor.process_batch(operations) + + # then + result = processor.get_operations() + self.assertEqual(result.upload_operations, []) + self.assertEqual(result.artifact_operations, []) + self.assertEqual( + result.other_operations, + [ + LogFloats(["a"], [FLog(1, 2, 3)]), + DeleteAttribute(["a"]), + LogStrings(["a"], [FLog("111", 3, 4)]), + DeleteAttribute(["b"]), + LogStrings(["b"], [SLog("222", None, 6)]), + LogFloats(["c"], [FLog(1, 2, 3), FLog(10, 20, 30), FLog(100, 200, 300)]), + LogStrings(["d"], [SLog("4", 111, 222)]), + ClearFloatLog(["e"]), + ClearImageLog(["f"]), + LogImages( + ["f"], + [ + ILog("3", 20, 30), + FLog("4", 200, 300), + ILog("5", 2, 3), + ILog("8", 20, 30), + FLog("1000", 200, 300), + ], + ), + ClearImageLog(["g"]), + AssignString(["h"], "44"), + ClearFloatLog(["i"]), + LogFloats(["i"], [FLog(10, 20, 30), FLog(100, 200, 300)]), + ConfigFloatSeries(["i"], min=7, max=70, unit="%"), + ], + ) + self.assertEqual( + result.errors, + [MetadataInconsistency("Cannot perform LogFloats operation on h: Attribute is not a Float Series")], + ) + self.assertEqual(processor.processed_ops_count, len(operations)) + + def test_sets(self): + # given + processor = OperationsPreprocessor() + + operations = [ + AddStrings(["a"], {"xx", "y", "abc"}), + DeleteAttribute(["a"]), + AddStrings(["a"], {"hhh", "gij"}), + DeleteAttribute(["b"]), + RemoveStrings(["b"], {"abc", "defgh"}), + AddStrings(["c"], {"hhh", "gij"}), + RemoveStrings(["c"], {"abc", "defgh"}), + AddStrings(["c"], {"qqq"}), + ClearStringSet(["d"]), + RemoveStrings(["e"], {"abc", "defgh"}), + AddStrings(["e"], {"qqq"}), + ClearStringSet(["e"]), + AddStrings(["f"], {"hhh", "gij"}), + RemoveStrings(["f"], {"abc", "defgh"}), + AddStrings(["f"], {"qqq"}), + ClearStringSet(["f"]), + AddStrings(["f"], {"xx", "y", "abc"}), + RemoveStrings(["f"], {"abc", "defgh"}), + AssignString(["h"], "44"), + RemoveStrings(["h"], {""}), + AssignFloat(["i"], 5), + AddStrings(["i"], {""}), + ] + + # when + processor.process_batch(operations) + + # then + result = processor.get_operations() + self.assertEqual(result.upload_operations, []) + self.assertEqual(result.artifact_operations, []) + self.assertEqual( + result.other_operations, + [ + AddStrings(["a"], {"xx", "y", "abc"}), + DeleteAttribute(["a"]), + AddStrings(["a"], {"hhh", "gij"}), + DeleteAttribute(["b"]), + RemoveStrings(["b"], {"abc", "defgh"}), + AddStrings(["c"], {"hhh", "gij"}), + RemoveStrings(["c"], {"abc", "defgh"}), + AddStrings(["c"], {"qqq"}), + ClearStringSet(["d"]), + ClearStringSet(["e"]), + ClearStringSet(["f"]), + AddStrings(["f"], {"xx", "y", "abc"}), + RemoveStrings(["f"], {"abc", "defgh"}), + AssignString(["h"], "44"), + AssignFloat(["i"], 5), + ], + ) + self.assertEqual( + result.errors, + [ + MetadataInconsistency("Cannot perform RemoveStrings operation on h: Attribute is not a String Set"), + MetadataInconsistency("Cannot perform AddStrings operation on i: Attribute is not a String Set"), + ], + ) + self.assertEqual(processor.processed_ops_count, len(operations)) + + def test_file_set(self): + # given + processor = OperationsPreprocessor() + + operations = [ + UploadFileSet(["a"], 
["xx", "y", "abc"], reset=False), + UploadFileSet(["a"], ["hhh", "gij"], reset=False), + UploadFileSet(["b"], ["abc", "defgh"], reset=True), + UploadFileSet(["c"], ["hhh", "gij"], reset=False), + UploadFileSet(["c"], ["abc", "defgh"], reset=True), + UploadFileSet(["c"], ["qqq"], reset=False), + UploadFileSet(["d"], ["hhh", "gij"], reset=False), + AssignFloat(["e"], 5), + UploadFileSet(["e"], [""], reset=False), + ] + + # when + processor.process_batch(operations) + + # then + result = processor.get_operations() + self.assertEqual( + result.upload_operations, + [ + UploadFileSet(["a"], ["xx", "y", "abc"], reset=False), + UploadFileSet(["a"], ["hhh", "gij"], reset=False), + UploadFileSet(["b"], ["abc", "defgh"], reset=True), + UploadFileSet(["c"], ["abc", "defgh"], reset=True), + UploadFileSet(["c"], ["qqq"], reset=False), + UploadFileSet(["d"], ["hhh", "gij"], reset=False), + ], + ) + self.assertEqual(result.artifact_operations, []) + self.assertEqual( + result.other_operations, + [ + AssignFloat(["e"], 5), + ], + ) + self.assertEqual( + result.errors, + [MetadataInconsistency("Cannot perform UploadFileSet operation on e: Attribute is not a File Set")], + ) + self.assertEqual(processor.processed_ops_count, len(operations)) + + def test_file_ops_delete(self): + # given + processor = OperationsPreprocessor() + + operations = [ + UploadFileSet(["b"], ["abc", "defgh"], reset=True), + UploadFileSet(["c"], ["hhh", "gij"], reset=False), + UploadFileSet(["c"], ["abc", "defgh"], reset=True), + UploadFileSet(["c"], ["qqq"], reset=False), + UploadFileSet(["d"], ["hhh", "gij"], reset=False), + DeleteAttribute(["a"]), + UploadFileSet(["a"], ["xx", "y", "abc"], reset=False), + UploadFileSet(["a"], ["hhh", "gij"], reset=False), + DeleteAttribute(["b"]), + ] + + # when + processor.process_batch(operations) + + # then: there's a cutoff after DeleteAttribute(["a"]) + result = processor.get_operations() + self.assertEqual( + [ + UploadFileSet(["b"], ["abc", "defgh"], reset=True), + UploadFileSet(["c"], ["abc", "defgh"], reset=True), + UploadFileSet(["c"], ["qqq"], reset=False), + UploadFileSet(["d"], ["hhh", "gij"], reset=False), + ], + result.upload_operations, + ) + self.assertEqual(result.artifact_operations, []) + self.assertEqual( + [ + DeleteAttribute(["a"]), + ], + result.other_operations, + ) + self.assertEqual(result.errors, []) + self.assertEqual(processor.processed_ops_count, 6) + + def test_artifacts(self): + # given + processor = OperationsPreprocessor() + project_uuid = str(uuid.uuid4()) + + operations = [ + TrackFilesToArtifact(["a"], project_uuid, [("dir1/", None)]), + DeleteAttribute(["a"]), + TrackFilesToArtifact(["b"], project_uuid, [("dir1/", None)]), + TrackFilesToArtifact(["b"], project_uuid, [("dir2/dir3/", "dir2/")]), + TrackFilesToArtifact(["b"], project_uuid, [("dir4/dir5/", "dir4/")]), + AssignFloat(["c"], 5), + TrackFilesToArtifact(["c"], project_uuid, [("dir1/", None)]), + TrackFilesToArtifact(["d"], project_uuid, [("dir2/dir3/", "dir2/")]), + TrackFilesToArtifact(["d"], project_uuid, [("dir4/", None)]), + TrackFilesToArtifact(["e"], project_uuid, [("dir1/", None)]), + TrackFilesToArtifact(["e"], project_uuid, [("dir2/dir3/", "dir2/")]), + TrackFilesToArtifact(["f"], project_uuid, [("dir1/", None)]), + TrackFilesToArtifact(["f"], project_uuid, [("dir2/dir3/", "dir2/")]), + TrackFilesToArtifact(["f"], project_uuid, [("dir4/", None)]), + TrackFilesToArtifact(["a"], project_uuid, [("dir1/", None)]), + ] + + # when + processor.process_batch(operations) + + # then: there's a cutoff 
before second TrackFilesToArtifact(["a"]) due to DeleteAttribute(["a"]) + result = processor.get_operations() + self.assertEqual(result.upload_operations, []) + self.assertEqual( + result.artifact_operations, + [ + TrackFilesToArtifact(["a"], project_uuid, [("dir1/", None)]), + TrackFilesToArtifact( + ["b"], + project_uuid, + [("dir1/", None), ("dir2/dir3/", "dir2/"), ("dir4/dir5/", "dir4/")], + ), + TrackFilesToArtifact(["d"], project_uuid, [("dir2/dir3/", "dir2/"), ("dir4/", None)]), + TrackFilesToArtifact(["e"], project_uuid, [("dir1/", None), ("dir2/dir3/", "dir2/")]), + TrackFilesToArtifact( + ["f"], + project_uuid, + [("dir1/", None), ("dir2/dir3/", "dir2/"), ("dir4/", None)], + ), + ], + ) + self.assertEqual( + result.other_operations, + [ + DeleteAttribute(["a"]), + AssignFloat(["c"], 5), + ], + ) + self.assertEqual( + result.errors, + [ + MetadataInconsistency( + "Cannot perform TrackFilesToArtifact operation on c: Attribute is not a Artifact" + ), + ], + ) + self.assertEqual(processor.processed_ops_count, len(operations) - 1) diff --git a/tests/unit/neptune/new/internal/operation_processors/__init__.py b/tests/unit/neptune/new/internal/operation_processors/__init__.py deleted file mode 100644 index 8d06af532..000000000 --- a/tests/unit/neptune/new/internal/operation_processors/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# -# Copyright (c) 2023, Neptune Labs Sp. z o.o. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# diff --git a/tests/unit/neptune/new/internal/operation_processors/test_batcher.py b/tests/unit/neptune/new/internal/operation_processors/test_batcher.py deleted file mode 100644 index 0104579e8..000000000 --- a/tests/unit/neptune/new/internal/operation_processors/test_batcher.py +++ /dev/null @@ -1,352 +0,0 @@ -# -# Copyright (c) 2023, Neptune Labs Sp. z o.o. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -from mock import patch - -from neptune.attributes import Integer as IntegerAttribute -from neptune.exceptions import MetadataInconsistency -from neptune.internal.backends.api_model import IntAttribute as ApiIntAttribute -from neptune.internal.container_type import ContainerType -from neptune.internal.operation import ( - AssignInt, - CopyAttribute, - DeleteAttribute, - LogFloats, - LogStrings, - TrackFilesToArtifact, - UploadFileContent, -) -from neptune.internal.operation_processors.batcher import Batcher -from neptune.internal.queue.disk_queue import QueueElement - - -@patch("neptune.internal.queue.disk_queue.DiskQueue") -@patch("neptune.internal.backends.neptune_backend.NeptuneBackend") -def test_empty_queue(backend, queue): - # given - queue.get.return_value = None - - # and - batcher = Batcher(queue=queue, backend=backend) - - # when - result = batcher.collect_batch() - - # then - assert result is None - - -@patch("neptune.internal.queue.disk_queue.DiskQueue") -@patch("neptune.internal.backends.factory.HostedNeptuneBackend") -def test_batch_preparation(backend, queue): - # given - record1 = QueueElement(AssignInt(["a"], 2501), 1, 1) - record2 = QueueElement(TrackFilesToArtifact(["b"], "123", [("a/b/c", "d/e/f")]), 2, 1) - record3 = QueueElement(UploadFileContent(["c"], "flac", "deadbeef"), 3, 1) - - # and - queue.get.side_effect = (record1, record2, record3, None) - - # and - batcher = Batcher(queue=queue, backend=backend) - - # when - result = batcher.collect_batch() - - # then - assert result is not None - - # and - accumulated_operations, _, version = result - assert accumulated_operations.other_operations == [AssignInt(["a"], 2501)] - assert accumulated_operations.upload_operations == [UploadFileContent(["c"], "flac", "deadbeef")] - assert accumulated_operations.artifact_operations == [TrackFilesToArtifact(["b"], "123", [("a/b/c", "d/e/f")])] - assert accumulated_operations.errors == [] - assert accumulated_operations.source_operations_count == 3 - assert version == 3 - - -@patch("neptune.internal.queue.disk_queue.DiskQueue") -@patch("neptune.internal.backends.factory.HostedNeptuneBackend") -def test_batch_with_limited_attributes(backend, queue): - # given - record1 = QueueElement(AssignInt(["a"], 2501), 1, 1) - record2 = QueueElement(TrackFilesToArtifact(["b"], "123", [("a/b/c", "d/e/f")]), 2, 1) - record3 = QueueElement(UploadFileContent(["c"], "flac", "deadbeef"), 3, 1) - - # and - queue.get.side_effect = (record1, record2, record3, None) - - # and - batcher = Batcher(queue=queue, backend=backend, max_attributes_in_batch=2) - - # when - result = batcher.collect_batch() - - # then - assert result is not None - - # and - accumulated_operations, _, version = result - assert accumulated_operations.other_operations == [AssignInt(["a"], 2501)] - assert accumulated_operations.upload_operations == [] - assert accumulated_operations.artifact_operations == [TrackFilesToArtifact(["b"], "123", [("a/b/c", "d/e/f")])] - assert accumulated_operations.errors == [] - assert accumulated_operations.source_operations_count == 2 - assert version == 2 - - -@patch("neptune.internal.queue.disk_queue.DiskQueue") -@patch("neptune.internal.backends.factory.HostedNeptuneBackend") -def test_batch_with_limited_points(backend, queue): - # given - record1 = QueueElement(LogFloats(["a"], [1.0, 2.0]), 1, 1) - record2 = QueueElement(LogFloats(["a"], [3.0]), 2, 1) - record3 = QueueElement(LogFloats(["a"], [4.0, 5.0, 6.0]), 3, 1) - - # and - queue.get.side_effect = (record1, record2, record3, None) - - # and - 
batcher = Batcher(queue=queue, backend=backend, max_points_per_batch=3) - - # when - result = batcher.collect_batch() - - # then - assert result is not None - - # and - accumulated_operations, _, version = result - assert accumulated_operations.other_operations == [LogFloats(["a"], [1.0, 2.0, 3.0])] - assert accumulated_operations.upload_operations == [] - assert accumulated_operations.artifact_operations == [] - assert accumulated_operations.errors == [] - assert accumulated_operations.source_operations_count == 1 - assert version == 2 - - -@patch("neptune.internal.queue.disk_queue.DiskQueue") -@patch("neptune.internal.backends.factory.HostedNeptuneBackend") -def test_batch_with_limited_points_per_attribute(backend, queue): - # given - record1 = QueueElement(LogFloats(["a"], [1.0, 2.0]), 1, 1) - record2 = QueueElement(LogFloats(["a"], [3.0]), 2, 1) - record3 = QueueElement(LogFloats(["b"], [4.0, 5.0, 6.0]), 3, 1) - record4 = QueueElement(LogFloats(["a"], [7.0]), 4, 1) - - # and - queue.get.side_effect = (record1, record2, record3, record4, None) - - # and - batcher = Batcher(queue=queue, backend=backend, max_points_per_attribute=3) - - # when - result = batcher.collect_batch() - - # then - assert result is not None - - # and - accumulated_operations, _, version = result - assert accumulated_operations.other_operations == [ - LogFloats(["a"], [1.0, 2.0, 3.0]), - LogFloats(["b"], [4.0, 5.0, 6.0]), - ] - assert accumulated_operations.upload_operations == [] - assert accumulated_operations.artifact_operations == [] - assert accumulated_operations.errors == [] - assert accumulated_operations.source_operations_count == 2 - assert version == 3 - - -@patch("neptune.internal.queue.disk_queue.DiskQueue") -@patch("neptune.internal.backends.factory.HostedNeptuneBackend") -def test_batch_errors(backend, queue): - # given - record1 = QueueElement(LogFloats(["a"], [1.0]), 1, 1) - record2 = QueueElement(LogStrings(["a"], ["test"]), 2, 1) - - # and - queue.get.side_effect = (record1, record2, None) - - # and - batcher = Batcher(queue=queue, backend=backend, max_points_per_attribute=3) - - # when - result = batcher.collect_batch() - - # then - assert result is not None - - # and - accumulated_operations, _, version = result - assert accumulated_operations.other_operations == [LogFloats(["a"], [1.0])] - assert accumulated_operations.upload_operations == [] - assert accumulated_operations.artifact_operations == [] - assert accumulated_operations.errors == [ - MetadataInconsistency("Cannot perform LogStrings operation on a: Attribute is not a String Series") - ] - assert accumulated_operations.source_operations_count == 1 - assert version == 2 - - -@patch("neptune.internal.queue.disk_queue.DiskQueue") -@patch("neptune.internal.backends.factory.HostedNeptuneBackend") -def test_continue(backend, queue): - # given - record1 = QueueElement(LogFloats(["a"], [1.0, 2.0]), 1, 1) - record2 = QueueElement(LogFloats(["a"], [3.0]), 2, 1) - record3 = QueueElement(LogFloats(["a"], [4.0, 5.0]), 3, 1) - - # and - queue.get.side_effect = (record1, record2, record3, None) - - # and - batcher = Batcher(queue=queue, backend=backend, max_points_per_batch=3) - - # when - result = batcher.collect_batch() - - # then - assert result is not None - - # and - accumulated_operations, _, version = result - assert accumulated_operations.other_operations == [LogFloats(["a"], [1.0, 2.0, 3.0])] - assert accumulated_operations.upload_operations == [] - assert accumulated_operations.artifact_operations == [] - assert 
accumulated_operations.errors == [] - assert accumulated_operations.source_operations_count == 1 - assert version == 2 - - # when - result = batcher.collect_batch() - - # then - assert result is not None - - # and - accumulated_operations, _, version = result - assert accumulated_operations.other_operations == [LogFloats(["a"], [4.0, 5.0])] - assert accumulated_operations.upload_operations == [] - assert accumulated_operations.artifact_operations == [] - assert accumulated_operations.errors == [] - assert accumulated_operations.source_operations_count == 1 - assert version == 3 - - # and - assert len(queue.get.mock_calls) == 4 - - -@patch("neptune.internal.queue.disk_queue.DiskQueue") -@patch("neptune.internal.backends.factory.HostedNeptuneBackend") -def test_copy(backend, queue): - # given - record1 = QueueElement(CopyAttribute(["a"], "123", ContainerType.RUN, ["b"], IntegerAttribute), 1, 1) - record2 = QueueElement(LogFloats(["c"], [3.0]), 2, 1) - - # and - queue.get.side_effect = (record1, record2, None) - - # and - backend.get_int_attribute.return_value = ApiIntAttribute(5) - - # and - batcher = Batcher(queue=queue, backend=backend) - - # when - result = batcher.collect_batch() - - # then - assert result is not None - - # and - accumulated_operations, _, version = result - assert accumulated_operations.other_operations == [AssignInt(["a"], 5), LogFloats(path=["c"], values=[3.0])] - assert accumulated_operations.upload_operations == [] - assert accumulated_operations.artifact_operations == [] - assert accumulated_operations.errors == [] - assert accumulated_operations.source_operations_count == 2 - assert version == 2 - - # and - backend.get_int_attribute.assert_called_once_with("123", ContainerType.RUN, ["b"]) - - -@patch("neptune.internal.queue.disk_queue.DiskQueue") -@patch("neptune.internal.backends.factory.HostedNeptuneBackend") -def test_two_copy_operations_at_begin(backend, queue): - # given - record1 = QueueElement(CopyAttribute(["a"], "123", ContainerType.RUN, ["b"], IntegerAttribute), 1, 1) - record2 = QueueElement(CopyAttribute(["b"], "123", ContainerType.RUN, ["d"], IntegerAttribute), 2, 1) - record3 = QueueElement(LogFloats(["c"], [3.0]), 3, 1) - - # and - queue.get.side_effect = (record1, record2, record3, None) - - # and - backend.get_int_attribute.side_effect = (ApiIntAttribute(5), ApiIntAttribute(3)) - - # and - batcher = Batcher(queue=queue, backend=backend) - - # when - result = batcher.collect_batch() - - # then - assert result is not None - - # and - accumulated_operations, _, version = result - assert accumulated_operations.other_operations == [AssignInt(["a"], 5)] - assert accumulated_operations.upload_operations == [] - assert accumulated_operations.artifact_operations == [] - assert accumulated_operations.errors == [] - assert accumulated_operations.source_operations_count == 1 - assert version == 1 - - # and - backend.get_int_attribute.assert_called_once_with("123", ContainerType.RUN, ["b"]) - - -@patch("neptune.internal.queue.disk_queue.DiskQueue") -@patch("neptune.internal.backends.factory.HostedNeptuneBackend") -def test_requires_previous_completed(backend, queue): - # given - record1 = QueueElement(UploadFileContent(["a"], "flac", "deadbeef"), 1, 1) - record2 = QueueElement(DeleteAttribute(["a"]), 2, 1) - record3 = QueueElement(UploadFileContent(["a"], "flac", "ffffffff"), 3, 1) - - # and - queue.get.side_effect = (record1, record2, record3, None) - - # and - batcher = Batcher(queue=queue, backend=backend) - - # when - result = batcher.collect_batch() - - # 
then - assert result is not None - - # and - accumulated_operations, _, version = result - assert accumulated_operations.other_operations == [DeleteAttribute(path=["a"])] - assert accumulated_operations.upload_operations == [UploadFileContent(["a"], "flac", "deadbeef")] - assert accumulated_operations.artifact_operations == [] - assert accumulated_operations.errors == [] - assert accumulated_operations.source_operations_count == 2 - assert version == 2 diff --git a/tests/unit/neptune/new/internal/preprocessor/__init__.py b/tests/unit/neptune/new/internal/preprocessor/__init__.py deleted file mode 100644 index 8d06af532..000000000 --- a/tests/unit/neptune/new/internal/preprocessor/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# -# Copyright (c) 2023, Neptune Labs Sp. z o.o. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# diff --git a/tests/unit/neptune/new/internal/preprocessor/test_operations_accumulator.py b/tests/unit/neptune/new/internal/preprocessor/test_operations_accumulator.py deleted file mode 100644 index cc7a3d7b1..000000000 --- a/tests/unit/neptune/new/internal/preprocessor/test_operations_accumulator.py +++ /dev/null @@ -1,505 +0,0 @@ -# -# Copyright (c) 2023, Neptune Labs Sp. z o.o. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -import datetime - -import pytest - -from neptune.attributes import Integer as IntegerAttributeType -from neptune.exceptions import MetadataInconsistency -from neptune.internal.container_type import ContainerType -from neptune.internal.operation import ( - AddStrings, - AssignArtifact, - AssignBool, - AssignDatetime, - AssignFloat, - AssignInt, - AssignString, - ClearArtifact, - ClearFloatLog, - ClearImageLog, - ClearStringLog, - ClearStringSet, - ConfigFloatSeries, - CopyAttribute, - DeleteAttribute, - DeleteFiles, - LogFloats, - LogImages, - LogStrings, - RemoveStrings, - TrackFilesToArtifact, - UploadFile, - UploadFileContent, - UploadFileSet, -) -from neptune.internal.preprocessor.operations_accumulator import OperationsAccumulator - - -def test_assign_float(): - # given - path = ["a", "b", "c"] - accumulator = OperationsAccumulator(path=path) - operations = [ - AssignFloat(path, 1.0), - AssignFloat(path, 2.0), - AssignFloat(path, 3.0), - ] - - # when - for op in operations: - op.accept(accumulator) - - # then - assert [AssignFloat(path, 3.0)] == accumulator.get_operations() - assert 0 == accumulator.get_append_count() - assert 1 == accumulator.get_op_count() - assert [] == accumulator.get_errors() - - -def test_assign_int(): - # given - path = ["a", "b", "c"] - accumulator = OperationsAccumulator(path=path) - operations = [ - AssignInt(path, 1), - AssignInt(path, 2), - AssignInt(path, 3), - ] - - # when - for op in operations: - op.accept(accumulator) - - # then - assert [AssignInt(path, 3)] == accumulator.get_operations() - assert 0 == accumulator.get_append_count() - assert 1 == accumulator.get_op_count() - assert [] == accumulator.get_errors() - - -def test_assign_bool(): - # given - path = ["a", "b", "c"] - accumulator = OperationsAccumulator(path=path) - operations = [ - AssignBool(path, False), - AssignBool(path, True), - ] - - # when - for op in operations: - op.accept(accumulator) - - # then - assert [AssignBool(path, True)] == accumulator.get_operations() - assert 0 == accumulator.get_append_count() - assert 1 == accumulator.get_op_count() - assert [] == accumulator.get_errors() - - -def test_assign_string(): - # given - path = ["a", "b", "c"] - accumulator = OperationsAccumulator(path=path) - operations = [ - AssignString(path, "a"), - AssignString(path, "b"), - ] - - # when - for op in operations: - op.accept(accumulator) - - # then - assert [AssignString(path, "b")] == accumulator.get_operations() - assert 0 == accumulator.get_append_count() - assert 1 == accumulator.get_op_count() - assert [] == accumulator.get_errors() - - -def test_assign_datetime(): - # given - path = ["a", "b", "c"] - accumulator = OperationsAccumulator(path=path) - operations = [ - AssignDatetime(path, datetime.datetime(1879, 3, 14)), - AssignDatetime(path, datetime.datetime(1846, 9, 23)), - ] - - # when - for op in operations: - op.accept(accumulator) - - # then - assert [AssignDatetime(path, datetime.datetime(1846, 9, 23))] == accumulator.get_operations() - assert 0 == accumulator.get_append_count() - assert 1 == accumulator.get_op_count() - assert [] == accumulator.get_errors() - - -def test_log_floats(): - # given - path = ["a", "b", "c"] - accumulator = OperationsAccumulator(path=path) - operations = [ - LogFloats(path, [1.0]), - LogFloats(path, [2.0, 3.0]), - LogFloats(path, [4.0, 5.0]), - ] - - # when - for op in operations: - op.accept(accumulator) - - # then - assert [LogFloats(path, [1.0, 2.0, 3.0, 4.0, 5.0])] == accumulator.get_operations() - assert 5 == accumulator.get_append_count() - 
assert 1 == accumulator.get_op_count() - assert [] == accumulator.get_errors() - - -def test_log_strings(): - # given - path = ["a", "b", "c"] - accumulator = OperationsAccumulator(path=path) - operations = [ - LogStrings(path, ["a"]), - LogStrings(path, ["b", "c"]), - LogStrings(path, ["d", "e"]), - ] - - # when - for op in operations: - op.accept(accumulator) - - # then - assert [LogStrings(path, ["a", "b", "c", "d", "e"])] == accumulator.get_operations() - assert 5 == accumulator.get_append_count() - assert 1 == accumulator.get_op_count() - assert [] == accumulator.get_errors() - - -def test_log_images(): - # given - path = ["a", "b", "c"] - accumulator = OperationsAccumulator(path=path) - operations = [ - LogImages(path, ["a"]), - LogImages(path, ["b", "c"]), - ] - - # when - for op in operations: - op.accept(accumulator) - - # then - assert [LogImages(path, ["a", "b", "c"])] == accumulator.get_operations() - assert 3 == accumulator.get_append_count() - assert 1 == accumulator.get_op_count() - assert [] == accumulator.get_errors() - - -def test_clear_floats(): - # given - path = ["a", "b", "c"] - accumulator = OperationsAccumulator(path=path) - operations = [ - LogFloats(path, [1.0]), - LogFloats(path, [2.0, 3.0]), - LogFloats(path, [4.0, 5.0]), - ClearFloatLog(path), - ] - - # when - for op in operations: - op.accept(accumulator) - - # then - assert [ClearFloatLog(path)] == accumulator.get_operations() - assert 0 == accumulator.get_append_count() - assert 1 == accumulator.get_op_count() - assert [] == accumulator.get_errors() - - -def test_clear_strings(): - # given - path = ["a", "b", "c"] - accumulator = OperationsAccumulator(path=path) - operations = [ - LogStrings(path, ["a"]), - LogStrings(path, ["b", "c"]), - LogStrings(path, ["d", "e"]), - ClearStringLog(path), - ] - - # when - for op in operations: - op.accept(accumulator) - - # then - assert [ClearStringLog(path)] == accumulator.get_operations() - assert 0 == accumulator.get_append_count() - assert 1 == accumulator.get_op_count() - assert [] == accumulator.get_errors() - - -def test_clear_images(): - # given - path = ["a", "b", "c"] - accumulator = OperationsAccumulator(path=path) - operations = [ - LogImages(path, ["a"]), - LogImages(path, ["b", "c"]), - ClearImageLog(path), - ] - - # when - for op in operations: - op.accept(accumulator) - - # then - assert [ClearImageLog(path)] == accumulator.get_operations() - assert 0 == accumulator.get_append_count() - assert 1 == accumulator.get_op_count() - assert [] == accumulator.get_errors() - - -def test_config(): - # given - path = ["a", "b", "c"] - accumulator = OperationsAccumulator(path=path) - operations = [ - ConfigFloatSeries(path, min=1.0, max=100.0, unit="%"), - LogFloats(path, [1.0]), - LogFloats(path, [2.0, 3.0]), - LogFloats(path, [4.0, 5.0]), - ] - - # when - for op in operations: - op.accept(accumulator) - - # then - assert [ - LogFloats(path, [1.0, 2.0, 3.0, 4.0, 5.0]), - ConfigFloatSeries(path, min=1.0, max=100.0, unit="%"), - ] == accumulator.get_operations() - assert 5 == accumulator.get_append_count() - assert 2 == accumulator.get_op_count() - assert [] == accumulator.get_errors() - - -def test_upload(): - # given - path = ["a", "b", "c"] - accumulator = OperationsAccumulator(path=path) - operations = [ - UploadFile(path, "a"), - UploadFile(path, "b"), - UploadFileContent(path, "flac", "c"), - ] - - # when - for op in operations: - op.accept(accumulator) - - # then - assert [UploadFileContent(path, "flac", "c")] == accumulator.get_operations() - assert 0 == 
accumulator.get_append_count() - assert 1 == accumulator.get_op_count() - assert [] == accumulator.get_errors() - - -def test_string_sets(): - # given - path = ["a", "b", "c"] - accumulator = OperationsAccumulator(path=path) - operations = [ - AddStrings(path, {"xx", "y", "abc"}), - AddStrings(path, {"zzz"}), - RemoveStrings(path, {"y", "abc"}), - ] - - # when - for op in operations: - op.accept(accumulator) - - # then - assert [ - AddStrings(path, {"xx", "y", "abc"}), - AddStrings(path, {"zzz"}), - RemoveStrings(path, {"y", "abc"}), - ] == accumulator.get_operations() - assert 0 == accumulator.get_append_count() - assert 3 == accumulator.get_op_count() - assert [] == accumulator.get_errors() - - -def test_clear_string_sets(): - # given - path = ["a", "b", "c"] - accumulator = OperationsAccumulator(path=path) - operations = [ - AddStrings(path, {"xx", "y", "abc"}), - AddStrings(path, {"zzz"}), - ClearStringSet(path), - ] - - # when - for op in operations: - op.accept(accumulator) - - # then - assert [ClearStringSet(path)] == accumulator.get_operations() - assert 0 == accumulator.get_append_count() - assert 1 == accumulator.get_op_count() - assert [] == accumulator.get_errors() - - -def test_assign_artifact(): - # given - path = ["a", "b", "c"] - accumulator = OperationsAccumulator(path=path) - operations = [ - AssignArtifact(path, "a"), - AssignArtifact(path, "b"), - ] - - # when - for op in operations: - op.accept(accumulator) - - # then - assert [AssignArtifact(path, "b")] == accumulator.get_operations() - assert 0 == accumulator.get_append_count() - assert 1 == accumulator.get_op_count() - assert [] == accumulator.get_errors() - - -def test_clear_artifact(): - # given - path = ["a", "b", "c"] - accumulator = OperationsAccumulator(path=path) - operations = [ - AssignArtifact(path, "a"), - AssignArtifact(path, "b"), - ClearArtifact(path), - ] - - # when - for op in operations: - op.accept(accumulator) - - # then - assert [ClearArtifact(path)] == accumulator.get_operations() - assert 0 == accumulator.get_append_count() - assert 1 == accumulator.get_op_count() - assert [] == accumulator.get_errors() - - -def test_artifact_track_files(): - # given - path = ["a", "b", "c"] - accumulator = OperationsAccumulator(path=path) - operations = [ - TrackFilesToArtifact(path, "a", [("a/b/c", "d/e/f")]), - TrackFilesToArtifact(path, "a", [("g/h/i", "j/k/l"), ("m", "n")]), - ] - - # when - for op in operations: - op.accept(accumulator) - - # then - assert [ - TrackFilesToArtifact(path, "a", [("a/b/c", "d/e/f"), ("g/h/i", "j/k/l"), ("m", "n")]) - ] == accumulator.get_operations() - assert 0 == accumulator.get_append_count() - assert 1 == accumulator.get_op_count() - assert [] == accumulator.get_errors() - - -def test_delete_float(): - # given - path = ["a", "b", "c"] - accumulator = OperationsAccumulator(path=path) - operations = [ - AssignFloat(path, 1.0), - AssignFloat(path, 2.0), - AssignFloat(path, 3.0), - DeleteAttribute(path), - AssignFloat(path, 4.0), - DeleteAttribute(path), - ] - - # when - for op in operations: - op.accept(accumulator) - - # then - assert [AssignFloat(path, 3.0), DeleteAttribute(path)] == accumulator.get_operations() - assert 0 == accumulator.get_append_count() - assert 2 == accumulator.get_op_count() - assert [] == accumulator.get_errors() - - -def test_copy(): - # given - path = ["a", "b", "c"] - accumulator = OperationsAccumulator(path=path) - - # when - with pytest.raises(MetadataInconsistency): - CopyAttribute(path, "a", ContainerType.RUN, ["d", "e"], 
IntegerAttributeType).accept(accumulator) - - -def test_file_set(): - # given - path = ["a", "b", "c"] - accumulator = OperationsAccumulator(path=path) - operations = [UploadFileSet(path, ["a", "b"], False), UploadFileSet(path, ["c"], False), DeleteFiles(path, {"b"})] - - # when - for op in operations: - op.accept(accumulator) - - # then - assert [ - UploadFileSet(path, ["a", "b"], False), - UploadFileSet(path, ["c"], False), - DeleteFiles(path, {"b"}), - ] == accumulator.get_operations() - assert 0 == accumulator.get_append_count() - assert 3 == accumulator.get_op_count() - assert [] == accumulator.get_errors() - - -def test_file_set_reset(): - # given - path = ["a", "b", "c"] - accumulator = OperationsAccumulator(path=path) - operations = [UploadFileSet(path, ["a", "b"], False), UploadFileSet(path, ["c"], True)] - - # when - for op in operations: - op.accept(accumulator) - - # then - assert [UploadFileSet(path, ["c"], True)] == accumulator.get_operations() - assert 0 == accumulator.get_append_count() - assert 1 == accumulator.get_op_count() - assert [] == accumulator.get_errors() diff --git a/tests/unit/neptune/new/internal/preprocessor/test_operations_preprocessor.py b/tests/unit/neptune/new/internal/preprocessor/test_operations_preprocessor.py deleted file mode 100644 index 8c68dd8a2..000000000 --- a/tests/unit/neptune/new/internal/preprocessor/test_operations_preprocessor.py +++ /dev/null @@ -1,423 +0,0 @@ -# -# Copyright (c) 2020, Neptune Labs Sp. z o.o. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -import datetime -import uuid - -from neptune.exceptions import MetadataInconsistency -from neptune.internal.operation import ( - AddStrings, - AssignArtifact, - AssignBool, - AssignDatetime, - AssignFloat, - AssignInt, - AssignString, - ClearArtifact, - ClearFloatLog, - ClearImageLog, - ClearStringLog, - ClearStringSet, - ConfigFloatSeries, - DeleteAttribute, - DeleteFiles, - LogFloats, - LogImages, - LogStrings, - RemoveStrings, - TrackFilesToArtifact, - UploadFile, - UploadFileContent, - UploadFileSet, -) -from neptune.internal.preprocessor.operations_preprocessor import OperationsPreprocessor - -FLog = LogFloats.ValueType -SLog = LogStrings.ValueType -ILog = LogImages.ValueType - - -def test_delete_attribute(): - # given - processor = OperationsPreprocessor() - - operations = [ - DeleteAttribute(["a"]), - AssignFloat(["a"], 1), - DeleteAttribute(["a"]), - AssignString(["b"], "2"), - DeleteAttribute(["b"]), - AssignString(["c"], "2"), - AssignString(["c"], "3"), - AssignInt(["d"], 4), - AssignInt(["d"], 5), - AssignArtifact(["e"], "123"), - DeleteAttribute(["c"]), - DeleteAttribute(["d"]), - DeleteAttribute(["e"]), - DeleteAttribute(["f"]), - DeleteAttribute(["f"]), - DeleteAttribute(["f"]), - ] - - # when - processor.process_batch(operations) - - # then - result = processor.accumulate_operations() - assert [] == result.upload_operations - assert [] == result.artifact_operations - assert [ - DeleteAttribute(["a"]), - AssignString(["b"], "2"), - DeleteAttribute(["b"]), - AssignString(["c"], "3"), - DeleteAttribute(["c"]), - AssignInt(["d"], 5), - DeleteAttribute(["d"]), - AssignArtifact(["e"], "123"), - DeleteAttribute(["e"]), - DeleteAttribute(["f"]), - ] == result.other_operations - assert [] == result.errors - assert len(operations) == processor.processed_ops_count - - -def test_assign(): - # given - processor = OperationsPreprocessor() - - operations = [ - AssignFloat(["a"], 1), - DeleteAttribute(["a"]), - AssignString(["a"], "111"), - DeleteAttribute(["b"]), - AssignFloat(["b"], 2), - AssignFloat(["c"], 3), - AssignString(["c"], "333"), - AssignString(["d"], "44"), - AssignFloat(["e"], 5), - AssignFloat(["e"], 10), - AssignFloat(["e"], 33), - AssignBool(["f"], False), - AssignBool(["f"], True), - AssignDatetime(["g"], datetime.datetime(1879, 3, 14)), - AssignDatetime(["g"], datetime.datetime(1846, 9, 23)), - UploadFile(["h"], "123"), - UploadFileContent(["i"], "123", "123"), - ] - - # when - processor.process_batch(operations) - - # then - result = processor.accumulate_operations() - assert [ - UploadFile(["h"], "123"), - UploadFileContent(["i"], "123", "123"), - ] == result.upload_operations - assert [] == result.artifact_operations - assert [ - AssignFloat(["a"], 1), - DeleteAttribute(["a"]), - AssignString(["a"], "111"), - DeleteAttribute(["b"]), - AssignFloat(["b"], 2), - AssignFloat(["c"], 3), - AssignString(["d"], "44"), - AssignFloat(["e"], 33), - AssignBool(["f"], True), - AssignDatetime(["g"], datetime.datetime(1846, 9, 23)), - ] == result.other_operations - assert [ - MetadataInconsistency("Cannot perform AssignString operation on c: Attribute is not a String") - ] == result.errors - assert len(operations) == processor.processed_ops_count - assert result.upload_operations + result.other_operations == result.all_operations() - - -def test_series(): - # given - processor = OperationsPreprocessor() - - operations = [ - LogFloats(["a"], [FLog(1, 2, 3)]), - ConfigFloatSeries(["a"], min=7, max=70, unit="%"), - DeleteAttribute(["a"]), - LogStrings(["a"], [SLog("111", 3, 4)]), 
- DeleteAttribute(["b"]), - LogStrings(["b"], [SLog("222", None, 6)]), - ClearStringLog(["b"]), - LogFloats(["c"], [FLog(1, 2, 3)]), - LogFloats(["c"], [FLog(10, 20, 30), FLog(100, 200, 300)]), - LogStrings(["d"], [SLog("4", 111, 222)]), - ClearFloatLog(["e"]), - LogImages(["f"], [ILog("1", 2, 3)]), - LogImages(["f"], [ILog("10", 20, 30), FLog("100", 200, 300)]), - LogImages(["f"], [ILog("1", 2, 3)]), - LogImages(["f"], [ILog("10", 20, 30), FLog("100", 200, 300)]), - ClearImageLog(["f"]), - LogImages(["f"], [ILog("3", 20, 30), FLog("4", 200, 300)]), - LogImages(["f"], [ILog("5", 2, 3)]), - LogImages(["f"], [ILog("8", 20, 30), FLog("1000", 200, 300)]), - LogImages(["g"], [ILog("10", 20, 30), FLog("100", 200, 300)]), - ClearImageLog(["g"]), - AssignString(["h"], "44"), - LogFloats(["h"], [FLog(10, 20, 30), FLog(100, 200, 300)]), - LogFloats(["i"], [FLog(1, 2, 3)]), - ConfigFloatSeries(["i"], min=7, max=70, unit="%"), - ClearFloatLog(["i"]), - LogFloats(["i"], [FLog(10, 20, 30), FLog(100, 200, 300)]), - AssignInt(["j"], 2), - ConfigFloatSeries(["j"], min=0, max=100, unit="%"), - ] - - # when - processor.process_batch(operations) - - # then - result = processor.accumulate_operations() - assert [] == result.upload_operations - assert [] == result.artifact_operations - assert [ - LogFloats(["a"], [FLog(1, 2, 3)]), - DeleteAttribute(["a"]), - LogStrings(["a"], [FLog("111", 3, 4)]), - DeleteAttribute(["b"]), - ClearStringLog(path=["b"]), - LogFloats(["c"], [FLog(1, 2, 3), FLog(10, 20, 30), FLog(100, 200, 300)]), - LogStrings(["d"], [SLog("4", 111, 222)]), - ClearFloatLog(["e"]), - ClearImageLog(["f"]), - LogImages( - ["f"], - [ - ILog("3", 20, 30), - FLog("4", 200, 300), - ILog("5", 2, 3), - ILog("8", 20, 30), - FLog("1000", 200, 300), - ], - ), - ClearImageLog(["g"]), - AssignString(["h"], "44"), - ClearFloatLog(["i"]), - LogFloats(["i"], [FLog(10, 20, 30), FLog(100, 200, 300)]), - ConfigFloatSeries(["i"], min=7, max=70, unit="%"), - AssignInt(["j"], 2), - ] == result.other_operations - assert [ - MetadataInconsistency("Cannot perform LogFloats operation on h: Attribute is not a Float Series"), - MetadataInconsistency("Cannot perform ConfigFloatSeries operation on j: Attribute is not a Float Series"), - ] == result.errors - assert len(operations) == processor.processed_ops_count - assert result.other_operations == result.all_operations() - - -def test_sets(): - # given - processor = OperationsPreprocessor() - - operations = [ - AddStrings(["a"], {"xx", "y", "abc"}), - DeleteAttribute(["a"]), - AddStrings(["a"], {"hhh", "gij"}), - DeleteAttribute(["b"]), - RemoveStrings(["b"], {"abc", "defgh"}), - AddStrings(["c"], {"hhh", "gij"}), - RemoveStrings(["c"], {"abc", "defgh"}), - AddStrings(["c"], {"qqq"}), - ClearStringSet(["d"]), - RemoveStrings(["e"], {"abc", "defgh"}), - AddStrings(["e"], {"qqq"}), - ClearStringSet(["e"]), - AddStrings(["f"], {"hhh", "gij"}), - RemoveStrings(["f"], {"abc", "defgh"}), - AddStrings(["f"], {"qqq"}), - ClearStringSet(["f"]), - AddStrings(["f"], {"xx", "y", "abc"}), - RemoveStrings(["f"], {"abc", "defgh"}), - AssignString(["h"], "44"), - RemoveStrings(["h"], {""}), - AssignFloat(["i"], 5), - AddStrings(["i"], {""}), - ] - - # when - processor.process_batch(operations) - - # then - result = processor.accumulate_operations() - assert [] == result.upload_operations - assert [] == result.artifact_operations - assert [ - AddStrings(["a"], {"xx", "y", "abc"}), - DeleteAttribute(["a"]), - AddStrings(["a"], {"hhh", "gij"}), - DeleteAttribute(["b"]), - RemoveStrings(["b"], 
{"abc", "defgh"}), - AddStrings(["c"], {"hhh", "gij"}), - RemoveStrings(["c"], {"abc", "defgh"}), - AddStrings(["c"], {"qqq"}), - ClearStringSet(["d"]), - ClearStringSet(["e"]), - ClearStringSet(["f"]), - AddStrings(["f"], {"xx", "y", "abc"}), - RemoveStrings(["f"], {"abc", "defgh"}), - AssignString(["h"], "44"), - AssignFloat(["i"], 5), - ] == result.other_operations - assert [ - MetadataInconsistency("Cannot perform RemoveStrings operation on h: Attribute is not a String Set"), - MetadataInconsistency("Cannot perform AddStrings operation on i: Attribute is not a String Set"), - ] == result.errors - assert len(operations) == processor.processed_ops_count - assert result.other_operations == result.all_operations() - - -def test_file_set(): - # given - processor = OperationsPreprocessor() - - operations = [ - UploadFileSet(["a"], ["xx", "y", "abc"], reset=False), - UploadFileSet(["a"], ["hhh", "gij"], reset=False), - UploadFileSet(["b"], ["abc", "defgh"], reset=True), - UploadFileSet(["c"], ["hhh", "gij"], reset=False), - UploadFileSet(["c"], ["abc", "defgh"], reset=True), - UploadFileSet(["c"], ["qqq"], reset=False), - UploadFileSet(["d"], ["hhh", "gij"], reset=False), - AssignFloat(["e"], 5), - UploadFileSet(["e"], [""], reset=False), - DeleteFiles(["d"], {"hhh"}), - ] - - # when - processor.process_batch(operations) - - # then - result = processor.accumulate_operations() - assert result.upload_operations == [ - UploadFileSet(["a"], ["xx", "y", "abc"], reset=False), - UploadFileSet(["a"], ["hhh", "gij"], reset=False), - UploadFileSet(["b"], ["abc", "defgh"], reset=True), - UploadFileSet(["c"], ["abc", "defgh"], reset=True), - UploadFileSet(["c"], ["qqq"], reset=False), - UploadFileSet(["d"], ["hhh", "gij"], reset=False), - ] - assert [] == result.artifact_operations - assert [ - DeleteFiles(["d"], {"hhh"}), - AssignFloat(["e"], 5), - ] == result.other_operations - assert [ - MetadataInconsistency("Cannot perform UploadFileSet operation on e: Attribute is not a File Set") - ] == result.errors - assert len(operations) == processor.processed_ops_count - assert result.upload_operations + result.other_operations == result.all_operations() - - -def test_file_ops_delete(): - # given - processor = OperationsPreprocessor() - - operations = [ - UploadFileSet(["b"], ["abc", "defgh"], reset=True), - UploadFileSet(["c"], ["hhh", "gij"], reset=False), - UploadFileSet(["c"], ["abc", "defgh"], reset=True), - UploadFileSet(["c"], ["qqq"], reset=False), - UploadFileSet(["d"], ["hhh", "gij"], reset=False), - DeleteAttribute(["a"]), - UploadFileSet(["a"], ["xx", "y", "abc"], reset=False), - UploadFileSet(["a"], ["hhh", "gij"], reset=False), - DeleteAttribute(["b"]), - ] - - # when - processor.process_batch(operations) - - # then: there's a cutoff after DeleteAttribute(["a"]) - result = processor.accumulate_operations() - assert result.upload_operations == [ - UploadFileSet(["b"], ["abc", "defgh"], reset=True), - UploadFileSet(["c"], ["abc", "defgh"], reset=True), - UploadFileSet(["c"], ["qqq"], reset=False), - UploadFileSet(["d"], ["hhh", "gij"], reset=False), - ] - assert [] == result.artifact_operations - assert [ - DeleteAttribute(["a"]), - ] == result.other_operations - assert [] == result.errors - assert 6 == processor.processed_ops_count - assert result.upload_operations + result.other_operations == result.all_operations() - - -def test_artifacts(): - # given - processor = OperationsPreprocessor() - project_uuid = str(uuid.uuid4()) - - operations = [ - TrackFilesToArtifact(["a"], project_uuid, 
[("dir1/", None)]), - DeleteAttribute(["a"]), - TrackFilesToArtifact(["b"], project_uuid, [("dir1/", None)]), - TrackFilesToArtifact(["b"], project_uuid, [("dir2/dir3/", "dir2/")]), - TrackFilesToArtifact(["b"], project_uuid, [("dir4/dir5/", "dir4/")]), - TrackFilesToArtifact(["c"], project_uuid, [("dir4/dir5/", "dir4/")]), - AssignFloat(["d"], 5), - ClearArtifact(["c"]), - TrackFilesToArtifact(["d"], project_uuid, [("dir1/", None)]), - TrackFilesToArtifact(["e"], project_uuid, [("dir2/dir3/", "dir2/")]), - TrackFilesToArtifact(["e"], project_uuid, [("dir4/", None)]), - TrackFilesToArtifact(["f"], project_uuid, [("dir1/", None)]), - TrackFilesToArtifact(["f"], project_uuid, [("dir2/dir3/", "dir2/")]), - TrackFilesToArtifact(["g"], project_uuid, [("dir1/", None)]), - TrackFilesToArtifact(["g"], project_uuid, [("dir2/dir3/", "dir2/")]), - TrackFilesToArtifact(["g"], project_uuid, [("dir4/", None)]), - TrackFilesToArtifact(["a"], project_uuid, [("dir1/", None)]), - ] - - # when - processor.process_batch(operations) - - # then: there's a cutoff before second TrackFilesToArtifact(["a"]) due to DeleteAttribute(["a"]) - result = processor.accumulate_operations() - assert [] == result.upload_operations - assert [ - TrackFilesToArtifact(["a"], project_uuid, [("dir1/", None)]), - TrackFilesToArtifact( - ["b"], - project_uuid, - [("dir1/", None), ("dir2/dir3/", "dir2/"), ("dir4/dir5/", "dir4/")], - ), - TrackFilesToArtifact(["e"], project_uuid, [("dir2/dir3/", "dir2/"), ("dir4/", None)]), - TrackFilesToArtifact(["f"], project_uuid, [("dir1/", None), ("dir2/dir3/", "dir2/")]), - TrackFilesToArtifact( - ["g"], - project_uuid, - [("dir1/", None), ("dir2/dir3/", "dir2/"), ("dir4/", None)], - ), - ] == result.artifact_operations - assert [ - DeleteAttribute(["a"]), - ClearArtifact(["c"]), - AssignFloat(["d"], 5), - ] == result.other_operations - assert [ - MetadataInconsistency("Cannot perform TrackFilesToArtifact operation on d: Attribute is not a Artifact"), - ] == result.errors - assert len(operations) - 1 == processor.processed_ops_count - assert result.artifact_operations + result.other_operations == result.all_operations() From 6eeaf340beea67b83ea0a661fdcec537b59dabdf Mon Sep 17 00:00:00 2001 From: Patryk Gala Date: Tue, 10 Oct 2023 10:06:03 +0200 Subject: [PATCH 5/7] Revert "Gzip request body if server support it (#1476)" This reverts commit 799e9d80ca8d0250dd2608fdf3e1f52e21e07638. 
--- CHANGELOG.md | 2 +- src/neptune/internal/backends/api_model.py | 3 -- .../internal/backends/hosted_client.py | 29 +------------------ 3 files changed, 2 insertions(+), 32 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e3b779867..d11eb391e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,7 +2,7 @@ ### Changes - Upgraded performance of sending series data to Neptune ([#1483](https://github.com/neptune-ai/neptune-client/pull/1483)) -- Compress (gzip) request to server, when server support it ([#1476](https://github.com/neptune-ai/neptune-client/pull/1476)) + ## neptune 1.8.1 diff --git a/src/neptune/internal/backends/api_model.py b/src/neptune/internal/backends/api_model.py index 88660ee12..9dd87195d 100644 --- a/src/neptune/internal/backends/api_model.py +++ b/src/neptune/internal/backends/api_model.py @@ -132,7 +132,6 @@ class ClientConfig: _missing_features: FrozenSet[str] version_info: VersionInfo multipart_config: MultipartConfig - gzip_upload: bool def has_feature(self, feature_name: str) -> bool: return feature_name not in self._missing_features @@ -163,7 +162,6 @@ def from_api_response(config) -> "ClientConfig": multipart_upload_config = MultipartConfig( min_chunk_size, max_chunk_size, max_chunk_count, max_single_part_size ) - gzip_upload = getattr(config, "gzipUpload", False) artifacts_config_obj = getattr(config, "artifacts", None) has_artifacts = getattr(artifacts_config_obj, "enabled", False) @@ -181,7 +179,6 @@ def from_api_response(config) -> "ClientConfig": _missing_features=frozenset(missing_features), version_info=VersionInfo.build(min_recommended, min_compatible, max_compatible), multipart_config=multipart_upload_config, - gzip_upload=gzip_upload, ) diff --git a/src/neptune/internal/backends/hosted_client.py b/src/neptune/internal/backends/hosted_client.py index 239a4eaf0..0a74e77ef 100644 --- a/src/neptune/internal/backends/hosted_client.py +++ b/src/neptune/internal/backends/hosted_client.py @@ -23,7 +23,6 @@ import os import platform -import zlib from typing import ( Dict, Tuple, @@ -32,11 +31,6 @@ import requests from bravado.http_client import HttpClient from bravado.requests_client import RequestsClient -from requests import ( - PreparedRequest, - Response, -) -from requests.adapters import HTTPAdapter from neptune.common.backends.utils import with_api_exceptions_handler from neptune.common.oauth import NeptuneAuthenticator @@ -54,7 +48,6 @@ verify_host_resolution, ) from neptune.internal.credentials import Credentials -from neptune.internal.utils.logger import logger from neptune.version import version as neptune_client_version BACKEND_SWAGGER_PATH = "/api/backend/swagger.json" @@ -73,22 +66,6 @@ } -class GzipAdapter(HTTPAdapter): - def send(self, request: PreparedRequest, stream: bool = False, **kw) -> Response: - if request.body is not None and not stream and request.headers.get("Content-Type", None) == "application/json": - try: - request_body = request.body if isinstance(request.body, bytes) else bytes(request.body, "utf-8") - gzip_compress = zlib.compressobj(zlib.Z_DEFAULT_COMPRESSION, zlib.DEFLATED, zlib.MAX_WBITS | 16) - compressed = gzip_compress.compress(request_body) + gzip_compress.flush() - request.prepare_body(compressed, None) - request.headers["Content-Encoding"] = "gzip" - except zlib.error: - logger.warning("Error on compressing request") - pass - - return super(GzipAdapter, self).send(request, stream, **kw) - - def _close_connections_on_fork(session: requests.Session): try: os.register_at_fork(before=session.close, 
after_in_child=session.close, after_in_parent=session.close) @@ -197,11 +174,7 @@ def create_backend_client(client_config: ClientConfig, http_client: HttpClient) @cache -def create_leaderboard_client(client_config: ClientConfig, http_client: RequestsClient) -> SwaggerClientWrapper: - if client_config.gzip_upload: - http_client.session.mount("http://", GzipAdapter()) - http_client.session.mount("https://", GzipAdapter()) - +def create_leaderboard_client(client_config: ClientConfig, http_client: HttpClient) -> SwaggerClientWrapper: return SwaggerClientWrapper( create_swagger_client( build_operation_url(client_config.api_url, LEADERBOARD_SWAGGER_PATH), From 2b3c9f4ec23a67a38f2fd1a67ec62cc6640384bb Mon Sep 17 00:00:00 2001 From: Patryk Gala Date: Tue, 10 Oct 2023 10:06:03 +0200 Subject: [PATCH 6/7] Revert "Upgraded performance of sending series data to Neptune (#1483)" This reverts commit 8ed5e05326fcb64dd37389d5d80026f728dc6384. --- CHANGELOG.md | 6 - pyproject.toml | 3 + src/neptune/cli/commands.py | 2 +- src/neptune/cli/path_option.py | 2 +- src/neptune/cli/sync.py | 2 +- src/neptune/cli/utils.py | 2 +- .../internal/{queue => }/disk_queue.py | 41 ++- .../async_operation_processor.py | 8 +- .../offline_operation_processor.py | 2 +- src/neptune/internal/queue/__init__.py | 15 -- .../{queue => utils}/json_file_splitter.py | 33 +-- .../{queue => utils}/sync_offset_file.py | 18 +- tests/unit/neptune/new/cli/utils.py | 4 +- .../neptune/new/internal/queue/__init__.py | 15 -- .../new/internal/queue/test_disk_queue.py | 250 ------------------ .../internal/queue/test_json_file_splitter.py | 176 ------------ .../neptune/new/internal/test_disk_queue.py | 189 +++++++++++++ .../internal/utils/test_json_file_splitter.py | 177 +++++++++++++ 18 files changed, 416 insertions(+), 529 deletions(-) rename src/neptune/internal/{queue => }/disk_queue.py (88%) delete mode 100644 src/neptune/internal/queue/__init__.py rename src/neptune/internal/{queue => utils}/json_file_splitter.py (79%) rename src/neptune/internal/{queue => utils}/sync_offset_file.py (78%) delete mode 100644 tests/unit/neptune/new/internal/queue/__init__.py delete mode 100644 tests/unit/neptune/new/internal/queue/test_disk_queue.py delete mode 100644 tests/unit/neptune/new/internal/queue/test_json_file_splitter.py create mode 100644 tests/unit/neptune/new/internal/test_disk_queue.py create mode 100644 tests/unit/neptune/new/internal/utils/test_json_file_splitter.py diff --git a/CHANGELOG.md b/CHANGELOG.md index d11eb391e..c87dc84de 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,9 +1,3 @@ -## [UNRELEASED] neptune 1.9.0 - -### Changes -- Upgraded performance of sending series data to Neptune ([#1483](https://github.com/neptune-ai/neptune-client/pull/1483)) - - ## neptune 1.8.1 ### Fixes diff --git a/pyproject.toml b/pyproject.toml index ce02ea0f1..471b8fc6e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -327,6 +327,7 @@ module = [ "neptune.internal.background_job", "neptune.internal.container_structure", "neptune.internal.credentials", + "neptune.internal.disk_queue", "neptune.internal.hardware.gpu.gpu_monitor", "neptune.internal.hardware.hardware_metric_reporting_job", "neptune.internal.id_formats", @@ -351,6 +352,7 @@ module = [ "neptune.internal.utils.git", "neptune.internal.utils.hashing", "neptune.internal.utils.images", + "neptune.internal.utils.json_file_splitter", "neptune.internal.utils.limits", "neptune.internal.utils.logger", "neptune.internal.utils.ping_background_job", @@ -359,6 +361,7 @@ module = [ 
"neptune.internal.utils.runningmode", "neptune.internal.utils.s3", "neptune.internal.utils.source_code", + "neptune.internal.utils.sync_offset_file", "neptune.internal.utils.traceback_job", "neptune.internal.utils.uncaught_exception_handler", "neptune.internal.websockets.websocket_signals_background_job", diff --git a/src/neptune/cli/commands.py b/src/neptune/cli/commands.py index a6c809a4a..5d7d944d4 100644 --- a/src/neptune/cli/commands.py +++ b/src/neptune/cli/commands.py @@ -41,8 +41,8 @@ from neptune.internal.backends.hosted_neptune_backend import HostedNeptuneBackend from neptune.internal.backends.neptune_backend import NeptuneBackend # noqa: F401 from neptune.internal.credentials import Credentials +from neptune.internal.disk_queue import DiskQueue # noqa: F401 from neptune.internal.operation import Operation # noqa: F401 -from neptune.internal.queue.disk_queue import DiskQueue # noqa: F401 @click.command() diff --git a/src/neptune/cli/path_option.py b/src/neptune/cli/path_option.py index 5a9728a4f..b182716d5 100644 --- a/src/neptune/cli/path_option.py +++ b/src/neptune/cli/path_option.py @@ -32,8 +32,8 @@ Project, ) from neptune.internal.backends.neptune_backend import NeptuneBackend # noqa: F401 +from neptune.internal.disk_queue import DiskQueue # noqa: F401 from neptune.internal.operation import Operation # noqa: F401 -from neptune.internal.queue.disk_queue import DiskQueue # noqa: F401 def get_neptune_path(ctx, param, path: str) -> Path: diff --git a/src/neptune/cli/sync.py b/src/neptune/cli/sync.py index 1f4ee4fb5..bff8c4d99 100644 --- a/src/neptune/cli/sync.py +++ b/src/neptune/cli/sync.py @@ -51,13 +51,13 @@ Project, ) from neptune.internal.container_type import ContainerType +from neptune.internal.disk_queue import DiskQueue from neptune.internal.id_formats import ( QualifiedName, UniqueId, ) from neptune.internal.operation import Operation from neptune.internal.operation_processors.operation_storage import OperationStorage -from neptune.internal.queue.disk_queue import DiskQueue from neptune.internal.utils.logger import logger retries_timeout = int(os.getenv(NEPTUNE_SYNC_BATCH_TIMEOUT_ENV, "3600")) diff --git a/src/neptune/cli/utils.py b/src/neptune/cli/utils.py index 36fae139f..948417c94 100644 --- a/src/neptune/cli/utils.py +++ b/src/neptune/cli/utils.py @@ -50,12 +50,12 @@ ) from neptune.internal.backends.neptune_backend import NeptuneBackend from neptune.internal.container_type import ContainerType +from neptune.internal.disk_queue import DiskQueue from neptune.internal.id_formats import ( QualifiedName, UniqueId, ) from neptune.internal.operation import Operation -from neptune.internal.queue.disk_queue import DiskQueue from neptune.internal.utils.logger import logger diff --git a/src/neptune/internal/queue/disk_queue.py b/src/neptune/internal/disk_queue.py similarity index 88% rename from src/neptune/internal/queue/disk_queue.py rename to src/neptune/internal/disk_queue.py index bf7146c6a..19ed0ee3a 100644 --- a/src/neptune/internal/queue/disk_queue.py +++ b/src/neptune/internal/disk_queue.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# -__all__ = ["DiskQueue"] +__all__ = ["QueueElement", "DiskQueue"] import json import logging @@ -23,21 +23,18 @@ from dataclasses import dataclass from glob import glob from pathlib import Path -from types import TracebackType from typing import ( Callable, Generic, List, Optional, Tuple, - Type, TypeVar, - Union, ) from neptune.exceptions import MalformedOperation -from neptune.internal.queue.json_file_splitter import JsonFileSplitter -from neptune.internal.queue.sync_offset_file import SyncOffsetFile +from neptune.internal.utils.json_file_splitter import JsonFileSplitter +from neptune.internal.utils.sync_offset_file import SyncOffsetFile T = TypeVar("T") @@ -62,13 +59,13 @@ def __init__( from_dict: Callable[[dict], T], lock: threading.RLock, max_file_size: int = 64 * 1024**2, - max_batch_size_bytes: Optional[int] = None, + max_batch_size_bytes: int = None, ): - self._dir_path: Path = dir_path.resolve() - self._to_dict: Callable[[T], dict] = to_dict - self._from_dict: Callable[[dict], T] = from_dict - self._max_file_size: int = max_file_size - self._max_batch_size_bytes: int = max_batch_size_bytes or int( + self._dir_path = dir_path.resolve() + self._to_dict = to_dict + self._from_dict = from_dict + self._max_file_size = max_file_size + self._max_batch_size_bytes = max_batch_size_bytes or int( os.environ.get("NEPTUNE_MAX_BATCH_SIZE_BYTES") or str(self.DEFAULT_MAX_BATCH_SIZE_BYTES) ) @@ -77,8 +74,8 @@ def __init__( except FileExistsError: pass - self._last_ack_file = SyncOffsetFile(dir_path / "last_ack_version") - self._last_put_file = SyncOffsetFile(dir_path / "last_put_version") + self._last_ack_file = SyncOffsetFile(dir_path / "last_ack_version", default=0) + self._last_put_file = SyncOffsetFile(dir_path / "last_put_version", default=0) ( self._read_file_version, @@ -165,12 +162,12 @@ def get_batch(self, size: int) -> List[QueueElement[T]]: ret.append(next_obj) return ret - def flush(self) -> None: + def flush(self): self._writer.flush() self._last_ack_file.flush() self._last_put_file.flush() - def close(self) -> None: + def close(self): self._reader.close() self._writer.close() self._last_ack_file.close() @@ -183,7 +180,7 @@ def cleanup_if_empty(self) -> None: if self.is_empty(): self._remove_data() - def _remove_data(self) -> None: + def _remove_data(self): path = self._dir_path shutil.rmtree(path, ignore_errors=True) @@ -234,13 +231,13 @@ def size(self) -> int: def _get_log_file(self, index: int) -> str: return "{}/data-{}.log".format(self._dir_path, index) - def _get_all_log_file_versions(self) -> Union[Tuple[int, int], List[int]]: + def _get_all_log_file_versions(self): log_files = glob("{}/data-*.log".format(self._dir_path)) if not log_files: return 1, 1 return sorted([int(file[len(str(self._dir_path)) + 6 : -4]) for file in log_files]) - def _get_first_and_last_log_file_version(self) -> Tuple[int, int]: + def _get_first_and_last_log_file_version(self) -> (int, int): log_versions = self._get_all_log_file_versions() return min(log_versions), max(log_versions) @@ -257,12 +254,10 @@ def _serialize(self, obj: T, version: int) -> dict: def _deserialize(self, data: dict) -> Tuple[T, int]: return self._from_dict(data["obj"]), data["version"] - def __enter__(self) -> "DiskQueue": + def __enter__(self): return self - def __exit__( - self, exc_type: Type[BaseException], exc_value: BaseException, exc_traceback: Optional[TracebackType] - ) -> None: + def __exit__(self, exc_type, exc_val, exc_tb): self.flush() self.close() self.cleanup_if_empty() diff --git 
a/src/neptune/internal/operation_processors/async_operation_processor.py b/src/neptune/internal/operation_processors/async_operation_processor.py index fecd23d58..3b6f9a9da 100644 --- a/src/neptune/internal/operation_processors/async_operation_processor.py +++ b/src/neptune/internal/operation_processors/async_operation_processor.py @@ -38,6 +38,10 @@ from neptune.internal.backends.neptune_backend import NeptuneBackend from neptune.internal.backends.operations_preprocessor import OperationsPreprocessor from neptune.internal.container_type import ContainerType +from neptune.internal.disk_queue import ( + DiskQueue, + QueueElement, +) from neptune.internal.id_formats import UniqueId from neptune.internal.init.parameters import ( ASYNC_LAG_THRESHOLD, @@ -53,10 +57,6 @@ OperationStorage, get_container_dir, ) -from neptune.internal.queue.disk_queue import ( - DiskQueue, - QueueElement, -) from neptune.internal.threading.daemon import Daemon from neptune.internal.utils.logger import logger diff --git a/src/neptune/internal/operation_processors/offline_operation_processor.py b/src/neptune/internal/operation_processors/offline_operation_processor.py index dd50359cf..de286242f 100644 --- a/src/neptune/internal/operation_processors/offline_operation_processor.py +++ b/src/neptune/internal/operation_processors/offline_operation_processor.py @@ -21,6 +21,7 @@ from neptune.constants import OFFLINE_DIRECTORY from neptune.internal.container_type import ContainerType +from neptune.internal.disk_queue import DiskQueue from neptune.internal.id_formats import UniqueId from neptune.internal.operation import Operation from neptune.internal.operation_processors.operation_processor import OperationProcessor @@ -28,7 +29,6 @@ OperationStorage, get_container_dir, ) -from neptune.internal.queue.disk_queue import DiskQueue class OfflineOperationProcessor(OperationProcessor): diff --git a/src/neptune/internal/queue/__init__.py b/src/neptune/internal/queue/__init__.py deleted file mode 100644 index 8d06af532..000000000 --- a/src/neptune/internal/queue/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# -# Copyright (c) 2023, Neptune Labs Sp. z o.o. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# diff --git a/src/neptune/internal/queue/json_file_splitter.py b/src/neptune/internal/utils/json_file_splitter.py similarity index 79% rename from src/neptune/internal/queue/json_file_splitter.py rename to src/neptune/internal/utils/json_file_splitter.py index da22df686..b0b91853a 100644 --- a/src/neptune/internal/queue/json_file_splitter.py +++ b/src/neptune/internal/utils/json_file_splitter.py @@ -15,20 +15,13 @@ # __all__ = ["JsonFileSplitter"] +import json from collections import deque from io import StringIO -from json import ( - JSONDecodeError, - JSONDecoder, -) -from types import TracebackType +from json import JSONDecodeError from typing import ( - IO, - Any, - Deque, Optional, Tuple, - Type, ) @@ -37,11 +30,11 @@ class JsonFileSplitter: MAX_PART_READ = 8 * 1024 def __init__(self, file_path: str): - self._file: IO = open(file_path, "r") - self._decoder: JSONDecoder = JSONDecoder(strict=False) - self._part_buffer: StringIO = StringIO() - self._parsed_queue: Deque[Tuple[Any, int]] = deque() - self._start_pos: int = 0 + self._file = open(file_path, "r") + self._decoder = json.JSONDecoder(strict=False) + self._part_buffer = StringIO() + self._parsed_queue = deque() + self._start_pos = 0 def close(self) -> None: self._file.close() @@ -58,7 +51,7 @@ def get_with_size(self) -> Tuple[Optional[dict], int]: return self._parsed_queue.popleft() return None, 0 - def _read_data(self) -> None: + def _read_data(self): if self._part_buffer.tell() < self.MAX_PART_READ: data = self._file.read(self.BUFFER_SIZE) if not data: @@ -75,7 +68,7 @@ def _read_data(self) -> None: data = self._reset_part_buffer() self._decode(data) - def _decode(self, data: str) -> None: + def _decode(self, data: str): start = self._json_start(data) while start is not None: try: @@ -101,11 +94,3 @@ def _reset_part_buffer(self) -> str: self._part_buffer.close() self._part_buffer = StringIO() return data - - def __enter__(self) -> "JsonFileSplitter": - return self - - def __exit__( - self, exc_type: Type[BaseException], exc_value: BaseException, exc_traceback: Optional[TracebackType] - ) -> None: - self.close() diff --git a/src/neptune/internal/queue/sync_offset_file.py b/src/neptune/internal/utils/sync_offset_file.py similarity index 78% rename from src/neptune/internal/queue/sync_offset_file.py rename to src/neptune/internal/utils/sync_offset_file.py index aa9c2238b..d92d448c7 100644 --- a/src/neptune/internal/queue/sync_offset_file.py +++ b/src/neptune/internal/utils/sync_offset_file.py @@ -16,15 +16,15 @@ __all__ = ["SyncOffsetFile"] from pathlib import Path -from typing import IO +from typing import Optional class SyncOffsetFile: - def __init__(self, path: Path, default: int = 0): + def __init__(self, path: Path, default: int = None): mode = "r+" if path.exists() else "w+" - self._file: IO = open(path, mode) - self._default: int = default - self._last: int = self.read() + self._file = open(path, mode) + self._default = default + self._last = self.read() def write(self, offset: int) -> None: self._file.seek(0) @@ -33,18 +33,18 @@ def write(self, offset: int) -> None: self._file.flush() self._last = offset - def read(self) -> int: + def read(self) -> Optional[int]: self._file.seek(0) content = self._file.read() if not content: return self._default return int(content) - def read_local(self) -> int: + def read_local(self) -> Optional[int]: return self._last - def flush(self) -> None: + def flush(self): self._file.flush() - def close(self) -> None: + def close(self): self._file.close() diff --git 
a/tests/unit/neptune/new/cli/utils.py b/tests/unit/neptune/new/cli/utils.py index 816579e27..ce11708fa 100644 --- a/tests/unit/neptune/new/cli/utils.py +++ b/tests/unit/neptune/new/cli/utils.py @@ -26,8 +26,8 @@ from neptune.exceptions import MetadataContainerNotFound from neptune.internal.backends.api_model import ApiExperiment from neptune.internal.container_type import ContainerType -from neptune.internal.queue.disk_queue import DiskQueue -from neptune.internal.queue.sync_offset_file import SyncOffsetFile +from neptune.internal.disk_queue import DiskQueue +from neptune.internal.utils.sync_offset_file import SyncOffsetFile from tests.unit.neptune.new.utils.api_experiments_factory import ( api_metadata_container, api_run, diff --git a/tests/unit/neptune/new/internal/queue/__init__.py b/tests/unit/neptune/new/internal/queue/__init__.py deleted file mode 100644 index 8d06af532..000000000 --- a/tests/unit/neptune/new/internal/queue/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# -# Copyright (c) 2023, Neptune Labs Sp. z o.o. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# diff --git a/tests/unit/neptune/new/internal/queue/test_disk_queue.py b/tests/unit/neptune/new/internal/queue/test_disk_queue.py deleted file mode 100644 index 61371e95e..000000000 --- a/tests/unit/neptune/new/internal/queue/test_disk_queue.py +++ /dev/null @@ -1,250 +0,0 @@ -# -# Copyright (c) 2020, Neptune Labs Sp. z o.o. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -import json -import random -import threading -from dataclasses import dataclass -from glob import glob -from pathlib import Path -from tempfile import TemporaryDirectory - -import mock - -from neptune.internal.queue.disk_queue import ( - DiskQueue, - QueueElement, -) - - -def test_put(): - with TemporaryDirectory() as dir_path: - with DiskQueue[Obj]( - dir_path=Path(dir_path), - to_dict=serializer, - from_dict=deserializer, - lock=threading.RLock(), - ) as queue: - # given - obj = Obj(5, "test") - queue.put(obj) - - # when - queue.flush() - - # then - assert get_queue_element(obj, 1) == queue.get() - - -def test_multiple_files(): - with TemporaryDirectory() as dir_path: - with DiskQueue[Obj]( - dir_path=Path(dir_path), - to_dict=serializer, - from_dict=deserializer, - lock=threading.RLock(), - max_file_size=300, - ) as queue: - # given - for i in range(1, 101): - obj = Obj(i, str(i)) - queue.put(obj) - - # when - queue.flush() - - # then - for i in range(1, 101): - obj = Obj(i, str(i)) - assert get_queue_element(obj, i) == queue.get() - - # and - assert queue._read_file_version > 90 - assert queue._write_file_version > 90 - assert len(glob(dir_path + "/data-*.log")) > 10 - - -def test_get_batch(): - with TemporaryDirectory() as dir_path: - with DiskQueue[Obj]( - dir_path=Path(dir_path), - to_dict=serializer, - from_dict=deserializer, - lock=threading.RLock(), - max_file_size=100, - ) as queue: - # given - for i in range(1, 91): - obj = Obj(i, str(i)) - queue.put(obj) - - # when - queue.flush() - - # then - assert [get_queue_element(Obj(i, str(i)), i) for i in range(1, 26)] == queue.get_batch(25) - assert [get_queue_element(Obj(i, str(i)), i) for i in range(26, 51)] == queue.get_batch(25) - assert [get_queue_element(Obj(i, str(i)), i) for i in range(51, 76)] == queue.get_batch(25) - assert [get_queue_element(Obj(i, str(i)), i) for i in range(76, 91)] == queue.get_batch(25) - - -def test_batch_limit(): - with TemporaryDirectory() as dir_path: - with DiskQueue[Obj]( - dir_path=Path(dir_path), - to_dict=serializer, - from_dict=deserializer, - lock=threading.RLock(), - max_file_size=100, - max_batch_size_bytes=get_obj_size_bytes(Obj(1, "1"), 1) * 3, - ) as queue: - # given - for i in range(5): - obj = Obj(i, str(i)) - queue.put(obj) - - # when - queue.flush() - - # then - assert [get_queue_element(Obj(i, str(i)), i + 1) for i in range(3)] == queue.get_batch(5) - assert [get_queue_element(Obj(i, str(i)), i + 1) for i in range(3, 5)] == queue.get_batch(2) - - -def test_resuming_queue(): - with TemporaryDirectory() as dir_path: - with DiskQueue[Obj]( - dir_path=Path(dir_path), - to_dict=serializer, - from_dict=deserializer, - lock=threading.RLock(), - max_file_size=999, - ) as queue: - # given - for i in range(1, 501): - obj = Obj(i, str(i)) - queue.put(obj) - - # when - queue.flush() - - # and - version = queue.get_batch(random.randrange(300, 400))[-1].ver - version_to_ack = version - random.randrange(100, 200) - queue.ack(version_to_ack) - - # then - assert queue._read_file_version > 100 - assert queue._write_file_version > 450 - - # and - data_files = glob(dir_path + "/data-*.log") - data_files_versions = [int(file[len(dir_path + "/data-") : -len(".log")]) for file in data_files] - - assert len(data_files) > 10 - assert 1 == len([ver for ver in data_files_versions if ver <= version_to_ack]) - - # Resume queue - with DiskQueue[Obj]( - dir_path=Path(dir_path), - to_dict=serializer, - from_dict=deserializer, - lock=threading.RLock(), - max_file_size=200, - ) as queue: - # then - for i in 
range(version_to_ack + 1, 501): - assert get_queue_element(Obj(i, str(i)), i) == queue.get() - - -def test_ack(): - with TemporaryDirectory() as dir_path: - with DiskQueue[Obj]( - dir_path=Path(dir_path), - to_dict=serializer, - from_dict=deserializer, - lock=threading.RLock(), - max_file_size=999, - ) as queue: - # given - for i in range(5): - queue.put(Obj(i, str(i))) - - # when - queue.flush() - - # and - queue.ack(3) - - # then - assert get_queue_element(Obj(3, "3"), 4) == queue.get() - assert get_queue_element(Obj(4, "4"), 5) == queue.get() - - -@mock.patch("shutil.rmtree") -def test_cleaning_up(rmtree): - with TemporaryDirectory() as dir_path: - with DiskQueue[Obj]( - dir_path=Path(dir_path), - to_dict=serializer, - from_dict=deserializer, - lock=threading.RLock(), - max_file_size=999, - ) as queue: - # given - for i in range(5): - queue.put(Obj(i, str(i))) - - # when - queue.flush() - - # and - queue.ack(5) - - # then - assert 0 == queue.size() - assert queue.is_empty() - - # when - queue.cleanup_if_empty() - - # then - assert rmtree.assert_called_once_with(Path(dir_path).resolve(), ignore_errors=True) is None - - -@dataclass -class Obj: - num: int - txt: str - - -def get_obj_size_bytes(obj, version) -> int: - return len(json.dumps({"obj": obj.__dict__, "version": version})) - - -def get_queue_element(obj, version) -> QueueElement[Obj]: - return QueueElement(obj, version, get_obj_size_bytes(obj, version)) - - -def serializer(obj: "Obj") -> dict: - return obj.__dict__ - - -def deserializer(obj: dict) -> "Obj": - return Obj(**obj) - - -def version_getter(obj: "Obj") -> int: - return obj.num diff --git a/tests/unit/neptune/new/internal/queue/test_json_file_splitter.py b/tests/unit/neptune/new/internal/queue/test_json_file_splitter.py deleted file mode 100644 index 56ebfa3be..000000000 --- a/tests/unit/neptune/new/internal/queue/test_json_file_splitter.py +++ /dev/null @@ -1,176 +0,0 @@ -# -# Copyright (c) 2020, Neptune Labs Sp. z o.o. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -from neptune.internal.queue.json_file_splitter import JsonFileSplitter -from tests.unit.neptune.new.utils.file_helpers import create_file - - -def test_simple_file(): - content = """ - { - "a": 5, - "b": "text" - } - { - "a": 13 - } - {} - """.lstrip() - - with create_file(content) as filename: - with JsonFileSplitter(filename) as splitter: - assert splitter.get() == {"a": 5, "b": "text"} - assert splitter.get() == {"a": 13} - assert splitter.get() == {} - assert splitter.get() is None - - -def test_append(): - content1 = """ - { - "a": 5, - "b": "text" - } - { - "a": 13 - }""" - - content2 = """ - { - "q": 555, - "r": "something" - } - { - "a": { - "b": [1, 2, 3] - } - } - {}""" - - with create_file(content1) as filename, open(filename, "a") as fp: - with JsonFileSplitter(filename) as splitter: - assert splitter.get() == {"a": 5, "b": "text"} - assert splitter.get() == {"a": 13} - assert splitter.get() is None - - fp.write(content2) - fp.flush() - - assert splitter.get() == {"q": 555, "r": "something"} - assert splitter.get() == {"a": {"b": [1, 2, 3]}} - assert splitter.get() == {} - assert splitter.get() is None - - -def test_append_cut_json(): - content1 = """ - { - "a": 5, - "b": "text" - } - { - "a": 1""" - - content2 = """55, - "r": "something" - } - { - "a": { - "b": [1, 2, 3] - } - }""" - - with create_file(content1) as filename, open(filename, "a") as fp: - with JsonFileSplitter(filename) as splitter: - assert splitter.get() == {"a": 5, "b": "text"} - assert splitter.get() is None - - fp.write(content2) - fp.flush() - - assert splitter.get() == {"a": 155, "r": "something"} - assert splitter.get() == {"a": {"b": [1, 2, 3]}} - assert splitter.get() is None - - -def test_big_json(): - content = """ - { - "a": 5, - "b": "text" - } - { - "a": "%s", - "b": "%s" - } - {} - """.lstrip() % ( - "x" * JsonFileSplitter.BUFFER_SIZE * 2, - "y" * JsonFileSplitter.BUFFER_SIZE * 2, - ) - - with create_file(content) as filename: - with JsonFileSplitter(filename) as splitter: - assert splitter.get() == {"a": 5, "b": "text"} - assert splitter.get() == { - "a": "x" * JsonFileSplitter.BUFFER_SIZE * 2, - "b": "y" * JsonFileSplitter.BUFFER_SIZE * 2, - } - assert splitter.get() == {} - assert splitter.get() is None - - -def test_data_size(): - object1 = """{ - "a": 5, - "b": "text" - }""" - object2 = """{ - "a": 155, - "r": "something" - }""" - object3 = """{ - "a": { - "b": [1, 2, 3] - } - }""" - content1 = """ - { - "a": 5, - "b": "text" - } - { - "a": 1""" - - content2 = """55, - "r": "something" - } - { - "a": { - "b": [1, 2, 3] - } - }""" - - with create_file(content1) as filename, open(filename, "a") as fp: - with JsonFileSplitter(filename) as splitter: - assert splitter.get_with_size() == ({"a": 5, "b": "text"}, len(object1)) - assert splitter.get_with_size()[0] is None - - fp.write(content2) - fp.flush() - - assert splitter.get_with_size() == ({"a": 155, "r": "something"}, len(object2)) - assert splitter.get_with_size() == ({"a": {"b": [1, 2, 3]}}, len(object3)) - assert splitter.get_with_size()[0] is None diff --git a/tests/unit/neptune/new/internal/test_disk_queue.py b/tests/unit/neptune/new/internal/test_disk_queue.py new file mode 100644 index 000000000..74bcb73ec --- /dev/null +++ b/tests/unit/neptune/new/internal/test_disk_queue.py @@ -0,0 +1,189 @@ +# +# Copyright (c) 2020, Neptune Labs Sp. z o.o. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import json +import random +import threading +import unittest +from glob import glob +from pathlib import Path +from tempfile import TemporaryDirectory + +from neptune.internal.disk_queue import ( + DiskQueue, + QueueElement, +) + + +class TestDiskQueue(unittest.TestCase): + class Obj: + def __init__(self, num: int, txt: str): + self.num = num + self.txt = txt + + def __eq__(self, other): + return isinstance(other, TestDiskQueue.Obj) and self.num == other.num and self.txt == other.txt + + @staticmethod + def get_obj_size_bytes(obj, version) -> int: + return len(json.dumps({"obj": obj.__dict__, "version": version})) + + @staticmethod + def get_queue_element(obj, version) -> QueueElement[Obj]: + obj_size = len(json.dumps({"obj": obj.__dict__, "version": version})) + return QueueElement(obj, version, obj_size) + + def test_put(self): + with TemporaryDirectory() as dirpath: + queue = DiskQueue[TestDiskQueue.Obj]( + Path(dirpath), + self._serializer, + self._deserializer, + threading.RLock(), + ) + obj = TestDiskQueue.Obj(5, "test") + queue.put(obj) + queue.flush() + self.assertEqual(queue.get(), self.get_queue_element(obj, 1)) + queue.close() + + def test_multiple_files(self): + with TemporaryDirectory() as dirpath: + queue = DiskQueue[TestDiskQueue.Obj]( + Path(dirpath), + self._serializer, + self._deserializer, + threading.RLock(), + max_file_size=300, + ) + for i in range(1, 101): + obj = TestDiskQueue.Obj(i, str(i)) + queue.put(obj) + queue.flush() + for i in range(1, 101): + obj = TestDiskQueue.Obj(i, str(i)) + self.assertEqual(queue.get(), self.get_queue_element(obj, i)) + queue.close() + self.assertTrue(queue._read_file_version > 90) + self.assertTrue(queue._write_file_version > 90) + self.assertTrue(len(glob(dirpath + "/data-*.log")) > 10) + + def test_get_batch(self): + with TemporaryDirectory() as dirpath: + queue = DiskQueue[TestDiskQueue.Obj]( + Path(dirpath), + self._serializer, + self._deserializer, + threading.RLock(), + max_file_size=100, + ) + for i in range(1, 91): + obj = TestDiskQueue.Obj(i, str(i)) + queue.put(obj) + queue.flush() + self.assertEqual( + queue.get_batch(25), + [self.get_queue_element(TestDiskQueue.Obj(i, str(i)), i) for i in range(1, 26)], + ) + self.assertEqual( + queue.get_batch(25), + [self.get_queue_element(TestDiskQueue.Obj(i, str(i)), i) for i in range(26, 51)], + ) + self.assertEqual( + queue.get_batch(25), + [self.get_queue_element(TestDiskQueue.Obj(i, str(i)), i) for i in range(51, 76)], + ) + self.assertEqual( + queue.get_batch(25), + [self.get_queue_element(TestDiskQueue.Obj(i, str(i)), i) for i in range(76, 91)], + ) + queue.close() + + def test_batch_limit(self): + with TemporaryDirectory() as dirpath: + obj_size = self.get_obj_size_bytes(TestDiskQueue.Obj(1, "1"), 1) + queue = DiskQueue[TestDiskQueue.Obj]( + Path(dirpath), + self._serializer, + self._deserializer, + threading.RLock(), + max_file_size=100, + max_batch_size_bytes=obj_size * 3, + ) + for i in range(5): + obj = TestDiskQueue.Obj(i, str(i)) + queue.put(obj) + queue.flush() + + self.assertEqual( + queue.get_batch(5), + [self.get_queue_element(TestDiskQueue.Obj(i, str(i)), 
i + 1) for i in range(3)], + ) + self.assertEqual( + queue.get_batch(2), + [self.get_queue_element(TestDiskQueue.Obj(i, str(i)), i + 1) for i in range(3, 5)], + ) + + queue.close() + + def test_resuming_queue(self): + with TemporaryDirectory() as dirpath: + queue = DiskQueue[TestDiskQueue.Obj]( + Path(dirpath), + self._serializer, + self._deserializer, + threading.RLock(), + max_file_size=999, + ) + for i in range(1, 501): + obj = TestDiskQueue.Obj(i, str(i)) + queue.put(obj) + queue.flush() + version = queue.get_batch(random.randrange(300, 400))[-1].ver + version_to_ack = version - random.randrange(100, 200) + queue.ack(version_to_ack) + + self.assertTrue(queue._read_file_version > 100) + self.assertTrue(queue._write_file_version > 450) + data_files = glob(dirpath + "/data-*.log") + self.assertTrue(len(data_files) > 10) + data_files_versions = [int(file[len(dirpath + "/data-") : -len(".log")]) for file in data_files] + self.assertTrue(len([ver for ver in data_files_versions if ver <= version_to_ack]) == 1) + queue.close() + + queue = DiskQueue[TestDiskQueue.Obj]( + Path(dirpath), + self._serializer, + self._deserializer, + threading.RLock(), + max_file_size=200, + ) + for i in range(version_to_ack + 1, 501): + obj = TestDiskQueue.Obj(i, str(i)) + self.assertEqual(queue.get(), self.get_queue_element(obj, i)) + + queue.close() + + @staticmethod + def _serializer(obj: "TestDiskQueue.Obj") -> dict: + return obj.__dict__ + + @staticmethod + def _deserializer(obj: dict) -> "TestDiskQueue.Obj": + return TestDiskQueue.Obj(obj["num"], obj["txt"]) + + @staticmethod + def _version_getter(obj: "TestDiskQueue.Obj") -> int: + return obj.num diff --git a/tests/unit/neptune/new/internal/utils/test_json_file_splitter.py b/tests/unit/neptune/new/internal/utils/test_json_file_splitter.py new file mode 100644 index 000000000..3a4149ed3 --- /dev/null +++ b/tests/unit/neptune/new/internal/utils/test_json_file_splitter.py @@ -0,0 +1,177 @@ +# +# Copyright (c) 2020, Neptune Labs Sp. z o.o. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +import unittest + +from neptune.internal.utils.json_file_splitter import JsonFileSplitter +from tests.unit.neptune.new.utils.file_helpers import create_file + + +class TestJsonFileSplitter(unittest.TestCase): + def test_simple_file(self): + content = """ +{ + "a": 5, + "b": "text" +} +{ + "a": 13 +} +{} +""".lstrip() + + with create_file(content) as filename: + splitter = JsonFileSplitter(filename) + self.assertEqual(splitter.get(), {"a": 5, "b": "text"}) + self.assertEqual(splitter.get(), {"a": 13}) + self.assertEqual(splitter.get(), {}) + self.assertEqual(splitter.get(), None) + splitter.close() + + def test_append(self): + content1 = """ + { + "a": 5, + "b": "text" + } + { + "a": 13 + }""" + + content2 = """ + { + "q": 555, + "r": "something" + } + { + "a": { + "b": [1, 2, 3] + } + } + {}""" + + with create_file(content1) as filename, open(filename, "a") as fp: + splitter = JsonFileSplitter(filename) + self.assertEqual(splitter.get(), {"a": 5, "b": "text"}) + self.assertEqual(splitter.get(), {"a": 13}) + self.assertEqual(splitter.get(), None) + fp.write(content2) + fp.flush() + self.assertEqual(splitter.get(), {"q": 555, "r": "something"}) + self.assertEqual(splitter.get(), {"a": {"b": [1, 2, 3]}}) + self.assertEqual(splitter.get(), {}) + self.assertEqual(splitter.get(), None) + splitter.close() + + def test_append_cut_json(self): + content1 = """ + { + "a": 5, + "b": "text" + } + { + "a": 1""" + + content2 = """55, + "r": "something" + } + { + "a": { + "b": [1, 2, 3] + } + }""" + + with create_file(content1) as filename, open(filename, "a") as fp: + splitter = JsonFileSplitter(filename) + self.assertEqual(splitter.get(), {"a": 5, "b": "text"}) + self.assertEqual(splitter.get(), None) + fp.write(content2) + fp.flush() + self.assertEqual(splitter.get(), {"a": 155, "r": "something"}) + self.assertEqual(splitter.get(), {"a": {"b": [1, 2, 3]}}) + self.assertEqual(splitter.get(), None) + splitter.close() + + def test_big_json(self): + content = """ +{ + "a": 5, + "b": "text" +} +{ + "a": "%s", + "b": "%s" +} +{} +""".lstrip() % ( + "x" * JsonFileSplitter.BUFFER_SIZE * 2, + "y" * JsonFileSplitter.BUFFER_SIZE * 2, + ) + + with create_file(content) as filename: + splitter = JsonFileSplitter(filename) + self.assertEqual(splitter.get(), {"a": 5, "b": "text"}) + self.assertEqual( + splitter.get(), + { + "a": "x" * JsonFileSplitter.BUFFER_SIZE * 2, + "b": "y" * JsonFileSplitter.BUFFER_SIZE * 2, + }, + ) + self.assertEqual(splitter.get(), {}) + self.assertEqual(splitter.get(), None) + splitter.close() + + def test_data_size(self): + object1 = """{ + "a": 5, + "b": "text" + }""" + object2 = """{ + "a": 155, + "r": "something" + }""" + object3 = """{ + "a": { + "b": [1, 2, 3] + } + }""" + content1 = """ + { + "a": 5, + "b": "text" + } + { + "a": 1""" + + content2 = """55, + "r": "something" + } + { + "a": { + "b": [1, 2, 3] + } + }""" + + with create_file(content1) as filename, open(filename, "a") as fp: + splitter = JsonFileSplitter(filename) + self.assertEqual(splitter.get_with_size(), ({"a": 5, "b": "text"}, len(object1))) + self.assertIsNone(splitter.get_with_size()[0]) + fp.write(content2) + fp.flush() + self.assertEqual(splitter.get_with_size(), ({"a": 155, "r": "something"}, len(object2))) + self.assertEqual(splitter.get_with_size(), ({"a": {"b": [1, 2, 3]}}, len(object3))) + self.assertIsNone(splitter.get_with_size()[0]) + splitter.close() From b737e26cd21d1b81d2bfdcab9cce76478dbcc1de Mon Sep 17 00:00:00 2001 From: Patryk Gala Date: Tue, 10 Oct 2023 10:06:03 +0200 Subject: [PATCH 7/7] 
Revert "Add OperationsPreprocessor to queueing (#1485)" This reverts commit 3213b90280f79d483a1066b25b292c01f0e9ca20. --- .../backends/hosted_neptune_backend.py | 2 +- .../backends/operations_preprocessor.py | 75 ++++--------------- src/neptune/internal/operation.py | 13 +--- .../async_operation_processor.py | 54 +------------ .../backends/test_operations_preprocessor.py | 18 ++--- 5 files changed, 31 insertions(+), 131 deletions(-) diff --git a/src/neptune/internal/backends/hosted_neptune_backend.py b/src/neptune/internal/backends/hosted_neptune_backend.py index ac4f7a4a1..6a6cfd740 100644 --- a/src/neptune/internal/backends/hosted_neptune_backend.py +++ b/src/neptune/internal/backends/hosted_neptune_backend.py @@ -457,7 +457,7 @@ def execute_operations( dropped_count = operations_batch.dropped_operations_count operations_preprocessor = OperationsPreprocessor() - operations_preprocessor.process_batch(operations_batch.operations) + operations_preprocessor.process(operations_batch.operations) preprocessed_operations = operations_preprocessor.get_operations() errors.extend(preprocessed_operations.errors) diff --git a/src/neptune/internal/backends/operations_preprocessor.py b/src/neptune/internal/backends/operations_preprocessor.py index 7e6816a60..b71abc768 100644 --- a/src/neptune/internal/backends/operations_preprocessor.py +++ b/src/neptune/internal/backends/operations_preprocessor.py @@ -21,7 +21,6 @@ from typing import ( Callable, List, - Type, TypeVar, ) @@ -45,7 +44,6 @@ DeleteFiles, LogFloats, LogImages, - LogOperation, LogStrings, Operation, RemoveStrings, @@ -72,41 +70,24 @@ class AccumulatedOperations: errors: List[MetadataInconsistency] = dataclasses.field(default_factory=list) - def all_operations(self) -> List[Operation]: - return self.upload_operations + self.artifact_operations + self.other_operations - class OperationsPreprocessor: def __init__(self): self._accumulators: typing.Dict[str, "_OperationsAccumulator"] = dict() self.processed_ops_count = 0 - self.final_ops_count = 0 - self.final_append_count = 0 - - def process(self, operation: Operation) -> bool: - """Adds a single operation to its processed list. - Returns `False` iff the new operation can't be in queue until one of already enqueued operations gets - synchronized with server first. 
- """ - try: - self._process_op(operation) - self.processed_ops_count += 1 - return True - except RequiresPreviousCompleted: - return False - - def process_batch(self, operations: List[Operation]) -> None: + + def process(self, operations: List[Operation]): for op in operations: - if not self.process(op): + try: + self._process_op(op) + self.processed_ops_count += 1 + except RequiresPreviousCompleted: return def _process_op(self, op: Operation) -> "_OperationsAccumulator": path_str = path_to_str(op.path) target_acc = self._accumulators.setdefault(path_str, _OperationsAccumulator(op.path)) - old_ops_count, old_append_count = target_acc.get_op_count(), target_acc.get_append_count() target_acc.visit(op) - self.final_ops_count += target_acc.get_op_count() - old_ops_count - self.final_append_count += target_acc.get_append_count() - old_append_count return target_acc @staticmethod @@ -162,8 +143,6 @@ def __init__(self, path: List[str]): self._modify_ops = [] self._config_ops = [] self._errors = [] - self._ops_count = 0 - self._append_count = 0 def get_operations(self) -> List[Operation]: return self._delete_ops + self._modify_ops + self._config_ops @@ -171,12 +150,6 @@ def get_operations(self) -> List[Operation]: def get_errors(self) -> List[MetadataInconsistency]: return self._errors - def get_op_count(self) -> int: - return self._ops_count - - def get_append_count(self) -> int: - return self._append_count - def _check_prerequisites(self, op: Operation): if (OperationsPreprocessor.is_file_op(op) or OperationsPreprocessor.is_artifact_op(op)) and len( self._delete_ops @@ -206,9 +179,7 @@ def _process_modify_op( else: self._check_prerequisites(op) self._type = expected_type - old_op_count = len(self._modify_ops) self._modify_ops = modifier(self._modify_ops, op) - self._ops_count += len(self._modify_ops) - old_op_count def _process_config_op(self, expected_type: _DataType, op: Operation) -> None: @@ -228,9 +199,7 @@ def _process_config_op(self, expected_type: _DataType, op: Operation) -> None: else: self._check_prerequisites(op) self._type = expected_type - old_op_count = len(self._config_ops) self._config_ops = [op] - self._ops_count += len(self._config_ops) - old_op_count def visit_assign_float(self, op: AssignFloat) -> None: self._process_modify_op(_DataType.FLOAT, op, self._assign_modifier()) @@ -326,8 +295,6 @@ def visit_delete_attribute(self, op: DeleteAttribute) -> None: self._modify_ops = [] self._config_ops = [] self._type = None - self._ops_count = len(self._delete_ops) - self._append_count = 0 else: # This case is tricky. There was no delete operation, but some modifications was performed. # We do not know if this attribute exists on server side and we do not want a delete op to fail. @@ -336,8 +303,6 @@ def visit_delete_attribute(self, op: DeleteAttribute) -> None: self._modify_ops = [] self._config_ops = [] self._type = None - self._ops_count = len(self._delete_ops) - self._append_count = 0 else: if self._delete_ops: # Do nothing if there already is a delete operation @@ -347,7 +312,6 @@ def visit_delete_attribute(self, op: DeleteAttribute) -> None: # If value has not been set locally yet and no delete operation was performed, # simply perform single delete operation. 
self._delete_ops.append(op) - self._ops_count = len(self._delete_ops) @staticmethod def _artifact_log_modifier( @@ -376,30 +340,23 @@ def visit_copy_attribute(self, op: CopyAttribute) -> None: def _assign_modifier(): return lambda ops, new_op: [new_op] - def _clear_modifier(self): - def modifier(ops: List[Operation], new_op: Operation): - for op in ops: - if isinstance(op, LogOperation): - self._append_count -= op.value_count() - return [new_op] - - return modifier + @staticmethod + def _clear_modifier(): + return lambda ops, new_op: [new_op] - def _log_modifier(self, log_op_class: Type[LogOperation], clear_op_class: type, log_combine: Callable[[T, T], T]): - def modifier(ops: List[Operation], new_op: Operation): + @staticmethod + def _log_modifier(log_op_class: type, clear_op_class: type, log_combine: Callable[[T, T], T]): + def modifier(ops, new_op): if len(ops) == 0: - res = [new_op] + return [new_op] elif len(ops) == 1 and isinstance(ops[0], log_op_class): - res = [log_combine(ops[0], new_op)] + return [log_combine(ops[0], new_op)] elif len(ops) == 1 and isinstance(ops[0], clear_op_class): - res = [ops[0], new_op] + return [ops[0], new_op] elif len(ops) == 2: - res = [ops[0], log_combine(ops[1], new_op)] + return [ops[0], log_combine(ops[1], new_op)] else: raise InternalClientError("Preprocessing operations failed: len(ops) == {}".format(len(ops))) - if isinstance(new_op, log_op_class): # Check just so that static typing doesn't complain - self._append_count += new_op.value_count() - return res return modifier diff --git a/src/neptune/internal/operation.py b/src/neptune/internal/operation.py index 02124c73a..b0080617f 100644 --- a/src/neptune/internal/operation.py +++ b/src/neptune/internal/operation.py @@ -292,9 +292,7 @@ def from_dict(data: dict) -> "UploadFileSet": class LogOperation(Operation, abc.ABC): - @abc.abstractmethod - def value_count(self) -> int: - pass + pass @dataclass @@ -334,9 +332,6 @@ def from_dict(data: dict) -> "LogFloats": [LogFloats.ValueType.from_dict(value) for value in data["values"]], ) - def value_count(self) -> int: - return len(self.values) - @dataclass class LogStrings(LogOperation): @@ -360,9 +355,6 @@ def from_dict(data: dict) -> "LogStrings": [LogStrings.ValueType.from_dict(value) for value in data["values"]], ) - def value_count(self) -> int: - return len(self.values) - @dataclass class ImageValue: @@ -408,9 +400,6 @@ def from_dict(data: dict) -> "LogImages": [LogImages.ValueType.from_dict(value, ImageValue.deserializer) for value in data["values"]], ) - def value_count(self) -> int: - return len(self.values) - @dataclass class ClearFloatLog(Operation): diff --git a/src/neptune/internal/operation_processors/async_operation_processor.py b/src/neptune/internal/operation_processors/async_operation_processor.py index 3b6f9a9da..865102e76 100644 --- a/src/neptune/internal/operation_processors/async_operation_processor.py +++ b/src/neptune/internal/operation_processors/async_operation_processor.py @@ -26,32 +26,23 @@ ) from typing import ( Callable, - ClassVar, List, Optional, - Tuple, ) from neptune.constants import ASYNC_DIRECTORY from neptune.envs import NEPTUNE_SYNC_AFTER_STOP_TIMEOUT from neptune.exceptions import NeptuneSynchronizationAlreadyStoppedException from neptune.internal.backends.neptune_backend import NeptuneBackend -from neptune.internal.backends.operations_preprocessor import OperationsPreprocessor from neptune.internal.container_type import ContainerType -from neptune.internal.disk_queue import ( - DiskQueue, - QueueElement, -) +from 
neptune.internal.disk_queue import DiskQueue from neptune.internal.id_formats import UniqueId from neptune.internal.init.parameters import ( ASYNC_LAG_THRESHOLD, ASYNC_NO_PROGRESS_THRESHOLD, DEFAULT_STOP_TIMEOUT, ) -from neptune.internal.operation import ( - CopyAttribute, - Operation, -) +from neptune.internal.operation import Operation from neptune.internal.operation_processors.operation_processor import OperationProcessor from neptune.internal.operation_processors.operation_storage import ( OperationStorage, @@ -264,10 +255,6 @@ def close(self): self._queue.close() class ConsumerThread(Daemon): - MAX_OPERATIONS_IN_BATCH: ClassVar[int] = 1000 - MAX_APPENDS_IN_BATCH: ClassVar[int] = 100000 - MAX_BATCH_SIZE_BYTES: ClassVar[int] = 100 * 1024 * 1024 - def __init__( self, processor: "AsyncOperationProcessor", @@ -279,7 +266,6 @@ def __init__( self._batch_size = batch_size self._last_flush = 0 self._no_progress_exceeded = False - self._last_disk_record: Optional[QueueElement[Operation]] = None def run(self): try: @@ -296,42 +282,10 @@ def work(self) -> None: self._processor._queue.flush() while True: - batch = self.collect_batch() + batch = self._processor._queue.get_batch(self._batch_size) if not batch: return - operations, version = batch - self.process_batch(operations, version) - - def collect_batch(self) -> Optional[Tuple[List[Operation], int]]: - preprocessor = OperationsPreprocessor() - version: Optional[int] = None - total_bytes = 0 - copy_ops: List[CopyAttribute] = [] - while ( - preprocessor.final_ops_count < self.MAX_OPERATIONS_IN_BATCH - and preprocessor.final_append_count < self.MAX_APPENDS_IN_BATCH - and total_bytes < self.MAX_BATCH_SIZE_BYTES - ): - record: Optional[QueueElement[Operation]] = self._last_disk_record or self._processor._queue.get() - self._last_disk_record = None - if not record: - break - if isinstance(record.obj, CopyAttribute): - # CopyAttribute can be only at the start of a batch. 
- if copy_ops or preprocessor.final_ops_count: - self._last_disk_record = record - break - else: - version = record.ver - copy_ops.append(record.obj) - total_bytes += record.size - elif preprocessor.process(record.obj): - version = record.ver - total_bytes += record.size - else: - self._last_disk_record = record - break - return (copy_ops + preprocessor.get_operations().all_operations(), version) if version is not None else None + self.process_batch([element.obj for element in batch], batch[-1].ver) def _check_no_progress(self): if not self._no_progress_exceeded: diff --git a/tests/unit/neptune/new/internal/backends/test_operations_preprocessor.py b/tests/unit/neptune/new/internal/backends/test_operations_preprocessor.py index 4d0ab59be..042da27ae 100644 --- a/tests/unit/neptune/new/internal/backends/test_operations_preprocessor.py +++ b/tests/unit/neptune/new/internal/backends/test_operations_preprocessor.py @@ -61,7 +61,7 @@ def test_delete_attribute(self): ] # when - processor.process_batch(operations) + processor.process(operations) # then result = processor.get_operations() @@ -101,7 +101,7 @@ def test_assign(self): ] # when - processor.process_batch(operations) + processor.process(operations) # then result = processor.get_operations() @@ -160,7 +160,7 @@ def test_series(self): ] # when - processor.process_batch(operations) + processor.process(operations) # then result = processor.get_operations() @@ -231,7 +231,7 @@ def test_sets(self): ] # when - processor.process_batch(operations) + processor.process(operations) # then result = processor.get_operations() @@ -283,7 +283,7 @@ def test_file_set(self): ] # when - processor.process_batch(operations) + processor.process(operations) # then result = processor.get_operations() @@ -328,25 +328,25 @@ def test_file_ops_delete(self): ] # when - processor.process_batch(operations) + processor.process(operations) # then: there's a cutoff after DeleteAttribute(["a"]) result = processor.get_operations() self.assertEqual( + result.upload_operations, [ UploadFileSet(["b"], ["abc", "defgh"], reset=True), UploadFileSet(["c"], ["abc", "defgh"], reset=True), UploadFileSet(["c"], ["qqq"], reset=False), UploadFileSet(["d"], ["hhh", "gij"], reset=False), ], - result.upload_operations, ) self.assertEqual(result.artifact_operations, []) self.assertEqual( + result.other_operations, [ DeleteAttribute(["a"]), ], - result.other_operations, ) self.assertEqual(result.errors, []) self.assertEqual(processor.processed_ops_count, 6) @@ -375,7 +375,7 @@ def test_artifacts(self): ] # when - processor.process_batch(operations) + processor.process(operations) # then: there's a cutoff before second TrackFilesToArtifact(["a"]) due to DeleteAttribute(["a"]) result = processor.get_operations()
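
Editor's note (not part of the original patch): patch 7/7 above reverts the consumer-side batch collection that grouped queued operations under count, append, and byte budgets before sending them. As a minimal sketch of that budgeted-batching idea only (expressed over a plain list rather than neptune's DiskQueue; the constants mirror the removed class attributes and all names here are illustrative, not neptune-client API):

from typing import List, Tuple

MAX_OPERATIONS_IN_BATCH = 1000        # cap on records per batch
MAX_BATCH_SIZE_BYTES = 100 * 1024**2  # cap on serialized bytes per batch


def collect_batch(pending: List[Tuple[bytes, int]]) -> List[Tuple[bytes, int]]:
    """Take (payload, version) records until either budget would be exceeded."""
    batch: List[Tuple[bytes, int]] = []
    total_bytes = 0
    for payload, version in pending:
        if len(batch) >= MAX_OPERATIONS_IN_BATCH or total_bytes + len(payload) > MAX_BATCH_SIZE_BYTES:
            break
        batch.append((payload, version))
        total_bytes += len(payload)
    return batch


if __name__ == "__main__":
    records = [(b"x" * 10, version) for version in range(5)]
    print(collect_batch(records))

After the revert, the consumer thread simply takes a fixed-size batch from the disk queue (get_batch) instead of accumulating a preprocessed, budget-limited batch.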