From 610f571e6fba645cddbf9449d5aa02d8eaa83e8c Mon Sep 17 00:00:00 2001 From: Jan Range Date: Tue, 16 Apr 2024 20:17:12 +0200 Subject: [PATCH 01/15] generalise to other `io` types --- dvuploader/checksum.py | 37 +++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/dvuploader/checksum.py b/dvuploader/checksum.py index f5f5d50..83f3e67 100644 --- a/dvuploader/checksum.py +++ b/dvuploader/checksum.py @@ -1,7 +1,7 @@ import hashlib from enum import Enum import os -from typing import Callable +from typing import IO, Callable from pydantic import BaseModel, ConfigDict, Field @@ -34,7 +34,10 @@ class Checksum(BaseModel): value (str): The value of the checksum. """ - model_config = ConfigDict(populate_by_name=True) + model_config = ConfigDict( + arbitrary_types_allowed=True, + populate_by_name=True, + ) type: str = Field(..., alias="@type") value: str = Field(..., alias="@value") @@ -42,14 +45,14 @@ class Checksum(BaseModel): @classmethod def from_file( cls, - fpath: str, + handler: IO, hash_fun: Callable, hash_algo: str, ) -> "Checksum": """Takes a file path and returns a checksum object. Args: - fpath (str): The file path to generate the checksum for. + handler (IO): The file handler to generate the checksum for. hash_fun (Callable): The hash function to use for generating the checksum. hash_algo (str): The hash algorithm to use for generating the checksum. @@ -57,11 +60,15 @@ def from_file( Checksum: A Checksum object with type and value fields. """ - value = cls._chunk_checksum(fpath=fpath, hash_fun=hash_fun) + value = cls._chunk_checksum(handler=handler, hash_fun=hash_fun) return cls(type=hash_algo, value=value) # type: ignore @staticmethod - def _chunk_checksum(fpath: str, hash_fun: Callable, blocksize=2**20) -> str: + def _chunk_checksum( + handler: IO, + hash_fun: Callable, + blocksize=2**20 + ) -> str: """Chunks a file and returns a checksum. Args: @@ -73,10 +80,16 @@ def _chunk_checksum(fpath: str, hash_fun: Callable, blocksize=2**20) -> str: str: A string representing the checksum of the file. 
""" m = hash_fun() - with open(fpath, "rb") as f: - while True: - buf = f.read(blocksize) - if not buf: - break - m.update(buf) + while True: + buf = handler.read(blocksize) + + if not isinstance(buf, bytes): + buf = buf.encode() + + if not buf: + break + m.update(buf) + + handler.seek(0) + return m.hexdigest() From d32a5e13f742862fe543fb4e04d735ac6f593a3d Mon Sep 17 00:00:00 2001 From: Jan Range Date: Tue, 16 Apr 2024 20:17:42 +0200 Subject: [PATCH 02/15] adapt direct upload to work with handlers --- dvuploader/directupload.py | 58 ++++++++++++++++++++++---------------- 1 file changed, 34 insertions(+), 24 deletions(-) diff --git a/dvuploader/directupload.py b/dvuploader/directupload.py index 83a6d8f..07d168a 100644 --- a/dvuploader/directupload.py +++ b/dvuploader/directupload.py @@ -75,7 +75,7 @@ async def direct_upload( if status is True: continue - print(f"❌ Failed to upload file '{file.fileName}' to the S3 storage") + print(f"❌ Failed to upload file '{file.file_name}' to the S3 storage") headers = { "X-Dataverse-key": api_token, @@ -129,15 +129,11 @@ async def _upload_to_store( await asyncio.sleep(delay) - assert file.fileName is not None, "File name is None" - assert os.path.exists(file.filepath), f"File {file.filepath} does not exist" - - file_size = os.path.getsize(file.filepath) ticket = await _request_ticket( session=session, dataverse_url=dataverse_url, api_token=api_token, - file_size=file_size, + file_size=file._size, persistent_id=persistent_id, ) @@ -145,7 +141,7 @@ async def _upload_to_store( status, storage_identifier = await _upload_singlepart( session=session, ticket=ticket, - filepath=file.filepath, + file=file, pbar=pbar, progress=progress, api_token=api_token, @@ -156,7 +152,7 @@ async def _upload_to_store( status, storage_identifier = await _upload_multipart( session=session, response=ticket, - filepath=file.filepath, + file=file, dataverse_url=dataverse_url, pbar=pbar, progress=progress, @@ -207,7 +203,7 @@ async def _request_ticket( async def _upload_singlepart( session: aiohttp.ClientSession, ticket: Dict, - filepath: str, + file: File, pbar, progress, api_token: str, @@ -241,7 +237,7 @@ async def _upload_singlepart( params = { "headers": headers, "url": ticket["url"], - "data": open(filepath, "rb"), + "data": file.handler, } async with session.put(**params) as response: @@ -249,13 +245,8 @@ async def _upload_singlepart( response.raise_for_status() if status: - progress.update( - pbar, - advance=os.path.getsize(filepath), - ) - + progress.update(pbar, advance=file._size) await asyncio.sleep(0.1) - progress.update( pbar, visible=leave_bar, @@ -267,7 +258,7 @@ async def _upload_singlepart( async def _upload_multipart( session: aiohttp.ClientSession, response: Dict, - filepath: str, + file: File, dataverse_url: str, pbar, progress, @@ -279,7 +270,7 @@ async def _upload_multipart( Args: session (aiohttp.ClientSession): The aiohttp client session. response (Dict): The response from the Dataverse API containing the upload ticket information. - filepath (str): The path to the file to be uploaded. + file (File): The file object to be uploaded. dataverse_url (str): The URL of the Dataverse instance. pbar (tqdm): A progress bar to track the upload progress. progress: The progress callback function. 
@@ -301,7 +292,7 @@ async def _upload_multipart(

    try:
        e_tags = await _chunked_upload(
-            filepath=filepath,
+            file=file,
            session=session,
            urls=urls,
            chunk_size=chunk_size,
@@ -309,11 +300,12 @@ async def _upload_multipart(
            progress=progress,
        )
    except Exception as e:
-        print(f"❌ Failed to upload file '{filepath}' to the S3 storage")
+        print(f"❌ Failed to upload file '{file.file_name}' to the S3 storage")
        await _abort_upload(
            session=session,
            url=abort,
            dataverse_url=dataverse_url,
+            api_token=api_token,
        )
        raise e

@@ -329,7 +321,7 @@ async def _upload_multipart(


async def _chunked_upload(
-    filepath: str,
+    file: File,
    session: aiohttp.ClientSession,
    urls,
    chunk_size: int,
@@ -340,7 +332,7 @@ async def _chunked_upload(
    Uploads a file in chunks to multiple URLs using the provided session.

    Args:
-        filepath (str): The path of the file to upload.
+        file (File): The file object to upload.
        session (aiohttp.ClientSession): The aiohttp client session to use for the upload.
        urls: An iterable of URLs to upload the file chunks to.
        chunk_size (int): The size of each chunk in bytes.
@@ -351,7 +343,17 @@ async def _chunked_upload(
        List[str]: A list of ETags returned by the server for each uploaded chunk.
    """
    e_tags = []
-    async with aiofiles.open(filepath, "rb") as f:
+
+    if not os.path.exists(file.filepath):
+        raise NotImplementedError(
+            """
+
+            Multipart chunked upload is currently only supported for local files, not for in-memory objects.
+            Please save the handler's content to a local file and try again.
+            """
+        )
+
+    async with aiofiles.open(file.filepath, "rb") as f:
        chunk = await f.read(chunk_size)
        e_tags.append(
            await _upload_chunk(
@@ -459,6 +461,7 @@ async def _abort_upload(
    session: aiohttp.ClientSession,
    url: str,
    dataverse_url: str,
+    api_token: str,
):
    """
    Aborts an ongoing upload by sending a DELETE request to the specified URL.
@@ -467,11 +470,17 @@ async def _abort_upload(
        session (aiohttp.ClientSession): The aiohttp client session.
        url (str): The URL to send the DELETE request to.
        dataverse_url (str): The base URL of the Dataverse instance.
+        api_token (str): The API token to use for the request.

    Raises:
        aiohttp.ClientResponseError: If the DELETE request fails.
    """
-    async with session.delete(urljoin(dataverse_url, url)) as response:
+
+    headers = {
+        "X-Dataverse-key": api_token,
+    }
+
+    async with session.delete(urljoin(dataverse_url, url), headers=headers) as response:
        response.raise_for_status()

@@ -563,6 +572,7 @@ async def _multipart_json_data_request(
    Returns:
        None
    """
+
    with aiohttp.MultipartWriter("form-data") as writer:
        json_part = writer.append(json_data)
        json_part.set_content_disposition("form-data", name="jsonData")

From 347f1afb4fa22a39ef702261ed32d003352a4e5f Mon Sep 17 00:00:00 2001
From: Jan Range
Date: Tue, 16 Apr 2024 20:18:01 +0200
Subject: [PATCH 03/15] allow setting level of verbosity

---
 dvuploader/dvuploader.py | 67 +++++++++++++++++++++++++++-------------
 1 file changed, 45 insertions(+), 22 deletions(-)

diff --git a/dvuploader/dvuploader.py b/dvuploader/dvuploader.py
index b032a9d..ffd395c 100644
--- a/dvuploader/dvuploader.py
+++ b/dvuploader/dvuploader.py
@@ -19,7 +19,6 @@
from dvuploader.nativeupload import native_upload
from dvuploader.utils import build_url, retrieve_dataset_files, setup_pbar

-
class DVUploader(BaseModel):
    """
    A class for uploading files to a Dataverse repository.
@@ -34,6 +33,7 @@ class DVUploader(BaseModel): """ files: List[File] + verbose: bool = True def upload( self, @@ -56,7 +56,9 @@ def upload( None """ - print("\n") + if self.verbose: + print("\n") + info = "\n".join( [ f"Server: [bold]{dataverse_url}[/bold]", # type: ignore @@ -71,9 +73,10 @@ def upload( expand=False, ) - rich.print(panel) + if self.verbose: + rich.print(panel) - asyncio.run(self._validate_and_hash_files()) + asyncio.run(self._validate_and_hash_files(verbose=self.verbose)) # Check for duplicates self._check_duplicates( @@ -85,7 +88,7 @@ def upload( # Sort files by size files = sorted( self.files, - key=lambda x: os.path.getsize(x.filepath), + key=lambda x: x._size, reverse=False, ) @@ -100,7 +103,7 @@ def upload( persistent_id=persistent_id, ) - if not has_direct_upload and not force_native: + if not has_direct_upload and not force_native and self.verbose: rich.print( "\n[bold italic white]⚠️ Direct upload not supported. Falling back to Native API." ) @@ -136,9 +139,10 @@ def upload( ) ) - rich.print("\n[bold italic white]✅ Upload complete\n") + if self.verbose: + rich.print("\n[bold italic white]✅ Upload complete\n") - async def _validate_and_hash_files(self): + async def _validate_and_hash_files(self, verbose: bool): """ Validates and hashes the files to be uploaded. @@ -146,19 +150,34 @@ async def _validate_and_hash_files(self): None """ + if not verbose: + tasks = [ + self._validate_and_hash_file( + file=file, + verbose=self.verbose + ) + for file in self.files + ] + + await asyncio.gather(*tasks) + return + print("\n") progress = Progress() task = progress.add_task( - "[bold italic white]📦 Preparing upload[/bold italic white]", + "[bold italic white]\n📦 Preparing upload[/bold italic white]", total=len(self.files), ) + with progress: + tasks = [ self._validate_and_hash_file( file=file, progress=progress, task_id=task, + verbose=self.verbose ) for file in self.files ] @@ -170,11 +189,14 @@ async def _validate_and_hash_files(self): @staticmethod async def _validate_and_hash_file( file: File, - progress: Progress, - task_id: TaskID, + verbose: bool, + progress: Optional[Progress] = None, + task_id: Optional[TaskID] = None, ): - file.extract_filename_hash_file() - progress.update(task_id, advance=1) + file.extract_file_name_hash_file() + + if verbose: + progress.update(task_id, advance=1) # type: ignore def _check_duplicates( self, @@ -220,13 +242,13 @@ def _check_duplicates( if has_same_hash and file.checksum: n_skip_files += 1 table.add_row( - file.fileName, "[bright_black]Same hash", "[bright_black]Skip" + file.file_name, "[bright_black]Same hash", "[bright_black]Skip" ) to_remove.append(file) else: n_new_files += 1 table.add_row( - file.fileName, "[spring_green3]New", "[spring_green3]Upload" + file.file_name, "[spring_green3]New", "[spring_green3]Upload" ) # If present in dataset, replace file @@ -245,7 +267,8 @@ def _check_duplicates( table.add_column("Skipped", style="bright_black", no_wrap=True) table.add_row(str(n_new_files), str(n_skip_files)) - console.print(table) + if self.verbose: + console.print(table) @staticmethod def _get_file_id( @@ -267,10 +290,10 @@ def _get_file_id( ValueError: If the file cannot be found in the dataset. 
""" - # Find the file that matches label and directoryLabel + # Find the file that matches label and directory_label for ds_file in ds_files: - dspath = os.path.join(ds_file.get("directoryLabel", ""), ds_file["label"]) - fpath = os.path.join(file.directoryLabel, file.fileName) # type: ignore + dspath = os.path.join(ds_file.get("directory_label", ""), ds_file["label"]) + fpath = os.path.join(file.directory_label, file.file_name) # type: ignore if dspath == fpath: return ds_file["dataFile"]["id"] @@ -296,8 +319,8 @@ def _check_hashes(file: File, dsFile: Dict): return ( file.checksum.value == hash_value and file.checksum.type == hash_algo - and file.fileName == dsFile["label"] - and file.directoryLabel == dsFile.get("directoryLabel", "") + and file.file_name == dsFile["label"] + and file.directory_label == dsFile.get("directory_label", "") ) @staticmethod @@ -334,7 +357,7 @@ def setup_progress_bars(self, files: List[File]): progress = Progress() tasks = [ setup_pbar( - fpath=file.filepath, + file=file, progress=progress, ) for file in files From 541924ebbd9b61cb325c8a4add80de3f34b0188a Mon Sep 17 00:00:00 2001 From: Jan Range Date: Tue, 16 Apr 2024 20:18:33 +0200 Subject: [PATCH 04/15] distinguish local from handler instance --- dvuploader/file.py | 40 ++++++++++++++++++++++++++++------------ 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/dvuploader/file.py b/dvuploader/file.py index 27f9ac2..ffd768b 100644 --- a/dvuploader/file.py +++ b/dvuploader/file.py @@ -1,7 +1,9 @@ +from io import BytesIO, StringIO import os -from typing import List, Optional, Union +from typing import List, Optional, Union, IO from pydantic import BaseModel, ConfigDict, Field, model_validator +from pydantic.fields import PrivateAttr import rich from dvuploader.checksum import Checksum, ChecksumTypes @@ -14,52 +16,66 @@ class File(BaseModel): Attributes: filepath (str): The path to the file. description (str): The description of the file. - directoryLabel (str): The label of the directory where the file is stored. + directory_label (str): The label of the directory where the file is stored. mimeType (str): The MIME type of the file. categories (List[str]): The categories associated with the file. restrict (bool): Indicates if the file is restricted. checksum_type (ChecksumTypes): The type of checksum used for the file. storageIdentifier (Optional[str]): The identifier of the storage where the file is stored. - fileName (Optional[str]): The name of the file. + file_name (Optional[str]): The name of the file. checksum (Optional[Checksum]): The checksum of the file. to_replace (bool): Indicates if the file should be replaced. file_id (Optional[str]): The ID of the file. Methods: _validate_filepath(path): Validates if the file path exists and is a file. - _extract_filename_hash_file(): Extracts the filename from the filepath and calculates the file's checksum. + _extract_file_name_hash_file(): Extracts the file_name from the filepath and calculates the file's checksum. 
""" - model_config: ConfigDict = ConfigDict(populate_by_alias=True) + model_config = ConfigDict( + populate_by_name=True, + arbitrary_types_allowed=True, + ) filepath: str = Field(..., exclude=True) + handler: Union[BytesIO, StringIO, IO, None] = Field(default=None, exclude=True) description: str = "" - directoryLabel: str = "" + directory_label: str = Field(default="", alias="directoryLabel") mimeType: str = "text/plain" categories: List[str] = ["DATA"] restrict: bool = False checksum_type: ChecksumTypes = Field(default=ChecksumTypes.MD5, exclude=True) storageIdentifier: Optional[str] = None - fileName: Optional[str] = None + file_name: Optional[str] = Field(default=None, alias="fileName") checksum: Optional[Checksum] = None to_replace: bool = False file_id: Optional[Union[str, int]] = Field(default=None, alias="fileToReplaceId") - def extract_filename_hash_file(self): + _size: int = PrivateAttr(default=0) + + def extract_file_name_hash_file(self): """ - Extracts the filename and calculates the hash of the file. + Extracts the file_name and calculates the hash of the file. Returns: self: The current instance of the class. """ - self._validate_filepath(self.filepath) - self.fileName = os.path.basename(self.filepath) # Hash file hash_algo, hash_fun = self.checksum_type.value + + if self.handler is None: + self._validate_filepath(self.filepath) + self.handler = open(self.filepath, "rb") + self._size = os.path.getsize(self.filepath) + else: + self._size = len(self.handler.read()) + self.handler.seek(0) + + self.file_name = os.path.basename(self.filepath) self.checksum = Checksum.from_file( - fpath=self.filepath, + handler=self.handler, hash_fun=hash_fun, hash_algo=hash_algo, ) From 94298b07ffd1e3b98baf3d43d60075af2808b909 Mon Sep 17 00:00:00 2001 From: Jan Range Date: Tue, 16 Apr 2024 20:18:48 +0200 Subject: [PATCH 05/15] adapt to handle other `io` types --- dvuploader/nativeupload.py | 65 ++++++++++++++++---------------------- 1 file changed, 27 insertions(+), 38 deletions(-) diff --git a/dvuploader/nativeupload.py b/dvuploader/nativeupload.py index c24187f..aa13bb5 100644 --- a/dvuploader/nativeupload.py +++ b/dvuploader/nativeupload.py @@ -79,7 +79,7 @@ async def native_upload( if status == 200: continue - print(f"❌ Failed to upload file '{file.fileName}': {response['message']}") + print(f"❌ Failed to upload file '{file.file_name}': {response['message']}") def _zip_packages( @@ -112,12 +112,12 @@ def _zip_packages( ), ) - file.extract_filename_hash_file() + file.extract_file_name_hash_file() file.mimeType = "application/zip" pbar = progress.add_task( - file.fileName, # type: ignore - total=os.path.getsize(file.filepath), + file.file_name, # type: ignore + total=file._size, ) files.append((pbar, file)) @@ -177,42 +177,28 @@ async def _single_native_upload( json_data = { "description": file.description, "forceReplace": True, - "directoryLabel": file.directoryLabel, + "directory_label": file.directory_label, "categories": file.categories, "restrict": file.restrict, "forceReplace": True, } for _ in range(MAX_RETRIES): - with aiohttp.MultipartWriter("form-data") as writer: - json_part = writer.append(json.dumps(json_data)) - json_part.set_content_disposition("form-data", name="jsonData") - file_part = writer.append( - file_sender( - file_name=file.filepath, - progress=progress, - pbar=pbar, - ) - ) - file_part.set_content_disposition( - "form-data", - name="file", - filename=file.fileName, - ) - async with session.post(endpoint, data=writer) as response: - status = response.status + formdata = 
aiohttp.FormData()
+        formdata.add_field("jsonData", json.dumps(json_data), content_type="application/json")
+        formdata.add_field("file", file.handler, filename=file.file_name)
+
+        async with session.post(endpoint, data=formdata) as response:
+            status = response.status

-            if status == 200:
-                progress.update(
-                    pbar,
-                    advance=os.path.getsize(file.filepath),
-                )
+            if status == 200:
+                progress.update(pbar, advance=file._size, completed=file._size)

-                # Wait to avoid rate limiting
-                await asyncio.sleep(0.7)
+                # Wait to avoid rate limiting
+                await asyncio.sleep(0.7)

-                return status, await response.json()
+            return status, await response.json()

    # Wait to avoid rate limiting
    await asyncio.sleep(1.0)

@@ -220,8 +206,8 @@ async def _single_native_upload(
    return False, {"message": "Failed to upload file"}


-async def file_sender(
-    file_name: str,
+def file_sender(
+    file: File,
    progress: Progress,
    pbar: TaskID,
):
@@ -237,11 +223,14 @@ async def file_sender(
    Yields:
        bytes: The chunks of the file.
    """
+
+    assert file.handler is not None, "File handler is not set."
+
    chunk_size = 64 * 1024  # 10 MB
-    async with aiofiles.open(file_name, "rb") as f:
-        chunk = await f.read(chunk_size)
+    chunk = file.handler.read(chunk_size)
+    progress.advance(pbar, advance=chunk_size)
+
+    while chunk:
+        yield chunk
+        chunk = file.handler.read(chunk_size)
        progress.advance(pbar, advance=chunk_size)
-    while chunk:
-        yield chunk
-        chunk = await f.read(chunk_size)
-        progress.advance(pbar, advance=chunk_size)

From a3ba39f11035192e6f4d23998ae9c149d85f4950 Mon Sep 17 00:00:00 2001
From: Jan Range
Date: Tue, 16 Apr 2024 20:19:05 +0200
Subject: [PATCH 06/15] use handler and `_size` from file object

---
 dvuploader/packaging.py | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/dvuploader/packaging.py b/dvuploader/packaging.py
index 2a45f6f..6acc676 100644
--- a/dvuploader/packaging.py
+++ b/dvuploader/packaging.py
@@ -27,23 +27,22 @@ def distribute_files(dv_files: List["File"]):  # type: ignore
    package_index = 0
    current_size = 0

    for file in dv_files:
-        file_size = os.path.getsize(file.filepath)
-        if file_size > MAXIMUM_PACKAGE_SIZE:
+        if file._size > MAXIMUM_PACKAGE_SIZE:
            current_package, current_size, package_index = _append_and_reset(
                (package_index, [file]),
                packages,
            )
            continue

-        if current_size + file_size > MAXIMUM_PACKAGE_SIZE:
+        if current_size + file._size > MAXIMUM_PACKAGE_SIZE:
            current_package, current_size, package_index = _append_and_reset(
                (package_index, current_package),
                packages,
            )

        current_package.append(file)
-        current_size += os.path.getsize(file.filepath)
+        current_size += file._size
    else:
        if current_package:
            _append_and_reset(
@@ -91,9 +90,9 @@ def zip_files(
    with zipfile.ZipFile(path, "w") as zip_file:
        for file in files:
-            zip_file.write(
-                file.filepath,
-                arcname=_create_arcname(file),
+            zip_file.writestr(
+                data=file.handler.read(),
+                zinfo_or_arcname=_create_arcname(file),
            )

    return path
@@ -109,7 +108,7 @@ def _create_arcname(file: "File"):  # type: ignore

    Returns:
        str: The arcname for the given file.
""" - if file.directoryLabel is not None: - return os.path.join(file.directoryLabel, file.fileName) # type: ignore + if file.directory_label is not None: + return os.path.join(file.directory_label, file.file_name) # type: ignore else: - return file.fileName + return file.file_name From 0e9827b67f08aeb28e26bdfa3813d8424da38238 Mon Sep 17 00:00:00 2001 From: Jan Range Date: Tue, 16 Apr 2024 20:19:22 +0200 Subject: [PATCH 07/15] use `File` object to init pbar --- dvuploader/utils.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/dvuploader/utils.py b/dvuploader/utils.py index 0e07335..3a6fbfb 100644 --- a/dvuploader/utils.py +++ b/dvuploader/utils.py @@ -72,7 +72,7 @@ def retrieve_dataset_files( def add_directory( directory: str, ignore: List[str] = [r"^\."], - directory_label: str = "", + rootDirectoryLabel: str = "", ): """ Recursively adds all files in the specified directory to a list of File objects. @@ -80,7 +80,7 @@ def add_directory( Args: directory (str): The directory path. ignore (List[str], optional): A list of regular expressions to ignore certain files or directories. Defaults to [r"^\."]. - directory_label (str, optional): The label to be added to the directory path of each file. Defaults to "". + rootDirectoryLabel (str, optional): The label to be added to the directory path of each file. Defaults to "". Returns: List[File]: A list of File objects representing the files in the directory. @@ -96,7 +96,7 @@ def add_directory( if any(part_is_ignored(part, ignore) for part in list(file.parts)): continue - directoryLabel = _truncate_path( + directory_label = _truncate_path( file.parent, pathlib.Path(directory), ) @@ -105,8 +105,8 @@ def add_directory( File( filepath=str(file), directoryLabel=os.path.join( + rootDirectoryLabel, directory_label, - directoryLabel, ), ) ) @@ -152,7 +152,7 @@ def part_is_ignored(part, ignore): def setup_pbar( - fpath: str, + file: File, progress: Progress, ) -> int: """ @@ -166,10 +166,11 @@ def setup_pbar( int: The task ID of the progress bar. """ - file_size = os.path.getsize(fpath) - fname = os.path.basename(fpath) + file_size = file._size + fname = file.file_name return progress.add_task( f"[pink]├── {fname}", + start=True, total=file_size, ) From 3150669bbb602b9136f350b3bb7cc370282bf219 Mon Sep 17 00:00:00 2001 From: Jan Range Date: Tue, 16 Apr 2024 20:20:18 +0200 Subject: [PATCH 08/15] update docs to changed `File` syntax --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 313d004..f099f44 100644 --- a/README.md +++ b/README.md @@ -48,8 +48,8 @@ import dvuploader as dv # Add file individually files = [ dv.File(filepath="./small.txt"), - dv.File(directoryLabel="some/dir", filepath="./medium.txt"), - dv.File(directoryLabel="some/dir", filepath="./big.txt"), + dv.File(directory_label="some/dir", filepath="./medium.txt"), + dv.File(directory_label="some/dir", filepath="./big.txt"), *dv.add_directory("./data"), # Add an entire directory ] @@ -88,7 +88,7 @@ Alternatively, you can also supply a `config` file that contains all necessary i * `api_token`: API token of the Dataverse instance. * `files`: List of files to upload. Each file is a dictionary with the following keys: * `filepath`: Path to the file to upload. - * `directoryLabel`: Optional directory label to upload the file to. + * `directory_label`: Optional directory label to upload the file to. * `description`: Optional description of the file. * `mimetype`: Mimetype of the file. 
* `categories`: Optional list of categories to assign to the file. @@ -104,9 +104,9 @@ api_token: XXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX files: - filepath: ./small.txt - filepath: ./medium.txt - directoryLabel: some/dir + directory_label: some/dir - filepath: ./big.txt - directoryLabel: some/dir + directory_label: some/dir ``` The `config` file can then be used as follows: From 992f1c66abc71d0611dc6d3dbd0f33ca8851089c Mon Sep 17 00:00:00 2001 From: Jan Range Date: Tue, 16 Apr 2024 20:20:54 +0200 Subject: [PATCH 09/15] update to `File` args --- tests/fixtures/cli_input.yaml | 6 ++--- tests/unit/test_cli.py | 6 ++--- tests/unit/test_file.py | 14 +++++----- tests/unit/test_utils.py | 51 ++++++++++++++++++++--------------- 4 files changed, 42 insertions(+), 35 deletions(-) diff --git a/tests/fixtures/cli_input.yaml b/tests/fixtures/cli_input.yaml index 7d1826a..b3ec508 100644 --- a/tests/fixtures/cli_input.yaml +++ b/tests/fixtures/cli_input.yaml @@ -2,6 +2,6 @@ persistent_id: doi:10.70122/XXX/XXXXX dataverse_url: https://demo.dataverse.org/ api_token: XXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX files: - - filepath: ./tests/fixtures/add_dir_files/somefile.txt - - filepath: ./tests/fixtures/add_dir_files/anotherfile.txt - directoryLabel: some/dir \ No newline at end of file + - filepath: ./tests/fixtures/add_dir_files/somefile.txt + - filepath: ./tests/fixtures/add_dir_files/anotherfile.txt + directory_label: some/dir diff --git a/tests/unit/test_cli.py b/tests/unit/test_cli.py index 7888288..5bfe011 100644 --- a/tests/unit/test_cli.py +++ b/tests/unit/test_cli.py @@ -14,7 +14,7 @@ def test_full_input(self): # Act cli_input = _parse_yaml_config(fpath) - [file.extract_filename_hash_file() for file in cli_input.files] + [file.extract_file_name_hash_file() for file in cli_input.files] # Assert expected_files = [ @@ -28,7 +28,7 @@ def test_full_input(self): assert len(cli_input.files) == 2 assert sorted( - [(file.directoryLabel, file.fileName) for file in cli_input.files] + [(file.directory_label, file.file_name) for file in cli_input.files] ) == sorted(expected_files) @@ -74,7 +74,7 @@ def test_yaml_input(self, credentials): "files": [ { "filepath": "./tests/fixtures/add_dir_files/somefile.txt", - "directoryLabel": "", + "directory_label": "", } ], } diff --git a/tests/unit/test_file.py b/tests/unit/test_file.py index 695c506..314c0b1 100644 --- a/tests/unit/test_file.py +++ b/tests/unit/test_file.py @@ -10,13 +10,13 @@ def test_read_file(self): # Act file = File( filepath=fpath, - directoryLabel="", + directory_label="", ) - file.extract_filename_hash_file() + file.extract_file_name_hash_file() # Assert - assert file.fileName == "somefile.txt" + assert file.file_name == "somefile.txt" def test_read_non_existent_file(self): # Arrange @@ -26,10 +26,10 @@ def test_read_non_existent_file(self): with pytest.raises(FileNotFoundError): file = File( filepath=fpath, - directoryLabel="", + directory_label="", ) - file.extract_filename_hash_file() + file.extract_file_name_hash_file() def test_read_non_file(self): # Arrange @@ -39,7 +39,7 @@ def test_read_non_file(self): with pytest.raises(IsADirectoryError): file = File( filepath=fpath, - directoryLabel="", + directory_label="", ) - file.extract_filename_hash_file() + file.extract_file_name_hash_file() diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index 81dbe32..065de37 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -1,7 +1,9 @@ +from io import BytesIO import requests import pytest from rich.progress import Progress +from 
dvuploader.file import File from dvuploader.utils import ( add_directory, build_url, @@ -17,7 +19,7 @@ def test_all_files_added_except_hidden(self): # Act files = add_directory(directory) - [file.extract_filename_hash_file() for file in files] + [file.extract_file_name_hash_file() for file in files] # Assert expected_files = [ @@ -30,15 +32,15 @@ def test_all_files_added_except_hidden(self): assert len(files) == len(expected_files), "Wrong number of files" - for directory_label, filename in expected_files: + for directory_label, file_name in expected_files: assert any( - file.fileName == filename for file in files - ), f"File {filename} not found in files" + file.file_name == file_name for file in files + ), f"File {file_name} not found in files" - file = next(filter(lambda file: file.fileName == filename, files)) + file = next(filter(lambda file: file.file_name == file_name, files)) assert ( - file.directoryLabel == directory_label - ), f"File {filename} has wrong directory label" + file.directory_label == directory_label + ), f"File {file_name} has wrong directory label" def test_all_files_added_except_hidden_and_dunder(self): # Arrange @@ -46,7 +48,7 @@ def test_all_files_added_except_hidden_and_dunder(self): # Act files = add_directory(directory, ignore=[r"^\.", "__.*__"]) - [file.extract_filename_hash_file() for file in files] + [file.extract_file_name_hash_file() for file in files] # Assert expected_files = [ @@ -58,15 +60,15 @@ def test_all_files_added_except_hidden_and_dunder(self): assert len(files) == len(expected_files), "Wrong number of files" - for directory_label, filename in expected_files: + for directory_label, file_name in expected_files: assert any( - file.fileName == filename for file in files - ), f"File {filename} not found in files" + file.file_name == file_name for file in files + ), f"File {file_name} not found in files" - file = next(filter(lambda file: file.fileName == filename, files)) + file = next(filter(lambda file: file.file_name == file_name, files)) assert ( - file.directoryLabel == directory_label - ), f"File {filename} has wrong directory label" + file.directory_label == directory_label + ), f"File {file_name} has wrong directory label" class TestBuildUrl: @@ -150,8 +152,8 @@ def test_valid_parameters_with_dotted_dict(self, mocker): "data": { "latestVersion": { "files": [ - {"filename": "file1.txt"}, - {"filename": "file2.txt"}, + {"file_name": "file1.txt"}, + {"file_name": "file2.txt"}, ] } } @@ -163,8 +165,8 @@ def test_valid_parameters_with_dotted_dict(self, mocker): # Assert the result assert result == [ - {"filename": "file1.txt"}, - {"filename": "file2.txt"}, + {"file_name": "file1.txt"}, + {"file_name": "file2.txt"}, ] # Assert that requests.get was called with the correct parameters @@ -183,7 +185,7 @@ def test_return_files_list(self, mocker): mock_response.json.return_value = { "data": { "latestVersion": { - "files": [{"filename": "file1.txt"}, {"filename": "file2.txt"}] + "files": [{"file_name": "file1.txt"}, {"file_name": "file2.txt"}] } } } @@ -193,7 +195,7 @@ def test_return_files_list(self, mocker): result = retrieve_dataset_files("http://example.com", "12345", "token") # Assert the result - assert result == [{"filename": "file1.txt"}, {"filename": "file2.txt"}] + assert result == [{"file_name": "file1.txt"}, {"file_name": "file2.txt"}] # Raise HTTPError if the request to the Dataverse repository fails. 
def test_raise_http_error(self, mocker): @@ -208,11 +210,16 @@ def test_raise_http_error(self, mocker): class TestSetupPbar: def test_returns_progress_bar_object(self): # Arrange - fpath = "tests/fixtures/add_dir_files/somefile.txt" + handler = BytesIO(b"Hello, world!") + file = File( + filepath="test.txt", + handler=handler, + ) + progress = Progress() # Act - result = setup_pbar(fpath=fpath, progress=progress) + result = setup_pbar(file=file, progress=progress) # Assert assert isinstance(result, int) From 0e57023ebc779e67009d369ac18037dc2ef3db87 Mon Sep 17 00:00:00 2001 From: Jan Range Date: Tue, 16 Apr 2024 20:21:05 +0200 Subject: [PATCH 10/15] add handler test case --- tests/integration/test_native_upload.py | 46 +++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/tests/integration/test_native_upload.py b/tests/integration/test_native_upload.py index 923936e..8ec7808 100644 --- a/tests/integration/test_native_upload.py +++ b/tests/integration/test_native_upload.py @@ -1,7 +1,9 @@ +from io import BytesIO import tempfile import pytest from dvuploader.dvuploader import DVUploader +from dvuploader.file import File from dvuploader.utils import add_directory, retrieve_dataset_files from tests.conftest import create_dataset, create_mock_file @@ -100,3 +102,47 @@ def test_forced_native_upload( assert len(files) == 3 assert sorted([file["label"] for file in files]) == sorted(expected_files) + + + def test_native_upload_by_handler( + self, + credentials, + ): + BASE_URL, API_TOKEN = credentials + + # Arrange + byte_string = b"Hello, World!" + files = [ + File(filepath="subdir/file.txt", handler=BytesIO(byte_string)), + File(filepath="biggerfile.txt", handler=BytesIO(byte_string*10000)), + ] + + # Create Dataset + pid = create_dataset( + parent="Root", + server_url=BASE_URL, + api_token=API_TOKEN, + ) + + # Act + uploader = DVUploader(files=files) + uploader.upload( + persistent_id=pid, + api_token=API_TOKEN, + dataverse_url=BASE_URL, + n_parallel_uploads=1, + ) + + # Assert + expected_files = [ + "file.txt", + "biggerfile.txt", + ] + files = retrieve_dataset_files( + dataverse_url=BASE_URL, + persistent_id=pid, + api_token=API_TOKEN, + ) + + assert len(files) == 2 + assert sorted([file["label"] for file in files]) == sorted(expected_files) From 011c8224c274283ab6a5a55e831d2f99cbb17d68 Mon Sep 17 00:00:00 2001 From: Jan Range Date: Wed, 17 Apr 2024 07:59:21 +0200 Subject: [PATCH 11/15] fix `directoryLabel` mispelled --- dvuploader/dvuploader.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/dvuploader/dvuploader.py b/dvuploader/dvuploader.py index ffd395c..0653e05 100644 --- a/dvuploader/dvuploader.py +++ b/dvuploader/dvuploader.py @@ -108,7 +108,8 @@ def upload( "\n[bold italic white]⚠️ Direct upload not supported. Falling back to Native API." 
) - rich.print(f"\n[bold italic white]🚀 Uploading files\n") + if self.verbose: + rich.print(f"\n[bold italic white]🚀 Uploading files\n") progress, pbars = self.setup_progress_bars(files=files) @@ -292,7 +293,7 @@ def _get_file_id( # Find the file that matches label and directory_label for ds_file in ds_files: - dspath = os.path.join(ds_file.get("directory_label", ""), ds_file["label"]) + dspath = os.path.join(ds_file.get("directoryLabel", ""), ds_file["label"]) fpath = os.path.join(file.directory_label, file.file_name) # type: ignore if dspath == fpath: @@ -320,7 +321,7 @@ def _check_hashes(file: File, dsFile: Dict): file.checksum.value == hash_value and file.checksum.type == hash_algo and file.file_name == dsFile["label"] - and file.directory_label == dsFile.get("directory_label", "") + and file.directory_label == dsFile.get("directoryLabel", "") ) @staticmethod From 972477d2015bd761a10943450657041dc8b09e73 Mon Sep 17 00:00:00 2001 From: Jan Range Date: Wed, 17 Apr 2024 07:59:42 +0200 Subject: [PATCH 12/15] if handler use fpath as directory label --- dvuploader/file.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dvuploader/file.py b/dvuploader/file.py index ffd768b..85045ee 100644 --- a/dvuploader/file.py +++ b/dvuploader/file.py @@ -71,6 +71,7 @@ def extract_file_name_hash_file(self): self._size = os.path.getsize(self.filepath) else: self._size = len(self.handler.read()) + self.directory_label = os.path.dirname(self.filepath) self.handler.seek(0) self.file_name = os.path.basename(self.filepath) From 2842a75691b77555546826107c39d6f1dfb63ccc Mon Sep 17 00:00:00 2001 From: Jan Range Date: Wed, 17 Apr 2024 07:59:50 +0200 Subject: [PATCH 13/15] fix typo --- dvuploader/nativeupload.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dvuploader/nativeupload.py b/dvuploader/nativeupload.py index aa13bb5..16ba67b 100644 --- a/dvuploader/nativeupload.py +++ b/dvuploader/nativeupload.py @@ -177,7 +177,7 @@ async def _single_native_upload( json_data = { "description": file.description, "forceReplace": True, - "directory_label": file.directory_label, + "directoryLabel": file.directory_label, "categories": file.categories, "restrict": file.restrict, "forceReplace": True, From 61fc630dcc038db6cb020c73e1d0ec37ba1f958d Mon Sep 17 00:00:00 2001 From: Jan Range Date: Wed, 17 Apr 2024 07:59:58 +0200 Subject: [PATCH 14/15] fix typo --- tests/unit/test_cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/test_cli.py b/tests/unit/test_cli.py index 5bfe011..ba2438c 100644 --- a/tests/unit/test_cli.py +++ b/tests/unit/test_cli.py @@ -74,7 +74,7 @@ def test_yaml_input(self, credentials): "files": [ { "filepath": "./tests/fixtures/add_dir_files/somefile.txt", - "directory_label": "", + "directoryLabel": "", } ], } From c4b40bc7e59bcbdddfe34686cbc1be37a6de437d Mon Sep 17 00:00:00 2001 From: Jan Range Date: Wed, 17 Apr 2024 08:00:17 +0200 Subject: [PATCH 15/15] add check for directory label --- tests/integration/test_native_upload.py | 38 ++++++++++++++++--------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/tests/integration/test_native_upload.py b/tests/integration/test_native_upload.py index 8ec7808..183cc54 100644 --- a/tests/integration/test_native_upload.py +++ b/tests/integration/test_native_upload.py @@ -1,4 +1,5 @@ from io import BytesIO +import json import tempfile import pytest @@ -42,17 +43,18 @@ def test_native_upload( ) # Assert - expected_files = [ - "small_file.txt", - "mid_file.txt", - "large_file.txt", - ] files = 
retrieve_dataset_files( dataverse_url=BASE_URL, persistent_id=pid, api_token=API_TOKEN, ) + expected_files = [ + "small_file.txt", + "mid_file.txt", + "large_file.txt", + ] + assert len(files) == 3 assert sorted([file["label"] for file in files]) == sorted(expected_files) @@ -89,17 +91,18 @@ def test_forced_native_upload( ) # Assert - expected_files = [ - "small_file.txt", - "mid_file.txt", - "large_file.txt", - ] files = retrieve_dataset_files( dataverse_url=BASE_URL, persistent_id=pid, api_token=API_TOKEN, ) + expected_files = [ + "small_file.txt", + "mid_file.txt", + "large_file.txt", + ] + assert len(files) == 3 assert sorted([file["label"] for file in files]) == sorted(expected_files) @@ -134,10 +137,11 @@ def test_native_upload_by_handler( ) # Assert - expected_files = [ - "file.txt", - "biggerfile.txt", + expected = [ + ("", "biggerfile.txt"), + ("subdir", "file.txt"), ] + files = retrieve_dataset_files( dataverse_url=BASE_URL, persistent_id=pid, @@ -145,4 +149,10 @@ def test_native_upload_by_handler( ) assert len(files) == 2 - assert sorted([file["label"] for file in files]) == sorted(expected_files) + + for ex_dir, ex_f in expected: + + file = next(file for file in files if file["label"] == ex_f) + + assert file["label"] == ex_f, f"File label does not match for file {json.dumps(file)}" + assert file.get("directoryLabel", "") == ex_dir, f"Directory label does not match for file {json.dumps(file)}"
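
Taken together, the series above generalises `File` beyond paths on disk: a `File` may now wrap an in-memory handler (`BytesIO`, `StringIO`, or any readable `IO`), with checksum, size, file name, and directory label derived from the handler and the `filepath` string. Below is a minimal usage sketch of the resulting API; the DOI, server URL, and API token are placeholders rather than values taken from this series:

```python
from io import BytesIO, StringIO

import dvuploader as dv

files = [
    # A plain local file still works as before
    dv.File(filepath="./small.txt"),
    # In-memory content: `filepath` only supplies the stored file name
    # ("table.csv") and, per patch 12, the directory label ("results")
    dv.File(filepath="results/table.csv", handler=StringIO("a,b\n1,2\n")),
    dv.File(filepath="results/blob.bin", handler=BytesIO(b"\x00" * 1024)),
]

# verbose=False (patch 03) suppresses the summary panel, tables, and progress bars
uploader = dv.DVUploader(files=files, verbose=False)
uploader.upload(
    persistent_id="doi:10.70122/XXX/XXXXX",  # placeholder dataset DOI
    dataverse_url="https://demo.dataverse.org/",  # placeholder installation
    api_token="XXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX",  # placeholder token
    n_parallel_uploads=1,
)
```

Handler-backed files go through the native API and single-part direct upload unchanged; multipart chunked direct upload (patch 02) still requires a local file and raises `NotImplementedError` for in-memory objects.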