From 346413251d8d90a5f2c68dba191529aa14ad84ac Mon Sep 17 00:00:00 2001 From: Nicola Coretti Date: Thu, 14 Mar 2024 10:24:40 +0100 Subject: [PATCH] WIP: draft bucketpath --- doc/_static/bucketpath.drawio.svg | 4 + doc/design/bucketpath.rst | 364 ++++++++++++++++++++++++++++++ doc/design/design.rst | 7 + doc/index.rst | 1 + 4 files changed, 376 insertions(+) create mode 100644 doc/_static/bucketpath.drawio.svg create mode 100644 doc/design/bucketpath.rst create mode 100644 doc/design/design.rst diff --git a/doc/_static/bucketpath.drawio.svg b/doc/_static/bucketpath.drawio.svg new file mode 100644 index 00000000..e1d4a005 --- /dev/null +++ b/doc/_static/bucketpath.drawio.svg @@ -0,0 +1,4 @@ + + + +
Bucket
Path
Bucket...
Local
Path
Local...
Chroot
Chroot
ReadOnly
ReadOnly
...
...
...
...

Pathlike

Pathlike

Modifiers

Modifiers

Inputs

Inputs

Factory

Factory
credentials
credentials
uri
uri
...
...
Pathlike
Pathlike

Concrete Paths

Concrete Paths

Factory

Factory
Text is not SVG - cannot display
\ No newline at end of file diff --git a/doc/design/bucketpath.rst b/doc/design/bucketpath.rst new file mode 100644 index 00000000..febb53fd --- /dev/null +++ b/doc/design/bucketpath.rst @@ -0,0 +1,364 @@ +============================== +Design Document Bucket Path(s) +============================== + +Problem Description +=================== + +Users of the bucketfs file system need to use it in various different contexts, e.g. from the outside of the +DB interacting with bucketfs, or from within the DB when accessing bucketfs paths from within UDFs. +Also, common actions/tasks like listing a directory are pretty tedious when just interacting with +the bucket API due to the fact that it does not know the concept of directories. So in +order to simplify and streamline frequently used path operations and also provide a uniform +interface across the actual system (local path, http, ...) behind the bucketfs path, we need +to have an abstraction for the user. + +The BucketFS filesystem is an integral part of sharing and accessing data for its users. +However, the current user experience with BucketFS presents several challenges, particularly in terms of versatility and ease of use across different contexts. Users interact with BucketFS both externally, from outside the database, and internally, within database operations such as accessing paths from within User-Defined Functions (UDFs). This dual mode of interaction introduces complexity and inefficiency, especially for common filesystem operations. + +Challenges with Current BucketFS Interactions ++++++++++++++++++++++++++++++++++++++++++++++ + +1. **Contextual Versatility:** + Users face difficulties when switching between different operational contexts, such as external access (http) versus internal access (local paths, during UDF execution). The lack of a seamless transition between contexts hinders productivity and introduces additional code complexities. + +2. 
**Directory Operations:** + BucketFS inherently lacks the concept of directories as understood in traditional file systems. This absence complicates tasks like listing directory contents, making what should be simple actions cumbersome and time-consuming. Users are forced to interact with a lower-level bucket API for directory-like operations, which is not only tedious but also counterintuitive. + +3. **Uniform Interface Absence:** + There is a notable absence of a uniform interface for interacting with the underlying systems (local filesystem, HTTP, etc.) behind the BucketFS paths. This inconsistency in interfaces across different backends complicates the user experience, as users must adjust their interaction patterns depending on the underlying system being accessed. + +Proposed Solution +================= + +To address the identified issues with BucketFS interactions, we propose adding an abstraction layer that simplifies and standardizes these interactions across different contexts and operations. This approach is based on the design of the `pathlib` module in the Python standard library, which abstracts filesystem access across operating systems. + +Our proposed path abstraction layer will: + +- **Mirror the `pathlib` Interface:** By adopting an interface similar to `pathlib`, the abstraction aims to utilize its structured and proven design for filesystem interaction. This decision is based on the objective to leverage `pathlib`'s intuitive model while adjusting it to fit the specific requirements of BucketFS. + +- **Focus on Essential Functionalities:** Although inspired by `pathlib`, the proposed abstraction will be streamlined to include only the functionalities necessary for effective BucketFS management. It will diverge from `pathlib` in areas specific to BucketFS's architecture and use cases, ensuring a tailored approach. 
+ +- **Enhance Filesystem Operations:** The abstraction is designed to facilitate common filesystem tasks such as directory listings and file operations, addressing the current lack of a unified method for such interactions in BucketFS. This enhancement aims to standardize the way users interact with the filesystem, regardless of the backend system. + +The development of a path abstraction layer, while inspired by `pathlib`, is specifically designed to meet the unique interaction requirements of BucketFS. This proposed solution focuses on the practical needs of BucketFS users, aiming to streamline their workflow and improve efficiency in managing filesystem tasks. + + +Design +====== + +Design Goals +++++++++++++ +The primary goal of this design is to create an abstraction that simplifies working with the bucketfs file system and its usages. +Additionally, we want to maintain compatibility with the intuitive and widely used `pathlib` interface(s) where possible. + +This abstraction should: + +- Simplify interacting with bucketfs paths, by providing implementations for common tasks. +- Provide a way to persist and/or share path information across processes and systems. +- Reduce the learning curve for users familiar with the `pathlib` interface. +- Make sure the behaviour follows `pathlib` wherever possible. +- Ensure that the design is flexible enough to accommodate future enhancements and new features. + + +Architecture +++++++++++++ + +The architecture of the proposed solution is divided into four main components: + +- Interface (Pathlike) +- Backends (Concrete Paths) +- Extensions (Modifiers) +- Path Creation (Factory) + +Overview +-------- + +.. image:: /_static/bucketpath.drawio.svg + :alt: BucketFS Path Architecture + :align: center + +Interface +--------- +The central component of our path abstraction is defined by the ``Pathlike`` protocol. 
We opted for a protocol over a class or abstract class to eliminate inheritance and unnecessary dependencies while maintaining a clear interface. + +The ``Pathlike`` protocol specifies the essential functionalities of this abstraction, aiming for compatibility with Python's ``pathlib`` ``Path`` interfaces wherever practical. + +Backends +-------- +Backends implement the ``Pathlike`` protocol for specific underlying systems. Currently, we need at least two backends: one for local BucketFS paths and another for HTTP-based BucketFS paths. + +Extensions +---------- +Extensions modify ``Pathlike`` objects to add general-purpose capabilities. Currently, we plan to implement at least two extensions: + +Chroot +^^^^^^ + +Ensures a path is restricted to a specific root, preventing traversal above it, even if the modified path is not the system's actual root. + +Use Cases: + +- Simplify directory pinning for users +- Emulate custom roots + +ReadOnly +^^^^^^^^ + +Adjusts for the differences in behavior of local paths within UDFs, such as their read-only nature. This modifier allows the API to appropriately handle UDF paths. + +Path Creation +------------- +Path creation is managed by a factory. Not all information required for creating or sharing a path is uniformly applicable across systems and processes. For example, while the location and settings can be determined from the ``uri``, credentials should not be openly shared. + +The API's factory system compiles necessary information and provides a straightforward interface for users to create paths. + + +Detailed Design ++++++++++++++++ + +The Bucket Path API aims to align with Python's ``pathlib`` abstractions, while not mirroring its entire interface due to the extensive functionality and some aspects not being fully compatible with bucket file systems. +The goal is to utilize common functionalities and names, to improve the ease of use and reduce the learning curve. 
+ +It's important to note that wherever feasible, we adopt function and property names along with semantics from ``pathlib``. +However, when there is a significant deviation in semantics from ``pathlib`` definitions, we choose distinct names for properties and functions. This approach ensures clarity for users regarding differences. + +Implementation guidelines are as follows: + +- Embrace and use ``pathlib`` semantics and naming conventions when applicable. +- For significant semantic deviations, opt for unique, clear names that avoid confusion with ``pathlib`` terminology. + + +Pathlike +-------- +Added functionalities, or functionalities +where the implementation would majorly differ from the `pathlib` variants, should use +property and function names that are unique. + +.. code-block:: python + + from typing import Protocol + + # interface / protocol or abc + # Note: should mimic pathlib.Path so it can be easily understood and adopted. + class Pathlike(Protocol): + + @property + def name: + """ + A string representing the final path component, excluding the drive and root, if any. + """ + ... + + @property + def suffix: + """ + The file extension of the final component, if any. + """ + ... + + @property + def root: + """ + A string representing the (local or global) root, if any. + """ + ... + + @property + def parent: + """ + The logical parent of the path. + """ + ... + + def as_uri(): + """ + Represent the path as a file URI. ValueError is raised if the path isn’t absolute. + """ + ... + + def exists(): + """ + Return True if the path points to an existing file or directory. + """ + ... + + def is_dir(): + """ + Return True if the path points to a directory, False if it points to another kind of file. + """ + ... + + def is_file(): + """ + Return True if the path points to a regular file, False if it points to another kind of file. + """ + ... + + def read(chunk_size: int = 8192) -> Iterable[ByteString]: + """ + """ + ... 
+ + def write(data: ByteString | BinaryIO | Iterable[ByteString]): + """ + If exists() is false, it will be true after writing + """ + ... + + def rm(): + """ + If exists() and is_file yields true for this path, the path will be deleted. + """ + ... + + def rmdir(recursive: bool = False): + """ + Should mimic the behaviour of ... + so likely w + """ + ... + + def joinpath(*pathsegements) + """ + Calling this method is equivalent to combining the path with each of the given pathsegments in turn: + """ + ... + + def walk(): + """ + Generate the file names in a directory tree by walking the tree either top-down or bottom-up. + + For each directory in the directory tree rooted at self (including self but excluding ‘.’ and ‘..’), the method yields a 3-tuple of (dirpath, dirnames, filenames). + """ + ... + + def iterdir(): + """ + When the path points to a directory, yield path objects of the directory contents. + + The children are yielded in arbitrary order, and the special entries '.' and '..' are not included. If a file is removed from or added to the directory after creating the iterator, whether a path object for that file be included is unspecified. + """ + ... + + # Overload / for joining, see also joinpath or `pathlib.Path`. + def __truediv__(): + ... + + +Concrete Paths (Backends) +------------------------- + +Authentication and other basic things should be implemented by the bucket, which is passed to the bucketpath + +.. code-block:: python + + + # Attention: needs to support/implement Pathlike protocol + class BucketPath: + """ + Provides access to a bucket path served via http or https. + """ + + protocol = ['bfs', 'bfss'] + + def __init__(bucket: Bucket, path: str): + ... + + + # QUESTION: UdfPath? Do we need to distinguish between localpaths and localpaths within udf? + # Note: needs to support Pathlike + class LocalPath: + + protocol = ['bfsl'] + + def __init__(path): + ... 
+ + +Modifiers (Extensions) +----------------------- + +modifiers need to extend the uri by adding their "settings" as option +a modifier should have exactly one purpose, also add this as an option, needs to consider other modifiers +could be a combined modifier +detect if only one or first + +.. code-block:: python + + # Note: needs to support Pathlike + # restrict a path to use a specific root, instead of the "normal" root + # this shall and should work on every Pathlike + class Chroot: + + # does not have a protocol, works on all kinds. + # if ?chroot is part of the uri the path LocalPath/BucketPath needs + # to be additionally wrapped into a Chroot + + def __init__(self, path: Pathlike, chroot='/'): + pass + + + # QUESTION: UdfPath? Do we need to distinguish between localpaths and localpaths within udf? + # Note: needs to support Pathlike + class ReadOnly: + """ + Modifies a pathlike object so it is readonly. + + E.g. + pathlike.write() # throws an exception, because path is readonly + """ + + def __init__(path: Pathlike): + ... + + + +Factory & Builders +------------------ + +Note: sharing across processes factory + + +.. code-block:: python + + def PathBuilder: + + def __init__(*args, **kwargs): + """ + Pass credentials/credentials store etc. + """ + pass + + def __call__(path, chroot='/') -> Pathlike: + # type: LocalPath, BucketPath, Chroot (pseudo code) + type = _determine_type(path) + facories = { + "udf" = _create_udf_path, + "bfs" = _create_bucket_path, + "chroot" = _create_chroot_path, + } + factory = factory[type] + return factory(args) + + +Examples +^^^^^^^^ + +.. 
code-block:: python + + from exasol.bucketfs import PathBuilder + + Path = PathBuilder(credentials) + + # Creating different kinds of bucketfs paths + udf_path = Path("bfsl://some/local/path/file.tar.gz") + http_bucket_path = Path("bfs://127.0.0.1:8888/service/bucket/some/file.tar.gz") + https_bucket_path = Path("bfss://127.0.0.1:8888/service/bucket/some/file.tar.gz") + chroot_path = Path("bfss://127.0.0.1:8888/service/bucket/some/sub/subsub/file.tar.gz?chroot=/some/sub/") + readonly_path = Path("bfsl://some/local/path/file.tar.gz?mode=ro") + + + + + diff --git a/doc/design/design.rst b/doc/design/design.rst new file mode 100644 index 00000000..3e8cebab --- /dev/null +++ b/doc/design/design.rst @@ -0,0 +1,7 @@ +📑 Design Documents +=================== + +.. toctree:: + :maxdepth: 1 + + bucketpath diff --git a/doc/index.rst b/doc/index.rst index 96527d94..9f463ef8 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -7,4 +7,5 @@ user_guide/user_guide api developer_guide/developer_guide + design/design changes/changelog