From 63494c64f0c5b0fb36b71854f8a19c3a8385efe2 Mon Sep 17 00:00:00 2001 From: Vadim Yakshigulov Date: Sat, 29 Jun 2024 03:57:23 +0300 Subject: [PATCH] Update to desbordante latest version --- README.md | 120 ++++++++++++++++++++++++++++++++++++++++++++----- pyproject.toml | 4 +- src/cli.py | 51 ++++++++++++++------- 3 files changed, 147 insertions(+), 28 deletions(-) diff --git a/README.md b/README.md index 7380899..c942abf 100644 --- a/README.md +++ b/README.md @@ -1,18 +1,118 @@ -# Desbordante CLI +

+ +

+ +--- + +# Desbordante: high-performance data profiler (console interface) + +## What is it? + +[**Desbordante**](https://github.com/Desbordante/desbordante-core) is a high-performance data profiler oriented towards exploratory data analysis. This is the repository for the Desbordante console interface, which is published as a separate [package](https://pypi.org/project/desbordante-cli/). This package depends on the [desbordante package](https://pypi.org/project/desbordante/), which contains the C++ code for pattern discovery and validation. As a result, depending on the algorithm and dataset, the runtimes may be cut by 2-10 times compared to the alternative tools. + +## Table of Contents + +- [Desbordante: high-performance data profiler (console interface)](#desbordante-high-performance-data-profiler-console-interface) + - [What is it?](#what-is-it) + - [Table of Contents](#table-of-contents) +- [Main Features](#main-features) + - [Installation](#installation) + - [PyPI](#pypi) + - [Git](#git) + - [Usage examples](#usage-examples) +- [Contacts and Q\&A](#contacts-and-qa) + +# Main Features + +[**Desbordante**](https://github.com/Desbordante/desbordante-core) is a high-performance data profiler that is capable of discovering and validating many different patterns in data using various algorithms. + +The **Discovery** task is designed to identify all instances of a specified pattern *type* of a given dataset. + +The **Validation** task is different: it is designed to check whether a specified pattern *instance* is present in a given dataset. This task not only returns True or False, but it also explains why the instance does not hold (e.g. it can list table rows with conflicting values). 
+ +The currently supported data patterns are: +* Functional dependency variants: + - Exact functional dependencies (discovery and validation) + - Approximate functional dependencies, with g1 metric (discovery and validation) + - Probabilistic functional dependencies, with PerTuple and PerValue metrics (discovery) +* Graph functional dependencies (validation) +* Conditional functional dependencies (discovery) +* Inclusion dependencies (discovery) +* Order dependencies: + - set-based axiomatization (discovery) + - list-based axiomatization (discovery) +* Metric functional dependencies (validation) +* Fuzzy algebraic constraints (discovery) +* Unique column combinations: + - Exact unique column combination (discovery and validation) + - Approximate unique column combination, with g1 metric (discovery and validation) +* Association rules (discovery) + +For more information about the supported patterns check the main [repo](https://github.com/Desbordante/desbordante-core). -Part of Desbordante platform avaliable at https://github.com/Desbordante ## Installation -```bash + +**Requirements**: +* Python 3.11+ +* pipx +* [`desbordante` package](https://pypi.org/project/desbordante/) requirements + +### PyPI +Run the following command: +```sh pipx install desbordante-cli ``` -Or -```bash -pipx install git+https://github.com/toadharvard/desbordante-cli +### Git +```sh +pipx install git+https://github.com/desbordante/desbordante-cli +``` + +## Usage examples +Example datasets can be found at main [repo](https://github.com/Desbordante/desbordante-core) + +1) Discover all exact functional dependencies in a table stored in a comma-separated file with a header row. In this example the default FD discovery algorithm (HyFD) is used. 
+ +```sh +desbordante --task=fd --table=../examples/datasets/university_fd.csv , True +``` + +```text +[Course Classroom] -> Professor +[Classroom Semester] -> Professor +[Classroom Semester] -> Course +[Professor] -> Course +[Professor Semester] -> Classroom +[Course Semester] -> Classroom +[Course Semester] -> Professor +``` + +2) Discover all approximate functional dependencies with error less than or equal to 0.1 in a table represented by a .csv file that uses a comma as the separator and has a header row. In this example the default AFD discovery algorithm (Pyro) is used. + +```sh +desbordante --task=afd --table=../examples/datasets/inventory_afd.csv , True --error=0.1 +``` + +```text +[Id] -> ProductName +[Id] -> Price +[ProductName] -> Price +``` + +3) Check whether metric functional dependency “Title -> Duration” with radius 5 (using the Euclidean metric) holds in a table represented by a .csv file that uses a comma as the separator and has a header row. In this example the default MFD validation algorithm (BRUTE) is used. + +```sh +desbordante --task=mfd_verification --table=../examples/datasets/theatres_mfd.csv , True --lhs_indices=0 --rhs_indices=2 --metric=euclidean --parameter=5 ``` -## Usage -```bash + +```text +True +``` + +For more information check the --help option: +```sh desbordante --help ``` -## Licence -See [LICENCE](./LICENCE) file for details on licensing. +# Contacts and Q&A + +If you have any questions regarding the tool you can create an [issue](https://github.com/Desbordante/desbordante-cli/issues) at GitHub. 
diff --git a/pyproject.toml b/pyproject.toml index 059c15f..06b406c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,7 @@ packages = [ [tool.poetry.dependencies] python = ">=3.11" -desbordante = "2.0.0" +desbordante = "2.1.0" click = "^8.1.7" @@ -21,4 +21,4 @@ build-backend = "poetry.core.masonry.api" [tool.poetry.scripts] -desbordante = "src.cli:cli" +desbordante = "src.cli:desbordante_cli" diff --git a/src/cli.py b/src/cli.py index 089ec0e..b2ffc8a 100644 --- a/src/cli.py +++ b/src/cli.py @@ -14,6 +14,7 @@ class Task(StrEnum): fd = auto() cfd = auto() + ar = auto() afd = auto() od = auto() pfd = auto() @@ -43,6 +44,7 @@ class Algorithm(StrEnum): order = auto() spider = auto() faida = auto() + apriori = auto() naive_fd_verifier = auto() naive_afd_verifier = auto() icde09_mfd_verifier = auto() @@ -64,6 +66,7 @@ class Algorithm(StrEnum): TABLES = 'tables' TABLES_LIST = 'tables_list' TABLES_DIRECTORY = 'tables_directory' +INPUT_FORMAT = 'input_format' PRIMARY_HELP = '''The Desbordante data profiler is designed to help users discover or verify various types of patterns in data. 
These patterns are @@ -118,13 +121,14 @@ class Algorithm(StrEnum): 1) Discovery of exact functional dependencies 2) Discovery of approximate functional dependencies 3) Discovery of probabilistic functional dependencies -4) Discovery of exact order dependencies (set-based and list-based axiomatization) -5) Discovery of inclusion dependencies -6) Verification of exact functional dependencies -7) Verification of approximate functional dependencies -8) Verification of metric dependencies -9) Verification of exact unique column combinations -10) Verification of approximate unique column combinations +4) Discovery of association rules +5) Discovery of exact order dependencies (set-based and list-based axiomatization) +6) Discovery of inclusion dependencies +7) Verification of exact functional dependencies +8) Verification of approximate functional dependencies +9) Verification of metric dependencies +10) Verification of exact unique column combinations +11) Verification of approximate unique column combinations If you need other types, you should look into the C++ code, the Python bindings or the Web version. @@ -216,6 +220,12 @@ class Algorithm(StrEnum): Algorithms: SPIDER, FAIDA Default: SPIDER ''' +AR_HELP = '''Discover association rules. For more information, refer to +"Frequent Pattern Mining" book by Charu C. Aggarwal and Jiawei Han. + +Algorithms: Apriori +Default: Apriori +''' FD_VERIFICATION_HELP = '''Verify whether a given exact functional dependency holds on the specified dataset. For more information about the primitive and algorithms, refer to the “Functional dependency discovery: an experimental @@ -329,12 +339,10 @@ class Algorithm(StrEnum): discovery of order dependencies via set-based axiomatization” paper by J. Szlichta et al. ''' - ORDER_HELP = '''Algorithm Order efficiently discovers all n-ary lexicographical order dependencies under the operator “<”. 
For more information, refer to the -“Efficient order dependency detection” paper by Philipp Langer and Felix Naumann +“Efficient order dependency detection” paper by Philipp Langer and Felix Naumann. ''' - FD_FIRST_HELP = '''FD-First algorithm belongs to the family of algorithms for discovering approximate conditional functional dependencies. For more information, refer to the “Revisiting Conditional Functional Dependency @@ -362,20 +370,24 @@ class Algorithm(StrEnum): graph functional dependency holds. For more information about the primitive refer to “Functional Dependencies for Graphs” by Wenfei Fan et al. ''' - NAIVE_UCC_VERIFIER_HELP = '''A straightforward partition-based algorithm for verifying whether a given unique column combination holds. For more information on partitions refer to Section 2 of “TANE : An Efficient Algorithm for Discovering Functional and Approximate Dependencies” by Y.Huntala et al. For more information on UCC, refer to "Efficient Discovery -of Approximate Dependencies" by S. Kruse and F. Naumann ''' - +of Approximate Dependencies" by S. Kruse and F. Naumann. +''' NAIVE_AUCC_VERIFIER_HELP = '''A straightforward partition-based algorithm for verifying whether a given approximate unique column combination holds. For more information on partitions refer to Section 2 of “TANE : An Efficient Algorithm for Discovering Functional and Approximate Dependencies” by Y.Huntala et al. For more information on AUCC, refer to "Efficient Discovery -of Approximate Dependencies" by S. Kruse and F. Naumann''' +of Approximate Dependencies" by S. Kruse and F. Naumann. +''' +APRIORI_HELP = '''An algorithm for frequent item set mining and association +rule discovery. For more information, refer to the "Fast Algorithms for +Mining Association Rules" paper by Agrawal and Srikant from 1994. 
+''' OPTION_TYPES = { str: 'STRING', @@ -391,6 +403,7 @@ class Algorithm(StrEnum): Task.od: OD_HELP, Task.pfd: PFD_HELP, Task.ind: IND_HELP, + Task.ar: AR_HELP, Task.fd_verification: FD_VERIFICATION_HELP, Task.afd_verification: AFD_VERIFICATION_HELP, Task.mfd_verification: MFD_VERIFICATION_HELP, @@ -424,6 +437,7 @@ class Algorithm(StrEnum): Algorithm.naive_gfd_verifier: GFD_VERIFIER_HELP, Algorithm.gfd_verifier: GFD_VERIFIER_HELP, Algorithm.egfd_verifier: GFD_VERIFIER_HELP, + Algorithm.apriori: APRIORI_HELP } TaskInfo = namedtuple('TaskInfo', ['algos', 'default']) @@ -443,6 +457,8 @@ class Algorithm(StrEnum): Task.pfd: TaskInfo([Algorithm.pfdtane], Algorithm.pfdtane), Task.ind: TaskInfo([Algorithm.spider, Algorithm.faida], Algorithm.spider), + Task.ar: TaskInfo([Algorithm.apriori], + Algorithm.apriori), Task.fd_verification: TaskInfo([Algorithm.naive_fd_verifier], Algorithm.naive_fd_verifier), Task.afd_verification: TaskInfo([Algorithm.naive_afd_verifier], @@ -482,6 +498,7 @@ class Algorithm(StrEnum): Algorithm.naive_gfd_verifier: desbordante.gfd_verification.algorithms.NaiveGfdValid, Algorithm.gfd_verifier: desbordante.gfd_verification.algorithms.GfdValid, Algorithm.egfd_verifier: desbordante.gfd_verification.algorithms.EGfdValid, + Algorithm.apriori: desbordante.ar.algorithms.Apriori } @@ -621,6 +638,8 @@ def get_algo_result(algo: desbordante.Algorithm, algo_name: str) -> Any: result = algo.get_gfds() case Algorithm.fd_first: result = algo.get_cfds() + case Algorithm.apriori: + result = algo.get_ars() case _: assert False, 'No matching get_result function.' 
return result @@ -770,7 +789,7 @@ def decorator(func: Callable) -> Callable: @click.option(f'--{TABLES_DIRECTORY}', type=(click.Path(exists=True, file_okay=False, dir_okay=True, resolve_path=True, allow_dash=False), str, bool)) @algos_options() -def cli(**kwargs: Any) -> None: +def desbordante_cli(**kwargs: Any) -> None: """Takes in options from console as a dictionary, sets these options for the selected algo, runs algo and prints the result""" @@ -806,4 +825,4 @@ def cli(**kwargs: Any) -> None: if __name__ == '__main__': - cli() + desbordante_cli()