From 8217711665dd2189062c280d92cede64958c7c05 Mon Sep 17 00:00:00 2001 From: Roque Lopez Date: Fri, 26 Jul 2024 12:58:49 -0400 Subject: [PATCH] feat: Restructure packages to streamline the addition of new algorithms --- CONTRIBUTING.md | 50 + README.md | 20 +- bdikit/api.py | 154 +- .../column_mapping/algorithms.py | 319 -- .../value_mapping/algorithms.py | 260 -- .../value_mappers.py => mapping_functions.py} | 0 .../__init__.py | 0 .../best}/__init__.py | 0 bdikit/schema_matching/best/base.py | 15 + .../best/contrastivelearning.py | 18 + bdikit/schema_matching/best/gpt.py | 52 + .../schema_matching/best/matcher_factory.py | 63 + bdikit/schema_matching/best/maxvalsim.py | 82 + bdikit/schema_matching/best/twophase.py | 49 + bdikit/schema_matching/best/valentine.py | 106 + bdikit/schema_matching/topk/__init__.py | 1 + bdikit/schema_matching/topk/base.py | 21 + .../topk/contrastivelearning.py} | 30 +- .../schema_matching/topk/matcher_factory.py | 34 + .../__init__.py | 0 bdikit/value_matching/base.py | 37 + bdikit/value_matching/gpt.py | 54 + bdikit/value_matching/matcher_factory.py | 44 + bdikit/value_matching/polyfuzz.py | 141 + docs/source/examples.rst | 1 - examples/top_k_matches.ipynb | 3430 +++++++++++++++++ tests/test_api.py | 14 +- ...e_mapping.py => test_mapping_functions.py} | 2 +- tests/test_schema_matching.py | 8 +- ...g_algorithms.py => test_value_matching.py} | 2 +- 30 files changed, 4244 insertions(+), 763 deletions(-) create mode 100644 CONTRIBUTING.md delete mode 100644 bdikit/mapping_algorithms/column_mapping/algorithms.py delete mode 100644 bdikit/mapping_algorithms/value_mapping/algorithms.py rename bdikit/{mapping_algorithms/value_mapping/value_mappers.py => mapping_functions.py} (100%) rename bdikit/{mapping_algorithms => schema_matching}/__init__.py (100%) rename bdikit/{mapping_algorithms/column_mapping => schema_matching/best}/__init__.py (100%) create mode 100644 bdikit/schema_matching/best/base.py create mode 100644 bdikit/schema_matching/best/contrastivelearning.py create mode 100644 bdikit/schema_matching/best/gpt.py create mode 100644 bdikit/schema_matching/best/matcher_factory.py create mode 100644 bdikit/schema_matching/best/maxvalsim.py create mode 100644 bdikit/schema_matching/best/twophase.py create mode 100644 bdikit/schema_matching/best/valentine.py create mode 100644 bdikit/schema_matching/topk/__init__.py create mode 100644 bdikit/schema_matching/topk/base.py rename bdikit/{mapping_algorithms/column_mapping/topk_matchers.py => schema_matching/topk/contrastivelearning.py} (77%) create mode 100644 bdikit/schema_matching/topk/matcher_factory.py rename bdikit/{mapping_algorithms/value_mapping => value_matching}/__init__.py (100%) create mode 100644 bdikit/value_matching/base.py create mode 100644 bdikit/value_matching/gpt.py create mode 100644 bdikit/value_matching/matcher_factory.py create mode 100644 bdikit/value_matching/polyfuzz.py create mode 100644 examples/top_k_matches.ipynb rename tests/{test_value_mapping.py => test_mapping_functions.py} (96%) rename tests/{test_value_matching_algorithms.py => test_value_matching.py} (96%) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 00000000..1106b35d --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,50 @@ +Contributing to bdi-kit +======================= + +There are many ways to contribute to bdi-kit, such as improving the codebase, reporting +issues or bugs, enhancing the documentation, reviewing pull requests from other developers, +adding new matching methods, or expanding support 
for additional standards.
+See the instructions below to get started!
+
+
+Formatting the Code
+-------------------
+
+We format code using the [black](https://black.readthedocs.io/en/stable/) code formatter.
+The CI runs for every pull request and will fail if the code is not properly formatted.
+To make sure the formatting is correct, follow the steps below.
+
+Make sure you have black installed:
+```
+pip install black
+```
+
+To format the code, run the following command before committing your changes:
+```
+make format
+```
+
+Or you can use the black command directly:
+```
+black ./bdikit/
+```
+
+
+Adding New Matching Methods
+---------------------------
+
+Contributors can add new methods for schema and value matching by following these steps (see the sketch below):
+
+1. Create a Python module inside the folder of the corresponding task (e.g., `bdikit/value_matching`).
+
+2. Define a class in the module that implements either `BaseValueMatcher` (for value matching) or `BaseSchemaMatcher` (for schema matching).
+
+3. Add a new entry in `matcher_factory.py` (e.g., `bdikit/value_matching/matcher_factory.py`). Make sure to use the correct import path for your
+module so that it can be loaded without errors.
+
+
+Code of Conduct
+---------------
+
+We abide by the principles of openness, respect, and consideration of others
+of the Python Software Foundation: https://www.python.org/psf/codeofconduct/.
diff --git a/README.md b/README.md
index 399369e8..55419979 100644
--- a/README.md
+++ b/README.md
@@ -14,6 +14,7 @@ The `bdi-kit` is a library that assists users in performing data harmonization.
 
 **Warning:** `bdi-kit` is currently in *alpha* stage and under heavy development. Expect APIs to change.
 
+
 ## Documentation
 
 Documentation is available at [https://bdi-kit.readthedocs.io/](https://bdi-kit.readthedocs.io/).
@@ -36,21 +37,4 @@ pip install git+https://github.com/VIDA-NYU/bdi-kit@devel
 
 ## Contributing
 
-We format code using the [black](https://black.readthedocs.io/en/stable/) code formatter.
-The CI runs for every pull request and will fail if code is not properly formatted.
-To make sure formatting is correct, you can do the following steps.
-
-Make sure you have black installed:
-```
-pip install black
-```
-
-To format the code, anyone can use the command before committing your changes:
-```
-make format
-```
-
-Or you can use the black command directly:
-```
-black ./bdikit/
-```
\ No newline at end of file
+To learn more about making a contribution to bdi-kit, please see our [Contributing guide](./CONTRIBUTING.md).
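To make the contributing steps concrete, here is a minimal sketch of steps 2 and 3 for a new value matching method. The module name `exact.py`, the class `ExactValueMatcher`, and the enum entry are hypothetical placeholders, not part of this patch; only `BaseValueMatcher`, `ValueMatch`, and the factory layout come from the codebase:

```
# bdikit/value_matching/exact.py (hypothetical module from step 1)
from typing import List

from bdikit.value_matching.base import BaseValueMatcher, ValueMatch


class ExactValueMatcher(BaseValueMatcher):
    """Toy matcher: pairs values that are identical after lowercasing."""

    def match(
        self, source_values: List[str], target_values: List[str]
    ) -> List[ValueMatch]:
        # Index target values by their lowercase form for O(1) lookups.
        targets = {value.lower(): value for value in target_values}
        return [
            ValueMatch(source, targets[source.lower()], 1.0)
            for source in source_values
            if source.lower() in targets
        ]
```

Step 3 then amounts to one new entry in the `ValueMatchers` enum in `bdikit/value_matching/matcher_factory.py`, e.g. `EXACT = ("exact", "bdikit.value_matching.exact.ExactValueMatcher")`, so the factory can import the class by its dotted path.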
diff --git a/bdikit/api.py b/bdikit/api.py index 92f0d9fa..747a4c40 100644 --- a/bdikit/api.py +++ b/bdikit/api.py @@ -1,19 +1,16 @@ from __future__ import annotations import logging -from enum import Enum + from collections import defaultdict from os.path import join, dirname from typing import ( Union, - Type, List, Dict, TypedDict, - Set, Optional, Tuple, Callable, - Mapping, Any, ) import itertools @@ -22,37 +19,15 @@ import panel as pn from IPython.display import display, Markdown from bdikit.utils import get_gdc_data, get_gdc_metadata -from bdikit.mapping_algorithms.column_mapping.algorithms import ( - BaseSchemaMatcher, - SimFloodSchemaMatcher, - ComaSchemaMatcher, - CupidSchemaMatcher, - DistributionBasedSchemaMatcher, - JaccardSchemaMatcher, - GPTSchemaMatcher, - ContrastiveLearningSchemaMatcher, - TwoPhaseSchemaMatcher, - MaxValSimSchemaMatcher, -) -from bdikit.mapping_algorithms.value_mapping.value_mappers import ValueMapper -from bdikit.models.contrastive_learning.cl_api import ( - DEFAULT_CL_MODEL, -) -from bdikit.mapping_algorithms.column_mapping.topk_matchers import ( - TopkColumnMatcher, - CLTopkColumnMatcher, -) -from bdikit.mapping_algorithms.value_mapping.algorithms import ( - ValueMatch, - BaseValueMatcher, - TFIDFValueMatcher, - GPTValueMatcher, - EditDistanceValueMatcher, - EmbeddingValueMatcher, - AutoFuzzyJoinValueMatcher, - FastTextValueMatcher, -) -from bdikit.mapping_algorithms.value_mapping.value_mappers import ( + +from bdikit.schema_matching.best.base import BaseSchemaMatcher +from bdikit.schema_matching.best.matcher_factory import SchemaMatchers +from bdikit.schema_matching.topk.base import BaseTopkSchemaMatcher +from bdikit.schema_matching.topk.matcher_factory import TopkMatchers +from bdikit.value_matching.base import BaseValueMatcher, ValueMatch, ValueMatchingResult +from bdikit.value_matching.matcher_factory import ValueMatchers + +from bdikit.mapping_functions import ( ValueMapper, FunctionValueMapper, DictionaryMapper, @@ -67,37 +42,6 @@ logger = logging.getLogger(__name__) -class SchemaMatchers(Enum): - SIMFLOOD = ("similarity_flooding", SimFloodSchemaMatcher) - COMA = ("coma", ComaSchemaMatcher) - CUPID = ("cupid", CupidSchemaMatcher) - DISTRIBUTION_BASED = ("distribution_based", DistributionBasedSchemaMatcher) - JACCARD_DISTANCE = ("jaccard_distance", JaccardSchemaMatcher) - GPT = ("gpt", GPTSchemaMatcher) - CT_LEARGNING = ("ct_learning", ContrastiveLearningSchemaMatcher) - TWO_PHASE = ("two_phase", TwoPhaseSchemaMatcher) - MAX_VAL_SIM = ("max_val_sim", MaxValSimSchemaMatcher) - - def __init__(self, method_name: str, method_class: Type[BaseSchemaMatcher]): - self.method_name = method_name - self.method_class = method_class - - @staticmethod - def get_instance( - method_name: str, **method_kwargs: Mapping[str, Any] - ) -> BaseSchemaMatcher: - methods = {method.method_name: method.method_class for method in SchemaMatchers} - - try: - return methods[method_name](**method_kwargs) - except KeyError: - names = ", ".join(list(methods.keys())) - raise ValueError( - f"The {method_name} algorithm is not supported. 
" - f"Supported algorithms are: {names}" - ) - - def match_schema( source: pd.DataFrame, target: Union[str, pd.DataFrame] = "gdc", @@ -130,7 +74,7 @@ def match_schema( if isinstance(method, str): if method_args is None: method_args = {} - matcher_instance = SchemaMatchers.get_instance(method, **method_args) + matcher_instance = SchemaMatchers.get_matcher(method, **method_args) elif isinstance(method, BaseSchemaMatcher): matcher_instance = method else: @@ -154,34 +98,12 @@ def _load_table_for_standard(name: str) -> pd.DataFrame: raise ValueError(f"The {name} standard is not supported") -class TopkMatchers(Enum): - CT_LEARNING = ("ct_learning", CLTopkColumnMatcher) - - def __init__(self, method_name: str, method_class: Type[TopkColumnMatcher]): - self.method_name = method_name - self.method_class = method_class - - @staticmethod - def get_instance( - method_name: str, **method_kwargs: Mapping[str, Any] - ) -> TopkColumnMatcher: - methods = {method.method_name: method.method_class for method in TopkMatchers} - try: - return methods[method_name](**method_kwargs) - except KeyError: - names = ", ".join(list(methods.keys())) - raise ValueError( - f"The {method_name} algorithm is not supported. " - f"Supported algorithms are: {names}" - ) - - def top_matches( source: pd.DataFrame, columns: Optional[List[str]] = None, target: Union[str, pd.DataFrame] = "gdc", top_k: int = 10, - method: Union[str, TopkColumnMatcher] = "ct_learning", + method: Union[str, BaseTopkSchemaMatcher] = "ct_learning", method_args: Optional[Dict[str, Any]] = None, ) -> pd.DataFrame: """ @@ -210,12 +132,12 @@ def top_matches( if isinstance(method, str): if method_args is None: method_args = {} - topk_matcher = TopkMatchers.get_instance(method, **method_args) - elif isinstance(method, TopkColumnMatcher): + topk_matcher = TopkMatchers.get_matcher(method, **method_args) + elif isinstance(method, BaseTopkSchemaMatcher): topk_matcher = method else: raise ValueError( - "The method must be a string or an instance of TopkColumnMatcher" + "The method must be a string or an instance of BaseTopkColumnMatcher" ) top_k_matches = topk_matcher.get_recommendations( @@ -232,47 +154,11 @@ def top_matches( return pd.concat(dfs, ignore_index=True) -class ValueMatchers(Enum): - TFIDF = ("tfidf", TFIDFValueMatcher) - EDIT = ("edit_distance", EditDistanceValueMatcher) - EMBEDDINGS = ("embedding", EmbeddingValueMatcher) - AUTOFJ = ("auto_fuzzy_join", AutoFuzzyJoinValueMatcher) - FASTTEXT = ("fasttext", FastTextValueMatcher) - GPT = ("gpt", GPTValueMatcher) - - def __init__(self, method_name: str, method_class: Type[BaseValueMatcher]): - self.method_name = method_name - self.method_class = method_class - - @staticmethod - def get_instance( - method_name: str, **method_kwargs: Mapping[str, Any] - ) -> BaseValueMatcher: - methods = {method.method_name: method.method_class for method in ValueMatchers} - try: - return methods[method_name](**method_kwargs) - except KeyError: - names = ", ".join(list(methods.keys())) - raise ValueError( - f"The {method_name} algorithm is not supported. 
" - f"Supported algorithms are: {names}" - ) - - -class ValueMatchingResult(TypedDict): - source: str - target: str - matches: List[ValueMatch] - coverage: float - unique_values: Set[str] - unmatch_values: Set[str] - - def match_values( source: pd.DataFrame, target: Union[str, pd.DataFrame], column_mapping: Union[Tuple[str, str], pd.DataFrame], - method: str = DEFAULT_VALUE_MATCHING_METHOD, + method: Union[str, BaseValueMatcher] = DEFAULT_VALUE_MATCHING_METHOD, method_args: Optional[Dict[str, Any]] = None, ) -> Union[pd.DataFrame, List[pd.DataFrame]]: """ @@ -314,11 +200,11 @@ def match_values( if method_args is None: method_args = {} - if "top_n" in method_args and method_args["top_n"] > 1: + if "top_k" in method_args and method_args["top_k"] > 1: logger.warning( - f"Ignoring 'top_n' argument, use the 'top_value_matches()' method to get top-k value matches." + f"Ignoring 'top_k' argument, use the 'top_value_matches()' method to get top-k value matches." ) - method_args["top_n"] = 1 + method_args["top_k"] = 1 matches = _match_values(source, target, column_mapping, method, method_args) @@ -457,7 +343,7 @@ def _match_values( target_domain, column_mapping_list = _format_value_matching_input( source, target, column_mapping ) - value_matcher = ValueMatchers.get_instance(method, **method_args) + value_matcher = ValueMatchers.get_matcher(method, **method_args) mapping_results: List[ValueMatchingResult] = [] for mapping in column_mapping_list: diff --git a/bdikit/mapping_algorithms/column_mapping/algorithms.py b/bdikit/mapping_algorithms/column_mapping/algorithms.py deleted file mode 100644 index d96b0bb8..00000000 --- a/bdikit/mapping_algorithms/column_mapping/algorithms.py +++ /dev/null @@ -1,319 +0,0 @@ -import pandas as pd -from typing import Dict, Optional, Callable -from valentine import valentine_match -from valentine.algorithms import ( - SimilarityFlooding, - Coma, - Cupid, - DistributionBased, - JaccardDistanceMatcher, - BaseMatcher, -) -from valentine.algorithms.matcher_results import MatcherResults -from valentine.algorithms.jaccard_distance import StringDistanceFunction -from openai import OpenAI -from bdikit.models.contrastive_learning.cl_api import ( - DEFAULT_CL_MODEL, -) -from bdikit.mapping_algorithms.column_mapping.topk_matchers import ( - TopkColumnMatcher, - CLTopkColumnMatcher, -) -from bdikit.mapping_algorithms.value_mapping.algorithms import ( - BaseValueMatcher, - TFIDFValueMatcher, -) - - -class BaseSchemaMatcher: - def map(self, dataset: pd.DataFrame, global_table: pd.DataFrame) -> Dict[str, str]: - raise NotImplementedError("Subclasses must implement this method") - - def _fill_missing_matches( - self, dataset: pd.DataFrame, matches: Dict[str, str] - ) -> Dict[str, str]: - for column in dataset.columns: - if column not in matches: - matches[column] = "" - return matches - - -class ValentineSchemaMatcher(BaseSchemaMatcher): - def __init__(self, matcher: BaseMatcher): - self.matcher = matcher - - def map(self, dataset: pd.DataFrame, global_table: pd.DataFrame) -> Dict[str, str]: - matches: MatcherResults = valentine_match(dataset, global_table, self.matcher) - mappings = {} - for match in matches.one_to_one(): - dataset_candidate = match[0][1] - global_table_candidate = match[1][1] - mappings[dataset_candidate] = global_table_candidate - return self._fill_missing_matches(dataset, mappings) - - -class SimFloodSchemaMatcher(ValentineSchemaMatcher): - def __init__( - self, coeff_policy: str = "inverse_average", formula: str = "formula_c" - ): - 
super().__init__(SimilarityFlooding(coeff_policy=coeff_policy, formula=formula)) - - -class ComaSchemaMatcher(ValentineSchemaMatcher): - def __init__( - self, max_n: int = 0, use_instances: bool = False, java_xmx: str = "1024m" - ): - super().__init__( - Coma(max_n=max_n, use_instances=use_instances, java_xmx=java_xmx) - ) - - -class CupidSchemaMatcher(ValentineSchemaMatcher): - def __init__( - self, - leaf_w_struct: float = 0.2, - w_struct: float = 0.2, - th_accept: float = 0.7, - th_high: float = 0.6, - th_low: float = 0.35, - c_inc: float = 1.2, - c_dec: float = 0.9, - th_ns: float = 0.7, - parallelism: int = 1, - ): - super().__init__( - Cupid( - leaf_w_struct=leaf_w_struct, - w_struct=w_struct, - th_accept=th_accept, - th_high=th_high, - th_low=th_low, - c_inc=c_inc, - c_dec=c_dec, - th_ns=th_ns, - parallelism=parallelism, - ) - ) - - -class DistributionBasedSchemaMatcher(ValentineSchemaMatcher): - def __init__( - self, - threshold1: float = 0.15, - threshold2: float = 0.15, - quantiles: int = 256, - process_num: int = 1, - ): - super().__init__( - DistributionBased( - threshold1=threshold1, - threshold2=threshold2, - quantiles=quantiles, - process_num=process_num, - ) - ) - - -class JaccardSchemaMatcher(ValentineSchemaMatcher): - def __init__( - self, - threshold_dist: float = 0.8, - distance_fun: Callable[[str, str], float] = StringDistanceFunction.Levenshtein, - process_num: int = 1, - ): - super().__init__( - JaccardDistanceMatcher( - threshold_dist=threshold_dist, - distance_fun=distance_fun, - process_num=process_num, - ) - ) - - -class MaxValSimSchemaMatcher(BaseSchemaMatcher): - def __init__( - self, - top_k: int = 20, - top_k_matcher: Optional[TopkColumnMatcher] = None, - value_matcher: Optional[BaseValueMatcher] = None, - ): - if top_k_matcher is None: - self.api = CLTopkColumnMatcher(DEFAULT_CL_MODEL) - elif isinstance(top_k_matcher, TopkColumnMatcher): - self.api = top_k_matcher - else: - raise ValueError( - f"Invalid top_k_matcher type: {type(top_k_matcher)}. " - "Must be a subclass of {TopkColumnMatcher.__name__}" - ) - - if value_matcher is None: - self.value_matcher = TFIDFValueMatcher() - elif isinstance(value_matcher, BaseValueMatcher): - self.value_matcher = value_matcher - else: - raise ValueError( - f"Invalid value_matcher type: {type(value_matcher)}. 
" - "Must be a subclass of {BaseValueMatcher.__name__}" - ) - - self.top_k = top_k - - def unique_string_values(self, column: pd.Series) -> pd.Series: - column = column.dropna() - if pd.api.types.is_string_dtype(column): - return pd.Series(column.unique(), name=column.name) - else: - return pd.Series(column.unique().astype(str), name=column.name) - - def map( - self, - dataset: pd.DataFrame, - global_table: pd.DataFrame, - ): - topk_column_matches = self.api.get_recommendations( - dataset, global_table, self.top_k - ) - - matches = {} - for source_column_name, scope in zip(dataset.columns, topk_column_matches): - - source_column_name = scope["source_column"] - top_k_columns = scope["top_k_columns"] - - source_column = dataset[source_column_name] - - if not pd.api.types.is_string_dtype(source_column): - matches[source_column_name] = top_k_columns[0].column_name - continue - - source_values = self.unique_string_values(source_column).to_list() - - scores = [] - for top_column in top_k_columns: - target_column_name = top_column.column_name - target_column = global_table[target_column_name] - target_values = self.unique_string_values(target_column).to_list() - value_matches = self.value_matcher.match(source_values, target_values) - score = sum([m.similarity for m in value_matches]) / len(target_values) - score = (top_column.score + score) / 2.0 - scores.append((source_column_name, target_column_name, score)) - - sorted_columns = sorted(scores, key=lambda it: it[2], reverse=True) - - matches[source_column_name] = sorted_columns[0][1] - - return self._fill_missing_matches(dataset, matches) - - -class GPTSchemaMatcher(BaseSchemaMatcher): - def __init__(self): - self.client = OpenAI() - - def map(self, dataset: pd.DataFrame, global_table: pd.DataFrame): - global_columns = global_table.columns - labels = ", ".join(global_columns) - candidate_columns = dataset.columns - mappings = {} - for column in candidate_columns: - col = dataset[column] - values = col.drop_duplicates().dropna() - if len(values) > 15: - rows = values.sample(15).tolist() - else: - rows = values.tolist() - serialized_input = f"{column}: {', '.join([str(row) for row in rows])}" - context = serialized_input.lower() - column_types = self.get_column_type(context, labels) - for column_type in column_types: - if column_type in global_columns: - mappings[column] = column_type - break - return self._fill_missing_matches(dataset, mappings) - - def get_column_type( - self, context: str, labels: str, m: int = 10, model: str = "gpt-4-turbo-preview" - ): - messages = [ - {"role": "system", "content": "You are an assistant for column matching."}, - { - "role": "user", - "content": """ Please select the top """ - + str(m) - + """ class from """ - + labels - + """ which best describes the context. The context is defined by the column name followed by its respective values. Please respond only with the name of the classes separated by semicolon. 
- \n CONTEXT: """ - + context - + """ \n RESPONSE: \n""", - }, - ] - col_type = self.client.chat.completions.create( - model=model, messages=messages, temperature=0.3 - ) - col_type_content = col_type.choices[0].message.content - return col_type_content.split(";") - - -class ContrastiveLearningSchemaMatcher(BaseSchemaMatcher): - def __init__(self, model_name: str = DEFAULT_CL_MODEL): - self.topk_matcher = CLTopkColumnMatcher(model_name=model_name) - - def map(self, dataset: pd.DataFrame, global_table: pd.DataFrame): - topk_matches = self.topk_matcher.get_recommendations( - dataset, global_table, top_k=1 - ) - matches = {} - for column, top_k_match in zip(dataset.columns, topk_matches): - candidate = top_k_match["top_k_columns"][0][0] - if candidate in global_table.columns: - matches[column] = candidate - return self._fill_missing_matches(dataset, matches) - - -class TwoPhaseSchemaMatcher(BaseSchemaMatcher): - def __init__( - self, - top_k: int = 20, - top_k_matcher: Optional[TopkColumnMatcher] = None, - schema_matcher: BaseSchemaMatcher = SimFloodSchemaMatcher(), - ): - if top_k_matcher is None: - self.api = CLTopkColumnMatcher(DEFAULT_CL_MODEL) - elif isinstance(top_k_matcher, TopkColumnMatcher): - self.api = top_k_matcher - else: - raise ValueError( - f"Invalid top_k_matcher type: {type(top_k_matcher)}. " - "Must be a subclass of {TopkColumnMatcher.__name__}" - ) - - self.schema_matcher = schema_matcher - self.top_k = top_k - - def map( - self, - dataset: pd.DataFrame, - global_table: pd.DataFrame, - ): - topk_column_matches = self.api.get_recommendations( - dataset, global_table, self.top_k - ) - - matches = {} - for column, scope in zip(dataset.columns, topk_column_matches): - candidates = [ - cand[0] - for cand in scope["top_k_columns"] - if cand[0] in global_table.columns - ] - reduced_dataset = dataset[[column]] - reduced_global_table = global_table[candidates] - partial_matches = self.schema_matcher.map( - reduced_dataset, reduced_global_table - ) - - if column in partial_matches: - matches[column] = partial_matches[column] - - return self._fill_missing_matches(dataset, matches) diff --git a/bdikit/mapping_algorithms/value_mapping/algorithms.py b/bdikit/mapping_algorithms/value_mapping/algorithms.py deleted file mode 100644 index ef64b6d5..00000000 --- a/bdikit/mapping_algorithms/value_mapping/algorithms.py +++ /dev/null @@ -1,260 +0,0 @@ -from typing import List, NamedTuple, Callable, Tuple -import ast -from openai import OpenAI -from polyfuzz import PolyFuzz -from polyfuzz.models import EditDistance, TFIDF, Embeddings -from flair.embeddings import TransformerWordEmbeddings, WordEmbeddings -from rapidfuzz import fuzz -from autofj import AutoFJ -from Levenshtein import ratio -import pandas as pd -import flair -import torch -from bdikit.config import get_device, VALUE_MATCHING_THRESHOLD - -flair.device = torch.device(get_device()) - - -class ValueMatch(NamedTuple): - """ - Represents a match between a source value and a target value with a - similarity score. - """ - - source_value: str - target_value: str - similarity: float - - -class BaseValueMatcher: - """ - Base class for value matching algorithms, i.e., algorithms that match - values from a source domain to values from a target domain. 
- """ - - def match( - self, source_values: List[str], target_values: List[str] - ) -> List[ValueMatch]: - raise NotImplementedError("Subclasses must implement this method") - - -class PolyFuzzValueMatcher(BaseValueMatcher): - """ - Base class for value matching algorithms based on the PolyFuzz library. - """ - - def __init__(self, polyfuzz_model: PolyFuzz, threshold: float): - self.model = polyfuzz_model - self.threshold = threshold - - def match( - self, - source_values: List[str], - target_values: List[str], - ) -> List[ValueMatch]: - - self.model.match(source_values, target_values) - match_results = self.model.get_matches() - match_results.sort_values(by="Similarity", ascending=False, inplace=True) - - matches = [] - for _, row in match_results.iterrows(): - source = row[0] - top_matches = row[1:] - indexes = range(0, len(top_matches) - 1, 2) - - for index in indexes: - target = top_matches[index] - similarity = top_matches[index + 1] - if similarity >= self.threshold: - matches.append(ValueMatch(source, target, similarity)) - - return matches - - -class TFIDFValueMatcher(PolyFuzzValueMatcher): - """ - Value matching algorithm based on the TF-IDF similarity between values. - """ - - def __init__( - self, - n_gram_range: Tuple[int, int] = (1, 3), - clean_string: bool = True, - threshold: float = VALUE_MATCHING_THRESHOLD, - top_k: int = 1, - cosine_method: str = "sparse", - ): - - super().__init__( - PolyFuzz( - method=TFIDF( - n_gram_range=n_gram_range, - clean_string=clean_string, - min_similarity=threshold, - top_n=top_k, - cosine_method=cosine_method, - ) - ), - threshold, - ) - - -class EditDistanceValueMatcher(PolyFuzzValueMatcher): - """ - Value matching algorithm based on the edit distance between values. - """ - - def __init__( - self, - scorer: Callable[[str, str], float] = fuzz.ratio, - n_jobs: int = -1, - threshold: float = VALUE_MATCHING_THRESHOLD, - ): - # Return scores between 0 and 1 - normalized_scorer = lambda str1, str2: scorer(str1, str2) / 100.0 - super().__init__( - PolyFuzz( - method=EditDistance( - n_jobs=n_jobs, scorer=normalized_scorer, normalize=False - ) - ), - threshold, - ) - - -class EmbeddingValueMatcher(PolyFuzzValueMatcher): - """ - Value matching algorithm based on the cosine similarity of value embeddings. - """ - - def __init__( - self, - model_name: str = "bert-base-multilingual-cased", - threshold: float = VALUE_MATCHING_THRESHOLD, - top_k: int = 1, - cosine_method: str = "sparse", - ): - embeddings = TransformerWordEmbeddings(model_name) - method = Embeddings( - embeddings, - min_similarity=threshold, - top_n=top_k, - cosine_method=cosine_method, - ) - super().__init__(PolyFuzz(method), threshold) - - -class FastTextValueMatcher(PolyFuzzValueMatcher): - """ - Value matching algorithm based on the cosine similarity of FastText embeddings. 
- """ - - def __init__( - self, - model_name: str = "en-crawl", - threshold: float = VALUE_MATCHING_THRESHOLD, - top_k: int = 1, - cosine_method: str = "sparse", - ): - embeddings = WordEmbeddings(model_name) - method = Embeddings( - embeddings, - min_similarity=threshold, - top_n=top_k, - cosine_method=cosine_method, - ) - super().__init__(PolyFuzz(method), threshold) - - -class GPTValueMatcher(BaseValueMatcher): - def __init__( - self, - threshold: float = VALUE_MATCHING_THRESHOLD, - ): - self.client = OpenAI() - self.threshold = threshold - - def match( - self, - source_values: List[str], - target_values: List[str], - ) -> List[ValueMatch]: - target_values_set = set(target_values) - matches = [] - - for source_value in source_values: - completion = self.client.chat.completions.create( - model="gpt-4-turbo-preview", - messages=[ - { - "role": "system", - "content": "You are an intelligent system that given a term, you have to choose a value from a list that best matches the term. " - "These terms belong to the medical domain, and the list contains terms in the Genomics Data Commons (GDC) format.", - }, - { - "role": "user", - "content": f'For the term: "{source_value}", choose a value from this list {target_values}. ' - "Return the value from the list with a similarity score, between 0 and 1, with 1 indicating the highest similarity. " - "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. " - 'Only provide a Python dictionary. For example {"term": "term from the list", "score": 0.8}.', - }, - ], - ) - - response_message = completion.choices[0].message.content - try: - response_dict = ast.literal_eval(response_message) - target_value = response_dict["term"] - score = float(response_dict["score"]) - if target_value in target_values_set and score >= self.threshold: - matches.append(ValueMatch(source_value, target_value, score)) - except: - print( - f'Errors parsing response for "{source_value}": {response_message}' - ) - - return matches - - -class AutoFuzzyJoinValueMatcher(BaseValueMatcher): - def __init__( - self, - threshold: float = VALUE_MATCHING_THRESHOLD, - ): - self.threshold = threshold - - def match( - self, - source_values: List[str], - target_values: List[str], - ) -> List[ValueMatch]: - - source_values = sorted(list(set(source_values))) - target_values = sorted(list(set(target_values))) - - df_source_values = pd.DataFrame( - {"id": range(1, len(source_values) + 1), "title": source_values} - ) - df_target_values = pd.DataFrame( - {"id": range(1, len(target_values) + 1), "title": target_values} - ) - - matches = [] - try: - autofj = AutoFJ( - precision_target=self.threshold, - join_function_space="autofj_md", - verbose=True, - ) - LR_joins = autofj.join(df_source_values, df_target_values, id_column="id") - if len(LR_joins) > 0: - for _, row in LR_joins.iterrows(): - title_l = row["title_l"] - title_r = row["title_r"] - similarity = ratio(title_l, title_r) - if similarity >= self.threshold: - matches.append(ValueMatch(title_l, title_r, similarity)) - except Exception as e: - return matches - return matches diff --git a/bdikit/mapping_algorithms/value_mapping/value_mappers.py b/bdikit/mapping_functions.py similarity index 100% rename from bdikit/mapping_algorithms/value_mapping/value_mappers.py rename to bdikit/mapping_functions.py diff --git a/bdikit/mapping_algorithms/__init__.py b/bdikit/schema_matching/__init__.py similarity index 100% rename from bdikit/mapping_algorithms/__init__.py rename to bdikit/schema_matching/__init__.py diff --git 
a/bdikit/mapping_algorithms/column_mapping/__init__.py b/bdikit/schema_matching/best/__init__.py similarity index 100% rename from bdikit/mapping_algorithms/column_mapping/__init__.py rename to bdikit/schema_matching/best/__init__.py diff --git a/bdikit/schema_matching/best/base.py b/bdikit/schema_matching/best/base.py new file mode 100644 index 00000000..4be62cdd --- /dev/null +++ b/bdikit/schema_matching/best/base.py @@ -0,0 +1,15 @@ +import pandas as pd +from typing import Dict + + +class BaseSchemaMatcher: + def map(self, source: pd.DataFrame, target: pd.DataFrame) -> Dict[str, str]: + raise NotImplementedError("Subclasses must implement this method") + + def _fill_missing_matches( + self, dataset: pd.DataFrame, matches: Dict[str, str] + ) -> Dict[str, str]: + for column in dataset.columns: + if column not in matches: + matches[column] = "" + return matches diff --git a/bdikit/schema_matching/best/contrastivelearning.py b/bdikit/schema_matching/best/contrastivelearning.py new file mode 100644 index 00000000..d0c172ac --- /dev/null +++ b/bdikit/schema_matching/best/contrastivelearning.py @@ -0,0 +1,18 @@ +import pandas as pd +from bdikit.schema_matching.best.base import BaseSchemaMatcher +from bdikit.models.contrastive_learning.cl_api import DEFAULT_CL_MODEL +from bdikit.schema_matching.topk.contrastivelearning import CLTopkSchemaMatcher + + +class ContrastiveLearningSchemaMatcher(BaseSchemaMatcher): + def __init__(self, model_name: str = DEFAULT_CL_MODEL): + self.topk_matcher = CLTopkSchemaMatcher(model_name=model_name) + + def map(self, source: pd.DataFrame, target: pd.DataFrame): + topk_matches = self.topk_matcher.get_recommendations(source, target, top_k=1) + matches = {} + for column, top_k_match in zip(source.columns, topk_matches): + candidate = top_k_match["top_k_columns"][0][0] + if candidate in target.columns: + matches[column] = candidate + return self._fill_missing_matches(source, matches) diff --git a/bdikit/schema_matching/best/gpt.py b/bdikit/schema_matching/best/gpt.py new file mode 100644 index 00000000..68d27803 --- /dev/null +++ b/bdikit/schema_matching/best/gpt.py @@ -0,0 +1,52 @@ +import pandas as pd +from openai import OpenAI +from bdikit.schema_matching.best.base import BaseSchemaMatcher + + +class GPTSchemaMatcher(BaseSchemaMatcher): + def __init__(self): + self.client = OpenAI() + + def map(self, source: pd.DataFrame, target: pd.DataFrame): + target_columns = target.columns + labels = ", ".join(target_columns) + candidate_columns = source.columns + mappings = {} + for column in candidate_columns: + col = source[column] + values = col.drop_duplicates().dropna() + if len(values) > 15: + rows = values.sample(15).tolist() + else: + rows = values.tolist() + serialized_input = f"{column}: {', '.join([str(row) for row in rows])}" + context = serialized_input.lower() + column_types = self.get_column_type(context, labels) + for column_type in column_types: + if column_type in target_columns: + mappings[column] = column_type + break + return self._fill_missing_matches(source, mappings) + + def get_column_type( + self, context: str, labels: str, m: int = 10, model: str = "gpt-4-turbo-preview" + ): + messages = [ + {"role": "system", "content": "You are an assistant for column matching."}, + { + "role": "user", + "content": """ Please select the top """ + + str(m) + + """ class from """ + + labels + + """ which best describes the context. The context is defined by the column name followed by its respective values. 
Please respond only with the name of the classes separated by semicolon.
+            \n CONTEXT: """
+            + context
+            + """ \n RESPONSE: \n""",
+            },
+        ]
+        col_type = self.client.chat.completions.create(
+            model=model, messages=messages, temperature=0.3
+        )
+        col_type_content = col_type.choices[0].message.content
+        return col_type_content.split(";")
diff --git a/bdikit/schema_matching/best/matcher_factory.py b/bdikit/schema_matching/best/matcher_factory.py
new file mode 100644
index 00000000..b292245f
--- /dev/null
+++ b/bdikit/schema_matching/best/matcher_factory.py
@@ -0,0 +1,63 @@
+import importlib
+from enum import Enum
+from typing import Mapping, Any
+from bdikit.schema_matching.best.base import BaseSchemaMatcher
+
+
+class SchemaMatchers(Enum):
+    SIMFLOOD = (
+        "similarity_flooding",
+        "bdikit.schema_matching.best.valentine.SimFloodSchemaMatcher",
+    )
+    COMA = (
+        "coma",
+        "bdikit.schema_matching.best.valentine.ComaSchemaMatcher",
+    )
+    CUPID = (
+        "cupid",
+        "bdikit.schema_matching.best.valentine.CupidSchemaMatcher",
+    )
+    DISTRIBUTION_BASED = (
+        "distribution_based",
+        "bdikit.schema_matching.best.valentine.DistributionBasedSchemaMatcher",
+    )
+    JACCARD_DISTANCE = (
+        "jaccard_distance",
+        "bdikit.schema_matching.best.valentine.JaccardSchemaMatcher",
+    )
+    GPT = ("gpt", "bdikit.schema_matching.best.gpt.GPTSchemaMatcher")
+    CT_LEARNING = (
+        "ct_learning",
+        "bdikit.schema_matching.best.contrastivelearning.ContrastiveLearningSchemaMatcher",
+    )
+    TWO_PHASE = (
+        "two_phase",
+        "bdikit.schema_matching.best.twophase.TwoPhaseSchemaMatcher",
+    )
+    MAX_VAL_SIM = (
+        "max_val_sim",
+        "bdikit.schema_matching.best.maxvalsim.MaxValSimSchemaMatcher",
+    )
+
+    def __init__(self, matcher_name: str, matcher_path: str):
+        self.matcher_name = matcher_name
+        self.matcher_path = matcher_path
+
+    @staticmethod
+    def get_matcher(
+        matcher_name: str, **matcher_kwargs: Mapping[str, Any]
+    ) -> BaseSchemaMatcher:
+        if matcher_name not in matchers:
+            names = ", ".join(list(matchers.keys()))
+            raise ValueError(
+                f"The {matcher_name} algorithm is not supported. "
+                f"Supported algorithms are: {names}"
+            )
+        # Load the class dynamically
+        module_path, class_name = matchers[matcher_name].rsplit(".", 1)
+        module = importlib.import_module(module_path)
+
+        return getattr(module, class_name)(**matcher_kwargs)
+
+
+matchers = {method.matcher_name: method.matcher_path for method in SchemaMatchers}
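The factory resolves a registered name to a dotted import path and loads the class only when requested, so heavyweight optional dependencies are imported on demand. A minimal usage sketch, assuming the factory module above; `use_instances` is one of `ComaSchemaMatcher`'s real parameters, while `source_df` and `target_df` are placeholder DataFrames:

```
from bdikit.schema_matching.best.matcher_factory import SchemaMatchers

# "coma" resolves to bdikit.schema_matching.best.valentine.ComaSchemaMatcher;
# keyword arguments are forwarded to the matcher's constructor.
matcher = SchemaMatchers.get_matcher("coma", use_instances=True)
mapping = matcher.map(source_df, target_df)  # {source column -> target column}
```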
" + "Must be a subclass of {BaseTopkColumnMatcher.__name__}" + ) + + if value_matcher is None: + self.value_matcher = TFIDFValueMatcher() + elif isinstance(value_matcher, BaseValueMatcher): + self.value_matcher = value_matcher + else: + raise ValueError( + f"Invalid value_matcher type: {type(value_matcher)}. " + "Must be a subclass of {BaseValueMatcher.__name__}" + ) + + self.top_k = top_k + + def unique_string_values(self, column: pd.Series) -> pd.Series: + column = column.dropna() + if pd.api.types.is_string_dtype(column): + return pd.Series(column.unique(), name=column.name) + else: + return pd.Series(column.unique().astype(str), name=column.name) + + def map( + self, + source: pd.DataFrame, + target: pd.DataFrame, + ): + topk_column_matches = self.api.get_recommendations(source, target, self.top_k) + + matches = {} + for source_column_name, scope in zip(source.columns, topk_column_matches): + + source_column_name = scope["source_column"] + top_k_columns = scope["top_k_columns"] + + source_column = source[source_column_name] + + if not pd.api.types.is_string_dtype(source_column): + matches[source_column_name] = top_k_columns[0].column_name + continue + + source_values = self.unique_string_values(source_column).to_list() + + scores = [] + for top_column in top_k_columns: + target_column_name = top_column.column_name + target_column = target[target_column_name] + target_values = self.unique_string_values(target_column).to_list() + value_matches = self.value_matcher.match(source_values, target_values) + score = sum([m.similarity for m in value_matches]) / len(target_values) + score = (top_column.score + score) / 2.0 + scores.append((source_column_name, target_column_name, score)) + + sorted_columns = sorted(scores, key=lambda it: it[2], reverse=True) + + matches[source_column_name] = sorted_columns[0][1] + + return self._fill_missing_matches(source, matches) diff --git a/bdikit/schema_matching/best/twophase.py b/bdikit/schema_matching/best/twophase.py new file mode 100644 index 00000000..815d4f3e --- /dev/null +++ b/bdikit/schema_matching/best/twophase.py @@ -0,0 +1,49 @@ +import pandas as pd +from typing import Optional +from bdikit.schema_matching.best.base import BaseSchemaMatcher +from bdikit.schema_matching.best.valentine import SimFloodSchemaMatcher +from bdikit.models.contrastive_learning.cl_api import DEFAULT_CL_MODEL +from bdikit.schema_matching.topk.base import BaseTopkSchemaMatcher +from bdikit.schema_matching.topk.contrastivelearning import CLTopkSchemaMatcher + + +class TwoPhaseSchemaMatcher(BaseSchemaMatcher): + def __init__( + self, + top_k: int = 20, + top_k_matcher: Optional[BaseTopkSchemaMatcher] = None, + schema_matcher: BaseSchemaMatcher = SimFloodSchemaMatcher(), + ): + if top_k_matcher is None: + self.api = CLTopkSchemaMatcher(DEFAULT_CL_MODEL) + elif isinstance(top_k_matcher, BaseTopkSchemaMatcher): + self.api = top_k_matcher + else: + raise ValueError( + f"Invalid top_k_matcher type: {type(top_k_matcher)}. 
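`MaxValSimSchemaMatcher` re-ranks the top-k embedding candidates by how well their values overlap: each candidate's final score averages the embedding score with the mean value-match similarity. A hedged usage sketch with the constructor arguments defined above; `source_df` and `target_df` are placeholder DataFrames:

```
from bdikit.schema_matching.best.maxvalsim import MaxValSimSchemaMatcher
from bdikit.value_matching.polyfuzz import EditDistanceValueMatcher

# Swap in an edit-distance value matcher instead of the default TF-IDF one.
matcher = MaxValSimSchemaMatcher(top_k=10, value_matcher=EditDistanceValueMatcher())
mapping = matcher.map(source_df, target_df)
```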
" + "Must be a subclass of {BaseTopkColumnMatcher.__name__}" + ) + + self.schema_matcher = schema_matcher + self.top_k = top_k + + def map( + self, + source: pd.DataFrame, + target: pd.DataFrame, + ): + topk_column_matches = self.api.get_recommendations(source, target, self.top_k) + + matches = {} + for column, scope in zip(source.columns, topk_column_matches): + candidates = [ + cand[0] for cand in scope["top_k_columns"] if cand[0] in target.columns + ] + reduced_source = source[[column]] + reduced_target = target[candidates] + partial_matches = self.schema_matcher.map(reduced_source, reduced_target) + + if column in partial_matches: + matches[column] = partial_matches[column] + + return self._fill_missing_matches(source, matches) diff --git a/bdikit/schema_matching/best/valentine.py b/bdikit/schema_matching/best/valentine.py new file mode 100644 index 00000000..67f8fc25 --- /dev/null +++ b/bdikit/schema_matching/best/valentine.py @@ -0,0 +1,106 @@ +import pandas as pd +from typing import Dict, Callable +from bdikit.schema_matching.best.base import BaseSchemaMatcher +from valentine import valentine_match +from valentine.algorithms.matcher_results import MatcherResults +from valentine.algorithms.jaccard_distance import StringDistanceFunction +from valentine.algorithms import ( + SimilarityFlooding, + Coma, + Cupid, + DistributionBased, + JaccardDistanceMatcher, + BaseMatcher, +) + + +class ValentineSchemaMatcher(BaseSchemaMatcher): + def __init__(self, matcher: BaseMatcher): + self.matcher = matcher + + def map(self, source: pd.DataFrame, target: pd.DataFrame) -> Dict[str, str]: + matches: MatcherResults = valentine_match(source, target, self.matcher) + mappings = {} + for match in matches.one_to_one(): + source_candidate = match[0][1] + target_candidate = match[1][1] + mappings[source_candidate] = target_candidate + return self._fill_missing_matches(source, mappings) + + +class SimFloodSchemaMatcher(ValentineSchemaMatcher): + def __init__( + self, coeff_policy: str = "inverse_average", formula: str = "formula_c" + ): + super().__init__(SimilarityFlooding(coeff_policy=coeff_policy, formula=formula)) + + +class ComaSchemaMatcher(ValentineSchemaMatcher): + def __init__( + self, max_n: int = 0, use_instances: bool = False, java_xmx: str = "1024m" + ): + super().__init__( + Coma(max_n=max_n, use_instances=use_instances, java_xmx=java_xmx) + ) + + +class CupidSchemaMatcher(ValentineSchemaMatcher): + def __init__( + self, + leaf_w_struct: float = 0.2, + w_struct: float = 0.2, + th_accept: float = 0.7, + th_high: float = 0.6, + th_low: float = 0.35, + c_inc: float = 1.2, + c_dec: float = 0.9, + th_ns: float = 0.7, + parallelism: int = 1, + ): + super().__init__( + Cupid( + leaf_w_struct=leaf_w_struct, + w_struct=w_struct, + th_accept=th_accept, + th_high=th_high, + th_low=th_low, + c_inc=c_inc, + c_dec=c_dec, + th_ns=th_ns, + parallelism=parallelism, + ) + ) + + +class DistributionBasedSchemaMatcher(ValentineSchemaMatcher): + def __init__( + self, + threshold1: float = 0.15, + threshold2: float = 0.15, + quantiles: int = 256, + process_num: int = 1, + ): + super().__init__( + DistributionBased( + threshold1=threshold1, + threshold2=threshold2, + quantiles=quantiles, + process_num=process_num, + ) + ) + + +class JaccardSchemaMatcher(ValentineSchemaMatcher): + def __init__( + self, + threshold_dist: float = 0.8, + distance_fun: Callable[[str, str], float] = StringDistanceFunction.Levenshtein, + process_num: int = 1, + ): + super().__init__( + JaccardDistanceMatcher( + threshold_dist=threshold_dist, 
+ distance_fun=distance_fun, + process_num=process_num, + ) + ) diff --git a/bdikit/schema_matching/topk/__init__.py b/bdikit/schema_matching/topk/__init__.py new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/bdikit/schema_matching/topk/__init__.py @@ -0,0 +1 @@ + diff --git a/bdikit/schema_matching/topk/base.py b/bdikit/schema_matching/topk/base.py new file mode 100644 index 00000000..87725f08 --- /dev/null +++ b/bdikit/schema_matching/topk/base.py @@ -0,0 +1,21 @@ +from abc import ABCMeta, abstractmethod +from typing import List, NamedTuple, TypedDict +import pandas as pd + + +class ColumnScore(NamedTuple): + column_name: str + score: float + + +class TopkMatching(TypedDict): + source_column: str + top_k_columns: List[ColumnScore] + + +class BaseTopkSchemaMatcher(metaclass=ABCMeta): + @abstractmethod + def get_recommendations( + self, source: pd.DataFrame, target: pd.DataFrame, top_k: int + ) -> List[TopkMatching]: + pass diff --git a/bdikit/mapping_algorithms/column_mapping/topk_matchers.py b/bdikit/schema_matching/topk/contrastivelearning.py similarity index 77% rename from bdikit/mapping_algorithms/column_mapping/topk_matchers.py rename to bdikit/schema_matching/topk/contrastivelearning.py index 8b670650..a014bb49 100644 --- a/bdikit/mapping_algorithms/column_mapping/topk_matchers.py +++ b/bdikit/schema_matching/topk/contrastivelearning.py @@ -1,7 +1,11 @@ -from abc import ABCMeta, abstractmethod -from typing import List, NamedTuple, TypedDict import pandas as pd import numpy as np +from typing import List +from bdikit.schema_matching.topk.base import ( + ColumnScore, + TopkMatching, + BaseTopkSchemaMatcher, +) from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances from bdikit.models.contrastive_learning.cl_api import ( ContrastiveLearningAPI, @@ -10,25 +14,7 @@ from bdikit.models import ColumnEmbedder -class ColumnScore(NamedTuple): - column_name: str - score: float - - -class TopkMatching(TypedDict): - source_column: str - top_k_columns: List[ColumnScore] - - -class TopkColumnMatcher(metaclass=ABCMeta): - @abstractmethod - def get_recommendations( - self, source: pd.DataFrame, target: pd.DataFrame, top_k: int - ) -> List[TopkMatching]: - pass - - -class EmbeddingSimilarityTopkColumnMatcher(TopkColumnMatcher): +class EmbeddingSimilarityTopkSchemaMatcher(BaseTopkSchemaMatcher): def __init__(self, column_embedder: ColumnEmbedder, metric: str = "cosine"): self.api = column_embedder self.metric = metric @@ -68,7 +54,7 @@ def get_recommendations( return top_k_results -class CLTopkColumnMatcher(EmbeddingSimilarityTopkColumnMatcher): +class CLTopkSchemaMatcher(EmbeddingSimilarityTopkSchemaMatcher): def __init__(self, model_name: str = DEFAULT_CL_MODEL, metric: str = "cosine"): super().__init__( column_embedder=ContrastiveLearningAPI(model_name=model_name), metric=metric diff --git a/bdikit/schema_matching/topk/matcher_factory.py b/bdikit/schema_matching/topk/matcher_factory.py new file mode 100644 index 00000000..e2567876 --- /dev/null +++ b/bdikit/schema_matching/topk/matcher_factory.py @@ -0,0 +1,34 @@ +import importlib +from enum import Enum +from typing import Mapping, Any +from bdikit.schema_matching.topk.base import BaseTopkSchemaMatcher + + +class TopkMatchers(Enum): + CT_LEARNING = ( + "ct_learning", + "bdikit.schema_matching.topk.contrastivelearning.CLTopkSchemaMatcher", + ) + + def __init__(self, matcher_name: str, matcher_path: str): + self.matcher_name = matcher_name + self.matcher_path = matcher_path + + @staticmethod + def get_matcher( + 
matcher_name: str, **matcher_kwargs: Mapping[str, Any]
+    ) -> BaseTopkSchemaMatcher:
+        if matcher_name not in matchers:
+            names = ", ".join(list(matchers.keys()))
+            raise ValueError(
+                f"The {matcher_name} algorithm is not supported. "
+                f"Supported algorithms are: {names}"
+            )
+        # Load the class dynamically
+        module_path, class_name = matchers[matcher_name].rsplit(".", 1)
+        module = importlib.import_module(module_path)
+
+        return getattr(module, class_name)(**matcher_kwargs)
+
+
+matchers = {method.matcher_name: method.matcher_path for method in TopkMatchers}
diff --git a/bdikit/mapping_algorithms/value_mapping/__init__.py b/bdikit/value_matching/__init__.py
similarity index 100%
rename from bdikit/mapping_algorithms/value_mapping/__init__.py
rename to bdikit/value_matching/__init__.py
diff --git a/bdikit/value_matching/base.py b/bdikit/value_matching/base.py
new file mode 100644
index 00000000..f8124bf8
--- /dev/null
+++ b/bdikit/value_matching/base.py
@@ -0,0 +1,37 @@
+from typing import List, NamedTuple, TypedDict, Set
+
+
+class ValueMatch(NamedTuple):
+    """
+    Represents a match between a source value and a target value with a
+    similarity score.
+    """
+
+    source_value: str
+    target_value: str
+    similarity: float
+
+
+class ValueMatchingResult(TypedDict):
+    """
+    Represents the result of a value matching operation.
+    """
+
+    source: str
+    target: str
+    matches: List[ValueMatch]
+    coverage: float
+    unique_values: Set[str]
+    unmatch_values: Set[str]
+
+
+class BaseValueMatcher:
+    """
+    Base class for value matching algorithms, i.e., algorithms that match
+    values from a source domain to values from a target domain.
+    """
+
+    def match(
+        self, source_values: List[str], target_values: List[str]
+    ) -> List[ValueMatch]:
+        raise NotImplementedError("Subclasses must implement this method")
diff --git a/bdikit/value_matching/gpt.py b/bdikit/value_matching/gpt.py
new file mode 100644
index 00000000..7fcde3e6
--- /dev/null
+++ b/bdikit/value_matching/gpt.py
@@ -0,0 +1,55 @@
+import ast
+from typing import List
+from openai import OpenAI
+from bdikit.value_matching.base import BaseValueMatcher, ValueMatch
+from bdikit.config import VALUE_MATCHING_THRESHOLD
+
+
+class GPTValueMatcher(BaseValueMatcher):
+    def __init__(
+        self,
+        threshold: float = VALUE_MATCHING_THRESHOLD,
+    ):
+        self.client = OpenAI()
+        self.threshold = threshold
+
+    def match(
+        self,
+        source_values: List[str],
+        target_values: List[str],
+    ) -> List[ValueMatch]:
+        target_values_set = set(target_values)
+        matches = []
+
+        for source_value in source_values:
+            completion = self.client.chat.completions.create(
+                model="gpt-4-turbo-preview",
+                messages=[
+                    {
+                        "role": "system",
+                        "content": "You are an intelligent system that given a term, you have to choose a value from a list that best matches the term. "
+                        "These terms belong to the medical domain, and the list contains terms in the Genomics Data Commons (GDC) format.",
+                    },
+                    {
+                        "role": "user",
+                        "content": f'For the term: "{source_value}", choose a value from this list {target_values}. '
+                        "Return the value from the list with a similarity score, between 0 and 1, with 1 indicating the highest similarity. "
+                        "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. "
+                        'Only provide a Python dictionary. 
For example {"term": "term from the list", "score": 0.8}.', + }, + ], + ) + + response_message = completion.choices[0].message.content + try: + response_dict = ast.literal_eval(response_message) + target_value = response_dict["term"] + score = float(response_dict["score"]) + if target_value in target_values_set and score >= self.threshold: + matches.append(ValueMatch(source_value, target_value, score)) + except: + print( + f'Errors parsing response for "{source_value}": {response_message}' + ) + + return matches diff --git a/bdikit/value_matching/matcher_factory.py b/bdikit/value_matching/matcher_factory.py new file mode 100644 index 00000000..b121830f --- /dev/null +++ b/bdikit/value_matching/matcher_factory.py @@ -0,0 +1,44 @@ +import importlib +from enum import Enum +from typing import Mapping, Any +from bdikit.value_matching.base import BaseValueMatcher + + +class ValueMatchers(Enum): + TFIDF = ("tfidf", "bdikit.value_matching.polyfuzz.TFIDFValueMatcher") + EDIT = ( + "edit_distance", + "bdikit.value_matching.polyfuzz.EditDistanceValueMatcher", + ) + EMBEDDINGS = ( + "embedding", + "bdikit.value_matching.polyfuzz.EmbeddingValueMatcher", + ) + FASTTEXT = ( + "fasttext", + "bdikit.value_matching.polyfuzz.FastTextValueMatcher", + ) + GPT = ("gpt", "bdikit.value_matching.gpt.GPTValueMatcher") + + def __init__(self, matcher_name: str, matcher_path: str): + self.matcher_name = matcher_name + self.matcher_path = matcher_path + + @staticmethod + def get_matcher( + matcher_name: str, **matcher_kwargs: Mapping[str, Any] + ) -> BaseValueMatcher: + if matcher_name not in matchers: + names = ", ".join(list(matchers.keys())) + raise ValueError( + f"The {matcher_name} algorithm is not supported. " + f"Supported algorithms are: {names}" + ) + # Load the class dynamically + module_path, class_name = matchers[matcher_name].rsplit(".", 1) + module = importlib.import_module(module_path) + + return getattr(module, class_name)(**matcher_kwargs) + + +matchers = {method.matcher_name: method.matcher_path for method in ValueMatchers} diff --git a/bdikit/value_matching/polyfuzz.py b/bdikit/value_matching/polyfuzz.py new file mode 100644 index 00000000..8ead0d2d --- /dev/null +++ b/bdikit/value_matching/polyfuzz.py @@ -0,0 +1,141 @@ +import flair +import torch +from rapidfuzz import fuzz +from polyfuzz import PolyFuzz +from typing import List, Callable, Tuple +from bdikit.value_matching.base import BaseValueMatcher, ValueMatch +from polyfuzz.models import EditDistance, TFIDF, Embeddings +from flair.embeddings import TransformerWordEmbeddings, WordEmbeddings +from bdikit.config import get_device, VALUE_MATCHING_THRESHOLD + + +flair.device = torch.device(get_device()) + + +class PolyFuzzValueMatcher(BaseValueMatcher): + """ + Base class for value matching algorithms based on the PolyFuzz library. 
+ """ + + def __init__(self, polyfuzz_model: PolyFuzz, threshold: float): + self.model = polyfuzz_model + self.threshold = threshold + + def match( + self, + source_values: List[str], + target_values: List[str], + ) -> List[ValueMatch]: + + self.model.match(source_values, target_values) + match_results = self.model.get_matches() + match_results.sort_values(by="Similarity", ascending=False, inplace=True) + + matches = [] + for _, row in match_results.iterrows(): + source = row[0] + top_matches = row[1:] + indexes = range(0, len(top_matches) - 1, 2) + + for index in indexes: + target = top_matches[index] + similarity = top_matches[index + 1] + if similarity >= self.threshold: + matches.append((source, target, similarity)) + + return matches + + +class TFIDFValueMatcher(PolyFuzzValueMatcher): + """ + Value matching algorithm based on the TF-IDF similarity between values. + """ + + def __init__( + self, + n_gram_range: Tuple[int, int] = (1, 3), + clean_string: bool = True, + threshold: float = VALUE_MATCHING_THRESHOLD, + top_k: int = 1, + cosine_method: str = "sparse", + ): + + super().__init__( + PolyFuzz( + method=TFIDF( + n_gram_range=n_gram_range, + clean_string=clean_string, + min_similarity=threshold, + top_n=top_k, + cosine_method=cosine_method, + ) + ), + threshold, + ) + + +class EditDistanceValueMatcher(PolyFuzzValueMatcher): + """ + Value matching algorithm based on the edit distance between values. + """ + + def __init__( + self, + scorer: Callable[[str, str], float] = fuzz.ratio, + n_jobs: int = -1, + threshold: float = VALUE_MATCHING_THRESHOLD, + ): + # Return scores between 0 and 1 + normalized_scorer = lambda str1, str2: scorer(str1, str2) / 100.0 + super().__init__( + PolyFuzz( + method=EditDistance( + n_jobs=n_jobs, scorer=normalized_scorer, normalize=False + ) + ), + threshold, + ) + + +class EmbeddingValueMatcher(PolyFuzzValueMatcher): + """ + Value matching algorithm based on the cosine similarity of value embeddings. + """ + + def __init__( + self, + model_name: str = "bert-base-multilingual-cased", + threshold: float = VALUE_MATCHING_THRESHOLD, + top_k: int = 1, + cosine_method: str = "sparse", + ): + embeddings = TransformerWordEmbeddings(model_name) + method = Embeddings( + embeddings, + min_similarity=threshold, + top_n=top_k, + cosine_method=cosine_method, + ) + super().__init__(PolyFuzz(method), threshold) + + +class FastTextValueMatcher(PolyFuzzValueMatcher): + """ + Value matching algorithm based on the cosine similarity of FastText embeddings. 
+    """
+
+    def __init__(
+        self,
+        model_name: str = "en-crawl",
+        threshold: float = VALUE_MATCHING_THRESHOLD,
+        top_k: int = 1,
+        cosine_method: str = "sparse",
+    ):
+        embeddings = WordEmbeddings(model_name)
+        method = Embeddings(
+            embeddings,
+            min_similarity=threshold,
+            top_n=top_k,
+            cosine_method=cosine_method,
+        )
+        super().__init__(PolyFuzz(method), threshold)
diff --git a/docs/source/examples.rst b/docs/source/examples.rst
index 5ae0131d..9096005a 100644
--- a/docs/source/examples.rst
+++ b/docs/source/examples.rst
@@ -6,4 +6,3 @@ Here can find different Jupyter notebook examples about how to use `bdi-kit`:
 - `Changing the parameters of the matching methods `__
 - `Getting the top-k value matches `__
 - `Analyzing one attribute/column at a time `__
-
diff --git a/examples/top_k_matches.ipynb b/examples/top_k_matches.ipynb
new file mode 100644
index 00000000..4a13fad7
--- /dev/null
+++ b/examples/top_k_matches.ipynb
@@ -0,0 +1,3430 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Getting Top-K Value Matches"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    [… display_data outputs omitted: Bokeh/Panel/HoloViews JavaScript loader boilerplate that initializes the notebook's interactive table widgets …]
+   ],
+   "source": [
+    "import bdikit as bdi\n",
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In this example, we are mapping data from Dou et al. (https://pubmed.ncbi.nlm.nih.gov/37567170/) to the GDC format."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "    Race               Ethnicity FIGO_stage\n",
+       "0  White  Not-Hispanic or Latino         IA\n",
+       "1  White  Not-Hispanic or Latino         IA\n",
+       "2  White  Not-Hispanic or Latino         IA\n",
+       "3    NaN                     NaN        NaN\n",
+       "4  White  Not-Hispanic or Latino         IA\n",
+       "5  White  Not-Hispanic or Latino         IA\n",
+       "6  White  Not-Hispanic or Latino         IA\n",
+       "7  White  Not-Hispanic or Latino         IA\n",
+       "8  White  Not-Hispanic or Latino       IIIA\n",
+       "9  White  Not-Hispanic or Latino         IA"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "dataset = pd.read_csv(\"./datasets/dou.csv\")\n",
+    "columns = [\n",
+    "    \"Race\",\n",
+    "    \"Ethnicity\",\n",
+    "    \"FIGO_stage\",\n",
+    "]\n",
+    "\n",
+    "dataset[columns].head(10)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We can send a `Tuple (source column, target column)` as a parameter to the function `top_value_matches()`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "column_mapping = (\"FIGO_stage\", \"figo_stage\")\n",
+    "\n",
+    "value_mappings = bdi.top_value_matches(\n",
+    "    dataset,\n",
+    "    target=\"gdc\",\n",
+    "    column_mapping=column_mapping,\n",
+    "    top_k=5\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/markdown": [
+       "**Source column:** FIGO_stage\n",
+       "**Target column:** figo_stage\n"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    [… interactive `Tabulator` table outputs omitted: one table per source value of FIGO_stage, listing its top-5 candidate matches (columns: source, target, similarity) …]
+   ],
+   "source": [
+    "bdi.view_value_matches(value_mappings)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We can also send a `DataFrame` of column mappings as a parameter to `top_value_matches()`:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "       source      target\n",
+       "0  FIGO_stage  figo_stage\n",
+       "1   Ethnicity   ethnicity\n",
+       "2        Race        race"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "column_mappings = bdi.match_schema(dataset[columns], target=\"gdc\", method=\"coma\")\n",
+    "column_mappings"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "value_mappings = bdi.top_value_matches(\n",
+    "    dataset,\n",
+    "    target=\"gdc\",\n",
+    "    column_mapping=column_mappings,\n",
+    "    top_k=5\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/markdown": [
+       "**Source column:** FIGO_stage\n",
+       "**Target column:** figo_stage\n"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    [… interactive `Tabulator` table outputs omitted: top-5 candidate matches per source value …],
+    [… analogous markdown headers and `Tabulator` outputs omitted for the Ethnicity -> ethnicity and Race -> race mappings …]
+   ],
+   "source": [
+    "bdi.view_value_matches(value_mappings)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/tests/test_api.py b/tests/test_api.py
index f4ad733c..1c5f9d43 100644
--- a/tests/test_api.py
+++ b/tests/test_api.py
@@ -2,7 +2,7 @@
 import numpy as np
 import pandas as pd
 import numpy as np
-from bdikit.mapping_algorithms.value_mapping.value_mappers import (
+from bdikit.mapping_functions import (
     FunctionValueMapper,
     IdentityValueMapper,
 )
@@ -361,27 +361,35 @@ def test_top_value_matches():
         }
     )
     column_mapping = ("fruits", "fruit_names")
-    # when
     matches = bdi.top_value_matches(df_source, df_target, column_mapping)

     # then
     assert len(matches) == 4  # number of dataframes in the list

+    # when
     df_match = matches[0]  # top matches for apple
+
+    # then
     assert len(df_match) == 2
     assert "source" in df_match.columns
     assert "target" in df_match.columns
     assert "similarity" in df_match.columns

+    # when
     df_match = matches[1]  # top matches for banana
+
+    # then
     assert len(df_match) == 2
     assert "source" in df_match.columns
     assert "target" in df_match.columns
     assert "similarity" in df_match.columns

+    # when
     df_match = matches[2]  # top matches for orange
+
+    # then
     assert len(df_match) == 1
     assert "source" in df_match.columns
     assert "target" in df_match.columns
-    assert "similarity" in df_match.columns
+    assert "similarity" in df_match.columns
\ No newline at end of file
diff --git a/tests/test_value_mapping.py b/tests/test_mapping_functions.py
similarity index 96%
rename from tests/test_value_mapping.py
rename to tests/test_mapping_functions.py
index 5a20f1fb..652456d3 100644
--- a/tests/test_value_mapping.py
+++ b/tests/test_mapping_functions.py
@@ -1,6 +1,6 @@
 import pandas as pd
 import numpy as np
-from bdikit.mapping_algorithms.value_mapping.value_mappers import (
+from bdikit.mapping_functions import (
     FunctionValueMapper,
     DictionaryMapper,
     IdentityValueMapper,
diff --git a/tests/test_schema_matching.py b/tests/test_schema_matching.py
index 6b637727..698beeb1 100644
--- a/tests/test_schema_matching.py
+++ b/tests/test_schema_matching.py
@@ -1,13 +1,13 @@
 import pandas as pd
-from bdikit.mapping_algorithms.column_mapping.algorithms import (
+from bdikit.schema_matching.best.valentine import (
     SimFloodSchemaMatcher,
     JaccardSchemaMatcher,
     DistributionBasedSchemaMatcher,
     ComaSchemaMatcher,
     CupidSchemaMatcher,
-    TwoPhaseSchemaMatcher,
-    ContrastiveLearningSchemaMatcher,
 )
+from bdikit.schema_matching.best.twophase import TwoPhaseSchemaMatcher
+from bdikit.schema_matching.best.contrastivelearning import ContrastiveLearningSchemaMatcher


 def test_basic_column_mapping_algorithms():
@@ -33,7 +33,7 @@ def test_basic_column_mapping_algorithms():
     )

     # when
-    mapping = column_matcher.map(dataset=table1, global_table=table2)
+    mapping = column_matcher.map(source=table1, target=table2)

     # then
     assert {
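The hunk above renames the `map()` keyword arguments from `dataset`/`global_table` to `source`/`target`. A minimal sketch of calling the updated API follows; it assumes `ComaSchemaMatcher`'s default constructor, the toy tables are hypothetical, and the exact shape of the returned mapping is inferred from the test above rather than spelled out by this patch:

```
import pandas as pd

from bdikit.schema_matching.best.valentine import ComaSchemaMatcher

# Hypothetical toy tables; any pair of DataFrames with comparable columns works.
source = pd.DataFrame({"FIGO_stage": ["IA", "IIIA"], "Race": ["White", "Asian"]})
target = pd.DataFrame({"figo_stage": ["Stage IA", "Stage IB"], "race": ["white", "asian"]})

# The restructured base API takes the input tables as `source` and `target`.
matcher = ComaSchemaMatcher()
mapping = matcher.map(source=source, target=target)
print(mapping)  # expected to pair FIGO_stage with figo_stage and Race with race
```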
diff --git a/tests/test_value_matching_algorithms.py b/tests/test_value_matching.py
similarity index 96%
rename from tests/test_value_matching_algorithms.py
rename to tests/test_value_matching.py
index c43a739b..79621c51 100644
--- a/tests/test_value_matching_algorithms.py
+++ b/tests/test_value_matching.py
@@ -1,6 +1,6 @@
 import unittest
 import pandas as pd
-from bdikit.mapping_algorithms.value_mapping.algorithms import (
+from bdikit.value_matching.polyfuzz import (
     TFIDFValueMatcher,
     EditDistanceValueMatcher,
 )
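Likewise, the relocated value matchers keep the constructor parameters and the `(source, target, similarity)` triple layout defined in `bdikit/value_matching/polyfuzz.py` above. A minimal sketch with made-up vocabularies:

```
from bdikit.value_matching.polyfuzz import TFIDFValueMatcher

# Made-up value vocabularies; in practice these come from dataset columns.
source_values = ["Not-Hispanic or Latino", "White"]
target_values = ["not hispanic or latino", "white", "asian"]

# threshold and top_k are the constructor parameters defined in the patch.
matcher = TFIDFValueMatcher(threshold=0.5, top_k=2)
for source, target, similarity in matcher.match(source_values, target_values):
    print(f"{source!r} -> {target!r} (similarity={similarity:.2f})")
```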