Skip to content

Commit

Permalink
Merge pull request #35
Browse files Browse the repository at this point in the history
  • Loading branch information
roquelopez authored May 28, 2024
2 parents fd3e27e + c6417d7 commit ec86b89
Show file tree
Hide file tree
Showing 6 changed files with 2,112 additions and 572 deletions.
6 changes: 2 additions & 4 deletions bdikit/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,10 @@ def load_dataset(self, dataset_path):

return self.dataset

# NOTE(review): this is a diff hunk rendered without +/- markers. The hunk header
# reports "2 additions & 4 deletions", so old and new lines of reduce_scope are
# interleaved below; which line belongs to which side is inferred, not shown.
# Presumably the pre-change signature (caller chose how many columns/candidates to show).
def reduce_scope(self, num_columns=5, num_candidates=5):
# Presumably the post-change signature (no display limits).
def reduce_scope(self):
# Build a scope-reducing manager from the loaded dataset and the global table,
# run the reduction, and keep both the manager and its result on the instance.
self.scope_manager = ScopeReducingManager(self.dataset, self.global_table)
self.reduced_scope = self.scope_manager.reduce()
# Presumably the pre-change call: plot with the limits and discard the result.
plot_reduce_scope(self.reduced_scope, num_columns, num_candidates)

# Presumably the pre-change return value: the raw reduced scope.
return self.reduced_scope
# Presumably the post-change return value: whatever plot_reduce_scope builds
# from the reduced scope and the dataset.
return plot_reduce_scope(self.reduced_scope, self.dataset)

def map_columns(self, algorithm='SimFloodAlgorithm'):
self.column_manager = ColumnMappingManager(self.dataset, self.global_table, algorithm)
Expand Down
35 changes: 4 additions & 31 deletions bdikit/visualization/mappings.py
Original file line number Diff line number Diff line change
@@ -1,33 +1,13 @@
import pandas as pd
from copy import deepcopy
from IPython.display import display
from bdikit.utils import get_gdc_metadata
from bdikit.visualization.scope_reducing import ScopeReducerExplorer

# A max_colwidth of None lifts pandas' column-width limit, so long cell text
# is shown in full instead of being cut off when DataFrames are displayed.
pd.set_option('display.max_colwidth', None)

def plot_reduce_scope(reduced_scope, num_columns=5, num_candidates=5, max_chars=150):
    """Print and display one table of candidate matches per source column.

    Args:
        reduced_scope: Sequence of dicts, each with a 'Candidate column' name
            and a 'Top k columns' list of (candidate_name, similarity) pairs.
        num_columns: How many entries of ``reduced_scope`` to show; ``None``
            shows all of them.
        num_candidates: How many candidates to show per column; ``None`` uses
            the length of the first entry's 'Top k columns' list.
        max_chars: Maximum length of the description and of the joined value
            sample before ``truncate_text`` shortens them.

    Candidate names are looked up in the metadata by direct indexing, so a
    name missing from it is presumably reported as a lookup error.
    """
    gdc_metadata = get_gdc_metadata()

    # Resolve the "no limit" default only when there is a first entry to
    # measure; an empty scope then simply displays nothing instead of
    # failing on reduced_scope[0].
    if num_candidates is None and reduced_scope:
        num_candidates = len(reduced_scope[0]['Top k columns'])

    table_columns = ['Candidate', 'Similarity', 'Description', 'Values (sample)']

    # Slicing with None means "no upper bound", so num_columns=None needs no
    # special handling.
    for column_data in reduced_scope[:num_columns]:
        column_name = column_data['Candidate column']
        recommendations = []
        for candidate_name, candidate_similarity in column_data['Top k columns'][:num_candidates]:
            candidate_metadata = gdc_metadata[candidate_name]
            candidate_description = truncate_text(candidate_metadata.get('description', ''), max_chars)
            candidate_values = truncate_text(', '.join(candidate_metadata.get('enum', [])), max_chars)
            recommendations.append((candidate_name, candidate_similarity, candidate_description, candidate_values))

        print(f'\n{column_name}:')
        candidates_df = pd.DataFrame(recommendations, columns=table_columns)
        display(candidates_df)

def plot_reduce_scope(reduced_scope, dataset):
    """Create a ScopeReducerExplorer for the given data and return its view.

    Args:
        reduced_scope: Scope-reduction results handed to the explorer.
        dataset: Source dataset handed to the explorer.

    Returns:
        The result of ``ScopeReducerExplorer.explore()``.
    """
    return ScopeReducerExplorer(dataset, reduced_scope).explore()

def plot_column_mappings(column_mappings):
column_mappings_df = pd.DataFrame(column_mappings.items(), columns=['Original Column', 'Target Column'])
Expand All @@ -47,10 +27,3 @@ def plot_value_mappings(value_mappings, include_unmatches=True):

matches_df = pd.DataFrame(matches, columns=['Current Value', 'Target Value', 'Similarity'])
display(matches_df)


def truncate_text(text, max_chars):
    """Cut *text* after *max_chars* characters and mark the cut with '...'.

    Text that is not longer than *max_chars* is returned unchanged.
    """
    if len(text) <= max_chars:
        return text
    return text[:max_chars] + '...'
146 changes: 144 additions & 2 deletions bdikit/visualization/scope_reducing.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,160 @@
import json
import logging

import altair as alt
import numpy as np
import pandas as pd
import panel as pn
from bdikit.utils import read_gdc_schema
from bdikit.utils import get_gdc_metadata
from Levenshtein import distance
from sklearn.cluster import AffinityPropagation

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Load the Panel extensions this module relies on. "tabulator" backs the
# pn.widgets.Tabulator table built by ScopeReducerExplorer.
# NOTE(review): "vega" is presumably what renders the Altair charts, and
# "mathjax" has no visible use in this excerpt — confirm both are needed.
pn.extension("tabulator")
pn.extension("mathjax")
pn.extension("vega")

# NOTE(review): identical to the logger assignment above; the diff view shows
# the line at both its old and new position — presumably only one survives.
logger = logging.getLogger(__name__)
def clean_reduced_scope(reduced_scope, max_chars_samples):
    """Turn scope-reduction results into one candidates table per source column.

    Args:
        reduced_scope: Iterable of dicts, each with a 'Candidate column' name
            and a 'Top k columns' list of (candidate_name, similarity) pairs.
        max_chars_samples: Maximum length of the joined value sample before
            ``truncate_text`` shortens it. Descriptions are kept in full.

    Returns:
        Dict mapping each source column name to a DataFrame with the columns
        'Candidate', 'Similarity', 'Description' and 'Values (sample)'.
    """
    gdc_metadata = get_gdc_metadata()
    table_columns = ['Candidate', 'Similarity', 'Description', 'Values (sample)']

    candidates_dfs = {}

    for column_data in reduced_scope:
        column_name = column_data['Candidate column']
        recommendations = []
        for candidate_name, candidate_similarity in column_data['Top k columns']:
            # Look the candidate up once and read both fields from the result.
            candidate_metadata = gdc_metadata[candidate_name]
            candidate_description = candidate_metadata.get('description', '')
            candidate_values = ', '.join(candidate_metadata.get('enum', []))
            candidate_values = truncate_text(candidate_values, max_chars_samples)
            recommendations.append((candidate_name, candidate_similarity, candidate_description, candidate_values))

        candidates_dfs[column_name] = pd.DataFrame(recommendations, columns=table_columns)

    return candidates_dfs


def truncate_text(text, max_chars):
    """Return *text* unchanged if it fits in *max_chars* characters.

    Longer text is cut to its first *max_chars* characters with '...'
    appended to signal the cut.
    """
    fits = len(text) <= max_chars
    return text if fits else text[:max_chars] + '...'

class ScopeReducerExplorer:
    """Interactive Panel view over the results of scope reduction.

    For a selected source column it shows a value histogram (or a short note
    when every value is unique) next to a table of candidate target columns.
    """

    def __init__(self,
                 dataset,
                 reduced_scope,
                 max_chars_samples=150,
                 height=600) -> None:
        """Store the inputs and precompute one candidates table per column.

        Args:
            dataset: DataFrame whose columns are being explored.
            reduced_scope: Sequence of dicts with 'Candidate column' and
                'Top k columns' entries; must not be empty because the first
                entry sets the candidate-slider range.
            max_chars_samples: Length limit for the value samples shown in
                the candidates table.
            height: Height passed to the candidates table widget.
        """
        self.dataset = dataset
        self.reduced_scope = reduced_scope
        self.candidates_dfs = clean_reduced_scope(reduced_scope, max_chars_samples=max_chars_samples)
        self.height = height
        # The slider's upper bound is the candidate count of the first column.
        self.max_candidates = len(reduced_scope[0]['Top k columns'])

    @staticmethod
    def _sorted_categories(values):
        """Return the non-missing entries of *values* in sorted order."""
        # pd.isna catches every missing marker (NaN, None, NaT), not only the
        # np.nan singleton that an `in` / `remove` pair would find.
        present = [value for value in values if not pd.isna(value)]
        try:
            present.sort()
        except TypeError:
            # Mixed types (e.g. str and int) are not mutually orderable;
            # fall back to ordering by their text form.
            present.sort(key=str)
        return present

    def _plot_column_histogram(self, column):
        """Build the value-distribution view for *column*.

        Returns an Altair bar chart, or a Markdown pane with a few sample
        values when every value in the column is unique.
        """
        series = self.dataset[column]
        title = 'Histogram of ' + column
        # Only the selected column is encoded, so hand the chart just that
        # column instead of the whole dataset.
        chart_data = self.dataset[[column]].fillna('Null')

        if series.dtype == 'float64':
            # NOTE(review): fillna('Null') puts a string into a float column
            # that has missing values — confirm binning still behaves.
            return alt.Chart(chart_data, height=300).mark_bar().encode(
                alt.X(column, bin=True),
                y='count()',
            ).properties(
                width="container",
                title=title
            )

        values = list(series.unique())
        if len(values) == len(series):
            # A histogram of all-distinct values carries no information.
            return pn.pane.Markdown(f'Values are unique.\nSome samples: {values[:5]}')

        return alt.Chart(chart_data, height=300).mark_bar().encode(
            x=alt.X(
                column + ":N",
                sort=self._sorted_categories(values),
            ),
            y="count()",
        ).properties(
            width="container",
            title=title
        )

    def _candidates_table(self, column, n_candidates):
        """Build the table widget with the first *n_candidates* candidates of *column*."""
        # The tables carry a default 0..n-1 index, so a positional slice
        # selects the same rows as a label slice up to n_candidates - 1.
        df = self.candidates_dfs[column].iloc[:n_candidates]

        bokeh_formatters = {
            #'Similarity': {'type': 'progress', 'min': 0.0, 'max': 1.0, 'legend': True}, # Show similarity as bars - Not working properly
            'Description': {'type': 'textarea'},
            'Values (sample)': {'type': 'textarea'}
        }
        text_align = {
            'Similarity': 'center',
            'index': 'center'
        }
        widths = {
            'index': '7%',
            'Candidate': '20%',
            'Similarity': '10%',
            'Description': '33%',
            'Values (sample)': '30%'
        }

        table_candidates = pn.widgets.Tabulator(df,
                                                formatters=bokeh_formatters,
                                                text_align=text_align,
                                                widths=widths,
                                                sizing_mode='stretch_width',
                                                height=self.height,
                                                embed_content=True,
                                                header_align='center',
                                                theme='simple',
                                                disabled=True
                                                )
        return table_candidates

    def explore(self):
        """Assemble and return the explorer layout.

        A column selector and a candidate-count slider drive the histogram
        and the candidates table through ``pn.bind``.
        """
        select_column = pn.widgets.Select(name='Column selected',
                                          options=list(self.candidates_dfs.keys()),
                                          align='center'
                                          )
        select_n_candidates = pn.widgets.EditableIntSlider(name='Number of candidates',
                                                           start=1, end=self.max_candidates,
                                                           step=1, value=min(5, self.max_candidates),
                                                           align='center'
                                                           )
        cand_table = pn.bind(self._candidates_table, select_column, select_n_candidates)
        column_hist = pn.bind(self._plot_column_histogram, select_column)

        explorer = pn.Column(
            pn.Row(
                '# Scope Reducing Explorer',
                pn.Spacer(width=30),
                select_column,
                pn.Spacer(width=30),
                select_n_candidates,
                align=('start', 'center')
            ),
            pn.Spacer(height=30),
            pn.Row(
                pn.Column(
                    column_hist,
                    width=500
                ),
                pn.Spacer(width=30),
                cand_table
            ),
            styles=dict(background="white")
        )

        return explorer


class SRHeatMapManager:
Expand Down
Loading

0 comments on commit ec86b89

Please sign in to comment.