
Commit

The logic for creating/importing the subjects table updated; now supporting the general case.
Arashhs committed Dec 30, 2023
1 parent 241e312 commit 810d668
Showing 3 changed files with 109 additions and 134 deletions.
47 changes: 0 additions & 47 deletions conf/tem/config.yaml

This file was deleted.

113 changes: 62 additions & 51 deletions script/import_hub_main.py
@@ -125,61 +125,72 @@ def import_page():
# option to populate subject table or feature time series tables
is_subjects_populated = st.checkbox("Populate subject table?")

if uploaded_file and is_subjects_populated:
# set subject table name
subject_tbl_name = st.text_input("Enter subject table name", value="subjects")
st.success("File uploaded!")
if uploaded_file:
st.success(f"File `{uploaded_file.name}` uploaded!")
df = pd.read_csv(uploaded_file)
st.write("Columns in your CSV:")
st.write(df.columns)

if st.button("Populate Database"):
populate_subject_table(df, selected_db, config_path, user_tbl_name=subject_tbl_name)
st.success("Database populated!")


if uploaded_file and not is_subjects_populated:
st.success("File uploaded!")
df = pd.read_csv(uploaded_file)
df = generate_mets_by_calories(df)
st.write("Columns in your CSV:")
st.write(df.columns)

# Map columns
st.subheader("Mapping")

# Default selections based on column name similarity
default_timestamp = find_closest_name(df.columns, 'time timestamp date start_time end_time')
default_user_id = find_closest_name(df.columns, 'user id email patient')

timestamp_col = st.selectbox("**Select Timestamp Column**", df.columns, index=df.columns.get_loc(default_timestamp))
user_id_col = st.selectbox("**Select User ID Column**", df.columns, index=df.columns.get_loc(default_user_id))

# Foldable block for optional mappings
mappings = {
config['mapping']['columns']['timestamp']: timestamp_col,
config["mapping"]['columns']['user_id']: user_id_col,
}
table_mappings = {}
with st.expander("**Map Features to W4H Tables**", expanded=True):
st.write("Map your CSV columns to corresponding W4H tables.")

choices = ["None"] + list(df.columns)
for target_table_name in config['mapping']['tables']['time_series'] + config['mapping']['tables']['geo']:
target_table_label = ' '.join([label.capitalize() for label in target_table_name.replace('_', ' ').split()])
st.subheader(target_table_label)
def_choice = find_closest_name(choices, target_table_label)
mapped_col = st.selectbox("Select Corresponding Column", choices,
key=target_table_name, index=choices.index(def_choice))
table_mappings[target_table_name] = mapped_col if mapped_col != "None" else None

# Once mappings are set, allow the user to populate the database
if st.button("Populate Database"):
mappings = {**mappings, **table_mappings}

populate_db(df, selected_db, mappings, config_path)
st.success("Database populated!")

# create a mapping between the csv columns and the database tables
st.subheader("Mapping")
choices = ["None"] + list(df.columns)
mappings = {}
# if subject table is populated, populate subject table
if is_subjects_populated:
user_tbl_name = config['mapping']['tables']['user_table']['name']
with st.expander(f"**Map subject attributes to the W4H `{user_tbl_name}` table attributes**", expanded=True):
st.write(f"Map your CSV columns to corresponding W4H `{user_tbl_name}` table attributes.")
for target_attribute in config['mapping']['tables']['user_table']['attributes']:
target_attribute_label = ' '.join([label.capitalize() for label in target_attribute['name'].replace('_', ' ').split()])
st.write(f"**{target_attribute_label}**")
# write the description of the attribute
st.write(f'**Description:** {target_attribute["description"]}')
# write the data type of the attribute
st.write(f'**Data Type:** {target_attribute["type"]}')
def_choice = find_closest_name(choices, target_attribute_label)
mapped_col = st.selectbox("Select Corresponding Column", choices,
key=target_attribute['name'], index=choices.index(def_choice))
mappings[target_attribute['name']] = mapped_col if mapped_col != "None" else None
st.markdown("""---""")
# Once mappings are set, allow the user to populate the database
if st.button("Populate Database"):
populate_subject_table(df, selected_db, mappings, config_path)
st.success(f"Subject table `{user_tbl_name}` populated!")


# else, populate feature time series tables
else:
# Default selections based on column name similarity
default_timestamp = find_closest_name(df.columns, 'time timestamp date start_time end_time')
default_user_id = find_closest_name(df.columns, 'user id email patient')

timestamp_col = st.selectbox("**Select Timestamp Column**", df.columns, index=df.columns.get_loc(default_timestamp))
user_id_col = st.selectbox("**Select User ID Column**", df.columns, index=df.columns.get_loc(default_user_id))

# Foldable block for optional mappings
mappings = {
config['mapping']['columns']['timestamp']: timestamp_col,
config["mapping"]['columns']['user_id']: user_id_col,
}
table_mappings = {}
with st.expander("**Map Features to W4H Tables**", expanded=True):
st.write("Map your CSV columns to corresponding W4H tables.")

choices = ["None"] + list(df.columns)
for target_table_name in config['mapping']['tables']['time_series'] + config['mapping']['tables']['geo']:
target_table_label = ' '.join([label.capitalize() for label in target_table_name.replace('_', ' ').split()])
st.subheader(target_table_label)
def_choice = find_closest_name(choices, target_table_label)
mapped_col = st.selectbox("Select Corresponding Column", choices,
key=target_table_name, index=choices.index(def_choice))
table_mappings[target_table_name] = mapped_col if mapped_col != "None" else None

# Once mappings are set, allow the user to populate the database
if st.button("Populate Database"):
mappings = {**mappings, **table_mappings}

populate_db(df, selected_db, mappings, config_path)
st.success("GeoMTS tables populated!")

# if __name__ == "__main__":
# main()
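
Both the old and new branches of import_page() lean on find_closest_name to pre-select a sensible default in each st.selectbox. That helper is defined elsewhere in the repo and is not part of this diff; a minimal sketch of what it plausibly does, assuming simple fuzzy matching via difflib (an illustration, not the repo's actual implementation):

import difflib

def find_closest_name(choices, hint):
    """Return the entry in `choices` most similar to any word in `hint` (hypothetical sketch)."""
    best, best_score = list(choices)[0], 0.0
    for choice in choices:
        for word in hint.split():
            # Ratio in [0, 1]; higher means a closer string match
            score = difflib.SequenceMatcher(None, str(choice).lower(), word.lower()).ratio()
            if score > best_score:
                best, best_score = choice, score
    return best

Called as find_closest_name(df.columns, 'time timestamp date start_time end_time'), it returns a member of its input, which is why the df.columns.get_loc(...) and choices.index(...) lookups in the diff above are safe.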
83 changes: 47 additions & 36 deletions script/w4h_db_utils.py
Expand Up @@ -6,7 +6,7 @@

from loguru import logger
import pandas as pd
from sqlalchemy import create_engine, text, MetaData, Table, Column, String, ForeignKey, DateTime, REAL
from sqlalchemy import create_engine, text, MetaData, Table, Column, String, ForeignKey, DateTime, REAL, Integer, Float, Boolean
from sqlalchemy.orm import sessionmaker
from sqlalchemy_utils import database_exists, create_database

@@ -26,35 +26,36 @@ def create_tables(db_server_nickname:str, db_name: str, config_file='conf/config.yaml'):
metadata = MetaData()
config = load_config(config_file=config_file)
db_engine = get_db_engine(config_file, db_server_nickname=db_server_nickname, db_name=db_name)
try:
columns_config = config["mapping"]["columns"]

# Create the user table
user_table_config = config["mapping"]["tables"]["user_table"]
user_columns = [eval(f'Column("{col_name}", {col_dtype}, primary_key={col_name == columns_config["user_id"]})') for col_name, col_dtype in user_table_config["columns"].items()] # Convert string to actual SQLAlchemy type
user_table = Table(user_table_config["name"], metadata, *user_columns)


# Create time series tables
for table_name in config["mapping"]["tables"]["time_series"]:
table = Table(table_name, metadata,
Column(columns_config["user_id"], ForeignKey(user_table_config["name"] + '.' + columns_config["user_id"]), primary_key=True),
Column(columns_config["timestamp"], DateTime, primary_key=True),
Column(columns_config["value"], REAL),
)

# Create geo tables
for table_name in config["mapping"]["tables"]["geo"]:
table = Table(table_name, metadata,
Column(columns_config["user_id"], ForeignKey(user_table_config["name"] + '.' + columns_config["user_id"]), primary_key=True),
Column(columns_config["timestamp"], DateTime, primary_key=True),
Column(columns_config["value"], Geometry('POINT'))
)

metadata.create_all(db_engine)
except Exception as err:
db_engine.dispose()
logger.error(err)
# try:
columns_config = config["mapping"]["columns"]

# Create the user table
user_table_config = config["mapping"]["tables"]["user_table"]
dtype_mappings = config['mapping']['data_type_mappings']
user_columns = [eval(f'Column("{col_attribute["name"]}", {dtype_mappings[col_attribute["type"]]}, primary_key={col_attribute["name"] == columns_config["user_id"]})') for col_attribute in user_table_config["attributes"]] # Convert string to actual SQLAlchemy type
user_table = Table(user_table_config["name"], metadata, *user_columns)


# Create time series tables
for table_name in config["mapping"]["tables"]["time_series"]:
table = Table(table_name, metadata,
Column(columns_config["user_id"], ForeignKey(user_table_config["name"] + '.' + columns_config["user_id"]), primary_key=True),
Column(columns_config["timestamp"], DateTime, primary_key=True),
Column(columns_config["value"], REAL),
)

# Create geo tables
for table_name in config["mapping"]["tables"]["geo"]:
table = Table(table_name, metadata,
Column(columns_config["user_id"], ForeignKey(user_table_config["name"] + '.' + columns_config["user_id"]), primary_key=True),
Column(columns_config["timestamp"], DateTime, primary_key=True),
Column(columns_config["value"], Geometry('POINT'))
)

metadata.create_all(db_engine)
# except Exception as err:
# db_engine.dispose()
# logger.error(err)
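
The rewritten create_tables now resolves attribute types through a data_type_mappings section of the config before eval-ing each Column(...) expression. A minimal sketch of the same resolution done with a plain lookup table instead of eval; the mapping keys below are assumptions for illustration, not the repo's actual config:

from sqlalchemy import Column, String, Integer, Float, Boolean, DateTime

# Assumed type names; the real ones come from config['mapping']['data_type_mappings']
SQLA_TYPES = {
    "string": String, "integer": Integer, "float": Float,
    "boolean": Boolean, "datetime": DateTime,
}

def build_user_columns(user_table_config: dict, user_id_name: str) -> list:
    """Build the user table's Column objects from config attributes, no eval() needed."""
    return [
        Column(attr["name"], SQLA_TYPES[attr["type"]],  # KeyError surfaces a bad config early
               primary_key=(attr["name"] == user_id_name))
        for attr in user_table_config["attributes"]
    ]

A lookup table also keeps arbitrary config strings from being executed as Python, which eval does not.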



@@ -204,13 +205,14 @@ def populate_tables(df: pd.DataFrame, db_name: str, mappings: dict, config_path='conf/config.yaml'):
engine.dispose()


def populate_subject_table(df: pd.DataFrame, db_name: str, config_path='conf/config.yaml', user_tbl_name=None):
"""Populate the W4H subject table in the given database with the data from the given dataframe based on
the given subject table name in the config file.
def populate_subject_table(df: pd.DataFrame, db_name: str, mappings: dict, config_path='conf/config.yaml'):
"""Populate the W4H tables in the given database with the data from the given dataframe based on
the mappings between the CSV columns and the database tables.
Args:
df (pd.DataFrame): Dataframe containing the subject data to be inserted into the database
db_name (str): Name of the subject database to insert the data into
df (pd.DataFrame): Dataframe containing the data to be inserted into the database
db_name (str): Name of the database to insert the data into
mappings (dict): Dictionary containing the mappings between the CSV columns and the database tables
config_path (str, optional): Path to the config file. Defaults to 'conf/config.yaml'.
"""
# Load the config
@@ -219,12 +221,21 @@ def populate_subject_table(df: pd.DataFrame, db_name: str, config_path='conf/config.yaml', user_tbl_name=None):
# Create a session
engine = get_db_engine(config_path, mixed_db_name=db_name)

# create a user table dataframe using the mappings
user_tbl_name = config['mapping']['tables']['user_table']['name']
user_df = pd.DataFrame()
for k, v in mappings.items():
if v is not None:
user_df[k] = df[v]
# populate the user table (directly push df to table), if already exists, append new users
df.to_sql(user_tbl_name, engine, if_exists='append', index=False)
# if columns don't exist, ignore
user_df.to_sql(user_tbl_name, engine, if_exists='append', index=False)

# Commit the remaining changes and close the session
engine.dispose()
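
With the new signature, callers build the attribute-to-column mapping themselves; attributes mapped to None are simply skipped. A hedged usage sketch — the CSV columns, attribute names, and database name below are illustrative, not taken from the repo's config:

import pandas as pd

df = pd.read_csv("subjects.csv")  # e.g. columns: ID, Age, Sex
mappings = {
    "user_id": "ID",   # table attribute -> CSV column
    "age": "Age",
    "gender": "Sex",
    "height": None,    # unmapped: skipped by populate_subject_table
}
populate_subject_table(df, "my_w4h_db", mappings, config_path="conf/config.yaml")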



def getCurrentDbByUsername(username):
with sqlite3.connect('user.db') as conn:
cursor = conn.cursor()
