diff --git a/data_steward/analytics/cdr_ops/qc_serology_ct_dataset.py b/data_steward/analytics/cdr_ops/qc_serology_ct_dataset.py index 6800d29c13..f0ba36bc9f 100644 --- a/data_steward/analytics/cdr_ops/qc_serology_ct_dataset.py +++ b/data_steward/analytics/cdr_ops/qc_serology_ct_dataset.py @@ -13,9 +13,8 @@ # name: python3 # --- # + tags=["parameters"] -rt_dataset = "" ct_ser_dataset = "" -new_ct_dataset = "" +new_ser_ct_dataset = "" cur_project = "" cur_out_project = "" run_as = "" @@ -38,15 +37,17 @@ def get_table(table, cols, dataset, project): query = JINJA_ENV.from_string(""" - SELECT {cols} - FROM `{dataset}.{table}` - """) - df = pd.read_gbq(query, project_id=project) + SELECT {{cols}} + FROM `{{dataset}}.{{table}}` + """ + q = query.render(cols = cols, project_id=project, dataset=dataset) + df =execute(client, q) return df -def get_data_input(rt_dataset, ct_ser_dataset, new_ct_dataset, cur_out_project, - cur_project): +def get_data_input(ct_ser_dataset, new_ser_ct_dataset, cur_out_project, cur_project): + + rt_dataset='R2020q4r1_antibody_quest' ########## query = JINJA_ENV.from_string(""" SELECT distinct serology_person_id @@ -56,155 +57,166 @@ def get_data_input(rt_dataset, ct_ser_dataset, new_ct_dataset, cur_out_project, FROM `{{rt_dataset}}.person` LEFT JOIN `{{rt_dataset}}.pid_sid_map` USING(serology_person_id) WHERE control_status = 'Positive' """) - pos_controls_query = query.render(project_id=cur_project, - rt_dataset=rt_dataset) - pos_controls_provider = execute(client, pos_controls_query) + + pos_controls_query = query.render(project_id=cur_project, rt_dataset=rt_dataset) + pos_controls_provider =execute(client, pos_controls_query) + + ############ - query = JINJA_ENV.from_string( - """ SELECT distinct biobank_id, serology_person_id + query = JINJA_ENV.from_string(""" SELECT distinct biobank_id, serology_person_id FROM `{{rt_dataset}}.mayo_person` JOIN `{{rt_dataset}}.pid_sid_map` USING(biobank_id)""") - mayo_positive_query = query.render(project_id=cur_project, - rt_dataset=rt_dataset) - mayo_positive_controls = execute(client, mayo_positive_query) + + mayo_positive_query = query.render(project_id=cur_project, rt_dataset=rt_dataset) + mayo_positive_controls =execute(client, mayo_positive_query) + + ############ - query = JINJA_ENV.from_string( - """SELECT DISTINCT serology_person_id, person_id - FROM `{{ct_ser_dataset}}.serology_person` """ - ) - ct_person_query = query.render(project_id=cur_out_project, - ct_ser_dataset=ct_ser_dataset) - ct_person_table = execute(client, ct_person_query) + query = JINJA_ENV.from_string("""SELECT DISTINCT serology_person_id, person_id + FROM `{{ct_ser_dataset}}.serology_person` """) + ct_person_query = query.render(project_id=cur_out_project, ct_ser_dataset=ct_ser_dataset) + ct_person_table =execute(client, ct_person_query) + ############ query = JINJA_ENV.from_string(""" SELECT DISTINCT * - FROM `{{new_ct_dataset}}.serology_person` """ - ) - person_query = query.render(project_id=cur_project, - new_ct_dataset=new_ct_dataset) - new_ct_person_table = execute(client, person_query) + FROM `{{new_ser_ct_dataset}}.serology_person` """) + person_query = query.render(project_id=cur_project, new_ser_ct_dataset=new_ser_ct_dataset) + new_ct_person_table =execute(client, person_query) + ############ query = JINJA_ENV.from_string(""" SELECT distinct table_name, column_name - FROM {{new_ct_dataset}}.INFORMATION_SCHEMA.COLUMNS""") - schema_query = query.render(project_id=cur_project, - new_ct_dataset=new_ct_dataset) - schema = execute(client, schema_query) + FROM {{new_ser_ct_dataset}}.INFORMATION_SCHEMA.COLUMNS""") + schema_query = query.render(project_id=cur_project, new_ser_ct_dataset=new_ser_ct_dataset) + schema =execute(client, schema_query) + return pos_controls_provider, mayo_positive_controls, ct_person_table, new_ct_person_table, schema -def serology_dataset_qc(new_ct_dataset, rt_dataset, ct_ser_dataset, cur_project, - cur_out_project): +def serology_dataset_qc(new_ser_ct_dataset, ct_ser_dataset, cur_project, cur_out_project): + + rt_dataset='R2020q4r1_antibody_quest' pos_controls_provider, mayo_positive_controls, ct_person_table, new_ct_person_table, schema = \ - get_data_input(rt_dataset=rt_dataset, ct_ser_dataset=ct_ser_dataset - , new_ct_dataset=new_ct_dataset - , cur_out_project=cur_out_project, cur_project=cur_project) - print('QC FOR dataset ' + new_ct_dataset + - ". If no 'Fail' is printed out, it means the QC Passed.") - print( - '\n################################################ QC1 ####################################################' - ) - print( - "Check that there are no individual from Mayo's positive controls in the dataset at all.\n" - ) + get_data_input(ct_ser_dataset = ct_ser_dataset + , new_ser_ct_dataset = new_ser_ct_dataset + , cur_out_project=cur_out_project, cur_project = cur_project) + + print('QC FOR dataset '+new_ser_ct_dataset) + + + print('\n################################################ QC1 ####################################################') + print("Check that there are no individuals from Mayo's positive controls in "+new_ser_ct_dataset+" at all.\n") + pid_col = 'serology_person_id' all_tables_pids = [] for table in schema.table_name.unique(): if pid_col in schema[schema.table_name == table].column_name.unique(): - df = get_table(dataset=new_ct_dataset, - table=table, - project=cur_project, - cols='serology_person_id') + df = get_table(dataset = new_ser_ct_dataset, table = table, project=cur_project, cols = 'serology_person_id') pids = list(set(df[pid_col])) - all_tables_pids = all_tables_pids + pids - n_common_pids = len( - set(all_tables_pids).intersection( - set(mayo_positive_controls['serology_person_id']))) - if n_common_pids != 0: - print("\033[1;31m" + " Fail: " + str( - n_common_pids) + " individual(s) from Mayo's positive controls are found in " \ - + new_ct_dataset + " ." + "\033[0;0m") - print( - '\n############################################## QC2 ####################################################' - ) - print('Check that none of the tables in ' + new_ct_dataset + - ' have any data for participants not included in ' + ct_ser_dataset + - '.') + all_tables_pids = all_tables_pids+pids + + n_common_pids = len(set(all_tables_pids).intersection(set(mayo_positive_controls['serology_person_id']))) + + if n_common_pids!=0: + print("\033[1;31m"+" Fail: "+str(n_common_pids)+" individual(s) from Mayo's positive controls are found in "\ + + new_ser_ct_dataset+" ."+"\033[0;0m") + else: + print('PASS') + + + print('\n############################################## QC2 ####################################################') + print('Check that none of the tables in '+new_ser_ct_dataset+' have any data for participants not included in '+ct_ser_dataset+'.') + pids_not_in_ct = set(all_tables_pids) - set(ct_person_table[pid_col]) - if len(pids_not_in_ct) != 0: + + if len(pids_not_in_ct) !=0: print('\n') - print("\033[1;31m" + ' Fail! ' + str(len(pids_not_in_ct)) + - ' pids in ' + new_ct_dataset + ' are not in ' + ct_ser_dataset + - "\033[0;0m") - print( - '\n\n############################################## QC3 ####################################################' - ) - print(''' In ''' + new_ct_dataset + '''.serology_person, check that: + print("\033[1;31m"+' Fail! '+ str(len(pids_not_in_ct)) +' pids in '+new_ser_ct_dataset+' are not in '+ct_ser_dataset+"\033[0;0m") + else: + print('PASS') + + print('\n\n############################################## QC3 ####################################################') + print('''In '''+new_ser_ct_dataset+'''.serology_person, check that:''') + + ## 0 check that all pids, regardless of control status have a serology_person_id + print('''\n1 All participants regardless of control_status have a serology_person_id and a control_status''') - - All participants regardless of control_status have a serology_person_id and a control_status + if True in new_ct_person_table[['serology_person_id', 'control_status']].isna().drop_duplicates().any().unique(): + print( "\033[1;31m"+' Fail! Some individuals do not have serology_person_id and/or control_status in '+new_ser_ct_dataset+\ + '.serology_person. All participants in the serology dataset should have this'+"\033[0;0m") - - VUMC non-controls and VUMC controls (neg controls): - - have person_id - - do **not** have demographics. Their person_ids will be used to link to CDR person. - - - Positive controls (non VUMC; from Boston and Vanderbilt U.): - - do **not** have person_id since they are not in AoU database - - Boston Positive Controls do **not** have demographics becasye they did not provide any - - Vanderbilt U. have demographics (provided by Vanderbilt U.)''') - print('\n') - ## 0 check that all pids, regardless of control status - if True in new_ct_person_table[['serology_person_id', 'control_status' - ]].isna().drop_duplicates().any().unique(): - print( - "\033[1;31m" + ' Fail! Some individuals do not have serology_person_id and/or control_status in ' + new_ct_dataset + \ - '.serology_person. All participants in the serology dataset should have this' + "\033[0;0m") + else: + print( 'PASS') + # 1 VUMC non-controls and VUMC controls (neg controls) - non_and_neg_controls = new_ct_person_table[ - new_ct_person_table.control_status.isin(['Non-Control', 'Negative'])] - non_and_neg_controls_demog = non_and_neg_controls.drop([ - 'serology_person_id', 'collection_date', 'control_status', 'person_id' - ], 1).drop_duplicates() - ## check that they have person_id in the person table + + non_and_neg_controls = new_ct_person_table[new_ct_person_table.control_status.isin(['Non-Control','Negative'])] + non_and_neg_controls_demog = non_and_neg_controls.drop(['serology_person_id','collection_date' + , 'control_status', 'person_id'],1).drop_duplicates() + + ## check that they have person_id in the person table + print('''\n2 All participants regardless of control_status have a person_id '''+new_ser_ct_dataset+'''.serology_person.''') + if non_and_neg_controls['person_id'].isna().any() == True: - print( - "\033[1;31m" + ' Fail! Some non controls (aou participants) are missing person_id in ' + new_ct_dataset + \ - '.serology_person. They need it to be able to link to CDR person table demographics.' + "\033[0;0m") - ## check that they DO NOT have demograohic data in the person table + print( "\033[1;31m"+' Fail! Some non controls (aou participants) are missing person_id in '+new_ser_ct_dataset+\ + '.serology_person. They need it to be able to link to CDR person table demographics.'+"\033[0;0m") + else: + print( 'PASS') + + + ## check that they DO NOT have demograohic data in the person table + print('''\n3 VUMC non-controls and VUMC controls (neg controls): + - have person_id + - do **not** have demographics. Their person_ids will be used to link to CDR person.''') + if False in non_and_neg_controls_demog.isna().any().unique(): - print( - "\033[1;31m" + ' Fail! Some non controls (aou participants) have demographics data in ' + new_ct_dataset + \ - '.serology_person.' + "\033[0;0m") + print( "\033[1;31m"+' Fail! Some non controls (aou participants) have demographics data in '+new_ser_ct_dataset+\ + '.serology_person.'+"\033[0;0m") + else: + print( 'PASS') + # 2 positive Controls: - pos_controls = new_ct_person_table[new_ct_person_table.control_status == - 'Positive'] + pos_controls = new_ct_person_table[new_ct_person_table.control_status == 'Positive'] + ## check that positive controls do not have person_id in the person table + print('''\n4 Positive controls (non VUMC; from Boston and Vanderbilt U.): + - do **not** have person_id since they are not in AoU database + - Boston Positive Controls do **not** have demographics becasue they did not provide any + - Vanderbilt U. have demographics (provided by Vanderbilt U.)''') + if pos_controls['person_id'].isna().any() == False: - print( - "\033[1;31m" + ' Fail! Some positive controls (non aou participants) have person_id in ' + new_ct_dataset + \ - '.serology_person.' + "\033[0;0m") + print( "\033[1;31m"+' Fail! Some positive controls (non aou participants) have person_id in '+new_ser_ct_dataset+\ + '.serology_person.'+"\033[0;0m") + else: + print( 'PASS') + + # 3 check that Boston positive controls DO NOT have demograohic data in the person table - non_demog_cols = [ - 'serology_person_id', 'collection_date', 'control_status', 'person_id', - 'control_status', 'Provider' - ] - boston_pos_controls = pos_controls.merge( - pos_controls_provider[pos_controls_provider.Provider == 'Boston']) - boston_pos_controls_demog = boston_pos_controls.drop(non_demog_cols, - 1).drop_duplicates() + print('''\n5 Boston Positive Controls do **not** have demographics becasue they did not provide any''') + + non_demog_cols =['serology_person_id','collection_date', 'control_status', 'person_id', 'control_status', 'Provider'] + + boston_pos_controls = pos_controls.merge(pos_controls_provider[pos_controls_provider.Provider == 'Boston']) + boston_pos_controls_demog = boston_pos_controls.drop(non_demog_cols,1).drop_duplicates() + if False in boston_pos_controls_demog.isna().any().unique(): - print( - "\033[1;31m" + - ' Fail! Some Boston positive controls have demographics data in ' - + new_ct_dataset + '.serology_person.' + "\033[0;0m") + print( "\033[1;31m"+' Fail! Some Boston positive controls have demographics data in '+new_ser_ct_dataset+'.serology_person.'+"\033[0;0m") + else: + print( 'PASS') + + # 4 check that Vanderbilt U positive controls have demograohic data in the person table - vandi_pos_controls = pos_controls.merge( - pos_controls_provider[pos_controls_provider.Provider == 'Vanderbilt']) - vandi_pos_controls_demog = vandi_pos_controls.drop(non_demog_cols, - 1).drop_duplicates() + print('''\n6 Vanderbilt U. have demographics (provided by Vanderbilt U.)''') + + vandi_pos_controls = pos_controls.merge(pos_controls_provider[pos_controls_provider.Provider == 'Vanderbilt']) + vandi_pos_controls_demog = vandi_pos_controls.drop(non_demog_cols,1).drop_duplicates() + if False not in vandi_pos_controls_demog.isna().all().unique(): - # display(vandi_pos_controls) - print( - "\033[1;31m" + ' Fail! Vanderbilt positive controls do not have demographics data in ' + new_ct_dataset + \ - '.serology_person.' + "\033[0;0m") + #display(vandi_pos_controls) + print( "\033[1;31m"+' Fail! Vanderbilt positive controls do not have demographics data in '+new_ser_ct_dataset+\ + '.serology_person.'+"\033[0;0m") + else: + print( 'PASS') -serology_dataset_qc(new_ct_dataset, rt_dataset, ct_ser_dataset, cur_project, - cur_out_project) +serology_dataset_qc(new_ct_dataset = new_ct_dataset, rt_dataset = rt_dataset + , ct_ser_dataset = ct_ser_dataset, dataset_project = dataset_project, project = project)