Merge pull request #891 from biorack/main
Merge main changes back into add_atlas_to_db
bkieft-usa authored Aug 27, 2024
2 parents 1b29fcc + 83af6db commit 7b229e1
Showing 22 changed files with 2,176 additions and 3,326 deletions.
217 changes: 217 additions & 0 deletions docs/Untargeted_Analysis.md

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions example_config.yaml
@@ -225,6 +225,7 @@ workflows:
num_points: 3
peak_height: 1e6
msms_score: 0.6
+ msms_frag_ratio: 0.3
generate_analysis_outputs: True
- name: QC-NEG
atlas:
@@ -281,6 +282,7 @@ workflows:
num_points: 3
peak_height: 1e6
msms_score: 0.6
+ msms_frag_ratio: 0.3
generate_analysis_outputs: True
- name: Hybrid
rt_alignment:
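The two msms_frag_ratio: 0.3 entries above add a fragment-ratio threshold to the QC workflow filters next to the existing msms_score cutoff. Below is a minimal sketch of how such a threshold could be applied; the field names (frag_ratio, params) are assumptions for illustration and may not match the metatlas internals.

# Hypothetical sketch, not code from this commit: apply the new msms_frag_ratio
# threshold together with the existing msms_score cutoff from the config.
def passes_msms_filters(hit: dict, params: dict) -> bool:
    """Return True if a hit clears the configured MS/MS thresholds.

    'frag_ratio' is assumed to be the fraction of reference fragment ions
    that were matched; the real column name in metatlas may differ.
    """
    return (
        hit["msms_score"] >= params["msms_score"]            # e.g. 0.6
        and hit["frag_ratio"] >= params["msms_frag_ratio"]   # e.g. 0.3 (new key)
    )

params = {"msms_score": 0.6, "msms_frag_ratio": 0.3}
hit = {"msms_score": 0.72, "frag_ratio": 0.45}
print(passes_msms_filters(hit, params))  # True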
6 changes: 3 additions & 3 deletions metatlas/io/metatlas_get_data_helper_fun.py
@@ -37,7 +37,7 @@ def sort_atlas_csv(input_csv: str, column1: str, column2: str, istd_atlas: bool)
"""
# Read the CSV file into a DataFrame
df = pd.read_csv(input_csv)

# Sort the DataFrame based on the specified columns in ascending order
if istd_atlas == True:
if 'label' in df.columns:
@@ -52,7 +52,7 @@ def sort_atlas_csv(input_csv: str, column1: str, column2: str, istd_atlas: bool)
sorted_df = df.sort_values(by=[column1, column2], ascending=[True, True])
else:
sorted_df = df.sort_values(by=[column1, column2], ascending=[True, True])

return(sorted_df)

def create_msms_dataframe(df):
@@ -121,7 +121,7 @@ def arrange_ms2_data(metatlas_dataset: MetatlasDataset, do_centroid: bool) -> pd
'measured_precursor_mz': measured_precursor_mz, 'measured_precursor_intensity': measured_precursor_intensity,
'precursor_mz': precursor_mz, 'name': cid_name, 'adduct': adduct, 'inchi_key': inchi_key,
'matchms_spectrum': matchms_spectrum, 'query_spectrum': spectrum, 'cid_pmz_tolerance': mz_tolerance,
- 'cid_rt_min': rt_min, 'cid_rt_max': rt_max})
+ 'cid_rt_min': rt_min, 'cid_rt_max': rt_max, 'compound_idx': compound_idx})

return pd.DataFrame(msms_data)

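The extra 'compound_idx': compound_idx entry that arrange_ms2_data now stores on every MS2 row ties each spectrum back to its atlas entry. A small illustrative sketch (made-up rows, assumed column names) of why that helps: rows can be grouped per compound index even when two atlas entries share an InChIKey or an entry has no key at all.

# Illustrative sketch with invented data: group MS2 rows by the new
# 'compound_idx' column instead of by 'inchi_key'.
import pandas as pd

msms_data = pd.DataFrame([
    {"compound_idx": 0, "inchi_key": "KEY_A", "measured_precursor_mz": 181.07},
    {"compound_idx": 1, "inchi_key": "KEY_A", "measured_precursor_mz": 181.07},  # same key, different atlas entry
    {"compound_idx": 2, "inchi_key": None,    "measured_precursor_mz": 309.11},  # entry without an InChIKey
])

for compound_idx, rows in msms_data.groupby("compound_idx"):
    print(compound_idx, len(rows))  # each atlas entry keeps its own rows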
142 changes: 42 additions & 100 deletions metatlas/io/update_lcmsfiles_in_lims.py
@@ -7,7 +7,7 @@
# you need this!
# https://github.com/LabKey/labkey-api-python
#
# sys.path.insert(0,'/Users/bpb/repos/labkey-api-python/')
sys.path.insert(0,'/global/common/software/m2650/labkey-api-python')
# sys.path.insert(0,'/global/homes/b/bpb/repos/labkey-api-python/')
# import labkey as lk
from labkey.api_wrapper import APIWrapper
@@ -54,7 +54,7 @@
key_file = '/global/cfs/cdirs/metatlas/labkey_user.txt'
with open(key_file,'r') as fid:
api_key = fid.read().strip()
- labkey_server='metatlas.nersc.gov'
+ labkey_server='metatlas.lbl.gov'
project_name='LIMS/'
api = APIWrapper(labkey_server, project_name, use_ssl=True,api_key=api_key)

@@ -142,7 +142,7 @@ def get_table_from_lims(table,columns=None):
df = df[[c for c in df.columns if not c.startswith('_')]]
return df

- def update_table_in_lims(df,table,method='update',max_size=1000):
+ def update_table_in_lims(df,table,method='update',max_size=100):

"""
Note: Do ~1000 rows at a time. Any more and you get a 504 error. Maybe increasing the timeout would help.
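The default max_size for update_table_in_lims drops from 1000 to 100 rows per request, in line with the docstring's warning that large batches can trigger 504 errors. A rough sketch of the batching idea only (this is not the LabKey API call itself):

# Sketch of chunked uploads, assuming a pandas DataFrame of rows to send.
import pandas as pd

def iter_chunks(df: pd.DataFrame, max_size: int = 100):
    """Yield successive chunks of at most max_size rows."""
    for start in range(0, df.shape[0], max_size):
        yield df.iloc[start:start + max_size]

rows = pd.DataFrame({"filename": ["run_%03d.mzML" % i for i in range(250)]})
print([len(chunk) for chunk in iter_chunks(rows)])  # [100, 100, 50]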
@@ -211,25 +211,30 @@ def update_lcmsrun_names(tables=['mzml_file','hdf5_file','raw_file']):
temp['name'] = missing_from_lcmsruns
update_table_in_lims(temp,'lcmsrun',method='insert')

#remove extra ones
if len(extra_in_lcmsruns)>0:
chunks = [extra_in_lcmsruns[x:x+10] for x in range(0, len(extra_in_lcmsruns), 10)]
for i,extra in enumerate(chunks):
sql = """SELECT Key FROM lcmsrun where name IN (%s);"""%','.join(['\'%s\''%e for e in extra])
# print(sql)
schema = 'lists'
sql_result = api.query.execute_sql(schema, sql,max_rows=1e7)
if sql_result is None:
print(('execute_sql: Failed to load results from ' + schema + '.' + table))
# return None
else:
temp = pd.DataFrame(sql_result['rows'])
temp = temp[[c for c in temp.columns if not c.startswith('_')]]
# return df

#if temp.shape[0]>0:
# update_table_in_lims(temp,'lcmsrun',method='delete')
# print('removing chunk %d'%i)
# # remove extra ones
# if len(extra_in_lcmsruns)>0:
# lcmsruns = get_table_from_lims('lcmsrun',columns=['Key','name'])
# temp = lcmsruns[lcmsruns['name'].isin(extra_in_lcmsruns)]
# if temp.shape[0]>0:
# print('LCMS run entries to remove:',temp.shape[0])
# update_table_in_lims(temp,'lcmsrun',method='delete')
# chunks = [extra_in_lcmsruns[x:x+10] for x in range(0, len(extra_in_lcmsruns), 5)]
# for i,extra in enumerate(chunks):
# sql = """SELECT Key FROM lcmsrun where name IN (%s);"""%','.join(['\'%s\''%e for e in extra])
# # print(sql)
# schema = 'lists'
# sql_result = api.query.execute_sql(schema, sql,max_rows=1e7)
# if sql_result is None:
# print(('execute_sql: Failed to load results from ' + schema + '.' + table))
# # return None
# else:
# temp = pd.DataFrame(sql_result['rows'])
# temp = temp[[c for c in temp.columns if not c.startswith('_')]]
# # return df

# if temp.shape[0]>0:
# update_table_in_lims(temp,'lcmsrun',method='delete')
# print('removing chunk %d'%i)

return missing_from_lcmsruns,extra_in_lcmsruns

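update_lcmsrun_names still inserts run names that are missing from the LIMS lcmsrun list, while the block that deleted extra entries is now commented out. A hedged sketch of the underlying set comparison, using invented run names:

# Toy example of the reconciliation: names present in the file tables versus
# names already registered in the lcmsrun list.
lims_names = {"run_001.mzML", "run_002.mzML", "run_004.mzML"}
file_names = {"run_001.mzML", "run_002.mzML", "run_003.mzML"}

missing_from_lcmsruns = sorted(file_names - lims_names)  # these would be inserted
extra_in_lcmsruns = sorted(lims_names - file_names)      # deletion of these is now disabled

print(missing_from_lcmsruns)  # ['run_003.mzML']
print(extra_in_lcmsruns)      # ['run_004.mzML']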
@@ -268,69 +273,6 @@ def get_lcmsrun_matrix():#labkey_server='metatlas-dev.nersc.gov',project_name='/



def update_file_conversion_tasks(task,lcmsruns=None,file_conversion_tasks=None):#,labkey_server='metatlas-dev.nersc.gov',project_name='/LIMS'):
"""
gets current tasks and current files and determines if new tasks need to be made:
task will be:['mzml_to_hdf5','raw_to_mzml','mzml_to_spectralhits','mzml_to_pactolus']
"""
input_type = task.split('_')[0]
output_type = task.split('_')[-1]

if file_conversion_tasks is None:
file_conversion_tasks = get_table_from_lims('file_conversion_task',columns=['Key','input_file','output_file','task','status'])

if lcmsruns is None:
lcmsruns = get_lcmsrun_matrix()
if file_conversion_tasks.shape[0]>0:
done_input_files = lcmsruns.loc[pd.notna(lcmsruns['%s_filename'%input_type]),'%s_filename'%input_type]
done_output_files = lcmsruns.loc[pd.notna(lcmsruns['%s_filename'%output_type]),'%s_filename'%output_type]
inputfile_idx = file_conversion_tasks['input_file'].isin(done_input_files)
outputfile_idx = file_conversion_tasks['output_file'].isin(done_output_files)

task_idx = file_conversion_tasks['task']==task

# This finds where output file exists
done_tasks_idx = (task_idx) & (outputfile_idx)
if sum(done_tasks_idx)>0:
update_table_in_lims(file_conversion_tasks.loc[done_tasks_idx,['Key']],'file_conversion_task',method='delete')
#,labkey_server=labkey_server,project_name=project_name)
print(('%s: There are %d tasks where output file exist and will be removed'%(task,file_conversion_tasks[done_tasks_idx].shape[0])))


# This finds where input file is missing
done_tasks_idx = (task_idx) & (~inputfile_idx)
if sum(done_tasks_idx)>0:
update_table_in_lims(file_conversion_tasks.loc[done_tasks_idx,['Key']],'file_conversion_task',method='delete')
#,labkey_server=labkey_server,project_name=project_name)
print(('%s: There are %d tasks where input file is missing and will be removed'%(task,file_conversion_tasks[done_tasks_idx].shape[0])))

right_now_str = datetime.now().strftime("%Y%m%d %H:%M:%S")

idx = (pd.notna(lcmsruns['%s_filename'%input_type])) & (pd.isna(lcmsruns['%s_filename'%output_type]))

temp = pd.DataFrame()

temp['input_file'] = lcmsruns.loc[idx,'%s_filename'%input_type]
temp['output_file'] = temp['input_file'].apply(lambda x: re.sub('\%s$'%EXTENSIONS[input_type],'%s'%EXTENSIONS[output_type],x))
temp['task'] = task
temp['status'] = STATUS['initiation']
temp['log'] = 'detected: %s'%right_now_str
temp.reset_index(drop=True,inplace=True)
cols = temp.columns
if file_conversion_tasks.shape[0]>0:
temp = pd.merge(temp,file_conversion_tasks.add_suffix('_task'),left_on=['input_file','output_file'],right_on=['input_file_task','output_file_task'],how='outer',indicator=True)
new_tasks = temp[temp['_merge']=='left_only'].copy()
else:
new_tasks = temp
new_tasks = new_tasks[cols]
new_tasks.reset_index(drop=True,inplace=True)

print(("There are %d new tasks"%new_tasks.shape[0]))

if new_tasks.shape[0]>0:
update_table_in_lims(new_tasks,'file_conversion_task',method='insert')

def update_file_table(file_table):
file_type = file_table.split('_')[0]
@@ -339,40 +281,44 @@ def update_file_table(file_table):
dates,files = get_files_from_disk(PROJECT_DIRECTORY,v['extension'])
if len(files)>0:
df = pd.DataFrame(data={'filename':files,'file_type':file_type,'timeepoch':dates})
- df['basename'] = df['filename'].apply(os.path.basename)
- df['name'] = df['filename'].apply(complex_name_splitter) #make a name for grouping associated content
+ # df['basename'] = df['filename'].apply(os.path.basename)
+ # df['name'] = df['filename'].apply(complex_name_splitter) #make a name for grouping associated content
else:
df = pd.DataFrame()
df['filename'] = 'None'
df['file_type'] = file_type
df['timeepoch'] = 0
- df['basename'] = 'None'
- df['name'] = 'None'
+ # df['basename'] = 'None'
+ # df['name'] = 'None'

print(('\tThere were %d files on disk'%len(files)))

- cols = ['filename','name','Key']
+ cols = ['filename','Key']
df_lims = get_table_from_lims(v['lims_table'],columns=cols)
# df_lims.drop_duplicates('filename',inplace=True)
print(('\tThere were %d files from LIMS table %s'%(df_lims.shape[0],v['lims_table'])))


- diff_df = pd.merge(df, df_lims,on=['filename','name'], how='outer', indicator='Exist')
+ diff_df = pd.merge(df, df_lims,on=['filename'], how='outer', indicator='Exist')
diff_df = diff_df.loc[diff_df['Exist'] != 'both'] #(left_only, right_only, or both)
print(('\tThere are %d different'%diff_df.shape[0]))
print('')
# diff_df.fillna('',inplace=True)
diff_df['parameters'] = 1

- cols = ['file_type','filename','timeepoch','basename','name']
- temp = diff_df.loc[diff_df['Exist']=='left_only',cols]
+ cols = ['file_type','filename','timeepoch']
+ temp = diff_df.loc[diff_df['Exist']=='left_only',cols].copy()
if temp.shape[0]>0:
temp['name'] = temp['filename'].apply(complex_name_splitter)
temp['basename'] = temp['filename'].apply(lambda x: os.path.basename(x))
update_table_in_lims(temp,file_table,method='insert')#,index_column='Key',columns=None,labkey_server='metatlas-dev.nersc.gov',project_name='/LIMS'):

- cols = ['Key','filename']
+ cols = ['Key']
temp = diff_df.loc[diff_df['Exist']=='right_only',cols]
temp['Key'] = temp['Key'].astype(int)
#if temp.shape[0]>0:
# update_table_in_lims(temp,file_table,method='delete')#,index_column='Key',columns=None,labkey_server='metatlas-dev.nersc.gov',project_name='/LIMS'):
# if temp.shape[0]>0:
# update_table_in_lims(temp,file_table,method='delete')#,index_column='Key',columns=None,labkey_server='metatlas-dev.nersc.gov',project_name='/LIMS'):
return temp
# df.to_csv('/global/homes/b/bpb/Downloads/%s_files.tab'%k,index=None,sep='\t')

def str2bool(v):
@@ -418,10 +364,6 @@ def main():
update_lcmsrun_matrix(t)

# 4. remove any file conversion tasks that have already occured
# TODO!!!!!# TODO!!!!!# TODO!!!!!
# TODO!!!!!# TODO!!!!!# TODO!!!!!
# TODO!!!!!# TODO!!!!!# TODO!!!!!
# TODO!!!!!# TODO!!!!!# TODO!!!!!
#if str2bool(args['update_fileconversion_tasks'])==True:
# # 5. populate any file conversion tasks that need to occur
# lcmsruns = get_lcmsrun_matrix() #this could be moved up abote to step 3 and save a few queries
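update_file_table now joins files found on disk to the LIMS table on filename alone, and computes name/basename only for the rows it is about to insert; deleting right-only rows remains commented out. A small sketch of the outer-merge/indicator pattern it relies on, with invented paths and a placeholder Key value:

# Minimal example of finding files on disk that are not yet in the LIMS table.
import pandas as pd

on_disk = pd.DataFrame({"filename": ["/data/x.mzML", "/data/y.mzML"]})
in_lims = pd.DataFrame({"filename": ["/data/x.mzML"], "Key": [17]})

diff_df = pd.merge(on_disk, in_lims, on=["filename"], how="outer", indicator="Exist")
new_rows = diff_df.loc[diff_df["Exist"] == "left_only"].copy()   # would be inserted
orphans = diff_df.loc[diff_df["Exist"] == "right_only"].copy()   # deletion stays commented out

print(new_rows["filename"].tolist())  # ['/data/y.mzML']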
60 changes: 33 additions & 27 deletions metatlas/plots/dill2plots.py
@@ -1668,12 +1668,12 @@ def file_with_max_score(data, frag_refs, compound_idx, filter_by):

for i, msv_sample in enumerate(np.split(msv_sample_scans, scan_idxs, axis=1)):

- mms_sample = Spectrum(mz=msv_sample[0], intensities=msv_sample[1], metadata={'precursor_mz':np.nan})
+ mms_sample = Spectrum(mz=msv_sample[0], intensities=np.sqrt(msv_sample[1]), metadata={'precursor_mz':np.nan})

for f, frag in sp.filter_frag_refs(data, frag_refs, compound_idx, file_idx, filter_by).iterrows():
msv_ref = sp.sort_ms_vector_by_mz(np.array(frag['mz_intensities']).T)

- mms_ref = Spectrum(mz=msv_ref[0], intensities=msv_ref[1], metadata={'precursor_mz':np.nan})
+ mms_ref = Spectrum(mz=msv_ref[0], intensities=np.sqrt(msv_ref[1]), metadata={'precursor_mz':np.nan})

score = cos.pair(mms_sample, mms_ref)['score'].item()

@@ -1952,14 +1952,14 @@ def top_five_scoring_files(data, frag_refs, compound_idx, filter_by):
current_best_msv_ref = None
current_best_rt = None

- mms_sample = Spectrum(mz=msv_sample[0], intensities=msv_sample[1], metadata={'precursor_mz':np.nan})
+ mms_sample = Spectrum(mz=msv_sample[0], intensities=np.sqrt(msv_sample[1]), metadata={'precursor_mz':np.nan})

for ref_idx, frag in sp.filter_frag_refs(data, frag_refs, compound_idx, file_idx, filter_by).iterrows():
msv_ref = np.array(frag['mz_intensities']).T

msv_sample_aligned, msv_ref_aligned = sp.pairwise_align_ms_vectors(msv_sample, msv_ref, 0.005)

- mms_ref = Spectrum(mz=msv_ref[0], intensities=msv_ref[1], metadata={'precursor_mz':np.nan})
+ mms_ref = Spectrum(mz=msv_ref[0], intensities=np.sqrt(msv_ref[1]), metadata={'precursor_mz':np.nan})

score = cos.pair(mms_sample, mms_ref)['score'].item()

@@ -2187,7 +2187,7 @@ def build_msms_refs_spectra(msms_refs: pd.DataFrame) -> pd.DataFrame:
"""Converts MS2 spectral arrays from strings to numpy arrays and creates MatchMS spectra."""

msms_refs['spectrum'] = msms_refs['spectrum'].apply(lambda s: np.array(json.loads(s)))
- msms_refs['matchms_spectrum'] = msms_refs.apply(lambda s: Spectrum(s.spectrum[0], s.spectrum[1], metadata={'precursor_mz':s.precursor_mz}), axis=1)
+ msms_refs['matchms_spectrum'] = msms_refs.apply(lambda s: Spectrum(s.spectrum[0], np.sqrt(s.spectrum[1]), metadata={'precursor_mz':s.precursor_mz}), axis=1)

return msms_refs

@@ -2220,29 +2220,28 @@ def convert_to_centroid(sample_df):
return sample_df[:, idx]
return np.zeros((0, 0))

- def create_nonmatched_msms_hits(msms_data: pd.DataFrame, inchi_key: str) -> pd.DataFrame:
+ def create_nonmatched_msms_hits(msms_data: pd.DataFrame, inchi_key: str, compound_idx: int) -> pd.DataFrame:
"""
Create MSMS hits DataFrame with no corresponding match in MSMS refs.
If there is no InChi Key in the atlas (compound identifications) for a particular compound,
Setting inchi_key to an empty string is necessary for correct plotting.
"""

compound_msms_hits = msms_data[msms_data['compound_idx'] == compound_idx].reset_index(drop=True).copy()

if inchi_key is None:
inchi_msms_hits = msms_data[msms_data['inchi_key'].isnull()].reset_index(drop=True).copy()
inchi_msms_hits['inchi_key'] = ''
else:
inchi_msms_hits = msms_data[msms_data['inchi_key'] == inchi_key].reset_index(drop=True).copy()
compound_msms_hits['inchi_key'] = ''

- inchi_msms_hits['database'] = np.nan
- inchi_msms_hits['id'] = np.nan
- inchi_msms_hits['adduct'] = ''
+ compound_msms_hits['database'] = np.nan
+ compound_msms_hits['id'] = np.nan
+ compound_msms_hits['adduct'] = ''

- inchi_msms_hits['score'] = msms_data['measured_precursor_intensity']
- inchi_msms_hits['num_matches'] = 0
- inchi_msms_hits['spectrum'] = [np.array([np.array([]), np.array([])])] * len(inchi_msms_hits)
+ compound_msms_hits['score'] = compound_msms_hits['measured_precursor_intensity']
+ compound_msms_hits['num_matches'] = 0
+ compound_msms_hits['spectrum'] = [np.array([np.array([]), np.array([])])] * len(compound_msms_hits)

- return inchi_msms_hits
+ return compound_msms_hits


def score_matrix(cos: Type[CosineHungarian], references: List[SpectrumType], queries: List[SpectrumType]) -> npt.NDArray:
@@ -2280,16 +2279,24 @@ def score_matrix(cos: Type[CosineHungarian], references: List[SpectrumType], que
return scores_array


- def get_hits_per_compound(cos: Type[CosineHungarian], inchi_key: str,
+ def get_hits_per_compound(cos: Type[CosineHungarian], compound_idx: int,
msms_data: pd.DataFrame, msms_refs: pd.DataFrame) -> pd.DataFrame:
"""
- Get MSMS hits for an individual InChiKey.
+ Get MSMS hits for an individual compound index and InChiKey.
If no key is given, or it isn't present in the MSMS refs, create nonmatched dummy hits DataFrame.
"""
unique_inchi_keys = msms_data[msms_data['compound_idx']==compound_idx]['inchi_key'].unique()

if inchi_key not in msms_refs['inchi_key'].tolist():
nonmatched_msms_hits = create_nonmatched_msms_hits(msms_data, inchi_key)
if len(unique_inchi_keys) > 1:
raise Exception('Only 1 inchi_key is allowed per compound name.')
elif len(unique_inchi_keys) == 0:
inchi_key = None
else:
inchi_key = unique_inchi_keys[0]

if inchi_key not in msms_refs['inchi_key'].tolist() or inchi_key is None:
nonmatched_msms_hits = create_nonmatched_msms_hits(msms_data, inchi_key, compound_idx)
return nonmatched_msms_hits

filtered_msms_refs = msms_refs[msms_refs['inchi_key']==inchi_key].reset_index(drop=True).copy()
@@ -2307,7 +2314,7 @@ def get_hits_per_compound(cos: Type[CosineHungarian], inchi_key: str,
inchi_msms_hits = inchi_msms_hits[inchi_msms_hits['precursor_ppm_error']<=inchi_msms_hits['cid_pmz_tolerance']]

if inchi_msms_hits.empty:
- nonmatched_msms_hits = create_nonmatched_msms_hits(msms_data, inchi_key)
+ nonmatched_msms_hits = create_nonmatched_msms_hits(msms_data, inchi_key, compound_idx)
return nonmatched_msms_hits

return inchi_msms_hits
@@ -2366,15 +2373,14 @@ def get_msms_hits(metatlas_dataset: MetatlasDataset, extra_time: bool | float =
if len(msms_data) == 0:
return pd.DataFrame(columns=msms_hits_cols).set_index(['database', 'id', 'file_name', 'msms_scan'])

data_inchi_keys = set(msms_data.inchi_key.tolist())
compound_names = ma_data.get_compound_names(metatlas_dataset)[0]

cos = CosineHungarian(tolerance=frag_mz_tolerance)

msms_hits = []
for inchi_key in tqdm(data_inchi_keys, unit='compound', disable=in_papermill()):

inchi_msms_hits = get_hits_per_compound(cos, inchi_key, msms_data, msms_refs)
msms_hits.append(inchi_msms_hits)
for compound_idx in tqdm(range(len(compound_names)), unit='compound', disable=in_papermill()):
compound_msms_hits = get_hits_per_compound(cos, compound_idx, msms_data, msms_refs)
msms_hits.append(compound_msms_hits)

msms_hits = pd.concat(msms_hits)

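Throughout dill2plots.py, peak intensities are now square-rooted before building matchms Spectrum objects, and hits are gathered per compound_idx rather than per InChIKey. A minimal, self-contained sketch of the square-root weighting with CosineHungarian, using made-up peaks; it mirrors the pattern in the diff rather than the full metatlas code path:

# Square-root intensity weighting before cosine scoring, as in the changes above.
import numpy as np
from matchms import Spectrum
from matchms.similarity import CosineHungarian

mz = np.array([85.03, 127.04, 181.07])
sample_intensities = np.array([1.0e4, 5.0e5, 2.0e6])
ref_intensities = np.array([2.0e3, 1.0e5, 8.0e5])

# np.sqrt() compresses the dynamic range so dominant peaks do not swamp the score.
mms_sample = Spectrum(mz=mz, intensities=np.sqrt(sample_intensities), metadata={'precursor_mz': 181.07})
mms_ref = Spectrum(mz=mz, intensities=np.sqrt(ref_intensities), metadata={'precursor_mz': 181.07})

cos = CosineHungarian(tolerance=0.005)
print(cos.pair(mms_sample, mms_ref)['score'].item())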