Merge pull request #891 from biorack/main
Merge main changes back into add_atlas_to_db
bkieft-usa authored Aug 27, 2024
2 parents 1b29fcc + 83af6db commit 7b229e1
Showing 22 changed files with 2,176 additions and 3,326 deletions.
217 changes: 217 additions & 0 deletions docs/Untargeted_Analysis.md

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions example_config.yaml
@@ -225,6 +225,7 @@ workflows:
num_points: 3
peak_height: 1e6
msms_score: 0.6
+ msms_frag_ratio: 0.3
generate_analysis_outputs: True
- name: QC-NEG
atlas:
@@ -281,6 +282,7 @@ workflows:
num_points: 3
peak_height: 1e6
msms_score: 0.6
+ msms_frag_ratio: 0.3
generate_analysis_outputs: True
- name: Hybrid
rt_alignment:
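The two msms_frag_ratio: 0.3 entries above add a fragment-ratio threshold to the QC workflow filters next to the existing msms_score cutoff. Below is a minimal sketch of how such a threshold could be applied; the field names (frag_ratio, params) are assumptions for illustration and may not match the metatlas internals.

# Hypothetical sketch, not code from this commit: apply the new msms_frag_ratio
# threshold together with the existing msms_score cutoff from the config.
def passes_msms_filters(hit: dict, params: dict) -> bool:
    """Return True if a hit clears the configured MS/MS thresholds.

    'frag_ratio' is assumed to be the fraction of reference fragment ions
    that were matched; the real column name in metatlas may differ.
    """
    return (
        hit["msms_score"] >= params["msms_score"]            # e.g. 0.6
        and hit["frag_ratio"] >= params["msms_frag_ratio"]   # e.g. 0.3 (new key)
    )

params = {"msms_score": 0.6, "msms_frag_ratio": 0.3}
hit = {"msms_score": 0.72, "frag_ratio": 0.45}
print(passes_msms_filters(hit, params))  # True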
6 changes: 3 additions & 3 deletions metatlas/io/metatlas_get_data_helper_fun.py
@@ -37,7 +37,7 @@ def sort_atlas_csv(input_csv: str, column1: str, column2: str, istd_atlas: bool)
"""
# Read the CSV file into a DataFrame
df = pd.read_csv(input_csv)

# Sort the DataFrame based on the specified columns in ascending order
if istd_atlas == True:
if 'label' in df.columns:
@@ -52,7 +52,7 @@ def sort_atlas_csv(input_csv: str, column1: str, column2: str, istd_atlas: bool)
sorted_df = df.sort_values(by=[column1, column2], ascending=[True, True])
else:
sorted_df = df.sort_values(by=[column1, column2], ascending=[True, True])

return(sorted_df)

def create_msms_dataframe(df):
@@ -121,7 +121,7 @@ def arrange_ms2_data(metatlas_dataset: MetatlasDataset, do_centroid: bool) -> pd
'measured_precursor_mz': measured_precursor_mz, 'measured_precursor_intensity': measured_precursor_intensity,
'precursor_mz': precursor_mz, 'name': cid_name, 'adduct': adduct, 'inchi_key': inchi_key,
'matchms_spectrum': matchms_spectrum, 'query_spectrum': spectrum, 'cid_pmz_tolerance': mz_tolerance,
- 'cid_rt_min': rt_min, 'cid_rt_max': rt_max})
+ 'cid_rt_min': rt_min, 'cid_rt_max': rt_max, 'compound_idx': compound_idx})

return pd.DataFrame(msms_data)

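The extra 'compound_idx': compound_idx entry that arrange_ms2_data now stores on every MS2 row ties each spectrum back to its atlas entry. A small illustrative sketch (made-up rows, assumed column names) of why that helps: rows can be grouped per compound index even when two atlas entries share an InChIKey or an entry has no key at all.

# Illustrative sketch with invented data: group MS2 rows by the new
# 'compound_idx' column instead of by 'inchi_key'.
import pandas as pd

msms_data = pd.DataFrame([
    {"compound_idx": 0, "inchi_key": "KEY_A", "measured_precursor_mz": 181.07},
    {"compound_idx": 1, "inchi_key": "KEY_A", "measured_precursor_mz": 181.07},  # same key, different atlas entry
    {"compound_idx": 2, "inchi_key": None,    "measured_precursor_mz": 309.11},  # entry without an InChIKey
])

for compound_idx, rows in msms_data.groupby("compound_idx"):
    print(compound_idx, len(rows))  # each atlas entry keeps its own rows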
142 changes: 42 additions & 100 deletions metatlas/io/update_lcmsfiles_in_lims.py
@@ -7,7 +7,7 @@
# you need this!
# https://github.com/LabKey/labkey-api-python
#
# sys.path.insert(0,'/Users/bpb/repos/labkey-api-python/')
sys.path.insert(0,'/global/common/software/m2650/labkey-api-python')
# sys.path.insert(0,'/global/homes/b/bpb/repos/labkey-api-python/')
# import labkey as lk
from labkey.api_wrapper import APIWrapper
@@ -54,7 +54,7 @@
key_file = '/global/cfs/cdirs/metatlas/labkey_user.txt'
with open(key_file,'r') as fid:
api_key = fid.read().strip()
- labkey_server='metatlas.nersc.gov'
+ labkey_server='metatlas.lbl.gov'
project_name='LIMS/'
api = APIWrapper(labkey_server, project_name, use_ssl=True,api_key=api_key)

@@ -142,7 +142,7 @@ def get_table_from_lims(table,columns=None):
df = df[[c for c in df.columns if not c.startswith('_')]]
return df

- def update_table_in_lims(df,table,method='update',max_size=1000):
+ def update_table_in_lims(df,table,method='update',max_size=100):

"""
Note: Do ~1000 rows at a time. Any more and you get a 504 error. Maybe increasing the timeout would help.
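The default max_size for update_table_in_lims drops from 1000 to 100 rows per request, in line with the docstring's warning that large batches can trigger 504 errors. A rough sketch of the batching idea only (this is not the LabKey API call itself):

# Sketch of chunked uploads, assuming a pandas DataFrame of rows to send.
import pandas as pd

def iter_chunks(df: pd.DataFrame, max_size: int = 100):
    """Yield successive chunks of at most max_size rows."""
    for start in range(0, df.shape[0], max_size):
        yield df.iloc[start:start + max_size]

rows = pd.DataFrame({"filename": ["run_%03d.mzML" % i for i in range(250)]})
print([len(chunk) for chunk in iter_chunks(rows)])  # [100, 100, 50]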
@@ -211,25 +211,30 @@ def update_lcmsrun_names(tables=['mzml_file','hdf5_file','raw_file']):
temp['name'] = missing_from_lcmsruns
update_table_in_lims(temp,'lcmsrun',method='insert')

#remove extra ones
if len(extra_in_lcmsruns)>0:
chunks = [extra_in_lcmsruns[x:x+10] for x in range(0, len(extra_in_lcmsruns), 10)]
for i,extra in enumerate(chunks):
sql = """SELECT Key FROM lcmsrun where name IN (%s);"""%','.join(['\'%s\''%e for e in extra])
# print(sql)
schema = 'lists'
sql_result = api.query.execute_sql(schema, sql,max_rows=1e7)
if sql_result is None:
print(('execute_sql: Failed to load results from ' + schema + '.' + table))
# return None
else:
temp = pd.DataFrame(sql_result['rows'])
temp = temp[[c for c in temp.columns if not c.startswith('_')]]
# return df

#if temp.shape[0]>0:
# update_table_in_lims(temp,'lcmsrun',method='delete')
# print('removing chunk %d'%i)
# # remove extra ones
# if len(extra_in_lcmsruns)>0:
# lcmsruns = get_table_from_lims('lcmsrun',columns=['Key','name'])
# temp = lcmsruns[lcmsruns['name'].isin(extra_in_lcmsruns)]
# if temp.shape[0]>0:
# print('LCMS run entries to remove:',temp.shape[0])
# update_table_in_lims(temp,'lcmsrun',method='delete')
# chunks = [extra_in_lcmsruns[x:x+10] for x in range(0, len(extra_in_lcmsruns), 5)]
# for i,extra in enumerate(chunks):
# sql = """SELECT Key FROM lcmsrun where name IN (%s);"""%','.join(['\'%s\''%e for e in extra])
# # print(sql)
# schema = 'lists'
# sql_result = api.query.execute_sql(schema, sql,max_rows=1e7)
# if sql_result is None:
# print(('execute_sql: Failed to load results from ' + schema + '.' + table))
# # return None
# else:
# temp = pd.DataFrame(sql_result['rows'])
# temp = temp[[c for c in temp.columns if not c.startswith('_')]]
# # return df

# if temp.shape[0]>0:
# update_table_in_lims(temp,'lcmsrun',method='delete')
# print('removing chunk %d'%i)

return missing_from_lcmsruns,extra_in_lcmsruns

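update_lcmsrun_names still inserts run names that are missing from the LIMS lcmsrun list, while the block that deleted extra entries is now commented out. A hedged sketch of the underlying set comparison, using invented run names:

# Toy example of the reconciliation: names present in the file tables versus
# names already registered in the lcmsrun list.
lims_names = {"run_001.mzML", "run_002.mzML", "run_004.mzML"}
file_names = {"run_001.mzML", "run_002.mzML", "run_003.mzML"}

missing_from_lcmsruns = sorted(file_names - lims_names)  # these would be inserted
extra_in_lcmsruns = sorted(lims_names - file_names)      # deletion of these is now disabled

print(missing_from_lcmsruns)  # ['run_003.mzML']
print(extra_in_lcmsruns)      # ['run_004.mzML']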
@@ -268,69 +273,6 @@ def get_lcmsrun_matrix():#labkey_server='metatlas-dev.nersc.gov',project_name='/



def update_file_conversion_tasks(task,lcmsruns=None,file_conversion_tasks=None):#,labkey_server='metatlas-dev.nersc.gov',project_name='/LIMS'):
"""
gets current tasks and current files and determines if new tasks need to be made:
task will be:['mzml_to_hdf5','raw_to_mzml','mzml_to_spectralhits','mzml_to_pactolus']
"""
input_type = task.split('_')[0]
output_type = task.split('_')[-1]

if file_conversion_tasks is None:
file_conversion_tasks = get_table_from_lims('file_conversion_task',columns=['Key','input_file','output_file','task','status'])

if lcmsruns is None:
lcmsruns = get_lcmsrun_matrix()
if file_conversion_tasks.shape[0]>0:
done_input_files = lcmsruns.loc[pd.notna(lcmsruns['%s_filename'%input_type]),'%s_filename'%input_type]
done_output_files = lcmsruns.loc[pd.notna(lcmsruns['%s_filename'%output_type]),'%s_filename'%output_type]
inputfile_idx = file_conversion_tasks['input_file'].isin(done_input_files)
outputfile_idx = file_conversion_tasks['output_file'].isin(done_output_files)

task_idx = file_conversion_tasks['task']==task

# This finds where output file exists
done_tasks_idx = (task_idx) & (outputfile_idx)
if sum(done_tasks_idx)>0:
update_table_in_lims(file_conversion_tasks.loc[done_tasks_idx,['Key']],'file_conversion_task',method='delete')
#,labkey_server=labkey_server,project_name=project_name)
print(('%s: There are %d tasks where output file exist and will be removed'%(task,file_conversion_tasks[done_tasks_idx].shape[0])))


# This finds where input file is missing
done_tasks_idx = (task_idx) & (~inputfile_idx)
if sum(done_tasks_idx)>0:
update_table_in_lims(file_conversion_tasks.loc[done_tasks_idx,['Key']],'file_conversion_task',method='delete')
#,labkey_server=labkey_server,project_name=project_name)
print(('%s: There are %d tasks where input file is missing and will be removed'%(task,file_conversion_tasks[done_tasks_idx].shape[0])))

right_now_str = datetime.now().strftime("%Y%m%d %H:%M:%S")

idx = (pd.notna(lcmsruns['%s_filename'%input_type])) & (pd.isna(lcmsruns['%s_filename'%output_type]))

temp = pd.DataFrame()

temp['input_file'] = lcmsruns.loc[idx,'%s_filename'%input_type]
temp['output_file'] = temp['input_file'].apply(lambda x: re.sub('\%s$'%EXTENSIONS[input_type],'%s'%EXTENSIONS[output_type],x))
temp['task'] = task
temp['status'] = STATUS['initiation']
temp['log'] = 'detected: %s'%right_now_str
temp.reset_index(drop=True,inplace=True)
cols = temp.columns
if file_conversion_tasks.shape[0]>0:
temp = pd.merge(temp,file_conversion_tasks.add_suffix('_task'),left_on=['input_file','output_file'],right_on=['input_file_task','output_file_task'],how='outer',indicator=True)
new_tasks = temp[temp['_merge']=='left_only'].copy()
else:
new_tasks = temp
new_tasks = new_tasks[cols]
new_tasks.reset_index(drop=True,inplace=True)

print(("There are %d new tasks"%new_tasks.shape[0]))

if new_tasks.shape[0]>0:
update_table_in_lims(new_tasks,'file_conversion_task',method='insert')

def update_file_table(file_table):
file_type = file_table.split('_')[0]
@@ -339,40 +281,44 @@ def update_file_table(file_table):
dates,files = get_files_from_disk(PROJECT_DIRECTORY,v['extension'])
if len(files)>0:
df = pd.DataFrame(data={'filename':files,'file_type':file_type,'timeepoch':dates})
- df['basename'] = df['filename'].apply(os.path.basename)
- df['name'] = df['filename'].apply(complex_name_splitter) #make a name for grouping associated content
+ # df['basename'] = df['filename'].apply(os.path.basename)
+ # df['name'] = df['filename'].apply(complex_name_splitter) #make a name for grouping associated content
else:
df = pd.DataFrame()
df['filename'] = 'None'
df['file_type'] = file_type
df['timeepoch'] = 0
- df['basename'] = 'None'
- df['name'] = 'None'
+ # df['basename'] = 'None'
+ # df['name'] = 'None'

print(('\tThere were %d files on disk'%len(files)))

- cols = ['filename','name','Key']
+ cols = ['filename','Key']
df_lims = get_table_from_lims(v['lims_table'],columns=cols)
# df_lims.drop_duplicates('filename',inplace=True)
print(('\tThere were %d files from LIMS table %s'%(df_lims.shape[0],v['lims_table'])))


- diff_df = pd.merge(df, df_lims,on=['filename','name'], how='outer', indicator='Exist')
+ diff_df = pd.merge(df, df_lims,on=['filename'], how='outer', indicator='Exist')
diff_df = diff_df.loc[diff_df['Exist'] != 'both'] #(left_only, right_only, or both)
print(('\tThere are %d different'%diff_df.shape[0]))
print('')
# diff_df.fillna('',inplace=True)
diff_df['parameters'] = 1

- cols = ['file_type','filename','timeepoch','basename','name']
- temp = diff_df.loc[diff_df['Exist']=='left_only',cols]
+ cols = ['file_type','filename','timeepoch']
+ temp = diff_df.loc[diff_df['Exist']=='left_only',cols].copy()
if temp.shape[0]>0:
temp['name'] = temp['filename'].apply(complex_name_splitter)
temp['basename'] = temp['filename'].apply(lambda x: os.path.basename(x))
update_table_in_lims(temp,file_table,method='insert')#,index_column='Key',columns=None,labkey_server='metatlas-dev.nersc.gov',project_name='/LIMS'):

- cols = ['Key','filename']
+ cols = ['Key']
temp = diff_df.loc[diff_df['Exist']=='right_only',cols]
temp['Key'] = temp['Key'].astype(int)
#if temp.shape[0]>0:
# update_table_in_lims(temp,file_table,method='delete')#,index_column='Key',columns=None,labkey_server='metatlas-dev.nersc.gov',project_name='/LIMS'):
# if temp.shape[0]>0:
# update_table_in_lims(temp,file_table,method='delete')#,index_column='Key',columns=None,labkey_server='metatlas-dev.nersc.gov',project_name='/LIMS'):
return temp
# df.to_csv('/global/homes/b/bpb/Downloads/%s_files.tab'%k,index=None,sep='\t')

def str2bool(v):
@@ -418,10 +364,6 @@ def main():
update_lcmsrun_matrix(t)

# 4. remove any file conversion tasks that have already occured
# TODO!!!!!# TODO!!!!!# TODO!!!!!
# TODO!!!!!# TODO!!!!!# TODO!!!!!
# TODO!!!!!# TODO!!!!!# TODO!!!!!
# TODO!!!!!# TODO!!!!!# TODO!!!!!
#if str2bool(args['update_fileconversion_tasks'])==True:
# # 5. populate any file conversion tasks that need to occur
# lcmsruns = get_lcmsrun_matrix() #this could be moved up abote to step 3 and save a few queries
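update_file_table now joins files found on disk to the LIMS table on filename alone, and computes name/basename only for the rows it is about to insert; deleting right-only rows remains commented out. A small sketch of the outer-merge/indicator pattern it relies on, with invented paths and a placeholder Key value:

# Minimal example of finding files on disk that are not yet in the LIMS table.
import pandas as pd

on_disk = pd.DataFrame({"filename": ["/data/x.mzML", "/data/y.mzML"]})
in_lims = pd.DataFrame({"filename": ["/data/x.mzML"], "Key": [17]})

diff_df = pd.merge(on_disk, in_lims, on=["filename"], how="outer", indicator="Exist")
new_rows = diff_df.loc[diff_df["Exist"] == "left_only"].copy()   # would be inserted
orphans = diff_df.loc[diff_df["Exist"] == "right_only"].copy()   # deletion stays commented out

print(new_rows["filename"].tolist())  # ['/data/y.mzML']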
60 changes: 33 additions & 27 deletions metatlas/plots/dill2plots.py
@@ -1668,12 +1668,12 @@ def file_with_max_score(data, frag_refs, compound_idx, filter_by):

for i, msv_sample in enumerate(np.split(msv_sample_scans, scan_idxs, axis=1)):

- mms_sample = Spectrum(mz=msv_sample[0], intensities=msv_sample[1], metadata={'precursor_mz':np.nan})
+ mms_sample = Spectrum(mz=msv_sample[0], intensities=np.sqrt(msv_sample[1]), metadata={'precursor_mz':np.nan})

for f, frag in sp.filter_frag_refs(data, frag_refs, compound_idx, file_idx, filter_by).iterrows():
msv_ref = sp.sort_ms_vector_by_mz(np.array(frag['mz_intensities']).T)

- mms_ref = Spectrum(mz=msv_ref[0], intensities=msv_ref[1], metadata={'precursor_mz':np.nan})
+ mms_ref = Spectrum(mz=msv_ref[0], intensities=np.sqrt(msv_ref[1]), metadata={'precursor_mz':np.nan})

score = cos.pair(mms_sample, mms_ref)['score'].item()

@@ -1952,14 +1952,14 @@ def top_five_scoring_files(data, frag_refs, compound_idx, filter_by):
current_best_msv_ref = None
current_best_rt = None

- mms_sample = Spectrum(mz=msv_sample[0], intensities=msv_sample[1], metadata={'precursor_mz':np.nan})
+ mms_sample = Spectrum(mz=msv_sample[0], intensities=np.sqrt(msv_sample[1]), metadata={'precursor_mz':np.nan})

for ref_idx, frag in sp.filter_frag_refs(data, frag_refs, compound_idx, file_idx, filter_by).iterrows():
msv_ref = np.array(frag['mz_intensities']).T

msv_sample_aligned, msv_ref_aligned = sp.pairwise_align_ms_vectors(msv_sample, msv_ref, 0.005)

- mms_ref = Spectrum(mz=msv_ref[0], intensities=msv_ref[1], metadata={'precursor_mz':np.nan})
+ mms_ref = Spectrum(mz=msv_ref[0], intensities=np.sqrt(msv_ref[1]), metadata={'precursor_mz':np.nan})

score = cos.pair(mms_sample, mms_ref)['score'].item()

@@ -2187,7 +2187,7 @@ def build_msms_refs_spectra(msms_refs: pd.DataFrame) -> pd.DataFrame:
"""Converts MS2 spectral arrays from strings to numpy arrays and creates MatchMS spectra."""

msms_refs['spectrum'] = msms_refs['spectrum'].apply(lambda s: np.array(json.loads(s)))
- msms_refs['matchms_spectrum'] = msms_refs.apply(lambda s: Spectrum(s.spectrum[0], s.spectrum[1], metadata={'precursor_mz':s.precursor_mz}), axis=1)
+ msms_refs['matchms_spectrum'] = msms_refs.apply(lambda s: Spectrum(s.spectrum[0], np.sqrt(s.spectrum[1]), metadata={'precursor_mz':s.precursor_mz}), axis=1)

return msms_refs

@@ -2220,29 +2220,28 @@ def convert_to_centroid(sample_df):
return sample_df[:, idx]
return np.zeros((0, 0))

- def create_nonmatched_msms_hits(msms_data: pd.DataFrame, inchi_key: str) -> pd.DataFrame:
+ def create_nonmatched_msms_hits(msms_data: pd.DataFrame, inchi_key: str, compound_idx: int) -> pd.DataFrame:
"""
Create MSMS hits DataFrame with no corresponding match in MSMS refs.
If there is no InChi Key in the atlas (compound identifications) for a particular compound,
Setting inchi_key to an empty string is necessary for correct plotting.
"""

compound_msms_hits = msms_data[msms_data['compound_idx'] == compound_idx].reset_index(drop=True).copy()

if inchi_key is None:
inchi_msms_hits = msms_data[msms_data['inchi_key'].isnull()].reset_index(drop=True).copy()
inchi_msms_hits['inchi_key'] = ''
else:
inchi_msms_hits = msms_data[msms_data['inchi_key'] == inchi_key].reset_index(drop=True).copy()
compound_msms_hits['inchi_key'] = ''

- inchi_msms_hits['database'] = np.nan
- inchi_msms_hits['id'] = np.nan
- inchi_msms_hits['adduct'] = ''
+ compound_msms_hits['database'] = np.nan
+ compound_msms_hits['id'] = np.nan
+ compound_msms_hits['adduct'] = ''

- inchi_msms_hits['score'] = msms_data['measured_precursor_intensity']
- inchi_msms_hits['num_matches'] = 0
- inchi_msms_hits['spectrum'] = [np.array([np.array([]), np.array([])])] * len(inchi_msms_hits)
+ compound_msms_hits['score'] = compound_msms_hits['measured_precursor_intensity']
+ compound_msms_hits['num_matches'] = 0
+ compound_msms_hits['spectrum'] = [np.array([np.array([]), np.array([])])] * len(compound_msms_hits)

- return inchi_msms_hits
+ return compound_msms_hits


def score_matrix(cos: Type[CosineHungarian], references: List[SpectrumType], queries: List[SpectrumType]) -> npt.NDArray:
@@ -2280,16 +2279,24 @@ def score_matrix(cos: Type[CosineHungarian], references: List[SpectrumType], que
return scores_array


- def get_hits_per_compound(cos: Type[CosineHungarian], inchi_key: str,
+ def get_hits_per_compound(cos: Type[CosineHungarian], compound_idx: int,
msms_data: pd.DataFrame, msms_refs: pd.DataFrame) -> pd.DataFrame:
"""
- Get MSMS hits for an individual InChiKey.
+ Get MSMS hits for an individual compound index and InChiKey.
If no key is given, or it isn't present in the MSMS refs, create nonmatched dummy hits DataFrame.
"""
unique_inchi_keys = msms_data[msms_data['compound_idx']==compound_idx]['inchi_key'].unique()

if inchi_key not in msms_refs['inchi_key'].tolist():
nonmatched_msms_hits = create_nonmatched_msms_hits(msms_data, inchi_key)
if len(unique_inchi_keys) > 1:
raise Exception('Only 1 inchi_key is allowed per compound name.')
elif len(unique_inchi_keys) == 0:
inchi_key = None
else:
inchi_key = unique_inchi_keys[0]

if inchi_key not in msms_refs['inchi_key'].tolist() or inchi_key is None:
nonmatched_msms_hits = create_nonmatched_msms_hits(msms_data, inchi_key, compound_idx)
return nonmatched_msms_hits

filtered_msms_refs = msms_refs[msms_refs['inchi_key']==inchi_key].reset_index(drop=True).copy()
@@ -2307,7 +2314,7 @@ def get_hits_per_compound(cos: Type[CosineHungarian], inchi_key: str,
inchi_msms_hits = inchi_msms_hits[inchi_msms_hits['precursor_ppm_error']<=inchi_msms_hits['cid_pmz_tolerance']]

if inchi_msms_hits.empty:
- nonmatched_msms_hits = create_nonmatched_msms_hits(msms_data, inchi_key)
+ nonmatched_msms_hits = create_nonmatched_msms_hits(msms_data, inchi_key, compound_idx)
return nonmatched_msms_hits

return inchi_msms_hits
@@ -2366,15 +2373,14 @@ def get_msms_hits(metatlas_dataset: MetatlasDataset, extra_time: bool | float =
if len(msms_data) == 0:
return pd.DataFrame(columns=msms_hits_cols).set_index(['database', 'id', 'file_name', 'msms_scan'])

data_inchi_keys = set(msms_data.inchi_key.tolist())
compound_names = ma_data.get_compound_names(metatlas_dataset)[0]

cos = CosineHungarian(tolerance=frag_mz_tolerance)

msms_hits = []
for inchi_key in tqdm(data_inchi_keys, unit='compound', disable=in_papermill()):

inchi_msms_hits = get_hits_per_compound(cos, inchi_key, msms_data, msms_refs)
msms_hits.append(inchi_msms_hits)
for compound_idx in tqdm(range(len(compound_names)), unit='compound', disable=in_papermill()):
compound_msms_hits = get_hits_per_compound(cos, compound_idx, msms_data, msms_refs)
msms_hits.append(compound_msms_hits)

msms_hits = pd.concat(msms_hits)

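Throughout dill2plots.py, peak intensities are now square-rooted before building matchms Spectrum objects, and hits are gathered per compound_idx rather than per InChIKey. A minimal, self-contained sketch of the square-root weighting with CosineHungarian, using made-up peaks; it mirrors the pattern in the diff rather than the full metatlas code path:

# Square-root intensity weighting before cosine scoring, as in the changes above.
import numpy as np
from matchms import Spectrum
from matchms.similarity import CosineHungarian

mz = np.array([85.03, 127.04, 181.07])
sample_intensities = np.array([1.0e4, 5.0e5, 2.0e6])
ref_intensities = np.array([2.0e3, 1.0e5, 8.0e5])

# np.sqrt() compresses the dynamic range so dominant peaks do not swamp the score.
mms_sample = Spectrum(mz=mz, intensities=np.sqrt(sample_intensities), metadata={'precursor_mz': 181.07})
mms_ref = Spectrum(mz=mz, intensities=np.sqrt(ref_intensities), metadata={'precursor_mz': 181.07})

cos = CosineHungarian(tolerance=0.005)
print(cos.pair(mms_sample, mms_ref)['score'].item())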