Skip to content

Commit

Permalink
Merge pull request #876 from biorack/fix_dtype_warn
Browse files: browse the repository at this point in the history
Fix Pandas DType Incompatibility Warning
  • Loading branch information
tharwood3 authored Jun 14, 2024
2 parents e08d62e + c90a7a0 commit 20169af
Show file tree
Hide file tree
Showing 2 changed files with 106 additions and 17 deletions.
58 changes: 56 additions & 2 deletions metatlas/plots/dill2plots.py
Original file line number Diff line number Diff line change
Expand Up @@ -2426,7 +2426,13 @@ def make_identification_figure_v2(input_fname: Optional[Path] = None, input_data
msms_hits_df = msms_hits.reset_index().sort_values('score', ascending=False)
compound_names = ma_data.get_compound_names(data, use_labels)[0]
file_names = ma_data.get_file_names(data)
match = pd.DataFrame()

match_dtypes = {'label': str,
'file name': str,
'Matching M/Zs above 1E-3*max': str,
'All matching M/Zs': str}

match = pd.DataFrame(columns=match_dtypes).astype(match_dtypes)
disable_interactive_plots()
plt.clf()
for compound_idx, _ in enumerate(compound_names):
Expand Down Expand Up @@ -2712,7 +2718,55 @@ def export_atlas_to_spreadsheet(atlas, output_filename=None):
cols.extend([(c, ['mz_references', 0, c]) for c in ['mz', 'mz_tolerance', 'adduct']])
cols.append(('polarity', ['mz_references', 0, 'detected_polarity']))

out = pd.DataFrame()
cols_dtypes = {'chebi_id': str,
'chebi_url': str,
'creation_time': float,
'description': str,
'formula': str,
'head_id': str,
'hmdb_id': str,
'hmdb_url': str,
'img_abc_id': str,
'inchi': str,
'inchi_key': str,
'iupac_name': str,
'kegg_id': str,
'kegg_url': str,
'last_modified': float,
'lipidmaps_id': str,
'lipidmaps_url': str,
'metacyc_id': str,
'mono_isotopic_molecular_weight': float,
'name': str,
'neutralized_2d_inchi': str,
'neutralized_2d_inchi_key': str,
'neutralized_inchi': str,
'neutralized_inchi_key': str,
'num_free_radicals': float,
'number_components': float,
'permanent_charge': float,
'prev_uid': str,
'pubchem_compound_id': str,
'pubchem_url': str,
'source': str,
'synonyms': str,
'unique_id': str,
'username': str,
'wikipedia_url': str,
'label': str,
'id_notes': str,
'ms1_notes': str,
'ms2_notes': str,
'identification_notes': str,
'rt_min': float,
'rt_max': float,
'rt_peak': float,
'mz': float,
'mz_tolerance': float,
'adduct': str,
'polarity': str}

out = pd.DataFrame(columns=cols_dtypes).astype(cols_dtypes)
is_atlas = isinstance(atlas, metob.Atlas)
compound_ids = atlas.compound_identifications if is_atlas else [i['identification'] for i in atlas[0]]
for i, my_id in enumerate(compound_ids):
Expand Down
65 changes: 50 additions & 15 deletions metatlas/tools/fastanalysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def calculate_compound_total_score(final_df, compound_idx, quality_scores):
else:
final_df.loc[compound_idx, 'msi_level'] = "Level 1"
return final_df

def make_stats_table(input_fname: Optional[Path] = None, input_dataset = [], msms_hits_df = None,
include_lcmsruns = [], exclude_lcmsruns = [], include_groups = [], exclude_groups = [],
output_loc: Optional[Path] = None,
Expand Down Expand Up @@ -177,7 +177,42 @@ def make_stats_table(input_fname: Optional[Path] = None, input_dataset = [], msm
delta_mz = abs(mz_theoretical - avg_mz_measured)
delta_ppm = delta_mz / mz_theoretical * 1e6

final_df = pd.concat([final_df, pd.DataFrame({"index":[compound_idx]})], ignore_index=True)
# Expected dtype for each stats-table column. The loop that follows casts
# columns already present in final_df to these dtypes and creates any missing
# column as an empty Series of the listed dtype.
# Each column appears exactly once: 'msms_quality' was previously listed twice,
# which Python accepts silently (the later entry overwrites the earlier one).
final_df_dtypes = {
    'identified_metabolite': str,
    'label': str,
    'overlapping_compound': str,
    'overlapping_inchi_keys': str,
    'formula': str,
    'polarity': str,
    'exact_mass': float,
    'inchi_key': str,
    'msms_quality': float,
    'mz_quality': float,
    'rt_quality': float,
    'total_score': float,
    'msi_level': str,
    'isomer_details': str,
    'identification_notes': str,
    'ms1_notes': str,
    'ms2_notes': str,
    'max_intensity': float,
    'max_intensity_file': str,
    'ms1_rt_peak': float,
    'msms_file': str,
    'msms_rt': float,
    'msms_numberofions': float,
    'msms_matchingions': str,
    'msms_score': float,
    'mz_adduct': str,
}

final_df = pd.concat([final_df, pd.DataFrame({"index": [compound_idx]})], ignore_index=True)

for col_name, col_dtype in final_df_dtypes.items():
if col_name in final_df.columns:
final_df = final_df.astype({col_name: col_dtype})
else:
final_df[col_name] = pd.Series(dtype=col_dtype)

final_df.loc[compound_idx, 'identified_metabolite'] = ""
if use_labels or len(cid.compound) == 0:
cid_label = cid.name
Expand Down Expand Up @@ -243,7 +278,7 @@ def make_stats_table(input_fname: Optional[Path] = None, input_dataset = [], msm
if len(cid.compound) == 0:
final_df.loc[compound_idx, 'formula'] = ""
final_df.loc[compound_idx, 'polarity'] = cid.mz_references[0].detected_polarity
final_df.loc[compound_idx, 'exact_mass'] = ""
final_df.loc[compound_idx, 'exact_mass'] = np.nan
final_df.loc[compound_idx, 'inchi_key'] = ""
else:
final_df.loc[compound_idx, 'formula'] = cid.compound[0].formula
Expand All @@ -252,7 +287,7 @@ def make_stats_table(input_fname: Optional[Path] = None, input_dataset = [], msm
final_df.loc[compound_idx, 'inchi_key'] = cid.compound[0].inchi_key

final_df.loc[compound_idx, 'identified_metabolite'] = final_df.loc[compound_idx, 'overlapping_compound'] or final_df.loc[compound_idx, 'label']
final_df.loc[compound_idx, 'msms_quality'] = "" # this gets updated after ms2_notes column is added
final_df.loc[compound_idx, 'msms_quality'] = np.nan # this gets updated after ms2_notes column is added

if delta_ppm <= 5 or delta_mz <= 0.0015:
final_df.loc[compound_idx, 'mz_quality'] = 1
Expand All @@ -261,7 +296,7 @@ def make_stats_table(input_fname: Optional[Path] = None, input_dataset = [], msm
elif delta_ppm > 10:
final_df.loc[compound_idx, 'mz_quality'] = 0
else:
final_df.loc[compound_idx, 'mz_quality'] = ""
final_df.loc[compound_idx, 'mz_quality'] = np.nan

rt_error = abs(cid.rt_references[0].rt_peak - avg_rt_measured)
if rt_error <= 0.5:
Expand All @@ -271,8 +306,8 @@ def make_stats_table(input_fname: Optional[Path] = None, input_dataset = [], msm
elif rt_error > 2:
final_df.loc[compound_idx, 'rt_quality'] = 0
else:
final_df.loc[compound_idx, 'rt_quality'] = ""
final_df.loc[compound_idx, 'total_score'] = "" # this gets updated after ms2_notes column is added
final_df.loc[compound_idx, 'rt_quality'] = np.nan
final_df.loc[compound_idx, 'total_score'] = np.nan # this gets updated after ms2_notes column is added
final_df.loc[compound_idx, 'msi_level'] = "" # this gets updated after ms2_notes column is added
final_df.loc[compound_idx, 'isomer_details'] = ""
final_df.loc[compound_idx, 'identification_notes'] = cid.identification_notes
Expand All @@ -281,22 +316,22 @@ def make_stats_table(input_fname: Optional[Path] = None, input_dataset = [], msm
try:
final_df.loc[compound_idx, 'msms_quality'] = float(final_df.loc[compound_idx, 'ms2_notes'].split(',')[0])
except ValueError:
final_df.loc[compound_idx, 'msms_quality'] = ''
final_df.loc[compound_idx, 'msms_quality'] = np.nan
quality_scores = [final_df.loc[compound_idx, x] for x in ['msms_quality', 'mz_quality', 'rt_quality']]
if all(isinstance(x, (int, float)) for x in quality_scores):
final_df = calculate_compound_total_score(final_df, compound_idx, quality_scores)
else:
final_df.loc[compound_idx, 'total_score'] = ""
final_df.loc[compound_idx, 'total_score'] = np.nan
final_df.loc[compound_idx, 'msi_level'] = ""
if len(intensities) > 0:
final_df.loc[compound_idx, 'max_intensity'] = intensities.loc[intensities['intensity'].astype(float).idxmax()]['intensity']
max_intensity_file_id = int(intensities.loc[intensities['intensity'].astype(float).idxmax()]['file_id'])
final_df.loc[compound_idx, 'max_intensity_file'] = file_names[max_intensity_file_id]
final_df.loc[compound_idx, 'ms1_rt_peak'] = dataset[max_intensity_file_id][compound_idx]['data']['ms1_summary']['rt_peak']
else:
final_df.loc[compound_idx, 'max_intensity'] = ""
final_df.loc[compound_idx, 'max_intensity'] = np.nan
final_df.loc[compound_idx, 'max_intensity_file'] = ""
final_df.loc[compound_idx, 'ms1_rt_peak'] = ""
final_df.loc[compound_idx, 'ms1_rt_peak'] = np.nan
if file_idxs != []:
final_df.loc[compound_idx, 'msms_file'] = file_names[file_idxs[0]]
final_df.loc[compound_idx, 'msms_rt'] = float("%.2f" % rt_list[0])
Expand All @@ -317,7 +352,7 @@ def make_stats_table(input_fname: Optional[Path] = None, input_dataset = [], msm
if all(isinstance(x, (int, float)) for x in quality_scores):
final_df = calculate_compound_total_score(final_df, compound_idx, quality_scores)
else:
final_df.loc[compound_idx, 'total_score'] = ""
final_df.loc[compound_idx, 'total_score'] = np.nan
final_df.loc[compound_idx, 'msi_level'] = ""
else: # When single matching fragment ion is not the precursor, set score to best.
logger.info("Notice! Single matching MSMS fragment ion %s is not within ppm tolerance (%s) of the precursor mass (%s) for %s. Setting MSMS score to the best score of %s.", single_matching_ion, ppm_tolerance, precursor_mass, final_df.loc[compound_idx, 'identified_metabolite'], scores[0])
Expand All @@ -328,10 +363,10 @@ def make_stats_table(input_fname: Optional[Path] = None, input_dataset = [], msm
final_df.loc[compound_idx, 'msms_score'] = float("%.4f" % scores[0])
else:
final_df.loc[compound_idx, 'msms_file'] = ""
final_df.loc[compound_idx, 'msms_rt'] = ""
final_df.loc[compound_idx, 'msms_numberofions'] = ""
final_df.loc[compound_idx, 'msms_rt'] = np.nan
final_df.loc[compound_idx, 'msms_numberofions'] = np.nan
final_df.loc[compound_idx, 'msms_matchingions'] = ""
final_df.loc[compound_idx, 'msms_score'] = ""
final_df.loc[compound_idx, 'msms_score'] = np.nan
final_df.loc[compound_idx, 'mz_adduct'] = cid.mz_references[0].adduct
final_df.loc[compound_idx, 'mz_theoretical'] = float("%.4f" % mz_theoretical)
final_df.loc[compound_idx, 'mz_measured'] = float("%.4f" % avg_mz_measured)
Expand Down

0 comments on commit 20169af

Please sign in to comment.