From b3e8c54c81b74b9e9e5c5bcfc6bfc905e07cdcd8 Mon Sep 17 00:00:00 2001 From: Jorrit Boekel Date: Tue, 22 Aug 2023 15:36:17 +0200 Subject: [PATCH] Fix off-by-one error in PTM protein site nr reporting #17 --- bin/luciphor_parse.py | 4 +-- bin/luciphor_prep.py | 57 +++++++++++++++++++++++++------------------ 2 files changed, 35 insertions(+), 26 deletions(-) diff --git a/bin/luciphor_parse.py b/bin/luciphor_parse.py index a719088..4caeaec 100755 --- a/bin/luciphor_parse.py +++ b/bin/luciphor_parse.py @@ -139,9 +139,9 @@ def annotate_protein_and_flanks(psm, ptmpsm, tdb, ptmnames): continue protseq = tdb[p].seq protptms = [] - site_protlocs = [ptm['site'][1] + x for x in peplocs] + site_protlocs = [ptm['site_report'] + x for x in peplocs] protlocs = '/'.join([str(x) for x in site_protlocs]) - protptms.append(f'{ptm["site"][0]}{protlocs}') + protptms.append(f'{ptm["aa"]}{protlocs}') flankpos = [(max(x-7, 0) , min(x+8, len(protseq))) for x in site_protlocs] flankseqs.update([str(protseq[x[0]:x[1]]) for x in flankpos]) proteins_loc[p].append('{}:{}'.format(ptm['name'], ','.join(protptms))) diff --git a/bin/luciphor_prep.py b/bin/luciphor_prep.py index 3a4920e..d923813 100755 --- a/bin/luciphor_prep.py +++ b/bin/luciphor_prep.py @@ -183,6 +183,13 @@ def lucimass_mod_dict(self): class PSM: + '''A PSM class containing mods, scores, FLR, etc + Mods are defined as dicts, and apart from other info like aa, type, mass, etc, + they contain two keys, site_lucin, site_report which are zero resp. one-based + residue indices for luciphor input (zero based) and reporting to PSM tables + (one based) + ''' + def __init__(self): self.mods = [] self.top_flr = False @@ -192,6 +199,24 @@ def __init__(self): self.sequence = False self.seq_in_scorepep_fmt = False + + def get_modtype(self, mod, labileptmnames, stableptmnames): + if not mod['var']: + mtype = 'fixed' + elif mod['name_lower'] in labileptmnames: + mtype = 'labile' + elif mod['name_lower'] in stableptmnames: + mtype = 'stable' + else: + mtype = 'variable' + return mtype + + def get_mod_dict(self, residue, sitenum, modptm, labileptmnames, stableptmnames): + return {'aa': residue, 'site_lucin': sitenum, 'site_report': sitenum + 1, + 'type': self.get_modtype(modptm, labileptmnames, stableptmnames), + 'mass': modptm['mass'], 'name': modptm['name'], + 'name_lower': modptm['name_lower'], 'adjusted_mass': modptm['adjusted_mass']} + def parse_msgf_peptide(self, msgfseq, msgf_mods, labileptmnames, stableptmnames): self.mods = [] barepep = '' @@ -210,25 +235,11 @@ def parse_msgf_peptide(self, msgfseq, msgf_mods, labileptmnames, stableptmnames) start = x.end() for mass in re.findall('[\+\-][0-9.]+', x.group(2)): mod = msgf_mods[float(mass)][0] # only take first, contains enough info - self.mods.append({ - 'site': (residue, sitenum), 'type': self.get_modtype(mod, labileptmnames, stableptmnames), - 'mass': mod['mass'], 'name': mod['name'], 'name_lower': mod['name_lower'], - 'adjusted_mass': mod['adjusted_mass'] - }) + self.mods.append(self.get_mod_dict(residue, sitenum, mod, labileptmnames, + stableptmnames)) self.sequence = f'{barepep}{msgfseq[start:]}' - def get_modtype(self, mod, labileptmnames, stableptmnames): - if not mod['var']: - mtype = 'fixed' - elif mod['name_lower'] in labileptmnames: - mtype = 'labile' - elif mod['name_lower'] in stableptmnames: - mtype = 'stable' - else: - mtype = 'variable' - return mtype - - def parse_luciphor_peptide(self, luciline, ptms_map, labileptms, stabileptms): + def parse_luciphor_peptide(self, luciline, ptms_map, labileptmnames, stableptmnames): '''From a luciphor sequence, create a peptide with PTMs ptms_map = {f'{residue}int(79 + mass_S/T/Y)': {'name': Phospho, etc} ''' @@ -243,13 +254,11 @@ def parse_luciphor_peptide(self, luciline, ptms_map, labileptms, stabileptms): barepep += modpep[start:x.start()+1] start = x.end() ptm = ptms_map[f'{x.group(1)}{int(x.group(2))}'] - if ptm['name_lower'] in labileptms: + if ptm['name_lower'] in labileptmnames: sitenum = len(barepep) - 1 if len(barepep) else -100 residue = barepep[-1] if len(barepep) else '[' - self.mods.append({ - 'site': (residue, sitenum), 'type': self.get_modtype(ptm, labileptms, stabileptms), - 'mass': ptm['mass'], 'name': ptm['name'], 'name_lower': ptm['name_lower'], - }) + self.mods.append(self.get_mod_dict(residue, sitenum, ptm, labileptmnames, + stableptmnames)) self.sequence = f'{barepep}{modpep[start:]}' self.seq_in_scorepep_fmt = re.sub(r'([A-Z])\[[0-9]+\]', lambda x: x.group(1).lower(), modpep) @@ -273,7 +282,7 @@ def luciphor_input_sites(self): lucimods = [] for m in self.mods: if m['type'] != 'fixed': - lucimods.append((m['site'][1], str(m['mass'] + aa_weights_monoiso[m['site'][0]]))) + lucimods.append((m['site_lucin'], str(m['mass'] + aa_weights_monoiso[m['aa']]))) return ','.join([f'{x[0]}={x[1]}' for x in lucimods]) def add_ptms_from_psm(self, psmmods): @@ -288,7 +297,7 @@ def topptm_output(self): for ptm in self.mods: if ptm['type'] not in output_types: continue - site = f'{ptm["site"][0]}{ptm["site"][1] + 1}' + site = f'{ptm["aa"]}{ptm["site_report"]}' try: ptmsites[ptm['name']].append(site) except KeyError: