-
Notifications
You must be signed in to change notification settings - Fork 0
/
process_email_output_c.py
126 lines (99 loc) · 3.69 KB
/
process_email_output_c.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# Usnish Majumdar, last updated 1/12/16
# SCRIPT #2
# For a gmail account, this script will programmatically log into your gmail account
# and extract cascleave results generated by the webserver over at:
# http://sunflower.kuicr.kyoto-u.ac.jp/~sjn/Cascleave/webserver.html
# It then searches a CSV file of proteomics hits to find fragments that match
# putative caspase cleavage sites. These fragments are then listed along with
# the proteomics protein hit next to the original search protein description
# and sequence.
# USAGE:
# $python process_email_output_c.py
# INPUTS:
# output.fasta is a file that includes all the protein identifiers and sequences
# obtained from NCBI.
# Unique_FDR_CC.txt is a tab-delimited file that contains sequence fragments
# obtained from proteomics analysis.
# OUTPUTS:
# found_sites_unique_FDR_CC.txt lists protein names, followed by their sequences,
# followed by predicted cleavage hits, followed by matches in the proteomics fragments.
import sys
import imaplib
import getpass
import email
import datetime
import io
import csv
import string
# Gmail address to log into (redacted; fill in before running).
email_id = '****@*****.***'

# Parallel lists: identifiers[i] is the FASTA header (">...") line that
# describes sequences[i].
identifiers = []
sequences = []

# Read protein identifiers and sequences from the FASTA file.
# NOTE(review): assumes each sequence occupies a single line; a multi-line
# FASTA record would desynchronize the two parallel lists — confirm upstream.
with open('output.fasta', 'r') as fasta_file:
    for line in fasta_file:
        line = line.rstrip()
        if not line:
            continue  # skip blank lines; indexing line[0] would raise IndexError
        if line[0] == '>':
            identifiers.append(line)
        else:
            sequences.append(line)
# PROCESS CASPASE FRAGMENTS
# Map each proteomics peptide fragment (column 3) to its protein hit
# description (column 6) from the tab-delimited proteomics export.
# NOTE: 'rU' mode was removed in Python 3.11; the csv docs recommend
# opening with newline='' instead.
fragment_dict = {}
with open("Unique_FDR_CC.txt", newline='') as tsvfile:
    tsvreader = csv.reader(tsvfile, delimiter="\t")
    for row in tsvreader:
        fragment_dict[row[3]] = row[6]

# Clean up entries: strip commas from the hit descriptions, and drop
# lowercase characters (modification markers) from the peptide keys.
# str.translate(None, ...) is Python 2 only; Python 3 needs a
# str.maketrans deletion table.
_strip_lowercase = str.maketrans('', '', string.ascii_lowercase)
for key in list(fragment_dict.keys()):
    value = fragment_dict.pop(key).replace(',', '')
    fragment_dict[key.translate(_strip_lowercase)] = value
# EMAIL PROCESS
####################################################################################################################################
# IMAP CODE SOURCES
# http://www.voidynullness.net/blog/2013/07/25/gmail-email-with-python-via-imap/
# https://yuji.wordpress.com/2011/06/22/python-imaplib-imap-example-with-gmail/
mail = imaplib.IMAP4_SSL('imap.gmail.com')
f4 = open('found_sites_unique_FDR_CC.txt', 'w')
try:
    # Prompt for the password interactively so it is never stored on disk.
    mail.login(email_id, getpass.getpass())
except imaplib.IMAP4.error:  # was imaplib.ImailAP4.error — AttributeError, never caught
    print("LOGIN FAILED!!! ")
    # Without a mailbox session nothing below can work; stop here.
    f4.close()
    sys.exit(1)
mail.select("inbox")
# Find all Cascleave result emails by their fixed subject line.
result, mail_ids = mail.uid('search', None, '(HEADER Subject "The predicted result of your submitted sequence")')
# mail_ids[0] is a space-separated bytes string of UIDs; bytes.split(' ')
# raises TypeError in Python 3, and splitting an empty result must yield []
# (not ['']) so the fetch loop is skipped cleanly.
mail_ids = mail_ids[0].split()
raw_emails = []
for mail_id in mail_ids:
    result, data = mail.uid('fetch', mail_id, "(RFC822)")
    # data[0][1] is the raw RFC822 payload as bytes; decode once here so
    # downstream substring matching and io.StringIO work on str.
    raw_emails.append(data[0][1].decode('utf-8', errors='replace'))
for raw_email in raw_emails:
    # Tolerate either bytes (raw IMAP payload) or an already-decoded str;
    # str-in-bytes and io.StringIO(bytes) both raise TypeError in Python 3.
    if isinstance(raw_email, bytes):
        raw_email = raw_email.decode('utf-8', errors='replace')

    # Match this result email back to the submitted protein by searching
    # for the first 60 residues of each known sequence in the message body.
    identifier = 'blank'
    sequence = 'blank'
    for index, seq in enumerate(sequences):
        if seq[0:60] in raw_email:
            identifier = identifiers[index]
            sequence = seq

    print(identifier, file=f4)
    print(sequence, file=f4)

    # Extract predicted cleavage sites and write matches to the output file.
    # Lines containing '*' mark a predicted cleavage; the text after the
    # last '*' is the putative new N-terminal fragment.
    for line in io.StringIO(raw_email):
        if "*" not in line:
            continue
        print(line, end=' ', file=f4)
        n_term = line.rstrip().split('*')[-1]
        # Report every proteomics fragment that begins with this N-terminus.
        count = 0
        for key in fragment_dict:
            if key.startswith(n_term):
                print('\t\t\t\t\t', key, fragment_dict[key], file=f4)
                count += 1
        if count == 0:
            print('\t\t\t\t\t', "No n-term fragments found.", file=f4)
f4.close()