-
Notifications
You must be signed in to change notification settings - Fork 0
/
process_email_output_c.py
126 lines (99 loc) · 3.69 KB
/
process_email_output_c.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# Usnish Majumdar, last updated 1/12/16
# SCRIPT #2
# For a gmail account, this script will programmatically log into your gmail account
# and extract cascleave results generated by the webserver over at:
# http://sunflower.kuicr.kyoto-u.ac.jp/~sjn/Cascleave/webserver.html
# It then searches a CSV file of proteomics hits to find fragments that match
# putative caspase cleavage sites. These fragments are then listed along with
# the proteomics protein hit next to the original search protein description
# and sequence.
# USAGE:
# $python process_email_output_c.py
# INPUTS:
# output.fasta is a file that includes all the protein identifiers and sequences
# obtained from NCBI.
# Unique_FDR_CC.txt is a tab-delimited file that contains sequence fragments
# obtained from proteomics analysis.
# OUTPUTS:
# found_sites_unique_FDR_CC.txt lists protein names, followed by their sequences,
# followed by predicted cleavage hits, followed by matches in the proteomics fragments.
import sys
import imaplib
import getpass
import email
import datetime
import io
import csv
import string
# Gmail address to log into (redacted; fill in before running).
email_id = '****@*****.***'

# Parallel lists: identifiers[i] is the FASTA header (">...") line that
# describes sequences[i].
identifiers = []
sequences = []

# Read protein identifiers and sequences from the FASTA file.
# NOTE(review): assumes each sequence occupies a single line; a multi-line
# FASTA record would desynchronize the two parallel lists — confirm upstream.
with open('output.fasta', 'r') as fasta_file:
    for line in fasta_file:
        line = line.rstrip()
        if not line:
            continue  # skip blank lines; indexing line[0] would raise IndexError
        if line[0] == '>':
            identifiers.append(line)
        else:
            sequences.append(line)
# PROCESS CASPASE FRAGMENTS
# Map each proteomics peptide fragment (column 3) to its protein hit
# description (column 6) from the tab-delimited proteomics export.
# NOTE: 'rU' mode was removed in Python 3.11; the csv docs recommend
# opening with newline='' instead.
fragment_dict = {}
with open("Unique_FDR_CC.txt", newline='') as tsvfile:
    tsvreader = csv.reader(tsvfile, delimiter="\t")
    for row in tsvreader:
        fragment_dict[row[3]] = row[6]

# Clean up entries: strip commas from the hit descriptions, and drop
# lowercase characters (modification markers) from the peptide keys.
# str.translate(None, ...) is Python 2 only; Python 3 needs a
# str.maketrans deletion table.
_strip_lowercase = str.maketrans('', '', string.ascii_lowercase)
for key in list(fragment_dict.keys()):
    value = fragment_dict.pop(key).replace(',', '')
    fragment_dict[key.translate(_strip_lowercase)] = value
# EMAIL PROCESS
####################################################################################################################################
# IMAP CODE SOURCES
# http://www.voidynullness.net/blog/2013/07/25/gmail-email-with-python-via-imap/
# https://yuji.wordpress.com/2011/06/22/python-imaplib-imap-example-with-gmail/
mail = imaplib.IMAP4_SSL('imap.gmail.com')
f4 = open('found_sites_unique_FDR_CC.txt', 'w')
try:
    # Prompt for the password interactively so it is never stored on disk.
    mail.login(email_id, getpass.getpass())
except imaplib.IMAP4.error:  # was imaplib.ImailAP4.error — AttributeError, never caught
    print("LOGIN FAILED!!! ")
    # Without a mailbox session nothing below can work; stop here.
    f4.close()
    sys.exit(1)
mail.select("inbox")
# Find all Cascleave result emails by their fixed subject line.
result, mail_ids = mail.uid('search', None, '(HEADER Subject "The predicted result of your submitted sequence")')
# mail_ids[0] is a space-separated bytes string of UIDs; bytes.split(' ')
# raises TypeError in Python 3, and splitting an empty result must yield []
# (not ['']) so the fetch loop is skipped cleanly.
mail_ids = mail_ids[0].split()
raw_emails = []
for mail_id in mail_ids:
    result, data = mail.uid('fetch', mail_id, "(RFC822)")
    # data[0][1] is the raw RFC822 payload as bytes; decode once here so
    # downstream substring matching and io.StringIO work on str.
    raw_emails.append(data[0][1].decode('utf-8', errors='replace'))
for raw_email in raw_emails:
    # Tolerate either bytes (raw IMAP payload) or an already-decoded str;
    # str-in-bytes and io.StringIO(bytes) both raise TypeError in Python 3.
    if isinstance(raw_email, bytes):
        raw_email = raw_email.decode('utf-8', errors='replace')

    # Match this result email back to the submitted protein by searching
    # for the first 60 residues of each known sequence in the message body.
    identifier = 'blank'
    sequence = 'blank'
    for index, seq in enumerate(sequences):
        if seq[0:60] in raw_email:
            identifier = identifiers[index]
            sequence = seq

    print(identifier, file=f4)
    print(sequence, file=f4)

    # Extract predicted cleavage sites and write matches to the output file.
    # Lines containing '*' mark a predicted cleavage; the text after the
    # last '*' is the putative new N-terminal fragment.
    for line in io.StringIO(raw_email):
        if "*" not in line:
            continue
        print(line, end=' ', file=f4)
        n_term = line.rstrip().split('*')[-1]
        # Report every proteomics fragment that begins with this N-terminus.
        count = 0
        for key in fragment_dict:
            if key.startswith(n_term):
                print('\t\t\t\t\t', key, fragment_dict[key], file=f4)
                count += 1
        if count == 0:
            print('\t\t\t\t\t', "No n-term fragments found.", file=f4)
f4.close()