-
Notifications
You must be signed in to change notification settings - Fork 0
/
biopython_cook.py
executable file
·48 lines (39 loc) · 1.24 KB
/
biopython_cook.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
#! /usr/bin/env python
"""
A module with classical bioinfo tasks using Biopython.
So far:
- Getting a fasta file from a list of GIs
"""
from Bio import Entrez
from Bio import SeqIO
import csv
import sys
# Contact e-mail address handed to Bio.Entrez for its NCBI requests.
# NOTE(review): '[email protected]' looks like a scrubbed placeholder rather
# than a real address -- confirm and set a valid e-mail before running.
Entrez.email = '[email protected]'
#-------------------------------------------------------------------------------
def getGIs(gis_csv, col_num=2):
    """
    Get the GIs from a csv file, specifying the number of the column
    where to find them (col index starting at 0).

    The first row of the file is treated as a header and skipped.
    Blank lines are ignored, and an empty file yields an empty list.

    gis_csv -- path to the csv file to read
    col_num -- 0-based index of the column holding the GIs (default 2)

    Returns the GIs as a list of strings, in file order.
    Raises IndexError if a non-blank row has fewer than col_num + 1 columns.
    """
    with open(gis_csv, 'r') as f:
        reader = csv.reader(f)
        # Skip the header through the reader (not the raw file) so a quoted
        # header spanning several physical lines is consumed as one row; the
        # None default avoids StopIteration on an empty file.
        next(reader, None)
        # csv.reader yields [] for blank lines; drop those instead of
        # failing on row[col_num].
        return [row[col_num] for row in reader if row]
#-------------------------------------------------------------------------------
def getFastaFromGIs(gis_list):
    """
    From a list of GIs (as string) get a multifasta.

    Fetches the records from the NCBI 'protein' database through
    Entrez.efetch and writes them to standard output, one header line
    ('>' + id, a space, then the description) followed by the sequence.

    gis_list -- list of GIs (strings) to fetch; an empty list writes
                nothing and makes no request

    Returns None.
    """
    # Nothing to fetch: skip the remote request entirely.
    if not gis_list:
        return

    # Get data from NCBI
    handle = Entrez.efetch(db='protein', id=gis_list,
                           rettype='fasta', retmode='text')
    try:
        for record in SeqIO.parse(handle, 'fasta'):
            # sys.stdout.write works on both Python 2 and 3 and produces the
            # same text the former print statements did.
            # NOTE(review): SeqIO's FASTA parser appears to keep the id as the
            # first word of record.description, so the id may show up twice on
            # the header line -- kept as-is to preserve the existing output;
            # confirm whether that is intended.
            sys.stdout.write('>%s %s\n' % (record.id, record.description))
            sys.stdout.write('%s\n' % record.seq)
    finally:
        # Release the connection even if parsing or writing fails.
        handle.close()
#-------------------------------------------------------------------------------
if __name__ == '__main__':
    # Needs at least one argument: the path to the csv file holding the GIs.
    # Exit with a usage message instead of an IndexError traceback.
    if len(sys.argv) < 2:
        sys.exit('Usage: %s <gis.csv>' % sys.argv[0])
    gis = getGIs(sys.argv[1])
    getFastaFromGIs(gis)