-
Notifications
You must be signed in to change notification settings - Fork 10
/
fasta2dfoil.py
executable file
·137 lines (119 loc) · 5.08 KB
/
fasta2dfoil.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
DFOIL: Directional introgression testing a five-taxon phylogeny
James B. Pease
http://www.github.com/jbpease/dfoil
fasta2dfoil -
This script takes one or more FASTA files containing
5 or 4 taxa and counts site patterns for use in DFOIL/Dstat analysis.
To combine multiple FASTA files, each file should be sequences
from one locus (i.e., one entry in the final table) and
the names of sequences must be identical in all files.
Example Usage:
python3 fasta2dfoil.py INPUT.fasta --out OUTPUT.fasta \
--names TAXA1,TAXA2,TAXA3,TAXA4
"""
from __future__ import print_function
import sys
import argparse
from itertools import groupby
_LICENSE = """
If you use this software please cite:
Pease JB and MW Hahn. 2015.
"Detection and Polarization of Introgression in a Five-taxon Phylogeny"
Systematic Biology. 64 (4): 651-662.
http://www.dx.doi.org/10.1093/sysbio/syv023
This file is part of DFOIL.
DFOIL is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
DFOIL is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with DFOIL. If not, see <http://www.gnu.org/licenses/>.
"""
def fasta_iter(fasta_name):
"""
given a fasta file. yield tuples of header, sequence
Adapted from https://github.com/brentp
"""
filehandler = open(fasta_name, 'r')
faiter = (x[1] for x in groupby(filehandler, lambda line: line[0] == ">"))
for header in faiter:
header = next(header)[1:].strip()
seq = "".join(s.strip() for s in next(faiter))
yield header, seq
def generate_argparser():
parser = argparse.ArgumentParser(
prog="fasta2dfoil.py",
description=__doc__,
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
epilog=_LICENSE)
parser.add_argument("fastafile", nargs='*',
help="""one or more input fasta
files for each locus""")
parser.add_argument("--out", "-o", required=True,
help="""output count file, one entry per fasta""")
parser.add_argument("--names", "-n", nargs=1, required=True,
help="""Order of the 5 (or 4) taxa separated by commas.
Names must be consistent in all input files,
outgroup should be last""")
parser.add_argument("--version", action="version", version="2017-11-07",
help="display version information and quit")
return parser
def main(arguments=None):
"""Main method"""
arguments = arguments if arguments is not None else sys.argv[1:]
parser = generate_argparser()
args = parser.parse_args(args=arguments)
position = 0
taxa = args.names[0].split(',')
ntaxa = len(taxa)
if ntaxa is 4:
headers = ['AAAA', 'AABA', 'ABAA', 'ABBA',
'BAAA', 'BABA', 'BBAA', 'BBBA']
elif ntaxa is 5:
headers = ['AAAAA', 'AAABA', 'AABAA', 'AABBA',
'ABAAA', 'ABABA', 'ABBAA', 'ABBBA',
'BAAAA', 'BAABA', 'BABAA', 'BABBA',
'BBAAA', 'BBABA', 'BBBAA', 'BBBBA']
else:
raise RuntimeError("Invalid number of taxa, use 5 or 4")
with open(args.out, 'w') as outfile:
outfile.write("#chrom\tposition\t{}\n".format('\t'.join(headers)))
for infilename in args.fastafile:
site_count = {}
seqs = {}
for header, seq in fasta_iter(infilename):
seqs[header] = seq
if list(sorted(seqs.keys())) != list(sorted(taxa)):
raise RuntimeError(
"Error: Labels from {} ({}) do not match --names ({})".format(
infilename, seqs.keys(), taxa))
for label in seqs:
if len(seqs[label]) != len(seqs[taxa[0]]):
raise RuntimeError(
"Error: Sequences in {} are of unequal length ({})".format(
infilename, ",".join(["{}={}".format(
x, len(seqs[x])) for x in seqs])))
for i in range(len(list(seqs.values())[0])):
site = [str(seqs[name][i]).upper() for name in taxa]
if len(set(site)) > 2:
continue
if set(site) - set('ATGC'):
continue
site_code = ''.join(['A' if x == site[-1].upper() else 'B'
for x in site])
site_count[site_code] = site_count.get(site_code, 0) + 1
with open(args.out, 'a') as outfile:
outfile.write("{}\t{}\t{}\n".format(infilename, position,
'\t'.join([str(site_count.get(x, 0))
for x in headers])))
position += 1
return ''
if __name__ == "__main__":
main()