-
Notifications
You must be signed in to change notification settings - Fork 0
/
fasta2pylibseq.py
84 lines (70 loc) · 2.5 KB
/
fasta2pylibseq.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 14 16:32:37 2017
@author: scott
"""
import numpy as np
import pickle
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('INfasta', metavar="INfasta", type=str,
help='path to fasta file')
parser.add_argument('-s', '--seqs', type=int, required=True,
help="number of sequences")
parser.add_argument('-l', "--seqlength", type=int, required=True,
help="length of sequence")
parser.add_argument('-p', '--popsizelist', nargs='+', type=int, required=True,
help="population list")
args = parser.parse_args()
class MSgenotype(object):
def __init__(self, haplotype=[], positions=[], names=[], popsizelist=[]):
self.haps = haplotype
self.pos = positions
self.names = names
self.pops = popsizelist
def fasta2pylibseq(fastaFile, numberseqs, lensequence, popsizelist):
'''take a fasta alignmnet with out group as first sequence and convert it
to ms-type format.
Parameters
----------
fasta : file
alignment in fasta format
numberseq : int
number of sequences to expect for array size not including reference
lensequence : int
length of sequence
Returns
-------
gtbinary : object
genotype object of genotypes in strings
positions : array
list of mutation positions 0 - 1
'''
seqnames = []
positions = np.arange(0.0, 1.0, 1.0/lensequence)
genotypes = np.zeros(shape=[numberseqs, len(positions)])
tmpfasta = []
with open("{}".format(fastaFile), 'r') as fasta_aln:
for line in fasta_aln:
if line.startswith(">"):
seqnames.append(line[1:].strip())
else:
tmpfasta.append(line.strip())
refseq = tmpfasta[0]
for ind, seq in enumerate(tmpfasta[1:]):
try:
muts = [i for i in range(len(seq)) if refseq[i] != seq[i]]
except IndexError:
pass
genotypes[ind][muts] = 1
fastaname = MSgenotype(genotypes, positions, seqnames[1:], popsizelist)
with open('{}.pkl'.format('fastaout'), 'wb') as output:
pickler = pickle.Pickler(output, -1)
fastaout = MSgenotype(fastaname.haps, fastaname.pos, fastaname.names,
fastaname.pops)
pickler.dump(fastaout)
del fastaout
return(None)
if __name__ == '__main__':
fasta2pylibseq(args.INfasta, args.seqs, args.seqlength, args.popsizelist)