-
Notifications
You must be signed in to change notification settings - Fork 0
/
fastaparser.py
39 lines (30 loc) · 1.4 KB
/
fastaparser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
#!/usr/bin/python
import pandas as pd
import argparse
from Bio import SeqIO
def read_wanted_ids(path):
    """Return the set of record identifiers listed in the file at *path*.

    The file is read as one identifier per line. Surrounding whitespace
    (including a trailing carriage return from Windows line endings) is
    stripped and blank lines are ignored, so a trailing newline at the end
    of the file does not produce an empty identifier.
    """
    with open(path, "r") as handle:
        stripped_lines = (line.strip() for line in handle)
        # A set gives constant-time membership tests in the per-record loop.
        return {line for line in stripped_lines if line}


def output_prefix(records_path):
    """Return *records_path* without a trailing ".txt" extension.

    Only a suffix is removed; a ".txt" occurring earlier in the path (for
    example in a directory name) is left untouched. Paths that do not end
    in ".txt" are returned unchanged.
    """
    suffix = ".txt"
    if records_path.endswith(suffix):
        return records_path[:-len(suffix)]
    return records_path


def split_records(records, wanted_ids):
    """Partition *records* by whether their identifier is in *wanted_ids*.

    Each record must expose ``id``, ``name`` and ``seq`` attributes (as
    Biopython ``SeqRecord`` objects do). Input order is preserved.

    Returns a 4-tuple ``(selected, remaining, selected_rows,
    remaining_rows)``: the first two are lists of the records themselves,
    the last two are lists of ``{'record': name, 'length': len(seq)}``
    dicts describing the records in the corresponding list.
    """
    selected, remaining = [], []
    selected_rows, remaining_rows = [], []
    for record in records:
        row = {'record': record.name, 'length': len(record.seq)}
        # One membership decision drives both the sequence output and the
        # lengths table, so the two can never disagree about a record.
        if record.id.split(" ")[0] in wanted_ids:
            selected.append(record)
            selected_rows.append(row)
        else:
            remaining.append(record)
            remaining_rows.append(row)
    return selected, remaining, selected_rows, remaining_rows


def write_lengths(rows, path):
    """Write *rows* to *path* as a tab-separated table with a header line.

    The table has the columns ``record`` and ``length``. An empty *rows*
    list still produces a file containing only the header.
    """
    # Build the frame in one go; DataFrame.append was removed in pandas 2.0
    # and copied the whole frame on every call.
    table = pd.DataFrame(rows, columns=['record', 'length'])
    table.to_csv(path, index=False, sep='\t', header=True)


def main():
    """Split a FASTA file into listed and unlisted records.

    Reads the identifiers from the ``records`` argument, parses the
    ``fasta`` argument, and writes four files next to the records file:
    ``<prefix>_records.fasta`` / ``<prefix>_records.lengths`` for records
    whose identifier is listed, and ``<prefix>_remaining.fasta`` /
    ``<prefix>_remaining.lengths`` for all other records, where
    ``<prefix>`` is the records path without a trailing ".txt".
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("fasta", help=".fasta file to parse.")
    parser.add_argument("records", help="File containing fasta headers to isolate.")
    args = parser.parse_args()

    wanted_ids = read_wanted_ids(args.records)
    selected, remaining, selected_rows, remaining_rows = split_records(
        SeqIO.parse(args.fasta, "fasta"), wanted_ids
    )

    prefix = output_prefix(args.records)
    outfile1 = prefix + "_records"
    outfile2 = prefix + "_remaining"

    SeqIO.write(selected, outfile1 + ".fasta", "fasta")
    SeqIO.write(remaining, outfile2 + ".fasta", "fasta")
    write_lengths(selected_rows, outfile1 + ".lengths")
    write_lengths(remaining_rows, outfile2 + ".lengths")

    print("fastaparser finished parsing.")


if __name__ == "__main__":
    main()