forked from wish1832/GISAID_Pipeline_Functions
-
Notifications
You must be signed in to change notification settings - Fork 0
/
MSA_Reader.py
1330 lines (1145 loc) · 71.6 KB
/
MSA_Reader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#The path to the biopython packages was not automatically added to anaconda. To allow import of the package, a manual path must be specified.
import os
#Absolute location:
#os.environ['CONDA_PREFIX']=/home/wish1832/anaconda3
#Code to append biopython location to path
os.environ['PATH'] = os.environ['PATH']+';'+os.environ['CONDA_PREFIX']+"/pkgs/biopython-1.78-py38h7b6447c_0/lib/python3.8/site-packages"
from Bio import AlignIO as msa_in
from Bio.Align import MultipleSeqAlignment as msa
import Bio.SeqRecord
import numpy as np
import pandas as pd
def initalize_file(outname):
"""
outname = the directory and prefix of the output file (string)
For example, setting outname "mydir/mutant" will create an outfile in the directory mydir with the name mutant_variants_raw.tsv.
Other reports generated by the output will also exist in the directory mydir and have the prefix "mutant".
Given an output file prefix, initialize_files() will create one file (outname_variants_raw.tsv)
containing raw data on the substitutions, insertions, and deletions relative to the reference genome found in the sequence alignment.
The function will print headers in each file in the order specified below:
Field 1: Cluster_ID (e.g. clust48)
Field 2: Number of sequences in cluster
Field 3: Type of variant (insertion, deletion, substitution)
Field 4: Mutation code (formatted based on the nomenclature indicated by the Human Genome Variation Society https://varnomen.hgvs.org/recommendations/protein/)
Field 5: Amino Acid(s) of Reference Strand
For insertions, this is the two amino acid residues the insertion ocurrs between.
For deletions, this will be the reference residue(s) deleted
Field 6: Variant Amino Acid Residue
For multi-codon insertions, this field will contain a string of multiple codons
For deletions, this will be blank
Field 7: Start position of variant on multiple sequence alignment
For insertions this will be the codon number on the multiple sequence alignment before the inserted codon(s)
Field 8: End position of variant on multiple sequence alignment
For insertions this will be the codon number on the multiple sequence alignment before the inserted codon(s)
For substitutions and single-base deletions this will be blank
Field 9: Start position of variant on reference strand
For insertions this will be the codon number on the reference strand before the inserted codon(s)
Field 10: End position of variant on reference strand
For insertions this will be the codon number on the reference strand before the inserted codon(s)
For substitutions and single-base deletions this will be blank
"""
#Create filename for raw output
raw_out = outname+"_variants_raw.tsv"
#Define header according to elements listed in docstring
header = ["Cluster_ID","Cluster_Size","Type","Code","Ref Residue(s)","Var Residue(s)","AA_Start(MSA)","AA_End(MSA)","AA_Start(Ref)","AA_End(Ref)"]
#Print header in output file
with open(raw_out,"w") as outfile:
outfile.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(header[0],header[1],header[2],header[3],header[4],header[5],header[6],header[7],header[8],header[9]))
"""Code below will prevent overwriting in final version.
if not file:
with open(file,"w") as file
else
print("Error: {} already exists. Overwriting prohibited.".format(file))"""
#Return the filename for use in other functions
return raw_out
def write_line(outfile,clustname,clustsize,var_type,code,ref_AA,var_AA,msa_start,ref_start,msa_end="-",ref_end="-"):
"""
Will write out to the variant .tsv values given the location of an output list with 6 Fields in the following format:
Field 1: Cluster_ID (e.g. clust48)
Field 2: Number of sequences in cluster
Field 3: Type of variant (insertion, deletion, substitution)
Field 4: Mutation code (formatted based on the nomenclature indicated by the Human Genome Variation Society https://varnomen.hgvs.org/recommendations/protein/)
Field 5: Amino Acid(s) of Reference Strand
For insertions, this is the two amino acid residues the insertion ocurrs between.
For deletions, this will be the reference residue(s) deleted
Field 6: Variant Amino Acid Residue
For multi-codon insertions, this field will contain multiple columns
For deletions, this will be blank
Field 7: Start position of variant on multiple sequence alignment
For insertions this will be the codon number on the multiple sequence alignment before the inserted codon(s)
Field 8: End position of variant on multiple sequence alignment
For insertions this will be the codon number on the multiple sequence alignment before the inserted codon(s)
For substitutions and single-base deletions this will be blank
Field 9: Start position of variant on reference strand
For insertions this will be the codon number on the reference strand before the inserted codon(s)
Field 10: End position of variant on reference strand
For insertions this will be the codon number on the reference strand before the inserted codon(s)
For substitutions and single-base deletions this will be blank
"""
#Write to output file
if outfile:
with open(outfile,"a") as file:
file.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(clustname,clustsize,var_type,code,ref_AA,var_AA,msa_start,msa_end,ref_start,ref_end))
#Raise a FileNotFoundError if the output file has not been initialized
else:
raise FileNotFoundError("{} does not exist.".format(outfile))
def reference_position(reference):
"""
For Variant Code Creation, the positions must correspond to the reference sequence, rather than the alignent.
Given the reference genome, reference_position() will return a list of integers that stores the reference genome position corresponding to the msa position.
If there is no base in the reference genome at a given position in the msa (when a "-" is in the reference genome), a blank is stored in this list.
"""
ref_list=[]
pos=1
for i in reference:
if i != "-":
ref_list.append(pos)
pos += 1
else:
ref_list.append("-")
return ref_list
def msa_iterator(alignment,raw_out,reference,ref_index,outfile):
"""
alignment=a biopython multiple sequence alignment object
raw_out=filename for outputting raw data
reference=Reference strand to which sequences are compared (biopython RefSeq object)
ref_list=A list relating an amino acid position on a multiple sequence alingment to its corresponding position on the reference strand
i_begin=Amino acid position on multiple sequence alignment at which to begin iteration (allows for skipping over multi-base insertions and deletions)
j_begin=Alignment object at which to begin
msa_iterator() will iterate through the multiple sequence alignment object and compare bases to the reference genome.
Insertions, deletions, and substitutions will be detected and handled by separate functions based on type.
i is the amino acid position, j is the sequence position in the MSA (origional iterated through AA position first)
"""
#Every sequence in the alignment will be iteratively compared to the reference strand
#Multi-base deletions and insertions require a loop in which iterables can be edited
#A while loop will be used instead of a for loop to iterate through all i codons in j genomes.
#Begin with i and j equal to zero
i,j=0,0
#While loop will run until the last i position of the last j strain is tested.
while not (j==len(alignment)-1 and i==len(reference)):
#Status report: when at the first index of a genome, print progress (current genome/#of genomes in msa)
if i==0:
print("\rExamining genome {0} of {1}. Percent complete {2:.0%}".format(j+1,len(alignment),j/(len(alignment)-1)),end="")
#When all i codons in a genome have been iterated through, the genome index is advanced by one and the i is reset to zero.
if i==len(reference):
j+=1
i=0
else:
#For every codon, test the below conditionals
#Conditional to detect a substitution
if reference[i] != alignment[j,i] and reference[i]!="-" and alignment[j,i]!="-":
substitution_case(i,j,alignment,reference,ref_index,outfile)
#Move on to next codon
i+=1
#Conditional to detect a deletion
elif (reference[i] != alignment[j,i]) and alignment[j,i]=="-":
#Call the deletion_case function, which will check for a multi-base deletion and return the value i to resume iteration from
i=deletion_case(j,i,alignment,reference,ref_index,outfile)
#Conditional to detect an insertion
elif reference[i] != alignment[j,i] and reference[i]=="-":
i=insertion_case(j,i,alignment,reference,ref_index,outfile)
#insertion_case will set i to the next codon automatically
#If there is no variant, increase i by one to test the next codon.
elif reference[i]==alignment[j,i]:
i+=1
#Test for unanticipated scenarios
else:
print("A codon comparison has been found that does not meet any conditionals in the msa_iterator() loop.")
def substitution_case(i,j,alignment,reference,ref_index,outfile):
"""
Will activate when a substution is detected during iteration through the multiple sequence alignment.
"""
#Store position of variant in MSA (codon position is one more than the index)
msa_pos=i+1
#Store position of variant in reference genome
ref_pos=ref_index[i]
#Store codon in position i of reference genome
ref_AA=reference[i]
#Store the variant codon
var_AA=alignment[j,i] #Retrieves the codon from the sequence at position j, at position i
#Store the name of the cluster associated with the variant sequence (sequence index j)
#The cluster name is in the id of the RefSeq object, before the ";".
clustname=alignment[j].id.split(sep=";")[0]
#Retrieve size of cluster associated with the variant sequence j
#The cluster size is printed betweeen ";" and "=" in id
clustsize=alignment[j].id.split(sep="=")[1].split(sep=";")[0]
#Store the substitution code
#Based on the sequence variant nomenclature suggested by the Human Genome Variation Society,
#The code format is <Reference_Amino_Acid><Codon_Position(Reference Genome)><Variant_Amino_Acid>
#Source (https://varnomen.hgvs.org/recommendations/protein/variant/substitution/)
code="{}{}{}".format(ref_AA,ref_pos,var_AA)
#Write above information to file
write_line(outfile,clustname,clustsize,"sub",code,ref_AA,var_AA,msa_pos,ref_pos,"-","-")
#Once information is stored, return to iteration at the next base (returns to the For loop in the parent function)
def deletion_case(j,i,alignment,reference,ref_index,outfile):
"""
Will activate when a deletion is detected during iteration through the multiple sequence alignment.
Since deletions may span two or more bases and nomenclature counts multi-base deletions as a single event,
iteration through the multiple sequence alignment in the parent msa_iterator() function must be skipped ahead
to the end of the detion region to avoid a multiple-codon deletion being read as multiple instances of a single-base deletion.
After recording a deletion, msa_iterator() is restarted at the position i corresponding to the codon after the end of the deletion.
"""
#Raise an error if the function is triggered by a codon that is not deleted
if not (alignment[j,i]=="-" and reference[i]!="-"):
raise ValueError("Deletion case called on a codon region without a deletion")
#Or if the deletion case is not called at the beginning of a deletion region
#Code detects a deletion in the codon one index before the current i it is called on
elif i!=0 and ((reference[i-1]!=alignment[j,i-1]) and alignment[j,i-1]=="-"):
raise ValueError("Deletion case called in the middle of a multi-base deletion")
#Record the position at the beginning of a deletion
i_start=i
#Step 1: Iterate forward until extant codons are found on both the reference strand and genome j.
while True:
#If a deletion case is detected, or if both the reference genome and genome j have a blank at index i, try the next codon.
if ((alignment[j,i]=="-" and reference[i]!="-") or (alignment[j,i]=="-" and reference[i]=="-")):
#If the base currently being tested at the end of the genome, a special case is triggered
if i==len(reference)-1:
#Set i_Cflank as being non-existent (affects which index is returned to msa_iterator() at end of function)
i_Cflank=None
break #Break the while loop and continue to step 2.
else:
i+=1 #Increase index by one and continue to test the next codon
#When the C-flanking extant codon is found, record the index of the flank.
elif (alignment[j,i]!="-" and reference[i]!="-"):
#The index of the C-flanking codon will be returned to msa_iterator() as the position to resume variant checking.
i_Cflank=i
#Break the while loop and continue to step 2.
break
#Special case: it is possible for an insertion to ocurr with a deletion (delins event).
elif alignment[j,i]!="-" and reference[i]=="-":
#Run the delins case and store the index of the C-flanking codon to the event
i=delins_case(j,i_start,alignment,reference,ref_index,outfile)
#msa_iterator() will continue testing for variants at the index i
return i
#Step 2: Iterate backwards over any blanks that may exist until the end of the deletion region is reached.
#Iteration will begin at the C-flanking codon (or the end of the genome) and will include the start of the deletion region.
for i in range((i_Cflank if i_Cflank else len(reference)-1),i_start-1,-1):
#If a blank is discovered at the position, continue to previous codon.
if (reference[i]=="-" and alignment[j,i]=="-"):
continue
#If a deletion is found, stop iteration and record current position as i_end (end of deletion region).
elif (reference[i]!="-" and alignment[j,i]=="-"):
i_end=i
#Break the for loop and continue to step 3.
break
#Step 3: Record deletion information using i_start and i_end.
record_deletion_info(j,i_start,i_end,alignment,reference,ref_index,outfile)
#Return the index of the C-flanking codon to msa_iterator() if it exists.
#For a deletion at the end of the genome, return i=len(reference), which will cause msa_iterator() to skip to the next line.
return (i_Cflank if i_Cflank else len(reference))
def insertion_case(j,i,alignment,reference,ref_index,outfile):
"""
insertion_case() will gather the following information regarding the insertion in 5 steps:
instype=Either "ins", "N_ext", or "C_ext". instype will tell the function record_insertion_info() which
type of insertion case to record. Insertions at the 5' end of the transcriptome are N-terminal extentions,
and must be recorded differently from standard insertions. The same is true of C-terminal extentions, which
are insertions at the 3' end of the transcriptome.
If N-terminal or C-terminal extentions have not been discovered, instype will be set to "ins".
i_start=The index of the start of the insertion region (first location where reference is blank and the variant genome has a codon)
i_Nflank= The index of the flanking N-terminal codon to the insertion in the reference strand (required for the code).
For N-terminal extentions, this will be equal to "None".
i_end=The index of the end of the insertion region (last location where the reference is blank and the variant genome has a codon)
i_Cflank=The index of the flanking C-terminal codon to the insertion in the reference strand (required for the insertion code).
For C-terminal extentions, this will be equal to "None".
"""
instype=None
#Raise an error if the function is triggered by a codon that is not inserted
if not (alignment[j,i]!="-" and reference[i]=="-"):
raise ValueError("Deletion case called on a codon region without a insertion")
#Or if the insertion case is not called at the beginning of a deletion region
#Code detects a insertion in the codon one index before the current i it is called on
elif i!=0 and ((reference[i-1]!=alignment[j,i-1]) and reference[i-1]=="-"):
raise ValueError("Deletion case called in the middle of a multi-base insertion")
#Step 1: Record the index at beginning of insertion
i_start=i
#Begin iteration
#Step 2: It is not necessarily true that the flanking reference codon is one index before the insertion start.
#A for loop will iterate backwards until an extant reference codon is found.
#If the insertion region is discovered in the first codon position, a special case is triggered.
if i!=0:
for idx in range(i-1,-1,-1): #Iterates backwards from the start of the insertion region, ending at 0
#idx used instead of i to prevent overwriting of i to be passed to be msa_iterator
if reference[idx]=="-":
#Special case, if an N-terminal flanking codon has not been discovered when the beginning of the genome
#is reached, this is an N-terminal extention.
if idx==0:
i_Nflank=None
break #Break the loop and continue to step 3.
#If the N-terminal flanking codon is found, store the index of the codon.
elif reference[idx]!="-":
i_Nflank=idx
break #Break the loop and continue to step 3.
#Step 2: Special case of insertion at beginning of region
#Called when an insertion is found starting at i=0 (impossible for a N-terminal flank to exist)
else:
#The index of the N-terminal flank is set to "none"
i_Nflank=None
#Step 3: Iterate forward until the first position of an extant codon on both the reference and the variant strand
#Will yield i_Cflank
while True:
#Check for an insertion, or a blank on both the reference and the genome j, at the codon position. If so, continue on
if (reference[i]=="-" and alignment[j,i]!="-") or (reference[i]=="-" and alignment[j,i]=="-"):
#Special case: end of the genome is reached and contains a blank (will also be triggered if the
#insertion begins at the end of the variant genome). This is a C-terminal extension.
if i==len(reference)-1:
i_Cflank=None
break #Terminate the loop and continue to step 4
#If not at the end of the genome, increase the index by one and test the next codon.
else:
i+=1
#Check for a deletion at the codon. If so, switch to the indel case
elif reference[i]!="-" and alignment[j,i]=="-":
#i_start will be passed to the indel function
i=delins_case(j,i_start,alignment,reference,ref_index,outfile)
return i #After delins case is ran, return the position of the C-terminal flank if it exists to the msa_iterator() function
#Check for extant codons in both reference and genome j at the position
elif reference[i]!="-" and alignment[j,i]!="-":
#The position of the C-terminal flanking codon will be stored.
i_Cflank=i
break #Terminate the loop and continue to step 4.
#Step 4: Iterate backward over any blank codons on both the reference and variant strands until a codon on the variant strand is reached
#Iteration will begin one codon before the C-terminal flank if it exists, otherwise iteration begins at the end of the genome.
for idx in range((i_Cflank-1 if i_Cflank else len(reference)-1),(i_Nflank if i_Nflank else -1),-1): #Iterates backwards to i_Nflank (excluding i_Nflank) if it exists, otherwise iterates to zero(inclusive).
#if a blank is found, continue backwards iteration
if alignment[j,idx]=="-":
continue
#The first codon found in the alignment will mark the end of the inserted sequence in the multiple sequence alignment.
elif alignment[j,idx]!="-":
i_end=idx
break #Break the loop and continue to step 5.
#Step 5: determine type of insertion based on the presence/absence of N-terminal and C-terminal flanking codons
if i_Cflank and i_Nflank:
instype="ins"
elif i_Cflank and not i_Nflank:
instype="N_ext"
elif i_Nflank and not i_Cflank:
instype="C_ext"
#All indeces have been gathered. The indeces will be passed to the record_insertion_info() function to record the insertion in the file
record_insertion_info(j,instype,i_start,i_end,alignment,reference,ref_index,outfile,i_Nflank,i_Cflank)
#Recording complete. Pass index of the C-flanking codon back to msa_iterator() if it exists
return (i_Cflank if i_Cflank else len(reference))
def delins_case(j,i_start,alignment,reference,ref_index,outfile):
"""
insertion_case() will gather the following information regarding the insertion-deletion (indel) event in 5 steps:
i_start: The beginning of the insertion region. This is passed to the function by the function that triggered the delins case.
i_end: The index of the end of the indel region (last location where the reference is blank and the variant genome has a codon)
i_first: The index of the first codon deleted (used for computation of the variant code)
i_last: The index of the last codon deleted (used for computation of the variant code)
codons_inserted: The amino acid codons that have been inserted into the deletion region, with any blanks between residues removed.
"""
#Step 1: Iterate until the C-terminal codon of the delins region is found
#This will be when there is an extant codon on both the reference strain and on genome j.
i=i_start
while True:
#Test the current codon.
#If the current codon is an insertion, deletion, or a double blank, continue iteration.
if (reference[i]!="-" and alignment[j,i]=="-") or (reference[i]=="-" and alignment[j,i]!="-") or (reference[i]=="-" and alignment[j,i]=="-"):
#Special case: the end of the genome is reached before a C-terminal flank is found
if i==len(reference)-1:
i_Cflank=None
#Break the while loop and continue to step 2
break
else:
i+=1
#If there are codons on the reference and alignment genomes at the current position, the end of the indel region has been reached.
elif (reference[i]!="-" and alignment[j,i]!="-"):
#Store this position as the position of the C-terminal flanking codon to the indel region (iteration in msa_iterator() must resume from this point.)
i_Cflank=i
break #Break the while loop and continue to step 2.
#Step 2: Iterate backward over any blanks that may exist until the last codon that is either inserted or deleted
#Record this position as i_end
for i in range((i_Cflank if i_Cflank else len(reference)-1),i_start-1,-1):
#If a blank is discovered at the position, continue to previous codon.
if (reference[i]=="-" and alignment[j,i]=="-"):
continue
#If a deletion or an insertion is found, stop iteration and record current position as i_end (end of indel region).
elif (reference[i]!="-" and alignment[j,i]=="-") or (reference[i]=="-" and alignment[j,i]!="-"):
i_end=i
break
#Step 3: Iterate through the indel region on the reference strand, storing the indeces of the first and last codons
#The first and last codons on the reference strand are the first and last amino acids deleted (necessary for code).
#Iteration is through region from i_start up to and including i_end.
#Step 3.1: Finding the first codon
for i in range(i_start,i_end+1):
if (reference[i]=="-"):
continue
elif (reference[i]!="-"):
#Store the index of the first codon
i_first=i
break
#Step 3.2: Finding the last codon
for i in range(i_end,i_start-1,-1):
if (reference[i]=="-"):
continue
elif (reference[i]!="-"):
#Store the index of the last codon
i_last=i
break
#Step 4: Iterate through the slice of the variant strand and store the codons, removing any blanks
#The function insertion_var_AA() will do this.
codons_inserted=AA_blank_remover(i_start,i_end,j,alignment)
#February 14th, 2020
#Special Case: amino acid inserted is the same as the amino acids deleted (not really an indel)
#if i_first==i_last and reference[i_first]==codons_inserted:
#
#Step 5: Record information using the indeces discovered.
record_delins_info(i_start,i_end,i_first,i_last,j,codons_inserted,alignment,reference,ref_index,outfile)
#Must return the position of the C-flanking codon to the indel region, so msa_iterator() does not record the region more than once.
return (i_Cflank if i_Cflank else len(reference)) #If the end of the genome is reached, pass that index to msa_iterator().
def record_deletion_info(j,i_start,i_end,alignment,reference,ref_index,outfile):
"""
Information recording
"""
#Store the name of the cluster associated with the variant sequence (sequence index j)
#The cluster name is in the id of the RefSeq object, before the ";".
clustname=alignment[j].id.split(sep=";")[0]
#Retrieve size of cluster associated with the variant sequence j
#The cluster size is printed betweeen ";" and "=" in id
clustsize=alignment[j].id.split(sep="=")[1].split(sep=";")[0]
#Conditional: for single codon deletions, positions and amino acid residues will be stored as a single entry
#For multi-codon deletions, data will be stored as a range
#For single-codon deletions
if i_start==i_end:
#Store position of variant in MSA
msa_start=i_end+1
msa_end="-"
#Store position of variant in reference genome
ref_start=ref_index[i_end]
ref_end="-"
#Store codon in position i of reference genome
ref_AA=reference[i_end]
#Store the variant codon
var_AA="-" #Retrieves the codon from the sequence at position j, at position i
#Store the deletion code
"""
Based on the sequence variant nomenclature suggested by the Human Genome Variation Society,
the code format is <Amino_Acid_Deleted><Position_of_Deleted_Codon, reference Genome>del
source: (https://varnomen.hgvs.org/recommendations/protein/variant/deletion/)
"""
code="{}{}del".format(ref_AA,ref_start)
#For multi-codon deletions
else:
#Store position range in MSA
msa_start=i_start+1
msa_end=i_end+1
#Store position range in reference genome
ref_start=ref_index[i_start]
ref_end=ref_index[i_end]
#Store reference amino acids that have been deleted in sequence j
#Slicing does not include the end value, so the end value should be one index after the end of the deletion region (i)
ref_AA=deletion_ref_AA(i_start,i_end,reference)
#Store a blank for deleted variant codons
var_AA="-"
"""
Store the deletion code
Based on the sequence variant nomenclature suggested by the Human Genome Variation Society,
the code format is <Beginning_Amino_Acid><Beginning_Codon_Position, reference Genome>_<Ending_Amino_Acid><Ending_Codon_Position>del
source: (https://varnomen.hgvs.org/recommendations/protein/variant/deletion/)
"""
code="{}{}_{}{}del".format(reference[i_start],ref_index[i_start],reference[i_end],ref_index[i_end])
#Write above information to file
write_line(outfile,clustname,clustsize,"del",code,ref_AA,var_AA,msa_start,ref_start,msa_end,ref_end)
def record_insertion_info(j,instype,i_start,i_end,alignment,reference,ref_index,outfile,i_Nflank=None,i_Cflank=None):
#Store the name of the cluster associated with the variant sequence (sequence index j)
#The cluster name is in the id of the RefSeq object, before the ";".
clustname=alignment[j].id.split(sep=";")[0]
#Retrieve size of cluster associated with the variant sequence j
#The cluster size is printed betweeen ";" and "=" in id
clustsize=alignment[j].id.split(sep="=")[1].split(sep=";")[0]
#Store starting position on the multiple sequence alignment
msa_start=i_start+1
ref_AA="-"
#msa_end and var_AA: depend on whether the insertion is a single base or a multi-base insertion
if i_start==i_end:
msa_end="-"
var_AA=alignment[j,i_start]
else:
msa_end=i_end+1
var_AA=AA_blank_remover(i_start,i_end,j,alignment)
#ref_start and ref_end: defined as the reference positions of the N- and C- terminal flanking codons if they exist
ref_end=(ref_index[i_Cflank] if i_Cflank else "-") #C-flank is the flank downstream of the insertion.
ref_start=(ref_index[i_Nflank] if i_Nflank else "-")
#The flanking reference residues are stored for creation of the code, if they exist.
Nflank=(reference[i_Nflank] if i_Nflank else None)
Cflank=(reference[i_Cflank] if i_Cflank else None)
#The "type" and code fields depend on whether the insertion is a normal case, an N-terminal extension, or a C-terminal extension.
#TODO: instype will be passed through for output. Delete this later.
if instype=="ins":
print_type=instype
#Code is formatted as <N_flank amino acid ><>_<><>ins<var_AA>
code="{}{}_{}{}ins{}".format(Nflank,ref_start,Cflank,ref_end,var_AA)
elif instype=="N_ext":
#print type stores the type to be printed to the output file, as well as the type indicated in the code
#Both N-terminal and C-terminal extentions will be represented as "ext"
print_type="ext"
#Code: will be formatted <C-flanking amino acid>1ext<Var_AA inserted>
#TODO: This is at odds with the Human Genome Variation Society, which reccomends that extentions describe the number of new codons instead of their type.
#I chose to print the type because we don't have the genetic information in this experiment. Should I keep things this way or change the format to match the reccomendations?
code="{}1ext{}".format(Cflank,var_AA)
elif instype=="C_ext":
print_type="ext"
#Code: will be formatted <N-flanking amino acid><reference position of N-flanking amino acid>ext<Var_AA inserted>
code="{}{}ext{}".format(Nflank,ref_start,var_AA)
#Write infomation to file
write_line(outfile,clustname,clustsize,print_type,code,ref_AA,var_AA,msa_start,ref_start,msa_end,ref_end)
def record_delins_info(i_start,i_end,i_first,i_last,j,codons_inserted,alignment,reference,ref_index,outfile):
###Record cluster information###
#Store the name of the cluster associated with the variant sequence (sequence index j)
#The cluster name is in the id of the RefSeq object, before the ";".
clustname=alignment[j].id.split(sep=";")[0]
#Retrieve size of cluster associated with the variant sequence j
#The cluster size is printed betweeen ";" and "=" in id
clustsize=alignment[j].id.split(sep="=")[1].split(sep=";")[0]
#Record type (always equal to delins)
print_type="delins"
###Record indel code###
#Format: <first codon deleted><position of first codon deleted>_<last codon deleted><position of last codon deeted>delins<AA codons inserted>
#Store identity and position of first codon deleted, using its index
del_AA_first=reference[i_first]
#ref_start=Position of first codon deleted
ref_start=ref_index[i_first]
#Code format depends on if there is one or more codons deleted.
#If multiple codons are deleted (i_first!=i_last), calculate the code in the format below.
if i_first!=i_last:
#Store identity and position of last codon deleted, using its index
del_AA_last=reference[i_last]
#ref_end=Position of last codon deleted
ref_end=ref_index[i_last]
#Indel code
code="{}{}_{}{}delins{}".format(del_AA_first,ref_start,del_AA_last,ref_end,codons_inserted)
#Follow the conditional below for single-codon deletions.
else:
ref_end="-" #will store a blank on the raw data output file.
code="{}{}delins{}".format(del_AA_first,ref_start,codons_inserted)
###Record AA sequence on reference and variant strands###
#ref_AA and var_AA will simply be the reference sequence and the variant sequence in the indel region,
#Without blanks removed.
ref_AA=str(reference[i_start:i_end+1])
var_AA=str(alignment[j,i_start:i_end+1].seq)
#Record start and end positions on multiple sequence alignment (equal to index of start and end of region plus one)
msa_start=i_start+1
msa_end=i_end+1
#Write information to file
write_line(outfile,clustname,clustsize,print_type,code,ref_AA,var_AA,msa_start,ref_start,msa_end,ref_end)
def deletion_ref_AA(i_start,i_end,reference):
"""
For multi-base deletions, records the refernce AA's deleted without blanks that may appear in the aligned reference genome.
This function should only be called on multi-base deletions, otherwise an error will occur
"""
ref_AA=reference[i_start]
#the amino acid at i_start is included first, and iteration goes up to and including i_end (must be i_end+1 because range() does not include stop value)
for i in range(i_start+1,i_end+1):
#If an amino acid exists on the reference genome (no blank on msa), add the AA to the sequence
if reference[i]!="-":
ref_AA=ref_AA+reference[i]
return ref_AA
def AA_blank_remover(i_start,i_end,j,alignment):
"""
AA_blank_remover() records the amino acids within a specified region on a specified genome j,
excluding blanks that may exist due to the multiple sequence alignment.
For example, an insertion reading "FLP--AV" in the multiple sequence alginment will be converted to "FLPAV".
i_start: the starting index of the region (integer)
i_end: the ending index of the region (integer)
j: the index of the genome to be read (integer)
alingment: a biopython multiple sequence alignent object consisting of biopython SeqRecord objects.
"""
#Begin with an empty variable
AA_Seq=None
#Iterate from the first codon in the region to the last, including both first and last
for i in range(i_start,i_end+1):
#Blanks will be skipped by the conditionals below.
#The first non-blank character will be detected and assigned to AA_seq
if alignment[j,i]!="-" and not AA_Seq:
AA_Seq=alignment[j,i]
#If there is at least one codon already stored in AA_seq, the next codon will be appended to the existing sequence.
elif alignment[j,i]!="-" and AA_Seq:
AA_Seq=AA_Seq+alignment[j,i]
return AA_Seq
"""
Part 2: Functions for analysis of raw variant data and report production
"""
def initialize_report_files(outname):
"""
outname = the directory and file prefix of the output files. Specific files will contain
intitialize_report_files will create six files containing reports on the mutations present.
"""
subs_report=outname+"_substitutions.txt"
ins_report=outname+"_insertions.txt"
dels_report=outname+"_deletions.txt"
delins_report=outname+"_indels.txt"
clusters_report=outname+"_clusters.txt"
all_report=outname+"_all_by_code.csv"
count_report=outname+"_variant_counts.csv"
#Create blank files to be filled with information in later steps
for file in [subs_report,ins_report,dels_report,delins_report,clusters_report,all_report,count_report]:
with (open(file,"w")) as outfile:
outfile.write("")
return subs_report,ins_report,dels_report,delins_report,clusters_report,all_report,count_report
#Create a class for storing all of the fields in the raw data file created in part 1
class VariantRecord:
"""
Creates a variant_record object that stores the data for each variant produced in part 1.
--Instance variables--
clustname: Cluster_ID
clustsize: Number of sequences in cluster
print_type: Type of variant (insertion, deletion, substitution)
code: Mutation code (formatted based on the nomenclature indicated by the Human Genome Variation Society https://varnomen.hgvs.org/recommendations/protein/)
ref_AA: Amino Acid(s) of Reference Strand
For insertions, this is the two amino acid residues the insertion ocurrs between.
For deletions, this will be the reference residue(s) deleted
var_AA: Variant Amino Acid Residue
For multi-codon insertions, this field will contain multiple columns
For deletions, this will be blank
msa_start: Start position of variant on multiple sequence alignment
For insertions this will be the codon number on the multiple sequence alignment before the inserted codon(s)
msa_end: End position of variant on multiple sequence alignment
For insertions this will be the codon number on the multiple sequence alignment before the inserted codon(s)
For substitutions and single-base deletions this will be blank
ref_start: Start position of variant on reference strand
For insertions this will be the codon number on the reference strand before the inserted codon(s)
ref_end: End position of variant on reference strand
For insertions this will be the codon number on the reference strand before the inserted codon(s)
For substitutions and single-base deletions this will be blank
"""
def __init__(self,clustname,clustsize,print_type,code,ref_AA,var_AA,msa_start,msa_end,ref_start,ref_end):
self.__clustname=clustname
self.__clustsize=(int(clustsize) if clustsize!="-" else None)
self.__print_type=print_type
self.__code=code
self.__ref_AA=ref_AA
self.__var_AA=var_AA
self.__msa_start=(int(msa_start) if msa_start!="-" else None) #Blanks may exist in this region in the output file
self.__msa_end=(int(msa_end) if msa_end!="-" else None)
self.__ref_start=(int(ref_start) if ref_start!="-" else None)
self.__ref_end=(int(ref_end) if ref_end!="-" else None)
#Functions to print each variable
def get_clustname(self):
return self.__clustname
def get_clustsize(self):
return self.__clustsize
def get_print_type(self):
return self.__print_type
def get_code(self):
return self.__code
def get_ref_AA(self):
return self.__ref_AA
def get_var_AA(self):
return self.__var_AA
def get_msa_start(self):
return self.__msa_start
def get_msa_end(self):
return self.__msa_end
def get_ref_start(self):
return self.__ref_start
def get_ref_end(self):
return self.__ref_end
#Define method to change the code of the record (used when breaking down multi-base deletions in the count file)
def set_code(self,new_code):
"""
set_code(new_code)
set_code will alter the code of the variant record object, and is used for processing multi-base
deletions when setting up the CSV file for variant counts.
new_code=The desired new code (string)
WARNING: this will alter the VariantRecord object, and can corrupt results if not used properly.
"""
self.__code=str(new_code)
#Define output upon calling print() on the VariantRecord object
def __str__(self):
return "[{:<25}{:<5}{:<8}{:<15}]".format(self.__clustname,self.__clustsize,self.__print_type,self.__code)
def initialize_mutation_database(outname):
"""
initialize_mutation_database will create a list of VariantRecord objects using the raw data file produced from
iteration through the multiple sequence alignment.
"""
var_database=[]
with open(outname+"_variants_raw.tsv","r") as file:
#Load lines
lines=file.readlines()
#Exclude header
for line in lines[1:]:
#Remove newline characters
line=line.strip()
#Store each field in its corresponding VariantRecord attribute
clustname,clustsize,print_type,code,ref_AA,var_AA,msa_start,msa_end,ref_start,ref_end = line.split("\t")
#Use attributes from the line to create a VariantRecord object, and add the object to the list.
var_database.append(VariantRecord(clustname,clustsize,print_type,code,ref_AA,var_AA,msa_start,msa_end,ref_start,ref_end))
return var_database
#natsorted package used in the function below
from natsort import natsorted
def info_by_cluster(var_database,clusters_report):
"""
info_by_cluster() will produce an output file listing the mutations present in each cluster,
sorted in ascending order, alphabetically and then numerically, by cluster ID.
var_database: a list of VariantRecord objects created from the intermediate tab-separated file
produced by msa_iterator().
clusters_report: the path of the file name for exporting info by cluster, created by
initialize_report_files().
"""
#Get a list of unique cluster IDs
#A set is used to retrieve the unique cluster IDs (sets store only unique values, with no ordering)
clust_ids=set()
for Record in var_database:
clust_ids.add(Record.get_clustname())
#Convert set back to a list to allow sorting
clust_ids=list(clust_ids)
#Sort based on ascending cluster ID number (requires natural sorting from the natsort package)
clust_ids=natsorted(clust_ids)
#Print info to outname_clusters.txt
with open(clusters_report,"a") as file:
#Print header to file
print("-"*30,"Variants observed by cluster","-"*30,sep="\n",end="\n",file=file)
#Iterate through unique cluster IDs, printing all variants associated with each cluster ID
for clustname in clust_ids:
#Iteratively subset database for the cluster id
subset=[record for record in var_database if record.get_clustname()==clustname]
#Print the header for the cluster id, then the number of sequences, then list the mutation code of each mutation event observed.
#End the print statement with a double new line to leave one line of blank space between each cluster in the record.
print(">{}, {} sequences".format(clustname,subset[0].get_clustsize()),*(record.get_code() for record in subset),sep="\n",end="\n\n",file=file)
def info_by_code(var_database,var_type,subs_report,ins_report,dels_report,delins_report):
"""
info_by_code() will return information on the identity of clusters with a variant, for each unique variant
in the multiple sequence alignment.
var_database: A list of VariantRecord objects generated from the intermediate tab-separated file produced by msa_iterator().
var_type: The type of variants to return information for. Accepts "sub" (substitutions),"ins" (insertions),
"del"(deletions), and "delins"(insertion-deletions).
subs_report: the path of the file name for exporting substitution info by code, created by
initialize_report_files().
ins_report: the path of the file name for exporting insertion info by code, created by
initialize_report_files().
dels_report: the path of the file name for exporting deletion info by code, created by
initialize_report_files().
delins_report: the path of the file name for exporting insertion-deletion info by code, created by
initialize_report_files().
"""
if var_type=="sub":
#subset for substitutions
substitutions=[record for record in var_database if record.get_print_type()=="sub"]
#Create a set of unique mutation codes
#Sorting should be performed based on the start position on the multiple sequence alignment. A dictionary is used to sort this
subcodes={}
for record in substitutions:
#Conditional below returns a dictionary of unique mutation codes
if record.get_code() not in subcodes:
subcodes[record.get_code()]=record.get_msa_start()
#Sort the dictionary by value of MSA start position (returns a list of the substitution codes sorted by msa start position)
subcodes=sorted(subcodes, key=lambda x:subcodes[x])
#Print info to outname_substitutions.txt
with open(subs_report,"a") as file:
#Print header to file
print("-"*40,"Clusters by Mutation Code: Substitutions","-"*40,sep="\n",end="\n",file=file)
#For each unique code, subset the substitutions database by that code and print statistics
for code in subcodes:
#Iteratively subset database for the mutation code
subset=[record for record in var_database if record.get_code()==code]
#Calculate total number of sequences with the mutation
total_seq=0
for record in subset:
#For each cluster with the mutation code, get the size of the cluster and add that to the total number of sequences
total_seq+=record.get_clustsize()
#Print the header for the mutation code, then the total number of sequences with the code
#Next, the cluster name and number of sequences in the cluster is printed for each cluster with the code
#End the print statement with a double new line to leave one line of blank space between each cluster in the record.
print(">{}, {} sequences".format(code,total_seq),*("{}; {} sequences".format(record.get_clustname(),record.get_clustsize()) for record in subset),sep="\n",end="\n\n",file=file)
elif var_type=="ins":
#Subset for insertions
insertions=[record for record in var_database if record.get_print_type()=="ext" or record.get_print_type()=="ins"]
#Create a set of unique mutation codes
#Sorting should be performed based on the start position on the multiple sequence alignment. A dictionary is used to sort this
inscodes={}
for record in insertions:
#Conditional below returns a dictionary of unique mutation codes
if record.get_code() not in inscodes:
inscodes[record.get_code()]=record.get_msa_start()
#Sort the dictionary by value of MSA start position (returns a list of the insertion codes sorted by msa start position)
inscodes=sorted(inscodes, key=lambda x:inscodes[x])
#Print information to outname_insertions.txt
with open(ins_report,"a") as file:
#Print header to file
print("-"*40,"Clusters by Mutation Code: Insertions","-"*40,sep="\n",end="\n",file=file)
#For each unique code, subset the insertions database by that code and print statistics
for code in inscodes:
#Iteratively subset database for the mutation code
subset=[record for record in var_database if record.get_code()==code]
#Calculate total number of sequences with the mutation
total_seq=0
for record in subset:
#For each cluster with the mutation code, get the size of the cluster and add that to the total number of sequences
total_seq+=record.get_clustsize()
#Print the header for the mutation code, then the total number of sequences with the code
#Next, the cluster name and number of sequences in the cluster is printed for each cluster with the code
#End the print statement with a double new line to leave one line of blank space between each cluster in the record.
print(">{}, {} sequences".format(code,total_seq),*("{}; {} sequences".format(record.get_clustname(),record.get_clustsize()) for record in subset),sep="\n",end="\n\n",file=file)
elif var_type=="del":
#Subset for deletions
dels=[record for record in var_database if record.get_print_type()=="del"]
#Create a set of unique mutation codes
#Sorting should be performed based on the start position on the multiple sequence alignment. A dictionary is used to sort this
delcodes={}
for record in dels:
#Conditional below returns a dictionary of unique mutation codes
if record.get_code() not in delcodes:
delcodes[record.get_code()]=record.get_msa_start()
#Sort the dictionary by value of MSA start position (returns a list of the deletion codes sorted by msa start position)
delcodes=sorted(delcodes, key=lambda x:delcodes[x])
with open(dels_report,"a") as file:
#Print header to file
print("-"*40,"Clusters by Mutation Code: Deletions","-"*40,sep="\n",end="\n",file=file)
#For each unique code, subset the substitutions database by that code and print statistics
for code in delcodes:
#Iteratively subset database for the mutation code
subset=[record for record in var_database if record.get_code()==code]
#Calculate total number of sequences with the mutation
total_seq=0
for record in subset:
#For each cluster with the mutation code, get the size of the cluster and add that to the total number of sequences
total_seq+=record.get_clustsize()
#Print the header for the mutation code, then the total number of sequences with the code
#Next, the cluster name and number of sequences in the cluster is printed for each cluster with the code
#End the print statement with a double new line to leave one line of blank space between each cluster in the record.
print(">{}, {} sequences".format(code,total_seq),*("{}; {} sequences".format(record.get_clustname(),record.get_clustsize()) for record in subset),sep="\n",end="\n\n",file=file)
elif var_type=="indel":
#Subset for indels
indels=[record for record in var_database if record.get_print_type()=="delins"]
#Create a set of unique mutation codes
#Sorting should be performed based on the start position on the multiple sequence alignment. A dictionary is used to sort this
indelcodes={}
for record in indels:
#Conditional below returns a dictionary of unique mutation codes
if record.get_code() not in indelcodes:
indelcodes[record.get_code()]=record.get_msa_start()
#Sort the dictionary by value of MSA start position (returns a list of the insertion-deletion codes sorted by msa start position)
indelcodes=sorted(indelcodes, key=lambda x:indelcodes[x])
with open(delins_report,"a") as file:
#Print header to file
print("-"*40,"Clusters by Mutation Code: Indels","-"*40,sep="\n",end="\n",file=file)
#For each unique code, subset the substitutions database by that code and print statistics
for code in indelcodes:
#Iteratively subset database for the mutation code
subset=[record for record in var_database if record.get_code()==code]
#Calculate total number of sequences with the mutation
total_seq=0
for record in subset:
#For each cluster with the mutation code, get the size of the cluster and add that to the total number of sequences
total_seq+=record.get_clustsize()
#Print the header for the mutation code, then the total number of sequences with the code
#Next, the cluster name and number of sequences in the cluster is printed for each cluster with the code
#End the print statement with a double new line to leave one line of blank space between each cluster in the record.
print(">{}, {} sequences".format(code,total_seq),*("{}; {} sequences".format(record.get_clustname(),record.get_clustsize()) for record in subset),sep="\n",end="\n\n",file=file)
def mutation_csv(var_database,all_report,alignment):
"""
mutation_csv() will produce a comma-separated values file with the following fields:
Position: the position of the mutation on the reference strand. For insertions, this will be the
Type: The type of mutation (sub,ins,del,indel,ext)
Code: Mutation code, according to standard nomenclature
Freq: The total number of sequences with the mutation.
"""
unique_codes={}
#First, all mutations will be sorted by the start position in the reference sequence. A dictionary will be used to achieve this
for record in var_database:
#A list of unique codes is generated by adding any code that is not already in the unique_codes dictionary.
if record.get_code() not in unique_codes:
#N-terminal extensions do not have an entry for ref_start(), so 1 will be used in this case
#(insertion ocurrs before first reference position).
if record.get_print_type()=="ext" and not record.get_ref_start():
unique_codes[record.get_code()]=1
#For normal cases (ref_start exists), add the reference position to the dictionary for sorting
elif record.get_ref_start():
unique_codes[record.get_code()]=record.get_ref_start() if record.get_ref_start() else 1
#Throw an error for any unanticipated records without a reference position recorded.
else:
raise ValueError("Unanticipated Case Missing Reference Position.")
#Sort the dictionary of codes by reference position of variant
unique_codes=sorted(unique_codes,key=lambda x:unique_codes[x])
#Number of codes written to CSV is stored for progress reporting
#Set counter to zero initially
codes_printed=0
#Write header of CSV to output file: