-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathMultiplicity.py
4206 lines (3645 loc) · 159 KB
/
Multiplicity.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# MacroFold.py
# (c) 2016-2017 Nikolaus Howe
# by Niki Howe,
# Summer/Fall/Winter
# (July->January) 2016/2017
# Imports
import numpy as np
import math
import sys
import csv
import random
# For testing with UNAFold
import subprocess
# The temperature (in Kelvin)
global T
T = 273.15 + 37
# The gas constant (kcal/mol*K) ## R=k_B*N_A=(1.38e-23 J/K)/(cal/4.184J)(6.022e23) ##
global r
r = 0.0019872036
# The beta constant
global beta
beta = 1./(r*T)
# get_all_structures
def get_structs(sequence, only_num=False, write=True):
# Make sure it is upper-case
sequence = sequence.upper()
# List of allowed pairs
# Assumes only capital letters and no T's
legal_pairs = {'AU', 'UA', 'GC', 'CG', 'GU', 'UG'}
# Make a new file which contains the sequence
with open("./sequence.seq", 'w') as myfile:
myfile.write(sequence+'\n')
# Get all the folds of that sequence using crumple
all_folds = subprocess.getoutput("./crumple/crumple -i sequence.seq")
# Put them in a list and get rid of the last two lines of text
all_folds = set(all_folds.split()[:-2])
# The two places where dangles
# can appear are in the external
# loop and in multibranches
# External loop
old_folds = set()
while all_folds != old_folds:
old_folds = set(all_folds)
for fold in old_folds:
i = 0
while i < len(fold) - 1:
if fold[i] == '(':
if i > 0 and fold[i-1] == '.':
new_fold = fold[:i-1] + '<' + fold[i:]
all_folds.add(new_fold)
i = get_match(i, fold)
if fold[i] == ')':
if i < len(fold)-1 and fold[i+1] == '.':
new_fold = fold[:i+1] + '>' + fold[i+2:]
all_folds.add(new_fold)
i += 1
# Multibranch loops
old_folds = set()
while all_folds != old_folds:
old_folds = set(all_folds)
for fold in old_folds:
for i, char in enumerate(fold):
if char == '(':
num = get_num_structures(i, fold)
if num >= 2:
j = get_match(i, fold)
#print(fold, i, j)
#if fold[i+1] == '.':
#new_fold = fold[:i+1] + '>' + fold[i+2:]
#all_folds.add(new_fold)
#if fold[j-1] == '.':
#new_fold = fold[:j-1] + '<' + fold[j:]
k = i+1
#print(i, k, j)
while k < j:
if fold[k] == '.':
if fold[k-1] == '(' or fold[k-1] == ')':
new_fold = fold[:k] + '>' + fold[k+1:]
all_folds.add(new_fold)
if fold[k+1] == '(' or fold[k+1] == ')':
new_fold = fold[:k] + '<' + fold[k+1:]
all_folds.add(new_fold)
if fold[k] == '(':
k = get_match(k, fold)
k += 1
# Remove the folds with a tstack which is not allowed
# (namely, if the stacking bases can pair with each other)
old_folds = set(all_folds)
for fold in old_folds:
for i, char in enumerate(fold):
if char == '(':
if i > 0 and fold[i-1] == '<':
j = get_match(i, fold)
if j < len(fold) - 1 and fold[j+1] == '>':
if sequence[i-1]+sequence[j+1] in legal_pairs:
if fold in all_folds: # because we might have removed it in the meantime
all_folds.remove(fold)
if fold[i+1] == '>':
j = get_match(i, fold)
if fold[j] == ')':
if fold[j-1] == '<':
if sequence[i+1]+sequence[j-1] in legal_pairs:
if fold in all_folds:
all_folds.remove(fold)
#for fold in old_folds - all_folds:
#print(fold)
#for fold in all_folds:
#print(fold)
# For every fold, make a pair_dict (right AND left-looking)
# and a dangle_list, which we will use to make
# the extended .ct files
all_folds = sorted(list(all_folds))
if only_num:
return len(all_folds)
structure_list = []
for fold in all_folds:
pair_dict = {}
dangle_list = []
for i, char in enumerate(fold):
if char == '(':
j = get_match(i, fold)
pair_dict[i] = j
pair_dict[j] = i
if char == '<':
dangle_list.append(i)
if char == '>':
dangle_list.append(i-1)
structure_list.append((pair_dict, dangle_list))
#for element in structure_list:
#print(element)
if write:
# Write these folds to an extended .ct file
for i, fold in enumerate(all_folds):
#if (i+1)%2000 == 0:
#print(i+1, fold)
with open("./folds/fold{}.txt".format(i+1), "w") as text_file:
# Get the corresponding structure
pairs = structure_list[i][0]
dangles = structure_list[i][1]
#print(pairs, dangles)
# Make lists for all the stacking
left_stacking = []
right_stacking = []
# Stacking from pairing
for pos, char in enumerate(fold):
if char == '(':
j = get_match(pos, fold)
num = get_num_structures(pos, fold)
if num == 1:
#print("one", fold[pos: j+1])
# normal stacking
if fold[pos+1] == '(' and fold[j-1] == ')':
left_stacking.append(pos+1)
left_stacking.append(j)
right_stacking.append(pos)
right_stacking.append(j-1)
# 0x1 bulge loop
elif fold[pos+1] == '(' and fold[j-1] == '.' and fold[j-2] == ')':
left_stacking.append(pos+1)
left_stacking.append(j)
right_stacking.append(pos)
right_stacking.append(j-2)
# 1x0 bulge loop
elif fold[pos+1] == '.' and fold[pos+2] == '(' and fold[j-1] == ')':
left_stacking.append(pos+2)
left_stacking.append(j)
right_stacking.append(pos)
right_stacking.append(j-1)
# Stacking from dangles
for d in dangles:
left_stacking.append(d+1)
right_stacking.append(d)
#print("left stacking ", left_stacking)
#print("right stacking", right_stacking)
# Write the first line of the .ct file
text_file.write(str(len(sequence))+'\t'+"fold{}".format(i+1)+'\n')
# Write the other lines
for line, char in enumerate(sequence):
# The first four columns
text_file.write(str(line+1)+'\t'+char+'\t'+str(line)+'\t'+str((line+2)%(len(sequence)+1))+'\t')
# The pairing column
if fold[line] == '(' or fold[line] == ')':
text_file.write(str(pairs[line]+1)+'\t')
else:
text_file.write(str(0)+'\t')
# Same as first column
text_file.write(str(line+1)+'\t')
# Stack-to-the-left column
if line > 0 and line in left_stacking:
if fold[line-1] == '(' or fold[line-1] == ')' or fold[line-1] == '<' or fold[line-1] == '>':
text_file.write(str(line)+'\t')
elif fold[line-1] == '.':
text_file.write(str(line-1)+'\t')
else:
raise Exception("This is not supposed to happen")
else:
text_file.write(str(0)+'\t')
# Stack-to-the-right column
if line < len(fold) - 1 and line in right_stacking:
#print(line)
if fold[line+1] == ')' or fold[line+1] == '(' or fold[line+1] == '<' or fold[line+1] == '>':
text_file.write(str(line+2)+'\n')
elif fold[line+1] == '.':
text_file.write(str(line+3)+'\n')
else:
raise Exception("This is not supposed to happen")
else:
text_file.write(str(0)+'\n')
# Returns the number of folds
return(len(all_folds), all_folds, structure_list)
# Given a place to start looking, returns the number of structures inside this one
# Can be arbitrarily large
def get_num_structures(index, seq):
# Get the closing index for this structure
closing_index = get_match(index, seq)
#if index == 4 and closing_index == 20: print(seq, index, closing_index)
next_index = index + 1
num_structures = 0
while next_index < closing_index:
#if index == 4 and closing_index == 20: print(next_index)
#if index == 4 and closing_index == 20: print(seq[next_index])
if seq[next_index] == '(':
num_structures += 1
next_index = get_match(next_index, seq) # used to read get_match(next_index, seq) + 1. Fixed on Jan 16.
#print("num_structures", num_structures)
next_index += 1
return num_structures
# index gives the index of the left paren we are looking
# for a match of; string gives the whole string in
# which we are searching
def get_match(index, string, left_bracket='(', right_bracket=')'):
# We keep track of how many parens we still
# need to match
count = 0
# Add on left parens, subtract right ones
# When there are zero left to match,
# we have matched the original
for i, char in enumerate(string[index:]):
if char == left_bracket:
count += 1
if char == right_bracket:
count -= 1
if count <= 0:
return index+i
#print(get_num_structures(0, "((...)(...).)"))
# Check the number of folds in a randomly generated sequence of length n
def generate_sequence(n):
# The list of bases to sample from
bases = ['A', 'C', 'G', 'U']
# Sample with replacement with uniform weights
result = ""
for i in range(n):
result += random.choice(bases)
return result
# Given a sequence and a fold (in dot/bracket notation)
# save the .ct file
def make_ct(sequence, fold):
# For every fold, make a pair_dict (right AND left-looking)
# and a dangle_list, which we will use to make
# the extended .ct files
all_folds = sorted(list(all_folds))
if only_num:
return len(all_folds)
structure_list = []
for fold in all_folds:
pair_dict = {}
dangle_list = []
for i, char in enumerate(fold):
if char == '(':
j = get_match(i, fold)
pair_dict[i] = j
pair_dict[j] = i
if char == '<':
dangle_list.append(i)
if char == '>':
dangle_list.append(i-1)
structure_list.append((pair_dict, dangle_list))
#for element in structure_list:
#print(element)
if write:
# Write these folds to an extended .ct file
for i, fold in enumerate(all_folds):
#if (i+1)%2000 == 0:
#print(i+1, fold)
with open("./folds/fold{}.txt".format(i+1), "w") as text_file:
# Get the corresponding structure
pairs = structure_list[i][0]
dangles = structure_list[i][1]
#print(pairs, dangles)
# Make lists for all the stacking
left_stacking = []
right_stacking = []
# Stacking from pairing
for pos, char in enumerate(fold):
if char == '(':
j = get_match(pos, fold)
num = get_num_structures(pos, fold)
if num == 1:
#print("one", fold[pos: j+1])
# normal stacking
if fold[pos+1] == '(' and fold[j-1] == ')':
left_stacking.append(pos+1)
left_stacking.append(j)
right_stacking.append(pos)
right_stacking.append(j-1)
# 0x1 bulge loop
elif fold[pos+1] == '(' and fold[j-1] == '.' and fold[j-2] == ')':
left_stacking.append(pos+1)
left_stacking.append(j)
right_stacking.append(pos)
right_stacking.append(j-2)
# 1x0 bulge loop
elif fold[pos+1] == '.' and fold[pos+2] == '(' and fold[j-1] == ')':
left_stacking.append(pos+2)
left_stacking.append(j)
right_stacking.append(pos)
right_stacking.append(j-1)
# Stacking from dangles
for d in dangles:
left_stacking.append(d+1)
right_stacking.append(d)
#print("left stacking ", left_stacking)
#print("right stacking", right_stacking)
# Write the first line of the .ct file
text_file.write(str(len(sequence))+'\t'+"fold{}".format(i+1)+'\n')
# Write the other lines
for line, char in enumerate(sequence):
# The first four columns
text_file.write(str(line+1)+'\t'+char+'\t'+str(line)+'\t'+str((line+2)%(len(sequence)+1))+'\t')
# The pairing column
if fold[line] == '(' or fold[line] == ')':
text_file.write(str(pairs[line]+1)+'\t')
else:
text_file.write(str(0)+'\t')
# Same as first column
text_file.write(str(line+1)+'\t')
# Stack-to-the-left column
if line > 0 and line in left_stacking:
if fold[line-1] == '(' or fold[line-1] == ')' or fold[line-1] == '<' or fold[line-1] == '>':
text_file.write(str(line)+'\t')
elif fold[line-1] == '.':
text_file.write(str(line-1)+'\t')
else:
raise Exception("This is not supposed to happen")
else:
text_file.write(str(0)+'\t')
# Stack-to-the-right column
if line < len(fold) - 1 and line in right_stacking:
#print(line)
if fold[line+1] == ')' or fold[line+1] == '(' or fold[line+1] == '<' or fold[line+1] == '>':
text_file.write(str(line+2)+'\n')
elif fold[line+1] == '.':
text_file.write(str(line+3)+'\n')
else:
raise Exception("This is not supposed to happen")
else:
text_file.write(str(0)+'\n')
# Returns the number of folds
return(len(all_folds), all_folds, structure_list)
# Check if a variable has been assigned a value yet
def exists(x):
return (x in locals() or x in globals())
# Fill all the energy tables
def fillEnergyTables(no_dangles=False, zero_energies=False, force=False):
if force:
loadTerminalMismatchHEnergy(no_dangles, zero_energies)
loadTerminalMismatchEEnergy(no_dangles, zero_energies)
loadTerminalMismatchMEnergy(no_dangles, zero_energies)
loadDangle3Energy(no_dangles, zero_energies)
loadDangle5Energy(no_dangles, zero_energies)
loadBulgeLoopEnergy(zero_energies)
loadInternalLoopEnergy(zero_energies)
loadStackEnergy(zero_energies)
loadHairpinEnergy(zero_energies)
loadSpecialLoops(zero_energies)
loadCLoops(zero_energies)
loadAsymmetry(zero_energies)
loadInt11(zero_energies)
loadInt21(zero_energies)
loadInt22(zero_energies)
load_penalty_dicts(zero_energies)
load_multibranch_penalties(zero_energies)
else:
if not exists("terminalMismatchHEnergy"): loadTerminalMismatchHEnergy(no_dangles, zero_energies)
if not exists("terminalMismatchEEnergy"): loadTerminalMismatchEEnergy(no_dangles, zero_energies)
if not exists("terminalMismatchMEnergy"): loadTerminalMismatchMEnergy(no_dangles, zero_energies)
if not exists("dangle3Energy"): loadDangle3Energy(no_dangles, zero_energies)
if not exists("dangle5Energy"): loadDangle5Energy(no_dangles, zero_energies)
if not exists("bulgeLoopEnergy"): loadBulgeLoopEnergy(zero_energies)
if not exists("internalLoopEnergy"): loadInternalLoopEnergy(zero_energies)
if not exists("stackEnergy"): loadStackEnergy(zero_energies)
if not exists("hairpinEnergy"): loadHairpinEnergy(zero_energies)
if not exists("specialLoops"): loadSpecialLoops(zero_energies)
if not exists("CLoops"): loadCLoops(zero_energies)
if not exists("asymmetry"): loadAsymmetry(zero_energies)
if not exists("int11"): loadInt11(zero_energies)
if not exists("int21"): loadInt21(zero_energies)
if not exists("int22"): loadInt22(zero_energies)
if not exists("endPenalty"): load_penalty_dicts(zero_energies)
if not exists("ea"): load_multibranch_penalties(zero_energies)
# Fill in the dangle3 table
# Note that the table is organized as: 5PrimeBase, 3PrimeBase, DangleBase, Energy
# where the 5PrimeBase (i) and 3PrimeBase (j) are paired, with DangleBase dangling off
# to the left of the 3PrimeBase (at j-1)
def loadDangle3Energy(no_dangle=False, zero_energies=False):
global dangle3Energy
dangle3Energy = {}
if no_dangle:
filename = "inf_dangle3.csv"
elif zero_energies:
filename = "zero_dangle3.csv"
else:
filename = "dangle3.csv"
with open("./data/{}".format(filename)) as csvfile:
reader = csv.reader(csvfile, delimiter=',')
for i, row in enumerate(reader):
# Skip the header row
if i == 0:
continue
else:
dangle3Energy[str(row[0])+str(row[1])+str(row[2])] = math.exp(-beta*float(row[3]))
# Fill in the dangle5 table
# Note that the table is organized as: 5PrimeBase, 3PrimeBase, DangleBase, Energy
# where the 5PrimeBase (i) and 3PrimeBase (j) are paired, with DangleBase dangling off
# to the right of the 5PrimeBase (at i+1)
def loadDangle5Energy(no_dangle=False, zero_energies=False):
global dangle5Energy
dangle5Energy = {}
if no_dangle:
filename = "inf_dangle5.csv"
elif zero_energies:
filename = "zero_dangle5.csv"
else:
filename = "dangle5.csv"
with open("./data/{}".format(filename)) as csvfile:
reader = csv.reader(csvfile, delimiter=',')
for i, row in enumerate(reader):
# Skip the header row
if i == 0:
continue
else:
dangle5Energy[str(row[0])+str(row[1])+str(row[2])] = math.exp(-beta*float(row[3]))
# Fill in the bulge loop energy dict (length : Energy)
def loadBulgeLoopEnergy(zero_energies=False):
global bulgeLoopEnergy
bulgeLoopEnergy = {}
if zero_energies:
filename = "zero_bulge.csv"
else:
filename = "bulge.csv"
with open("./data/{}".format(filename)) as csvfile:
reader = csv.reader(csvfile, delimiter=',')
for i, row in enumerate(reader):
# Skip the header row
if i == 0:
continue
else:
bulgeLoopEnergy[int(row[0])] = math.exp(-beta*float(row[1]))
# Fill in the internal loop energy dict (length : Energy)
def loadInternalLoopEnergy(zero_energies=False):
global internalLoopEnergy
internalLoopEnergy = {}
if zero_energies:
filename = "zero_internal.csv"
else:
filename = "internal.csv"
with open("./data/{}".format(filename)) as csvfile:
reader = csv.reader(csvfile, delimiter=',')
for i, row in enumerate(reader):
# Skip the header row
if i == 0:
continue
else:
internalLoopEnergy[int(row[0])] = math.exp(-beta*float(row[1]))
# Fill in the stack energy dict (5primeOuter5primeInner3primeInner3primeOuter : Energy)
def loadStackEnergy(zero_energies=False):
global stackEnergy
stackEnergy = {}
if zero_energies:
filename = "zero_stack.csv"
else:
filename = "stack.csv"
with open("./data/{}".format(filename)) as csvfile:
reader = csv.reader(csvfile, delimiter=',')
for i, row in enumerate(reader):
# Skip the header row
if i == 0:
continue
else:
stackEnergy[str(row[0])+str(row[1])+str(row[2])+str(row[3])] = math.exp(-beta*float(row[4]))
# Fill in the stack terminal mismatch energy dict (5primeOuter5primeInner3primeInner3primeOuter : Energy)
def loadTerminalMismatchHEnergy(no_dangle=False, zero_energies=False):
global terminalMismatchHEnergy
terminalMismatchHEnergy = {}
if no_dangle:
filename = "inf_tstackh.csv"
elif zero_energies:
filename = "zero_tstackh.csv"
else:
filename = "tstackh.csv"
with open("./data/{}".format(filename)) as csvfile:
reader = csv.reader(csvfile, delimiter=',')
for i, row in enumerate(reader):
# Skip the header row
if i == 0:
continue
else:
terminalMismatchHEnergy[str(row[0])+str(row[1])+str(row[2])+str(row[3])] = math.exp(-beta*float(row[4]))
def loadTerminalMismatchEEnergy(no_dangle=False, zero_energies=False):
global terminalMismatchEEnergy
terminalMismatchEEnergy = {}
if no_dangle:
filename = "inf_tstacke.csv"
elif zero_energies:
filename = "zero_tstacke.csv"
else:
filename = "tstacke.csv"
with open("./data/{}".format(filename)) as csvfile:
reader = csv.reader(csvfile, delimiter=',')
for i, row in enumerate(reader):
# Skip the header row
if i == 0:
continue
else:
terminalMismatchEEnergy[str(row[0])+str(row[1])+str(row[2])+str(row[3])] = math.exp(-beta*float(row[4]))
def loadTerminalMismatchMEnergy(no_dangle=False, zero_energies=False):
global terminalMismatchMEnergy
terminalMismatchMEnergy = {}
if no_dangle:
filename = "inf_tstackm.csv"
elif zero_energies:
filename = "zero_tstackm.csv"
else:
filename = "tstackm.csv"
with open("./data/{}".format(filename)) as csvfile:
reader = csv.reader(csvfile, delimiter=',')
for i, row in enumerate(reader):
# Skip the header row
if i == 0:
continue
else:
terminalMismatchMEnergy[str(row[0])+str(row[1])+str(row[2])+str(row[3])] = math.exp(-beta*float(row[4]))
# Fill in the hairpin energy dict (length : energy)
def loadHairpinEnergy(zero_energies=False):
global hairpinEnergy
hairpinEnergy = {}
if zero_energies:
filename = "zero_hairpin.csv"
else:
filename = "hairpin.csv"
with open("./data/{}".format(filename)) as csvfile:
reader = csv.reader(csvfile, delimiter=',')
for i, row in enumerate(reader):
# Skip the header row
if i == 0:
continue
else:
hairpinEnergy[int(row[0])] = math.exp(-beta*float(row[1]))
# Fill in the special loop dict (sequence : energy)
def loadSpecialLoops(zero_energies=False):
global specialLoops
specialLoops = {}
# Triloops
# These are using the 2004 data (not currently implemented)
'''with open("./data/triloop.csv") as csvfile:
reader = csv.reader(csvfile, delimiter=',')
for i, row in enumerate(reader):
# Skip the header row
if i == 0:
continue
else:
specialLoops[str(row[0])] = float(row[1])
'''
# Tetraloops
if zero_energies:
filename = "zero_tloop.csv"
else:
filename = "tloop.csv"
with open("./data/{}".format(filename)) as csvfile:
reader = csv.reader(csvfile, delimiter=',')
for i, row in enumerate(reader):
# Skip the header row
if i == 0:
continue
else:
specialLoops[str(row[0])] = math.exp(-beta*float(row[1]))
# Hexaloops
# This actually does not belong here, but should be checked separately,
# as hexaloops are from 2004 and have their final energy (not to be added to the earlier energy I don't *think*)
'''with open("./data/hexaloop.csv") as csvfile:
reader = csv.reader(csvfile, delimiter=',')
for i, row in enumerate(reader):
# Skip the header row
if i == 0:
continue
else:
specialLoops[str(row[0])] = float(row[1])'''
# Fill in the C-loop dictionary (hairpinsequence : energy)
def loadCLoops(zero_energies=False):
global CLoops
CLoops = {}
# C-only loop of size 3 (from Table 6 of Mathews)
CLoops["CCC"] = math.exp(-beta*1.4)
# C-only loops of larger size (up to 30) (from Table 6 of Mathews)
if zero_energies:
for i in CLoops:
CLoops[i] = 1
else:
for i in range(4, 30):
CLoops["C"*i] = math.exp(-beta*(0.3*i + 1.6))
# Asymmetry term
# Coef rounded from 0.48 to 0.5 to follow unafold and RNAeval with 99 rules
def loadAsymmetry(zero_energies=False):
global asymmetry
asymmetry = {}
if zero_energies:
for i in range(0, 31):
asymmetry[i] = 1
else:
for i in range(0, 31):
asymmetry[i] = math.exp(-beta*min(3, 0.5*abs(i)))
#print("asymmetry", asymmetry)
# Fill the 1x1 internal loop dict (sequence : energy)
def loadInt11(zero_energies=False):
global int11
int11 = {}
if zero_energies:
filename = "zero_int11.csv"
else:
filename = "int11.csv"
with open("./data/{}".format(filename)) as csvfile:
reader = csv.reader(csvfile, delimiter=',')
for i, row in enumerate(reader):
# Skip the first row
if i == 0:
continue
else:
int11[str(row[0])+str(row[1])+str(row[2])+
str(row[3])+str(row[4])+str(row[5])] = math.exp(-beta*float(row[6]))
# Fill the internal 2x1 bulge table (sequence : energy)
def loadInt21(zero_energies=False):
global int21
int21 = {}
if zero_energies:
filename = "zero_int21.csv"
else:
filename = "int21.csv"
with open("./data/{}".format(filename)) as csvfile:
reader = csv.reader(csvfile, delimiter=',')
for i, row in enumerate(reader):
# Skip the first row
if i == 0:
continue
else:
int21[str(row[0])+str(row[1])+str(row[2])+
str(row[3])+str(row[4])+str(row[5])+str(row[6])] = math.exp(-beta*float(row[7]))
# Fill the 2x2 internal bulge tables (sequence : energy)
def loadInt22(zero_energies=False):
global int22
int22 = {}
if zero_energies:
filename = "zero_int22.csv"
else:
filename = "int22.csv"
with open("./data/{}".format(filename)) as csvfile:
reader = csv.reader(csvfile, delimiter=',')
for i, row in enumerate(reader):
# Skip the first row
if i == 0:
continue
else:
int22[str(row[0])+str(row[1])+str(row[2])+str(row[3])+
str(row[4])+str(row[5])+str(row[6])+str(row[7])] = math.exp(-beta*float(row[8]))
# Fill the penalty dictionaries
def load_penalty_dicts(zero_energies=False):
# Penalty for terminating a stack with anything but a GC
# Hairpin version http://rna.urmc.rochester.edu/NNDB/turner99/hairpin-example-3.html
# Rounded from 0.45 to 0.5 to follow unafold3.9
# Switching it to 0.5 for testing against UNAFold
global endPenalty
# Loop version http://rna.urmc.rochester.edu/NNDB/turner99/internal-example-3.html
# Rounded from 0.65 to 0.7 to follow unafold3.9 (tstacki.DAT) and RNAeval with 99 rules
global loopEndPenalty
# Loop version
global loopFirstMismatchEnergy
# Bonus for GU closure (closing pair with earlier two 5' bases before it : energy)
# This is applied 'only to hairpins with a 5' closing G that is preceded by two G residues'
# - Mathews
global guClosureEnergy
if zero_energies:
endPenalty = {('AU'): 1, ('UA'): 1, ('GU'): 1, ('UG'): 1}
loopEndPenalty = {('AU'): 1, ('UA'): 1, ('GU'): 1, ('UG'): 1}
loopFirstMismatchEnergy = {('UU'): 1, ('GA'): 1, ('AG'): 1}
guClosureEnergy = {('GGGU'): 1}
else:
endPenalty = {('AU'): math.exp(-beta*0.5), ('UA'): math.exp(-beta*0.5), ('GU'): math.exp(-beta*0.5), ('UG'): math.exp(-beta*0.5)}
loopEndPenalty = {('AU'): math.exp(-beta*0.7), ('UA'): math.exp(-beta*0.7), ('GU'): math.exp(-beta*0.7), ('UG'): math.exp(-beta*0.7)}
loopFirstMismatchEnergy = {('UU'): math.exp(-beta*(-0.7)), ('GA'): math.exp(-beta*(-1.1)), ('AG'): math.exp(-beta*(-1.1))}
guClosureEnergy = {('GGGU'): math.exp(-beta*(-2.2))}
####################
# Global Variables #
####################
###########
# Testing #
###########
# Show the energy contributions
global debug_energy
debug_energy = False
# Show the structure
global debug_structure
debug_structure = False
############################
# Sequence and Environment #
############################
# The sequence itself
global s
# The length of the sequence
global N
# The scale constant (expected value for one base)
global scale
#scale = -0.34
scale = 0
# The maximum allowed internal loop or hairpin size
global l
l = 30
###################
# Arrays #
###################
# The four arrays (now five because of the two Z's)
# 1D, step through to calculate overall partition function ##BETTER EXPLANATION NEEDED##
global Z3
global Z5
# 2D, forces i and j to be paired
global Zb
# 2D, forces 'structure' somewhere between i and j
global Z1
# 2D, forces multibranch somewhere between i and j
global Z2
#####################
# Bases and Pairs #
#####################
# Convert base letter to corresponding int
global base_dict
base_dict = {'A': 0, 'a': 0, 'C': 1, 'c': 1, 'G': 2, 'g': 2, 'U': 3, 'u': 3, 'T': 3, 't': 3}
# List of lists of allowed pairs.
# R[i] contains all bases i is allowed to pair with to the right.
global R
R = []
# L[j] contains all bases j is allowed to pair with to the left.
global L
L = []
# Will contain the list of all pairs (for one given microstate calculated by the traceback)
global pair_dict
# Will contain the list of all dangles
global dangle_list
# Will contain the next e index for each d
global e_index
###################
# Energy Tables #
###################
# List of allowed pairs, stored as tuples
# Assumes only capital letters and no T's
global allowedPairs
allowedPairs = {('A','U'), ('U','A'), ('G','C'), ('C','G'), ('G','U'), ('U','G')}
# Stack energy
# Arranged like (5primeOuter5primeInner3primeInner3primeOuter : Energy)
global stackEnergy
# Internal Loops
global int11
global int21
global int22
# Hairpin
# Normal hairpin loops (just count how many in the loop)
global hairpinEnergy
# Special loops for which we know the specific energy
global specialLoops
# C-only loops
global CLoops
# Asymmetry value for bulge loops
global Asymmetry
# Internal loops
global internalLoopEnergy
# Bulge Loops
global bulgeLoopEnergy
# Dangle penalties
# Of the form (i, j, dangle)
global dangle5Energy
# Of the form (dangle, i, j)
global dangle3Energy
# Terminal Mismatch
# These are arranged like (5primeouter5primeinner3primeouter3primeinner : Energy)
global terminalMismatchHEnergy
global terminalMismatchIEnergy
global terminalMismatchEEnergy
global terminalMismatchMEnergy
####################
# Energy Penalties #
####################
# Multibranch Penalties
def load_multibranch_penalties(zero_energies=False):
global ea
global eb
global ec
if zero_energies:
ea = 1
eb = 1
ec = 1
else:
ea = math.exp(-beta*3.4)
eb = math.exp(-beta*0.0)
ec = math.exp(-beta*0.4)
################################
# Loading and Useful functions #
################################
# Load in a sequence
def loadSequence(seq):
global s
global olds
global N
global Z3
global Z5
global Zb
global Z1
global Z2
global e_index
olds = str(seq).upper()
N = len(olds)
s = olds*2 # we double the string so that we can correctly handle the wraparound with R and L
Z3 = np.full(N, 1, dtype=np.float64)
Z5 = np.full(N, 1, dtype=np.float64)
Zb = np.full((N,N), 0, dtype=np.float64)
Z1 = np.full((N,N), 0, dtype=np.float64)
Z2 = np.full((N,N), 0, dtype=np.float64)
# Add indices of other bases which can pair with this one
# R contains potential pairs to the right of the base
# L contains potintial pairs to the left of the base
# If given a Python list of lists of pairing partners,
# it will use those instead of calculating all legal pairs
# Usage: if base 0 is allowed to pair with 5 and 6, and base
# 1 is allowed to pair with base 7, it will look like:
# [[5,6], [7], ... ]
def fillAllowedPairsList(likely_pairs=None):
global R
global L
R = []
L = []
# If we gave a list of pairs, load it
if not not likely_pairs:
for i in range(N):
R.append([])
L.append([])
for base in likely_pairs[i]:
if base >= min(i+4, N) and base < max(N, i+N-3):
R[-1].append(base)
elif base+N >= min(i+4, N) and base < max(N, i+N-3): # for the wraparound case
R[-1].append(base+N)
for base in likely_pairs[i]:
if base >= 0 and base < i-3:
L[-1].append(base)
L[-1].reverse()
# Otherwise, load all legal pairs
else:
for i in range(N):
# Make an empty list on the end of our list of lists
R.append([])
L.append([])
# Fill it with the allowed pairs
# These need to be AU, GC, or GU (or mirror/T-equivalent of those),
# and must be able to form a helix of size at least 3
for j in range(min(i+4, N), max(N, i+N-3)):
if ( (s[i] in base_dict and s[j] in base_dict) and