From c73321e77d4e95ac87fd75cc010005670d66936b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Leon=20Sch=C3=BCtz?= Date: Mon, 15 Apr 2024 11:37:35 +0200 Subject: [PATCH 1/4] updated to new file format --- src/straglron.py | 4 +- src/utils/ReaderAnalyser.py | 253 +++++++++--------- src/utils/Visualiser.py | 26 +- .../__pycache__/ReaderAnalyser.cpython-38.pyc | Bin 5229 -> 5512 bytes .../__pycache__/Structures.cpython-38.pyc | Bin 1975 -> 1971 bytes .../__pycache__/Visualiser.cpython-38.pyc | Bin 5557 -> 5659 bytes .../extract_repeats.cpython-38.pyc | Bin 2321 -> 2341 bytes src/utils/extract_repeats.py | 16 +- 8 files changed, 165 insertions(+), 134 deletions(-) diff --git a/src/straglron.py b/src/straglron.py index f328740..67eb389 100644 --- a/src/straglron.py +++ b/src/straglron.py @@ -125,10 +125,12 @@ def main(): loci_dict = ra.lociBedReader(args.loci_file) expansions = ra.resultBedReader(args.path_input_bed, loci_dict) sample_id = Path(args.path_input_bed).stem - + vis.getHistData(args.path_input_tsv, expansions) for expansion_object in expansions: + if __name__ == '__main__': + expansion_object if args.altclust: ra.newGenotyping(expansion_object, args.cutoff, False) diff --git a/src/utils/ReaderAnalyser.py b/src/utils/ReaderAnalyser.py index 63533c6..3271015 100644 --- a/src/utils/ReaderAnalyser.py +++ b/src/utils/ReaderAnalyser.py @@ -6,19 +6,19 @@ import numpy as np from sklearn.mixture import GaussianMixture + def resultBedReader(file, loci_dict): - with open(file) as straglr: - + straglr_reader = csv.reader(straglr, delimiter='\t') expansion_list: list[Expansion] = [] sample_id = Path(file).stem - - #loci allele1 allele2 YES/NO Referenz size_difference PathRange + + # loci allele1 allele2 YES/NO Referenz size_difference PathRange for straglr in straglr_reader: - if straglr[0]!='#chrom': - #Variables are assigned based on the coordinates in the straglr reader - coords = straglr[0]+straglr[1]+straglr[2] + if straglr[0] != '#chrom': + # Variables are assigned based on the coordinates in the straglr reader + coords = straglr[0] + straglr[1] + straglr[2] chr = straglr[0] start = straglr[1] end = straglr[2] @@ -27,8 +27,18 @@ def resultBedReader(file, loci_dict): int_path_range = loci_dict[coords][0].pathogenic_range motif = straglr[3] alleles = 0 - #"-" means that straglr assigned no different number for second allele -> allele 1 = allele 2; In the following 2 different allele sizes were assigned - if straglr[8]!='-': + # "-" means that there was no coverage in this region -> skip this loci + if straglr[4] == '-': + continue + # allele1 = np.nan + # allele2 = np.nan + # copy_number_1 = np.nan + # copy_number_2 = np.nan + # allele1_support = 0 + # allele2_support = 0 + # alleles = 0 + # "-" means that straglr assigned no different number for second allele -> allele 1 = allele 2; In the following 2 different allele sizes were assigned + elif straglr[8] != '-': allele1 = round(float(straglr[4])) allele2 = round(float(straglr[7])) copy_number_1 = straglr[5] @@ -45,113 +55,116 @@ def resultBedReader(file, loci_dict): allele1_support = straglr[6] allele2_support = straglr[6] alleles = 1 - - expansion_object = Expansion(chr, start, end, loci, motif, allele1, allele2, reference_size, int_path_range, copy_number_1, copy_number_2, allele1_support, allele2_support, sample_id) + + expansion_object = Expansion(chr, start, end, loci, motif, allele1, allele2, reference_size, int_path_range, copy_number_1, copy_number_2, allele1_support, + allele2_support, sample_id) analyseGenotype(expansion_object, alleles) expansion_list.append(expansion_object) - - #List of expansions, though not pathogenic by definition, are sorted by chromosome or the normalized size difference between sample and reference length on hg38 in descending order - + + # List of expansions, though not pathogenic by definition, are sorted by chromosome or the normalized size difference between sample and reference length on hg38 in descending order + return expansion_list + def analyseGenotype(expansion_object: Expansion, alleles): - - if alleles == 1: - + if alleles == 0: + # repeat not covered + expansion_object.in_pathogenic_range = 'NA' + expansion_object.size_difference = np.nan + elif alleles == 1: + if expansion_object.pathogenic_range == 'NA': - expansion_object.in_pathogenic_range ='NA' + expansion_object.in_pathogenic_range = 'NA' if expansion_object.allele1_size > expansion_object.allele2_size: - expansion_object.size_difference = expansion_object.allele1_size-expansion_object.wt_size + expansion_object.size_difference = expansion_object.allele1_size - expansion_object.wt_size else: - expansion_object.size_difference = expansion_object.allele2_size-expansion_object.wt_size + expansion_object.size_difference = expansion_object.allele2_size - expansion_object.wt_size - #Check for pathogenic expansion size happens here; Size of sample expansion checked against lower boundary of pathogenic range from literature; Assignment of respective loci and sizes to corresponding lists + # Check for pathogenic expansion size happens here; Size of sample expansion checked against lower boundary of pathogenic range from literature; Assignment of respective loci and sizes to corresponding lists elif expansion_object.allele1_size > int(expansion_object.pathogenic_range) or expansion_object.allele2_size > int(expansion_object.pathogenic_range): expansion_object.in_pathogenic_range = 'Yes' if expansion_object.allele1_size > expansion_object.allele2_size: - expansion_object.size_difference = expansion_object.allele1_size-expansion_object.wt_size + expansion_object.size_difference = expansion_object.allele1_size - expansion_object.wt_size else: - expansion_object.size_difference = expansion_object.allele2_size-expansion_object.wt_size - - #If size of sample expansion is still below pathogenic range, they are added into general list - else: + expansion_object.size_difference = expansion_object.allele2_size - expansion_object.wt_size + + # If size of sample expansion is still below pathogenic range, they are added into general list + else: expansion_object.in_pathogenic_range = 'No' if expansion_object.allele1_size > expansion_object.allele2_size: - expansion_object.size_difference = expansion_object.allele1_size-expansion_object.wt_size + expansion_object.size_difference = expansion_object.allele1_size - expansion_object.wt_size else: - expansion_object.size_difference = expansion_object.allele2_size-expansion_object.wt_size - - else: + expansion_object.size_difference = expansion_object.allele2_size - expansion_object.wt_size + + else: if expansion_object.pathogenic_range == 'NA': expansion_object.in_pathogenic_range = 'NA' - #Same check and assignment as above - if expansion_object.allele1_size>expansion_object.allele2_size: - expansion_object.size_difference = expansion_object.allele1_size-expansion_object.wt_size + # Same check and assignment as above + if expansion_object.allele1_size > expansion_object.allele2_size: + expansion_object.size_difference = expansion_object.allele1_size - expansion_object.wt_size else: - expansion_object.size_difference = expansion_object.allele2_size-expansion_object.wt_size - - elif expansion_object.allele1_size> int(expansion_object.pathogenic_range): + expansion_object.size_difference = expansion_object.allele2_size - expansion_object.wt_size + + elif expansion_object.allele1_size > int(expansion_object.pathogenic_range): expansion_object.in_pathogenic_range = 'Yes' - expansion_object.size_difference = expansion_object.allele1_size-expansion_object.wt_size - + expansion_object.size_difference = expansion_object.allele1_size - expansion_object.wt_size + else: expansion_object.in_pathogenic_range = 'No' - if expansion_object.allele1_size>expansion_object.allele2_size: - expansion_object.size_difference = expansion_object.allele1_size-expansion_object.wt_size + if expansion_object.allele1_size > expansion_object.allele2_size: + expansion_object.size_difference = expansion_object.allele1_size - expansion_object.wt_size else: - expansion_object.size_difference = expansion_object.allele2_size-expansion_object.wt_size - + expansion_object.size_difference = expansion_object.allele2_size - expansion_object.wt_size + + def lociBedReader(bedFile): Loci = {} with open(bedFile) as lociCoords: - lociReader_reader=csv.reader(lociCoords, delimiter='\t') + lociReader_reader = csv.reader(lociCoords, delimiter='\t') for locus in lociReader_reader: - if locus[0]!='#chrom': + if locus[0] != '#chrom': key = locus[0] + locus[1] + locus[2] - new_object = Locus(locus[4],locus[0],locus[1],locus[2],locus[3],locus[4],locus[5],locus[6],locus[7],locus[8]) + new_object = Locus(locus[4], locus[0], locus[1], locus[2], locus[3], locus[4], locus[5], locus[6], locus[7], locus[8]) if key in Loci: Loci[key].append(new_object) else: - Loci.update({key:[new_object]}) + Loci.update({key: [new_object]}) return Loci + def newGenotyping(expansion_object: Expansion, cutoff, new: bool): - concat_reads = [] number_of_reads = 0 - + if new: - + for new_readlist in expansion_object.new_read_list: - concat_reads += new_readlist number_of_reads = len(concat_reads) else: for original_readlist in expansion_object.read_list: - concat_reads += original_readlist number_of_reads = len(concat_reads) - + rearanged_array = np.array(concat_reads).reshape(-1,1) - X = rearanged_array - - N = np.arange(1,4) - M = np.arange(1,3) - + + N = np.arange(1, 4) + M = np.arange(1, 3) + # fit models with 1-3 components - + if len(np.unique(X)) < 2: models = [GaussianMixture(1, covariance_type='full', init_params="kmeans", max_iter=500).fit(X)] - + elif len(np.unique(X)) == 2: models = [GaussianMixture(m, covariance_type='full', init_params="kmeans", max_iter=500).fit(X) - for m in M] + for m in M] else: models = [GaussianMixture(n, covariance_type='full', init_params="kmeans", max_iter=500).fit(X) - for n in N] + for n in N] BIC = [m.bic(X) for m in models] @@ -159,11 +172,15 @@ def newGenotyping(expansion_object: Expansion, cutoff, new: bool): cluster_assignment = best_model.predict(X) + # print(cluster_assignment) + # print(X) + # print(np.std(X)) + cluster1 = [] cluster2 = [] cluster3 = [] clusters = [] - + for i in range(len(X)): if cluster_assignment[i] == 0: @@ -172,109 +189,103 @@ def newGenotyping(expansion_object: Expansion, cutoff, new: bool): cluster2.append(X[i][0]) if cluster_assignment[i] == 2: cluster3.append(X[i][0]) - - clusters.append(cluster1) + + clusters.append(cluster1) clusters.append(cluster2) clusters.append(cluster3) - - clusters.sort(key=len, reverse=True) - + + clusters.sort(key=len, reverse=True) + cluster1 = clusters[0] cluster2 = clusters[1] cluster3 = clusters[2] - if cluster3: distance13 = abs(np.mean(cluster3) - np.mean(cluster1)) distance12 = abs(np.mean(cluster3) - np.mean(cluster2)) - - if distance12 < distance13: + if distance12 < max(5, 2*np.var(cluster2)): if distance12 < distance13: - if distance12 < max(5, 2*np.var(cluster2)): - + if distance12 < max(5, 2*np.var(cluster2)): cluster2 += cluster3 - - else: - - if distance13 < max(5, 2*np.var(cluster2)): - + #print("Cluster 3 wurde cluster 2 geadded werden") else: + if distance13 < max(5, 2*np.var(cluster2)): cluster1 += cluster3 - - allele1 = cluster1 allele2 = cluster2 - - #print(allele1) - #print(allele2) - - if allele1 and allele2: - - expansion_object.new_read_list = [allele1,allele2] + + # print(allele1) + # print(allele2) + + if allele1 and allele2: + + expansion_object.new_read_list = [allele1, allele2] expansion_object.new_allele1 = np.median(allele1) expansion_object.new_allele2 = np.median(allele2) - + if expansion_object.new_allele1 > expansion_object.new_allele2: - expansion_object.new_size_difference = expansion_object.new_allele1-expansion_object.wt_size + expansion_object.new_size_difference = expansion_object.new_allele1 - expansion_object.wt_size if expansion_object.pathogenic_range == "NA": expansion_object.new_in_pathogenic_range = "NA" elif expansion_object.new_allele1 > int(expansion_object.pathogenic_range): expansion_object.new_in_pathogenic_range = "Yes" - else: + else: expansion_object.new_in_pathogenic_range = "No" - + else: - expansion_object.new_size_difference = expansion_object.new_allele2-expansion_object.wt_size + expansion_object.new_size_difference = expansion_object.new_allele2 - expansion_object.wt_size if expansion_object.pathogenic_range == "NA": expansion_object.new_in_pathogenic_range = "NA" elif expansion_object.new_allele2 > int(expansion_object.pathogenic_range): expansion_object.new_in_pathogenic_range = "Yes" - else: + else: expansion_object.new_in_pathogenic_range = "No" - - expansion_object.new_copy_numberA1 = expansion_object.new_allele1/len(expansion_object.repeat_unit) - expansion_object.new_copy_numberA2 = expansion_object.new_allele2/len(expansion_object.repeat_unit) + + expansion_object.new_copy_numberA1 = expansion_object.new_allele1 / len(expansion_object.repeat_unit) + expansion_object.new_copy_numberA2 = expansion_object.new_allele2 / len(expansion_object.repeat_unit) expansion_object.new_allele1_support = len(allele1) - expansion_object.new_allele2_support = len(allele2) - #print("New: " + expansion_object.title, expansion_object.new_read_list) + expansion_object.new_allele2_support = len(allele2) + # print("New: " + expansion_object.title, expansion_object.new_read_list) else: - expansion_object.new_read_list = [allele1+allele2] + expansion_object.new_read_list = [allele1 + allele2] expansion_object.new_allele1 = np.median(allele1) expansion_object.new_allele2 = np.median(allele1) - expansion_object.new_size_difference = expansion_object.new_allele1-expansion_object.wt_size - + expansion_object.new_size_difference = expansion_object.new_allele1 - expansion_object.wt_size + if expansion_object.pathogenic_range == "NA": - expansion_object.new_in_pathogenic_range = "NA" + expansion_object.new_in_pathogenic_range = "NA" elif expansion_object.new_allele1 > int(expansion_object.pathogenic_range): expansion_object.new_in_pathogenic_range = "Yes" - else: + else: expansion_object.new_in_pathogenic_range = "No" - - expansion_object.new_copy_numberA1 = expansion_object.new_allele1/len(expansion_object.repeat_unit) - expansion_object.new_copy_numberA2 = expansion_object.new_allele2/len(expansion_object.repeat_unit) + + expansion_object.new_copy_numberA1 = expansion_object.new_allele1 / len(expansion_object.repeat_unit) + expansion_object.new_copy_numberA2 = expansion_object.new_allele2 / len(expansion_object.repeat_unit) expansion_object.new_allele1_support = len(allele1) - expansion_object.new_allele2_support = len(allele1) - - - if number_of_reads > 3*cutoff: + + # temp_list = expansion_object.new_read_list + + # print(len(expansion_object.new_read_list)) + + if number_of_reads > 3 * cutoff: for new_list in expansion_object.new_read_list: - + if len(new_list) <= cutoff: expansion_object.new_read_list.remove(new_list) - #expansion_object.read_list = temp_list + # expansion_object.read_list = temp_list newGenotyping(expansion_object, cutoff, True) - -def expansionScorer(expansion : Expansion, new_clustering : bool): - #score = x-xmin/xmax-xmin + + +def expansionScorer(expansion: Expansion, new_clustering: bool): + # score = x-xmin/xmax-xmin if expansion.pathogenic_range != "NA": if new_clustering: - score = (expansion.new_size_difference)/(int(expansion.pathogenic_range)-expansion.wt_size) + score = (expansion.new_size_difference) / (int(expansion.pathogenic_range) - expansion.wt_size) expansion.new_norm_score = round(score, 4) - + else: - score = (expansion.size_difference)/(int(expansion.pathogenic_range)-expansion.wt_size) - #print(expansion.in_pathogenic_range, score, expansion.size_difference, int(expansion.pathogenic_range)) - expansion.norm_score = round(score, 4) + score = (expansion.size_difference) / (int(expansion.pathogenic_range) - expansion.wt_size) + # print(expansion.in_pathogenic_range, score, expansion.size_difference, int(expansion.pathogenic_range)) + expansion.norm_score = round(score, 4) else: - expansion.norm_score = None - \ No newline at end of file + expansion.norm_score = None \ No newline at end of file diff --git a/src/utils/Visualiser.py b/src/utils/Visualiser.py index 4aec811..064f66f 100644 --- a/src/utils/Visualiser.py +++ b/src/utils/Visualiser.py @@ -27,8 +27,16 @@ def getHistData(file, expansions: "list[Expansion]"): for line in read_reader: if not line[0].startswith('#'): if expansion.start+expansion.repeat_unit == line[1]+line[3]: - read_coords_dict.update({line[5]:int(line[8])}) - read_list.append(int(line[7])) + # skip reads with read_status != full + if line[14] != "full": + continue + # read column 'read'/'read_name' and 'read_start' + read_coords_dict.update({line[7]: int(line[11])}) + if line[11] == "NA": + read_coords_dict.update({line[7]: np.nan}) + # read column 'size' + read_list.append(int(line[10])) + expansion.read_dict = read_coords_dict expansion.read_list = [read_list] expansion.title = title @@ -43,11 +51,15 @@ def getHistData(file, expansions: "list[Expansion]"): for line in read_reader: if not line[0].startswith('#'): if expansion.start+expansion.repeat_unit == line[1]+line[3]: - if expansion.copy_numberA1 == line[10]: - read_list_A1.append(int(line[7])) - if expansion.copy_numberA2 == line[10]: - read_list_A2.append(int(line[7])) - read_coords_dict.update({line[5]:int(line[8])}) + # skip reads with read_status != full + if line[14] != "full": + continue + if expansion.copy_numberA1 == line[9]: # read column 'copy_number' + read_list_A1.append(int(line[10])) # read column 'size' + if expansion.copy_numberA2 == line[9]: # read column 'copy_number' + read_list_A2.append(int(line[10])) # read column 'size' + # read column 'read'/'read_name' and 'read_start' + read_coords_dict.update({line[7]:int(line[11])}) expansion.read_dict = read_coords_dict expansion.read_list = [read_list_A1,read_list_A2] expansion.title = title diff --git a/src/utils/__pycache__/ReaderAnalyser.cpython-38.pyc b/src/utils/__pycache__/ReaderAnalyser.cpython-38.pyc index f8365cb6d16086fc114b1d4eb40af1868c6a0f52..e6f97fd4620e23145a0cd36cb3c7237c199887c8 100644 GIT binary patch delta 2807 zcmahLU2Ggjd1iNSy}Q2K-TU=VV)x>ZG^e<6n+ghU8gS~AHb_QwLJ23rarUg8^UvOS zb`KJ3j)NYe0ja8`sj7+xbVgO*kcxPL2LuniRJ_ots;YgcP~ZB%TLq{u<@@Fw*LD+f zr}^fa@Bf>b{l59vnLB6Q?_@F#f#2=ExANZCe(!43eT@t`XMAjMu-5%|D`@GZIHm%BC?P1#h zU!T%PKDOjL+JESnd|Ut3>1m`rGqV}SGf~{+_0{0~OgjpAG*iDBE#GW!JT|l1h-Jrk zVi;qM761?`|6-JN!*=C|#zb+PYDD|Y&>UJa_&H!#C_gKm)I|5Ekd2^>u4{apX@b;9 zOxJLR=))!8=)n0%kTrv8fc;<~1M3tFbKT$zd-%{O>`6`E`X|z1?5)EWksn(CjUHV$ zqMsmtf890sTg*VWD26OB+26zOXYx@7l7}?jJ**MBXbHyMAbt(FN&t>z>R<|8bO^-U z0}ND=zaKSfTsai>T}S7Sh6a-1|FQjD*fK`0l8U*&;Uq|+EzEHp5&e5X1gmDnPV$Yg z)~d%ho2x_R2L>B-MYH21-Q+ax5s^C&z^R*l`sjr(9SA0T#f9T)yHVCQ!Io>s% zMlg)PN03F}A#edyOrA%)fS`yVg+97>YeXf8J{1#B^>vw+`{mi>^0QDOLU z#-5@=F51)Gi!t2IHN8hEUKo9z+?ei|vBor{0MAq$U6@ahjviA1k{tu+aQ7(LnvIQq zO6a?^Z!(=3yIN;R*&6~|7EASXw86~SM)?u=O2Ny}F9~Ql%wR*iTDN2EEi6#BV+yO` zJ7gC=XZYX=|0qs_pu>h)nq^)#`WfV{(vB0(9+B1HVmL3670O)Z{oE4vJh%h7oTB-P zW({bHl2w#3N0Ekw1CCuWyi3POCoR%_Pd@LAP+vBjW8Gf}x1YU3L=LJD@OLqbj9MA% z==M$F-G^^0{*v&I@^|RglF0V6!d2D~^Y$@+fUV_X%5p5NViY|txV2nEv96z5gp2_FvK z3%gSye_tV23Aw`U6ouC*iQi=woOl@HH*L=<-^;G+sJDSQlW$ta95s2Dmw zzPB1zb?+Fgjq1mHGbbj%Gjj=;E zt{=9cd_bscR*xQokD}In2#vzs|D05YRk4TpoB*t(u&MAj0x}*Ut4RN1&V&5#=KRF~ zKRgiNp(J16|9wD&4+?M)>Xa~;CE$c%=M1O_Co)Co7FQrG(myFi@Y2O!64<SIf^{Uc5Y8-u4&Tn@a&NH&@CW zDo-?fprT=cON^EF#ztbUZwB=+l6NxP|2o#NZy=aMa7jAuWJy&stf`4}0i7;4H(M8f zCoj6KGf$(`JOC(V-$V%-^5>AYufx9u%eUnF?%Z{xRcKXTYAo|-Q0N|#e*k)f4+WEG zpw) z5{&3?00=dxp}})Uw`kgMs6(y2zXLrDJDZ`Ufs>=-T0wr2U2>rBSu}?oP`;3xrcCy7 zzwN3`OoO?KrJ5?g0k{g3W0w3FfW*a!!Y1FWMaxYd@Gm3L`J7PEc>{&lke(XIsMr-y zSH=r&wcvI&n&DMt;ykwecwMv#3b2wT$@BkRNmroT=W_0UG_i@YfyJXt;{s$8IrX&CW delta 2513 zcmai0UuYaf7@wKl+xx%w|I(!KlBO-~**1w$+N!0R8jJLy#I}M4Bi zxp~5!|Vd})VFL;I0TjN(gU}2^(VVSd(^Mmhff|r+liq?+Z}S; zswhueBSXuMC7hx1%%mtUyCXwRMa`bqCs|LDDq0UDusT?~P- zNX345R2W8deuy;~FTrp)tVU8!=b89Xk~zW|pfS!QDN)ejIj7C*;+YQCx53Iwfu$>G zhtbjPchT<}AW5@9=XHl;K*s>O_>@EVrjqYr3XDgU9On9#jPEyR*e1@gQNjKUFf5=k z&sM209$3Te$D~i7e}gjA7~Dj9eXS!~i9zJ4`)WT88?==hC#Ojr@di0 z0IJ2H*bX-JyOCGf;I2_G;mlsJN_=)|$`Y1cu|zkxi_CwGVh><}C!{R~5fwl@3&S}G z0#1P)9LUCKk|kN18Z@aH)C5fw)K|gVV|bW-5+aHyfg*+?g(8h2i6Rc7XoxK09Ev=O z0E#~KTj)$-GYK!gP_4`!v-l~i%q=ma#>0J~=OG%2t8?K2s;ke#`y0KuO=INEY~7HW zG!Buv=2EG_=bm4=kx2P`Aw(&GjPEfx6oLrs zC|Gu*Mi$N|ncLo^+p+QC`OR&zJONq`S{^tH0P{^mefA{@?7PS`Y7g>nuVtP8#$;{6 znwWY7VNJZfnK01t2_Ahc!*znWHbJTLfP|L?yPylAZpL$?nz#n#NUL9V;rVfQNW2&l zO^uROZRwid!Ez@B+!y55fADL!k&O-#RorYQk7A2>0mX|bM%5j&koTJfHY+c3WMS5E zE7ir?(J$1#L~ZXejKZ4;I>m8}ptg7gZTE7-2~fXOKO{yQXe-hQZF06EUdEt@L|z8F zqk}LABWIxMbAY{|u+G9AYPI{^Fy=mmVgy8Sz>C|~xsrh1RYH26wqV&_ctdA)%?p&p zPEkHT0hOGYa?P4B%A#yfTb{9O&n_)n!oYfC>T0r=_Nl*;qr0#ldl{^}wWVXn_qx!d zZW@`i-{QTfuxeJ>EkQX~S*bY_FK*AMEW6dIDRD&INL_t(FV0Z}F%iOSB-`F7e+jp$Y7Y_@U;pck>LVzJ~@s=_L2Vj8_OXbYbZD+P>pT-Q) z1INfQA)c@Z)oGrZTArCqEP$n$5;=7}I~eu?_VQxwyck!BT+a~xo$)U_fftLXPgbi7 e0$1xN-8W(TF{3y+qcq1* zKRvT#^9ANGM#iYk>Z~gmMfHF>i}XPRD~MnN5eAd*vq?<;!{*7Y4&rKn2+hd>?6PvY eK&Cwq7c&6~4#px7-%pdNh!IE^=}zuu*8u=~4I(!H delta 152 zcmdnYznz~ul$V!_0SIo$o!`j4fLYZ|KR2&Lzqlm7C^0?NNWZi=wWwG>F(aucv9!cU zzc{%lv!JAS^BLwaM#fv4m04FX3g`oM7a4#ERuI89`2m{*qv_-?Y`)wYAOTGfp*7iu mT~;*;mGivj?&B_!$q diff --git a/src/utils/__pycache__/Visualiser.cpython-38.pyc b/src/utils/__pycache__/Visualiser.cpython-38.pyc index 03ea88ab911b60050937107ee44d882ff602731f..db9df9167b279e9d42e4ed5779ab9885746dbf50 100644 GIT binary patch delta 1357 zcmZ{i-)|d55XW!#?p&XLoSnpJyqEu8>_z|wY^N;xK zbN0@pm) zXJ_}zsZXcX7vu4W0KYdsQOm2^7wYRId4>u?O<1QMc|q8suVndsG85Y(?3f$!guCR! zANY6V=)mTB)QcKYTIec<{E#?h{udeBD1$AW77V2>vPY|BFVaO-BrUix7^7*3uaSp@ z2>q22t5f#VK2>~0{l~9#6G2QIV=w}H?$~0;`%9>J)5Wd9j1rgCez0@uL_2kK%t1 z8|=_8J`wZc6}neHUlgZEg1;r^CRMlwUn;jPt9J;dy(q5fwdZ?yK|KYa4fC%>Crq0y zUu^14J}o&U$GcL2Wcl|}F?$>|yO;8frki(Mo9PvECf{~U=H&INQ>wPDlle-`<>%zW z^i7ZoL?RI~PD1cS;8S5OkuZ!$XpB%gN@65MQW}I5bEM_#3odpLgA`1LLID0HuqngbiP9aVMeEGJem+HRAOhX%C zIsQv{4&DoRM-f?mK{+h5Ct=QSDf8qR{#cm}_5J*JWq2IV&H|i(nICUiwmVmYHdL72 znB`AJ=E?PKJ2EGR&ZF`^3m%DbfKi!g#NRJK_|ceN=t!M|1W(J|a-5;4#JRL998A4`lzGiaDYT;>-O zS3-CqHp@RwoPPQWPICZX((lw9UvjGU8oP>yL(FVJdrWqX|C-26plB&>x7}9TT`t>} zVKP5bsxrG_J9filPw+zWYAB2O7WjL~337vfp4=(Eieb@~?v|>{CEK#uk{m4hecNkZ zQnxJAGH=$LwrY@HU39x@oWRkOX8)P&BCZM b5P8IP#61M2+qXr(S$uu?lzc`lgpI!e@Nq_& delta 1212 zcmZ{iPi)(C6vzGiCvlv_$+D$w%4nO4b#*%k+B(#AsDe?IvTmz1iGPu%;Kru)>eSiK zF5AGNph3BSR`pHnz$FtG#0^zQNPxtNE7Fb;5(hW{7cL;7UEny3P$4bMDGb=#%LaY*t_^vz&KjLR{_2ACGU$G70$c{X?xea0@ zRxTf)`q_%KCeu2KbNj24qjaC+{W$+0$N%ni8?KD}wYOZHUy=XWT`YI;wlWSm{7N}m z)@YnR>5Mdk$7>zLtQz7ws(?|vr%pi?YuZd|oKpAZG0^(VWnphb$7^9zchE;DAY9vNgx;%e!}r1{6x`k(EH0jJ`$4oonP5fOtr^@h7GZJsuCbt`9%JV; zj+mz^`ySFC_wClQW6!>U!hE4zBBKR8wb`=UmMr!f!SBp5IEJah5DejwLM=PM<%Stg zV}sNS_{+<`A=>Lk>&Z98hNHkyGSh`N?6_D`Z{`@ZYD z&w1g7O`~$fIx3}2*xJ2O+Ja5)Q53k8Ht*Z(YZi#)tQ6LtC-hQ5wCak`(!?5TNvHhP ziKN8~6j`R|(5+r<1(EkMZ6**-aw_6Y_FZROV?>O%7;iH+CF!&zmME;}lMWiq)gYi9 z?v45)C3;#?ZC_8KO{u&1^`Xwe{R{Oe0#vS5|<%`oE$#vz77YLnQ dFEJ(m%Hy3$yZ_F_vI7XggK=}o$`BL4B^hTzrP4ze0$c)T}-A-Y5P#`s1 z)J7I@jyOZ?5ZlBqaW;a6*3RHGG9qZ3I2{y(5!qu9hTMfREXV;|TAR7Fdz=o3ozBdp zaHh=7vI=qp6;pnMHACkDTYdQz4vjTZf8Cc=wrq31IqYm+J8B7e$hK}~XohbPXkbAu z84O%lg&ebhQJ(y45g5C|uuOI6y#K^X%sEGKtcQW{n|NOSOt%UJrPyJ|>$dB>^)iUl zJQ#2+$*#V?sw`jv>#A%acrMWMMID2F;0eAizw5VPTNaIz<0VyK@E(ew5vSY13m)sw zuq6V#PBRq~Zx9_Ay`iXwUYsJv5-GT%v7rqhl{Z9`Fvu zJ`H)*e7f=fBCL_VLxURx>Z&Zjn*3nyS$8R~=vbDHb>VoMlB5rrTvl=7Yg0vA1P4oh z%!M8oUcK*$CaOzI4s_P1ofPeQ?EoKA&D<-q!6v`>h?ELEt^hhJvhkj3Q+q%dXZ>EM o6ZoRl>vCjDzpNm6yT1mL6D4i_*?0at@`rUXld_9;$ucbcH+&1Fy#N3J delta 764 zcmZ8f&ubG=5Z-y|{>V#{w($pz*191gM5w7j!Fp(wY7xZZrI&(+HF+^1F`LYLfl@XI z=H43K)ocF(FJ3%(@+96Q|3MJE3PI3$8xid;`+YOB-_CrqyC>OEHnW#bbBm7e=Rb>| ze3Us^eNUOP6bxa&6pO4H4B7G9K_W`rx1PQkec9ZsB?oR~*Lcnvc*H5D^S?U?2Z8uXw(!6Q%FwX&SaLYO&C=VGqq4izY{87 z95khx*i@Xj20Q~vf}*28LSBD^;Myvs(?_M39aWMl#NdS}m-rd_z?=BUJHu^g6U3j%Y32vE+i)Tz68!DM-onU|IkE?nhRZ!^$ zs)6Rs#G9Np$Yastpe5qzGn0%1C!_d~o+`p_5ioBej`vKP2Rnr0R5fh3MO8J!jzsf| gV8MWI;IF~BXWZtWxwCJee|eXZjGyHzp5xiS0UVjBLI3~& diff --git a/src/utils/extract_repeats.py b/src/utils/extract_repeats.py index 64a3edf..c520690 100644 --- a/src/utils/extract_repeats.py +++ b/src/utils/extract_repeats.py @@ -5,6 +5,7 @@ from collections import defaultdict import re + def parse_tsv(tsv, loci=None): support = defaultdict(dict) with open(tsv, 'r') as ff: @@ -13,18 +14,23 @@ def parse_tsv(tsv, loci=None): continue cols = line.rstrip().split('\t') locus = cols[0] + ":" + cols[1] + "-" + cols[2] - status = "does not exist" - read_name = cols[5] - size = cols[7] - read_start = cols[8] - strand = cols[9] + status = cols[14].strip() # was "does not exist" + # ignore skipped/failed reads + if status != "full": + continue + read_name = cols[7] # was cols[5] + size = cols[10] # was cols[7] + read_start = cols[11] # was cols[8] + strand = cols[12] # was cols[9] if loci is not None and locus not in loci: continue + # print(read_name) support[locus][read_name] = int(read_start), int(size), strand return support + def extract_repeats(bam, support, flank_size=10): seqs = {} for locus in support: From 6fd0f93aa23fb946bcbd358ea8d4f88eb68c0002 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Leon=20Sch=C3=BCtz?= Date: Mon, 15 Apr 2024 14:12:57 +0200 Subject: [PATCH 2/4] Refactoring --- .gitignore | 2 ++ src/utils/ReaderAnalyser.py | 34 +++++----------------------------- src/utils/Visualiser.py | 25 ++++++++++--------------- 3 files changed, 17 insertions(+), 44 deletions(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..14d349b --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +.idea +src/utils/__pycache__ diff --git a/src/utils/ReaderAnalyser.py b/src/utils/ReaderAnalyser.py index 3271015..8cb3b80 100644 --- a/src/utils/ReaderAnalyser.py +++ b/src/utils/ReaderAnalyser.py @@ -30,13 +30,7 @@ def resultBedReader(file, loci_dict): # "-" means that there was no coverage in this region -> skip this loci if straglr[4] == '-': continue - # allele1 = np.nan - # allele2 = np.nan - # copy_number_1 = np.nan - # copy_number_2 = np.nan - # allele1_support = 0 - # allele2_support = 0 - # alleles = 0 + # "-" means that straglr assigned no different number for second allele -> allele 1 = allele 2; In the following 2 different allele sizes were assigned elif straglr[8] != '-': allele1 = round(float(straglr[4])) @@ -137,13 +131,11 @@ def newGenotyping(expansion_object: Expansion, cutoff, new: bool): number_of_reads = 0 if new: - for new_readlist in expansion_object.new_read_list: concat_reads += new_readlist number_of_reads = len(concat_reads) else: - for original_readlist in expansion_object.read_list: concat_reads += original_readlist number_of_reads = len(concat_reads) @@ -172,10 +164,6 @@ def newGenotyping(expansion_object: Expansion, cutoff, new: bool): cluster_assignment = best_model.predict(X) - # print(cluster_assignment) - # print(X) - # print(np.std(X)) - cluster1 = [] cluster2 = [] cluster3 = [] @@ -203,19 +191,15 @@ def newGenotyping(expansion_object: Expansion, cutoff, new: bool): distance13 = abs(np.mean(cluster3) - np.mean(cluster1)) distance12 = abs(np.mean(cluster3) - np.mean(cluster2)) - if distance12 < max(5, 2*np.var(cluster2)): if distance12 < distance13: - - if distance12 < max(5, 2*np.var(cluster2)): + if distance12 < distance13: + if distance12 < max(5, 2*np.var(cluster2)): cluster2 += cluster3 - #print("Cluster 3 wurde cluster 2 geadded werden") else: - if distance13 < max(5, 2*np.var(cluster2)): + else: + if distance13 < max(5, 2*np.var(cluster2)): cluster1 += cluster3 allele1 = cluster1 allele2 = cluster2 - # print(allele1) - # print(allele2) - if allele1 and allele2: expansion_object.new_read_list = [allele1, allele2] @@ -244,7 +228,6 @@ def newGenotyping(expansion_object: Expansion, cutoff, new: bool): expansion_object.new_copy_numberA2 = expansion_object.new_allele2 / len(expansion_object.repeat_unit) expansion_object.new_allele1_support = len(allele1) expansion_object.new_allele2_support = len(allele2) - # print("New: " + expansion_object.title, expansion_object.new_read_list) else: expansion_object.new_read_list = [allele1 + allele2] expansion_object.new_allele1 = np.median(allele1) @@ -262,22 +245,16 @@ def newGenotyping(expansion_object: Expansion, cutoff, new: bool): expansion_object.new_copy_numberA2 = expansion_object.new_allele2 / len(expansion_object.repeat_unit) expansion_object.new_allele1_support = len(allele1) - # temp_list = expansion_object.new_read_list - - # print(len(expansion_object.new_read_list)) - if number_of_reads > 3 * cutoff: for new_list in expansion_object.new_read_list: if len(new_list) <= cutoff: expansion_object.new_read_list.remove(new_list) - # expansion_object.read_list = temp_list newGenotyping(expansion_object, cutoff, True) def expansionScorer(expansion: Expansion, new_clustering: bool): - # score = x-xmin/xmax-xmin if expansion.pathogenic_range != "NA": if new_clustering: score = (expansion.new_size_difference) / (int(expansion.pathogenic_range) - expansion.wt_size) @@ -285,7 +262,6 @@ def expansionScorer(expansion: Expansion, new_clustering: bool): else: score = (expansion.size_difference) / (int(expansion.pathogenic_range) - expansion.wt_size) - # print(expansion.in_pathogenic_range, score, expansion.size_difference, int(expansion.pathogenic_range)) expansion.norm_score = round(score, 4) else: expansion.norm_score = None \ No newline at end of file diff --git a/src/utils/Visualiser.py b/src/utils/Visualiser.py index 064f66f..5feed3d 100644 --- a/src/utils/Visualiser.py +++ b/src/utils/Visualiser.py @@ -13,6 +13,7 @@ from Bio import SeqIO import pysam + def getHistData(file, expansions: "list[Expansion]"): expansions_read_lists = {} for expansion in expansions: @@ -54,9 +55,9 @@ def getHistData(file, expansions: "list[Expansion]"): # skip reads with read_status != full if line[14] != "full": continue - if expansion.copy_numberA1 == line[9]: # read column 'copy_number' + if expansion.copy_numberA1 == line[13]: # read column 'allele' (not 'copy_number') read_list_A1.append(int(line[10])) # read column 'size' - if expansion.copy_numberA2 == line[9]: # read column 'copy_number' + if expansion.copy_numberA2 == line[13]: # read column 'allele' (not 'copy_number') read_list_A2.append(int(line[10])) # read column 'size' # read column 'read'/'read_name' and 'read_start' read_coords_dict.update({line[7]:int(line[11])}) @@ -64,16 +65,16 @@ def getHistData(file, expansions: "list[Expansion]"): expansion.read_list = [read_list_A1,read_list_A2] expansion.title = title expansions_read_lists.update({expansion.repeat_id: (genotype1,genotype2,title,read_list_A1,read_list_A2)}) - + return expansions_read_lists + def plotHistogram(expansion_object: Expansion, plotfolder, bool_altclustering): save_place = plotfolder+"/"+expansion_object.title+'.png' ref_size = expansion_object.wt_size path_range = expansion_object.pathogenic_range - - + if bool_altclustering: allele1_size = expansion_object.new_allele1 allele2_size = expansion_object.new_allele2 @@ -93,10 +94,7 @@ def plotHistogram(expansion_object: Expansion, plotfolder, bool_altclustering): plt.axvline(allele1_size, color='k', linestyle='dashed', linewidth=1, label=allele1_size) plt.axvline(allele2_size, color='k', linestyle='dashed', linewidth=1, label=allele2_size) plt.axvline(ref_size, color='green', linewidth=1, label="wt size: " + str(ref_size)) - - #if path_range != "NA": - #plt.axvline(int(path_range), color='grey', linewidth=1, label="pathogenic range: " + str(path_range)) - + plt.legend() plt.savefig(save_place) plt.close() @@ -111,14 +109,12 @@ def plotHistogram(expansion_object: Expansion, plotfolder, bool_altclustering): plt.axvline(allele1_size, color='black', linestyle='dashed', linewidth=1, label=allele1_size) plt.axvline(allele2_size, color='red', linestyle='dashed', linewidth=1, label=allele2_size) plt.axvline(ref_size, color='green', linewidth=1, label="wt size: " + str(ref_size)) - - #if path_range != "NA": - #plt.axvline(int(path_range), color='grey', linewidth=1, label="pathogenic range: " + str(path_range)) - + plt.legend() plt.savefig(save_place) plt.close() - + + def alleleVisualiser(fasta_file, motif, flank_length, title, output_folder, chromosome, start, end, reference_genome): chr = chromosome @@ -218,7 +214,6 @@ def alleleVisualiser(fasta_file, motif, flank_length, title, output_folder, chro plt.close() - def rectangleMaker(motif_colors, motif_coord_list, size, flank_length, motif): patches = [] color_list = [] From 3c04de51a4d5c8a490c27c049e5f094bb52645cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Leon=20Sch=C3=BCtz?= Date: Mon, 15 Apr 2024 14:19:06 +0200 Subject: [PATCH 3/4] removed temp files --- .gitignore | 3 +++ src/.vscode/launch.json | 13 ------------- src/__pycache__/expansion.cpython-38.pyc | Bin 2790 -> 0 bytes src/__pycache__/locus.cpython-38.pyc | Bin 730 -> 0 bytes .../__pycache__/ReaderAnalyser.cpython-38.pyc | Bin 5512 -> 0 bytes src/utils/__pycache__/Structures.cpython-38.pyc | Bin 1971 -> 0 bytes src/utils/__pycache__/Visualiser.cpython-38.pyc | Bin 5659 -> 0 bytes src/utils/__pycache__/expansion.cpython-38.pyc | Bin 2796 -> 0 bytes .../__pycache__/extract_repeats.cpython-38.pyc | Bin 2341 -> 0 bytes src/utils/__pycache__/loci_struct.cpython-38.pyc | Bin 3343 -> 0 bytes src/utils/__pycache__/locus.cpython-38.pyc | Bin 736 -> 0 bytes 11 files changed, 3 insertions(+), 13 deletions(-) delete mode 100644 src/.vscode/launch.json delete mode 100644 src/__pycache__/expansion.cpython-38.pyc delete mode 100644 src/__pycache__/locus.cpython-38.pyc delete mode 100644 src/utils/__pycache__/ReaderAnalyser.cpython-38.pyc delete mode 100644 src/utils/__pycache__/Structures.cpython-38.pyc delete mode 100644 src/utils/__pycache__/Visualiser.cpython-38.pyc delete mode 100644 src/utils/__pycache__/expansion.cpython-38.pyc delete mode 100644 src/utils/__pycache__/extract_repeats.cpython-38.pyc delete mode 100644 src/utils/__pycache__/loci_struct.cpython-38.pyc delete mode 100644 src/utils/__pycache__/locus.cpython-38.pyc diff --git a/.gitignore b/.gitignore index 14d349b..c9cb337 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,5 @@ +.vscode .idea src/utils/__pycache__ +src/__pycache__ +src/utils/.vscode \ No newline at end of file diff --git a/src/.vscode/launch.json b/src/.vscode/launch.json deleted file mode 100644 index 1b4ab29..0000000 --- a/src/.vscode/launch.json +++ /dev/null @@ -1,13 +0,0 @@ -{ - "version": "0.2.0", - "configurations": [ - { - "name": "Python: Current File", - "type": "python", - "request": "launch", - "program": "${file}", - "console": "integratedTerminal", - "args": ["/mnt/storage2/users/ahbraut2/data/snakemake_test/benchmark/straglr_test_new_Loci/og/Sample012_23014LRa.bed","--hist", "/mnt/storage2/users/ahbraut2/data/snakemake_test/benchmark/straglr_test_new_Loci/og/Sample012_23014LRa.tsv","-o", "/mnt/storage2/users/ahbraut2/scripts/TestFolder/Home", "/mnt/storage2/users/ahbraut2/data/Master_Bedfiles/RepeatLoci_straglr_LT_V2.bed" , "--bam","/mnt/storage2/users/ahbraut2/data/ONT_Pilot_TNAMSE_Links/Sample012_23014LRa.bam", "--cutoff", "3", "--altclust"] - } - ] -} \ No newline at end of file diff --git a/src/__pycache__/expansion.cpython-38.pyc b/src/__pycache__/expansion.cpython-38.pyc deleted file mode 100644 index 6084794ac52111d5b29985b26a9b714925321a1a..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2790 zcmb7F%Wfk@6s>N*+Hc2sPUZmv5@29r#vst35iflW58p1u{Qk=-^`ZG}X= zW7d2FEb~jXWyP9ZR-D^*+wH^=Nm+evJ+AB2xpk}R=X$*&P})yMGIE6Yn;Y4ZL*qF< z-3Gi8B*=P;?|1u9kbNLJIUoH?#YXEnJ{<$3IJ8I{+GLUaRvhLiM|s#h6{rYXpb|N- zMJiJTwnSB`!8%l@25gy{)Pk+h8nt1ov`!naHQJ;t*g9>~4s3(&&@OC~?$SNj7Tw3% z9+>jW-^O03gHh&ot-Mlks?j zh0Z#_KI-GjM!)*XgX zS4Sh|^<}R+(NgKIcl<_qlc?9#z6!>X?jA%K9;hF~u2z0mrg^tFJ~fW(2G}9j<-?@+ z0KzU>H~*cr=V`o`PUUm{riJVCd}f`~eCAv_xAxC1wSm5lt?t4k^+ebd?xW9CwTxb4 zYkt7aaEDH2x6U>f#2F9Yf=gPs&R=qtIPS6d1-$AVRMoox&?mec%PeJoLCJ0_C^H^c zvb4d~>i(Vur* z7TtiVeR=XqhNI|oEIas*=HLn>=-JNF{%7276M}?#RS4?Ph&f+O@2yA7Jj=W71@Ds2 zZD?8j%E&L#_CEW6+TNAyoA1H$VN0TqeBZ>JvqUp_zU{6FdV1tO&#d7}$Wx~qYgeCM zr@S!@W;)}St7(^=Zma!HMtdQ-sxOPLamL{yKj3@(NoEaOi(SvU+*~;C<;pFuv8I&r z>1xWX%WXyPW<=-tO}eZ(yl9ZZhxeJM`*t;9#$z9EC|;@1 z8%ozT71te(Xo5G_H8t1$b>a=CBkBRuJ!If3Qhmtq5d&Wv>SKma82F7-U4|Y5=eOc) zQ(rMGUV@~?502s|gin>`F2{0qo!sn~pVyL}#B(1vEB|BFRK|HaUHvya-@o|8f6kUn AE&u=k diff --git a/src/__pycache__/locus.cpython-38.pyc b/src/__pycache__/locus.cpython-38.pyc deleted file mode 100644 index c2eddfeaf5e994d8659b4bbaa1111536c026f1dd..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 730 zcmZvZ&5qMB5XbF&v}u}_RfQ0*(91^J8v-FD4qR5{u;P-7Wn#CDl*DE1AhGg|eHmVb zNAQ(L;I^loNrV&xw&v&Y*nfY_eA;YshHdxC=${c|zi2XVF;3p0*&X5oQ%vy(wC_>H zRCLGoagFi68lEyKo};5##TBa}#Z~mcs#wLSNkV#S>cQ5VnUH*vzDKjS2+ute%tuJ> zV`StLWb9L9;upx&FOdtMA(wuI%zTbq`86{41#+zzUda#pA|$=+bja&7n9f*(erlZr z7h>J3kaos3$B?%4w6K%bn!1rrE1?>zrPW~z`lvzowHCIy*P-YQbg~s7`;!h=7wOLC zr2D3xBGo)H9sXjrXN*ic_nVMd-5!Aw0F-h_2q|GfSQ0YAia<4kHR7K;zCwZF9}KoQ zJiacw-j&uFkSBdp4pxIL<@qzn!QGU$hUVgIdFODsg^zt{p)SY&Uthk3Oo*m$oDlRN z`x3$U;=lcVJJ$31kKlkMMm42dh>#1>88x)z7eag)WIL_E6&^W0bcSaX;|R)yC1iY@ XANJEY+e?205`^r{bj4c|{6=F>HdL_p diff --git a/src/utils/__pycache__/ReaderAnalyser.cpython-38.pyc b/src/utils/__pycache__/ReaderAnalyser.cpython-38.pyc deleted file mode 100644 index e6f97fd4620e23145a0cd36cb3c7237c199887c8..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 5512 zcmbtY-ESOM6`wm_yE|Ube%M}rq%CgB*Mb|@jw&Dxp-GdL7V1J$AdCphcxSvBdv|7i zXEu$ayMXG57Nn{MRj5K0a(F1Yo~#S+M0XM zJ?GwYzs^19clJAzla_+No&D0r-@l?L|DwkDpN7Ws_~Kun@Dz`=6(_5i!_ZdSYDaT4 zrg)mCxAl(U7?L(Xn~n*Z#^jtlYO`JFSdIl=&dc9boC)9X3Lh}f@+R);PSGoRli*JJ zhHrcJ2dY={%HWq|yMlJrn?k$nv1>|w`Vi|;*6J+PzvU)34;fl$6t3Lvx?vndAsWUv zqh>D#QF_Vk#c|+~bW`seLc6T&u^ZR>)I*zBJbg>!OBs#Qm{(>M)F#z^GvoJjnWUf3B>jS5 z0zGK_rqQ<&bF7`)W&MeCdr#-W8hnF9r1@k_ng|v-pA#Ky1(YKQc zkrR1Q5SEw}B~ca=q9`h&DyGD=Fm~{YGWqav+%CpM1m8;5i zaB<_Mwfn4az?_2Tw}VcQ_?*+F-#4FX-sDk-lcgTg#U4sZCmDPCFi)Hu6$TY16$KP^ zJJqAEAEsI}zLlEXcRgsN@u?msekavKx8tW;5GJY3{SBY{VbgEK!5u#>cVTi-%MXKQ zgS%nNPYoXRLN7Ho+L4=ZnmaAIq1)bx{g?bON_M(_YPwy_=hbVez7e$jw9t;4LBk80 ziIa;H?zY<8u?IvWTaRP=LwjztgE&bGvD@jk{YKz9W;2So7i0Z5d1}Oo%ac@ten%(0 zd>(6XY^=UPiybbr6D7e$nseK2zwO6EVr59II+M+)yVD4Jopql#R*pAUu_rljr4jeK z-3Sv77^@>j`Dnq>`ljDZ4ivtGrQCmUxf3SKaS~yp{_1is_IbSQ-i(_!d+n9wR*)=T z%g*4na5?79oMXQnpZ($xi~D(VAWwOCE(XP~jp_!m(fa~EVGWMI-={1g@_ zn8)x07d%xc4MjQ(rGY0Hu0x_h@{v$Bbx#HL!>kY520n93=W7ExHwUNF(6;|ce0+m; z_$JAdJW73+ZRzokNq*GkA9^~?O1($|jr#-oQ5v7|(DT@cza1YD+N25Ja1Fl!E!PjX za{A;Bz^neB@eaw4){GIa99#8pp4F2pL(lMk>-H2l26Ih1f>d3*D4js6eIKT+u0{1i zIu(S{{*OIYHSDHZ=!R*DUNc_c#^81H1zc=e92q&i^+5|3POaP4T8wX7`;DSgN(G7|<2nOcRcteG&}vMMz-$)u6rvV08f6xu^g)O+5ZZJKFXM912u~>C z@<-Bwq!DbuK|nzX=f7(q$Qgodnf!MMTF||(3sbuCMZxwB$jlZCSjHK1?i9`^*Ens9 zled<)$;n5{>*Vypd69|O zg+h)_r#RQ?6L0|hrZ zO|U~4UxFlILPH?ex2xfW1iohzEd)^1=(p526V+3R1DPo$nlQ)5fe)X8KzlCHaXZ>Q zwr>)MyQlVZQeO_b@}98^a0gvxQlS2`2>h5O_j3iKED!dA{`mo&2VF=QS`%+8_W&~h zP89x8Qba$1JOX&{Ai$HX#rjrw?nGY|FW!1f*x+jaC$1g@6BN3f2xPI*lN#>Hr3k$`J_Zp>ek`v&6ewcDw=)$x&?9sV2P>_Br$&FGfPk(L2h z6GC<-f-oSw%H2-vd6~vOL&XY;`e)Kf3Y%mPS?;2>k!=}WP8n5U#O7zw*)d$s-5m!CwvlScj~o*QLP?hi)^^XA_s9usk)+iLLMPYdJ|$Mj%cy6D1oW6P7R_Cc z9i8kj)nH+%1`~A54!VOgK`S1>zwyQ@|1#94QzZUa%s5MukHMM}C#DsWjwMzXR~}}q zX4*&CbC6)kDKq9#l7(!w6W#LV0UudmeNMvB)NJ;WXk&wal}@DCjKU@kQa($kG)U~C zjRB)bDrAUMJlJ|Sy8uu#fC|r1$0dqn-apJUS$jFG^FZ-Jkmz>fvHth zTeVD#mKw^yaFjsOS2IbE1-aB^nm~4Q;4C2JTwr+`V-$h#p%*ybCcKYvc{Q&c(I#fE zVoXs(FZ|nBMb>L5u`JAw@kN!>j_PV!7^T#yEe$CTknjVG9oD31#8kcXLuz5W% zlP7)yv^0A8V)!XEvsH%??=<3O#C`rOF|8v;UE=`_+a$h`^^s0)4yqjP4G8ElG3q`$s!i&wksn@GC>FNk0O{@@E>EDL4k+ zMRzB2MWuMF4Vw?obg~}{Gxu@^ie)M$F?_18M^Rf|keo8J#S?4JpU0Evwfzf}yv3AS XlHFwYj#|q8;zF52iiSVxu)r`)+9OILW@<=zE( zO_#p#Uk0MX{XZxVKbbt7*jB}M1c`}Amt{RRV=-)$E{{U)Q6GmzWNNX49 z)E>~SeV|tdK)>z)gSrRo)O}!B4}jfz2<+7%uwRdWgSrbGasnJ#jyaycw?fT+iafCT zL#w%G$8TBQ=lF&PcKo*GJC^TSzGwOV9euO@Atzw8yD-Ffiy{}%BjYBg()dcpQX5y~ z+;pWVMXXuMO}`yhd8*AI&N7jSDO2gWxRPd#G)#)}jOEq(g^=G(%?)v8#+#j0Sr%|l ztANd|qVq~O2X$4vE;E6b@0GDWEmk5=6DH$)CCp)(vpdQ$K8Nx2y4c%*89VS!MbtwlBRd%*+Fv*>(3JPTaoVDsNkSO}Vjp zxsTg#c&$ChJ^JlrChv60=FAgjAFGR3)cXHl3_eJ$bHSDw99w6D5;LrPEBIu1a(zg*qB5#kKE=pk72y@RP{pbdf1mRtMD4)48WEU zy_?5t!IJ#WU8ZMkeSLnRk6_>kETs-6dR3w#3FyyWpj&ZB?*&GJ{qncvX z?AUr4+REBR@5&G1kf>)owA}Qdh862V6>G802;;8{ot_v!!{?ZTSg9gOV=XvT3ngMD z%t(s1YP3ZS(=5)Ka$^%G>Ng+j7_O5O#5UtTLvf0jAwEZ7JT)0-9lZ7O5lqaXTgQ=I zkv_6++eG;y^5hfPAU{S#h$F=I-0k6?!o;`GZ6t?OqP&P4`w6YEYnKVXZo=pr{4#dy G=;?nt^41Rk diff --git a/src/utils/__pycache__/Visualiser.cpython-38.pyc b/src/utils/__pycache__/Visualiser.cpython-38.pyc deleted file mode 100644 index db9df9167b279e9d42e4ed5779ab9885746dbf50..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 5659 zcmbtY&5t8VmCwizJ8h?3E|jQG75`R!`8Y~c5=Kdtrl+J^BL>RkL;=zI}R@}DTU!C7RCn9rEO zO>RZz$nq_;4Q@yF$ngvFcF`|tn(LRI87uo`(443;@_bM87ozH@=GSz)7%hzIetoWI z(OZr?ZB z%@bH*v{`1q6UdztmUl{D-k${HWDt+N|k(!FVX*@O` z+)OQLDC39?8D}eovWDgg{H|Npijmr&*r|ikNejwRg+#n?4P4{f()@Y1d& zezDh6cvK@*ejf^T!KZ)nRQuUYadvnJUSU1Zh)Nr>;tUhI&!K zrcOhdm;u(3O`w!|b}s)N<8#J0jeEv-jXig6<&%%F(u1YdGfRK@VM`ygUDj0U5D427U={e)c?baqd4LHjMUd($X7DH5?lee>=cC$tGqN+{Zw z1Z~jF#@-%fB@s@-Kz0T^tIV6z@jzx?fP)Z)w>rt-Fg)#T&I;2B4`i5GgR#ua@g%dx!8j`f69{vfH{cyU=*oGMj#TEzfsDee z+Knd%o$++E9f~(@eYm^TuED-R6bdp1Sa5dkI$9;^NwU%zS-ePWrG7Y$<-sJpb>7_a zVR69kK-ahKzngco4mkEE3Dvd2ZX64qsN;O6hxog-W-kP zW+Gz|^uw*qX%dQLGuTPGJJaaaW`7_z?@J6t;_dNfBD$MXIf#^+s^mN5j5a9~$z zDPzchTgK1?Zkgg0v@Bqk0URr$bW(TRKwZ$ZqNcg23lu2olUI|aS^3ythAa&$%HEguFitWj3bw-tknwQ9<<8;4U46dSr0D7^rl~rlv3_e7QCb&!m0ZwL3BAFF> zgZ@;6nZ1LvFE)r?2y(sFXaL+D1^ZdyVD8L)UHipg|9$dJWLvtkmwA3`16 z7sW}~E{RV=onM|rvFybWfZK0u$8m(U%5F48KoEm*-!GF`X99~u-xXm`n+YkHnHzY% zZht`=LmuyF-bqBM!kykjF}+)YiSDs@$bj{E z?6JuVNG_>5BsbDUxj?|C2!_f?;1-X~w5gUL-_-KI1MelZ1l~WVi)b(7SwVudDlz8~ zlFbMI@t&Es(#u#u3#lK|V9OO#tGfjxJh#=VVn`KTtYH~z0F;y7@v2&#H55Ci_D>VR zWySsh^Njx`*T^fo7W9MYS$w)1Cpn*x&%vCwWa?pYXIcB ztijWGN6O@06Tv6FBAc^I#|EriOP^DK_m(kShRHf|6L#L;-%gOLsE!9J3EAQT#9z>tfK~k!>?8?gV2Vg$dRgcQY&Q z^~5cRQUK$9>bY4IYP|9=kX@HTB)6eJdMwsS{4FX7 zw=y@GZWClCS%rs5R}3a}8O`i{r2LtEKxIvwUR%Er>>^hs%o7C^ekloMClDey&~ZZ$ z+-9|a^A5u2{9a3|lvx=D(=^dwJ1YX%yE|dxJNgPNzE1iGOfx&!A0)77Cw?ftL4+c% z2ox5?UG&;7`enV3I+TuQ&M20HUgq_pV7%MG_UOxqd1A- zfor7ILad@>yBCUZ+zmUFtB(-Mz~|$F(fdj`_8aHJ@bZ-S{Dpk>(|)3ZsCb7|dU`Qr zb}SOV+QsbWQJ1zfv`^(cdgZ+IcKx#E(geS*t3319!ObsH^u~1lnR>b!_>~Kc85?Ab zd`8zMi(;=C?B@};qc1nUH+Q#I31`p}^L9bU^2B%Jsp!z|_8X_X=kHJaGgj|c^gwQ( zkKKqD`1C=nKYi*BA(*_6!Z7K^;3C}C%%;9QG|aM9NAR`F`CY=B)0@mQadQAa{$?Hg zWwvUr(yanFi8XT#G|yVKJhW@0=cokunS zq$zmpwG51XI<4kw zQdN!wgVrQ%F27r;!rJpyU`wGC5okoV9D> z`>5up1UnH&L2qO61L8}mP*m(w@k1(pM8y#mbOLcI_15q1#K=uWj^lL2rDPw*vmhHlyME8 zb|v!dMA%T9EuI4-?%)KGSs|}AK8rPCa)92GImSK&!EaoY%F$GiyRzjA1LtQr$>I9q zT-p~&j^!g#Y26;gI@kz~uD+n_lvf`kZOjD+(KiR7^KJLFQOu`N_&UY!gl-QWT;R=} QLhiWGGi$HZKI`)T1=!)Kd;kCd diff --git a/src/utils/__pycache__/expansion.cpython-38.pyc b/src/utils/__pycache__/expansion.cpython-38.pyc deleted file mode 100644 index 2b91d7ff9572c448be0e70d2765c9510a403b8f2..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2796 zcmb7F%Wfk@6s>N*+HdEb$-Eel00R><27v~RUn`~M=eJe?$cH3076%zH1 zS@R9B%rDiJ6>D}`acdrezy+<+54hf@X^0eY_y)^(+NO|1B=9gO%~a2#X*4zRD>;3iOR4gs*nR) zrYhB7D^#ZjtV2y|!B(kF9oQPJ&?;=5)@U8JK^wFQ+oUbphHcRu+JSA;UAhO`q5D|d z15&4RqYK%p=7j z9S@avB>TOwmP+@$<2TA1$Nir6RWOQmZ$HLxsD6xkTKT7tmce5xAx90wT`~7t?t64^;F&j?xWAt zwTyql2K|8j;SQb7Zk=t+iE|#l1(&RFoxk)%ar6`M3wYH-RMk5G(5Ji{(LCkRoRZy^ zQ06?Y$mJbRDS6Wp>rdQFWdD^OMh!wTcN%tR+{BpL0(1+L z=+8PXi*7;Ho;-OaqhWkHl3n~qb8rO`^lW=!|8s7)2|-G|ECfwx#GEfCx7TB4p61=w zoOi+JHngmMY2+7Z`=9g zSW`y%Y&m7#<+h@Ca-xg;GF{f3U9jI{?3*0_)eDwAZm#(A>lD`^hxeJM`*t~D&SMWR zDPF0_8%WnRHP;;sX^fZGH4WGOb?k+c5%qxS9x?D0sXk!%kb$oa^%29z4E)Ba9z&mj z^ILJYsV^DkFF{)42S@P}!lz1emt#3QPGR~h&T45->bZxTmH)A9D&stxtp01B?_Yf4 EKVSb&lmGw# diff --git a/src/utils/__pycache__/extract_repeats.cpython-38.pyc b/src/utils/__pycache__/extract_repeats.cpython-38.pyc deleted file mode 100644 index e62969a19eecefc31ad8a999efee79c8bee11624..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2341 zcmaJ?OLH7G5SBF0oqgEhk=UseWk{g1K(Zth7n5?45S~?-M}UL{mus?`Sr0oqyVht) z;%H7A&T!^H?Q)<;ll^@)lb8IXX@_%}IA5X_rE0IU71%hc0Yx*X1qlaQ6}IdVGodJiy53A+J3m z-GE2D4jJ+WZ$j2My+zu~``DRmwyCc3B#kGzPJN-HLr zLP%5g!*7+NoSS6pm&#QxXRCxe114VHT~Y4LnUk5z-8rp1k5e&)Wl5riVgxX+g*{qil!8O zZAw)=bre&0n+mUy-Id$q4e}}ZjNBoL;q#X^H5G#5qYmi#16n(v>-!7s(!R;TehZy; z(@CE12SzvNwU47B`z9^zK$uzDHA&O%Xq*(<7o`;0SbOC-&m@G^oUD-ThIVB6op$q4 zFVjIe8IMOoYL=!jWksUh-Y74%2lu4Zeko%KSrbXjw~KfPf0^wi-N+i?#ac`fals!G z@jT)x=y`4#ZrPI%3=N9JobMbM}DMha{ky&1LlKU|B z8M^TW-y@b;0E|a6@c}n|top>zB5KIm`q4iY1OPyx4nwEJkb zaW4_t8OHa~Wrg-YBN+S`qTt#a3s6n_=|~J?*a&E(Y6%M4(c#TZ(fw z7(_A}8loCTIoPD@#Cf4zfe;Ma3@f%VA1qB$Fj{2&9oY?yV$9u|HP{&rbW6oW_k!7f z=>)%wv3wOBp)QTUpQq?KwgP#YuCt5uG-I%s-&y<`^dj}?Stj1Z{9|R=q}>!ETb94X zKmyFR9N$1`riMCmG6l)DE`w~K*ebzz4b*ZCxfGi@oXt?M0TEC6a~wG(yBD4;k(}Vr z9hAs>d(Dl96%G;q2>-SxPT`FxKPX@c*X|&BfWmc8WHJ%2!Gps1Iz-nWP2_eOYX{P1 zTRdmX^`oTXD8xD}Cv)aw0=C|Os|?W*Bw`sU5!W&98N<aS@w4bNa;stEW&773`R z!Zr7pZnhmA#-eZQnr>Ye{mC#XyGB0|nRo}UxQSN!=uAxv zskQYyqaQxzY+N)O)MX5CHaMLrt2{|j(ipBohJuFTrktr$`O*{ucgoC7ev613Sb`j% zhX9}-l<`nEujg66FfqTI<%xxZO$IW38q*fZy<$dJ(REK9W;l%j>+NwYcPt0&shs#E z9ob1A#e+nAfVGQ+vk2I1Q1KxwZKRLdvfhq_`6lr`I$g(gl_x!!jf&FVgW^TwGg1E+ qpG}V4<%!pt%Ssp?=d+OJ3>DTY4dc9(xe776&y{cjSO@nO7W3Y`X z9|QIcW-#+L>hHND8s%6XqdZ0?vkr}R!3EV+$zimODq{dQKxZ7~n8|XljiWrvGYhT7 z3Ty#wffZQ^?E)*a3fdy8vKrbFtFs2$GHbFH+6r4_OK7WXnXRC$u~oK)w$9es2HFO@ z$2QS6*?sl^ZHqm`*&eC#H@^*>Kzd=|F5)z4p*|h+C93=b5Sy`K8*vV3#(7{awt)G# z0JP!-U?DC77vd7I7?**ixB@K4RbVBq0jqHxSc@CLdfWsy8139VF@P z{0bw!s~*v;F#`wB_muGuj%BphRo*=nsvsjrL@Lh%rpkg3xD(kPQ`J#-7GTIa zCwVJDDEhW^Ly>O9JU41DDob+zSXj`QC=e_V6bVWMWdibg0f#q4ji64@AZQY_2*@wQ z62S)G(hv{OTMD;@*DUh^fLW?7z0SQ6PBmKll}amnVIC7CTY|3su>xxe&99$lp?^Vya8q@7bm z+!o()xmX5iss-v_*Fp(0{z(4%HWt(KWOSb{-&+2*sd^g?`;emGsihCZEWyELAuN-Jx_T0Z0q=>DQO2@&J&Hr~fA*oW$aqvPgNVf_yIQ zhcfK*Nv@&_{V?*5RRI~pP^u+IN~8`>#2Jz>$sNg6L-0{DHcId)7jpb0h11#0Q8tfD z_FKzjc6ufuZ`YH@fNp^JieQp8y`Ukvrr0LfA=o7#9nG-#RglUumefWh_t;03lt$45 zGVi-+XgiEEnO@rxpMzXbC=kyGw42TIk}t9PEvkMeN~WN350RFE L1({m^+SvOS$FsvE diff --git a/src/utils/__pycache__/locus.cpython-38.pyc b/src/utils/__pycache__/locus.cpython-38.pyc deleted file mode 100644 index eba8e8cd924ac8c2f214d4d95865c69a85a7fbe6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 736 zcmZvZ&5qMB5XYUbHf__gsu1E8df7;OLqZ6|fy=5KR$OwiOzgIilDNz`NUVHgUxruV z5q#wlxb10Y8X*ONqxpF}_J771e_F4Xg2wihH9un^e(_>D5?Z_^!!6;1&_YKKWZ&aT z==e_TlA7Xw4azbqo|9ul6=_k$I@0ljs1lu!CJouGtp}{v6CwL7eNTqB1Rr@Jg^!7m zPl&NkiHXmMsh<%uKPS$7PMrG%G51U2!mo%+Ul3PXP|0Y&D?-|<&V*%s0^3=%LBb1-1~+DL5fO(=Q`ooXeh{%FG0S-F!v zn!c&WNHvX&$zM#)Tq5Ji-8!Ub+5>O{fLjhJBV)`Mb4JcsF!;@2Mfj)2S12g_L%@cn z$9HAdyAquRbu>5SfCjKsr_Z1UcT-{w&Dmjj=V-fyk9~?vEC~7QvBVn|{3a7mPWZE3 From abd4cd4d59ada957f578936ba16b532dab6db3b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Leon=20Sch=C3=BCtz?= Date: Mon, 15 Apr 2024 15:10:54 +0200 Subject: [PATCH 4/4] Revert to commit 03551bc sry, that should have been on my fork -.- --- .gitignore | 5 - src/.vscode/launch.json | 13 + src/__pycache__/expansion.cpython-38.pyc | Bin 0 -> 2790 bytes src/__pycache__/locus.cpython-38.pyc | Bin 0 -> 730 bytes src/straglron.py | 4 +- src/utils/ReaderAnalyser - Kopie.py | 350 ++++++++++++++++++ src/utils/ReaderAnalyser.py | 227 ++++++------ src/utils/Visualiser.py | 47 +-- .../__pycache__/ReaderAnalyser.cpython-38.pyc | Bin 0 -> 5229 bytes .../__pycache__/Structures.cpython-38.pyc | Bin 0 -> 1975 bytes .../__pycache__/Visualiser.cpython-38.pyc | Bin 0 -> 5557 bytes .../__pycache__/expansion.cpython-38.pyc | Bin 0 -> 2796 bytes .../extract_repeats.cpython-38.pyc | Bin 0 -> 2321 bytes .../__pycache__/loci_struct.cpython-38.pyc | Bin 0 -> 3343 bytes src/utils/__pycache__/locus.cpython-38.pyc | Bin 0 -> 736 bytes src/utils/extract_repeats.py | 16 +- 16 files changed, 509 insertions(+), 153 deletions(-) delete mode 100644 .gitignore create mode 100644 src/.vscode/launch.json create mode 100644 src/__pycache__/expansion.cpython-38.pyc create mode 100644 src/__pycache__/locus.cpython-38.pyc create mode 100644 src/utils/ReaderAnalyser - Kopie.py create mode 100644 src/utils/__pycache__/ReaderAnalyser.cpython-38.pyc create mode 100644 src/utils/__pycache__/Structures.cpython-38.pyc create mode 100644 src/utils/__pycache__/Visualiser.cpython-38.pyc create mode 100644 src/utils/__pycache__/expansion.cpython-38.pyc create mode 100644 src/utils/__pycache__/extract_repeats.cpython-38.pyc create mode 100644 src/utils/__pycache__/loci_struct.cpython-38.pyc create mode 100644 src/utils/__pycache__/locus.cpython-38.pyc diff --git a/.gitignore b/.gitignore deleted file mode 100644 index c9cb337..0000000 --- a/.gitignore +++ /dev/null @@ -1,5 +0,0 @@ -.vscode -.idea -src/utils/__pycache__ -src/__pycache__ -src/utils/.vscode \ No newline at end of file diff --git a/src/.vscode/launch.json b/src/.vscode/launch.json new file mode 100644 index 0000000..1b4ab29 --- /dev/null +++ b/src/.vscode/launch.json @@ -0,0 +1,13 @@ +{ + "version": "0.2.0", + "configurations": [ + { + "name": "Python: Current File", + "type": "python", + "request": "launch", + "program": "${file}", + "console": "integratedTerminal", + "args": ["/mnt/storage2/users/ahbraut2/data/snakemake_test/benchmark/straglr_test_new_Loci/og/Sample012_23014LRa.bed","--hist", "/mnt/storage2/users/ahbraut2/data/snakemake_test/benchmark/straglr_test_new_Loci/og/Sample012_23014LRa.tsv","-o", "/mnt/storage2/users/ahbraut2/scripts/TestFolder/Home", "/mnt/storage2/users/ahbraut2/data/Master_Bedfiles/RepeatLoci_straglr_LT_V2.bed" , "--bam","/mnt/storage2/users/ahbraut2/data/ONT_Pilot_TNAMSE_Links/Sample012_23014LRa.bam", "--cutoff", "3", "--altclust"] + } + ] +} \ No newline at end of file diff --git a/src/__pycache__/expansion.cpython-38.pyc b/src/__pycache__/expansion.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6084794ac52111d5b29985b26a9b714925321a1a GIT binary patch literal 2790 zcmb7F%Wfk@6s>N*+Hc2sPUZmv5@29r#vst35iflW58p1u{Qk=-^`ZG}X= zW7d2FEb~jXWyP9ZR-D^*+wH^=Nm+evJ+AB2xpk}R=X$*&P})yMGIE6Yn;Y4ZL*qF< z-3Gi8B*=P;?|1u9kbNLJIUoH?#YXEnJ{<$3IJ8I{+GLUaRvhLiM|s#h6{rYXpb|N- zMJiJTwnSB`!8%l@25gy{)Pk+h8nt1ov`!naHQJ;t*g9>~4s3(&&@OC~?$SNj7Tw3% z9+>jW-^O03gHh&ot-Mlks?j zh0Z#_KI-GjM!)*XgX zS4Sh|^<}R+(NgKIcl<_qlc?9#z6!>X?jA%K9;hF~u2z0mrg^tFJ~fW(2G}9j<-?@+ z0KzU>H~*cr=V`o`PUUm{riJVCd}f`~eCAv_xAxC1wSm5lt?t4k^+ebd?xW9CwTxb4 zYkt7aaEDH2x6U>f#2F9Yf=gPs&R=qtIPS6d1-$AVRMoox&?mec%PeJoLCJ0_C^H^c zvb4d~>i(Vur* z7TtiVeR=XqhNI|oEIas*=HLn>=-JNF{%7276M}?#RS4?Ph&f+O@2yA7Jj=W71@Ds2 zZD?8j%E&L#_CEW6+TNAyoA1H$VN0TqeBZ>JvqUp_zU{6FdV1tO&#d7}$Wx~qYgeCM zr@S!@W;)}St7(^=Zma!HMtdQ-sxOPLamL{yKj3@(NoEaOi(SvU+*~;C<;pFuv8I&r z>1xWX%WXyPW<=-tO}eZ(yl9ZZhxeJM`*t;9#$z9EC|;@1 z8%ozT71te(Xo5G_H8t1$b>a=CBkBRuJ!If3Qhmtq5d&Wv>SKma82F7-U4|Y5=eOc) zQ(rMGUV@~?502s|gin>`F2{0qo!sn~pVyL}#B(1vEB|BFRK|HaUHvya-@o|8f6kUn AE&u=k literal 0 HcmV?d00001 diff --git a/src/__pycache__/locus.cpython-38.pyc b/src/__pycache__/locus.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c2eddfeaf5e994d8659b4bbaa1111536c026f1dd GIT binary patch literal 730 zcmZvZ&5qMB5XbF&v}u}_RfQ0*(91^J8v-FD4qR5{u;P-7Wn#CDl*DE1AhGg|eHmVb zNAQ(L;I^loNrV&xw&v&Y*nfY_eA;YshHdxC=${c|zi2XVF;3p0*&X5oQ%vy(wC_>H zRCLGoagFi68lEyKo};5##TBa}#Z~mcs#wLSNkV#S>cQ5VnUH*vzDKjS2+ute%tuJ> zV`StLWb9L9;upx&FOdtMA(wuI%zTbq`86{41#+zzUda#pA|$=+bja&7n9f*(erlZr z7h>J3kaos3$B?%4w6K%bn!1rrE1?>zrPW~z`lvzowHCIy*P-YQbg~s7`;!h=7wOLC zr2D3xBGo)H9sXjrXN*ic_nVMd-5!Aw0F-h_2q|GfSQ0YAia<4kHR7K;zCwZF9}KoQ zJiacw-j&uFkSBdp4pxIL<@qzn!QGU$hUVgIdFODsg^zt{p)SY&Uthk3Oo*m$oDlRN z`x3$U;=lcVJJ$31kKlkMMm42dh>#1>88x)z7eag)WIL_E6&^W0bcSaX;|R)yC1iY@ XANJEY+e?205`^r{bj4c|{6=F>HdL_p literal 0 HcmV?d00001 diff --git a/src/straglron.py b/src/straglron.py index 67eb389..f328740 100644 --- a/src/straglron.py +++ b/src/straglron.py @@ -125,12 +125,10 @@ def main(): loci_dict = ra.lociBedReader(args.loci_file) expansions = ra.resultBedReader(args.path_input_bed, loci_dict) sample_id = Path(args.path_input_bed).stem - + vis.getHistData(args.path_input_tsv, expansions) for expansion_object in expansions: - if __name__ == '__main__': - expansion_object if args.altclust: ra.newGenotyping(expansion_object, args.cutoff, False) diff --git a/src/utils/ReaderAnalyser - Kopie.py b/src/utils/ReaderAnalyser - Kopie.py new file mode 100644 index 0000000..e9f1c67 --- /dev/null +++ b/src/utils/ReaderAnalyser - Kopie.py @@ -0,0 +1,350 @@ +import csv +import pysam +from pathlib import Path +from .Structures import Expansion +from .Structures import Locus +import numpy as np +from sklearn.mixture import GaussianMixture + +def resultBedReader(file, loci_dict): + + with open(file) as straglr: + + straglr_reader = csv.reader(straglr, delimiter='\t') + expansion_list: list[Expansion] = [] + sample_id = Path(file).stem + + #loci allele1 allele2 YES/NO Referenz size_difference PathRange + for straglr in straglr_reader: + if straglr[0]!='#chrom': + #Variables are assigned based on the coordinates in the straglr reader + coords = straglr[0]+straglr[1]+straglr[2] + chr = straglr[0] + start = straglr[1] + end = straglr[2] + loci = loci_dict[coords][0].name + reference_size = int(loci_dict[coords][0].reference_size) + int_path_range = loci_dict[coords][0].pathogenic_range + motif = straglr[3] + alleles = 0 + # "-" means that there was no coverage in this region + if straglr[4] == '-': + allele1 = np.nan + allele2 = np.nan + copy_number_1 = np.nan + copy_number_2 = np.nan + allele1_support = 0 + allele2_support = 0 + alleles = 0 + #"-" means that straglr assigned no different number for second allele -> allele 1 = allele 2; In the following 2 different allele sizes were assigned + elif straglr[8]!='-': + allele1 = round(float(straglr[4])) + allele2 = round(float(straglr[7])) + copy_number_1 = straglr[5] + copy_number_2 = straglr[8] + allele1_support = straglr[6] + allele2_support = straglr[9] + alleles = 2 + + else: + allele1 = round(float(straglr[4])) + allele2 = round(float(straglr[4])) + copy_number_1 = straglr[5] + copy_number_2 = straglr[5] + allele1_support = straglr[6] + allele2_support = straglr[6] + alleles = 1 + + expansion_object = Expansion(chr, start, end, loci, motif, allele1, allele2, reference_size, int_path_range, copy_number_1, copy_number_2, allele1_support, allele2_support, sample_id) + analyseGenotype(expansion_object, alleles) + expansion_list.append(expansion_object) + + #List of expansions, though not pathogenic by definition, are sorted by chromosome or the normalized size difference between sample and reference length on hg38 in descending order + + return expansion_list + +def analyseGenotype(expansion_object: Expansion, alleles): + + if alleles == 0: + # repeat not covered + expansion_object.in_pathogenic_range = 'NA' + expansion_object.size_difference = np.nan + elif alleles == 1: + + if expansion_object.pathogenic_range == 'NA': + expansion_object.in_pathogenic_range ='NA' + if expansion_object.allele1_size > expansion_object.allele2_size: + expansion_object.size_difference = expansion_object.allele1_size-expansion_object.wt_size + else: + expansion_object.size_difference = expansion_object.allele2_size-expansion_object.wt_size + + #Check for pathogenic expansion size happens here; Size of sample expansion checked against lower boundary of pathogenic range from literature; Assignment of respective loci and sizes to corresponding lists + elif expansion_object.allele1_size > int(expansion_object.pathogenic_range) or expansion_object.allele2_size > int(expansion_object.pathogenic_range): + expansion_object.in_pathogenic_range = 'Yes' + if expansion_object.allele1_size > expansion_object.allele2_size: + expansion_object.size_difference = expansion_object.allele1_size-expansion_object.wt_size + else: + expansion_object.size_difference = expansion_object.allele2_size-expansion_object.wt_size + + #If size of sample expansion is still below pathogenic range, they are added into general list + else: + expansion_object.in_pathogenic_range = 'No' + if expansion_object.allele1_size > expansion_object.allele2_size: + expansion_object.size_difference = expansion_object.allele1_size-expansion_object.wt_size + else: + expansion_object.size_difference = expansion_object.allele2_size-expansion_object.wt_size + + else: + if expansion_object.pathogenic_range == 'NA': + expansion_object.in_pathogenic_range = 'NA' + #Same check and assignment as above + if expansion_object.allele1_size>expansion_object.allele2_size: + expansion_object.size_difference = expansion_object.allele1_size-expansion_object.wt_size + else: + expansion_object.size_difference = expansion_object.allele2_size-expansion_object.wt_size + + elif expansion_object.allele1_size> int(expansion_object.pathogenic_range): + expansion_object.in_pathogenic_range = 'Yes' + expansion_object.size_difference = expansion_object.allele1_size-expansion_object.wt_size + + else: + expansion_object.in_pathogenic_range = 'No' + if expansion_object.allele1_size>expansion_object.allele2_size: + expansion_object.size_difference = expansion_object.allele1_size-expansion_object.wt_size + else: + expansion_object.size_difference = expansion_object.allele2_size-expansion_object.wt_size + +def lociBedReader(bedFile): + Loci = {} + with open(bedFile) as lociCoords: + lociReader_reader=csv.reader(lociCoords, delimiter='\t') + for locus in lociReader_reader: + if locus[0]!='#chrom': + key = locus[0] + locus[1] + locus[2] + new_object = Locus(locus[4],locus[0],locus[1],locus[2],locus[3],locus[4],locus[5],locus[6],locus[7],locus[8]) + if key in Loci: + Loci[key].append(new_object) + else: + Loci.update({key:[new_object]}) + return Loci + +def newGenotyping(expansion_object: Expansion, cutoff, new: bool): + + concat_reads = [] + number_of_reads = 0 + + if new: + + for new_readlist in expansion_object.new_read_list: + #print("Oldlist: ") + #print(original_readlist) + concat_reads += new_readlist + number_of_reads = len(concat_reads) + #print(number_of_reads) + + else: + + for original_readlist in expansion_object.read_list: + #print("Oldlist: ") + #print(original_readlist) + concat_reads += original_readlist + number_of_reads = len(concat_reads) + #print(number_of_reads) + + rearanged_array = np.array(concat_reads).reshape(-1,1) + #print(expansion_object.repeat_id, expansion_object.repeat_unit, " STARTS HERE: ") + #print(rearanged_array) + + X = rearanged_array + + N = np.arange(1,4) + M = np.arange(1,3) + + # fit models with 1-2 components + + if len(np.unique(X)) < 2: + models = [GaussianMixture(1, covariance_type='full', init_params="kmeans", max_iter=500).fit(X)] + + elif len(np.unique(X)) == 2: + models = [GaussianMixture(m, covariance_type='full', init_params="kmeans", max_iter=500).fit(X) + for m in M] + else: + models = [GaussianMixture(n, covariance_type='full', init_params="kmeans", max_iter=500).fit(X) + for n in N] + + BIC = [m.bic(X) for m in models] + + best_model = models[np.argmin(BIC)] + + cluster_assignment = best_model.predict(X) + + #print(cluster_assignment) + #print(X) + #print(np.std(X)) + + cluster1 = [] + cluster2 = [] + cluster3 = [] + clusters = [] + + for i in range(len(X)): + if cluster_assignment[i] == 0: + cluster1.append(X[i][0]) + if cluster_assignment[i] == 1: + cluster2.append(X[i][0]) + if cluster_assignment[i] == 2: + cluster3.append(X[i][0]) + + #print("This is cluster1: ", cluster1) + #print("This is cluster2: ", cluster2) + #print("This is cluster3: ", cluster3) + + clusters.append(cluster1) + clusters.append(cluster2) + clusters.append(cluster3) + + clusters.sort(key=len, reverse=True) + + cluster1 = clusters[0] + cluster2 = clusters[1] + cluster3 = clusters[2] + + #print("Longest cluster: ", cluster1) + #print("Middle cluster: ", cluster2) + #print("Shortest cluster: ",cluster3) + + if cluster3: + distance13 = abs(np.mean(cluster3) - np.mean(cluster1)) + distance12 = abs(np.mean(cluster3) - np.mean(cluster2)) + if distance12 < distance13: + if distance12 < max(5, 2*np.var(cluster2)): + cluster2 += cluster3 + #print("Cluster 3 wurde cluster 2 geadded werden") + else: + if distance13 < max(5, 2*np.var(cluster2)): + cluster1 += cluster3 + #print("Cluster 3 wurde cluster 1 geadded werden") + #print(np.mean(cluster3), " ", np.var(cluster3)) + #print(np.mean(cluster2), " ", np.var(cluster2)) + #print(np.mean(cluster1), " ", np.var(cluster1)) + + allele1 = cluster1 + allele2 = cluster2 + + #print(allele1) + #print(allele2) + + if allele1 and allele2: + + expansion_object.new_read_list = [allele1,allele2] + expansion_object.new_allele1 = np.median(allele1) + expansion_object.new_allele2 = np.median(allele2) + + if expansion_object.new_allele1 > expansion_object.new_allele2: + expansion_object.new_size_difference = expansion_object.new_allele1-expansion_object.wt_size + if expansion_object.pathogenic_range == "NA": + expansion_object.new_in_pathogenic_range = "NA" + elif expansion_object.new_allele1 > int(expansion_object.pathogenic_range): + expansion_object.new_in_pathogenic_range = "Yes" + else: + expansion_object.new_in_pathogenic_range = "No" + + else: + expansion_object.new_size_difference = expansion_object.new_allele2-expansion_object.wt_size + if expansion_object.pathogenic_range == "NA": + expansion_object.new_in_pathogenic_range = "NA" + elif expansion_object.new_allele2 > int(expansion_object.pathogenic_range): + expansion_object.new_in_pathogenic_range = "Yes" + else: + expansion_object.new_in_pathogenic_range = "No" + + expansion_object.new_copy_numberA1 = expansion_object.new_allele1/len(expansion_object.repeat_unit) + expansion_object.new_copy_numberA2 = expansion_object.new_allele2/len(expansion_object.repeat_unit) + expansion_object.new_allele1_support = len(allele1) + expansion_object.new_allele2_support = len(allele2) + #print("New: " + expansion_object.title, expansion_object.new_read_list) + else: + expansion_object.new_read_list = [allele1+allele2] + expansion_object.new_allele1 = np.median(allele1) + expansion_object.new_allele2 = np.median(allele1) + expansion_object.new_size_difference = expansion_object.new_allele1-expansion_object.wt_size + + if expansion_object.pathogenic_range == "NA": + expansion_object.new_in_pathogenic_range = "NA" + elif expansion_object.new_allele1 > int(expansion_object.pathogenic_range): + expansion_object.new_in_pathogenic_range = "Yes" + else: + expansion_object.new_in_pathogenic_range = "No" + + expansion_object.new_copy_numberA1 = expansion_object.new_allele1/len(expansion_object.repeat_unit) + expansion_object.new_copy_numberA2 = expansion_object.new_allele2/len(expansion_object.repeat_unit) + expansion_object.new_allele1_support = len(allele1) + expansion_object.new_allele2_support = len(allele1) + + #print(expansion_object.title, expansion_object.new_read_list) + + #print("The length of the current lists is {}".format([len(n) for n in expansion_object.new_read_list])) + + #temp_list = expansion_object.new_read_list + + #print(len(expansion_object.new_read_list)) + + if number_of_reads > 3*cutoff: + + for new_list in expansion_object.new_read_list: + + if len(new_list) <= cutoff: + expansion_object.new_read_list.remove(new_list) + #expansion_object.read_list = temp_list + newGenotyping(expansion_object, cutoff, True) + +def expansionScorer(expansion : Expansion, new_clustering : bool): + #score = x-xmin/xmax-xmin + if expansion.pathogenic_range != "NA": + if new_clustering: + score = (expansion.new_size_difference)/(int(expansion.pathogenic_range)-expansion.wt_size) + expansion.new_norm_score = round(score, 4) + + else: + score = (expansion.size_difference)/(int(expansion.pathogenic_range)-expansion.wt_size) + #print(expansion.in_pathogenic_range, score, expansion.size_difference, int(expansion.pathogenic_range)) + expansion.norm_score = round(score, 4) + else: + expansion.norm_score = None + +''' +def bamfileReader(bamFile, pathogenics): + samfile = pysam.AlignmentFile(bamFile, "rb") + counter = 0 + expansion = pathogenics + chromosome = expansion.chr + start = int(expansion.start) + end = int(expansion.end) + ru = expansion.repeat_unit + print(expansion.repeat_id, chromosome, start, end, ru) + for key, value in expansion.read_dict.items(): + print(key, value, len(expansion.read_dict)) + for read in samfile.fetch(chromosome, start, end): + try: + print(read.query_name, expansion.read_dict[read.query_name]) + print(read.query_sequence[expansion.read_dict[read.query_name]:(expansion.read_dict[read.query_name]+50)]) + print() + + except KeyError: + print("Key " + read.query_name + " not found!") + counter += 1 + samfile.close() + print(counter) + + +def cyclicPermutation(motif): + size = len(motif) + motifs = deque(motif) + perms = [] + + for i in range(size): + motifs.rotate(1) + perms.append(''.join(list(motifs))) + + return permm +''' \ No newline at end of file diff --git a/src/utils/ReaderAnalyser.py b/src/utils/ReaderAnalyser.py index 8cb3b80..63533c6 100644 --- a/src/utils/ReaderAnalyser.py +++ b/src/utils/ReaderAnalyser.py @@ -6,19 +6,19 @@ import numpy as np from sklearn.mixture import GaussianMixture - def resultBedReader(file, loci_dict): + with open(file) as straglr: - + straglr_reader = csv.reader(straglr, delimiter='\t') expansion_list: list[Expansion] = [] sample_id = Path(file).stem - - # loci allele1 allele2 YES/NO Referenz size_difference PathRange + + #loci allele1 allele2 YES/NO Referenz size_difference PathRange for straglr in straglr_reader: - if straglr[0] != '#chrom': - # Variables are assigned based on the coordinates in the straglr reader - coords = straglr[0] + straglr[1] + straglr[2] + if straglr[0]!='#chrom': + #Variables are assigned based on the coordinates in the straglr reader + coords = straglr[0]+straglr[1]+straglr[2] chr = straglr[0] start = straglr[1] end = straglr[2] @@ -27,12 +27,8 @@ def resultBedReader(file, loci_dict): int_path_range = loci_dict[coords][0].pathogenic_range motif = straglr[3] alleles = 0 - # "-" means that there was no coverage in this region -> skip this loci - if straglr[4] == '-': - continue - - # "-" means that straglr assigned no different number for second allele -> allele 1 = allele 2; In the following 2 different allele sizes were assigned - elif straglr[8] != '-': + #"-" means that straglr assigned no different number for second allele -> allele 1 = allele 2; In the following 2 different allele sizes were assigned + if straglr[8]!='-': allele1 = round(float(straglr[4])) allele2 = round(float(straglr[7])) copy_number_1 = straglr[5] @@ -49,114 +45,113 @@ def resultBedReader(file, loci_dict): allele1_support = straglr[6] allele2_support = straglr[6] alleles = 1 - - expansion_object = Expansion(chr, start, end, loci, motif, allele1, allele2, reference_size, int_path_range, copy_number_1, copy_number_2, allele1_support, - allele2_support, sample_id) + + expansion_object = Expansion(chr, start, end, loci, motif, allele1, allele2, reference_size, int_path_range, copy_number_1, copy_number_2, allele1_support, allele2_support, sample_id) analyseGenotype(expansion_object, alleles) expansion_list.append(expansion_object) - - # List of expansions, though not pathogenic by definition, are sorted by chromosome or the normalized size difference between sample and reference length on hg38 in descending order - + + #List of expansions, though not pathogenic by definition, are sorted by chromosome or the normalized size difference between sample and reference length on hg38 in descending order + return expansion_list - def analyseGenotype(expansion_object: Expansion, alleles): - if alleles == 0: - # repeat not covered - expansion_object.in_pathogenic_range = 'NA' - expansion_object.size_difference = np.nan - elif alleles == 1: - + + if alleles == 1: + if expansion_object.pathogenic_range == 'NA': - expansion_object.in_pathogenic_range = 'NA' + expansion_object.in_pathogenic_range ='NA' if expansion_object.allele1_size > expansion_object.allele2_size: - expansion_object.size_difference = expansion_object.allele1_size - expansion_object.wt_size + expansion_object.size_difference = expansion_object.allele1_size-expansion_object.wt_size else: - expansion_object.size_difference = expansion_object.allele2_size - expansion_object.wt_size + expansion_object.size_difference = expansion_object.allele2_size-expansion_object.wt_size - # Check for pathogenic expansion size happens here; Size of sample expansion checked against lower boundary of pathogenic range from literature; Assignment of respective loci and sizes to corresponding lists + #Check for pathogenic expansion size happens here; Size of sample expansion checked against lower boundary of pathogenic range from literature; Assignment of respective loci and sizes to corresponding lists elif expansion_object.allele1_size > int(expansion_object.pathogenic_range) or expansion_object.allele2_size > int(expansion_object.pathogenic_range): expansion_object.in_pathogenic_range = 'Yes' if expansion_object.allele1_size > expansion_object.allele2_size: - expansion_object.size_difference = expansion_object.allele1_size - expansion_object.wt_size + expansion_object.size_difference = expansion_object.allele1_size-expansion_object.wt_size else: - expansion_object.size_difference = expansion_object.allele2_size - expansion_object.wt_size - - # If size of sample expansion is still below pathogenic range, they are added into general list - else: + expansion_object.size_difference = expansion_object.allele2_size-expansion_object.wt_size + + #If size of sample expansion is still below pathogenic range, they are added into general list + else: expansion_object.in_pathogenic_range = 'No' if expansion_object.allele1_size > expansion_object.allele2_size: - expansion_object.size_difference = expansion_object.allele1_size - expansion_object.wt_size + expansion_object.size_difference = expansion_object.allele1_size-expansion_object.wt_size else: - expansion_object.size_difference = expansion_object.allele2_size - expansion_object.wt_size - - else: + expansion_object.size_difference = expansion_object.allele2_size-expansion_object.wt_size + + else: if expansion_object.pathogenic_range == 'NA': expansion_object.in_pathogenic_range = 'NA' - # Same check and assignment as above - if expansion_object.allele1_size > expansion_object.allele2_size: - expansion_object.size_difference = expansion_object.allele1_size - expansion_object.wt_size + #Same check and assignment as above + if expansion_object.allele1_size>expansion_object.allele2_size: + expansion_object.size_difference = expansion_object.allele1_size-expansion_object.wt_size else: - expansion_object.size_difference = expansion_object.allele2_size - expansion_object.wt_size - - elif expansion_object.allele1_size > int(expansion_object.pathogenic_range): + expansion_object.size_difference = expansion_object.allele2_size-expansion_object.wt_size + + elif expansion_object.allele1_size> int(expansion_object.pathogenic_range): expansion_object.in_pathogenic_range = 'Yes' - expansion_object.size_difference = expansion_object.allele1_size - expansion_object.wt_size - + expansion_object.size_difference = expansion_object.allele1_size-expansion_object.wt_size + else: expansion_object.in_pathogenic_range = 'No' - if expansion_object.allele1_size > expansion_object.allele2_size: - expansion_object.size_difference = expansion_object.allele1_size - expansion_object.wt_size + if expansion_object.allele1_size>expansion_object.allele2_size: + expansion_object.size_difference = expansion_object.allele1_size-expansion_object.wt_size else: - expansion_object.size_difference = expansion_object.allele2_size - expansion_object.wt_size - - + expansion_object.size_difference = expansion_object.allele2_size-expansion_object.wt_size + def lociBedReader(bedFile): Loci = {} with open(bedFile) as lociCoords: - lociReader_reader = csv.reader(lociCoords, delimiter='\t') + lociReader_reader=csv.reader(lociCoords, delimiter='\t') for locus in lociReader_reader: - if locus[0] != '#chrom': + if locus[0]!='#chrom': key = locus[0] + locus[1] + locus[2] - new_object = Locus(locus[4], locus[0], locus[1], locus[2], locus[3], locus[4], locus[5], locus[6], locus[7], locus[8]) + new_object = Locus(locus[4],locus[0],locus[1],locus[2],locus[3],locus[4],locus[5],locus[6],locus[7],locus[8]) if key in Loci: Loci[key].append(new_object) else: - Loci.update({key: [new_object]}) + Loci.update({key:[new_object]}) return Loci - def newGenotyping(expansion_object: Expansion, cutoff, new: bool): + concat_reads = [] number_of_reads = 0 - + if new: + for new_readlist in expansion_object.new_read_list: + concat_reads += new_readlist number_of_reads = len(concat_reads) else: + for original_readlist in expansion_object.read_list: + concat_reads += original_readlist number_of_reads = len(concat_reads) - + rearanged_array = np.array(concat_reads).reshape(-1,1) + X = rearanged_array - - N = np.arange(1, 4) - M = np.arange(1, 3) - + + N = np.arange(1,4) + M = np.arange(1,3) + # fit models with 1-3 components - + if len(np.unique(X)) < 2: models = [GaussianMixture(1, covariance_type='full', init_params="kmeans", max_iter=500).fit(X)] - + elif len(np.unique(X)) == 2: models = [GaussianMixture(m, covariance_type='full', init_params="kmeans", max_iter=500).fit(X) - for m in M] + for m in M] else: models = [GaussianMixture(n, covariance_type='full', init_params="kmeans", max_iter=500).fit(X) - for n in N] + for n in N] BIC = [m.bic(X) for m in models] @@ -168,7 +163,7 @@ def newGenotyping(expansion_object: Expansion, cutoff, new: bool): cluster2 = [] cluster3 = [] clusters = [] - + for i in range(len(X)): if cluster_assignment[i] == 0: @@ -177,91 +172,109 @@ def newGenotyping(expansion_object: Expansion, cutoff, new: bool): cluster2.append(X[i][0]) if cluster_assignment[i] == 2: cluster3.append(X[i][0]) - - clusters.append(cluster1) + + clusters.append(cluster1) clusters.append(cluster2) clusters.append(cluster3) - - clusters.sort(key=len, reverse=True) - + + clusters.sort(key=len, reverse=True) + cluster1 = clusters[0] cluster2 = clusters[1] cluster3 = clusters[2] + if cluster3: distance13 = abs(np.mean(cluster3) - np.mean(cluster1)) distance12 = abs(np.mean(cluster3) - np.mean(cluster2)) - if distance12 < distance13: + + if distance12 < distance13: + if distance12 < max(5, 2*np.var(cluster2)): + cluster2 += cluster3 + else: + if distance13 < max(5, 2*np.var(cluster2)): + cluster1 += cluster3 + + allele1 = cluster1 allele2 = cluster2 - - if allele1 and allele2: - - expansion_object.new_read_list = [allele1, allele2] + + #print(allele1) + #print(allele2) + + if allele1 and allele2: + + expansion_object.new_read_list = [allele1,allele2] expansion_object.new_allele1 = np.median(allele1) expansion_object.new_allele2 = np.median(allele2) - + if expansion_object.new_allele1 > expansion_object.new_allele2: - expansion_object.new_size_difference = expansion_object.new_allele1 - expansion_object.wt_size + expansion_object.new_size_difference = expansion_object.new_allele1-expansion_object.wt_size if expansion_object.pathogenic_range == "NA": expansion_object.new_in_pathogenic_range = "NA" elif expansion_object.new_allele1 > int(expansion_object.pathogenic_range): expansion_object.new_in_pathogenic_range = "Yes" - else: + else: expansion_object.new_in_pathogenic_range = "No" - + else: - expansion_object.new_size_difference = expansion_object.new_allele2 - expansion_object.wt_size + expansion_object.new_size_difference = expansion_object.new_allele2-expansion_object.wt_size if expansion_object.pathogenic_range == "NA": expansion_object.new_in_pathogenic_range = "NA" elif expansion_object.new_allele2 > int(expansion_object.pathogenic_range): expansion_object.new_in_pathogenic_range = "Yes" - else: + else: expansion_object.new_in_pathogenic_range = "No" - - expansion_object.new_copy_numberA1 = expansion_object.new_allele1 / len(expansion_object.repeat_unit) - expansion_object.new_copy_numberA2 = expansion_object.new_allele2 / len(expansion_object.repeat_unit) + + expansion_object.new_copy_numberA1 = expansion_object.new_allele1/len(expansion_object.repeat_unit) + expansion_object.new_copy_numberA2 = expansion_object.new_allele2/len(expansion_object.repeat_unit) expansion_object.new_allele1_support = len(allele1) - expansion_object.new_allele2_support = len(allele2) + expansion_object.new_allele2_support = len(allele2) + #print("New: " + expansion_object.title, expansion_object.new_read_list) else: - expansion_object.new_read_list = [allele1 + allele2] + expansion_object.new_read_list = [allele1+allele2] expansion_object.new_allele1 = np.median(allele1) expansion_object.new_allele2 = np.median(allele1) - expansion_object.new_size_difference = expansion_object.new_allele1 - expansion_object.wt_size - + expansion_object.new_size_difference = expansion_object.new_allele1-expansion_object.wt_size + if expansion_object.pathogenic_range == "NA": - expansion_object.new_in_pathogenic_range = "NA" + expansion_object.new_in_pathogenic_range = "NA" elif expansion_object.new_allele1 > int(expansion_object.pathogenic_range): expansion_object.new_in_pathogenic_range = "Yes" - else: + else: expansion_object.new_in_pathogenic_range = "No" - - expansion_object.new_copy_numberA1 = expansion_object.new_allele1 / len(expansion_object.repeat_unit) - expansion_object.new_copy_numberA2 = expansion_object.new_allele2 / len(expansion_object.repeat_unit) + + expansion_object.new_copy_numberA1 = expansion_object.new_allele1/len(expansion_object.repeat_unit) + expansion_object.new_copy_numberA2 = expansion_object.new_allele2/len(expansion_object.repeat_unit) expansion_object.new_allele1_support = len(allele1) - - if number_of_reads > 3 * cutoff: + expansion_object.new_allele2_support = len(allele1) + + + if number_of_reads > 3*cutoff: for new_list in expansion_object.new_read_list: - + if len(new_list) <= cutoff: expansion_object.new_read_list.remove(new_list) + #expansion_object.read_list = temp_list newGenotyping(expansion_object, cutoff, True) - - -def expansionScorer(expansion: Expansion, new_clustering: bool): + +def expansionScorer(expansion : Expansion, new_clustering : bool): + #score = x-xmin/xmax-xmin if expansion.pathogenic_range != "NA": if new_clustering: - score = (expansion.new_size_difference) / (int(expansion.pathogenic_range) - expansion.wt_size) + score = (expansion.new_size_difference)/(int(expansion.pathogenic_range)-expansion.wt_size) expansion.new_norm_score = round(score, 4) - + else: - score = (expansion.size_difference) / (int(expansion.pathogenic_range) - expansion.wt_size) - expansion.norm_score = round(score, 4) + score = (expansion.size_difference)/(int(expansion.pathogenic_range)-expansion.wt_size) + #print(expansion.in_pathogenic_range, score, expansion.size_difference, int(expansion.pathogenic_range)) + expansion.norm_score = round(score, 4) else: - expansion.norm_score = None \ No newline at end of file + expansion.norm_score = None + \ No newline at end of file diff --git a/src/utils/Visualiser.py b/src/utils/Visualiser.py index 5feed3d..4aec811 100644 --- a/src/utils/Visualiser.py +++ b/src/utils/Visualiser.py @@ -13,7 +13,6 @@ from Bio import SeqIO import pysam - def getHistData(file, expansions: "list[Expansion]"): expansions_read_lists = {} for expansion in expansions: @@ -28,16 +27,8 @@ def getHistData(file, expansions: "list[Expansion]"): for line in read_reader: if not line[0].startswith('#'): if expansion.start+expansion.repeat_unit == line[1]+line[3]: - # skip reads with read_status != full - if line[14] != "full": - continue - # read column 'read'/'read_name' and 'read_start' - read_coords_dict.update({line[7]: int(line[11])}) - if line[11] == "NA": - read_coords_dict.update({line[7]: np.nan}) - # read column 'size' - read_list.append(int(line[10])) - + read_coords_dict.update({line[5]:int(line[8])}) + read_list.append(int(line[7])) expansion.read_dict = read_coords_dict expansion.read_list = [read_list] expansion.title = title @@ -52,29 +43,25 @@ def getHistData(file, expansions: "list[Expansion]"): for line in read_reader: if not line[0].startswith('#'): if expansion.start+expansion.repeat_unit == line[1]+line[3]: - # skip reads with read_status != full - if line[14] != "full": - continue - if expansion.copy_numberA1 == line[13]: # read column 'allele' (not 'copy_number') - read_list_A1.append(int(line[10])) # read column 'size' - if expansion.copy_numberA2 == line[13]: # read column 'allele' (not 'copy_number') - read_list_A2.append(int(line[10])) # read column 'size' - # read column 'read'/'read_name' and 'read_start' - read_coords_dict.update({line[7]:int(line[11])}) + if expansion.copy_numberA1 == line[10]: + read_list_A1.append(int(line[7])) + if expansion.copy_numberA2 == line[10]: + read_list_A2.append(int(line[7])) + read_coords_dict.update({line[5]:int(line[8])}) expansion.read_dict = read_coords_dict expansion.read_list = [read_list_A1,read_list_A2] expansion.title = title expansions_read_lists.update({expansion.repeat_id: (genotype1,genotype2,title,read_list_A1,read_list_A2)}) - + return expansions_read_lists - def plotHistogram(expansion_object: Expansion, plotfolder, bool_altclustering): save_place = plotfolder+"/"+expansion_object.title+'.png' ref_size = expansion_object.wt_size path_range = expansion_object.pathogenic_range - + + if bool_altclustering: allele1_size = expansion_object.new_allele1 allele2_size = expansion_object.new_allele2 @@ -94,7 +81,10 @@ def plotHistogram(expansion_object: Expansion, plotfolder, bool_altclustering): plt.axvline(allele1_size, color='k', linestyle='dashed', linewidth=1, label=allele1_size) plt.axvline(allele2_size, color='k', linestyle='dashed', linewidth=1, label=allele2_size) plt.axvline(ref_size, color='green', linewidth=1, label="wt size: " + str(ref_size)) - + + #if path_range != "NA": + #plt.axvline(int(path_range), color='grey', linewidth=1, label="pathogenic range: " + str(path_range)) + plt.legend() plt.savefig(save_place) plt.close() @@ -109,12 +99,14 @@ def plotHistogram(expansion_object: Expansion, plotfolder, bool_altclustering): plt.axvline(allele1_size, color='black', linestyle='dashed', linewidth=1, label=allele1_size) plt.axvline(allele2_size, color='red', linestyle='dashed', linewidth=1, label=allele2_size) plt.axvline(ref_size, color='green', linewidth=1, label="wt size: " + str(ref_size)) - + + #if path_range != "NA": + #plt.axvline(int(path_range), color='grey', linewidth=1, label="pathogenic range: " + str(path_range)) + plt.legend() plt.savefig(save_place) plt.close() - - + def alleleVisualiser(fasta_file, motif, flank_length, title, output_folder, chromosome, start, end, reference_genome): chr = chromosome @@ -214,6 +206,7 @@ def alleleVisualiser(fasta_file, motif, flank_length, title, output_folder, chro plt.close() + def rectangleMaker(motif_colors, motif_coord_list, size, flank_length, motif): patches = [] color_list = [] diff --git a/src/utils/__pycache__/ReaderAnalyser.cpython-38.pyc b/src/utils/__pycache__/ReaderAnalyser.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f8365cb6d16086fc114b1d4eb40af1868c6a0f52 GIT binary patch literal 5229 zcmbtYOKc=Z8Sd)W%=FCkjK^z0Dw=Ec8y|NMfY{DY9wKMlxpc;a86;RKVN99^)3T-qW2Lukkvlbvd<3KyMs(k*tVLgoN@IXHPd=ZrY9P4 zxAn|OwOh&oyYpg3BeX?@>pPmbQov}9MP*(=Z&II`1wAVjoUB}MvP#NQjG+13MrI}E zDZI4Lvg#?koYgR^w5Vitu4OjhGGVu?`Sir4C=!RL15ndhPP4H@%H&x8ZTIlP* zr_tN!XSkW!;jPC=Igxr=3Q0RHg`~}JBeRogT1v}lCAHFe+DMyeHLaynX)B#hXHsL| z&SsNZI-A=2DpOMZu%6ByF{bQZ`JVE5iWzSmizrlzV~S_Y+SDI^qZB zC%aoB>I<^VV>-rT36+V*o<7!zQliG7#-yf#rfuhXH1NV)>&AC;Q+O_iM4F%LapLuJ zJ#_nCuK8h-+rr!Qgco+bPVB$s<;?*MEb4io-|YxD?0LB%qG8B$V>5`{M9`Xf!wubF zFZN#Y!YJ7rc)95gupV#E<@%-{czGp=x_*cI-NY%yiEw*?aO@G#DfDq{@5FjLfgdM% zC3gFR!0Y(jF}qPDcntlwL~g{1E0SD;d`Bm_VhK7pHni`MvL|Zxqr~6LOKuQ&fft_u zt0%ykQ}0HDy-qmnZ+N1!dKO*7JIRTwop?AHL|AY{SQ`_X6NRIVZLgc$SH#m$<-yA< z{V-XHlL#;Bt*s1WPsA(k)`oD0$=XWX75*TJS6(aj;hW(~EV?Vh#1GR-1K$|boDZ` z*qkm_Ag9g56?A9Z0@((cmUIgowwE%_c2x0~RN02x19Xe4snTJ<(Qj)zy7=4())(MH zI>zn?M8`A2!=DfzR~!=d*^VA_;!kk#YmU)pB>t3U;CYCz3R$Y(U zC2;`w;cOkpGnqT2MM6 z&rnneYEV<7hGMK}QPZTRj;3vj8G>i2p*SqaYl|n*)FTJ5NMR4I2Mm{t*KpMhN?s$8S6?$)U3deNrg!4Mh9J`b)tQeC53b_;MK5A@% z-GuXKs!uTS6>vuOl4}s!CC_m3TL3_tK(ts#G(ei5afY)~#b@cZ;Us4$T}X{*!AV%E zAw=uDt?&uD5t3_yYo6g=O*EW?c7zk)NH305qz@;Bex__PUgfoW2;8^en=q;+X>~~}7c@z+ zCB>4IYHDFsE3Fab$P07EQHqY}9*zU!23dw~nr4iH^L(Kx(wi8Av<~x*# zOmTVx8$A-Stbrc4Cu^q716ABk8hA^tW)@rNv8nvy8M zGZLPY@GNX=2G8t{4o&_(!scXTng_PPTi88S{E@dXb1}R`Ba$WJP;3I{Z+u#EOb$61 zL6L-{PGro$mX+-b`|JY6JF@24Gn7vNo82ePE&`g9(39bBk^ariD?Oq=s(tY$C)!We zD)iOF=415L#E;MF3vK>4{R&N=gja+&oKdg`y1;wM`+Fsu#~aM!4cHFD{(ngH!%OKr z*;4XUO4&hUA7#*iw)>%sWV7qiA09u%ukeQxS4f@_Y2h#ka&xEepJg`p%M!EmWeRc;Om6 z0B+zB`=z>z^!mOou4AD1DmBkha|6wNRZzm5Yny&@U(MC!t2yf<2WPKOLSt@Sqtvt; z^#|AA!{C^*Uz4d1-+Bsuag_b~VQ={w9IP9}*8!Z2|9`a>H?fwW@Cjf1BD$P~;%h{s zoKwii{W756;Th}o$f6Wtn>pHs-xV)l*hj7M{&*EdRlW_4 zz6?TM65d_px?U0PuF~99YSz%SFXwfNNR;gt3BDGfB&kerH5}xID}=k}Kv2Bp!U9a3 zJry11F!Xnao;)H!=`V45eLr+c1L09&fx3i77V%S$DPz~%jo8u2#BvSBlWVXp$LwP` zC{qcQ0deHX9PGeN%M4KZ* zjWi=3*{3rV3BTu~>X?vFcNAOCJMu_!ww|+BuzL||(%8{%e*JmJ+VJ9}BPmX+8w^oP zijIqK4ZX1Mg^5!h(N@RM+8BCf48=J^Uc;dWUHsK&PQh#9Gk6!#rWY!c#-wdw@6z(6 z(Qklz7f<{LG>WRD_R&>KE3=wGt);bes)lNMi!C8ZwQya_sF~Da}%$XKOTHwnH$~2J>Er7BFc}uKJbBr1iW5`7nW}`1-URf<`HrAk3Y8rU>+gQ8c znIn#6VO7kkse*KGt9fNyN#?dRnPM;DNzPPdlw;9DP9Xjv8uG))BQ){dl!-N9ETMV{ z@foeQpj=RpStAqU>OMtFWA_K3n;$!-6q=mkSxNeFZC%J)Oy0X)mX(Nb083k!7ezb) zw9qiD*p-ZB6uO+7{7`VvKZKW8alG#(VIZ6tRSu5L3mVV4HO|$82p5 z#~}hx+EX%Xc8tLuJ`&_5D*FR}qxhs@p|Boy=}T8EzfQ=sIyZ0w27BT*W;=~|CxCT_ z%l+buSm9Nu7M693On7sBBZ>lfKyt~#>dxt0z6RqT2Hth5bYl7pLDo{7J9@Qz)m|yv L%^xYBYTEw+(sA;i literal 0 HcmV?d00001 diff --git a/src/utils/__pycache__/Structures.cpython-38.pyc b/src/utils/__pycache__/Structures.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b48fc367819ed1f98408276968249251f1aec689 GIT binary patch literal 1975 zcmb7^OONC<5P`~Xqizs z?fi^Az;XWy7p}-x{>7fCax&fX=nHQ7YghSMm8$w>IP4K<{~Y`+KJF9pFACS23&IS# zIt2VkIN|gX`U_`)lEYn)T#zXDo|4E&U|4lslJX;R=tt+`A+% z=)xENvp{sX|B~_mbROz>2hI#l_xs<~VsC&Rp-3NyC0NAaEz+N2!`}GJosJp-+C%}>AnB)07OVsS9$UUpy zx0-u)eBbf`$2UB*<2Nkdw0z6*ZOeD==$rMAI02*Gxgo}z7rBTIjGLTE<0~CYZCsIa z)0LtWv1TbZ{dQR8sWyW+%S0xQnMyCjl{9UnVN#UmEU#84LVj~>ZirJeUhk~RvVeP9 z1#E5=o#(nasH@^tnF+jnr;PPku@rflFd64dVfNFU-BFJ5IgF>Lr$UN65ysc4&cp)D zh&jtrrQ0D-6K#5VAy-T#g%pcnE}paXD$_@?ed%>!Y98Xuw!8On;`aSkdE4S^%C*&t zUEF@db9Yr!Gs0DyXGde>DUqE@41Vb&0z?N9BDx4nJc&UpcM*_%L=F%`#0aqictzv^ z{Js8RvdZ;D=|aX!F`ZOONHvMiPGnr^=|m+mEw!3F)i9mOpYw^5$)wV0W_4AfD=E}b zd2T`mj!HFSnA+-F0C57k_2&d$;KB@cWb?me?Zw_YMUSqIPbePUHXLKOxr)xH$uo%a zS9Bgx6EGVO5tWZ%##-CpT@>#Duy`oUb>du%H&?aWOrW$(^JQXJ;3{Zn9s1wi_y7hq z{R0XYcm8yKcbXc}X4-4|xuGm_Ob2o)ixVN=hvkvmyygQK!{@9{)V7Cx2D1vUAp3>f>hinC@$9>Q`Hma-^?)aTYfFhOW7q;bKNYZu{{m(~sF zJ+x0@*n~7Zc4*J_s6>4N{YJxuGw2GPcf0Cx7YdAL{GW=XQVE9LRl_lEwLY7{;D1z9 ztehQNGecWlyXal{9vl+&jE9z+9@Md7Rj6VmHd$f(RiV>U<7fCBvllB>Bx$S#hl-&@ ztb`d!(N>POuwkmjSyOUs@X`3Yi^b;l@95swiVO7aQ9W}S8L#>Bz=S zV>su}P5YXDlb5*nDdS~c`PlYPgch%&RpT>g&GR~+{n+pqc!STOT;%pOqusoNRTyoS z*zU4Jc(A}}EUR_*OUB0~;dEQY}t z9OqiN_0~IkGZ8wzeW>e0^myTc=A>)V)YD%LCGGobc#?m@jDeAwioI^!W*`1JwWOhp zn{0z|wqPi0!(70=>zcJ-q&7Hq>VP_FK{=|hX^MZz0*(05&(fkSZn&yQZ|9box_u)p zLAR&e*NqSTw5-b9ZW)sjclKGrp~J%it>710M2jtB!GK1tya`FWEkjmNmRit1VEcIU zsx+x;o<}_5Y(ZLCoti538Oj+28#{GnLIbQP8^I~{?0o!t#tX(bjH|}`#+Ey^`!5f% zy8_#*hqnLtUfXZ8Wa-h?-)4e(XJL1_h1H-fRpoxGA<;M{@e@^jK;n^maGPx{P1jse z)xP=2Rin4(Jh*OU;xtySWUOJd6gzfeeM9LwT2-6Lj^r!csn{;CFx1B(ji+clUS)Aj$*kx4lf>6SA8rr;FqE6 zo7djW8(If8@NpVc*9yCFEO?TeYDa@n=vNLW>b!dKp4RHm4e=C)-o3ms9LbeL#vJd>&Yb+fPDw+I^rCz zugK?b!5jXUt6O-;>J~Nc|5zP2fHAErciGCG{p@(u*T`~Nknh>6ysD95>EhBeORN3o z_3y7=Y};9phwy}QH?!9Uqa<^pU@eS*5H|)~uJ6rU(f4{O?k!!#^-K2LYe1H#*2e9O zZDs`?BH^Gx*GC5_rU1G+G^X$2av4zERko4JO`3FYcw>oM9A9<8cMR_4Vj}~eGLB} z16T&#j)1cdv+^js(V61q@y2RarJh6B5c4#_0uch5%o;~P#@?X6EyB!RCnPjSV_;>j z)*23gvcq5}E9_49az~ebG1$3ITa#Id6j>n(VGYlUNpL-cHtb#$CtH;!W2 zizC3a-&l*|2&T$zv<>(dgHhl2Xjo^A1&6*X!k%6wMq!ot`i{)@XY^WV!#f&qHR|Q0 zO)IL>Tm)QkUj)OKz)CKG7-j>wQ?P8J%r%J?Ob__opcD*JlkdOKfo1V zg_bTM(!uXZRLG}kbeaclDJLx<`YGIGX#r&sWl^$;soZ^rD8`#uszk`MZ>42vOJ`D$ zz^I$Rw0#q@I%ZH6#5&Ei27m6#lB#WZ(aK&Z=2uDzj{SVz!L5q3RP?H3&hreuz3DT>U0fWg5G0G6N&$R2?Ii z(^)w~xTXk&Jj7SC`)1lya~R*$<9`d;6KW2!e@tglpU1O+XlGGk&YOq?Km7FvX4*Yao;rb*F6x#3UDkldby>qs;~goJSDNrW z=_%QqoY*(8%G2p%3h3T4Hs-NswSC5C;6v)zHSAt`2679){!@LE^r5xLY3HkW>obJ$ zO`x^zX{i}?LY=}JV>g`-tn_TUls+y`swJF^WqC@SRm;4AsQ8>ZH#sdIo1B4`vvf9S zw0dlER-L07-o$yiM5FK)&Zkev$2XQY&Z#HV5IY3)hvgPI#pDVr})G!NTpjb~6UE*O(GZ}IxR$xrfAQ!P)^ z2#w;WuVhXyKuDI^fXrR-BJPa%0!Wvl#e*N!DJr~%C!u&T&m-Uk7$r7V6jJ`ecJ9CM z?#odyT;stj-@}-H;4$`=b0j<#(7YVQ-5^R{dG{Ork_Tz zOWF?zidIDf1oC#55+?HqviHhSd?OS~V$ffgZ8Iyb2O}PZ32cqKnHBeX;xiaZ{)_kS zvX_9iyaz>v9_3!L{fsz+?#vnQCIQ0W%d|xl6JobQ3Nl2U30?s?|p3|h0 ztdyCD1=BRqU@a>G*t_dt;yXG&6>rkeWg2P+JA(u(T90ptw}??hLO^aoQ1sb;()aW^ z>QEG(Im1{EdRe6x1*6Rljz?dv=P`FR-j?HS*~wi_=5^OaJdBeVJ8+t`TJTjAWA{Q4 zj=EuoLi8bA8RUGV6TLUWk>5D##x8G@oIjJ#e$Y;|4;62d%8E9ImK}@4uXZu}k<)!g zPYT}Dd*!n9c70DvX@+0dMIP{L=jIp5d1E^NP(5V?e)*VS#s(RqJ)0dE&eAw&>95_8SMXNAFMgu--50j(mMSP9t96{TH!*|E(Kg zU~(PAFeyoJ;cjbYQzr}!)3fSuzP6XA50xpu$tot22FT-Y)*~pP0kQ3n@yvKl$o3{cW&7Pkg70&Bh|GcoIE~LOg}d0 zQGj%7(fA)nmH^+sOe?CQY<$r8w=(Dx2ka=RO3Ogsb7?iVNmV%l46;eussguS!5uwd z*VeC;woFxW%eHJ>SUV9m)Mm3ssEC}K@G&do zr3Pm~-YIhc-Qy|1CSdrDy_J z$+?bq^&QfSIkrLcok8ee+kJT$^X({ng`9Un5MIF+RHmCk+i|RA)}F3?&gK6B7&4II literal 0 HcmV?d00001 diff --git a/src/utils/__pycache__/expansion.cpython-38.pyc b/src/utils/__pycache__/expansion.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2b91d7ff9572c448be0e70d2765c9510a403b8f2 GIT binary patch literal 2796 zcmb7F%Wfk@6s>N*+HdEb$-Eel00R><27v~RUn`~M=eJe?$cH3076%zH1 zS@R9B%rDiJ6>D}`acdrezy+<+54hf@X^0eY_y)^(+NO|1B=9gO%~a2#X*4zRD>;3iOR4gs*nR) zrYhB7D^#ZjtV2y|!B(kF9oQPJ&?;=5)@U8JK^wFQ+oUbphHcRu+JSA;UAhO`q5D|d z15&4RqYK%p=7j z9S@avB>TOwmP+@$<2TA1$Nir6RWOQmZ$HLxsD6xkTKT7tmce5xAx90wT`~7t?t64^;F&j?xWAt zwTyql2K|8j;SQb7Zk=t+iE|#l1(&RFoxk)%ar6`M3wYH-RMk5G(5Ji{(LCkRoRZy^ zQ06?Y$mJbRDS6Wp>rdQFWdD^OMh!wTcN%tR+{BpL0(1+L z=+8PXi*7;Ho;-OaqhWkHl3n~qb8rO`^lW=!|8s7)2|-G|ECfwx#GEfCx7TB4p61=w zoOi+JHngmMY2+7Z`=9g zSW`y%Y&m7#<+h@Ca-xg;GF{f3U9jI{?3*0_)eDwAZm#(A>lD`^hxeJM`*t~D&SMWR zDPF0_8%WnRHP;;sX^fZGH4WGOb?k+c5%qxS9x?D0sXk!%kb$oa^%29z4E)Ba9z&mj z^ILJYsV^DkFF{)42S@P}!lz1emt#3QPGR~h&T45->bZxTmH)A9D&stxtp01B?_Yf4 EKVSb&lmGw# literal 0 HcmV?d00001 diff --git a/src/utils/__pycache__/extract_repeats.cpython-38.pyc b/src/utils/__pycache__/extract_repeats.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..871d19a5c684bab98f1769f3b3640db954ea6ea8 GIT binary patch literal 2321 zcmaJ?&5s*36t_JenM^)*TS~iIDo9m?8lVkSLV~JVTBs@{#PU&~C9qmeV<$W8WRiFs z+TD6iDc90J0HN#!sdt338xw9E-39%j9JEpUe}`wncww!lY88MM7sfB;vuhq3wgw= zk4U$|V_pM|c%3&utDN2@?d5%}OxD}f)VSy;(@gQSr+UmvpM7=ED%yu=#UJPtQFKO2 zrkI9HGxpQZrKi1{WaCNcYoD`K!o6)KU!SaKf9~y(xzGJQS_V3By5I}}7p{^sgjXCN zS-uQ;wX8rBuaYtXt$|iS>!2}c1GJ`b?Cq+2tV2z=EAsJVMFl#ZQS{Xr)wRB-na0{W zx<)1|cgQ>BbMghbOEkk_o9l*3NpXY&df|Xp59rnX#pu$$J@Iank3`YRN2(jXZ z4D0tHN^@cS-Y6?f0QIymVWAQQT9qQJ(}ulYzL>nHEA8on-5tOs48er|7{URdjA+&LESY=AAh&| zm&ZG%I!VtPbwoDm)jY(^?Z}Ycj)AaCL2J>IG{Eq0R z1ld7vk|%WI*U}#l?P>oSa%fID)&3s)&MSj5#LbgA(}D7Ih}-mZWk&aqQ<#+_8=~)h z4eM319m+K~@^Jg$=E0uL5~s6r%zb*=NY)imK|0WZ?9h3koGbU=o=0;(qduv{8*0eWnu`BxI{|xL2Hpqt-c%$TMx^2 z(=aLI30Rw;FH~CnF_hHNbnPB-WL){h>XufK2Texxlk7ZH17$Ej{8CyZxM_OUJNv4#MJ1 zbOiW`fuED~99scBO|PhvNF=~*Vi1N3nVAFbPomc{%&LK zVTnt`KgPcu$R+HE@`C~AQRoLo75Qt@jM7|HcX9b6!Exfk5Fq{Hos|fHx76B-!#*_AxuD3lC zC34_unrYsUgXvJ@>N8v7q=^@$4*H_aOz1|^_;!5EGimo(Ln}gbwjM^*$$Iu(IQ;D0 z_-M{48xPzKZfC~IKv9%5hNq9AprN>FZ|0SuvW38(G5c!YCh|I_AjT~aB=k;^3{B%k zmJV|3^Q|-!E*)Gj5a}0Swg_&=a&MvQo_H4FeuA{#94BhaVZhzWu_Kw-8GVv$3wM%5 zz*&sktW)_BR%;`CRGzn7EN|F7ZlNd+OJ3>DTY4dc9(xe776&y{cjSO@nO7W3Y`X z9|QIcW-#+L>hHND8s%6XqdZ0?vkr}R!3EV+$zimODq{dQKxZ7~n8|XljiWrvGYhT7 z3Ty#wffZQ^?E)*a3fdy8vKrbFtFs2$GHbFH+6r4_OK7WXnXRC$u~oK)w$9es2HFO@ z$2QS6*?sl^ZHqm`*&eC#H@^*>Kzd=|F5)z4p*|h+C93=b5Sy`K8*vV3#(7{awt)G# z0JP!-U?DC77vd7I7?**ixB@K4RbVBq0jqHxSc@CLdfWsy8139VF@P z{0bw!s~*v;F#`wB_muGuj%BphRo*=nsvsjrL@Lh%rpkg3xD(kPQ`J#-7GTIa zCwVJDDEhW^Ly>O9JU41DDob+zSXj`QC=e_V6bVWMWdibg0f#q4ji64@AZQY_2*@wQ z62S)G(hv{OTMD;@*DUh^fLW?7z0SQ6PBmKll}amnVIC7CTY|3su>xxe&99$lp?^Vya8q@7bm z+!o()xmX5iss-v_*Fp(0{z(4%HWt(KWOSb{-&+2*sd^g?`;emGsihCZEWyELAuN-Jx_T0Z0q=>DQO2@&J&Hr~fA*oW$aqvPgNVf_yIQ zhcfK*Nv@&_{V?*5RRI~pP^u+IN~8`>#2Jz>$sNg6L-0{DHcId)7jpb0h11#0Q8tfD z_FKzjc6ufuZ`YH@fNp^JieQp8y`Ukvrr0LfA=o7#9nG-#RglUumefWh_t;03lt$45 zGVi-+XgiEEnO@rxpMzXbC=kyGw42TIk}t9PEvkMeN~WN350RFE L1({m^+SvOS$FsvE literal 0 HcmV?d00001 diff --git a/src/utils/__pycache__/locus.cpython-38.pyc b/src/utils/__pycache__/locus.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..eba8e8cd924ac8c2f214d4d95865c69a85a7fbe6 GIT binary patch literal 736 zcmZvZ&5qMB5XYUbHf__gsu1E8df7;OLqZ6|fy=5KR$OwiOzgIilDNz`NUVHgUxruV z5q#wlxb10Y8X*ONqxpF}_J771e_F4Xg2wihH9un^e(_>D5?Z_^!!6;1&_YKKWZ&aT z==e_TlA7Xw4azbqo|9ul6=_k$I@0ljs1lu!CJouGtp}{v6CwL7eNTqB1Rr@Jg^!7m zPl&NkiHXmMsh<%uKPS$7PMrG%G51U2!mo%+Ul3PXP|0Y&D?-|<&V*%s0^3=%LBb1-1~+DL5fO(=Q`ooXeh{%FG0S-F!v zn!c&WNHvX&$zM#)Tq5Ji-8!Ub+5>O{fLjhJBV)`Mb4JcsF!;@2Mfj)2S12g_L%@cn z$9HAdyAquRbu>5SfCjKsr_Z1UcT-{w&Dmjj=V-fyk9~?vEC~7QvBVn|{3a7mPWZE3 literal 0 HcmV?d00001 diff --git a/src/utils/extract_repeats.py b/src/utils/extract_repeats.py index c520690..64a3edf 100644 --- a/src/utils/extract_repeats.py +++ b/src/utils/extract_repeats.py @@ -5,7 +5,6 @@ from collections import defaultdict import re - def parse_tsv(tsv, loci=None): support = defaultdict(dict) with open(tsv, 'r') as ff: @@ -14,23 +13,18 @@ def parse_tsv(tsv, loci=None): continue cols = line.rstrip().split('\t') locus = cols[0] + ":" + cols[1] + "-" + cols[2] - status = cols[14].strip() # was "does not exist" - # ignore skipped/failed reads - if status != "full": - continue - read_name = cols[7] # was cols[5] - size = cols[10] # was cols[7] - read_start = cols[11] # was cols[8] - strand = cols[12] # was cols[9] + status = "does not exist" + read_name = cols[5] + size = cols[7] + read_start = cols[8] + strand = cols[9] if loci is not None and locus not in loci: continue - # print(read_name) support[locus][read_name] = int(read_start), int(size), strand return support - def extract_repeats(bam, support, flank_size=10): seqs = {} for locus in support: