From 63c3f13ce308c100e791be8e762ac230547cf271 Mon Sep 17 00:00:00 2001 From: Boris Simovski Date: Fri, 13 Apr 2018 10:50:55 +0200 Subject: [PATCH 1/2] CHERRYPICK to master: Optimize raw overlap algorithm (cherry picked from commit 9b817e6) --- lib/hb/gold/statistic/RawOverlapStat.py | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/lib/hb/gold/statistic/RawOverlapStat.py b/lib/hb/gold/statistic/RawOverlapStat.py index c507acfedb84..35501690263a 100644 --- a/lib/hb/gold/statistic/RawOverlapStat.py +++ b/lib/hb/gold/statistic/RawOverlapStat.py @@ -67,7 +67,7 @@ def _findAllStartAndEndEvents(t1s, t1e, t2s, t2e): t2CodedEnds= t2e * 8 +2 allSortedCodedEvents = numpy.concatenate( (t1CodedStarts,t1CodedEnds,t2CodedStarts,t2CodedEnds) ) - allSortedCodedEvents.sort() + allSortedCodedEvents.sort(kind='mergesort') allEventCodes = (allSortedCodedEvents % 8) -4 @@ -83,14 +83,21 @@ def _findAllStartAndEndEvents(t1s, t1e, t2s, t2e): @classmethod def _computeRawOverlap(cls, t1s, t1e, t2s, t2e, binSize): - allSortedDecodedEvents, allEventLengths, cumulativeCoverStatus = cls._findAllStartAndEndEvents(t1s, t1e, t2s, t2e) - - tn,fp,fn,tp = [long((allEventLengths[ cumulativeCoverStatus[:-1] == status ]).sum()) for status in range(4)] - - if len(allSortedDecodedEvents)>0: - tn += allSortedDecodedEvents[0] + (binSize - allSortedDecodedEvents[-1]) - else: - tn+=binSize + # allSortedDecodedEvents, allEventLengths, cumulativeCoverStatus = cls._findAllStartAndEndEvents(t1s, t1e, t2s, t2e) + # + # tn,fp,fn,tp = [long((allEventLengths[ cumulativeCoverStatus[:-1] == status ]).sum()) for status in range(4)] + # + # if len(allSortedDecodedEvents)>0: + # tn += allSortedDecodedEvents[0] + (binSize - allSortedDecodedEvents[-1]) + # else: + # tn+=binSize + + starts = numpy.sort(numpy.concatenate((t1s, t2s)), kind="mergesort") + ends = numpy.sort(numpy.concatenate((t1e, t2e)), kind="mergesort") + tp = long(numpy.sum(numpy.maximum(ends[:-1] - starts[1:], 0))) 
+ fp = long(numpy.sum(t1e - t1s)) - tp + fn = long(numpy.sum(t2e - t2s)) - tp + tn = binSize - tp - fp - fn return tn,fp,fn,tp From 023ad59d2237a5f5d1132a4a4408a1649b4fb1c4 Mon Sep 17 00:00:00 2001 From: Boris Simovski Date: Fri, 13 Apr 2018 10:51:35 +0200 Subject: [PATCH 2/2] CHERRYPICK to master: Optimize shuffling of numpy arrays (cherry picked from commit 33aa523) --- lib/hb/gold/track/RandomizedSegsTrack.py | 8 ++++++-- lib/hb/gold/track/ShuffledMarksTrack.py | 3 ++- lib/hb/quick/extra/StandardizeTrackFiles.py | 3 ++- .../quick/track/SegsSampledByDistanceToReferenceTrack.py | 3 ++- 4 files changed, 12 insertions(+), 5 deletions(-) diff --git a/lib/hb/gold/track/RandomizedSegsTrack.py b/lib/hb/gold/track/RandomizedSegsTrack.py index 313a18874638..a1bdcd4422bc 100644 --- a/lib/hb/gold/track/RandomizedSegsTrack.py +++ b/lib/hb/gold/track/RandomizedSegsTrack.py @@ -40,7 +40,9 @@ def _permuteSegs(self, starts, ends, vals, strands, ids, edges, weights, extras) segLens = ends-starts #permuting order (of length-elements) of both pools if vals is None and strands is None: - numpy.random.shuffle(segLens) + # numpy.random.shuffle(segLens) + permutIndexes = numpy.random.permutation(len(segLens)) + segLens = segLens[permutIndexes] else: permutIndexes = numpy.random.permutation( len(segLens) ) segLens = segLens[permutIndexes] @@ -61,7 +63,9 @@ def _permuteIntersegs(self, starts, ends, binLen): #add start and end-case of bin. Double-check with statistician.. 
intersegLens = numpy.append(intersegLens, [starts[0], binLen-ends[-1]]) - numpy.random.shuffle(intersegLens) + # numpy.random.shuffle(intersegLens) + permutIndexes = numpy.random.permutation(len(intersegLens)) + intersegLens = intersegLens[permutIndexes] return intersegLens def _sampleIntervals(self, totalSpace, numElements): diff --git a/lib/hb/gold/track/ShuffledMarksTrack.py b/lib/hb/gold/track/ShuffledMarksTrack.py index f5164e7447ee..766e66708c49 100644 --- a/lib/hb/gold/track/ShuffledMarksTrack.py +++ b/lib/hb/gold/track/ShuffledMarksTrack.py @@ -9,6 +9,7 @@ def _checkTrackFormat(self, origTV): def _createRandomizedNumpyArrays(self, binLen, starts, ends, vals, strands, ids, edges, weights, extras, region): newVals = numpy.copy(vals) - numpy.random.shuffle(newVals) + permutIndexes = numpy.random.permutation(len(newVals)) + newVals = newVals[permutIndexes] return starts, ends, newVals, strands, ids, edges, weights, extras diff --git a/lib/hb/quick/extra/StandardizeTrackFiles.py b/lib/hb/quick/extra/StandardizeTrackFiles.py index 4cce10815a8c..98916d798a07 100644 --- a/lib/hb/quick/extra/StandardizeTrackFiles.py +++ b/lib/hb/quick/extra/StandardizeTrackFiles.py @@ -730,7 +730,8 @@ def parseFile(cls,inFn, outFn, numElements=None, **kwargs): import numpy.random as nr lineNumbers = range(len(lines)) - nr.shuffle(lineNumbers) + permutIndexes = nr.permutation(len(lineNumbers)) + lineNumbers = [lineNumbers[i] for i in permutIndexes] lineNumbers = lineNumbers[0:numElements] for lineNum in lineNumbers: diff --git a/lib/hb/quick/track/SegsSampledByDistanceToReferenceTrack.py b/lib/hb/quick/track/SegsSampledByDistanceToReferenceTrack.py index 357ccec74502..02e2dc392d9e 100644 --- a/lib/hb/quick/track/SegsSampledByDistanceToReferenceTrack.py +++ b/lib/hb/quick/track/SegsSampledByDistanceToReferenceTrack.py @@ -113,7 +113,8 @@ def _createRandomizedNumpyArraysFromDistanceToReference(self, binLen, starts, en assert len(sampledPositions) == numElements sampledElementLengths =
elementLengths - numpy.random.shuffle(sampledElementLengths) + permutIndexes = numpy.random.permutation(len(sampledElementLengths)) + sampledElementLengths = sampledElementLengths[permutIndexes] sampledPositions = numpy.array(sampledPositions) sampledPositions.sort() sampledStarts = (sampledPositions - (sampledElementLengths/2)).astype('int')