hyperbrowser · radmilak · Mar 23, 2020 · Mar 23, 2020 · Mar 23, 2020 · Mar 31, 2020
diff --git a/lib/galaxy/dependencies/hyperbrowser-requirements.txt b/lib/galaxy/dependencies/hyperbrowser-requirements.txt
@@ -23,6 +23,10 @@ progressbar>=2.3
 python_dateutil>=1.5
 asteval==0.9.8
 python-Levenshtein==0.12.0
+bx-python>=0.8.2
+pyBigWig>=0.3.13
+plastid>=0.4.8
+#ucsc-bedtobigbed - uncomment when we can use conda for HB dependencies
 
 #Might be needed
 #pycairo>=1.8.10

diff --git a/lib/hb/gold/application/DataTypes.py b/lib/hb/gold/application/DataTypes.py
@@ -27,7 +27,7 @@ def getSupportedFileSuffixes():
 
 # Defined to stop searching for GTrackGenomeElementSource subtypes online.
 def getUnsupportedFileSuffixes():
-    return ['bam', 'bai', 'tab', 'tbi', 'bigwig', 'bw', 'bigbed', 'bb', 'fastq', 'fq', \
+    return ['bam', 'bai', 'tab', 'tbi', 'fastq', 'fq', \
             'csfasta', 'csqual', 'doc', 'docx', 'xls', 'xlsx', 'gp', 'gappedPeak', 'peaks', \
             'bedcluster', 'bedlogr', 'bedrnaelement', 'bedrrbs', 'cel', 'matrix', \
             'pdf', 'peptidemapping', 'shortfrags', 'spikeins', 'pair', 'txt', \

diff --git a/lib/hb/gold/origdata/BedComposer.py b/lib/hb/gold/origdata/BedComposer.py
@@ -6,6 +6,7 @@
 
 ColumnInfo = namedtuple('ColumnInfo', ['colIdx', 'defaultVal', 'checkExtra'])
 
+
 class BedComposer(FileFormatComposer):
     FILE_SUFFIXES = ['bed']
     FILE_FORMAT_NAME = 'BED'
@@ -51,13 +52,6 @@ def matchesTrackFormat(trackFormat):
     # Compose methods
 
     def _compose(self, out):
-        trackName = self._geSource.getTrackName()
-        if trackName is not None:
-            name = ':'.join(self._geSource.getTrackName()).replace(' ','_')
-            trackLine =  'track' + ' name=' + name
-            trackLine += ''.join(" %s=%s" % (key, val) for key, val in self._extraTrackLineAttributes.iteritems())
-            print >>out, trackLine
-
         numCols = self._findNumCols()
         bedColumnsList = list(self._bedColumnsDict.iteritems())
 

diff --git a/lib/hb/gold/origdata/BedGenomeElementSource.py b/lib/hb/gold/origdata/BedGenomeElementSource.py
@@ -3,9 +3,10 @@
 from gold.util.CustomExceptions import InvalidFormatError
 import numpy
 
+
 class BedGenomeElementSource(GenomeElementSource):
     _VERSION = '1.2'
-    #FILE_SUFFIXES = ['bed']
+    FILE_SUFFIXES = ['bed']
     FILE_FORMAT_NAME = 'BED'
     _numHeaderLines = 0
 
@@ -14,7 +15,6 @@ class BedGenomeElementSource(GenomeElementSource):
 
     BED_EXTRA_COLUMNS = ['thickstart', 'thickend', 'itemrgb', 'blockcount', 'blocksizes', 'blockstarts']
 
-
     def __new__(cls, *args, **kwArgs):
         return object.__new__(cls)
 
@@ -91,8 +91,8 @@ def _parseEnd(self, ge, end):
 
 class BedValuedGenomeElementSource(BedGenomeElementSource):
     _VERSION = '1.1'
-#    FILE_SUFFIXES = ['valued.bed', 'marked.bed']
-    FILE_SUFFIXES = ['bed', 'valued.bed', 'marked.bed']
+    FILE_SUFFIXES = ['valued.bed', 'marked.bed']
+    #FILE_SUFFIXES = ['bed', 'valued.bed', 'marked.bed']
     FILE_FORMAT_NAME = 'Valued BED'
 
 #    MIN_NUM_COLS = 5

diff --git a/lib/hb/gold/origdata/BigBedComposer.py b/lib/hb/gold/origdata/BigBedComposer.py
@@ -0,0 +1,162 @@
+import subprocess
+import tempfile
+from collections import OrderedDict
+
+from gold.origdata.BedComposer import BedComposer
+from BedComposer import ColumnInfo
+from quick.util.GenomeInfo import GenomeInfo
+from gold.util.CustomExceptions import InvalidFormatError
+
+
+class BigBedComposer(BedComposer):
+    FILE_SUFFIXES = ['bb', 'bigbed']
+    FILE_FORMAT_NAME = 'BigBed'
+
+    _BED_COLUMNS_LIST = [('chr', 0, '', ()), \
+                         ('start', 1, '', ()), \
+                         ('end', 2, '', ()), \
+                         (('id', 'name'), 3, '.', ()), \
+                         ('val', 4, '0', ()), \
+                         ('strand', 5, '.', ()), \
+                         ('thickstart', 6, '0', ('thickend',)), \
+                         ('thickend', 7, '0', ('thickstart',)), \
+                         (('itemrgb', 'reserved'), 8, '0', ()), \
+                         ('blockcount', 9, '0', ('blocksizes', 'blockstarts')), \
+                         ('blocksizes', 10, '.', ('blockcount', 'blockstarts')), \
+                         (('blockstarts', 'chromstarts'), 11, '.', ('blockcount', 'blocksizes'))]
+
+    _BED_COLUMNS_AUTOSQL_STR = 'string chrom;       "Reference sequence chromosome or scaffold"\n \
+   uint   chromStart;  "Start position in chromosome"\n \
+   uint   chromEnd;    "End position in chromosome"\n \
+   string name;        "Name of item."\n \
+   uint score;          "Score (0-1000)"\n \
+   char[1] strand;     "+ or - for strand"\n \
+   uint thickStart;   "Start of where display should be thick (start codon)"\n \
+   uint thickEnd;     "End of where display should be thick (stop codon)"\n \
+   uint reserved;     "Used as itemRgb as of 2004-11-22"\n \
+   int blockCount;    "Number of blocks"\n \
+   int[blockCount] blockSizes; "Comma separated list of block sizes"\n \
+   int[blockCount] chromStarts; "Start positions relative to chromStart"\n'
+
+    _BED_COLUMNS_AUTOSQL = _BED_COLUMNS_AUTOSQL_STR.splitlines(True)
+
+    def __init__(self, geSource):
+        BedComposer.__init__(self, geSource)
+        self._prefixSet = self._geSource.getPrefixList()
+        self._extraCols = []  # placeholder; _createColumnsDict() below assigns the real list
+        self._bedColumnsDict = self._createColumnsDict(self._prefixSet[:])  # pass a copy: the mapping helpers modify the list
+        self._init()
+
+    def _createColumnsDict(self, geCols):
+        """Build the ordered column-name -> ColumnInfo mapping used for composing.
+
+        Standard BED columns come first (alternative names and letter case are
+        resolved by _mapColsToStandardCols); unmapped columns are appended after
+        the last mapped index with '.' as default value and kept in _extraCols.
+        """
+        cols, extraCols = self._mapColsToStandardCols(geCols)
+        self._extraCols = extraCols
+
+        nextColIndex = cols[-1][1] + 1
+        for offset, extraCol in enumerate(extraCols):
+            cols.append((extraCol, nextColIndex + offset, '.', ()))
+
+        return OrderedDict((colName, ColumnInfo(colIdx, defaultVal, checkExtra))
+                           for colName, colIdx, defaultVal, checkExtra in cols)
+
+    def _compose(self, out):
+        """Write the BigBed file to out.name by running the external bedToBigBed tool.
+
+        Raises InvalidFormatError if bedToBigBed exits with a non-zero status.
+        """
+        tmpBedFile = tmpChromSizes = tmpAutoSql = None
+        try:
+            tmpBedFile = self._getBedFile()
+            tmpChromSizes = self._getChromSizesFile()
+            cmds = ['bedToBigBed', tmpBedFile.name, tmpChromSizes.name, out.name]
+            # The type is "bedN", or "bedN+M" when M non-standard columns follow.
+            bedtype = 'bed%s' % (self._findNumCols() - len(self._extraCols))
+            if self._extraCols:
+                bedtype += '+%s' % len(self._extraCols)
+                tmpAutoSql = tempfile.NamedTemporaryFile(suffix='.as')
+                tmpAutoSql.write(self._createAutoSql())
+                tmpAutoSql.flush()
+                cmds.append('-as=%s' % tmpAutoSql.name)
+            cmds.append('-type=%s' % bedtype)
+            try:
+                subprocess.check_call(cmds)
+            except subprocess.CalledProcessError:
+                raise InvalidFormatError('There was an error while composing the BigBed file.')
+        finally:
+            # Runs on every exit path; closing a NamedTemporaryFile also deletes it.
+            for tmpFile in (tmpBedFile, tmpChromSizes, tmpAutoSql):
+                if tmpFile is not None:
+                    tmpFile.close()
+
+    def returnComposed(self, ignoreEmpty=False, **kwArgs):
+        """Compose into a temporary .bb file and return that file's content as a string."""
+        tmpOut = tempfile.NamedTemporaryFile(suffix='.bb')
+        try:
+            self._composeCommon(tmpOut, ignoreEmpty, **kwArgs)
+            return tmpOut.read()
+        finally:
+            tmpOut.close()  # also on errors, so the temporary file never lingers
+
+    def _findNumCols(self):
+        return len(self._bedColumnsDict)  # counts the mapped standard columns plus the extra ones
+
+    def _mapColsToStandardCols(self, geCols):
+        """Split geCols (modified in place) into standard BED column tuples and leftovers."""
+        geCols.append('chr')
+        cols = []
+        lowercasePrefixMap = dict((p.lower(), p) for p in geCols)
+
+        for colDefTuple in self._BED_COLUMNS_LIST:
+            colName = colDefTuple[0]
+            # A tuple lists alternative names that share one BED column.
+            altNames = colName if isinstance(colName, tuple) else (colName,)
+            for item in altNames:
+                if item in lowercasePrefixMap:
+                    self._handleStandardCol(cols, geCols, lowercasePrefixMap[item], colDefTuple)
+                    # Only the first alternative may claim the column; any other
+                    # stays in geCols and is returned as a leftover (extra) column.
+                    break
+
+        return cols, geCols
+
+    def _handleStandardCol(self, cols, geCols, colName, colDefTuple):
+        cols.append((colName,) + colDefTuple[1:])  # keep the given spelling of the name; index and defaults come from the definition
+        geCols.remove(colName)  # whatever is left in geCols ends up as the caller's leftover columns
+
+    def _getChromSizesFile(self):
+        """Return an open temporary file holding one tab-separated name/size line per dict entry."""
+        lengthByChrom = GenomeInfo.getStdChrLengthDict(self._geSource.getGenome())
+        sizesFile = tempfile.NamedTemporaryFile(suffix='.sizes')
+        sizesFile.writelines(chrom + '\t' + str(size) + '\n'
+                             for chrom, size in lengthByChrom.iteritems())
+        sizesFile.flush()  # push buffered lines out so the file can be read by name
+        return sizesFile
+
+    def _createAutoSql(self):
+        """Return the autoSql text: the used standard BED fields, then one string field per extra column."""
+        numStdCols = self._findNumCols() - len(self._extraCols)
+        parts = ['table FromBigBedComposer\n',
+                 '"Automatically genearated"\n(\n']
+        parts.extend(self._BED_COLUMNS_AUTOSQL[:numStdCols])
+        parts.extend('string ' + extraCol + '; " extra field"\n'
+                     for extraCol in self._extraCols)
+        parts.append(')')
+        return ''.join(parts)
+
+    def _getBedFile(self):
+        tmpFile = tempfile.NamedTemporaryFile(suffix='.bed')
+        BedComposer._compose(self, tmpFile)  # called on the parent explicitly, because this class overrides _compose
+        tmpFile.flush()
+
+        return tmpFile  # returned open; the caller has to close it
+
+    def _closeFiles(self, tmpBedFile, tmpChromSizes, tmpAutoSql):
+        tmpBedFile.close()
+        tmpChromSizes.close()
+        if tmpAutoSql:  # callers pass None when no autoSql file was created
+            tmpAutoSql.close()
diff --git a/lib/hb/gold/origdata/BigBedGenomeElementSource.py b/lib/hb/gold/origdata/BigBedGenomeElementSource.py
@@ -0,0 +1,143 @@
+import os
+from copy import copy
+
+import numpy as np
+import pyBigWig
+from plastid.readers.autosql import AutoSqlDeclaration
+
+from gold.origdata.GenomeElementSource import GenomeElementSource
+from gold.origdata.GenomeElement import GenomeElement
+from gold.util.CommonConstants import BINARY_MISSING_VAL
+from gold.util.CustomExceptions import InvalidFormatError
+
+
+class BigBedGenomeElementSource(GenomeElementSource):
+    _VERSION = '1.0'
+    FILE_SUFFIXES = ['bb', 'bigbed']
+    FILE_FORMAT_NAME = 'BigBed'
+    BED_EXTRA_COLUMNS = ['name', 'score', 'strand', 'thickStart', 'thickEnd', 'reserved', 'blockCount', 'blockSizes',
+                         'blockStarts']
+
+    _numHeaderLines = 0
+    _isSliceSource = True
+    _isSorted = True
+    _inputIsOneIndexed = True
+    _inputIsEndInclusive = True
+    _addsStartElementToDenseIntervals = False
+
+    def __new__(cls, *args, **kwArgs):
+        return object.__new__(cls)  # presumably sidesteps a __new__ in GenomeElementSource - TODO confirm against the base class
+
+    def __init__(self, *args, **kwArgs):
+        GenomeElementSource.__init__(self, *args, **kwArgs)
+        # Defined up front so that instances for (near-)empty files are complete too.
+        self._extraColNames = None
+        self._numOfExtraCols = 0
+        self._chrIter = iter([])
+        self._parseValVec = np.vectorize(self._parseVal)
+        self._getStrandFromStringVec = np.vectorize(self._getStrandFromString)
+        # using 1 as length because new line character gets added in TestGenomeElementSource
+        if os.stat(self._fn).st_size > 1:
+            self._bigBedFile = pyBigWig.open(self._fn)
+            self._chrIter = iter(sorted(self._bigBedFile.chroms().items()))
+            self._extraColNames = self._initColumnNames()
+            self._numOfExtraCols = len(self._extraColNames or [])
+
+    def __iter__(self):
+        # using 1 as length because new line character gets added in TestGenomeElementSource
+        if os.stat(self._fn).st_size > 1:
+            self._bigBedFile = pyBigWig.open(self._fn)  # fresh handle; next() closes it once the chromosomes run out
+            self._chrIter = iter(sorted(self._bigBedFile.chroms().items()))  # sorted items; next() unpacks each as (name, length)
+        else:
+            self._chrIter = iter([])  # nothing to read: the first next() raises StopIteration
+        geIter = copy(self)  # shallow copy, so it shares the handle and chromosome iterator set above
+
+        return geIter
+
+    def _initColumnNames(self):
+        """Return the autoSql field names after the first three, or None if the file has no autoSql."""
+        autoSql = self._bigBedFile.SQL()
+        if not autoSql:
+            return None  # next() then infers the names from the first entries it reads
+        return list(AutoSqlDeclaration(autoSql).field_formatters.keys())[3:]
+
+    def _iter(self):
+        return self  # this object is its own iterator; next() produces the elements
+
+    def next(self):
+        currentChrom = next(self._chrIter, None)
+        if not currentChrom:  # chromosomes exhausted
+            if hasattr(self, '_bigBedFile'):  # the attribute is missing when the file was too small to open
+                self._bigBedFile.close()
+            raise StopIteration
+
+        chrName, chrLengths = currentChrom
+
+        entries = self._bigBedFile.entries(str(chrName), 0, chrLengths)  # query spans 0..chromosome length in one call
+        # still None when the file had no autoSql: infer the names from the entries just read
+        if self._extraColNames is None:
+            self._initExtraCols(entries)
+
+        start, end = self._parseStartAndEnd(entries)
+
+        ge = GenomeElement(genome=self._genome, chr=chrName,
+                           start=start, end=end)  # one element per chromosome; start/end are arrays
+
+        if self._numOfExtraCols != 0:
+            strVals = [x[2] for x in entries]  # third item: the remaining fields as one tab-separated string
+            values = np.genfromtxt(strVals, dtype=None, names=self._extraColNames, delimiter='\t')
+            if values.size == 1:
+                values = values.reshape((1,))  # force a 1-d array when there is a single value row
+            tmpColNames = self._extraColNames[:]  # copy, so score/strand can be dropped once handled
+            if 'score' in self._extraColNames:
+                ge.val = np.array(self._parseValVec(values['score']), dtype=np.int32)
+                tmpColNames.remove('score')
+            if 'strand' in self._extraColNames:
+                ge.strand = np.array(self._getStrandFromStringVec(values['strand']), dtype=np.int8)
+                tmpColNames.remove('strand')
+            for colName in tmpColNames:  # every remaining column is attached as a string array
+                setattr(ge, colName, values[colName].astype(str))
+
+        return ge
+
+    def _initExtraCols(self, entries):
+        """Set _extraColNames/_numOfExtraCols from the extra fields of the first entry."""
+        # Index 2 (the tab-separated rest string) only exists in tuples of 3+ items.
+        firstEntry = entries[0] if entries else ()
+        if len(firstEntry) >= 3 and firstEntry[2]:
+            extraCols = firstEntry[2].split('\t')
+        else:
+            extraCols = []
+
+        # Standard BED names are assigned by position to the extra fields found.
+        self._extraColNames = self.BED_EXTRA_COLUMNS[:len(extraCols)]
+        self._numOfExtraCols = len(extraCols)
+
+    def _parseStartAndEnd(self, entries):
+        """Return the first and second item of every entry as two int32 arrays (start, end)."""
+        intervalType = np.dtype([('start', 'int32'), ('end', 'int32')])
+        intervals = np.array([(entry[0], entry[1]) for entry in entries], dtype=intervalType)
+        return intervals['start'], intervals['end']
+
+    def _parseVal(self, strVal):
+        """Convert a score field to int; the placeholders '-' and '.' count as 0."""
+        if strVal not in ('-', '.'):
+            return int(strVal)
+
+        # placeholder standing in for a missing score
+        return 0
+
+    @classmethod
+    def _getStrandFromString(cls, val):
+        """Map a strand symbol to 1 ('+'), 0 ('-') or BINARY_MISSING_VAL ('.').
+
+        Raises InvalidFormatError for any other value.
+        """
+        for symbol, strand in (('+', 1), ('-', 0), ('.', BINARY_MISSING_VAL)):
+            if val == symbol:
+                return strand
+        raise InvalidFormatError(
+            "Error: strand must be either '+', '-' or '.'. Value: %s" % val)
+
+    def getValDataType(self):
+        return 'int32'  # same type as the int32 array next() stores in ge.val