Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ft master/new fileformats support #132

Open
wants to merge 6 commits into
base: hb_master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions lib/galaxy/dependencies/hyperbrowser-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@ progressbar>=2.3
python_dateutil>=1.5
asteval==0.9.8
python-Levenshtein==0.12.0
bx-python>=0.8.2
pyBigWig>=0.3.13
plastid>=0.4.8
#ucsc-bedtobigbed - uncomment when we can use conda for HB dependencies

#Might be needed
#pycairo>=1.8.10
Expand Down
2 changes: 1 addition & 1 deletion lib/hb/gold/application/DataTypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def getSupportedFileSuffixes():

# Defined to stop searching for GTrackGenomeElementSource subtypes online.
def getUnsupportedFileSuffixes():
return ['bam', 'bai', 'tab', 'tbi', 'bigwig', 'bw', 'bigbed', 'bb', 'fastq', 'fq', \
return ['bam', 'bai', 'tab', 'tbi', 'fastq', 'fq', \
'csfasta', 'csqual', 'doc', 'docx', 'xls', 'xlsx', 'gp', 'gappedPeak', 'peaks', \
'bedcluster', 'bedlogr', 'bedrnaelement', 'bedrrbs', 'cel', 'matrix', \
'pdf', 'peptidemapping', 'shortfrags', 'spikeins', 'pair', 'txt', \
Expand Down
8 changes: 1 addition & 7 deletions lib/hb/gold/origdata/BedComposer.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

ColumnInfo = namedtuple('ColumnInfo', ['colIdx', 'defaultVal', 'checkExtra'])


class BedComposer(FileFormatComposer):
FILE_SUFFIXES = ['bed']
FILE_FORMAT_NAME = 'BED'
Expand Down Expand Up @@ -51,13 +52,6 @@ def matchesTrackFormat(trackFormat):
# Compose methods

def _compose(self, out):
trackName = self._geSource.getTrackName()
if trackName is not None:
name = ':'.join(self._geSource.getTrackName()).replace(' ','_')
trackLine = 'track' + ' name=' + name
trackLine += ''.join(" %s=%s" % (key, val) for key, val in self._extraTrackLineAttributes.iteritems())
print >>out, trackLine

numCols = self._findNumCols()
bedColumnsList = list(self._bedColumnsDict.iteritems())

Expand Down
8 changes: 4 additions & 4 deletions lib/hb/gold/origdata/BedGenomeElementSource.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,10 @@
from gold.util.CustomExceptions import InvalidFormatError
import numpy


class BedGenomeElementSource(GenomeElementSource):
_VERSION = '1.2'
#FILE_SUFFIXES = ['bed']
FILE_SUFFIXES = ['bed']
FILE_FORMAT_NAME = 'BED'
_numHeaderLines = 0

Expand All @@ -14,7 +15,6 @@ class BedGenomeElementSource(GenomeElementSource):

BED_EXTRA_COLUMNS = ['thickstart', 'thickend', 'itemrgb', 'blockcount', 'blocksizes', 'blockstarts']


def __new__(cls, *args, **kwArgs):
return object.__new__(cls)

Expand Down Expand Up @@ -91,8 +91,8 @@ def _parseEnd(self, ge, end):

class BedValuedGenomeElementSource(BedGenomeElementSource):
_VERSION = '1.1'
# FILE_SUFFIXES = ['valued.bed', 'marked.bed']
FILE_SUFFIXES = ['bed', 'valued.bed', 'marked.bed']
FILE_SUFFIXES = ['valued.bed', 'marked.bed']
#FILE_SUFFIXES = ['bed', 'valued.bed', 'marked.bed']
FILE_FORMAT_NAME = 'Valued BED'

# MIN_NUM_COLS = 5
Expand Down
162 changes: 162 additions & 0 deletions lib/hb/gold/origdata/BigBedComposer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
import subprocess
import tempfile
from collections import OrderedDict

from gold.origdata.BedComposer import BedComposer
from BedComposer import ColumnInfo
from quick.util.GenomeInfo import GenomeInfo
from gold.util.CustomExceptions import InvalidFormatError


class BigBedComposer(BedComposer):
    """Compose BigBed output from a genome element source.

    The elements are first written to a temporary BED file by
    BedComposer._compose, and that file is then converted by running the
    external ``bedToBigBed`` program. Columns of the source that do not map
    to a standard BED column are appended as extra string columns and
    described in a generated autoSql file.
    """
    FILE_SUFFIXES = ['bb', 'bigbed']
    FILE_FORMAT_NAME = 'BigBed'

    # One tuple per standard BED column:
    #   (lowercase column name, or tuple of accepted alternative names,
    #    BED column index, default value, checkExtra)
    # The last three items are passed unchanged into ColumnInfo; how
    # checkExtra is interpreted is decided by BedComposer (not visible here).
    _BED_COLUMNS_LIST = [('chr', 0, '', ()),
                         ('start', 1, '', ()),
                         ('end', 2, '', ()),
                         (('id', 'name'), 3, '.', ()),
                         ('val', 4, '0', ()),
                         ('strand', 5, '.', ()),
                         ('thickstart', 6, '0', ('thickend',)),
                         ('thickend', 7, '0', ('thickstart',)),
                         (('itemrgb', 'reserved'), 8, '0', ()),
                         ('blockcount', 9, '0', ('blocksizes', 'blockstarts')),
                         ('blocksizes', 10, '.', ('blockcount', 'blockstarts')),
                         (('blockstarts', 'chromstarts'), 11, '.', ('blockcount', 'blocksizes'))]

    # autoSql field declarations for the twelve standard BED columns, one
    # line per column and in the same order as _BED_COLUMNS_LIST, so that a
    # prefix of this list describes a bedN file.
    _BED_COLUMNS_AUTOSQL = [
        '    string chrom; "Reference sequence chromosome or scaffold"\n',
        '    uint chromStart; "Start position in chromosome"\n',
        '    uint chromEnd; "End position in chromosome"\n',
        '    string name; "Name of item."\n',
        '    uint score; "Score (0-1000)"\n',
        '    char[1] strand; "+ or - for strand"\n',
        '    uint thickStart; "Start of where display should be thick (start codon)"\n',
        '    uint thickEnd; "End of where display should be thick (stop codon)"\n',
        '    uint reserved; "Used as itemRgb as of 2004-11-22"\n',
        '    int blockCount; "Number of blocks"\n',
        '    int[blockCount] blockSizes; "Comma separated list of block sizes"\n',
        '    int[blockCount] chromStarts; "Start positions relative to chromStart"\n',
    ]

    # Kept for backward compatibility: the same declarations as one string.
    _BED_COLUMNS_AUTOSQL_STR = ''.join(_BED_COLUMNS_AUTOSQL)

    def __init__(self, geSource):
        BedComposer.__init__(self, geSource)
        self._prefixSet = self._geSource.getPrefixList()
        self._extraCols = []
        # Pass a copy: _mapColsToStandardCols mutates the list it is given.
        self._bedColumnsDict = self._createColumnsDict(self._prefixSet[:])
        self._init()

    def _createColumnsDict(self, geCols):
        """Build the ordered column-name -> ColumnInfo mapping.

        Handles alternative column names and case differences
        (lowercase/camelCase). Columns of geCols that match no standard BED
        column are appended after the last mapped column, with '.' as
        default value, and are remembered in self._extraCols.

        Note: geCols is modified in place.
        """
        cols, extraCols = self._mapColsToStandardCols(geCols)

        # cols is in BED column order, so its last item has the highest index.
        lastColIndex = cols[-1][1]
        for extraCol in extraCols:
            lastColIndex += 1
            cols.append((extraCol, lastColIndex, '.', ()))

        self._extraCols = extraCols

        return OrderedDict((colName, ColumnInfo(colIdx, defaultVal, checkExtra))
                           for colName, colIdx, defaultVal, checkExtra in cols)

    def _compose(self, out):
        """Write the BigBed file to the path ``out.name`` using bedToBigBed.

        Only the ``name`` attribute of ``out`` is used; the external program
        writes to that path itself.

        Raises InvalidFormatError if bedToBigBed exits with a non-zero
        status. Any other exception (for instance OSError when the program
        is not installed) propagates unchanged. The temporary files are
        closed in every case.
        """
        tmpBedFile = tmpChromSizes = tmpAutoSql = None
        try:
            tmpBedFile = self._getBedFile()
            tmpChromSizes = self._getChromSizesFile()

            cmds = ['bedToBigBed', tmpBedFile.name, tmpChromSizes.name, out.name]

            numExtraCols = len(self._extraCols)
            bedType = 'bed%s' % (self._findNumCols() - numExtraCols)
            if self._extraCols:
                # Extra columns require the "bedN+M" type and an autoSql
                # file that names them.
                bedType += '+%s' % numExtraCols
                tmpAutoSql = tempfile.NamedTemporaryFile(suffix='.as')
                tmpAutoSql.write(self._createAutoSql())
                tmpAutoSql.flush()
                cmds.append('-as=%s' % tmpAutoSql.name)
            cmds.append('-type=%s' % bedType)

            try:
                subprocess.check_call(cmds)
            except subprocess.CalledProcessError:
                raise InvalidFormatError('There was an error while composing the BigBed file.')
        finally:
            self._closeFiles(tmpBedFile, tmpChromSizes, tmpAutoSql)

    def returnComposed(self, ignoreEmpty=False, **kwArgs):
        """Compose into a temporary '.bb' file and return its contents.

        The temporary file is closed even if composing fails.
        """
        tmpOut = tempfile.NamedTemporaryFile(suffix='.bb')
        try:
            self._composeCommon(tmpOut, ignoreEmpty, **kwArgs)
            # Nothing is written through this handle in this class, so the
            # read starts at the beginning of the file.
            return tmpOut.read()
        finally:
            tmpOut.close()

    def _findNumCols(self):
        # Total number of columns: mapped standard columns plus extra ones.
        return len(self._bedColumnsDict)

    def _mapColsToStandardCols(self, geCols):
        """Match the source columns against the standard BED columns.

        Matching is done on the lowercased source column names. Returns a
        tuple (cols, geCols): cols holds one (name, colIdx, defaultVal,
        checkExtra) tuple per matched column, using the name as spelled in
        the source; geCols holds the names that matched nothing.

        Note: geCols is modified in place ('chr' is appended, matched names
        are removed), and the same list object is returned.
        """
        geCols.append('chr')
        lowercasePrefixMap = dict((prefix.lower(), prefix) for prefix in geCols)

        cols = []
        for colDefTuple in self._BED_COLUMNS_LIST:
            stdNames = colDefTuple[0]
            if not isinstance(stdNames, tuple):
                stdNames = (stdNames,)
            # Every alternative name that is present gets mapped to this
            # BED column.
            for stdName in stdNames:
                if stdName in lowercasePrefixMap:
                    self._handleStandardCol(cols, geCols, lowercasePrefixMap[stdName], colDefTuple)

        return cols, geCols

    def _handleStandardCol(self, cols, geCols, colName, colDefTuple):
        # Record the column under the source's own spelling and mark it as
        # handled by removing it from the list of unmatched columns.
        cols.append((colName,) + colDefTuple[1:])
        geCols.remove(colName)

    def _getChromSizesFile(self):
        """Return an open temporary file with one 'chrom<TAB>size' line per
        chromosome of the source's genome. The caller must close it."""
        chromSizes = GenomeInfo.getStdChrLengthDict(self._geSource.getGenome())
        tmpChromSizes = tempfile.NamedTemporaryFile(suffix='.sizes')
        try:
            for chrom, size in chromSizes.iteritems():
                tmpChromSizes.write(chrom + '\t' + str(size) + '\n')
            tmpChromSizes.flush()
        except Exception:
            # Do not leave the temporary file behind when writing fails.
            tmpChromSizes.close()
            raise

        return tmpChromSizes

    def _createAutoSql(self):
        """Return the autoSql declaration for the composed columns: the
        standard BED fields in use, followed by one string field per extra
        column."""
        numStdCols = self._findNumCols() - len(self._extraCols)

        parts = ['table FromBigBedComposer\n',
                 '"Automatically genearated"\n(\n']
        parts.extend(self._BED_COLUMNS_AUTOSQL[:numStdCols])
        parts.extend('string ' + extraCol + '; " extra field"\n'
                     for extraCol in self._extraCols)
        parts.append(')')

        return ''.join(parts)

    def _getBedFile(self):
        """Return an open temporary file holding the BED text written by
        BedComposer._compose. The caller must close it."""
        tmpFile = tempfile.NamedTemporaryFile(suffix='.bed')
        try:
            BedComposer._compose(self, tmpFile)
            tmpFile.flush()
        except Exception:
            # Do not leave the temporary file behind when composing fails.
            tmpFile.close()
            raise

        return tmpFile

    def _closeFiles(self, tmpBedFile, tmpChromSizes, tmpAutoSql):
        # Any argument may be None when the corresponding file was never
        # created.
        for tmpFile in (tmpBedFile, tmpChromSizes, tmpAutoSql):
            if tmpFile is not None:
                tmpFile.close()
143 changes: 143 additions & 0 deletions lib/hb/gold/origdata/BigBedGenomeElementSource.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
import os
from copy import copy

import numpy as np
import pyBigWig
from plastid.readers.autosql import AutoSqlDeclaration

from gold.origdata.GenomeElementSource import GenomeElementSource
from gold.origdata.GenomeElement import GenomeElement
from gold.util.CommonConstants import BINARY_MISSING_VAL
from gold.util.CustomExceptions import InvalidFormatError


class BigBedGenomeElementSource(GenomeElementSource):
    """Genome element source for BigBed files, read through pyBigWig.

    Each call to next() returns one GenomeElement per chromosome, holding
    numpy arrays with all entries of that chromosome.
    """
    _VERSION = '1.0'
    FILE_SUFFIXES = ['bb', 'bigbed']
    FILE_FORMAT_NAME = 'BigBed'
    # Names assigned, in order, to the columns after chrom/start/end when
    # the file carries no autoSql declaration.
    BED_EXTRA_COLUMNS = ['name', 'score', 'strand', 'thickStart', 'thickEnd', 'reserved', 'blockCount', 'blockSizes',
                         'blockStarts']

    # Flags read by GenomeElementSource and its users (semantics are
    # defined there, not in this class).
    # NOTE(review): confirm that the one-indexed/end-inclusive flags match
    # the coordinates pyBigWig returns.
    _numHeaderLines = 0
    _isSliceSource = True
    _isSorted = True
    _inputIsOneIndexed = True
    _inputIsEndInclusive = True
    _addsStartElementToDenseIntervals = False

    def __new__(cls, *args, **kwArgs):
        return object.__new__(cls)

    def __init__(self, *args, **kwArgs):
        GenomeElementSource.__init__(self, *args, **kwArgs)

        # None means "not known yet": the names are then derived from the
        # first entries read in next().
        self._extraColNames = None
        self._numOfExtraCols = 0

        if self._fileHasContent():
            self._openBigBedFile()
            self._extraColNames = self._initColumnNames()
            if self._extraColNames:
                self._numOfExtraCols = len(self._extraColNames)
        else:
            # Nothing to open; iteration ends immediately.
            self._chrIter = iter([])

        self._parseValVec = np.vectorize(self._parseVal)
        self._getStrandFromStringVec = np.vectorize(self._getStrandFromString)

    def __iter__(self):
        """Restart iteration and return a shallow copy of this source that
        acts as the iterator."""
        if self._fileHasContent():
            self._openBigBedFile()
        else:
            self._chrIter = iter([])

        return copy(self)

    def _fileHasContent(self):
        # using 1 as length because new line character gets added in TestGenomeElementSource
        return os.stat(self._fn).st_size > 1

    def _openBigBedFile(self):
        # Open the file and set up iteration over (chromosome name, length)
        # pairs, sorted by chromosome name.
        self._bigBedFile = pyBigWig.open(self._fn)
        self._chrIter = iter(sorted(self._bigBedFile.chroms().items()))

    def _initColumnNames(self):
        """Return the field names declared in the file's autoSql, without
        the first three, or None when the file has no autoSql."""
        autoSql = self._bigBedFile.SQL()
        if not autoSql:
            return None

        colNames = list(AutoSqlDeclaration(autoSql).field_formatters.keys())
        # The first three fields (chrom, start, end) are handled separately.
        return colNames[3:]

    def _iter(self):
        return self

    def next(self):
        """Return a GenomeElement holding all entries of the next chromosome.

        Raises StopIteration, after closing the BigBed file, when there are
        no more chromosomes.
        """
        currentChrom = next(self._chrIter, None)
        if currentChrom is None:
            # No file was opened for an (almost) empty input file.
            bigBedFile = getattr(self, '_bigBedFile', None)
            if bigBedFile is not None:
                bigBedFile.close()
            raise StopIteration

        chrName, chrLength = currentChrom

        # NOTE(review): assumes entries() returns a non-empty list for every
        # chromosome listed by chroms() - confirm against pyBigWig.
        entries = self._bigBedFile.entries(str(chrName), 0, chrLength)
        # self._extraColNames are initialized during the first iteration
        if self._extraColNames is None:
            self._initExtraCols(entries)

        start, end = self._parseStartAndEnd(entries)

        ge = GenomeElement(genome=self._genome, chr=chrName,
                           start=start, end=end)

        if self._numOfExtraCols != 0:
            # The third item of each entry is the tab-separated rest of the
            # BED line; parse all of them into one structured array.
            strVals = [entry[2] for entry in entries]
            values = np.genfromtxt(strVals, dtype=None, names=self._extraColNames, delimiter='\t')
            if values.size == 1:
                # A single row comes back as a 0-d array; make it 1-d so the
                # column arrays below have a length.
                values = values.reshape((1,))

            tmpColNames = self._extraColNames[:]
            if 'score' in self._extraColNames:
                ge.val = np.array(self._parseValVec(values['score']), dtype=np.int32)
                tmpColNames.remove('score')
            if 'strand' in self._extraColNames:
                ge.strand = np.array(self._getStrandFromStringVec(values['strand']), dtype=np.int8)
                tmpColNames.remove('strand')
            # All remaining columns are stored as string arrays.
            for colName in tmpColNames:
                setattr(ge, colName, values[colName].astype(str))

        return ge

    def _initExtraCols(self, entries):
        """Derive the extra column names from the first entry, for files
        without an autoSql declaration.

        The number of tab-separated fields in the third item of the first
        entry decides how many names are taken from BED_EXTRA_COLUMNS.
        """
        firstEntry = entries[0]
        # Index 2 only exists when the entry has more than two items.
        if len(firstEntry) > 2 and firstEntry[2]:
            numExtraCols = len(firstEntry[2].split('\t'))
        else:
            numExtraCols = 0

        self._extraColNames = self.BED_EXTRA_COLUMNS[:numExtraCols]
        self._numOfExtraCols = numExtraCols

    def _parseStartAndEnd(self, entries):
        """Return (starts, ends) as two int32 arrays taken from the first
        two items of every entry."""
        tupleVals = [(entry[0], entry[1]) for entry in entries]
        intervals = np.array(tupleVals, dtype=np.dtype([('start', 'int32'), ('end', 'int32')]))

        return intervals['start'], intervals['end']

    def _parseVal(self, strVal):
        # '-' and '.' stand for a missing score and are read as 0.
        if strVal in ['-', '.']:
            return 0
        return int(strVal)

    @classmethod
    def _getStrandFromString(cls, val):
        """Convert a strand character to 1 ('+'), 0 ('-') or
        BINARY_MISSING_VAL ('.'); raise InvalidFormatError otherwise."""
        if val == '+':
            return 1
        elif val == '-':
            return 0
        elif val == '.':
            return BINARY_MISSING_VAL
        else:
            raise InvalidFormatError(
                "Error: strand must be either '+', '-' or '.'. Value: %s" % val)

    def getValDataType(self):
        return 'int32'
Loading