Skip to content
This repository has been archived by the owner on May 19, 2020. It is now read-only.

Commit

Permalink
Add validation with single vcf and bam file
Browse files Browse the repository at this point in the history
Adds a command line program to validate mutations in a given vcf in a given bam
file.
  • Loading branch information
inodb committed May 27, 2015
0 parents commit f11d784
Show file tree
Hide file tree
Showing 6 changed files with 337 additions and 0 deletions.
19 changes: 19 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# So U Found A Mutation? (SUFAM)

Found a mutation in one or more samples? Now you want to check if they are in
another sample. Unfortunately mutect, varscan or whatever other variant caller
is not calling them. Use SUFAM. The super sensitive validation caller that calls
everything on a given position.

## Installation

```
git clone http://github.com/inodb/sufam
cd sufam
python setup.py install
```

## Run
```
sufam --help
```
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
numpy>=1.9.2
pandas>=0.16.1
30 changes: 30 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#!/usr/bin/env python
from setuptools import setup
import sys, os

version = '0.0.1'

setup(name='sufam',
version=version,
description="So U Found A Mutation?",
long_description="""Validate mutations from supplied vcf in supplied bam""",
classifiers=[], # Get strings from http://pypi.python.org/pypi?%3Aaction=list_classifiers
keywords='Python validation mutation mpileup samtools pileup vcf bam',
author='Ino de Bruijn',
author_email='[email protected]',
maintainer='Ino de Bruijn',
maintainer_email='[email protected]',
url='https://github.com/inodb/sufam',
license='FreeBSD',
packages=['sufam'],
include_package_data=True,
zip_safe=False,
install_requires=['numpy>=1.9.2',
'pandas>=0.16.1',
],
entry_points={
'console_scripts': [
'sufam = sufam.__main__:main'
]
},
)
Empty file added sufam/__init__.py
Empty file.
192 changes: 192 additions & 0 deletions sufam/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,192 @@
#!/usr/bin/env python
"""
So U Found A Mutation? (SUFAM)
Found a mutation in one or more samples? Now you want to check if they are in
another sample. Unfortunately mutect, varscan or whatever other variant caller
is not calling them. Use SUFAM. The super sensitive validation caller that calls
everything on a given position.
Author: inodb
"""
import argparse
from collections import Counter
import glob
import subprocess
import sys

import numpy as np
import pandas as pd

from . import mpileup_parser


def get_pile_up_baseparser(bam, chrom, pos1, pos2, reffa):
"""Get vcf line of given chrom and pos from mpileup and baseparser"""
posmin = min(pos1, pos2)
posmax = max(pos1, pos2)
cmd = "samtools view -bh {bam} {chrom}:{pos1}-{pos2} " \
"| samtools mpileup -R -q 1 -f {reffa} -".format(bam=bam,chrom=chrom,pos1=posmin,pos2=posmax, reffa=reffa)
if pos1 == pos2:
cmd += " | grep -P '^{chrom}\\t{pos}\\t'".format(chrom=chrom,pos=pos1)
else:
cmd += " | tail -n +2"
sys.stderr.write("Running:\n{}\n".format(cmd))
rv = subprocess.Popen(cmd,
shell=True, stdout=subprocess.PIPE).communicate()[0]
return [mpileup_parser.parse(line) for line in rv.split("\n")[:-1]]


def _most_common_al(x):
if x["ref"]:
bl = ["A","C","G","T"]
bl.remove(x["ref"])
mc = Counter({k:int(v) for k, v in dict(x.ix[bl]).iteritems()}).most_common(1)[0]
return pd.Series({"most_common_al":str(mc[0]),
"most_common_al_count":str(mc[1]),
"most_common_al_maf":str(mc[1]/float(x["cov"])) if float(x["cov"]) > 0 else "0"})
else:
return pd.Series({"most_common_al":None,
"most_common_al_count":None,
"most_common_al_maf":None})


def _most_common_indel(x):
dels = Counter(x["-"].split(",")).most_common(1)[0] if x["-"] else None
ins = Counter(x["+"].split(",")).most_common(1)[0] if x["+"] else None
if ins and dels:
mc = dels if dels[1] >= ins[1] else ins
indel_type = "-" if dels[1] >= ins[1] else "+"
elif ins:
mc = ins
indel_type = "+"
elif dels:
mc = dels
indel_type = "-"
else:
return pd.Series({
"most_common_indel_type":None,
"most_common_indel":None,
"most_common_indel_count":None,
"most_common_indel_maf":None})
return pd.Series({
"most_common_indel_type":indel_type,
"most_common_indel":str(mc[0]),
"most_common_indel_count":str(mc[1]),
"most_common_indel_maf":str(mc[1]/float(x["cov"]) if float(x["cov"]) > 0 else "0")})


def get_baseparser_extended_df(bam, sample, chrom, pos1, pos2, reffa):
"""Turn baseParser results into a dataframe"""
columns = "chrom\tpos\tref\tcov\tA\tC\tG\tT\t*\t-\t+\tX".split()
bp_lines = get_pile_up_baseparser(bam, chrom, pos1, pos2, reffa)

# change baseparser output to get most common maf per indel
bpdf = pd.DataFrame([[bam.split("/")[1][:-4]] + l.rstrip('\n').split("\t") for l in bp_lines if len(l) > 0],
columns=["sample"] + columns, dtype=np.object)

if len(bpdf) == 0:
return None

bpdf = pd.concat([bpdf, bpdf.apply(_most_common_indel, axis=1)], axis=1)
bpdf = pd.concat([bpdf, bpdf.apply(_most_common_al, axis=1)], axis=1)
bpdf["most_common_count"] = bpdf.apply(lambda x: max([x.most_common_al_count, x.most_common_indel_count]), axis=1)
bpdf["most_common_maf"] = bpdf.apply(lambda x: max([x.most_common_al_maf, x.most_common_indel_maf]), axis=1)

return bpdf


def filter_out_mutations_in_normal(tumordf, normaldf, most_common_maf_min=0.2,
most_common_count_maf_threshold=20,
most_common_count_min=1):
"""Remove mutations that are in normal"""
df = tumordf.merge(normaldf, on=["chrom","pos"], suffixes=("_T","_N"))

# filters
common_al = (df.most_common_al_count_T == df.most_common_count_T) & (df.most_common_al_T == df.most_common_al_N)
common_indel = (df.most_common_indel_count_T == df.most_common_count_T) & (df.most_common_indel_T == df.most_common_indel_N)
normal_criteria = ((df.most_common_count_N >= most_common_count_maf_threshold) & (df.most_common_maf_N > most_common_maf_min)) | \
((df.most_common_count_N < most_common_count_maf_threshold) & (df.most_common_count_N > most_common_count_min))
df = df[~(common_al | common_indel) & normal_criteria]

# restore column names of tumor
for c in df.columns:
if c.endswith("_N"):
del df[c]
df.columns = [c[:-2] if c.endswith("_T") else c for c in df.columns]

return df


def select_only_revertant_mutations(bpdf, snv=None, ins=None, dlt=None):
"""
Selects only mutations that revert the given mutations in a single event.
"""
if sum([bool(snv), bool(ins), bool(dlt)]) != 1:
raise(Exception("Should be either snv, ins or del".format(snv)))

if snv:
if snv not in ["A","C","G","T"]:
raise(Exception("snv {} should be A, C, G or T".format(snv)))
return bpdf[(bpdf.most_common_al == snv) & (bpdf.most_common_al_count == bpdf.most_common_count)]
elif bool(ins):
return \
bpdf[((bpdf.most_common_indel.apply(lambda x: len(x) + len(ins) % 3 if x else None) == 0 ) & (bpdf.most_common_indel_type == "+") & (bpdf.most_common_count == bpdf.most_common_indel_count)) |
((bpdf.most_common_indel.apply(lambda x: len(ins) - len(x) % 3 if x else None) == 0 ) & (bpdf.most_common_indel_type == "-") & (bpdf.most_common_count == bpdf.most_common_indel_count))]
elif bool(dlt):
return \
bpdf[((bpdf.most_common_indel.apply(lambda x: len(x) - len(dlt) % 3 if x else None) == 0) & (bpdf.most_common_indel_type == "+") & (bpdf.most_common_count == bpdf.most_common_indel_count)) |
((bpdf.most_common_indel.apply(lambda x: -len(dlt) - len(x) % 3 if x else None) == 0 ) & (bpdf.most_common_indel_type == "-") & (bpdf.most_common_count == bpdf.most_common_indel_count))]
else:
# should never happen
raise(Exception("No mutation given?"))


def validate_mutations(vcffile, bam, reffa, sample):
"""Check if mutations in vcf are in bam"""
header = []
for line in open(vcffile):
if line.startswith("#CHROM"):
header = line[1:].split("\t")
if line.startswith("#"):
continue
record = dict(zip(header, line.split("\t")))
row = []
bpdf = get_baseparser_extended_df(bam, sample, record["CHROM"], record["POS"], record["POS"], reffa)
if len(record["REF"]) == len(record["ALT"]):
if len(bpdf[(bpdf.most_common_al == record["ALT"]) & (bpdf.most_common_al_maf > 0)]) > 0:
row += [True]
else:
row += [False]
else:
# deletion
if len(record["REF"]) > len(record["ALT"]):
if len(bpdf[(bpdf.most_common_indel == record["REF"][1:]) & (bpdf.most_common_indel_maf > 0)]) > 0:
row += [True]
else:
row += [False]
# insertion
else:
if len(bpdf[(bpdf.most_common_indel == record["ALT"][1:]) & (bpdf.most_common_indel_maf > 0)]) > 0:
row += [True]
else:
row += [False]
print "\t".join([str(int(x)) for x in row])


def main():
parser = argparse.ArgumentParser(description=__doc__,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument("reffa", type=str, help="Reference genome (fasta)")
parser.add_argument("vcf", type=str, help="VCF with mutations to be validated")
parser.add_argument("bam", type=str, help="BAM to find mutations in")
parser.add_argument("--sample_name", type=str, default=None, help="Set name "
"of sample, used in output.")
args = parser.parse_args()
if args.sample_name is None:
args.sample_name = args.bam
validate_mutations(args.vcf, args.bam, args.reffa, args.sample_name)


if __name__ == "__main__":
main()
94 changes: 94 additions & 0 deletions sufam/mpileup_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
#! /usr/bin/env python
###############################################################################
# parses pileup base string and returns the counts for all possible alleles
# for each position
# reads input (mpileup output) from sys.stdin
#
# Author: Niknafs (Noushin Niknafs)
# Edit: inodb
###############################################################################

import os
import sys

class ParseString(object):

def __init__(self, ref, string):
self.ref = ref.upper()
self.string = string.upper()
self.types = {'A':0,'G':0,'C':0,'T':0,'-':[],'*':0,'+':[],'X':[]}
self.process()

def process(self):
# remove end of read character
self.string = self.string.replace('$','')
while self.string != '':
if self.string[0] == '^':
# skip two characters when encountering '^' as it indicates
# a read start mark and the read mapping quality
self.string = self.string[2:]
elif self.string[0] == '*':
self.types['*'] += 1
# skip to next character
self.string = self.string[1:]

elif self.string[0] in ['.',',']:
if (len(self.string)== 1) or (self.string[1] not in ['+','-']):
# a reference base
self.types[self.ref] += 1
self.string = self.string[1:]
elif self.string[1] == '+':
insertionLength = int(self.string[2])
insertionSeq = self.string[3:3+ insertionLength]
self.types['+'].append(insertionSeq)
self.string = self.string[3+insertionLength:]
elif self.string[1] == '-':
deletionLength = int(self.string[2])
deletionSeq = self.string[3:3+deletionLength]
self.types['-'].append(deletionSeq)
self.string = self.string[3+deletionLength:]

elif self.types.has_key(self.string[0]) and\
((len(self.string)==1) or (self.string[1] not in ['-','+'])):
# one of the four bases
if self.string[0] in ["A","C","G","T"]:
self.types[self.string[0]] += 1
self.string = self.string[1:]
elif self.string[0] == '+':
insertionLength = int(self.string[1])
insertionSeq = self.string[2:2+ insertionLength]
self.types['+'].append(insertionSeq)
self.string = self.string[2+insertionLength:]
elif self.string[0] == '-':
deletionLength = int(self.string[1])
deletionSeq = self.string[2:2+deletionLength]
self.types['-'].append(deletionSeq)
self.string = self.string[2+deletionLength:]
else:
# unrecognized character
# or a read that reports a substitition followed by an insertion/deletion
self.types['X'].append(self.string[0])
self.string = self.string[1:]
return

def __repr__(self):
types = self.types
return '\t'.join(map(str,[types['A'], types['C'], types['G'],types['T'],\
types['*']]) +\
map(','.join, [types['-'],types['+'],types['X']]))


def parse(line):
toks = line.strip('\n').split('\t')
ref = toks[2].upper()
cov = toks[3]
return '\t'.join([toks[0], toks[1], ref, cov]) + '\t' + str(ParseString(ref, toks[4]))


def main():
print >>sys.stdout, "chrom\tpos\tref\tcov\tA\tC\tG\tT\t*\t-\t+\tX"
for line in sys.stdin:
print >>sys.stdout, parse_mpileup_line(line)

if __name__ == '__main__':
main()

0 comments on commit f11d784

Please sign in to comment.