This repository has been archived by the owner on May 19, 2020. It is now read-only.
forked from inodb/sufam
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add validation with single vcf and bam file
Adds a command line program to validate mutations in a given vcf in a given bam file.
- Loading branch information
0 parents
commit f11d784
Showing
6 changed files
with
337 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
# So U Found A Mutation? (SUFAM) | ||
|
||
Found a mutation in one or more samples? Now you want to check if they are in | ||
another sample. Unfortunately mutect, varscan or whatever other variant caller | ||
is not calling them. Use SUFAM. The super sensitive validation caller that calls | ||
everything on a given position. | ||
|
||
## Installation | ||
|
||
``` | ||
git clone http://github.com/inodb/sufam | ||
cd sufam | ||
python setup.py install | ||
``` | ||
|
||
## Run | ||
``` | ||
sufam --help | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
numpy>=1.9.2 | ||
pandas>=0.16.1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
#!/usr/bin/env python | ||
from setuptools import setup | ||
import sys, os | ||
|
||
version = '0.0.1' | ||
|
||
setup(name='sufam', | ||
version=version, | ||
description="So U Found A Mutation?", | ||
long_description="""Validate mutations from supplied vcf in supplied bam""", | ||
classifiers=[], # Get strings from http://pypi.python.org/pypi?%3Aaction=list_classifiers | ||
keywords='Python validation mutation mpileup samtools pileup vcf bam', | ||
author='Ino de Bruijn', | ||
author_email='[email protected]', | ||
maintainer='Ino de Bruijn', | ||
maintainer_email='[email protected]', | ||
url='https://github.com/inodb/sufam', | ||
license='FreeBSD', | ||
packages=['sufam'], | ||
include_package_data=True, | ||
zip_safe=False, | ||
install_requires=['numpy>=1.9.2', | ||
'pandas>=0.16.1', | ||
], | ||
entry_points={ | ||
'console_scripts': [ | ||
'sufam = sufam.__main__:main' | ||
] | ||
}, | ||
) |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,192 @@ | ||
#!/usr/bin/env python | ||
""" | ||
So U Found A Mutation? (SUFAM) | ||
Found a mutation in one or more samples? Now you want to check if they are in | ||
another sample. Unfortunately mutect, varscan or whatever other variant caller | ||
is not calling them. Use SUFAM. The super sensitive validation caller that calls | ||
everything on a given position. | ||
Author: inodb | ||
""" | ||
import argparse | ||
from collections import Counter | ||
import glob | ||
import subprocess | ||
import sys | ||
|
||
import numpy as np | ||
import pandas as pd | ||
|
||
from . import mpileup_parser | ||
|
||
|
||
def get_pile_up_baseparser(bam, chrom, pos1, pos2, reffa): | ||
"""Get vcf line of given chrom and pos from mpileup and baseparser""" | ||
posmin = min(pos1, pos2) | ||
posmax = max(pos1, pos2) | ||
cmd = "samtools view -bh {bam} {chrom}:{pos1}-{pos2} " \ | ||
"| samtools mpileup -R -q 1 -f {reffa} -".format(bam=bam,chrom=chrom,pos1=posmin,pos2=posmax, reffa=reffa) | ||
if pos1 == pos2: | ||
cmd += " | grep -P '^{chrom}\\t{pos}\\t'".format(chrom=chrom,pos=pos1) | ||
else: | ||
cmd += " | tail -n +2" | ||
sys.stderr.write("Running:\n{}\n".format(cmd)) | ||
rv = subprocess.Popen(cmd, | ||
shell=True, stdout=subprocess.PIPE).communicate()[0] | ||
return [mpileup_parser.parse(line) for line in rv.split("\n")[:-1]] | ||
|
||
|
||
def _most_common_al(x): | ||
if x["ref"]: | ||
bl = ["A","C","G","T"] | ||
bl.remove(x["ref"]) | ||
mc = Counter({k:int(v) for k, v in dict(x.ix[bl]).iteritems()}).most_common(1)[0] | ||
return pd.Series({"most_common_al":str(mc[0]), | ||
"most_common_al_count":str(mc[1]), | ||
"most_common_al_maf":str(mc[1]/float(x["cov"])) if float(x["cov"]) > 0 else "0"}) | ||
else: | ||
return pd.Series({"most_common_al":None, | ||
"most_common_al_count":None, | ||
"most_common_al_maf":None}) | ||
|
||
|
||
def _most_common_indel(x): | ||
dels = Counter(x["-"].split(",")).most_common(1)[0] if x["-"] else None | ||
ins = Counter(x["+"].split(",")).most_common(1)[0] if x["+"] else None | ||
if ins and dels: | ||
mc = dels if dels[1] >= ins[1] else ins | ||
indel_type = "-" if dels[1] >= ins[1] else "+" | ||
elif ins: | ||
mc = ins | ||
indel_type = "+" | ||
elif dels: | ||
mc = dels | ||
indel_type = "-" | ||
else: | ||
return pd.Series({ | ||
"most_common_indel_type":None, | ||
"most_common_indel":None, | ||
"most_common_indel_count":None, | ||
"most_common_indel_maf":None}) | ||
return pd.Series({ | ||
"most_common_indel_type":indel_type, | ||
"most_common_indel":str(mc[0]), | ||
"most_common_indel_count":str(mc[1]), | ||
"most_common_indel_maf":str(mc[1]/float(x["cov"]) if float(x["cov"]) > 0 else "0")}) | ||
|
||
|
||
def get_baseparser_extended_df(bam, sample, chrom, pos1, pos2, reffa): | ||
"""Turn baseParser results into a dataframe""" | ||
columns = "chrom\tpos\tref\tcov\tA\tC\tG\tT\t*\t-\t+\tX".split() | ||
bp_lines = get_pile_up_baseparser(bam, chrom, pos1, pos2, reffa) | ||
|
||
# change baseparser output to get most common maf per indel | ||
bpdf = pd.DataFrame([[bam.split("/")[1][:-4]] + l.rstrip('\n').split("\t") for l in bp_lines if len(l) > 0], | ||
columns=["sample"] + columns, dtype=np.object) | ||
|
||
if len(bpdf) == 0: | ||
return None | ||
|
||
bpdf = pd.concat([bpdf, bpdf.apply(_most_common_indel, axis=1)], axis=1) | ||
bpdf = pd.concat([bpdf, bpdf.apply(_most_common_al, axis=1)], axis=1) | ||
bpdf["most_common_count"] = bpdf.apply(lambda x: max([x.most_common_al_count, x.most_common_indel_count]), axis=1) | ||
bpdf["most_common_maf"] = bpdf.apply(lambda x: max([x.most_common_al_maf, x.most_common_indel_maf]), axis=1) | ||
|
||
return bpdf | ||
|
||
|
||
def filter_out_mutations_in_normal(tumordf, normaldf, most_common_maf_min=0.2, | ||
most_common_count_maf_threshold=20, | ||
most_common_count_min=1): | ||
"""Remove mutations that are in normal""" | ||
df = tumordf.merge(normaldf, on=["chrom","pos"], suffixes=("_T","_N")) | ||
|
||
# filters | ||
common_al = (df.most_common_al_count_T == df.most_common_count_T) & (df.most_common_al_T == df.most_common_al_N) | ||
common_indel = (df.most_common_indel_count_T == df.most_common_count_T) & (df.most_common_indel_T == df.most_common_indel_N) | ||
normal_criteria = ((df.most_common_count_N >= most_common_count_maf_threshold) & (df.most_common_maf_N > most_common_maf_min)) | \ | ||
((df.most_common_count_N < most_common_count_maf_threshold) & (df.most_common_count_N > most_common_count_min)) | ||
df = df[~(common_al | common_indel) & normal_criteria] | ||
|
||
# restore column names of tumor | ||
for c in df.columns: | ||
if c.endswith("_N"): | ||
del df[c] | ||
df.columns = [c[:-2] if c.endswith("_T") else c for c in df.columns] | ||
|
||
return df | ||
|
||
|
||
def select_only_revertant_mutations(bpdf, snv=None, ins=None, dlt=None): | ||
""" | ||
Selects only mutations that revert the given mutations in a single event. | ||
""" | ||
if sum([bool(snv), bool(ins), bool(dlt)]) != 1: | ||
raise(Exception("Should be either snv, ins or del".format(snv))) | ||
|
||
if snv: | ||
if snv not in ["A","C","G","T"]: | ||
raise(Exception("snv {} should be A, C, G or T".format(snv))) | ||
return bpdf[(bpdf.most_common_al == snv) & (bpdf.most_common_al_count == bpdf.most_common_count)] | ||
elif bool(ins): | ||
return \ | ||
bpdf[((bpdf.most_common_indel.apply(lambda x: len(x) + len(ins) % 3 if x else None) == 0 ) & (bpdf.most_common_indel_type == "+") & (bpdf.most_common_count == bpdf.most_common_indel_count)) | | ||
((bpdf.most_common_indel.apply(lambda x: len(ins) - len(x) % 3 if x else None) == 0 ) & (bpdf.most_common_indel_type == "-") & (bpdf.most_common_count == bpdf.most_common_indel_count))] | ||
elif bool(dlt): | ||
return \ | ||
bpdf[((bpdf.most_common_indel.apply(lambda x: len(x) - len(dlt) % 3 if x else None) == 0) & (bpdf.most_common_indel_type == "+") & (bpdf.most_common_count == bpdf.most_common_indel_count)) | | ||
((bpdf.most_common_indel.apply(lambda x: -len(dlt) - len(x) % 3 if x else None) == 0 ) & (bpdf.most_common_indel_type == "-") & (bpdf.most_common_count == bpdf.most_common_indel_count))] | ||
else: | ||
# should never happen | ||
raise(Exception("No mutation given?")) | ||
|
||
|
||
def validate_mutations(vcffile, bam, reffa, sample): | ||
"""Check if mutations in vcf are in bam""" | ||
header = [] | ||
for line in open(vcffile): | ||
if line.startswith("#CHROM"): | ||
header = line[1:].split("\t") | ||
if line.startswith("#"): | ||
continue | ||
record = dict(zip(header, line.split("\t"))) | ||
row = [] | ||
bpdf = get_baseparser_extended_df(bam, sample, record["CHROM"], record["POS"], record["POS"], reffa) | ||
if len(record["REF"]) == len(record["ALT"]): | ||
if len(bpdf[(bpdf.most_common_al == record["ALT"]) & (bpdf.most_common_al_maf > 0)]) > 0: | ||
row += [True] | ||
else: | ||
row += [False] | ||
else: | ||
# deletion | ||
if len(record["REF"]) > len(record["ALT"]): | ||
if len(bpdf[(bpdf.most_common_indel == record["REF"][1:]) & (bpdf.most_common_indel_maf > 0)]) > 0: | ||
row += [True] | ||
else: | ||
row += [False] | ||
# insertion | ||
else: | ||
if len(bpdf[(bpdf.most_common_indel == record["ALT"][1:]) & (bpdf.most_common_indel_maf > 0)]) > 0: | ||
row += [True] | ||
else: | ||
row += [False] | ||
print "\t".join([str(int(x)) for x in row]) | ||
|
||
|
||
def main(): | ||
parser = argparse.ArgumentParser(description=__doc__, | ||
formatter_class=argparse.RawDescriptionHelpFormatter) | ||
parser.add_argument("reffa", type=str, help="Reference genome (fasta)") | ||
parser.add_argument("vcf", type=str, help="VCF with mutations to be validated") | ||
parser.add_argument("bam", type=str, help="BAM to find mutations in") | ||
parser.add_argument("--sample_name", type=str, default=None, help="Set name " | ||
"of sample, used in output.") | ||
args = parser.parse_args() | ||
if args.sample_name is None: | ||
args.sample_name = args.bam | ||
validate_mutations(args.vcf, args.bam, args.reffa, args.sample_name) | ||
|
||
|
||
if __name__ == "__main__": | ||
main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
#! /usr/bin/env python | ||
############################################################################### | ||
# parses pileup base string and returns the counts for all possible alleles | ||
# for each position | ||
# reads input (mpileup output) from sys.stdin | ||
# | ||
# Author: Niknafs (Noushin Niknafs) | ||
# Edit: inodb | ||
############################################################################### | ||
|
||
import os | ||
import sys | ||
|
||
class ParseString(object): | ||
|
||
def __init__(self, ref, string): | ||
self.ref = ref.upper() | ||
self.string = string.upper() | ||
self.types = {'A':0,'G':0,'C':0,'T':0,'-':[],'*':0,'+':[],'X':[]} | ||
self.process() | ||
|
||
def process(self): | ||
# remove end of read character | ||
self.string = self.string.replace('$','') | ||
while self.string != '': | ||
if self.string[0] == '^': | ||
# skip two characters when encountering '^' as it indicates | ||
# a read start mark and the read mapping quality | ||
self.string = self.string[2:] | ||
elif self.string[0] == '*': | ||
self.types['*'] += 1 | ||
# skip to next character | ||
self.string = self.string[1:] | ||
|
||
elif self.string[0] in ['.',',']: | ||
if (len(self.string)== 1) or (self.string[1] not in ['+','-']): | ||
# a reference base | ||
self.types[self.ref] += 1 | ||
self.string = self.string[1:] | ||
elif self.string[1] == '+': | ||
insertionLength = int(self.string[2]) | ||
insertionSeq = self.string[3:3+ insertionLength] | ||
self.types['+'].append(insertionSeq) | ||
self.string = self.string[3+insertionLength:] | ||
elif self.string[1] == '-': | ||
deletionLength = int(self.string[2]) | ||
deletionSeq = self.string[3:3+deletionLength] | ||
self.types['-'].append(deletionSeq) | ||
self.string = self.string[3+deletionLength:] | ||
|
||
elif self.types.has_key(self.string[0]) and\ | ||
((len(self.string)==1) or (self.string[1] not in ['-','+'])): | ||
# one of the four bases | ||
if self.string[0] in ["A","C","G","T"]: | ||
self.types[self.string[0]] += 1 | ||
self.string = self.string[1:] | ||
elif self.string[0] == '+': | ||
insertionLength = int(self.string[1]) | ||
insertionSeq = self.string[2:2+ insertionLength] | ||
self.types['+'].append(insertionSeq) | ||
self.string = self.string[2+insertionLength:] | ||
elif self.string[0] == '-': | ||
deletionLength = int(self.string[1]) | ||
deletionSeq = self.string[2:2+deletionLength] | ||
self.types['-'].append(deletionSeq) | ||
self.string = self.string[2+deletionLength:] | ||
else: | ||
# unrecognized character | ||
# or a read that reports a substitition followed by an insertion/deletion | ||
self.types['X'].append(self.string[0]) | ||
self.string = self.string[1:] | ||
return | ||
|
||
def __repr__(self): | ||
types = self.types | ||
return '\t'.join(map(str,[types['A'], types['C'], types['G'],types['T'],\ | ||
types['*']]) +\ | ||
map(','.join, [types['-'],types['+'],types['X']])) | ||
|
||
|
||
def parse(line): | ||
toks = line.strip('\n').split('\t') | ||
ref = toks[2].upper() | ||
cov = toks[3] | ||
return '\t'.join([toks[0], toks[1], ref, cov]) + '\t' + str(ParseString(ref, toks[4])) | ||
|
||
|
||
def main(): | ||
print >>sys.stdout, "chrom\tpos\tref\tcov\tA\tC\tG\tT\t*\t-\t+\tX" | ||
for line in sys.stdin: | ||
print >>sys.stdout, parse_mpileup_line(line) | ||
|
||
if __name__ == '__main__': | ||
main() |