Add validation with single vcf and bam file

Adds a command line program to validate mutations in a given vcf in a given bam file.
jrflab · May 27, 2015 · f11d784 · f11d784
commit f11d784
Show file tree

Hide file tree

Showing 6 changed files with 337 additions and 0 deletions.
diff --git a/README.md b/README.md
@@ -0,0 +1,19 @@
+# So U Found A Mutation? (SUFAM)
+
+Found a mutation in one or more samples? Now you want to check if they are in
+another sample. Unfortunately mutect, varscan or whatever other variant caller
+is not calling them. Use SUFAM. The super sensitive validation caller that calls
+everything on a given position.
+
+## Installation
+
+```
+git clone http://github.com/inodb/sufam
+cd sufam
+python setup.py install
+```
+
+## Run
+```
+sufam --help
+```
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,2 @@
+numpy>=1.9.2
+pandas>=0.16.1
diff --git a/setup.py b/setup.py
@@ -0,0 +1,30 @@
+#!/usr/bin/env python
+from setuptools import setup
+import sys, os
+
+version = '0.0.1'
+
+setup(name='sufam',
+      version=version,
+      description="So U Found A Mutation?",
+      long_description="""Validate mutations from supplied vcf in supplied bam""",
+      classifiers=[], # Get strings from http://pypi.python.org/pypi?%3Aaction=list_classifiers
+      keywords='Python validation mutation mpileup samtools pileup vcf bam',
+      author='Ino de Bruijn',
+      author_email='[email protected]',
+      maintainer='Ino de Bruijn',
+      maintainer_email='[email protected]',
+      url='https://github.com/inodb/sufam',
+      license='FreeBSD',
+      packages=['sufam'],
+      include_package_data=True,
+      zip_safe=False,
+      install_requires=['numpy>=1.9.2',
+                        'pandas>=0.16.1',
+                        ],
+      entry_points={
+          'console_scripts': [
+              'sufam = sufam.__main__:main'
+          ]
+      },
+      )
diff --git a/sufam/__init__.py b/sufam/__init__.py
diff --git a/sufam/__main__.py b/sufam/__main__.py
@@ -0,0 +1,192 @@
+#!/usr/bin/env python
+"""
+So U Found A Mutation? (SUFAM)
+
+Found a mutation in one or more samples? Now you want to check if they are in
+another sample. Unfortunately mutect, varscan or whatever other variant caller
+is not calling them. Use SUFAM. The super sensitive validation caller that calls
+everything on a given position.
+
+Author: inodb
+"""
+import argparse
+from collections import Counter
+import glob
+import subprocess
+import sys
+
+import numpy as np
+import pandas as pd
+
+from . import mpileup_parser
+
+
+def get_pile_up_baseparser(bam, chrom, pos1, pos2, reffa):
+    """Get vcf line of given chrom and pos from mpileup and baseparser"""
+    posmin = min(pos1, pos2)
+    posmax = max(pos1, pos2)
+    cmd = "samtools view -bh {bam} {chrom}:{pos1}-{pos2} " \
+            "| samtools mpileup -R -q 1 -f {reffa} -".format(bam=bam,chrom=chrom,pos1=posmin,pos2=posmax, reffa=reffa)
+    if pos1 == pos2:
+        cmd += " | grep -P '^{chrom}\\t{pos}\\t'".format(chrom=chrom,pos=pos1)
+    else:
+        cmd += " | tail -n +2"
+    sys.stderr.write("Running:\n{}\n".format(cmd))
+    rv = subprocess.Popen(cmd,
+            shell=True, stdout=subprocess.PIPE).communicate()[0]
+    return [mpileup_parser.parse(line) for line in rv.split("\n")[:-1]]
+
+
+def _most_common_al(x):
+    if x["ref"]:
+        bl = ["A","C","G","T"]
+        bl.remove(x["ref"])
+        mc = Counter({k:int(v) for k, v in dict(x.ix[bl]).iteritems()}).most_common(1)[0]
+        return pd.Series({"most_common_al":str(mc[0]),
+                        "most_common_al_count":str(mc[1]),
+                        "most_common_al_maf":str(mc[1]/float(x["cov"])) if float(x["cov"]) > 0 else "0"})
+    else:
+        return pd.Series({"most_common_al":None,
+                        "most_common_al_count":None,
+                        "most_common_al_maf":None})
+
+
+def _most_common_indel(x):
+    dels = Counter(x["-"].split(",")).most_common(1)[0] if x["-"] else None
+    ins = Counter(x["+"].split(",")).most_common(1)[0] if x["+"] else None
+    if ins and dels:
+        mc = dels if dels[1] >= ins[1] else ins
+        indel_type = "-" if dels[1] >= ins[1] else "+"
+    elif ins:
+        mc = ins
+        indel_type = "+"
+    elif dels:
+        mc = dels
+        indel_type = "-"
+    else:
+        return pd.Series({
+                        "most_common_indel_type":None,
+                        "most_common_indel":None,
+                        "most_common_indel_count":None,
+                        "most_common_indel_maf":None})
+    return pd.Series({
+                    "most_common_indel_type":indel_type,
+                    "most_common_indel":str(mc[0]),
+                    "most_common_indel_count":str(mc[1]),
+                    "most_common_indel_maf":str(mc[1]/float(x["cov"]) if float(x["cov"]) > 0 else "0")})
+
+
+def get_baseparser_extended_df(bam, sample, chrom, pos1, pos2, reffa):
+    """Turn baseParser results into a dataframe"""
+    columns = "chrom\tpos\tref\tcov\tA\tC\tG\tT\t*\t-\t+\tX".split()
+    bp_lines = get_pile_up_baseparser(bam, chrom, pos1, pos2, reffa)
+
+    # change baseparser output to get most common maf per indel
+    bpdf = pd.DataFrame([[bam.split("/")[1][:-4]] + l.rstrip('\n').split("\t") for l in bp_lines if len(l) > 0],
+                        columns=["sample"] + columns, dtype=np.object)
+
+    if len(bpdf) == 0:
+        return None
+
+    bpdf = pd.concat([bpdf, bpdf.apply(_most_common_indel, axis=1)], axis=1)
+    bpdf = pd.concat([bpdf, bpdf.apply(_most_common_al, axis=1)], axis=1)
+    bpdf["most_common_count"] = bpdf.apply(lambda x: max([x.most_common_al_count, x.most_common_indel_count]), axis=1)
+    bpdf["most_common_maf"] = bpdf.apply(lambda x: max([x.most_common_al_maf, x.most_common_indel_maf]), axis=1)
+
+    return bpdf
+
+
+def filter_out_mutations_in_normal(tumordf, normaldf, most_common_maf_min=0.2,
+                                   most_common_count_maf_threshold=20,
+                                   most_common_count_min=1):
+    """Remove mutations that are in normal"""
+    df = tumordf.merge(normaldf, on=["chrom","pos"], suffixes=("_T","_N"))
+
+    # filters
+    common_al = (df.most_common_al_count_T == df.most_common_count_T) & (df.most_common_al_T == df.most_common_al_N)
+    common_indel = (df.most_common_indel_count_T == df.most_common_count_T) & (df.most_common_indel_T == df.most_common_indel_N)
+    normal_criteria = ((df.most_common_count_N >= most_common_count_maf_threshold) & (df.most_common_maf_N > most_common_maf_min)) | \
+                      ((df.most_common_count_N < most_common_count_maf_threshold) & (df.most_common_count_N > most_common_count_min))
+    df = df[~(common_al | common_indel) & normal_criteria]
+
+    # restore column names of tumor
+    for c in df.columns:
+        if c.endswith("_N"):
+            del df[c]
+    df.columns = [c[:-2] if c.endswith("_T") else c for c in df.columns]
+
+    return df
+
+
+def select_only_revertant_mutations(bpdf, snv=None, ins=None, dlt=None):
+    """
+    Selects only mutations that revert the given mutations in a single event.
+    """
+    if sum([bool(snv), bool(ins), bool(dlt)]) != 1:
+        raise(Exception("Should be either snv, ins or del".format(snv)))
+
+    if snv:
+        if snv not in ["A","C","G","T"]:
+            raise(Exception("snv {} should be A, C, G or T".format(snv)))
+            return bpdf[(bpdf.most_common_al == snv) & (bpdf.most_common_al_count == bpdf.most_common_count)]
+    elif bool(ins):
+        return \
+            bpdf[((bpdf.most_common_indel.apply(lambda x: len(x) + len(ins) % 3 if x else None) == 0 ) & (bpdf.most_common_indel_type == "+") & (bpdf.most_common_count == bpdf.most_common_indel_count)) |
+                    ((bpdf.most_common_indel.apply(lambda x: len(ins) - len(x) % 3 if x else None) == 0 ) & (bpdf.most_common_indel_type == "-") & (bpdf.most_common_count == bpdf.most_common_indel_count))]
+    elif bool(dlt):
+        return \
+            bpdf[((bpdf.most_common_indel.apply(lambda x: len(x) - len(dlt) % 3 if x else None) == 0) & (bpdf.most_common_indel_type == "+") & (bpdf.most_common_count == bpdf.most_common_indel_count)) |
+                    ((bpdf.most_common_indel.apply(lambda x: -len(dlt) - len(x) % 3 if x else None) == 0 ) & (bpdf.most_common_indel_type == "-") & (bpdf.most_common_count == bpdf.most_common_indel_count))]
+    else:
+        # should never happen
+        raise(Exception("No mutation given?"))
+
+
+def validate_mutations(vcffile, bam, reffa, sample):
+    """Check if mutations in vcf are in bam"""
+    header = []
+    for line in open(vcffile):
+        if line.startswith("#CHROM"):
+            header = line[1:].split("\t")
+        if line.startswith("#"):
+            continue
+        record = dict(zip(header, line.split("\t")))
+        row = []
+        bpdf = get_baseparser_extended_df(bam, sample, record["CHROM"], record["POS"], record["POS"], reffa)
+        if len(record["REF"]) == len(record["ALT"]):
+            if len(bpdf[(bpdf.most_common_al == record["ALT"]) & (bpdf.most_common_al_maf > 0)]) > 0:
+                row += [True]
+            else:
+                row += [False]
+        else:
+            # deletion
+            if len(record["REF"]) > len(record["ALT"]):
+                if len(bpdf[(bpdf.most_common_indel == record["REF"][1:]) & (bpdf.most_common_indel_maf > 0)]) > 0:
+                    row += [True]
+                else:
+                    row += [False]
+            # insertion
+            else:
+                if len(bpdf[(bpdf.most_common_indel == record["ALT"][1:]) & (bpdf.most_common_indel_maf > 0)]) > 0:
+                    row += [True]
+                else:
+                    row += [False]
+    print "\t".join([str(int(x)) for x in row])
+
+
+def main():
+    parser = argparse.ArgumentParser(description=__doc__,
+        formatter_class=argparse.RawDescriptionHelpFormatter)
+    parser.add_argument("reffa", type=str, help="Reference genome (fasta)")
+    parser.add_argument("vcf", type=str, help="VCF with mutations to be validated")
+    parser.add_argument("bam", type=str, help="BAM to find mutations in")
+    parser.add_argument("--sample_name", type=str, default=None, help="Set name "
+                        "of sample, used in output.")
+    args = parser.parse_args()
+    if args.sample_name is None:
+        args.sample_name = args.bam
+    validate_mutations(args.vcf, args.bam, args.reffa, args.sample_name)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/sufam/mpileup_parser.py b/sufam/mpileup_parser.py
@@ -0,0 +1,94 @@
+#! /usr/bin/env python
+###############################################################################
+# parses pileup base string and returns the counts for all possible alleles
+# for each position
+# reads input (mpileup output) from sys.stdin
+#
+# Author: Niknafs (Noushin Niknafs)
+# Edit: inodb
+###############################################################################
+
+import os
+import sys
+
+class ParseString(object):
+
+    def __init__(self, ref, string):
+        self.ref = ref.upper()
+        self.string = string.upper()
+        self.types = {'A':0,'G':0,'C':0,'T':0,'-':[],'*':0,'+':[],'X':[]}
+        self.process()
+
+    def process(self):
+        # remove end of read character
+        self.string = self.string.replace('$','')
+        while self.string != '':
+            if self.string[0] == '^':
+                # skip two characters when encountering '^' as it indicates
+                # a read start mark and the read mapping quality
+                self.string = self.string[2:]
+            elif self.string[0] == '*':
+                self.types['*'] += 1
+                # skip to next character
+                self.string = self.string[1:]
+
+            elif self.string[0] in ['.',',']:
+                if (len(self.string)== 1) or (self.string[1] not in ['+','-']):
+                    # a reference base
+                    self.types[self.ref] += 1
+                    self.string = self.string[1:]
+                elif self.string[1] == '+':
+                    insertionLength = int(self.string[2])
+                    insertionSeq = self.string[3:3+ insertionLength]
+                    self.types['+'].append(insertionSeq)
+                    self.string = self.string[3+insertionLength:]
+                elif self.string[1] == '-':
+                    deletionLength = int(self.string[2])
+                    deletionSeq = self.string[3:3+deletionLength]
+                    self.types['-'].append(deletionSeq)
+                    self.string = self.string[3+deletionLength:]
+
+            elif self.types.has_key(self.string[0]) and\
+                 ((len(self.string)==1) or (self.string[1] not in ['-','+'])):
+                # one of the four bases
+                if self.string[0] in ["A","C","G","T"]:
+                    self.types[self.string[0]] += 1
+                    self.string = self.string[1:]
+                elif self.string[0] == '+':
+                    insertionLength = int(self.string[1])
+                    insertionSeq = self.string[2:2+ insertionLength]
+                    self.types['+'].append(insertionSeq)
+                    self.string = self.string[2+insertionLength:]
+                elif self.string[0] == '-':
+                    deletionLength = int(self.string[1])
+                    deletionSeq = self.string[2:2+deletionLength]
+                    self.types['-'].append(deletionSeq)
+                    self.string = self.string[2+deletionLength:]
+            else:
+                # unrecognized character
+                # or a read that reports a substitition followed by an insertion/deletion
+                self.types['X'].append(self.string[0])
+                self.string = self.string[1:]
+        return
+
+    def __repr__(self):
+        types = self.types
+        return '\t'.join(map(str,[types['A'], types['C'], types['G'],types['T'],\
+                                  types['*']]) +\
+                         map(','.join, [types['-'],types['+'],types['X']]))
+
+
+def parse(line):
+    toks = line.strip('\n').split('\t')
+    ref = toks[2].upper()
+    cov = toks[3]
+    return '\t'.join([toks[0], toks[1], ref, cov]) + '\t' + str(ParseString(ref, toks[4]))
+
+
+def main():
+    print >>sys.stdout, "chrom\tpos\tref\tcov\tA\tC\tG\tT\t*\t-\t+\tX"
+    for line in sys.stdin:
+        print >>sys.stdout, parse_mpileup_line(line)
+
+if __name__ == '__main__':
+    main()