Skip to content

Commit

Permalink
Hotfix VarScan VCFs that store Ref depth separately
Browse files Browse the repository at this point in the history
  • Loading branch information
ckandoth committed Apr 9, 2014
1 parent e498263 commit 4c83154
Showing 1 changed file with 28 additions and 19 deletions.
47 changes: 28 additions & 19 deletions vcf2maf.pl
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
my ( $vep_path, $vep_data ) = ( "~/vep", "~/.vep" );
my ( $snpeff_path, $snpeff_data ) = ( "~/snpEff", "~/snpEff/data" );
my ( $tumor_id, $normal_id ) = ( "TUMOR", "NORMAL" );
my ( $ncbi_build, $maf_center ) = ( 37, "mskcc.org" );
my ( $ncbi_build, $maf_center ) = ( 37, "." );

# Check for missing or crappy arguments
unless( @ARGV and $ARGV[0]=~m/^-/ ) {
Expand Down Expand Up @@ -142,35 +142,44 @@
$vcf_tumor_idx = $i if( $rest[$i] eq $vcf_tumor_id );
$vcf_normal_idx = $i if( $rest[$i] eq $vcf_normal_id );
}
( defined $vcf_tumor_idx ) or die "No genotypes for $vcf_tumor_id in VCF!\n";
( defined $vcf_tumor_idx ) or warn "No genotype column for $vcf_tumor_id in VCF!\n";
( defined $vcf_normal_idx ) or warn "No genotype column for $vcf_normal_id in VCF!\n";
}
next;
}

# Parse out the data in the info column, and store into a hash
my %info = map {( m/=/ ? ( split( /=/, $_, 2 )) : ( $_, 1 ))} split( /\;/, $info_line );

# If there are multiple ALT alleles, choose the one with the most supporting reads
# If there are multiple ALT alleles, choose the one with the most supporting read depth
# Or if allelic depths are unavailable, choose the first one listed under ALT
my @alleles = ( $ref, split( /,/, $alt ));
my $alt_allele_idx = 1;
my ( %var_sample_info, %nrm_sample_info );
if( $format_line and scalar( @rest ) > 0 ) {
my $var_allele_idx = 1;
my ( %tum_sample_info, %nrm_sample_info, @tum_allele_depths, @nrm_allele_depths );
if( $vcf_tumor_idx ) {

# Load into a hash, the FORMATted genotype info for the sample containing the variant
my @format_keys = split( /\:/, $format_line );
my ( $t_idx, $n_idx ) = ( 0, 0 );
%var_sample_info = map {( $format_keys[$t_idx++], $_ )} split( /\:/, $rest[$vcf_tumor_idx] );
%tum_sample_info = map {( $format_keys[$t_idx++], $_ )} split( /\:/, $rest[$vcf_tumor_idx] );
%nrm_sample_info = map {( $format_keys[$n_idx++], $_ )} split( /\:/, $rest[$vcf_normal_idx] ) if( defined $vcf_normal_idx );
if( defined $var_sample_info{AD} ) {
my @allelic_depth = split( /,/, $var_sample_info{AD} );
if( defined $tum_sample_info{AD} ) {
@tum_allele_depths = split( /,/, $tum_sample_info{AD} );
@nrm_allele_depths = split( /,/, $nrm_sample_info{AD} ) if( defined $nrm_sample_info{AD} );

# Handle VCFs generated by VarScan, where reference allele depths are stored separately
if( $#tum_allele_depths ne $#alleles and $tum_sample_info{RD} ) {
unshift( @tum_allele_depths, $tum_sample_info{RD} );
unshift( @nrm_allele_depths, $nrm_sample_info{RD} ) if( $#nrm_allele_depths ne $#alleles and $nrm_sample_info{RD} );
}

# The first depth listed belongs to the reference allele. Of the rest, find the largest
for( my $i = 1; $i <= $#allelic_depth; ++$i ) {
$alt_allele_idx = $i if( $allelic_depth[$i] > $allelic_depth[$alt_allele_idx] );
for( my $i = 1; $i <= $#tum_allele_depths; ++$i ) {
$var_allele_idx = $i if( $tum_allele_depths[$i] > $tum_allele_depths[$var_allele_idx] );
}
}
}
my ( $var ) = $alleles[$alt_allele_idx];
my ( $var ) = $alleles[$var_allele_idx];

# Figure out the appropriate start/stop loci and variant type/allele to report in the MAF
my $start = my $stop = my $var_type = "";
Expand Down Expand Up @@ -211,7 +220,7 @@
my %effect = map{s/\&/,/g; ( $vepcsq_cols[$idx++], $_ )} split( /\|/, $csq_line );

# Skip effects on other ALT alleles
if( $effect{ALLELE_NUM} == $alt_allele_idx ) {
if( $effect{ALLELE_NUM} == $var_allele_idx ) {

# Fix potential warnings about undefined variables
$effect{BIOTYPE} = '' unless( $effect{BIOTYPE} );
Expand Down Expand Up @@ -272,7 +281,7 @@
my %effect = map{( $snpeff_cols[$idx++], $_ )} ( $1, split( /\|/, $2 ));

# Skip transcripts with errors/warnings, or effects on other ALT alleles
unless( $effect{ERRORS} or $effect{WARNINGS} or $effect{Genotype_Number} != $alt_allele_idx ) {
unless( $effect{ERRORS} or $effect{WARNINGS} or $effect{Genotype_Number} != $var_allele_idx ) {

# Fix potential warnings about undefined variables
$effect{Transcript_BioType} = '' unless( $effect{Transcript_BioType} );
Expand Down Expand Up @@ -326,9 +335,9 @@
$maf_line{Reference_Allele} = $ref;
# If the genotypes are unavailable, then we'll assume it's ref/var heterozygous
$maf_line{Tumor_Seq_Allele1} = $ref;
if( defined $var_sample_info{GT} and $var_sample_info{GT} ne "." ) {
if( defined $tum_sample_info{GT} and $tum_sample_info{GT} ne "." ) {
# ::NOTE:: MAF format doesn't support triallelic genotypes. So break it down as necessary
my ( $idx1, $idx2 ) = split( /\//, $var_sample_info{GT} );
my ( $idx1, $idx2 ) = split( /\//, $tum_sample_info{GT} );
$maf_line{Tumor_Seq_Allele1} = ( $alleles[$idx1] eq $var ? $alleles[$idx2] : $alleles[$idx1] );
}
$maf_line{Tumor_Seq_Allele2} = $var;
Expand All @@ -339,10 +348,10 @@
$maf_line{HGVSp} = ( $maf_effect->{HGVSp} ? $maf_effect->{HGVSp} : '' );
$maf_line{Transcript_ID} = ( $maf_effect->{RefSeq} ? $maf_effect->{RefSeq} : ( $maf_effect->{Transcript_ID} ? $maf_effect->{Transcript_ID} : '' ));
$maf_line{Exon_Number} = ( $maf_effect->{EXON} ? $maf_effect->{EXON} : ( $maf_effect->{Exon_Rank} ? $maf_effect->{Exon_Rank} : '' ));
$maf_line{t_depth} = $var_sample_info{DP} if( defined $var_sample_info{DP} );
( $maf_line{t_ref_count}, $maf_line{t_alt_count} ) = split( /,/, $var_sample_info{AD} ) if( defined $var_sample_info{AD} );
$maf_line{t_depth} = $tum_sample_info{DP} if( defined $tum_sample_info{DP} );
( $maf_line{t_ref_count}, $maf_line{t_alt_count} ) = @tum_allele_depths[0,$var_allele_idx] if( @tum_allele_depths );
$maf_line{n_depth} = $nrm_sample_info{DP} if( defined $nrm_sample_info{DP} );
( $maf_line{n_ref_count}, $maf_line{n_alt_count} ) = split( /,/, $nrm_sample_info{AD} ) if( defined $nrm_sample_info{AD} );
( $maf_line{n_ref_count}, $maf_line{n_alt_count} ) = @nrm_allele_depths[0,$var_allele_idx] if( @nrm_allele_depths );

foreach my $col ( @maf_header ) {
$maf_fh->print( "\t" ) if ( $col ne $maf_header[0] );
Expand Down

0 comments on commit 4c83154

Please sign in to comment.