diff --git a/vcf2maf.pl b/vcf2maf.pl index fd40902..5fee8f4 100644 --- a/vcf2maf.pl +++ b/vcf2maf.pl @@ -10,7 +10,7 @@ my ( $vep_path, $vep_data ) = ( "~/vep", "~/.vep" ); my ( $snpeff_path, $snpeff_data ) = ( "~/snpEff", "~/snpEff/data" ); my ( $tumor_id, $normal_id ) = ( "TUMOR", "NORMAL" ); -my ( $ncbi_build, $maf_center ) = ( 37, "mskcc.org" ); +my ( $ncbi_build, $maf_center ) = ( 37, "." ); # Check for missing or crappy arguments unless( @ARGV and $ARGV[0]=~m/^-/ ) { @@ -142,7 +142,8 @@ $vcf_tumor_idx = $i if( $rest[$i] eq $vcf_tumor_id ); $vcf_normal_idx = $i if( $rest[$i] eq $vcf_normal_id ); } - ( defined $vcf_tumor_idx ) or die "No genotypes for $vcf_tumor_id in VCF!\n"; + ( defined $vcf_tumor_idx ) or warn "No genotype column for $vcf_tumor_id in VCF!\n"; + ( defined $vcf_normal_idx ) or warn "No genotype column for $vcf_normal_id in VCF!\n"; } next; } @@ -150,27 +151,35 @@ # Parse out the data in the info column, and store into a hash my %info = map {( m/=/ ? ( split( /=/, $_, 2 )) : ( $_, 1 ))} split( /\;/, $info_line ); - # If there are multiple ALT alleles, choose the one with the most supporting reads + # If there are multiple ALT alleles, choose the one with the most supporting read depth # Or if allelic depths are unavailable, choose the first one listed under ALT my @alleles = ( $ref, split( /,/, $alt )); - my $alt_allele_idx = 1; - my ( %var_sample_info, %nrm_sample_info ); - if( $format_line and scalar( @rest ) > 0 ) { + my $var_allele_idx = 1; + my ( %tum_sample_info, %nrm_sample_info, @tum_allele_depths, @nrm_allele_depths ); + if( $vcf_tumor_idx ) { # Load into a hash, the FORMATted genotype info for the sample containing the variant my @format_keys = split( /\:/, $format_line ); my ( $t_idx, $n_idx ) = ( 0, 0 ); - %var_sample_info = map {( $format_keys[$t_idx++], $_ )} split( /\:/, $rest[$vcf_tumor_idx] ); + %tum_sample_info = map {( $format_keys[$t_idx++], $_ )} split( /\:/, $rest[$vcf_tumor_idx] ); %nrm_sample_info = map {( $format_keys[$n_idx++], $_ )} split( /\:/, $rest[$vcf_normal_idx] ) if( defined $vcf_normal_idx ); - if( defined $var_sample_info{AD} ) { - my @allelic_depth = split( /,/, $var_sample_info{AD} ); + if( defined $tum_sample_info{AD} ) { + @tum_allele_depths = split( /,/, $tum_sample_info{AD} ); + @nrm_allele_depths = split( /,/, $nrm_sample_info{AD} ) if( defined $nrm_sample_info{AD} ); + + # Handle VCFs generated by VarScan, where reference allele depths are stored separately + if( $#tum_allele_depths ne $#alleles and $tum_sample_info{RD} ) { + unshift( @tum_allele_depths, $tum_sample_info{RD} ); + unshift( @nrm_allele_depths, $nrm_sample_info{RD} ) if( $#nrm_allele_depths ne $#alleles and $nrm_sample_info{RD} ); + } + # The first depth listed belongs to the reference allele. Of the rest, find the largest - for( my $i = 1; $i <= $#allelic_depth; ++$i ) { - $alt_allele_idx = $i if( $allelic_depth[$i] > $allelic_depth[$alt_allele_idx] ); + for( my $i = 1; $i <= $#tum_allele_depths; ++$i ) { + $var_allele_idx = $i if( $tum_allele_depths[$i] > $tum_allele_depths[$var_allele_idx] ); } } } - my ( $var ) = $alleles[$alt_allele_idx]; + my ( $var ) = $alleles[$var_allele_idx]; # Figure out the appropriate start/stop loci and variant type/allele to report in the MAF my $start = my $stop = my $var_type = ""; @@ -211,7 +220,7 @@ my %effect = map{s/\&/,/g; ( $vepcsq_cols[$idx++], $_ )} split( /\|/, $csq_line ); # Skip effects on other ALT alleles - if( $effect{ALLELE_NUM} == $alt_allele_idx ) { + if( $effect{ALLELE_NUM} == $var_allele_idx ) { # Fix potential warnings about undefined variables $effect{BIOTYPE} = '' unless( $effect{BIOTYPE} ); @@ -272,7 +281,7 @@ my %effect = map{( $snpeff_cols[$idx++], $_ )} ( $1, split( /\|/, $2 )); # Skip transcripts with errors/warnings, or effects on other ALT alleles - unless( $effect{ERRORS} or $effect{WARNINGS} or $effect{Genotype_Number} != $alt_allele_idx ) { + unless( $effect{ERRORS} or $effect{WARNINGS} or $effect{Genotype_Number} != $var_allele_idx ) { # Fix potential warnings about undefined variables $effect{Transcript_BioType} = '' unless( $effect{Transcript_BioType} ); @@ -326,9 +335,9 @@ $maf_line{Reference_Allele} = $ref; # If the genotypes are unavailable, then we'll assume it's ref/var heterozygous $maf_line{Tumor_Seq_Allele1} = $ref; - if( defined $var_sample_info{GT} and $var_sample_info{GT} ne "." ) { + if( defined $tum_sample_info{GT} and $tum_sample_info{GT} ne "." ) { # ::NOTE:: MAF format doesn't support triallelic genotypes. So break it down as necessary - my ( $idx1, $idx2 ) = split( /\//, $var_sample_info{GT} ); + my ( $idx1, $idx2 ) = split( /\//, $tum_sample_info{GT} ); $maf_line{Tumor_Seq_Allele1} = ( $alleles[$idx1] eq $var ? $alleles[$idx2] : $alleles[$idx1] ); } $maf_line{Tumor_Seq_Allele2} = $var; @@ -339,10 +348,10 @@ $maf_line{HGVSp} = ( $maf_effect->{HGVSp} ? $maf_effect->{HGVSp} : '' ); $maf_line{Transcript_ID} = ( $maf_effect->{RefSeq} ? $maf_effect->{RefSeq} : ( $maf_effect->{Transcript_ID} ? $maf_effect->{Transcript_ID} : '' )); $maf_line{Exon_Number} = ( $maf_effect->{EXON} ? $maf_effect->{EXON} : ( $maf_effect->{Exon_Rank} ? $maf_effect->{Exon_Rank} : '' )); - $maf_line{t_depth} = $var_sample_info{DP} if( defined $var_sample_info{DP} ); - ( $maf_line{t_ref_count}, $maf_line{t_alt_count} ) = split( /,/, $var_sample_info{AD} ) if( defined $var_sample_info{AD} ); + $maf_line{t_depth} = $tum_sample_info{DP} if( defined $tum_sample_info{DP} ); + ( $maf_line{t_ref_count}, $maf_line{t_alt_count} ) = @tum_allele_depths[0,$var_allele_idx] if( @tum_allele_depths ); $maf_line{n_depth} = $nrm_sample_info{DP} if( defined $nrm_sample_info{DP} ); - ( $maf_line{n_ref_count}, $maf_line{n_alt_count} ) = split( /,/, $nrm_sample_info{AD} ) if( defined $nrm_sample_info{AD} ); + ( $maf_line{n_ref_count}, $maf_line{n_alt_count} ) = @nrm_allele_depths[0,$var_allele_idx] if( @nrm_allele_depths ); foreach my $col ( @maf_header ) { $maf_fh->print( "\t" ) if ( $col ne $maf_header[0] );