convert2annovar.pl

#!/usr/bin/env perl
use warnings;
use strict;
use Getopt::Long;
use Pod::Usage;
use File::Basename;

#revision, date and author keyword strings for this script
our $REVISION = '$Revision: f98de7f0a9145baca0dd81fa66f8e3db0603abf9 $';
our $DATE = '$Date: 2019-10-24 00:05:27 -0400 (Thu, 24 Oct 2019) $';
our $AUTHOR = '$Author: Kai Wang <kaichop@gmail.com> $';

#general switches
our ($verbose, $help, $man);

#the single positional argument (input file, or region/transcript for some formats)
our ($variantfile);

#all remaining command-line options; each one is filled in by GetOptions below
our (
	$outfile, $format, $includeinfo, $snpqual, $snppvalue, $coverage, $maxcoverage,
	$chr, $chrmt, $altcov, $allelicfrac, $fraction, $species,
	$filterword, $confraction, $allallele, $withzyg, $comment, $allsample,
	$genoqual, $varqual, $dbsnpfile, $withfreq, $withfilter, $seqdir,
	$inssize, $delsize, $subsize, $genefile, $splicing_threshold, $context,
	$avsnpfile, $keepindelref,
);

#IUPAC code => string of the nucleotides it stands for
### <<< FOR 5500SOLiD LifeScope ( S=>'GC' is replaced by S=>'CG')
our %iupac = (
	#two-nucleotide ambiguity codes
	R=>'AG', Y=>'CT', S=>'CG', W=>'AT', K=>'GT', M=>'AC',
	#plain bases are written as a pair of identical nucleotides
	A=>'AA', C=>'CC', G=>'GG', T=>'TT',
	#three- and four-nucleotide ambiguity codes
	B=>'CGT', D=>'AGT', H=>'ACT', V=>'ACG', N=>'ACGT',
	#both '.' and '-' map to '-'
	'.'=>'-', '-'=>'-',
);

#reverse lookup (nucleotide string => IUPAC code); since '.' and '-' share a value, which of the two '-' maps back to is not fixed
our %iupacrev = reverse %iupac; ### <<< FOR 5500SOLiD LifeScope

#parse the command line into the package-level option variables
#NOTE: --species takes a string value ('species=s'), because the gff3-solid converter compares it against 'human';
#when it was declared as a plain flag the value could only ever be 1, so that comparison could never succeed
GetOptions(
	'verbose|v'=>\$verbose, 'help|h'=>\$help, 'man|m'=>\$man,
	'outfile=s'=>\$outfile, 'format=s'=>\$format, 'includeinfo'=>\$includeinfo,
	'snpqual=f'=>\$snpqual, 'snppvalue=f'=>\$snppvalue, 'coverage=i'=>\$coverage, 'maxcoverage=i'=>\$maxcoverage,
	'chr=s'=>\$chr, 'chrmt=s'=>\$chrmt,
	'fraction=f'=>\$fraction, 'altcov=i'=>\$altcov, 'allelicfrac'=>\$allelicfrac,
	'species=s'=>\$species, 'filter=s'=>\$filterword, 'confraction=f'=>\$confraction,
	'allallele!'=>\$allallele, 'withzyg'=>\$withzyg,
	'comment'=>\$comment, 'allsample'=>\$allsample, 'genoqual=f'=>\$genoqual, 'varqual=f'=>\$varqual,
	'dbsnpfile=s'=>\$dbsnpfile, 'withfreq'=>\$withfreq,
	'withfilter'=>\$withfilter, 'seqdir=s'=>\$seqdir,
	'inssize=i'=>\$inssize, 'delsize=i'=>\$delsize, 'subsize=i'=>\$subsize, 'genefile=s'=>\$genefile,
	'splicing_threshold=i'=>\$splicing_threshold, 'context'=>\$context,
	'avsnpfile=s'=>\$avsnpfile, 'keepindelref'=>\$keepindelref,
) or pod2usage ();

$help and pod2usage (-verbose=>1, -exitval=>1, -output=>\*STDOUT);
$man and pod2usage (-verbose=>2, -exitval=>1, -output=>\*STDOUT);
@ARGV or pod2usage (-verbose=>0, -exitval=>1, -output=>\*STDOUT);
@ARGV == 1 or pod2usage ("Syntax error");		#exactly one positional argument is accepted

($variantfile) = @ARGV;

$chrmt ||= 'M';			#name of the chromosome that is compared against $chrmt by the converters; defaults to 'M'

#prepare PATH environmental variable
my $path = File::Basename::dirname ($0);
$path and $ENV{PATH} = "$path:$ENV{PATH}";		#set up the system executable path to include the path where this program is located in

if (not $format) {
	$format = 'pileup';
	print STDERR "NOTICE: the default --format argument is set as 'pileup'\n";
}
$format eq 'vcf' and $format = 'vcf4';			#'vcf' is accepted as an alias of 'vcf4'

#validate option combinations and fill in defaults before any conversion starts

#with --allsample the output goes to per-sample files (see the NOTICE below), so STDOUT is redirected to --outfile only in the other case
if ($allsample) {
	defined $withfreq or defined $outfile or pod2usage ("Error in argument: please specify --outfile when --allsample is specified (unless -withfreq is set)");
	$format eq 'vcf4' or pod2usage ("Error in argument: the --allsample argument is supported only if --format is 'vcf4'");
	defined $withfreq or print STDERR "NOTICE: output files will be written to $outfile.<samplename>.avinput\n";
} elsif (defined $outfile) {
	open (STDOUT, '>', $outfile) or die "Error: cannot write to output file $outfile: $!\n";
}

if (defined $snpqual) {
	$format eq 'pileup' or $format eq 'vcf4old' or pod2usage ("Error in argument: the --snpqual is supported only for the 'pileup' or 'vcf4old' format");
}
if (defined $snppvalue) {
	$format eq 'gff3-solid' or pod2usage ("Error in argument: the --snppvalue is supported only for the 'gff3-solid' format");
}
if (not defined $snpqual and $format eq 'pileup') {
	$snpqual = 20;
	print STDERR "NOTICE: the default --snpqual argument for pileup format is set as 20\n";
}

if (not defined $snppvalue) {
	$snppvalue = 1;		#by default, do not use any of the P-value cutoff in filtering out GFF3-SOLID files (this is different from handling pileup files)
}

if (not defined $coverage) {
	$coverage = 0;
}

if (defined $fraction) {
	#vcf4old is accepted here as well: the NOTICE on the next line is written for that format and was unreachable when vcf4old was rejected
	$format eq 'pileup' or $format eq 'vcf4' or $format eq 'vcf4old' or pod2usage ("Error in argument: the '--fraction' argument is supported for the pileup, vcf4 or vcf4old format only");
	$format eq 'vcf4old' and print STDERR "NOTICE: the --fraction argument works ONLY on indels for vcf4old format\n";
	$fraction >= 0 and $fraction <= 1 or pod2usage ("Error in argument: the --fraction argument must be between 0 and 1 inclusive");
} else {
	$fraction = 0;
}

if (defined $withfilter) {
	$format eq 'vcf4' or $format eq 'vcf4old' or pod2usage ("Error in argument: the '-withfilter' argument is supported for the vcf4 or vcf4old format only");
}

if (defined $confraction) {
	$format eq 'vcf4old' and print STDERR "NOTICE: the --confraction argument works ONLY on indels for vcf4old format\n";
	#range-check --confraction itself (the upper bound used to be tested against --fraction by mistake)
	$confraction >= 0 and $confraction <= 1 or pod2usage ("Error in argument: the --confraction argument must be between 0 and 1 inclusive");
} else {
	$confraction = 0;
}

if (defined $altcov) {
	$format eq 'pileup' or pod2usage ("Error in argument: the '--altcov' argument is supported for the '--format pileup' only");
	$altcov < $coverage or pod2usage ("Error in argument: the --altcov argument must be less than --coverage");
	$altcov > 0 or pod2usage ("Error in argument: the --altcov argument must be a positive integer");
}

if (defined $species) {
	$format eq 'gff3-solid' or pod2usage ("Error in argument: the '--species' argument is only necessary for the '--format gff3-solid'");
}

if ($allallele) {
	$format eq 'vcf4old' or pod2usage ("Error in argument: the '--allallele' argument is only supported for the '--format vcf4old'");
}

if ($withfreq and $withzyg) {
	pod2usage ("Error in argument: -withfreq and -withzyg are mutually exclusive");
}

#run the converter that matches --format

#formats whose converter takes only the input file and needs no extra argument checking
my %format_plain_converter = (
	'pileup'	=> \&convertPileup,
	'cg'		=> \&convertCG,
	'cgmastervar'	=> \&convertCGMasterVar,
	'gff3-solid'	=> \&convertGFF3SolidSNP,
	'soap'		=> \&convertSOAP,
	'maq'		=> \&convertMAQSNP,
	'vcf4old'	=> \&convertVCF4Old,
	'vcf4'		=> \&convertVCF4,
	'annovar'	=> \&convertANNOVAR,
	'annovar2vcf'	=> \&convertANNOVAR2VCF,
	'bed'		=> \&convertBED,
);

#warning printed to STDERR before running a converter that is flagged as immature
my %format_immature_warning = (
	'soap'	=> "WARNING: the support for '--format soap' is not well developed yet and may contain bugs for indel analysis.\n",
	'maq'	=> "WARNING: the support for '--format maq' is not well developed yet and may contain bugs.\n",
);

if (my $plain_converter = $format_plain_converter{$format}) {
	exists $format_immature_warning{$format} and print STDERR $format_immature_warning{$format};
	$plain_converter->($variantfile);
} elsif ($format eq 'casava') {
	convertCASAVA ($variantfile, $chr);		#this converter additionally receives the --chr value
} elsif ($format eq 'rsid') {
	#either identifier database will do; --dbsnpfile wins when both are given
	defined $dbsnpfile or defined $avsnpfile or pod2usage ("Error in argument: please specify --dbsnpfile or -avsnpfile when the --format is 'rsid'");
	defined $dbsnpfile ? convertRsid ($variantfile) : convertAvsnpid ($variantfile);
} elsif ($format eq 'region') {
	#here the positional argument is a region string rather than a file name
	defined $subsize or $subsize = 1;
	$variantfile =~ m/(chr)?(\w+):(\d+)-(\d+)$/ or pod2usage "Error in argument: for '-format region', the region should be specified in 'chr:start-end' format";
	$seqdir or pod2usage "Error in argument: please specify -seqdir for the '-format region'\n";
	convertRegion ($variantfile);
} elsif ($format eq 'transcript') {
	#here the positional argument is a transcript name that is looked up in --genefile
	defined $subsize or $subsize = 1;
	defined $genefile or pod2usage ("Error in argument; please specify -genefile for the '-format transcript'");
	$seqdir or pod2usage "Error in argument: please specify -seqdir for the '-format transcript'\n";
	$splicing_threshold ||= 2;
	convertTranscript ($variantfile);
} else {
	pod2usage ("Error in argument: the --format $format is not currently supported. Please contact ANNOVAR developer for adding the support");
}

#convertTranscript: enumerate candidate mutations for every exon of one transcript
#argument: $transcript - transcript name, compared against the name column of each record in $genefile
#each exon of a matching record is turned into a 'chr:start-end' region, widened by $splicing_threshold on both sides,
#and handed to convertRegion; a WARNING is printed to STDERR when the transcript is missing or occurs more than once
sub convertTranscript {
	my ($transcript) = @_;
	my $foundmatch = 0;
	my @allregion;

	open (my $genefh, '<', $genefile) or die "Error: cannot read from gene file $genefile: $!\n";
	while (<$genefh>) {
		s/[\r\n]+$//;							#deleting the newline characters
		my @record = split (/\t/, $_);
		@record >= 11 or die "Error: invalid record in genefile (>=11 fields expected): <$_>\n";

		#12-column records (knownGene) start with the name column; all other gene definitions have one extra leading column
		my $offset = @record == 12 ? 0 : 1;
		my ($name, $chr, $exonstart, $exonend) = @record[$offset, $offset+1, $offset+8, $offset+9];

		$transcript eq $name or next;
		$foundmatch++;

		my @exonstart = split (/,/, $exonstart);
		my @exonend = split (/,/, $exonend);
		for my $i (0 .. @exonstart-1) {
			#every exon start is shifted up by one before it is used as the region start
			push @allregion, "$chr:" . ($exonstart[$i]+1-$splicing_threshold) . '-' . ($exonend[$i]+$splicing_threshold);
		}
	}
	close ($genefh);

	print STDERR "NOTICE: ${\(scalar @allregion)} regions will be analyzed for possible mutations\n";
	$verbose and print STDERR "NOTICE: These regions will be analyzed: @allregion\n";
	$foundmatch or print STDERR "WARNING: the specified transcript $transcript is not found in genefile $genefile\n";
	$foundmatch > 1 and print STDERR "WARNING: the specified transcript $transcript occurs $foundmatch times in genefile $genefile\n";
	for my $nextregion (@allregion) {
		convertRegion ($nextregion);
	}
}

#convertRegion: print every possible substitution, deletion and insertion within one genomic region
#argument: $region - region in 'chr:start-end' format (the 'chr' prefix is optional and is not printed)
#the sequence is obtained by running retrieve_seq_from_fasta.pl (found through PATH) against $seqdir
#output (STDOUT, tab-delimited): chr, start, end, reference allele, alternative allele
#  substitutions of $subsize bases (1 when $subsize is undefined; none when it is 0)
#  deletions of $delsize bases (only when $delsize is set)
#  insertions of $inssize bases (only when $inssize is set)
sub convertRegion {
	my ($region) = @_;
	$region =~ m/(chr)?(\w+):(\d+)-(\d+)$/ or die "Error: invalid region specification '$region' ('chr:start-end' format expected)\n";
	my ($chr, $start, $end) = ($2, $3, $4);
	$end >= $start or die "Error: end position must be equal or larger than start position in region specification '$region'\n";

	#NOTE(review): $region and $seqdir are interpolated into a shell command line without quoting
	my $sc = "echo $region | retrieve_seq_from_fasta.pl -format simple -tabout -seqdir $seqdir -outfile stdout stdin";
	my $result = qx/$sc/;
	defined $result and $result =~ m/\S+\t(\S+)/ or die "Error: unable to retrieve user-specified region $region from seqdir $seqdir\n";
	my $seq = $1;

	my $size = defined $subsize ? $subsize : 1;		#default value is 1
	if ($size) {
		my @allnt = generateAllNT ($size);
		for my $i (0 .. length($seq)-$size) {
			my $refnt = substr ($seq, $i, $size);
			$refnt =~ m/^N+$/ and next;		#if it is only composed of N, skip this position
			for my $nt (@allnt) {
				$refnt eq $nt and next;		#identical to the reference, so not a mutation
				print STDOUT join ("\t", $chr, $start+$i, $start+$i+$size-1, $refnt, $nt), "\n";
			}
		}
	}
	if ($delsize) {
		for my $i (0 .. length($seq)-$delsize) {
			#the end coordinate is the last deleted base, so it depends on the deletion size
			print STDOUT join ("\t", $chr, $start+$i, $start+$i+$delsize-1, substr($seq, $i, $delsize), '-'), "\n";
		}
	}
	if ($inssize) {
		my @allnt = generateAllNT ($inssize);
		for my $i (0 .. length($seq)-1) {
			for my $nt (@allnt) {
				print STDOUT join ("\t", $chr, $start+$i, $start+$i, '-', $nt), "\n";
			}
		}
	}
}


#generateAllNT: list every nucleotide string of the given length over the alphabet A, C, G, T
#argument: $size - length of the strings to generate
#returns:  list of 4**$size strings; the leftmost position varies fastest (AA, CA, GA, TA, AC, ...)
#          an empty list is returned when $size is 0
sub generateAllNT {
	my ($size) = @_;
	my @shorter = ('');		#the single string of length zero
	my @longer;
	for my $length (1 .. $size) {
		#append each nucleotide in turn to every string built in the previous round
		@longer = map { my $nt = $_; map { $_.$nt } @shorter } qw/A C G T/;
		@shorter = @longer;
	}
	return (@longer);
}

#convertPileup: convert a pileup file (at least 10 tab-delimited columns per line) into ANNOVAR input format
#argument: $variantfile - file name, 'stdin' to read standard input, or a name ending in .gz (read through gunzip -c)
#output (STDOUT, tab-delimited): chr, start, end, ref, alt, heterozygosity status, SNP quality, total reads, reads with mutation
#	(followed by the remaining input columns when -includeinfo is set)
#the heterozygosity column is 'hom'/'het', or an allelic fraction when the chromosome equals $chrmt or -allelicfrac is set
#records are skipped when they fail the $snpqual, $coverage, $maxcoverage, $fraction or $altcov thresholds
#summary counts are printed to STDERR at the end
sub convertPileup {
	my ($variantfile) = @_;
	my ($countline, $countvar, $counthom, $counthet, $countindel, $countsnp, $countti, $counttv) = (0) x 8;
	
	my $fh;
	if ($variantfile eq 'stdin') {
		$fh = \*STDIN;
	} elsif ($variantfile =~ m/\.gz$/) {
		#list-form pipe open: the file name is passed to gunzip as is, without going through a shell
		open ($fh, '-|', 'gunzip', '-c', $variantfile) or die "Error: cannot read from STDIN uncompressing variant file $variantfile: $!\n";
	} else {
		open ($fh, '<', $variantfile) or die "Error: cannot read from variant file $variantfile: $!\n";
	}

	print STDERR "NOTICE: Column 6-9 in output are heterozygosity status, SNP quality, total reads, reads with mutation\n";

	while (<$fh>) {
		s/[\r\n]+$//;
		$countline++;
		my $hom = 'hom';
		my @field = split (/\t/, $_);
		@field >= 10 or die "Error: invalid record found in pileupfile $variantfile (at least 10 fields expected): <$_>\n";
		my ($chr, $pos, $wt, $call, @other) = @field;
		my ($cons_qual, $snp_quality, $readcount, $readallele) = @field[4,5,7,8];
		$chr =~ s/^chr//;
		$wt = uc $wt;					#wt may or may not in upper case, it depends on the input FASTA file
		$call = uc $call;				#usually call is in upper case
		$readallele = uc $readallele;			#lower case indicate the opposite strand
		
		$includeinfo or @other = ();			#unless -includeinfo is set, the other will not be printed
		
		$snp_quality >= $snpqual or next;		#quality of the variant call
		$readcount >= $coverage or next;		#read count of the variant call
		$maxcoverage and $readcount <= $maxcoverage || next;	#maximum coverage of the variant call
		
		if ($wt eq '*') {				#indel
			#example:
			#1       970271  *       +C/+C   39      106     44      5       +C      *       1       4       0       0       0
			#1       1548977 *       */+CCG  29      29      42      3       *       +CCG    2       1       0       0       0
			#1       1674810 *       */+C    24      24      42      6       *       +C      5       1       0       0       0
			#1       968466  *       -CT/-CT 53      339     55      5       -CT     *       5       0       0       0       0
			#1       1093600 *       -GAAA/* 29      29      53      3       -GAAA   *       1       2       0       0       0
			#1       1110101 *       */-A    41      41      17      6       *       -A      5       1       0       0       0
			#1       1215395 *       */-TC   26      26      32      4       *       -TC     3       1       0       0       0
			my @obs = split (/\//, $call);		#example: '+AG/+AG' as homozygotes, '*/-TA' or '*/+T' as heterozygotes
			@obs == 2 or die "Error: pileup record contains more than two alternative alleles: <$_>\n";
			my ($end, $ref, $alt);
			my ($indelreadcount);			#number of reads supporting the indel
			
			if ($obs[0] eq $obs[1]) {
				#something weird may occur in SamTools output: 22      15231121        *       */*     360     0       32      158     *       +GA     156     2       0       0       0
				$obs[0] eq '*' and next;	
	
				#for deletions, SAMTOOLS represent deletion using a location before the first deleted base in the reference sequence coordinate system
				#for example, a deletion in Samtools output is "1       109266688       *       */-CTT  1429    1429    58      43      *       -CTT    24      19      0       0       0"
				#the correct ANNOVAR input (for rs35029887) should be "1       109266689       109266691       CTT     -       het     1429"
				#insertions are fine without change; for example, check rs5745466 in Genome Browser; samtools report "1       76119508        *       +AT/+AT"
				#for this insertion, ANNOVAR input file (for rs5745466) becomes "1       76119508        76119508        -       AT      hom     1601"
				my $allelepos = $obs[0] =~ m/^\-/ ? $pos+1 : $pos;		#add 1 to position in deletion
				
				$indelreadcount = calculateIndelReadCount ($obs[0], \@field);
				$indelreadcount/$readcount >= $fraction or next;		#do not meet minimum alternative allele fraction threshold
				defined $altcov and $indelreadcount >= $altcov || next;
				
				if ($chr eq $chrmt or $allelicfrac) {
					$hom = sprintf ("%.3f", $indelreadcount/$readcount);
				}
				($end, $ref, $alt) = recalculateEndRefObs ($allelepos, $wt, $obs[0]);
				print STDOUT join ("\t", $chr, $allelepos, $end, $ref, $alt, $hom, $snp_quality, $readcount, $indelreadcount, @other), "\n";
				$counthom++;
			} else {
				$hom = 'het';
				if ($obs[0] =~ m/^[\-\+]/) {
					#the shifted position is kept per allele, so that a deletion in the first allele does not also shift the second one
					my $allelepos = $obs[0] =~ m/^\-/ ? $pos+1 : $pos;
					($end, $ref, $alt) = recalculateEndRefObs ($allelepos, $wt, $obs[0]);
					$indelreadcount = calculateIndelReadCount ($obs[0], \@field);
					$indelreadcount/$readcount >= $fraction or next;		#do not meet minimum alternative allele fraction threshold
					defined $altcov and $indelreadcount >= $altcov || next;
					
					if ($chr eq $chrmt or $allelicfrac) {
						$hom = sprintf ("%.3f", $indelreadcount/$readcount);
					}
					print STDOUT join ("\t", $chr, $allelepos, $end, $ref, $alt, $hom, $snp_quality, $readcount, $indelreadcount, @other), "\n";
					$counthet++;
				}
				if ($obs[1] =~ m/^[\-\+]/) {
					my $allelepos = $obs[1] =~ m/^\-/ ? $pos+1 : $pos;
					($end, $ref, $alt) = recalculateEndRefObs ($allelepos, $wt, $obs[1]);
					$indelreadcount = calculateIndelReadCount ($obs[1], \@field);
					$indelreadcount/$readcount >= $fraction or next;		#do not meet minimum alternative allele fraction threshold
					defined $altcov and $indelreadcount >= $altcov || next;
					
					if ($chr eq $chrmt or $allelicfrac) {
						$hom = sprintf ("%.3f", $indelreadcount/$readcount);
					}
					print STDOUT join ("\t", $chr, $allelepos, $end, $ref, $alt, $hom, $snp_quality, $readcount, $indelreadcount, @other), "\n";
					$counthet++;
				}
			}
			$countindel++;
		} else {
			#1       798494  G       A       36      36      58      3       AAA     bbb
			#1       798677  T       K       33      33      52      26      ,$.,,G.GG,.,......,..G,,...     b`bbBaJIbFbZWaTNQbb_VZcbbb
			#1       856182  G       A       42      42      50      5       AAAAA   B\bbb
			#1       861034  A       M       48      48      49      14      ,$,.,..,cc.c.,c bbBbb`]BFbHbBB
			#1       864289  T       K       22      22      56      6       .g,,g,  BH^_BB
			
			$wt eq $call and next;			#this is not a SNP
			my $obs = $iupac{$call} or die "Error: invalid best call ($call) in <$_>\n";
			my @obs = split (//, $obs);
			@obs == 2 or die "Error: observed IUPAC allele $call should correspond to two nucleotide alleles: <$_>\n";
			if ($obs[0] ne $obs[1]) {
				$hom = 'het';
			}
			
			if ($obs[0] eq $wt) {			#obs[0] is guaranteed to be an alternative allele
				@obs = @obs[1,0];
			}
			
			#sites where both alleles differ from the reference and from each other are counted as neither transition nor transversion
			my $triallelic_site = ($wt ne $obs[0] and $wt ne $obs[1] and $obs[0] ne $obs[1]);
			if ($wt eq 'A' and $obs[0] eq 'G' or $wt eq 'G' and $obs[0] eq 'A' or $wt eq 'C' and $obs[0] eq 'T' or $wt eq 'T' and $obs[0] eq 'C') {
				$triallelic_site or $countti++;
			} else {
				$triallelic_site or $counttv++;
			}
			
			my $mutallelecount;
			
			if ($obs[1] eq $wt) {			#het SNP
				if ($chr eq $chrmt or $allelicfrac) {
					$hom = calculateAllelicFraction ($obs[0], $field[8], $readcount);
				}
				$mutallelecount = calculateMutAlleleCount ($obs[0], $readallele);
				$mutallelecount/$readcount >= $fraction or next;		#do not meet minimum alternative allele fraction threshold
				defined $altcov and $mutallelecount >= $altcov || next;
				
				print STDOUT join ("\t", $chr, $pos, $pos, $wt, $obs[0], $hom, $snp_quality, $readcount, $mutallelecount, @other), "\n";
				$counthet++;
			} elsif ($obs[1] ne $obs[0]) {		#het SNP but both differ from reference allele
				if ($chr eq $chrmt or $allelicfrac) {
					$hom = calculateAllelicFraction ($obs[1], $field[8], $readcount);
				}
				$mutallelecount = calculateMutAlleleCount ($obs[1], $readallele);
				$mutallelecount/$readcount >= $fraction or next;		#do not meet minimum alternative allele fraction threshold
				defined $altcov and $mutallelecount >= $altcov || next;
				
				print STDOUT join ("\t", $chr, $pos, $pos, $wt, $obs[1], $hom, $snp_quality, $readcount, $mutallelecount, @other), "\n";
				#the second allele honours -allelicfrac as well, in line with every other branch
				if ($chr eq $chrmt or $allelicfrac) {
					$hom = calculateAllelicFraction ($obs[0], $field[8], $readcount);
				}
				$mutallelecount = calculateMutAlleleCount ($obs[0], $readallele);
				$mutallelecount/$readcount >= $fraction or next;		#do not meet minimum alternative allele fraction threshold
				defined $altcov and $mutallelecount >= $altcov || next;
				
				print STDOUT join ("\t", $chr, $pos, $pos, $wt, $obs[0], $hom, $snp_quality, $readcount, $mutallelecount, @other), "\n";
				$counthet += 2;
			} else {				#homo SNP
				if ($chr eq $chrmt or $allelicfrac) {
					$hom = calculateAllelicFraction ($obs[0], $field[8], $readcount);
				}
				$mutallelecount = calculateMutAlleleCount ($obs[0], $readallele);
				$mutallelecount/$readcount >= $fraction or next;		#do not meet minimum alternative allele fraction threshold
				defined $altcov and $mutallelecount >= $altcov || next;
				
				print STDOUT join ("\t", $chr, $pos, $pos, $wt, $obs[0], $hom, $snp_quality, $readcount, $mutallelecount, @other), "\n";
				$counthom++;
			}
			$countsnp++;
		}
		$countvar++;
	}
	$variantfile eq 'stdin' or close ($fh);
	
	my $triallelic = $countsnp-$countti-$counttv;
	print STDERR "NOTICE: Read $countline lines and wrote ${\($counthet+$counthom)} different variants at $countvar genomic positions ($countsnp SNPs and $countindel indels)\n";
	print STDERR "NOTICE: Among ${\($counthet+$counthom)} different variants at $countvar positions, $counthet are heterozygotes, $counthom are homozygotes\n";
	print STDERR "NOTICE: Among $countsnp SNPs, $countti are transitions, $counttv are transversions", $triallelic?", $triallelic have more than 2 alleles\n":"\n";
}

#calculateIndelReadCount: look up the number of reads supporting one indel allele of a pileup record
#arguments: $obs   - indel allele in upper case, for example '-CA' or '+T'
#           $field - reference to the array of columns of the pileup record
#returns:   column 11 when $obs equals column 9, column 12 when it equals column 10 (1-based columns)
#dies when $obs matches neither of the two allele columns
sub calculateIndelReadCount {
	my ($obs, $field) = @_;
	#the alleles are upper-cased before the comparison, since the record may carry them in lower case, for example:
	#chr10   130133  *       */-ca   189     189     59      31      *       -ca     27      4       0       0       0
	for my $allelecol (8, 9) {
		$obs eq uc ($field->[$allelecol]) and return $field->[$allelecol+2];
	}
	die "Error: invalid record in pileup file (indel counts cannot be inferred): <$obs> vs <@$field>\n";
}

#calculateMutAlleleCount: count the reads that show a given base in the read-base column of a pileup record
#arguments: $allele - single nucleotide in upper case
#           $string - read bases of the record
#returns:   number of characters equal to $allele after the read-start markers (^ plus the character after it),
#           read-end markers ($) and indel descriptions have been removed
#an indel description is +N or -N followed by N sequence letters; N can be any length
sub calculateMutAlleleCount {
	my ($allele, $string) = @_;	#they should be already both in upper case
	$string =~ s/\^.//g;		#^ is followed by mapping quality
	$string =~ s/\$//g;
	
	#drop the sign, the length and exactly that many following letters; letters beyond the stated length are ordinary read bases and are kept
	$string =~ s{[+-](\d+)([A-Za-z]*)}{length ($2) > $1 ? substr ($2, $1) : ''}ge;
	
	#make sure to use upper case letter
	my $count = grep { $_ eq $allele } split (//, uc $string);
	return $count;
}

#calculateAllelicFraction: fraction of reads that show a given allele
#arguments: $obs       - allele to look for (compared case-insensitively)
#           $readbase  - read-base string of the pileup record
#           $readcount - total number of reads, used as the denominator
#returns:   the fraction; it is rounded to three decimals when it is above 0.001 and its printed form is longer than five characters
#NOTE(review): every character of $readbase is examined, including the ones that belong to ^ markers and indel descriptions
sub calculateAllelicFraction {
	my ($obs, $readbase, $readcount) = @_;
	my $wanted = uc $obs;
	my $count = grep { uc ($_) eq $wanted } split (//, $readbase);
	my $hom = $count/$readcount;
	if (length ($hom) > 5 and $hom > 0.001) {
		$hom = sprintf ("%.3f", $hom);
	}
	return $hom;
}

#recalculateEndRefObs: turn a pileup indel allele into ANNOVAR end position, reference allele and observed allele
#arguments: $end - start position of the indel
#           $ref - reference column of the record (only used in the error message)
#           $obs - indel allele, '-SEQ' for a deletion or '+SEQ' for an insertion
#returns:   (end, ref, obs): for a deletion the end is moved to the last deleted base, ref is SEQ and obs is '-';
#           for an insertion the end is unchanged, ref is '-' and obs is SEQ
#dies when $obs starts with neither '-' nor '+' followed by word characters
sub recalculateEndRefObs {
	my ($end, $ref, $obs) = @_;
	if (my ($deleted) = $obs =~ m/^\-(\w+)/) {
		return ($end + (length ($deleted)-1), $deleted, '-');
	}
	if (my ($inserted) = $obs =~ m/^\+(\w+)/) {
		return ($end, '-', $inserted);
	}
	die "Error: cannot handle $end, $ref, $obs\n";
}

#convertBED: convert a BED file into ANNOVAR input format
#argument: $variantfile - file name, 'stdin' to read standard input, or a name ending in .gz (read through gunzip -c)
#output (STDOUT, tab-delimited): chr (without 'chr' prefix), start+1, end, 0, 0, followed by any further input columns
#lines starting with '#' or 'track' and blank lines are not converted
#dies when no 'track' line has been seen by the time a data line beyond line 10 is reached
sub convertBED {
	my ($variantfile) = @_;
	my ($foundheader, $countline, $countvar) = (0, 0, 0);

	my $fh;
	if ($variantfile eq 'stdin') {
		$fh = \*STDIN;
	} elsif ($variantfile =~ m/\.gz$/) {
		#list-form pipe open: the file name is passed to gunzip as is, without going through a shell
		open ($fh, '-|', 'gunzip', '-c', $variantfile) or die "Error: cannot read from STDIN uncompressing variant file $variantfile: $!\n";
	} else {
		open ($fh, '<', $variantfile) or die "Error: cannot read from variant file $variantfile: $!\n";
	}

	print STDERR "NOTICE: Converting variants from $variantfile\n";
	while (<$fh>) {
		s/[\r\n]+$//;
		$countline++;
		m/^#/ and next;		#comment lines
		m/^\s*$/ and next;	#blank lines carry no interval
		if (m/^track/) {
			$foundheader++;
			next;
		}
		if (not $foundheader) {
			$countline > 10 and die "Error: invalid BED file format for $variantfile (track record is not found within the first 10 lines)\n";
		}
		my ($chrom, $start, $end, @otherinfo) = split (/\t/, $_);
		$chrom =~ s/^chr//;
		
		#the start is increased by one; the two 0 columns stand in for the reference and alternative alleles
		print join ("\t", $chrom, $start+1, $end, 0, 0, @otherinfo), "\n";
		$countvar++;
	}
	$variantfile eq 'stdin' or close ($fh);
	print STDERR "NOTICE: Done with $countline lines and $countvar variants\n";
}

#convertCG: convert a Complete Genomics var file into ANNOVAR input format
#argument: $variantfile - file name, 'stdin' to read standard input, or a name ending in .gz (read through gunzip -c)
#output (STDOUT, tab-delimited): chr, start, end, ref, obs, variant type, score, hom/het, xref
#everything up to and including the '>locus' header line is skipped; the header must appear within the first 50 lines
#a record is held back until the next variant record is read: when both agree on chr, start, end and observed allele,
#one 'hom' line with the average score is printed, otherwise the held record is printed as 'het'
sub convertCG {
	my ($variantfile) = @_;
	
	my ($foundheader, $countline) = (0, 0);
	my ($prechr, $prestart, $preend, $prevartype, $preref, $preobs, $prescore, $prexref) = qw/0 0 0 0 0 0 0 0/;

	my $fh;
	if ($variantfile eq 'stdin') {
		$fh = \*STDIN;
	} elsif ($variantfile =~ m/\.gz$/) {
		#list-form pipe open: the file name is passed to gunzip as is, without going through a shell
		open ($fh, '-|', 'gunzip', '-c', $variantfile) or die "Error: cannot read from STDIN uncompressing variant file $variantfile: $!\n";
	} else {
		open ($fh, '<', $variantfile) or die "Error: cannot read from variant file $variantfile: $!\n";
	}

	print STDERR "NOTICE: Converting variants from $variantfile\n";
	while (<$fh>) {
		s/[\r\n]+$//;
		$countline++;
		if (m/^>locus/) {
			$foundheader++;
			next;			#the header line itself is not a variant record
		}
		if (not $foundheader) {
			$countline > 50 and die "Error: invalid CG-var file format for $variantfile (>locus record is not found within the first 50 lines)\n";
			next;
		}
		my ($locus, $ploidy, $haplo, $chr, $start, $end, $vartype, $ref, $obs, $score, $haplolink, $xref) = split (/\t/, $_);
		$chr =~ s/^chr//;
		$vartype eq 'ins' or $start++;		#CG use zero-start, half-open coordinate. Insertion does not need to be processed (example, "16476   2       2       chr1    751820  751820  ins             T       49              dbsnp:rs59038458")
		$obs eq '' and $obs = '-';
		$ref eq '' and $ref = '-';

		#the alternation is grouped so that ^ and $ apply to every variant type, not only to the first and the last one
		$vartype =~ m/^(?:snp|ins|del|delins|sub)$/ or next;		#in new versions of the files, "sub" is used instead of "delins".
		
		if ($chr eq $prechr and $start eq $prestart and $end eq $preend and $obs eq $preobs) {		#homozygous mutation
			print $chr, "\t", $start, "\t", $end, "\t", $ref, "\t", $obs, "\t", $vartype, "\t", ($score+$prescore)/2, "\t", "hom\t", $xref, "\n";
			($prechr, $prestart, $preend, $prevartype, $preref, $preobs, $prescore, $prexref) = qw/0 0 0 0 0 0 0 0/;
		} else {
			if ($prestart and $preend) {
				print $prechr, "\t", $prestart, "\t", $preend, "\t", $preref, "\t", $preobs, "\t", $prevartype, "\t", $prescore, "\thet\t", $prexref, "\n";
			}
			($prechr, $prestart, $preend, $prevartype, $preref, $preobs, $prescore, $prexref) = ($chr, $start, $end, $vartype, $ref, $obs, $score, $xref);
		}
	}
	$variantfile eq 'stdin' or close ($fh);
	
	#flush the record that is still held back after the last line
	if ($prestart and $preend) {
		print $prechr, "\t", $prestart, "\t", $preend, "\t", $preref, "\t", $preobs, "\t", $prevartype, "\t", $prescore, "\thet\t", $prexref, "\n";
	}
	print STDERR "NOTICE: Done with $countline lines\n";
}

#convertCGMasterVar: convert a Complete Genomics masterVar file into ANNOVAR input format
#argument: $variantfile - file name, 'stdin' to read standard input, or a name ending in .gz (read through gunzip -c)
#output (STDOUT, tab-delimited): chr, start, end, ref, allele1Seq, hom/het, variant type, total read count
#	(followed by the remaining input columns when -includeinfo is set)
#everything up to and including the '>locus' header line is skipped; the header must appear within the first 50 lines
#no-call loci are ignored, and the $coverage / $maxcoverage thresholds are applied to the total read count
#NOTE(review): only allele1Seq is written as the observed allele, also for heterozygous loci; allele2Seq is not used
sub convertCGMasterVar {
	#example input file is below:
	
	#SEGDUP_GENERATED_AT    2010-Dec-01 13:40
	#SOFTWARE_VERSION       2.0.2.26
	#TYPE   VAR-OLPL
	#>locus  ploidy  chromosome      begin   end     zygosity        varType reference       allele1Seq      allele2Seq      allele1VarScoreVAF      allele2VarScoreVAF      allele1VarScoreEAF      allele2VarScoreEAF      allele1VarQuality       allele2VarQuality       allele1HapLink  allele2HapLink  allele1XRef     allele2XRef     evidenceIntervalId      allele1ReadCount        allele2ReadCount        referenceAlleleReadCount        totalReadCount  allele1Gene     allele2Gene     pfam    miRBaseId       repeatMasker    segDupOverlap   relativeCoverageDiploid calledPloidy    relativeCoverageNondiploid      calledLevel
	#1       2       chr1    0       10000   no-call no-ref  =       ?       ?
	#2       2       chr1    10000   11038   no-call complex =       ?       ?       ...       1.13    N       1.02    1.006
	#3       2       chr1    11038   11055   hom     ref     =       =       =       ...       1.13    N       1.02    1.006
	#4       2       chr1    11055   11082   no-call complex =       ?       ?       ...       1.13    N       1.02    1.006
	#5       2       chr1    11082   11109   hom     ref     =       =       =       ...       1.13    N       1.02    1.006

	my ($variantfile) = @_;
	
	my ($foundheader, $countline) = (0, 0);

	my $fh;
	if ($variantfile eq 'stdin') {
		$fh = \*STDIN;
	} elsif ($variantfile =~ m/\.gz$/) {
		#list-form pipe open: the file name is passed to gunzip as is, without going through a shell
		open ($fh, '-|', 'gunzip', '-c', $variantfile) or die "Error: cannot read from STDIN uncompressing variant file $variantfile: $!\n";
	} else {
		open ($fh, '<', $variantfile) or die "Error: cannot read from variant file $variantfile: $!\n";
	}

	print STDERR "NOTICE: Converting variants from $variantfile\n";
	while (<$fh>) {
		s/[\r\n]+$//;
		$countline++;
		if (m/^>locus/) {
			$foundheader++;
			next;			#the header line itself is not a variant record
		}
		if (not $foundheader) {
			$countline > 50 and die "Error: invalid CG-var file format for $variantfile (>locus record is not found within the first 50 lines)\n";
			next;
		}
		#the -1 limit keeps trailing empty columns, so that the column positions stay fixed
		my ($locus, $ploidy, $chr, $start, $end, $zygosity, $vartype, $ref, $obs1, $obs2, @otherinfo) = split (/\t/, $_, -1);
		my $totalread = $otherinfo[14];		#totalReadCount is the 15th column after allele2Seq

		$chr =~ s/^chr//;
		$vartype eq 'ins' or $start++;		#CG use zero-start, half-open coordinate. Insertion does not need to be processed (example, "16476   2       2       chr1    751820  751820  ins             T       49              dbsnp:rs59038458")
		$obs1 eq '' and $obs1 = '-';
		$ref eq '' and $ref = '-';

		#zygosity explanation:
		##no-call: All alleles are partially or fully no-called.
		##hap: Haploid, fully called locus.
		##half: Diploid locus where one of the alleles is fully called and the other contains no-calls.
		##hom: Diploid, homozygous, fully called locus.
		##het-ref: Diploid, heterozygous, fully called locus where one of the alleles is identical to the reference.
		##het-alt: Diploid, heterozygous, fully called locus where both alleles differ from the reference.

		#vartype explanation:
		##snp, ins, del, or sub: Fully called or half-called locus that contains only a single isolated variation.
		##ref: Fully called or half-called locus that contains only reference calls and no calls and at least one allele is fully called.
		##complex: Locus that contains multiple variations or has no-calls in all alleles. This is also the value for all loci where the reference itself is ambiguous.
		##no-ref: Locus where the reference genome is N.
		##PAR-called-in-X: Locus on the pseudo-autosomal region of the Y chromosomes in males.

		$zygosity eq 'no-call' and next;		#ignore locus without calls

		#the alternation is grouped so that ^ and $ apply to every variant type, not only to the first and the last one
		$vartype =~ m/^(?:snp|ins|del|delins|sub)$/ or next;		#in new versions of the files, "sub" is used instead of "delins".

		if ($coverage) {
			$totalread >= $coverage or next;
		}
		if ($maxcoverage) {
			$totalread <= $maxcoverage or next;
		}

		#homozygous and haploid loci are reported as 'hom', every other zygosity as 'het'
		my $zyglabel = ($zygosity eq 'hom' or $zygosity eq 'hap') ? 'hom' : 'het';
		print $chr, "\t", $start, "\t", $end, "\t", $ref, "\t", $obs1, "\t", $zyglabel, "\t", $vartype, "\t", $totalread, "\t", $includeinfo?join("\t", "\t", @otherinfo):'', "\n";
	}
	$variantfile eq 'stdin' or close ($fh);
	print STDERR "NOTICE: Done with $countline lines\n";
}

#Convert a SOLiD GFF3 variant file into ANNOVAR input lines written to STDOUT.
#
#Argument:
#	$variantfile	path of the GFF3 file; the literal 'stdin' reads STDIN, a name ending in .gz is read through gunzip
#
#Handled calling programs (GFF3 column 2): SOLiD_diBayes / AB_SOLiD SNP caller (SNPs), AB_CNV_PIPELINE (CNVs),
#AB_SOLiD Large Indel Tool and AB_SOLiD Small Indel Tool (including the 5500 LifeScope attribute dialect).
#Any other program name is a fatal error. Records whose type is 'unknown' are skipped and counted.
#
#Output columns: chr, start, end, ref, obs, zygosity, score (P-value), coverage, reads with mutated allele,
#followed by the raw attribute column when -includeinfo is set (a trailing tab is always written before it).
#
#File-level settings read here: $comment, $snppvalue, $species, $includeinfo, $coverage, $maxcoverage,
#$fraction, $altcov, %iupac and %iupacrev.
#
#Dies on malformed header/records; progress and summary messages go to STDERR.
sub convertGFF3SolidSNP {
	my ($variantfile) = @_;
	my ($countline, $countvar, $countallvar, @other) = (0, 0, 0);
	my $unknown_count = 0;		#count of variants with 'unknown' variation type
	
	if ($variantfile eq 'stdin') {
		*VAR = *STDIN;
	} elsif ($variantfile =~ m/\.gz$/) {
		#list-form pipe open: the file name is passed to gunzip as-is and never interpreted by a shell
		open (VAR, '-|', 'gunzip', '-c', $variantfile) or die "Error: cannot read from STDIN uncompressing variant file $variantfile: $!\n";
	} else {
		open (VAR, '<', $variantfile) or die "Error: cannot read from variant file $variantfile: $!\n";
	}

	#the first two lines are header lines; an empty input is treated as an empty line so that the regular error/warning is issued
	$_ = <VAR>;
	defined $_ or $_ = '';
	s/[\r\n]+$//;
	m/^##gff-version\s+3/ or die "Error: invalid first line in GFF3 file ('##gff-version 3' expected): <$_>\n";
	$_ = <VAR>;
	defined $_ or $_ = '';
	s/[\r\n]+$//;
	(m/^##solid-gff-version/ || m/^##source-version/) or print STDERR "WARNING: problematic second line in GFF3-SOLiD file ('##solid-gff-version' or '##source-version' expected): <$_>\n"; ### <<< FOR 5500SOLiD LifeScope

	print STDERR "NOTICE: Column 6-9 in output are heterozygosity status, variant score (P-value), total clipped normal coverage reads, total reads with mutated allele\n";
	
	while (<VAR>) {
		s/[\r\n]+$//;
		$countline++;
		if (m/^#/) {				#header of comment lines (##) or header of results lines (#)
			$comment and print "$_\n";	#keep comment in output file (the line terminator was stripped above, so add it back)
			next;
		}
		
		my @field = split (/\t/, $_);
		@field == 9 or die "Error: invalid record found in $variantfile (9 fields expected): <$_>\n";
		my ($chr, $program, $type, $pos, $end, $score, $attribute) = @field[0,1,2,3,4,5,8];		#score is the P-value for the SNP calls
		$chr eq 'chr_name' and next;	#header line
		
		if ($score ne '.') {
			$score >=0 and $score <=1 or die "Error: invalid score record found in file (0-1 range expected): <$_>\n";
			$score <= $snppvalue or next;
		}
		
		if ($species and $species eq 'human') {
			$chr eq '23' and $chr = 'X';
			$chr eq '24' and $chr = 'Y';
			$chr eq '25' and $chr = 'M';
		}

		$includeinfo and @other = ($attribute);			#unless -includeinfo is set, the other will not be printed

		my ($readcount, $mutallelecount) = ('.', '.');		#total coverage, coverage for mutated alleles
		
		#writes one output line; $chr, $score and @other are read when the line is written, so a later change to $score is honoured
		my $writevar = sub {
			my ($start, $stop, $refallele, $obsallele, $zyg, $cov, $mutcov) = @_;
			print join ("\t", $chr, $start, $stop, $refallele, $obsallele, $zyg, $score, $cov, $mutcov), "\t", join ("\t", @other), "\n";
		};
		
		if ($type eq 'unknown') {
			#SOLiD GFF3 may have unknown variation types (example truncated after the attributes relevant here; per-read attributes omitted)
			#chr1    AB_SOLiD Small Indel Tool       unknown 3833062 3833153 1       .       .       ID=5483;len=no_call;allele-call-pos=3833062;allele-call=/CCAC;allele-pos=3833057;alleles=atccatccacccatc/aTCCATCCACCCACCCATC/NO_CALL;allele-counts=REF,2,2;tight_chrom_pos=none;loose_chrom_pos=3833058-3833069;no_nonred_reads=3;coverage_ratio=8.0000;experimental-zygosity=HEMIZYGOUS;experimental-zygosity-score=1.0000;...
			$unknown_count++;
			next;		#do not count this one!
		}
		
		if ($program eq 'SOLiD_diBayes' or $program eq 'AB_SOLiD SNP caller') {		#SNP variants
			#detailed description can be found at http://solidsoftwaretools.com/gf/download/docmanfileversion/208/866/DiBayes_Documentation_v1.2.pdf
			#chr1    SOLiD_diBayes   SNP     559817  559817  0.094413        .       .       genotype=Y;reference=T;coverage=9;refAlleleCounts=5;refAlleleStarts=4;refAlleleMeanQV=23;novelAlleleCounts=2;novelAlleleStarts=2;novelAlleleMeanQV=14;diColor1=11;diColor2=33;het=1;flag= 
			#chr1    SOLiD_diBayes   SNP     714068  714068  0.000000        .       .       genotype=M;reference=C;coverage=13;refAlleleCounts=7;refAlleleStarts=6;refAlleleMeanQV=25;novelAlleleCounts=6;novelAlleleStarts=4;novelAlleleMeanQV=22;diColor1=00;diColor2=11;het=1;flag= 
			#chr1    SOLiD_diBayes   SNP     714835  714835  0.041579        .       .       genotype=R;reference=A;coverage=5;refAlleleCounts=3;refAlleleStarts=3;refAlleleMeanQV=18;novelAlleleCounts=2;novelAlleleStarts=2;novelAlleleMeanQV=20;diColor1=02;diColor2=20;het=1;flag= 

			$pos == $end or die "Error: invalid record found in GFF3-SOLiD file: start and end discordant: <$_>\n";
	
			my ($wt, $call);

			if ($attribute =~ m/ref_base=(\w)/) {
				$wt = $1;
			} elsif ($attribute =~ m/reference=(\w)/) {
				$wt = $1;
			} else {
				die "Error: invalid record found in GFF3-SOLiD file (ref_base/reference was not found): <$_>\n";
			}
			
			if ($attribute =~ m/consen_base=(\w)/) {
				$call = $1;
			} elsif ($attribute =~ m/genotype=(\w)/) {
				$call = $1;
			} elsif ($attribute =~ m/allele-call=([\w\/]+)/) { ### <<< FOR 5500SOLiD LifeScope
				my $hit = $1;
				if ($hit =~ m/\//) {
					$call = $iupacrev{join ("", sort (split (/\//, $hit)))};	#two alleles are converted back to one IUPAC code
				} else {
					$call = $hit;
				}
			} else {
				die "Error: invalid record found in GFF3-SOLiD file (consen_base was not found): <$_>\n";
			}
						
			if ($attribute =~ m/coverage=(\d+)/) {
				$readcount = $1;
				$readcount >= $coverage or next;		#read count of the variant call
				if ($maxcoverage) {
					$readcount <= $maxcoverage or next;
				}
			}
			if ($attribute =~ m/novelAlleleCounts=(\d+)/) {
				$mutallelecount = $1;
				if ($readcount ne '.') {			#the fraction filter needs a known coverage (an absent coverage attribute skips it, like the coverage filters)
					my $altfrac = $readcount > 0 ? $mutallelecount/$readcount : 0;	#zero coverage gives fraction 0 rather than a division by zero
					$altfrac >= $fraction or next;		#do not meet minimum alternative allele fraction threshold
				}
				if (defined $altcov) {
					$mutallelecount >= $altcov or next;
				}
			}
			
			my $obs = (defined $call ? $iupac{$call} : undef) or die "Error: invalid best call in <$_>\n";
			my @obs = split (//, $obs);
			@obs == 2 or die "Error: observed IUPAC allele $call should correspond to two nucleotide alleles: <$_>\n";
			if ($obs[0] eq $wt and $obs[1] eq $wt) {
				die "Error: reference alleles are identical to observed alleles: <$_>\n";
			} elsif ($obs[0] eq $wt) {
				$writevar->($pos, $pos, $wt, $obs[1], 'het', $readcount, $mutallelecount);
			} elsif ($obs[1] eq $wt) {
				$writevar->($pos, $pos, $wt, $obs[0], 'het', $readcount, $mutallelecount);
			} elsif ($obs[1] ne $obs[0]) {		#two different non-reference alleles: one line for each
				$writevar->($pos, $pos, $wt, $obs[0], 'het', $readcount, $mutallelecount);
				$writevar->($pos, $pos, $wt, $obs[1], 'het', $readcount, $mutallelecount);
				$countallvar++;
			} else {
				$writevar->($pos, $pos, $wt, $obs[0], 'hom', $readcount, $mutallelecount);
			}
		} elsif ($program eq 'AB_CNV_PIPELINE') {	#CNV
			if ($attribute =~ m/copynum=(\d+)/ or $attribute =~ m/copynumber=(\d+)/) {
				my $copynum = $1;
				if ($copynum < 2) {
					$writevar->($pos, $end, 0, '-', 'unk', '.', '.');
				} elsif ($copynum > 2) {
					$writevar->($end, $end, '-', 0, 'unk', '.', '.');
				}
				#a copy number of exactly 2 writes nothing
			} else {
				$writevar->($end, $end, '-', 0, 'unk', '.', '.');
			}
		} elsif ($program eq 'AB_SOLiD Large Indel Tool') {	#CNV
			#http://solidsoftwaretools.com/gf/download/docmanfileversion/182/780/Large_Indel_Documentation_v1.0.0.pdf
			## [FIELDS] (1) chromosome (2) version (3) indel type (4) breakpoint start (5) breakpoint end (6) p-value (7) NA (8) NA (9) attributes
			#(examples truncated: the trailing beadIds list is omitted)
			#chr10   AB_SOLiD Large Indel Tool       insertion       151910  151969  2.77548e-11     .       .       dev=-71;avgDev=-1.63884;zygosity=HOMOZYGOUS;nRef=0;nDev=14;refDev=0;devDev=-1.60924;refVar=0;devVar=0.0159438;beadIds=...
			#chr10   AB_SOLiD Large Indel Tool       insertion       154109  154729  2.1559e-11      .       .       dev=-66;avgDev=-1.51253;zygosity=HOMOZYGOUS;nRef=0;nDev=15;refDev=0;devDev=-1.02864;refVar=0;devVar=0.133236;beadIds=...
			my $zygosity;
			if ($attribute =~ m#zygosity=HEMIZYGOUS#) {
				$zygosity = 'het';
			} elsif ($attribute =~ m#zygosity=HOMOZYGOUS#) {
				$zygosity = 'hom';
			} else {
				$zygosity = 'unk';
			}
			if ($type eq 'insertion') {
				#the true boundary is unknown (start is different from end) so we cannot use "-" to represent reference allele.
				$writevar->($pos, $end, 0, 0, $zygosity, '.', '.');
			} elsif ($type eq 'deletion') {
				$writevar->($pos, $end, 0, '-', $zygosity, '.', '.');
			}
		} elsif ($program eq 'AB_SOLiD Small Indel Tool') {		#small indels
			#typical simple insertion and deletions (examples truncated after the zygosity attributes; per-read attributes omitted)
			#chr1    AB_SOLiD Small Indel Tool       deletion        1352612 1352614 1       .       .       ID=1290;del_len=3;allele-call-pos=1352612;allele-call=cct/;allele-pos=1352610;alleles=cccctccat/cCCCAT;allele-counts=REF,2;tight_chrom_pos=1352612-1352614;loose_chrom_pos=1352612-1352614;no_nonred_reads=2;coverage_ratio=11.5000;experimental-zygosity=HEMIZYGOUS;experimental-zygosity-score=1.0000;...
			#chr1    AB_SOLiD Small Indel Tool       insertion_site  1311162 1311162 1       .       .       ID=1249;ins_len=1;allele-call-pos=1311162;allele-call=/G;allele-pos=1311161;alleles=gaggggggg/GAGGGGGGGG/NO_CALL;allele-counts=REF,3,1;tight_chrom_pos=none;loose_chrom_pos=1311160-1311169;no_nonred_reads=3;coverage_ratio=4.6667;experimental-zygosity=HEMIZYGOUS;experimental-zygosity-score=1.0000;...
			
			#sometimes, allele-call is ambiguous that requires a "block substitution" representation (although they were annotated as insertion or deletion by SOLiD, they should be treated as block substitution by ANNOVAR)
			#sometimes, multiple allele calls may be present at the same site
			#chr1    AB_SOLiD Small Indel Tool       deletion        1209357 1209360 1       .       .       ID=1101;del_len=4;allele-call-pos=1209357;allele-call=ggtggg/TT;allele-pos=1209355;alleles=ggggtgggggggtt/gGTTGGGGTT/gGTGTTTTGCCTT/NO_CALL;allele-counts=REF,3,1,1;tight_chrom_pos=none;loose_chrom_pos=1209357-1209363;no_nonred_reads=4;coverage_ratio=3.0000;experimental-zygosity=HEMIZYGOUS;experimental-zygosity-score=0.9888;...
			#chr1    AB_SOLiD Small Indel Tool       deletion        1209436 1209436 1       .       .       ID=1103;del_len=1;allele-call-pos=1209436;allele-call=ag/A/G;allele-pos=1209434;alleles=tgagggggtt/tGAGGGGTT/tGGGGGGTT;allele-counts=REF,1,1;tight_chrom_pos=none;loose_chrom_pos=1209436-1209441;no_nonred_reads=2;coverage_ratio=5.0000;experimental-zygosity=HEMIZYGOUS;experimental-zygosity-score=1.0000;...
			#chr1    AB_SOLiD Small Indel Tool       insertion_site  1424540 1424540 1       .       .       ID=1376;ins_len=3;allele-call-pos=1424540;allele-call=tt/CCCAC;allele-pos=1424537;alleles=ttttttg/TTTCCCACTG/NO_CALL;allele-counts=REF,1,1;tight_chrom_pos=none;loose_chrom_pos=1424536-1424543;no_nonred_reads=2;coverage_ratio=11.5000;experimental-zygosity=HEMIZYGOUS;experimental-zygosity-score=1.0000;...
			my (@call, $zygosity);
			my ($refcall, $gapnonred, %temphash); ### <<< FOR 5500SOLiD LifeScope
			#both "zygosity=" and "experimental-zygosity=" are matched by these patterns (changed on 20120618 to also accept the MULTI- prefix)
			if ($attribute =~ m#zygosity=(MULTI-)?HEMIZYGOUS#) { ### <<< FOR 5500SOLiD LifeScope
				$zygosity = 'het';
			} elsif ($attribute =~ m#zygosity=(MULTI-)?HOMOZYGOUS#) { ### <<< FOR 5500SOLiD LifeScope
				$zygosity = 'hom';
			} else {
				$zygosity = 'unk';
			}
			$score = '.';			#the score column of small indel records is not a P-value, so '.' is written instead
			
			#no_nonred_reads: Number of reads with unique start positions (non-redundant reads).
			#coverage_ratio: Clipped normal coverage/number of non-redundant reads.Clipped coverage is where the parts of the read that are within 5 bp at either end are not counted as a part of coverage.
			if ($attribute =~ m/no_nonred_reads=(\d+);coverage_ratio=([\d\.]+)/) {
				$readcount = int ($1*$2);	
				$readcount >= $coverage or next;		#clipped normal read count of the variant call (this could even be lower than mut allele count)
				if ($maxcoverage) {
					$readcount <= $maxcoverage or next;
				}
			} elsif ($attribute =~ m/gap-nonred-reads=(\d+)/) { ### <<< FOR 5500SOLiD LifeScope
				$gapnonred = $1;
				#the ratio is only used when it is really present; otherwise the coverage stays unknown ('.')
				#NOTE(review): only the integer part of coverage_ratio is captured here, unlike the branch above -- confirm this is intended
				if ($attribute =~ m/coverage_ratio=(\d+)/) {
					$readcount = int ($gapnonred*$1);
					$readcount >= $coverage or next;
					if ($maxcoverage) {
						$readcount <= $maxcoverage or next;
					}
				}
			} else {
				$readcount = '.';
			}
			if ($attribute =~ m/allele-counts=REF,(\d+)/) {
				$mutallelecount = $1;
			} elsif ($attribute =~ m/context-variant-reads=(\d+)/) { ### <<< FOR 5500SOLiD LifeScope
				$mutallelecount = $1;
			}
			if ($attribute =~ m#reference=([\w\-]+)#) { ### <<< FOR 5500SOLiD LifeScope (using "reference" tag for the reference allele) 
				$refcall = $1;
				my $allelecall;
				if ($attribute =~ m#;allele\-call=([\w\-\/]+)#) {
					$allelecall = $1;
				} else {
					print STDERR "WARNING: skipping record without allele-call attribute: <$_>\n";
					next;
				}
				foreach my $item (split (/\//, $allelecall)) {		#collecting unique alleles in order of first appearance
					$item eq 'possibleOthers' and next;		#ignore the "possibleOthers" allele
					$temphash{$item}++ or push @call, $item;
				}

				my $nwritten = 0;		#lines written for this record; every line after the first is an additional allele
				if ($allelecall eq '-/-') { # a simple deletion ["allele-call=-/-"] (for the end position, "$end" is already not used)
					$writevar->($pos, $pos+length($refcall)-1, $refcall, '-', $zygosity, $readcount, $mutallelecount);
				} elsif ($refcall eq '-') { # a simple insertion (single or multiple allele) ["reference=-"]
					for my $allele (@call) {
						$refcall eq $allele and next;
						$writevar->($pos, $pos, '-', $allele, $zygosity, $readcount, $mutallelecount);
						$nwritten++ and $countallvar++;
					}
				} else { # an indel that may have several alleles, or may require a block substitution representation
					for my $allele (@call) {
						$refcall eq $allele and next;
						# the end position is derived from the reference allele length
						$writevar->($pos, $pos+length($refcall)-1, $refcall, $allele, $zygosity, $readcount, $mutallelecount);
						$nwritten++ and $countallvar++;
					}
				} 
			} elsif ($attribute =~ m#allele\-call=([\w\/]+)#) {
				@call = split (/\//, $1);		#first element is the reference allele (empty for an insertion)
				
				if (@call == 1) {		#a simple deletion
					$writevar->($pos, $end, $call[0], '-', $zygosity, $readcount, $mutallelecount);
				} elsif ($call[0] eq '') {	#a simple insertion (single or multiple allele)
					for my $i (1 .. @call-1) {
						$writevar->($pos, $pos, '-', $call[$i], $zygosity, $readcount, $mutallelecount);
						$i > 1 and $countallvar++;
					}
				} else {			#an indel that may have several alleles, or may require a block substitution representation
					for my $i (1 .. @call-1) {
						$writevar->($pos, $pos+length($call[0])-1, $call[0], $call[$i], $zygosity, $readcount, $mutallelecount);
						$i > 1 and $countallvar++;
					}
				}
			} else {
				#no allele information at all: '0' is written as the reference allele
				$writevar->($pos, $end, '0', '-', $zygosity, $readcount, $mutallelecount);
			}
		} else {
			die "Error: unrecognizable genotype calling program encountered (valid types are SOLiD_diBayes, AB_CNV_PIPELINE, AB_SOLiD Large Indel Tool, AB_SOLiD Small Indel Tool): <$_>\n";
		}
			
		$countvar++;		#variation positions
		$countallvar++;		#all variants (maybe several at one variation position)
	}
	print STDERR "NOTICE: Finished processing $variantfile with $countline input lines\n";
	print STDERR "NOTICE: Wrote variants in $countvar variation positions ($countallvar variants considering multi-allelic ones)\n";
	$unknown_count and print STDERR "WARNING: $unknown_count variants with 'unknown' variation type were skipped\n";
}


#Convert SOAPsnp / SOAPindel output into ANNOVAR input lines written to STDOUT.
#
#Argument:
#	$variantfile	path of the input file; the literal 'stdin' reads STDIN, a name ending in .gz is read through gunzip
#
#Each tab-delimited line is classified by its number of fields:
#	18 or 17 fields	SNP record: chr, position, reference base, IUPAC consensus call, then other columns
#			(the other columns are printed only when -includeinfo is set)
#	6 fields	indel record: chr, position, strand, signed indel length, sequence, Homo/Hete
#Any other field count, an unknown IUPAC code or an inconsistent indel length is a fatal error.
#
#File-level settings read here: %iupac and $includeinfo.
sub convertSOAP {
	my ($variantfile) = @_;
	my ($countline, $countvar) = (0, 0);

	if ($variantfile eq 'stdin') {
		*VAR = *STDIN;
	} elsif ($variantfile =~ m/\.gz$/) {
		#list-form pipe open: the file name is passed to gunzip as-is and never interpreted by a shell
		open (VAR, '-|', 'gunzip', '-c', $variantfile) or die "Error: cannot read from STDIN uncompressing variant file $variantfile: $!\n";
	} else {
		open (VAR, '<', $variantfile) or die "Error: cannot read from variant file $variantfile: $!\n";
	}

	while (<VAR>) {
		s/[\r\n]+$//;
		$countline++;
		
		my @field = split (/\t/, $_);
		if (@field == 18 or @field == 17) {		#snp file (both layouts are handled identically)
			my ($chr, $pos, $wt, $call, @other) = @field;
			$chr =~ s/^chr//;
	
			$includeinfo or @other = ();			#unless -includeinfo is set, the other will not be printed
	
			my $obs = $iupac{$call} or die "Error: invalid best call in <$_>\n";
			my @obs = split (//, $obs);
			@obs == 2 or die "Error: observed IUPAC allele $call should correspond to two nucleotide alleles: <$_>\n";
			if ($obs[0] eq $wt and $obs[1] eq $wt) {
				die "Error: reference alleles are identical to observed alleles: <$_>\n";
			} elsif ($obs[0] eq $wt) {
				print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[1], "\t", "het\t", join ("\t", @other), "\n";
			} elsif ($obs[1] eq $wt) {
				print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[0], "\t", "het\t", join ("\t", @other), "\n";
			} elsif ($obs[1] ne $obs[0]) {			#two different non-reference alleles: one line for each
				print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[0], "\t", "het\t", join ("\t", @other), "\n";
				print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[1], "\t", "het\t", join ("\t", @other), "\n";
				$countvar++;
			} else {
				print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[0], "\t", "hom\t", join ("\t", @other), "\n";
			}
		} elsif (@field == 6) {		#indel file
			my ($chr, $pos, undef, $indellen, $call, $homo) = @field;	#third column (strand) is not used
			$homo eq 'Homo' and $homo = 'hom';
			$homo eq 'Hete' and $homo = 'het';
			$chr =~ s/^chr//;
	
			if ($indellen =~ m/^\+(\d+)$/) {		#insertion
				length ($call) == $1 or die "Error: invalid record found in SOAPindel file: <$_>\n";
				print join("\t", $chr, $pos, $pos, '-', $call, $homo), "\n";
			} elsif ($indellen =~ m/^\-(\d+)$/) {		#deletion
				length ($call) == $1 or die "Error: invalid record found in SOAPindel file: <$_>\n";
				print join("\t", $chr, $pos, $pos+$1-1, $call, '-', $homo), "\n";
			} else {
				die "Error: invalid record found in SOAPindel file: <$_>\n";
			}
		} else {
			die "Error: invalid record found in $variantfile (18, 17 or 6 fields expected, observed ${\(scalar @field)} fields): <$_>\n";
		}
		$countvar++;
	}
	print STDERR "NOTICE: Read $countline lines and wrote $countvar variants\n";
}

#Print the lines of the avsnp file (file-level $avsnpfile) whose last field is an rs identifier listed in the input file.
#
#Argument:
#	$variantfile	path of a text file; every line starting with an rs identifier (rs followed by digits) contributes that identifier
#
#Matching lines of $avsnpfile are written unchanged to STDOUT, in the order they occur in that file.
#Dies when either file cannot be opened.
sub convertAvsnpid {
	my ($variantfile) = @_;
	my %wanted;		#rs identifiers requested in the input file
	
	open (VAR, '<', $variantfile) or die "Error: cannot read from variant file $variantfile: $!\n";
	while (<VAR>) {
		s/[\r\n]+$//;
		m/^(rs\d+)/ or next;		#lines not starting with an rs identifier are ignored
		$wanted{$1} = 1;
	}
	close (VAR);
	
	open (AVSNP, '<', $avsnpfile) or die "Error: cannot read from avsnpfile $avsnpfile: $!\n";
	while (<AVSNP>) {
		s/[\r\n]+$//;
		m/(rs\d+)$/ or next;		#the rs identifier must be at the very end of the line
		exists $wanted{$1} and print $_, "\n";
	}
	close (AVSNP);
}

#Convert a list of dbSNP rs identifiers into ANNOVAR input lines, using the dbSNP table given by the file-level $dbsnpfile.
#
#Argument:
#	$variantfile	path of a text file; every line starting with an rs identifier is used, the rest of the line
#			(after optional whitespace) is kept and appended to each output line for that identifier
#
#The dbSNP file is tab-delimited; columns 2,3,4,5,7,9,10,12 are read as chr, start, end, id, strand, reference allele,
#observed alleles (separated by /) and class (presumably the UCSC snp table layout -- verify against the file in use).
#Only records of class single, deletion, in-del or insertion are used. One line is written per non-reference allele:
#chr, start, end, ref, allele, id and the optional extra text. Alleles on the - strand are reverse-complemented.
#
#Dies when a file cannot be opened; a summary is written to STDERR.
sub convertRsid {
	my ($variantfile) = @_;
	my %info;		#key: requested rs identifier, value: the remaining text of its input line
	
	open (VAR, '<', $variantfile) or die "Error: cannot read from variant file $variantfile: $!\n";
	while (<VAR>) {
		s/[\r\n]+$//;
		m/^(rs\d+)\s*(.*)/ or next;
		$info{$1} = $2;
	}
	close (VAR);
		
	my (%found, %founddup);		#whether a SNP is found, whether a SNP is found multiple times (multiple mapping SNPs)
	open (DBSNP, '<', $dbsnpfile) or die "Error: cannot read from dbSNP file\n";
	print STDERR "NOTICE: Scanning dbSNP file $dbsnpfile...\n";
	while (<DBSNP>) {
		s/[\r\n]+$//;			#so that the last column never carries the line terminator
		my @record = split (/\t/, $_);
		@record >= 12 or next;		#too short to contain the class column
		my ($chr, $start, $end, $id, $strand, $ref, $twoallele, $class) = @record[1,2,3,4,6,8,9,11];
		unless ($class eq 'single' or $class eq 'deletion' or $class eq 'in-del' or $class eq 'insertion') {	#enum('unknown','single','in-del','het','microsatellite','named','mixed','mnp','insertion','deletion')
			next;
		}
		exists $info{$id} or next;	#not relevant so should skip this
		
		#the start is shifted by one unless the reference allele is '-' (rs35561142      0       +       -       -       -/AT    genomic insertion)
		#this is decided once per record, so that every alternative allele of the record gets the same start
		$ref eq '-' or $start++;
		
		my @allele = split (/\//, $twoallele);
		if ($strand eq '-') {					#handle reverse strand annotation (the vast majority of records in dbSNP should be already in + strand)
			for my $i (0 .. @allele-1) {
				$allele[$i] = revcom ($allele[$i]);	#for example, rs2363296 needs to be reversed due to being negative strand
			}
		}
		
		for my $allele (@allele) {
			$ref eq $allele and next;		#this is the reference allele
			
			if ($class eq 'in-del') {
				$allele eq '-' and next;	#rs10552169      0       +       A       A       -/ATAAA genomic in-del
			}
			
			print join ("\t", $chr, $start, $end, $ref, $allele, $id);
			$info{$id} and print "\t$info{$id}";
			print "\n";
		}
		
		$found{$id}++ and $founddup{$id}++;	#a second or later record for the same identifier means multiple mapping
	}
	close (DBSNP);
	print STDERR "NOTICE: input file contains ", scalar (keys %info), " rs identifiers, output file contains information for ", scalar (keys %found), " rs identifiers\n";
	%founddup and print STDERR "WARNING: ", scalar (keys %founddup), " rs identifiers have multiple records (due to multiple mapping) and they are all written to output\n";
}

#Return the reverse complement of a nucleotide string.
#Only a, c, g, t (either case) are complemented; every other character is kept as it is, in its reversed position.
sub revcom {
	my $forward = shift;
	my $complement = scalar reverse $forward;	#reverse the string first
	$complement =~ tr/acgtACGT/tgcaTGCA/;		#then swap each base for its partner
	return ($complement);
}

#Validate a file that is already in ANNOVAR input format and copy its valid lines unchanged to STDOUT.
#
#Argument:
#	$variantfile	path of the input file; the literal 'stdin' reads STDIN, a name ending in .gz is read through gunzip
#
#Each line must have at least 5 whitespace-delimited fields (chr, start, end, ref, obs), otherwise the function dies.
#A line is counted as invalid and dropped when the chromosome contains characters other than word characters and '.',
#start/end are not plain digits, both alleles are '-', an allele has characters outside ACGT, 0 and '-',
#start is larger than end, or the reference length does not match the start-end span (a reference of '0' is exempt).
#A summary (SNP, transition/transversion and indel counts, number of invalid lines) is written to STDERR.
sub convertANNOVAR {
	my ($variantfile) = @_;
	my ($countsnp, $countindel, $invalid) = (0, 0, 0);
	my ($countti, $counttv) = (0, 0);
	
	if ($variantfile eq 'stdin') {
		*VAR = *STDIN;
	} elsif ($variantfile =~ m/\.gz$/) {
		#list-form pipe open: the file name is passed to gunzip as-is and never interpreted by a shell
		open (VAR, '-|', 'gunzip', '-c', $variantfile) or die "Error: cannot read from STDIN uncompressing variant file $variantfile: $!\n";
	} else {
		open (VAR, '<', $variantfile) or die "Error: cannot read from variant file $variantfile: $!\n";
	}

	while (<VAR>) {
		s/[\r\n]+$//; 
		my @field = split (/\s+/, $_);
		@field >= 5 or die "Error: invalid record found in annovar input file (at least 5 tab or space delimited fields expected): <$_>\n";
		my ($chr, $start, $end, $ref, $obs) = @field;

		($ref, $obs) = (uc $ref, uc $obs);	#alleles are validated case-insensitively; the line itself is printed as it was read
		$chr =~ s/^chr//;
		
		#the textual checks on start/end come first, so the numeric comparisons below only see digit strings
		if ($chr =~ m/[^\w\.]/ 				#chr name could contain . (example: GL000212.1)
			or $start =~ m/[^\d]/ 			#start is not a number
			or $end =~ m/[^\d]/ 			#end is not a number
			or $ref eq '-' and $obs eq '-' 		#both are empty allele
			or $ref =~ m/[^ACTG0\-]/ 		#non-standard nucleotide code
			or $obs =~ m/[^ACGT0\-]/ 		#non-standard nucleotide code
			or $start > $end			#start is more than end
			or $ref ne '0' and $end-$start+1 != length ($ref) 	#length mismatch with ref
			or $ref eq '-' and $start != $end	#length mismatch for insertion
			) {
			$invalid++;
			next;
		}
		print "$_\n";

		if ($ref =~ m/^[ACGT]$/ and $obs =~ m/^[ACGT]$/) {
			if ($ref eq 'A' and $obs eq 'G' or $ref eq 'G' and $obs eq 'A' or $ref eq 'C' and $obs eq 'T' or $ref eq 'T' and $obs eq 'C') {
				$countti++;
			} else {
				$counttv++;	#every other single-base pair, including identical bases, is counted here
			}
			$countsnp++;
		} else {
			$countindel++;
		}
	}
	$invalid and print STDERR "WARNING: $invalid input lines have invalid formats\n";
	print STDERR "NOTICE: Finished writing $countsnp SNPs ($countti transitions and $counttv transversions) and $countindel indels/substitutions for 1 sample\n";

}

sub convertANNOVAR2VCF {
	my ($variantfile) = @_;
	my ($countline, $countvar, $countsnp, $countindel, $invalid) = (0, 0, 0, 0, 0);
	my ($countti, $counttv) = (0, 0);
	
	if ($variantfile eq 'stdin') {
		*VAR = *STDIN;
	} elsif ($variantfile =~ m/\.gz$/) {
		open (VAR, "gunzip -c $variantfile |") or die "Error: cannot read from STDIN uncompressing variant file $variantfile: $!\n";
	} else {
		open (VAR, $variantfile) or die "Error: cannot read from variant file $variantfile: $!\n";
	}

	print "##fileformat=VCFv4.0\n";
	print qq/##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">\n/;
	print qq/##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth (only filtered reads used for calling)">\n/;
	print qq/##FORMAT=<ID=GQ,Number=1,Type=Float,Description="Genotype Quality">\n/;
	print qq/##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n/;
	print qq/##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">\n/;
	print qq/##INFO=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">\n/;
	print qq/##INFO=<ID=DP,Number=1,Type=Integer,Description="Read Depth (only filtered reads used for calling)">\n/;
	print qq/##INFO=<ID=GQ,Number=1,Type=Float,Description="Genotype Quality">\n/;
	print qq/##INFO=<ID=GT,Number=1,Type=String,Description="Genotype">\n/;
	print qq/##INFO=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">\n/;
	print qq/##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">\n/;
	print qq/##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">\n/;
	print qq/##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">\n/;
	print qq/##INFO=<ID=DS,Number=0,Type=Flag,Description="Were any of the samples downsampled?">\n/;
	print qq/##INFO=<ID=Dels,Number=1,Type=Float,Description="Fraction of Reads Containing Spanning Deletions">\n/;
	print qq/##INFO=<ID=FS,Number=1,Type=Float,Description="Phred-scaled p-value using Fisher's exact test to detect strand bias">\n/;
	print qq/##INFO=<ID=HRun,Number=1,Type=Integer,Description="Largest Contiguous Homopolymer Run of Variant Allele In Either Direction">\n/;
	print qq/##INFO=<ID=HaplotypeScore,Number=1,Type=Float,Description="Consistency of the site with at most two segregating haplotypes">\n/;
	print qq/##INFO=<ID=InbreedingCoeff,Number=1,Type=Float,Description="Inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation">\n/;
	print qq/##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">\n/;
	print qq/##INFO=<ID=MQ0,Number=1,Type=Integer,Description="Total Mapping Quality Zero Reads">\n/;
	print qq/##INFO=<ID=MQRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities">\n/;
	print qq{##INFO=<ID=QD,Number=1,Type=Float,Description="Variant Confidence/Quality by Depth">\n};
	print qq/##INFO=<ID=ReadPosRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias">\n/;
	print qq/##INFO=<ID=VQSLOD,Number=1,Type=Float,Description="Log odds ratio of being a true variant versus being false under the trained gaussian mixture model">\n/;
	print qq/##INFO=<ID=culprit,Number=1,Type=String,Description="The annotation which was the worst performing in the Gaussian mixture model, likely the reason why the variant was filtered out">\n/;
	
	print qq/##INFO=<ID=DP4,Number=4,Type=Integer,Description="Number of 1) forward ref alleles; 2) reverse ref; 3) forward non-ref; 4) reverse non-ref alleles, used in variant calling. Sum can be smaller than DP because low-quality bases are not counted">\n/;
	
	print join("\t", "#CHROM", qw/POS     ID        REF ALT    QUAL FILTER INFO FORMAT SAMPLE/), "\n";
	while (<VAR>) {
		$countline++;
		s/[\r\n]+$//; 
		my @field = split (/\s+/, $_);
		@field >= 5 or die "Error: invalid record found in annovar input file (at least 5 tab or space delimited fields expected): <$_>\n";
		my ($chr, $start, $end, $ref, $obs) = @field;

		
		($ref, $obs) = (uc $ref, uc $obs);
		$chr =~ s/^chr//;
		if ($chr =~ m/[^\w\.]/ or $start =~ m/[^\d]/ or $end =~ m/[^\d]/) {		#chr name could contain . (example: GL000212.1)
			$invalid++;
			next;
		} elsif ($ref eq '-' and $obs eq '-' 		#both are empty allele
			or $ref =~ m/[^ACTG0\-]/ 		#non-standard nucleotide code
			or $obs =~ m/[^ACGT0\-]/ 		#non-standard nucleotide code
			or $start =~ m/[^\d]/ 			#start is not a number
			or $end =~ m/[^\d]/ 			#end is not a number
			or $start > $end			#start is more than end
			or $ref ne '0' and $end-$start+1 != length ($ref) 	#length mismatch with ref
			or $ref eq '-' and $start != $end	#length mismatch for insertion
			) {
			$invalid++;
			next;
		}
				
		print "$chr\t$start\t.\t$ref\t$obs", (defined $field[6])?"\t$field[6]":".", "\t.", (defined $field[7])?"\t$field[7]":".";
		if (defined $field[5]) {
			if ($field[5] eq 'het') {
				print "\tGT\t0/1";
			} elsif ($field[5] eq 'hom') {
				print "\tGT\t1/1";
			} else {
				print "\tGT\t0/0";
			}
		}
		print "\n";
		
		$countvar++;


		if ($ref =~ m/^[ACGT]$/ and $obs =~ m/^[ACGT]$/) {
			if ($ref eq 'A' and $obs eq 'G' or $ref eq 'G' and $obs eq 'A' or $ref eq 'C' and $obs eq 'T' or $ref eq 'T' and $obs eq 'C') {
				$countti++;
			} else {
				$counttv++;
			}
			$countsnp++;
		} else {
			$countindel++;
		}

	}
	$invalid and print STDERR "WARNING: $invalid input lines have invalid formats\n";
	print STDERR "NOTICE: Finished writing $countsnp SNPs ($countti transitions and $counttv transversions) and $countindel indels/substitutions for 1 sample\n";

}

#Convert MAQ SNP calls (12 tab-delimited columns) or MAQ indel calls (13 columns) into ANNOVAR input lines.
#Argument:
#	$variantfile - path of the MAQ file; the literal string 'stdin' reads standard input and a name ending in .gz is decompressed through gunzip
#Output (written with print to the default output handle), one line per variant:
#	chr (leading "chr" removed), start, end, ref, obs, het/hom, then the input columns that follow the call column
#	(or the complete input line when the global $includeinfo is set)
#SNP calls are IUPAC codes translated through the global %iupac table.
#Dies on records that do not have 12 or 13 fields, on an unknown IUPAC code, on a code that does not expand to
#exactly two alleles, and on a call that is identical to the reference.
sub convertMAQSNP {
	my ($variantfile) = @_;
	my ($countline, $countvar) = (0, 0);		#start from zero so that the final NOTICE is well defined even for empty input
	
	if ($variantfile eq 'stdin') {
		*VAR = *STDIN;
	} elsif ($variantfile =~ m/\.gz$/) {
		#list-form pipe open hands the file name to gunzip as a single argument, so no shell ever interprets it
		open (VAR, '-|', 'gunzip', '-c', $variantfile) or die "Error: cannot read from STDIN uncompressing variant file $variantfile: $!\n";
	} else {
		#explicit read mode: characters such as > or | in the file name are never treated as an open mode
		open (VAR, '<', $variantfile) or die "Error: cannot read from variant file $variantfile: $!\n";
	}

	while (<VAR>) {
		s/[\r\n]+$//;
		$countline++;
		
		my @field = split (/\t/, $_);
		if (@field == 12) {					#SNPs
			my ($chr, $pos, $wt, $call, @other) = @field;
			$chr =~ s/^chr//;
	
			$includeinfo and @other = @field;		#-includeinfo appends the complete input line; otherwise only the columns after the call are appended
	
			my $obs = $iupac{$call} or die "Error: invalid best call in <$_>\n";
			my @obs = split (//, $obs);
			@obs == 2 or die "Error: observed IUPAC allele $call should correspond to two nucleotide alleles: <$_>\n";
			
			my @variant;					#[observed allele, zygosity] pairs to be written for this site
			if ($obs[0] eq $wt and $obs[1] eq $wt) {
				die "Error: reference alleles are identical to observed alleles: <$_>\n";
			} elsif ($obs[0] eq $wt) {
				@variant = ([$obs[1], 'het']);
			} elsif ($obs[1] eq $wt) {
				@variant = ([$obs[0], 'het']);
			} elsif ($obs[1] ne $obs[0]) {			#two different non-reference alleles: one output line (and one count) per allele
				@variant = ([$obs[0], 'het'], [$obs[1], 'het']);
			} else {
				@variant = ([$obs[0], 'hom']);
			}
			for my $variant (@variant) {
				print join ("\t", $chr, $pos, $pos, $wt, @$variant, join ("\t", @other)), "\n";
				$countvar++;
			}
		} elsif (@field == 13) {				#indels; the deletion start site does not need changes; the duplication start site needs additional investigation by ANNOVAR developers
			my ($chr, $pos, undef, undef, $call, @other) = @field;		#columns 3 and 4 are not used by the conversion
			$chr =~ s/^chr//;
	
			$includeinfo and @other = @field;		#-includeinfo appends the complete input line; otherwise only the columns after the call are appended
	
			my @obs = split (/:/, $call);			#the call is split on ':' into a signed size and a sequence
			@obs == 2 or die "Error: observed IUPAC allele $call should correspond to two nucleotide alleles: <$_>\n";
			if ($obs[0] =~ m/^\-(\d+)/) {			#deletion
				my $len = $1;
				print join ("\t", $chr, $pos, $pos+$len-1, $obs[1], '-', 'het', join ("\t", @other)), "\n";
				$countvar++;
			} elsif ($obs[0] =~ m/^(\d+)/) {		#insertion
				print join ("\t", $chr, $pos-1, $pos-1, '-', $obs[1], 'het', join ("\t", @other)), "\n";	#2011jul12: changed pos to pos-1 for insertions
				$countvar++;
			} else {					#nothing is written for this record, so it is reported and not counted
				print STDERR "WARNING: skipped indel record with unrecognized call <$call>: <$_>\n";
			}
		} else {
			die "Error: invalid record found in $variantfile (12 or 13 fields expected, observed ${\(scalar @field)} fields): <$_>\n";
		}
	}
	
	if ($variantfile ne 'stdin') {
		#closing a pipe also waits for gunzip, so a failed decompression is reported instead of passing silently
		close (VAR) or print STDERR "WARNING: problem found when closing variant file $variantfile", ($! ? ": $!" : " (exit status $?)"), "\n";
	}
	print STDERR "NOTICE: Read $countline lines and wrote $countvar variants\n";
}

#Convert Illumina CASAVA SNP or indel calls into ANNOVAR input lines.
#Arguments:
#	$variantfile - path of the CASAVA file; the literal string 'stdin' reads standard input and a name ending in .gz is decompressed through gunzip
#	$chr         - optional chromosome name; when defined it is printed instead of the seq_name column (which can hold values such as chr1.fa)
#The column layout is learned from the '#'-prefixed header line(s): a header containing Q(snp) selects SNP mode and
#one containing Q(indel) selects indel mode. Data lines seen before such a header make the subroutine die.
#Output (written with print to the default output handle), one line per variant:
#	seq_name, start, end, ref, obs, het/hom, then either the complete input line (global $includeinfo set) or the
#	quality score followed by a DP=..[;GQ=..] annotation when the corresponding columns exist
#Globals read: $includeinfo, $varqual, $coverage, $context.
#NOTE(review): the -varqual and -coverage filters sit in the branch that is skipped when -includeinfo is set, so they
#are not applied together with -includeinfo; this matches the previous behaviour and is left unchanged - confirm intent.
sub convertCASAVA {
	my ($variantfile, $chr) = @_;
	my ($countline, $countvar) = (0, 0);		#start from zero so that the final NOTICE is well defined even for empty input
	
	my ($intype);
	my ($seq_name_index, $pos_index, $call_index, $reference_index);
	my ($up_index, $ref_indel_index, $quality_index, $maxgtype_index, $gq_index, $a_index, $c_index, $g_index, $t_index, $depth_index);
	
	#header names that unconditionally set (or overwrite) one column index; names with special rules are handled in the header loop
	my %column_slot = (
		'seq_name'		=> \$seq_name_index,
		'position'		=> \$pos_index,
		'pos'			=> \$pos_index,
		'max_gt|poly_site'	=> \$call_index,		#this has priority over max_gt per se
		'Q(max_gt|poly_site)'	=> \$gq_index,			#this has priority over Q(max_gt)
		'Q(max_gtype)'		=> \$gq_index,
		'reference'		=> \$reference_index,
		'ref'			=> \$reference_index,
		'ref_upstream'		=> \$up_index,
		'ref/indel'		=> \$ref_indel_index,
		'max_gtype'		=> \$maxgtype_index,
		'A_used'		=> \$a_index,
		'C_used'		=> \$c_index,
		'G_used'		=> \$g_index,
		'T_used'		=> \$t_index,
		'depth'			=> \$depth_index,
	);

	if ($variantfile eq 'stdin') {
		*VAR = *STDIN;
	} elsif ($variantfile =~ m/\.gz$/) {
		#list-form pipe open hands the file name to gunzip as a single argument, so no shell ever interprets it
		open (VAR, '-|', 'gunzip', '-c', $variantfile) or die "Error: cannot read from STDIN uncompressing variant file $variantfile: $!\n";
	} else {
		#explicit read mode: characters such as > or | in the file name are never treated as an open mode
		open (VAR, '<', $variantfile) or die "Error: cannot read from variant file $variantfile: $!\n";
	}

	#example header and record of an indel file
	#$ COLUMNS seq_name pos type ref_upstream ref/indel ref_downstream Q(indel) max_gtype Q(max_gtype) depth alt_reads indel_reads other_reads repeat_unit ref_repeat_count indel_repeat_count
	#chr22	16264718	1D	AGAAAAAAGT	A/-	GTGATAACCT	69	hom	5	2	0	2	0	A	1	0
	#example header and record of a SNP file
	#$ COLUMNS seq_name pos bcalls_used bcalls_filt ref Q(snp) max_gt Q(max_gt) max_gt|poly_site Q(max_gt|poly_site) A_used C_used G_used T_used
	#chr22	16060453	1	2	G	4	GG	2	AG	3	1	0	0	0

	LINE: while (<VAR>) {
		s/[\r\n]+$//;
		$countline++;
		my @field;

		if (m/^#/) {						#comment line; only lines naming a position column are treated as column headers
			s/^#//;
			if (s/^\$\sCOLUMNS\s//) {
				@field = split (/\s+/, $_);
			} else {
				@field = split (/\t/, $_);
			}
			if (m/\bposition\b/ or m/\bpos\b/) {
				for my $i (0 .. $#field) {
					my $name = $field[$i];
					if ($name eq 'Q(snp)' or $name eq 'Q(indel)') {		#this column decides the variant type of the file
						$intype = $name eq 'Q(snp)' ? 'snp' : 'indel';
						$quality_index = $i;
						print STDERR "NOTICE: Automatically detected input type as $intype\n";
					} elsif ($name eq 'modified_call' or $name eq 'max_gt') {
						defined $call_index or $call_index = $i;	#never overrides an index that is already set (e.g. by max_gt|poly_site)
					} elsif ($name eq 'Q(max_gt)') {
						defined $gq_index or $gq_index = $i;		#never overrides an index that is already set (e.g. by Q(max_gt|poly_site))
					} elsif (exists $column_slot{$name}) {
						${$column_slot{$name}} = $i;
					}
				}
			}
			next LINE;
		}
		
		$intype or die "Error: unable to recognize the correct type of the input file (make sure that header line is present in $variantfile)\n";
		@field = split (/\t/, $_);
		
		if ($intype eq 'snp') {					#SNPs
			#chr21.fa	9411785	1	0	G	11	GT	3	GT	3	0	0	0	1
			#chr21.fa	9415181	2	0	C	52	TT	5	TT	5	0	0	0	2
			defined $pos_index and defined $reference_index and defined $call_index or die "Error: unalbe to find the position, reference and modified_call column header in $variantfile\n";
			my ($pos, $wt, $obs) = @field[$pos_index, $reference_index, $call_index];
			defined $pos and defined $wt and defined $obs or die "Error: invalid record found in $variantfile (position, reference or genotype call is missing): <$_>\n";
			
			#the chromosome comes from the $chr argument when given, otherwise from the seq_name column
			my $seq_name;
			if (defined $chr) {
				$seq_name = $chr; 	#sometimes seq_name can be things like chr1.fa, etc.
			} elsif (defined $seq_name_index) {
				$seq_name = $field[$seq_name_index];
			} else {
				die "Error: unable to find the seq_name column header in $variantfile (use the --chr argument to specify the chromosome)\n";
			}
			
			my (@other);
			if ($includeinfo) {
				@other = @field;
			} else {
				if (defined $quality_index) {		#an index of 0 is a valid column, so test definedness rather than truth
					push @other, $field[$quality_index];
					
					if (defined $varqual) {			#skip variant (alternative allele call) that have low quality score
						$field[$quality_index] ne '.' and $field[$quality_index] < $varqual and next LINE;
					}
				}
				if (defined $a_index and defined $c_index and defined $g_index and defined $t_index) {
					if (defined $coverage) {		#total number of used base calls must reach the threshold
						$field[$a_index]+$field[$c_index]+$field[$g_index]+$field[$t_index] < $coverage and next LINE;
					}
					
					#depth supporting the called genotype: sum of used base calls over the nucleotides present in the call
					my %used_index = (A => $a_index, C => $c_index, G => $g_index, T => $t_index);
					my $dp = 0;
					for my $base (qw/A C G T/) {
						$obs =~ m/$base/ and $dp += $field[$used_index{$base}];
					}
					
					if ($coverage) {
						$dp >= $coverage or next LINE;			#does not meet coverage threshold
					}
					if (defined $gq_index) {
						push @other, "DP=$dp;GQ=$field[$gq_index]";
					} else {
						push @other, "DP=$dp";
					}
				}
			}
			
			length ($obs) == 1 and $obs .= $obs;		#a single-letter call is treated as homozygous for that allele
			my @obs = split (//, $obs);
			@obs == 2 or die "Error: observed allele $obs should correspond to two nucleotide alleles: <$_>\n";
			
			my @variant;					#[observed allele, zygosity] pairs to be written for this site
			if ($obs[0] eq $wt and $obs[1] eq $wt) {
				die "Error: reference alleles are identical to observed alleles: <$_>\n";
			} elsif ($obs[0] eq $wt) {
				@variant = ([$obs[1], 'het']);
			} elsif ($obs[1] eq $wt) {
				@variant = ([$obs[0], 'het']);
			} elsif ($obs[1] ne $obs[0]) {			#two different non-reference alleles: one output line (and one count) per allele
				@variant = ([$obs[0], 'het'], [$obs[1], 'het']);
			} else {
				@variant = ([$obs[0], 'hom']);
			}
			for my $variant (@variant) {
				print join ("\t", $seq_name, $pos, $pos, $wt, @$variant, join ("\t", @other)), "\n";
				$countvar++;
			}
		} elsif ($intype eq 'indel') {				#indels
			defined $pos_index and defined $ref_indel_index and defined $maxgtype_index or die "Error: unable to find the pos, ref_indel and max_gtype column header in $variantfile\n";
			my ($pos, $call, $hom) = @field[$pos_index, $ref_indel_index, $maxgtype_index];
			
			#the chromosome comes from the $chr argument when given, otherwise from the seq_name column
			my $seq_name;
			if (defined $chr) {
				$seq_name = $chr; 	#sometimes seq_name can be things like chr1.fa, etc.
			} elsif (defined $seq_name_index) {
				$seq_name = $field[$seq_name_index];
			} else {
				die "Error: unable to find the seq_name column header in $variantfile (use the --chr argument to specify the chromosome)\n";
			}
			
			my (@other);
			if ($includeinfo) {
				@other = @field;
			} else {
				if (defined $quality_index) {		#an index of 0 is a valid column, so test definedness rather than truth
					push @other, $field[$quality_index];
				}
				if (defined $depth_index) {
					if ($coverage) {
						$field[$depth_index] >= $coverage or next LINE;		#does not meet coverage threshold
					}
					if (defined $gq_index) {
						push @other, "DP=$field[$depth_index];GQ=$field[$gq_index]";
					} else {
						push @other, "DP=$field[$depth_index]";
					}
				}
			}

			#hg19 coordinate below; insertion needs position adjustment!!! deletion is fine
			#948847  1I      CCTCAGGCTT      -/A     ATAATAGGGC      969     hom     47      het     22      0       16      6       A       1       2
			#978604  2D      CACTGAGCCC      CT/--   GTGTCCTTCC      251     hom     20      het     8       0       4       4       CT      1       0
			#1276974 4I      CCTCATGCAG      ----/ACAC       ACACATGCAC      838     hom     39      het     18      0       14      4       AC      2       4
			#185137455     11I10M2I        TATGTGTCCT      -----------TTTTTTATTT--/AAATGATAGACTTTTTTTTTTAA ATTTCAGAAA      1126    het     988     hom    45       20      24      7       N/A     0       0
			
			my @obs = split (/\//, $call);
			@obs == 2 or die "Error: observed indel allele $call should correspond to two alleles: <$_>\n";
			
			my $is_insertion = $obs[0] =~ m/^\-+$/;
			my $is_deletion = (not $is_insertion and $obs[1] =~ m/^\-+$/);
			
			#with -context (useful to convert to VCF files) the last base of ref_upstream is prepended to both alleles of an insertion or deletion
			my $anchor;
			if ($context and ($is_insertion or $is_deletion)) {
				defined $up_index or die "Error: unable to find the ref_upstream column header in $variantfile (required by the --context argument)\n";
				$anchor = substr ($field[$up_index], -1, 1);
			}
			
			my ($start, $end, $ref, $alt);
			if ($is_insertion) {
				if ($context) {
					($start, $end, $ref, $alt) = ($pos-1, $pos-1, $anchor, $anchor.$obs[1]);
				} else {
					($start, $end, $ref, $alt) = ($pos-1, $pos-1, '-', $obs[1]);
				}
			} elsif ($is_deletion) {
				my $len = length ($obs[0]);
				if ($context) {
					($start, $end, $ref, $alt) = ($pos-1, $pos+$len-1, $anchor.$obs[0], $anchor);
				} else {
					($start, $end, $ref, $alt) = ($pos, $pos+$len-1, $obs[0], '-');
				}
			} elsif (length ($obs[0]) == length ($obs[1])) {	#block substitution (printed identically with or without -context)
				$obs[0] =~ s/\-//g;
				$obs[1] =~ s/\-//g;
				($start, $end, $ref, $alt) = ($pos, $pos+length($obs[0])-1, $obs[0], $obs[1]);
			} else {
				die "Error: invalid record found in indel line: <$_>\n";
			}
			print join ("\t", $seq_name, $start, $end, $ref, $alt, $hom, join ("\t", @other)), "\n";
			$countvar++;
		} else {						#not reachable while $intype is only ever set to 'snp' or 'indel'; kept as a safeguard
			die "Error: invalid record found in $variantfile (11 or 15 fields expected, observed ${\(scalar @field)} fields): <$_>\n";
		}
	}
	
	if ($variantfile ne 'stdin') {
		#closing a pipe also waits for gunzip, so a failed decompression is reported instead of passing silently
		close (VAR) or print STDERR "WARNING: problem found when closing variant file $variantfile", ($! ? ": $!" : " (exit status $?)"), "\n";
	}
	print STDERR "NOTICE: Read $countline lines and wrote $countvar variants\n";
}

sub convertVCF4Old {
	my ($variantfile) = @_;
	
	my ($countline, $countvar, $counthom, $counthet, $countunknown, $countindel, $countsnp, $countti, $counttv) = qw/0 0 0 0 0 0 0 0 0/;
	
	my ($source_program, $gtpos);		#the program that generated the VCF4 file; the GT position within FORMAT record

	if ($variantfile eq 'stdin') {
		*VAR = *STDIN;
	} elsif ($variantfile =~ m/\.gz$/) {
		open (VAR, "gunzip -c $variantfile |") or die "Error: cannot read from STDIN uncompressing variant file $variantfile: $!\n";
	} else {
		open (VAR, $variantfile) or die "Error: cannot read from variant file $variantfile: $!\n";
	}

	
	while (<VAR>) {
		$countline++;
		
		if (m/^##fileformat=VCFv(\d+\.)/) {
			$1<4 and print STDERR "ERROR: Input file is not in VCF version 4 format but is $_" and exit;
		}
		if (m/^##UnifiedGenotyper/) {
			$source_program = 'gatksnp';
			print STDERR "NOTICE: Detected that the VCF4 file is generated by GATK UnifiedGenotyper\n";
			$includeinfo or print STDERR "NOTICE: column 6-10 represent heterozygosity status, quality score, read depth, RMS mapping quality, quality by depth\n";
			$fraction and print STDERR "WARNING: the --fraction argument will be ignored for GATK SNP calls!!!\n";
			$confraction and print STDERR "WARNING: the --confraction argument will be ignored for GATK SNP calls!!!\n";
		}
		if (m/^##IndelGenotyper/) {
			$source_program = 'gatkindel';
			print STDERR "NOTICE: Detected that the VCF4 file is generated by GATK IndelGenotyper\n";
			$includeinfo or print STDERR "NOTICE: column 6-10 represent heterozygosity status, quality score, read depth, read count supporting indel call, RMS mapping quality\n";
		}
		
		if (not m/^#/ and not $source_program) {	#finished reading header line but did not detect the source program
			$includeinfo or print STDERR "NOTICE: for SNPs, column 6 and beyond MAY BE heterozygosity status, quality score, read depth, RMS mapping quality, quality by depth, if these information can be recognized automatically\n";
			$includeinfo or print STDERR "NOTICE: for indels, column 6 and beyond MAY BE heterozygosity status, quality score, read depth, read count supporting indel call, RMS mapping quality, if these information can be recognized automatically\n";
			$source_program = 'unknown';
		}
		
		if ($comment) {
			m/^#/ and print and next;
		} else {
			m/^#/ and next;		#skip comment lines
		}
		s/[\r\n]+$//;		#delete trailing new lines
		my $otherinfo = $_;	#this is the complete line (when -includeinfo is set, the entire line will be included in output file)
	
		#format description: http://www.1000genomes.org/wiki/Analysis/vcf4.0
		#standard VCF4 should have 8 columns, but some software may produce more columns (for example, for genotype calls). The first 8 columns should follow the specification
		
		#example of VCF4 generated by GATK SNP caller
		#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  SAMPLE
		#1       55      .       T       G       34.82   .       DP=2;Dels=0.00;HRun=0;HaplotypeScore=0.00;MQ=14.16;MQ0=0;QD=17.41;SB=-10.00     GT:DP:GL:GQ     0/1:1:-6.66,-0.30,-0.00:1.76
		#1       2646    .       G       A       40.91   .       DP=4;Dels=0.00;HRun=0;HaplotypeScore=0.00;MQ=7.50;MQ0=3;QD=10.23;SB=-10.00      GT:DP:GL:GQ     0/1:1:-7.27,-0.30,-0.00:1.76
		
		#example of VCF4 generated by GATK indel caller
		#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  SAMPLE
		#1       2525324 .       G       GC      .       PASS    AC=5,5;DP=12;MM=4.8,3.7142856;MQ=29.0,42.285713;NQSBQ=33.0,46.463768;NQSMM=0.24,0.20289855;SC=0,5,1,6  GT       0/1
		#1       3553372 .       GC      G       .       PASS    AC=6,6;DP=6;MM=0.8333333,0.0;MQ=60.0,0.0;NQSBQ=63.533333,0.0;NQSMM=0.0,0.0;SC=0,6,0,0   GT      1/0
		#1       6093011 .       CG      C       .       PASS    AC=31,31;DP=32;MM=0.7096774,2.0;MQ=59.64516,60.0;NQSBQ=64.192184,39.666668;NQSMM=0.0,0.11111111;SC=23,8,0,1     GT      1/0
		
		#example of VCF4 generated by 1000G
		#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO
		#1       533     .       G       C       .       PASS    AA=.;AC=6;AN=120;DP=423
		#1       41342   .       T       A       .       PASS    AA=.;AC=29;AN=120;DP=188
		#1       41791   .       G       A       .       PASS    AA=.;AC=5;AN=120;DP=192
		#1       44449   .       T       C       .       PASS    AA=C;AC=2;AN=120;DP=166
		#1       44539   rs2462492       C       T       .       PASS    AA=T;AC=2;AN=120;DP=131    
		
		#example of VCF4 generated by 1000G
		#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO
		#1       1000153 .       TCACA   T       100     PASS    AF=0.115095;HP=1;NF=16;NR=13;NS=52;CA=0;DP=615
		#1       1000906 .       CA      C       48      PASS    AF=0.0772696;HP=1;NF=2;NR=9;NS=51;CA=0;DP=281
		#1       1000950 rs60561655;-/G  CG      C       100     PASS    AF=0.447771;HP=5;DB;NF=10;NR=20;NS=50;CA=M;DP=291
		#1       1010786 rs36095298;-/G,mills,venter     A       AG      100     PASS    AF=0.774334;HP=1;DB;NF=21;NR=27;NS=51;CA=0;DP=306
		#1       1026158 .       T       TGGGGG  100     PASS    AF=0.115637;HP=1;NF=5;NR=2;NS=52;CA=0;DP=591
                
                #example of VCF4 generated by SamTools mpileup (Note that GT was not the first field in the FORMAT string)
                ##fileformat=VCFv4.0
		##samtoolsVersion=0.1.16 (r963:234)
		##fileformat=VCFv4.0
		#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  1247MFL0003.NOVO.srt.bam
		#chr1    14574   .       A       G       3.54    .       "DP=3;AF1=0.4999;CI95=0.5,0.5;DP4=1,0,2,0;MQ=21;PV4=1,1,1,1"    PL:GT:GQ        "31,0,34:0/1:32"
		#chr1    14930   .       A       G       37      .       "DP=19;AF1=0.5;CI95=0.5,0.5;DP4=7,5,5,1;MQ=25;PV4=0.6,6.3e-05,1,0.23"   PL:GT:GQ        "67,0,103:0/1:70"
		#chr1    16495   .       G       C       28      .       "DP=4;AF1=0.5;CI95=0.5,0.5;DP4=0,0,4,0;MQ=32"   PL:GT:GQ        "70,12,70:0/1:58"
		#chr1    59040   .       T       C       4.77    .       "DP=4;AF1=0.4999;CI95=0.5,0.5;DP4=0,2,2,0;MQ=22;PV4=0.33,0.21,1,1"      PL:GT:GQ        "33,0,39:0/1:35"
		#chr1    69270   .       A       G       46      .       "DP=20;AF1=0.5;CI95=0.5,0.5;DP4=2,0,18,0;MQ=24;PV4=1,1,1,0.28"  PL:GT:GQ        "94,18,100:0/1:78"
		#chr1    69511   .       A       G       24      .       "DP=5;AF1=0.5;CI95=0.5,0.5;DP4=1,0,2,1;MQ=25;PV4=1,0.46,1,0.039"        PL:GT:GQ        "54,0,57:0/1:55"
                
		#reserved VCF4 sub-fields in the INFO field
		#    * AA ancestral allele
		#    * AC allele count in genotypes, for each ALT allele, in the same order as listed
		#    * AF allele frequency for each ALT allele in the same order as listed: use this when estimated from primary data, not called genotypes
		#    * AN total number of alleles in called genotypes
		#    * BQ RMS base quality at this position
		#    * CIGAR cigar string describing how to align an alternate allele to the reference allele
		#    * DB dbSNP membership
		#    * DP combined depth across samples, e.g. DP=154
		#    * END end position of the variant described in this record (esp. for CNVs)
		#    * H2 membership in hapmap2
		#    * MQ RMS mapping quality, e.g. MQ=52
		#    * MQ0 Number of MAPQ == 0 reads covering this record
		#    * NS Number of samples with data
		#    * SB strand bias at this position
		#    * SOMATIC indicates that the record is a somatic mutation, for cancer genomics
		#    * VALIDATED validated by follow-up experiment


		#SAMtools/BCFtools specific information
		#SAMtools/BCFtools may write the following tags in the INFO field in VCF/BCF.
		#Tag	Description
		#I16	16 integers:
		#1	#reference Q13 bases on the forward strand 	2	#reference Q13 bases on the reverse strand
		#3	#non-ref Q13 bases on the forward strand 	4	#non-ref Q13 bases on the reverse strand
		#5	sum of reference base qualities 	6	sum of squares of reference base qualities
		#7	sum of non-ref base qualities 	8	sum of squares of non-ref base qualities
		#9	sum of ref mapping qualities 	10	sum of squares of ref mapping qualities
		#11	sum of non-ref mapping qualities 	12	sum of squares of non-ref mapping qualities
		#13	sum of tail distance for ref bases 	14	sum of squares of tail distance for ref bases
		#15	sum of tail distance for non-ref bases 	16	sum of squares of tail distance for non-ref
		#INDEL	Indicating the variant is an INDEL.
		#DP	The number of reads covering or bridging POS.
		#DP4	Number of 1) forward ref alleles; 2) reverse ref; 3) forward non-ref; 4) reverse non-ref alleles, used in variant calling. Sum can be smaller than DP because low-quality bases are not counted.
		#PV4	P-values for 1) strand bias (exact test); 2) baseQ bias (t-test); 3) mapQ bias (t); 4) tail distance bias (t)
		#FQ	Consensus quality. If positive, FQ equals the phred-scaled probability of there being two or more different alleles. If negative, FQ equals the minus phred-scaled probability of all chromosomes being identical. Notably, given one sample, FQ is positive at hets and negative at homs.
		#AF1	EM estimate of the site allele frequency of the strongest non-reference allele.
		#CI95	Equal-tail (Bayesian) credible interval of the site allele frequency at the 95% level.
		#PC2	Phred-scaled probability of the alternate allele frequency of group1 samples being larger (,smaller) than of group2 samples.
		#PCHI2	Posterior weighted chi^2 P-value between group1 and group2 samples. This P-value is conservative.
		#QCHI2	Phred-scaled PCHI2
		#RP	Number of permutations yielding a smaller PCHI2

		#example of triallelic variants generated by mpileup/bcftools
		#1       156706559       .       A       C,G     114     .	DP=20;AF1=1;CI95=1,1;DP4=0,0,1,19;MQ=60;FQ=-63  GT:PL:GQ	1/2:237,126,90,162,0,138:99
		#6       31129642        .       A       G,C     76      .	DP=31;AF1=1;CI95=1,1;DP4=0,0,28,3;MQ=60;FQ=-75  GT:PL:GQ	1/2:255,194,146,164,0,119:99
		#1       11297762        .       T       C,A     98      .	DP=19;AF1=1;CI95=1,1;DP4=0,0,17,1;MQ=60;FQ=-78  GT:PL:GQ	1/1:131,51,0,120,28,117:99
		
		my @field=split(/\t/,$_);
		@field >=8 or die "Error: invalid record found in VCF4 file (at least 8 tab-delimited fields expected): <$_>\n";
		my ($chr, $start, $ID, $ref_allele, $mut_allele, $quality_score, $filter, $info, $format, $sample) = @field;
		my ($end);
		my ($mut_allele2, $zygosity);
		
		if ($filterword) {		#ensure that the filter field contains the filterword
			$filter =~ m/\b$filterword\b/i or next;
		}
		
		$info =~ s/^"//; $info =~ s/"$//;
		
		#sometimes the alleles are not in the same case
		#chr1    1869771 1869774 actc    aCTctc          43.5    13      INDEL;DP=13;AF1=0.5;CI95=0.5,0.5;DP4=0,4,4,0;MQ=37;PV4=0.029,0.45,1,0.46
		$ref_allele = uc $ref_allele;
		$mut_allele = uc $mut_allele;
		
		#if ($ID eq '.' || $ID =~ /^rs/) {		#per MISHIMA, Hiroyuki suggestion (vcf4's third column (ID column) are not always ".")
		#	$end = $start;				#this block is commented out on 2011feb19
		#}
		
		if ($mut_allele eq '.') {			#no variant call was made at this position
			next;
		}
		
		if ($mut_allele =~ m/([^,]+),([\w,]+)/) {	#there could be more than two alternative alleles
			$mut_allele = $1;
			$mut_allele2 = $2;
		}
		
		if(length($ref_allele)==1 && length($mut_allele)==1) {  	### output snv
			if ($ref_allele =~ m/[^ACGTacgt]/ ) {
				print STDERR "WARNING: invalid allele record found in VCF4 file (ACGT expected): <$ref_allele> and <$mut_allele> in line <$_>\n";
				$ref_allele = 0;
			}
			if ( $mut_allele =~ m/[^ACGTacgt]/) {
				print STDERR "WARNING: invalid allele record found in VCF4 file (ACGT expected): <$ref_allele> and <$mut_allele> in line <$_>\n";
				$mut_allele = 0;
			}
				
			my ($unfiltered_read_depth) = $info =~ /DP=(\d+)/;
			my ($MappingQuality) = $info =~ /MQ=([^;]+)/; 
			my ($QualityByDepth) = $info =~ /QD=([^;]+)/;		
			

			if ($coverage) {
				defined $unfiltered_read_depth and $unfiltered_read_depth >= $coverage || next;
				if ($maxcoverage) {
					defined $unfiltered_read_depth and $unfiltered_read_depth <= $maxcoverage || next;
				}
			}
			
			if ($snpqual) {
				defined $QualityByDepth and $QualityByDepth >= $snpqual || next;		#the QD was used here as quality score
			}			
			
			if (defined $format) {
				my @format = split (/:/, $format);
				undef $gtpos;
				for my $i (0 .. @format-1) {
					if ($format[$i] eq 'GT') {
						$gtpos = $i;
						last;
					}
				}
				if (defined $sample and defined $gtpos) {
					my @sample = split (/:/, $sample);
					#if ($sample[$gtpos] =~ m#^0/1# or $sample[$gtpos] =~ m#^1/0#) {
					#	$zygosity = 'het';
					#	$counthet++;
					#} elsif ($sample[$gtpos] =~ m#^1/1#) {
					#	$zygosity = 'hom';
					#	$counthom++;
					#change the above lines to the following on 20120618
					if ($sample[$gtpos] =~ m#^(\d)/(\d)#) {
						if ($1 == $2) {
							$zygosity = 'hom';
							$counthom++;
						} else {
							$zygosity = 'het';
							$counthet++;
						}
					
					} else {
						$zygosity = 'unknown';
						$countunknown++;
					}
				} else {		#sometimes the input VCF file does not contain the GT field!!!
					$zygosity = 'unknown';
					$countunknown++;
				}
			} else {
				$zygosity = 'unknown';
				$countunknown++;
			}

			#the subject is called as homozygous for the first alternative allele (genotype 1/1. i.e. C/C), but since there was one read containing A, samtools still keep both alleles in the VCF file (but gives a very low probabilities for it).
			#1       11297762        .       T       C,A     98      . DP=19;AF1=1;CI95=1,1;DP4=0,0,17,1;MQ=60;FQ=-78  GT:PL:GQ 1/1:131,51,0,120,28,117:99			
			
			#following was commented out per Merck 20131009
			#if ($mut_allele2 and $zygosity eq 'hom') {
			#	$mut_allele2 = '';
			#}

			if (not $mut_allele2 or $mut_allele2 and $zygosity eq 'hom') {
				if ($ref_allele eq 'A' and $mut_allele eq 'G' or $ref_allele eq 'G' and $mut_allele eq 'A' or $ref_allele eq 'C' and $mut_allele eq 'T' or $ref_allele eq 'T' and $mut_allele eq 'C') {
					$countti++;
					
				} else {
					$counttv++;
				}
			}
			
			#print $chr, "\t", $start, "\t", $start, "\t", $ref_allele, "\t", $mut_allele, "\t$zygosity",  "\t", $quality_score, (defined $unfiltered_read_depth)? "\t$unfiltered_read_depth" : '', (defined $MappingQuality) ? "\t$MappingQuality" : '', (defined $QualityByDepth) ? "\t$QualityByDepth" : '', $includeinfo ? "\t$otherinfo" : '', "\n";	#commented Sep 2011
			if ($includeinfo) {
				print $chr, "\t", $start, "\t", $start, "\t", $ref_allele, "\t", $mut_allele, $withzyg?"\t$zygosity":"", "\t", $otherinfo, "\n";
			} else {
				print $chr, "\t", $start, "\t", $start, "\t", $ref_allele, "\t", $mut_allele, "\t$zygosity",  $withfilter?"\t$filter":"", "\t", $quality_score, (defined $unfiltered_read_depth)? "\t$unfiltered_read_depth" : '', (defined $MappingQuality) ? "\t$MappingQuality" : '', (defined $QualityByDepth) ? "\t$QualityByDepth" : '', "\n";
			}
			
			if ($allallele) {
				if ($mut_allele2) {
					my @mut_allele2 = split (/,/, $mut_allele2);
					for my $i (0 .. @mut_allele2-1) {
						if ($includeinfo) {
							print $chr, "\t", $start, "\t", $start, "\t", $ref_allele, "\t", $mut_allele2[$i], $withzyg?"\t$zygosity":"", "\t", $otherinfo, "\n";
						} else {
							print $chr, "\t", $start, "\t", $start, "\t", $ref_allele, "\t", $mut_allele2[$i], "\t$zygosity",  $withfilter?"\t$filter":"", "\t", $quality_score, (defined $unfiltered_read_depth)? "\t$unfiltered_read_depth" : '', (defined $MappingQuality) ? "\t$MappingQuality" : '', (defined $QualityByDepth) ? "\t$QualityByDepth" : '', "\n";
						}
					}
				}
			}
			
			$countsnp++;
		} elsif (length($ref_allele) > 1 || length($mut_allele) > 1) {  ### output indel
			my ($indel_read_depth1, $indel_read_depth2) = $info =~ /\bAC=([^,;]+),([^,;]+)/;		#number of reads supporting consensus indel, any indel
			my ($unfiltered_read_depth) = $info =~ /\bDP=(\d+)/;
			my ($MappingQuality) = $info =~ /\bMQ=([^;]+)/;
			my ($QualityByDepth) = $info =~ /\bQD=([^;]+)/;	
					
			if ($coverage) {
				defined $unfiltered_read_depth and $unfiltered_read_depth >= $coverage || next;
				if ($maxcoverage) {
					defined $unfiltered_read_depth and $unfiltered_read_depth <= $maxcoverage || next;
				}
			}
			
			if ($snpqual) {
				defined $QualityByDepth and $QualityByDepth >= $snpqual || next;		#the QD was used here as quality score
			}
			
			if (defined $indel_read_depth1 and $unfiltered_read_depth) {		#deleted "defined" before $unfiltered_read_depth on 2012may25
				$indel_read_depth1/$unfiltered_read_depth >= $fraction or next;		#do not meet minimum alternative allele fraction threshold
				if ($indel_read_depth2) {
					$indel_read_depth1/$indel_read_depth2 >= $confraction or next;
				}
			}
			

			#example VCF4 records below:
			#20      2       .       TCG     T       .       PASS    DP=100
			#Chr1    5473    .       AT      ATT     23.5    .       INDEL;DP=16;AF1=0.5;CI95=0.5,0.5;DP4=4,2,3,1;MQ=42;PV4=1,0.41,0.042,0.24
			#Chr1    6498    .       ATTTT   ATTTTT  53.5    .       INDEL;DP=9;AF1=1;CI95=1,1;DP4=0,0,5,3;MQ=28
			
			
			if(length($ref_allele) > length ($mut_allele)) { 		# deletion or block substitution
				my $head = substr($ref_allele, 0, length ($mut_allele));
				if ($head eq $mut_allele) {
					print $chr,"\t";
					print $start+length($head),"\t";
					print $start+length($ref_allele)-1,"\t";
					
					my $ref_allele1 = substr ($ref_allele, length ($mut_allele));
					print $ref_allele1,"\t";
					print "-";
				} else {
					print $chr, "\t", $start, "\t", $start+length($ref_allele)-1, "\t", $ref_allele, "\t", $mut_allele;
				}
			} elsif(length($mut_allele) >= length ($ref_allele)) { 		# insertion or block substitution
				my $head = substr ($mut_allele, 0, length ($ref_allele));
				if ($head eq $ref_allele) {
					print $chr,"\t";	
					print $start+length($ref_allele)-1,"\t";
					print $start+length($ref_allele)-1,"\t";
					
					$mut_allele = substr ($mut_allele, length ($ref_allele));
					print "-\t";
					print $mut_allele;
				} else {
					print $chr, "\t", $start, "\t", $start+length($ref_allele)-1, "\t", $ref_allele, "\t", $mut_allele;
				}
			}
			

			if (defined $format) {
				my @format = split (/:/, $format);
				undef $gtpos;
				for my $i (0 .. @format-1) {
					if ($format[$i] eq 'GT') {
						$gtpos = $i;
						last;
					}
				}
				if (defined $sample and defined $gtpos) {
					my @sample = split (/:/, $sample);
					if ($sample[$gtpos] =~ m#^0/1# or $sample[$gtpos] =~ m#^1/0#) {
						$zygosity = 'het';
						$counthet++;
					} elsif ($sample[$gtpos] =~ m#^1/1#) {
						$zygosity = 'hom';
						$counthom++;
					} else {
						$zygosity = 'unknown';
						$countunknown++;
					}
				} else {
					$zygosity = 'unknown';
					$countunknown++;
				}
			} else {
				$zygosity = 'unknown';
				$countunknown++;
			}
			
			if ($includeinfo) {
				 print $withzyg?"\t$zygosity":"", "\t", $otherinfo;
			} else {
				print "\t$zygosity";
				$withfilter and print "\t$filter";
				defined $quality_score and print "\t", $quality_score;
				defined $unfiltered_read_depth and print "\t", $unfiltered_read_depth;
				
				#defined $indel_read_depth1 and print "\t", $indel_read_depth1;		#commented out Nov 2011
				defined $MappingQuality and print "\t", $MappingQuality;
				defined $QualityByDepth and print "\t", $QualityByDepth;		#added in Nov 2011 to be consistent with SNP output
				#$includeinfo and print "\t", $otherinfo;	#commented Sep 2011
			}
			print "\n";
			$countindel++;


			#do the same thing again, exactly like above, except that we work on second mutation;
			#in the future, consider rewrite this paragraph to make the code more elegant	
			if ($allallele and $mut_allele2) {
				my @mut_allele2 = split (/,/, $mut_allele2);
				for my $mut_allele2 (@mut_allele2) {
					if(length($ref_allele) > length ($mut_allele2)) { 		# deletion or block substitution
						my $head = substr($ref_allele, 0, length ($mut_allele2));
						if ($head eq $mut_allele2) {
							print $chr,"\t";
							print $start+length($head),"\t";
							print $start+length($ref_allele)-1,"\t";
							
							my $ref_allele1 = substr ($ref_allele, length ($mut_allele2));
							print $ref_allele1,"\t";
							print "-";
						} else {
							print $chr, "\t", $start, "\t", $start+length($ref_allele)-1, "\t", $ref_allele, "\t", $mut_allele2;
						}
					} elsif(length($mut_allele2) > length ($ref_allele)) { 		# insertion or block substitution
						my $head = substr ($mut_allele2, 0, length ($ref_allele));
						if ($head eq $ref_allele) {
							print $chr,"\t";	
							print $start+length($ref_allele)-1,"\t";
							print $start+length($ref_allele)-1,"\t";
							
							$mut_allele2 = substr ($mut_allele2, length ($ref_allele));
							print "-\t";
							print $mut_allele2;
						} else {
							print $chr, "\t", $start, "\t", $start+length($ref_allele)-1, "\t", $ref_allele, "\t", $mut_allele2;
						}
					} else {		#identical length of alleles
						print $chr, "\t", $start, "\t", $start+length($ref_allele)-1, "\t", $ref_allele, "\t", $mut_allele2;
					}

					if (defined $sample) {
						if ($sample =~ m#^0/1# or $sample =~ m#^1/0#) {
							$zygosity = "het";
							$counthet++;
						} elsif ($sample =~ m#^1/1#) {
							$zygosity =  "hom";
							$counthom++;
						} # BEGIN ARQ
						elsif ($sample =~ m#^./.#) {
							$zygosity = "unknown";
							$countunknown++;
						} # END ARQ
					}
										
					if ($includeinfo) {
						print $withzyg?"\t$zygosity":"", "\t", $otherinfo;
					} else {
						print "\t", $zygosity;
						$withfilter and print "\t", $filter;		#if -withfilter is set, print out the filter word
						print "\t", $quality_score;
						defined $unfiltered_read_depth and print "\t", $unfiltered_read_depth;
						
						defined $indel_read_depth1 and print "\t", $indel_read_depth1;
						defined $MappingQuality and print "\t", $MappingQuality;
						#$includeinfo and print "\t", $otherinfo;
					}
					print "\n";

				}
			}
		}
		$countvar++;
	}
	my $triallelic = $countsnp-$countti-$counttv;
	print STDERR "NOTICE: Read $countline lines and wrote ${\($counthet+$counthom)} different variants at $countvar genomic positions ($countsnp SNPs and $countindel indels)\n";
	print STDERR "NOTICE: Among ${\($counthet+$counthom+$countunknown)} different variants at $countvar positions, $counthet are heterozygotes, $counthom are homozygotes\n";
	print STDERR "NOTICE: Among $countsnp SNPs, $countti are transitions, $counttv are transversions (ratio=", $counttv?sprintf("%.2f", $countti/$counttv):"NA", ")" , $triallelic?", $triallelic have more than 2 alleles\n":"\n";
}

sub adjustStartEndRefAlt {
	#Trim the bases shared by the ref and alt alleles so that only the differing part remains.
	#Arguments: start position, end position, ref allele, alt allele
	#Returns:   the adjusted (start, end, ref, alt) as a four-element list
	#An allele that is used up completely by the trimming is reported as '-'
	my ($newstart, $newend, $newref, $newalt) = @_;

	#pass 1: peel off the common suffix one base at a time; each removed base pulls the end position back by one
	while (substr ($newref, -1) eq substr ($newalt, -1)) {
		chop ($newref, $newalt);
		$newend -= 1;
		unless ($newref) {		#nothing is left of the reference allele
			($newref, $newstart) = ('-', $newstart-1);	#now it is an insertion so the start should decrease by 1 (20150925)
			last;
		}
		unless ($newalt) {		#nothing is left of the alternative allele
			$newalt = '-';
			last;
		}
	}

	#pass 2: peel off the common prefix one base at a time; each removed base pushes the start position forward by one
	while (substr ($newref, 0, 1) eq substr ($newalt, 0, 1)) {
		substr ($newref, 0, 1, '');
		substr ($newalt, 0, 1, '');
		$newstart += 1;
		unless ($newref) {		#nothing is left of the reference allele
			($newref, $newstart) = ('-', $newstart-1);	#now it is an insertion so the start should decrease by 1 (20150925)
			last;
		}
		unless ($newalt) {		#nothing is left of the alternative allele
			$newalt = '-';
			last;
		}
	}

	return ($newstart, $newend, $newref, $newalt);
}

#Convert a VCF4 file into ANNOVAR input format, writing one output line per alternative
#allele (and per sample, when -allsample is set).
#
#Argument:
#	$variantfile	path of the VCF file; the literal name 'stdin' reads from STDIN, and a
#			name ending in '.gz' is decompressed by piping it through 'gunzip -c'
#
#File-level options consulted here: $comment, $verbose, $allsample, $withfreq, $outfile,
#$filterword, $varqual, $coverage, $genoqual, $keepindelref, $includeinfo, $withzyg, $withfilter
#
#Output:
#	-allsample without -withfreq: one file "$outfile.<sample>.avinput" per sample
#	otherwise: $outfile when it is set, STDOUT when it is not
#	a summary of the conversion is printed to STDERR at the end
#
#Dies (among other cases) when the VCF version is below 4, when a record has fewer than 8
#tab-delimited fields, when the FORMAT string lacks GT or repeats GT/GQ, or when a file
#cannot be opened.
sub convertVCF4 {
	#on 2013Jul28, I rewrote the VCF4 conversion subroutine to address many user comments that I have never had time to address, and to make this subroutine more robust
	my ($variantfile) = @_;
	
	my ($countsample, $countline, $countlocus, $countvar, $counthom, $counthet, $countunknown, $countindel, $countsnp, $countti, $counttv) = qw/0 0 0 0 0 0 0 0 0 0 0/;
	my ($countsnp1, $countindel1, $countti1, $counttv1) = qw/0 0 0 0/;
	my ($countinvalidalt, $countinvalidgt, $countinvalidref) = (0, 0, 0);
	#countline: total lines in VCF file
	#countvar: total variants in VCF file
	#counthom/counthet/countunknown: counts of zygosity of variants
	#countindel/countsnp: counts of different types of variants
	#countti/counttv: transition/transversion ratio
	#countinvalid: invalid alternative alleles
	#the counters ending in 1 are tallied per processed genotype (after the genotype/coverage/quality filters) rather than per allele
	
	my (@fhout, @comment);		#output file handles (one per sample, or a single one in slot 0) and the ## comment lines collected so far
	my ($found_headerline);

	#the input is opened with explicit modes and without going through a shell, so that a file name
	#containing spaces or shell/open metacharacters is always treated as a plain file name
	if ($variantfile eq 'stdin') {
		*VAR = *STDIN;
	} elsif ($variantfile =~ m/\.gz$/) {
		open (VAR, '-|', 'gunzip', '-c', $variantfile) or die "Error: cannot read from STDIN uncompressing variant file $variantfile: $!\n";
	} else {
		open (VAR, '<', $variantfile) or die "Error: cannot read from variant file $variantfile: $!\n";
	}
	
	while (<VAR>) {
		$countline++;			#count number of lines in VCF file
		
		if (m/^##fileformat=VCFv([\d+\.]+)/) {
			$1<4 and die "ERROR: Input file has a lower version ($1) than VCF version 4\n";
		}

		s/[\r\n]+$//;		#delete trailing new lines
		
		if (m/^##/) {
			$comment and push @comment, $_;
			next;
		}
		
		$verbose and print STDERR "NOTICE: Processing line $_\n";
		
		my $otherinfo = $_;			#this is the complete line (when -includeinfo is set, the entire line will be included in output file)
		my @field=split(/\t/,$_);
		@field >=8 or die "Error: invalid record found in VCF4 file (at least 8 tab-delimited fields expected): <$_>\n";
		my ($chr, $start, $ID, $ref, $alt, $quality_score, $filter, $info, $format, @sample) = @field;
		my ($zygosity);
		my (@allalt);
		my ($gtpos, $dppos, $gqpos);		#the program that generated the VCF4 file; the GT/DP/GQ position within FORMAT record
		my ($read_depth, $genotype_quality);

		if ($chr =~ m/^#CHR/i) {		#format specification line
			$found_headerline and next;	#sometimes people concatenate multiple VCF files together without removing header lines in the combined file; we need to address this
			$found_headerline++;
			if ($allsample and @sample and not $withfreq) {
				#one output file per sample
				#(a check for "-withfreq without any sample" used to sit here, but it could never be true inside this branch; such site-only files are handled by the -withfreq output further below)
				for my $i (0 .. @sample-1) {
					open ($fhout[$i], '>', "$outfile.$sample[$i].avinput") or die "cannot write to outfile\n";
					my $fhout = $fhout[$i];		#must use a temp variable for this to work
					$comment and @comment and print $fhout join ("\n", @comment), "\n", join ("\t", $chr, $start, $ID, $ref, $alt, $quality_score, $filter, $info, $format, $sample[$i]), "\n";
				}
			} else {
				#a single output: the -outfile file if given, STDOUT otherwise
				my $fhout;
				if ($outfile) {
					open ($fhout, '>', $outfile) or die "Error: cannot write to output file $outfile: $!\n";
					$fhout[0] = $fhout;
				} else {
					$fhout[0] = *STDOUT;
					$fhout = $fhout[0];
				}
				$comment and @comment and print $fhout join ("\n", @comment), "\n", join ("\t", $chr, $start, $ID, $ref, $alt, $quality_score, $filter, $info, $format, $sample[0]), "\n";
				if (@sample > 1) {
					if (not $allsample or not $withfreq) {
						print STDERR "WARNING to old ANNOVAR users: this program no longer does line-to-line conversion for multi-sample VCF files. If you want to include all variants in output, use '-format vcf4old' or use '-format vcf4 -allsample -withfreq' instead.\n";
					}
				}
			}
			$countsample = scalar (@sample);
			next;
		} elsif (m/^#/) {
			if ($allsample and @sample) {
				for my $i (0 .. @sample-1) {
					my $fhout = $fhout[$i];		#must use a temp variable for this to work
					$comment and print $fhout $_, "\n";
				}
			} else {
				$comment and push @comment, $otherinfo;
				next;
			}
		} elsif (not @fhout) {		#the VCF file does not contain a header line
			if ($outfile) {
				my $fhout;
				open ($fhout, '>', $outfile) or die "Error: cannot write to output file $outfile: $!\n";
				$fhout[0] = $fhout;
			}
		}
		
		if (not @sample) {
			@sample = ('NULL');		#later on we process sample array one by one, so this is necessary to create a NULL sample
		}
		
		if ($filterword) {			#ensure that the filter field contains the filterword (for example, "PASS")
			$filter =~ m/\b$filterword\b/i or next;
		}
		if (defined $varqual) {			#skip variant (alternative allele call) that have low quality score
			$quality_score ne '.' and $quality_score < $varqual and next;
		}
		$countlocus++;
		
		$info =~ s/^"//; $info =~ s/"$//;	#some genotype calling software tools put quotes to INFO field
		
		#some genotype calling software tools put the alleles in different cases
		#chr1    1869771 1869774 actc    aCTctc          43.5    13      INDEL;DP=13;AF1=0.5;CI95=0.5,0.5;DP4=0,4,4,0;MQ=37;PV4=0.029,0.45,1,0.46
		($ref, $alt) = (uc $ref, uc $alt);
		
		#$alt eq '.' and next;			#no real variant call was made at this position (some genotype calling software tools do this)	#commented out 20140712 since users may still want to annotate this location regardless of what

		@allalt = split (/,/, $alt);		#there may be multiple alternative alleles
		#@allalt > 1 and print STDERR "NOTICE: Alternative alleles found <$_>\n";
		
		##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
		##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
		##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
		##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
		
		if (defined $format) {		#for example, "GT:GQ:DP:HQ" means genotype, genotype quality, read depth, haplotype quality
			my @format = split (/:/, $format);
			for my $j (0 .. @format-1) {
				if ($format[$j] eq 'GT') {
					defined $gtpos and die "Error: invalid record found in VCF file (multiple GT format specifiers existin FORMAT field $format:<$otherinfo>\n";
					$gtpos = $j;
				}
				if ($format[$j] eq 'DP') {
					#defined $dppos and die "Error: invalid record found in VCF file (multiple DP format specifiers exist in FORMAT field:<$otherinfo>\n";
					$dppos = $j;
				}
				if ($format[$j] eq 'GQ') {
					defined $gqpos and die "Error: invalid record found in VCF file (multiple GQ format specifiers exist in FORMAT field:<$otherinfo>\n";
					$gqpos = $j;
				}
				if ($format[$j] eq 'NR') {
					if (not defined $dppos) {	#check DP first then NR (Number of reads covering variant location in this sample)
						$dppos = $j;
					}
				}
				if ($format[$j] eq 'AD') {		#DP, NR, AD and so on can all be used to indicate coverage
					if (not defined $dppos) {
						$dppos = $j;
					}
				}
			}
			defined $gtpos or die "Error: invalid record in VCF file: the GT specifier is not present in the FORMAT string: <$_>\n";
		}
		
		for my $i (0 .. @allalt-1) {		#process each alternative allele
			my ($head, $newstart, $newend, $newref, $newalt);

			$alt = $allalt[$i];
			my ($count_alt, $count_all) = (0, 0);	#count of alternative allele and all allele (should be sample*2 generally)
			
			
			#an allele that cannot be interpreted is replaced by 0 (an unknown allele) rather than skipping the locus
			if ($ref =~ m/[^ACGTacgt]/) {
				if (length ($ref) < 100) {	#sometimes extremely long ref contains N nucleotides
					$verbose and print STDERR "WARNING: invalid reference ref allele record found in VCF4 file (ACGT expected): <$ref> in line <$_>\n";
				}
				$countinvalidref++;
				$ref = 0;
			} elsif ($ref eq '.') {	#sometimes people just write . as ref allele
				$ref = 0;
			}
			if ( $alt =~ m/[^ACGTacgt]/ and not $alt =~ m/^<.+>$/) {	#sometimes <DEL> and <DUP> are alternative allele
				$verbose and print STDERR "WARNING: invalid alt allele record (ACGT expected): <$alt> in line <$_>\n";
				$countinvalidalt++;
				if ($alt eq '.') {	#same as ref
					$alt = $ref;
				} else {
					$alt = 0;						#commented out "next;" on 20140712 since users may still be interested in this locus
				}
			}
			
			#per-allele statistics (regardless of the genotypes of the samples)
			if (length ($ref) == length ($alt) and $ref=~m/^[ACGT]$/ and $alt=~m/^[ACGT]$/) {
				$countsnp++;
				if ($ref eq 'A' and $alt eq 'G' or $ref eq 'G' and $alt eq 'A' or $ref eq 'C' and $alt eq 'T' or $ref eq 'T' and $alt eq 'C') {
					$countti++;
				} else {
					if ($ref ne $alt) {
						$counttv++;
					}
				}
			} else {
				if ($ref ne '0' and $alt ne '0') {
					$countindel++;
				}
			}
			
			for my $j (0 .. @sample-1) {
				$allsample or $j and last;		#if --allsample is not specified, only process j=0 (the first sample in VCF file)
				my @sampleinfo = split (/:/, $sample[$j]);	#for example, "0|0:48:1:51,51" corresponds to "GT:GQ:DP:HQ"
				
				#print STDERR "processing line $countline allele $i sample $j ($sample[$j])\n";
				if (defined $gtpos) {
					#$sampleinfo[$gtpos] eq '0' and next;		#commented out 20150623 note that MuTect may generate "0" as genotype in the VCF file
					my ($a1, $a2);
					defined $sampleinfo[$gtpos] or warn "WARNING: genotype not found for gtpos=$gtpos sampleinfo=$sample[$j] in the line <$chr, $start, $ID, $ref, $alt>" and next;
					if ($sampleinfo[$gtpos] =~ m/([\d\.]+)[\/\|]([\d\.]+)/) {
						($a1, $a2) = ($1, $2);
					} elsif ($sampleinfo[$gtpos] =~ m/^([\d\.]+)$/) {	#chrX or chrY variants
						($a1, $a2) = ($1, $1);
					} else {
						$verbose and print STDERR "WARNING: invalid GT field found in VCF file: <$otherinfo>\n";
						$countinvalidgt++;
						next;
					}
					
				
					$a1 eq '.' or $count_all += 1;
					$a2 eq '.' or $count_all += 1;		#consider . and 0 separately, so that . is not used in the count_all calculation
					
					$a1 eq '.' and $a1 = 0;
					$a2 eq '.' and $a2 = 0;			#CG VCF files have many records as "." probably denoting unknown alleles
					
					#alleles in the GT field are numbered from 1, so alternative allele $i is coded as $i+1
					if ($a1 eq '0' and $a2 eq '0') {	#ref/ref call or unknown call
						#$count_all += 2;	#commented out 20170328
						if (not $allsample or not $withfreq) {
							next;					#no mutation found in this sample so go to next one (but when -allsample and -withfreq is set, this mutation must be printed out)
						}
					} elsif ($a1 eq $a2) {
						if ($a1 != $i+1) {
							if (not $allsample or not $withfreq) {
								next;		#this genotype does not have this alternative allele
							}
						}
						#$count_all += 2;	#commented out 20170328
						$count_alt += 2;
						$zygosity = 'hom';
						$counthom++;
					} else {
						if ($a1 != $i+1 and $a2 != $i+1) {
							if (not $allsample or not $withfreq) {
								next;	#this genotype does not have this alternative allele
							}
						}
						$count_alt += 1;
						#$count_all += 2;	#commented out 20170328
						$zygosity = 'het';
						$counthet++;
					}
				} else {
					$zygosity = '.';
				}
				
				
				#read depth: taken from the sample field when available, otherwise from the DP or DP4 tag of the INFO field
				if (defined $dppos and defined $sampleinfo[$dppos]) {	#handle situations where format specification is wrong: <GT:AD:DP:GQ:PL  ./.:0,0 0/1:32,5:.:99:177,0,222./.:7,0:7>
					if ($sampleinfo[$dppos] =~ m/,/) {		#some NR records contains multiple rather than one number
						my $tempsum = 0;
						my @temp = split (/,/, $sampleinfo[$dppos]);
						for my $k (0 .. @temp-1) {
							$tempsum += $temp[$k];
						}
						$read_depth = $tempsum;
					} else {
						$read_depth = $sampleinfo[$dppos];
					}
				} elsif ($info =~ m/\bDP=(\d+)/) {
					$read_depth = $1;
				} elsif ($info =~ m/\bDP4=(\d+),(\d+),(\d+),(\d+)/) {
					$read_depth = $1+$2+$3+$4;
				} else {
					$read_depth = '.';
				}
				
				if (defined $gqpos) {
					$genotype_quality = $sampleinfo[$gqpos];
				} else {
					$genotype_quality = '.';
				}
				
				if ($coverage) {		#if -coverage argument is set (set as 0 by default)
					if ($read_depth ne '.') {
						$read_depth >= $coverage or next;
					}
				}
				
				
				if (defined $genoqual) {
					if ($genotype_quality ne '.') {
						$genotype_quality >= $genoqual or next;
					}
				}
				
				#per-genotype statistics (only genotypes that passed all filters above are counted)
				if ($allsample or not $allsample and $j==0) {	#either test all samples, or test the first sample
					if (length ($ref) == length ($alt) and $ref=~m/^[ACGT]$/ and $alt=~m/^[ACGT]$/) {
						$countsnp1++;
						if ($ref eq 'A' and $alt eq 'G' or $ref eq 'G' and $alt eq 'A' or $ref eq 'C' and $alt eq 'T' or $ref eq 'T' and $alt eq 'C') {
							$countti1++;
						} else {
							if ($ref ne $alt) {
								$counttv1++;
							}
						}
					} else {
						if ($ref ne '0' and $alt ne '0') {
							$countindel1++;		#sometimes ref is unknown
						}
					}
				}			
				
				
				#work out the ANNOVAR coordinates and alleles (newstart, newend, newref, newalt) for this allele
				if (uc $alt eq '<DEL>') {		#old 1000G VCF files and CNV call files have this
					#6	1422801	CNVnator_del_4	C	<DEL>	.	PASS	END=1424100;SVTYPE=DEL;SVLEN=-1300;IMPRECISE;natorRD=0.0696997;
					#if END tag is present, use it, otherwise have to rely on the ref allele itself
					if ($info =~ m/\bEND=(\d+)\b/) {
						($newstart, $newend) = ($start, $1);
						($newref, $newalt) = (0, '-');
					} else {
						($newstart, $newend) = ($start, $start+length($ref)-1);
						($newref, $newalt) = ($ref, '-');
					}
				} elsif (uc $alt eq '<DUP>' or uc $alt eq '<INV>' or uc $alt eq '<INS>') {
					#16	8611701	CNVnator_dup_7	A	<DUP>	.	PASS	END=8614800;SVTYPE=DUP;SVLEN=3100;IMPRECISE;natorRD=1.56346;natorP1=0.000439407;
					if ($info =~ m/\bEND=(\d+)\b/) {
						($newstart, $newend) = ($start, $1);
						($newref, $newalt) = (0, 0);
					} else {
						($newstart, $newend) = ($start, $start+length($ref)-1);
						($newref, $newalt) = ($ref, '-');
					}
					
				} elsif (length ($ref) == 1 and length ($alt) == 1) {	#SNV
					($newstart, $newend) = ($start, $start+length($ref)-1);
					($newref, $newalt) = ($ref, $alt);
					
				} elsif (length ($ref) > length ($alt)) {		#deletion or block substitution
					if ($keepindelref) {
						($newstart, $newend, $newref, $newalt) = ($start, $start+length($ref)-1, $ref, $alt);
					} else {
						$head = substr ($ref, 0, length ($alt));
						if ($head eq $alt) {
							($newstart, $newend) = ($start+length ($head), $start + length ($ref)-1);
							($newref, $newalt) = (substr($ref, length($alt)), '-');
						} else {
							($newstart, $newend) = ($start, $start+length($ref)-1);		#changed to length(ref) on 20130820
							($newref, $newalt) = ($ref, $alt);
						}
						
						($newstart, $newend, $newref, $newalt) = adjustStartEndRefAlt ($newstart, $newend, $newref, $newalt);	#20150324: further adjust when only part of alt and ref matches
					}
				} elsif (length ($ref) < length ($alt)) {		#insertion or block substitution
					if ($keepindelref) {
						($newstart, $newend, $newref, $newalt) = ($start, $start+length($ref)-1, $ref, $alt);
					} else {
						$head = substr ($alt, 0, length ($ref));
						if ($head eq $ref) {
							($newstart, $newend) = ($start+length($ref)-1, $start+length($ref)-1);
							($newref, $newalt) = ('-', substr($alt, length($ref)));
						} else {
							($newstart, $newend) = ($start, $start+length($ref)-1);
							($newref, $newalt) = ($ref, $alt);
						}
						
						($newstart, $newend, $newref, $newalt) = adjustStartEndRefAlt ($newstart, $newend, $newref, $newalt);	#20150324: further adjust when only part of alt and ref matches
					}
				} else {                #block substitution (only differing in the last base)
					if ($keepindelref) {
						($newstart, $newend, $newref, $newalt) = ($start, $start+length($ref)-1, $ref, $alt);
					} else {
						$head = substr ($ref, 0, length ($ref) - 1);
						if ($alt =~ /^$head/) {
							($newstart, $newend) = ($start+length($ref)-1, $start+length($ref)-1);
							#take the last base from copies of the alleles: $ref and $alt themselves must stay intact,
							#because they are printed by -includeinfo and are examined again for the next sample
							my ($chopped_ref, $chopped_alt) = ($ref, $alt);
							($newref, $newalt) = (chop ($chopped_ref), chop ($chopped_alt));
						} else {
							($newstart, $newend) = ($start, $start+length($ref)-1);
							($newref, $newalt) = ($ref, $alt);
						}
						
						($newstart, $newend, $newref, $newalt) = adjustStartEndRefAlt ($newstart, $newend, $newref, $newalt);	#20150324: further adjust when only part of alt and ref matches
					}
				}
				
				if ($includeinfo) {
					$fhout[0] ||= *STDOUT;		#sometimes VCF file does not have the sample name header, so that the file is never opened (m/^#CHR/ not found)
					my $fhout = $fhout[$j];
					if ($withzyg) {
						print $fhout join ("\t", $chr, $newstart, $newend, $newref, $newalt, $zygosity, $quality_score, $read_depth, $chr, $start, $ID, $ref, $alt, $quality_score, $filter, $info, $format, $sample[$j]), "\n";
					} elsif ($withfreq) {
						1;		#print this line out later on
					} elsif ($sample[$j] eq 'NULL') {
						print $fhout join ("\t", $chr, $newstart, $newend, $newref, $newalt, $_), "\n";
					} else {
						defined $sample[$j] or die;
						defined $format or die;
						defined $info or die;
						defined $filter or die;
						defined $quality_score or die;
						defined $newstart or die "newstart <$_>";
						defined $newend or die "newend <$_>";
						defined $fhout or die;
						print $fhout join ("\t", $chr, $newstart, $newend, $newref, $newalt, $chr, $start, $ID, $ref, $alt, $quality_score, $filter, $info, $format, $sample[$j]), "\n";
					}
				} else {
					$fhout[0] ||= *STDOUT;		#sometimes VCF file does not have the sample name header, so that the file is never opened (m/^#CHR/ not found)
					my $fhout = $fhout[$j];
					defined $fhout or next;		#if the VCF file does not have the sample name header, only first sample is printed out to STDOUT, and other samples are skipped
					if ($withfreq) {
						1;		#print this line out later on
					} else {
						if ($withfilter) {
							print $fhout join ("\t", $chr, $newstart, $newend, $newref, $newalt, $zygosity, $filter, $quality_score, $read_depth), "\n";
						} else {
							print $fhout join ("\t", $chr, $newstart, $newend, $newref, $newalt, $zygosity, $quality_score, $read_depth), "\n";
							#print STDERR "NOTICE: printing chr=$chr\n";
						}
					}
				} #if ($includeinfo)	
			} #for my $j (0 .. @sample-1)
			
			#we have to decide what to do with VCF file that does not even contain any sample, and is used for listing sites
			#traditionally, no output will be generated, so that users must use -vcf4old. this is not a great solution because (1) users get confused and complain a lot (2) must maintain two separate subroutines which is a huge burden
			#in principle, this problem can be solved by still printing some essential information about these sites, and warn users that no zygosity will be available
			#the following was added 20140711
			
			if ($withfreq) {
				#one line per alternative allele, carrying the allele frequency over the processed samples ('.' when no allele was counted)
				my $fhout = $fhout[0];
				my $freq = $count_all?($count_alt/$count_all):'.';
				$newalt ||= 0;		#this is to handle GATK haplotype caller that sometimes have ALT allele as "."
				length ($freq) > 6 and $freq = sprintf("%.4g", $freq);
				if ($includeinfo) {
					print $fhout join ("\t", $chr, $newstart, $newend, $newref, $newalt, $freq, $quality_score, $read_depth, $_), "\n";
				} else {
					print $fhout join ("\t", $chr, $newstart, $newend, $newref, $newalt, $freq, $quality_score, $read_depth), "\n";
				}
			}
			
		} #for my $i (0 .. @allalt-1)
	} #while (<VAR>)
	print STDERR "NOTICE: Finished reading $countline lines from VCF file\n";
	print STDERR "NOTICE: A total of $countlocus locus in VCF file passed QC threshold, representing $countsnp SNPs ($countti transitions and $counttv transversions) and $countindel indels/substitutions\n";
	if ($allsample) {
		if ($withfreq) {
			print STDERR "NOTICE: Finished writing allele frequencies based on $countsnp1 SNP genotypes ($countti1 transitions and $counttv1 transversions) and $countindel1 indels/substitutions for $countsample samples\n";
		} else {
			print STDERR "NOTICE: Finished writing $countsnp1 SNP genotypes ($countti1 transitions and $counttv1 transversions) and $countindel1 indels/substitutions for $countsample samples\n";
		}
	} else {
		if ($countsample > 1) {
			print STDERR "NOTICE: Finished writing $countsnp1 SNP genotypes ($countti1 transitions and $counttv1 transversions) and $countindel1 indels/substitutions for 1 sample (but input contains $countsample samples)\n";
		} else {
			print STDERR "NOTICE: Finished writing $countsnp1 SNP genotypes ($countti1 transitions and $counttv1 transversions) and $countindel1 indels/substitutions for 1 sample\n";
		}
	}
	$countinvalidalt and print STDERR "WARNING: $countinvalidalt invalid alternative alleles found in input file\n";
	$countinvalidref and print STDERR "WARNING: $countinvalidref invalid reference alleles found in input file\n";
	$countinvalidgt and print STDERR "WARNING: $countinvalidgt invalid genotype records in input file\n";
}
				

=head1 SYNOPSIS

 convert2annovar.pl [arguments] <variantfile>

 Optional arguments:
        -h, --help                      print help message
        -m, --man                       print complete documentation
        -v, --verbose                   use verbose output
            --format <string>		input format (default: pileup)
            --includeinfo		include supporting information in output
            --outfile <file>		output file name (default: STDOUT)
            --snpqual <float>		quality score threshold in pileup file (default: 20)
            --snppvalue <float>		SNP P-value threshold in GFF3-SOLiD file (default: 1)
            --coverage <int>		read coverage threshold in pileup file (default: 0)
            --maxcoverage <int>		maximum coverage threshold (default: none)
            --chr <string>		specify the chromosome (for CASAVA format)
            --chrmt <string>		chr identifier for mitochondria (default: M)
            --fraction <float>		minimum allelic fraction to claim a mutation (for pileup format)
            --altcov <int>		alternative allele coverage threshold (for pileup format)
            --allelicfrac		print out allelic fraction rather than het/hom status (for pileup format)
            --species <string>		if human, convert chr23/24/25 to X/Y/M (for gff3-solid format)
            --filter <string>		output variants with this filter (case insensitive, for vcf4 format)
            --confraction <float>	minimal fraction for two indel calls as a 0-1 value (for vcf4old format)
            --allallele			print all alleles rather than first one (for vcf4old format)
            --withzyg			print zygosity/coverage/quality when -includeinfo is used (for vcf4 format)
            --comment			keep comment line in output (for vcf4 format)
            --allsample			process all samples in file with separate output files (for vcf4 format)
            --genoqual <float>		genotype quality score threshold (for vcf4 format)
            --varqual <float>		variant quality score threshold (for vcf4 format)
            --dbsnpfile <file>		dbSNP file in UCSC format (for rsid format)
            --withfreq			for --allsample, print frequency information instead (for vcf4 format)
            --withfilter		print filter information in output (for vcf4 format)
            --seqdir <string>		directory with FASTA sequences (for region format)
            --inssize <int>		insertion size (for region format)
            --delsize <int>		deletion size (for region format)
            --subsize <int>		substitution size (default: 1, for region format)
            --genefile <file>		specify the gene file from UCSC (for transcript format)
            --splicing_threshold <int>	the splicing threshold (for transcript format)
            --context <int>		print context nucleotide for indels (for casava format)
            --avsnpfile <file>		specify the avSNP file (for rsid format)
            --keepindelref		keep Ref/Alt alleles for indels (for vcf4 format)

 Function: convert variant call file generated from various software programs 
 into ANNOVAR input format
 
 Example: convert2annovar.pl -format pileup -outfile variant.query variant.pileup
          convert2annovar.pl -format cg -outfile variant.query variant.cg
          convert2annovar.pl -format cgmastervar variant.masterVar.txt
          convert2annovar.pl -format gff3-solid -outfile variant.query variant.snp.gff
          convert2annovar.pl -format soap variant.snp > variant.avinput
          convert2annovar.pl -format maq variant.snp > variant.avinput
          convert2annovar.pl -format casava -chr 1 variant.snp > variant.avinput
          convert2annovar.pl -format vcf4 variantfile > variant.avinput
          convert2annovar.pl -format vcf4 -filter pass variantfile -allsample -outfile variant
          convert2annovar.pl -format vcf4old input.vcf > output.avinput
          convert2annovar.pl -format rsid snplist.txt -dbsnpfile snp138.txt > output.avinput
          convert2annovar.pl -format region -seqdir humandb/hg19_seq/ chr1:2000001-2000003 -inssize 1 -delsize 2
          convert2annovar.pl -format transcript NM_022162 -gene humandb/hg19_refGene.txt -seqdir humandb/hg19_seq/

 Version: $Date: 2019-10-24 00:05:27 -0400 (Thu, 24 Oct 2019) $

=head1 OPTIONS

=over 8

=item B<--help>

print a brief usage message and detailed explanation of options.

=item B<--man>

print the complete manual of the program.

=item B<--verbose>

use verbose output.

=item B<--format>

the format of the input files. Currently supported formats include pileup, cg, 
cgmastervar, gff3-solid, soap, maq, casava, vcf4, vcf4old, rsid, region and 
transcript. In August 2013, the VCF file processing subroutine was changed 
(multiple samples in a VCF file can be processed in a genotype-aware manner), 
but users can use vcf4old to obtain results identical to the old behavior.

=item B<--outfile>

specify the output file name. By default, output is written to STDOUT.

=item B<--snpqual>

quality score threshold in the pileup file, such that variant calls with lower 
quality scores will not be printed out in the output file.

=item B<--snppvalue>

SNP p-value threshold in the pileup file, such that variant calls with higher
values will not be printed out in the output file.

=item B<--coverage>

read coverage threshold in the pileup file, such that variant calls generated 
with lower coverage will not be printed in the output file.

=item B<--maxcoverage>

maximum read coverage threshold in the pileup file, such that variant calls generated 
with higher coverage will not be printed in the output file.

=item B<--includeinfo>

specify that the output should contain additional information in the input line. 
By default, only the chr, start, end, reference allele, observed allele and 
homozygosity status are included in output files.

=item B<--chr>

specify the chromosome for CASAVA format

=item B<--chrmt>

specify the name of the mitochondrial chromosome (default is MT)

=item B<--altcov>

the minimum coverage of the alternative (mutated) allele to be printed out in 
output

=item B<--allelicfrac>

print out allelic fraction rather than het/hom status (for pileup format). This 
is useful when processing mitochondria variants.

=item B<--fraction>

specify the minimum fraction of alternative allele, to print out the mutation. 
For example, a site has 10 reads, 3 supports alternative allele. A -fraction of 
0.4 will not allow the mutation to be printed out.

=item B<--species>

specify the species from which the sequencing data is obtained. For the GFF3-
SOLiD format, when species is human, the chromosome 23, 24 and 25 will be 
converted to X, Y and M, respectively.

=item B<--filter>

for VCF4 file, only print out variant calls with this filter annotated. For 
example, if using GATK VariantFiltration walker, you will see PASS, 
GATKStandard, HARD_TO_VALIDATE, etc in the filter field. Using 'pass' as a 
filter is recommended in this case.

=item B<--allsample>

for multi-sample VCF4 file, the --allsample argument will process all samples in 
the file and generate separate output files for each sample. By default, only 
the first sample in VCF4 file will be processed.

=item B<--withzyg>

for VCF4 format, print out zygosity information, coverage information and 
genotype quality information when -includeinfo is used. By default, this 
information is printed out if -includeinfo is not used.

=item B<--genoqual>

minimum genotype quality for the variant in this sample, to be printed out. The 
genotype quality is typically denoted as GQ in the SAMPLE column

=item B<--varqual>

minimum variant quality (the QUAL column in the VCF file) to handle the variant 
in VCF file.

=item B<--comment>

include VCF4 header comment lines in the output file

=item B<--genoqual>

specify the minimum genotype quality score for a variant to be included in the output file

=item B<--varqual>

specify the minimum variant quality score for a variant to be included in the output file

=item B<--dbsnpfile>

specify the dbSNP file to query (for rsid format)

=item B<--withfreq>

include frequency information in the output (for VCF format with multiple
samples)

=item B<--withfilter>

include filter information in the output file (for VCF format)

=item B<--seqdir>

specify the directory for sequence file (for region format)

=item B<--inssize>

specify the insertion size when generating all mutations (for region format)

=item B<--delsize>

specify the deletion size when generating all mutations (for region format)

=item B<--subsize>

specify the substitution size when generating all mutations (for region format)

=item B<--genefile>

specify the gene file from UCSC, which can be refGene, knownGene or ensGene (for
transcript format)

=item B<--splicing_threshold>

specify the splicing threshold (for transcript format)

=item B<--context>

print context for indels which is useful to convert to VCF files (for CASAVA format)

=item B<--avsnpfile>

specify the avsnpfile that will be queried when using rsid as the input file format

=item B<--keepindelref>

do not alter the Ref and Alt alleles for indels in the VCF file (by default the
program automatically changes and shortens the Ref and Alt allele)

=back

=head1 DESCRIPTION

This program is used to convert variant call file generated from various 
software programs into ANNOVAR input format. Currently, the program can handle 
Samtools genotype-calling pileup format, Solid GFF format, Complete Genomics 
variant format, SOAP format, MAQ format, CASAVA format, VCF format. These 
formats are described below.

=over 8

=item * B<pileup format>

The pileup format can be produced by the Samtools genotype calling subroutine. 
Note that the phrase pileup format can be used in several instances, and here I 
am only referring to the pileup files that contain the actual genotype calls. 

Using SamTools, given an alignment file in BAM format, a pileup file with 
genotype calls can be produced by the command below:

	samtools pileup -vcf ref.fa aln.bam> raw.pileup
	samtools.pl varFilter raw.pileup > final.pileup

ANNOVAR will automatically filter the pileup file so that only SNPs reaching a 
quality threshold are printed out (default is 20, use --snpqual argument to 
change this). Most likely, users may want to also apply a coverage threshold, 
such that SNPs calls from only a few reads are not considered. This can be 
achieved using the -coverage argument (default value is 0).

An example of pileup files for SNPs is shown below:

	chr1 556674 G G 54 0 60 16 a,.....,...,.... (B%A+%7B;0;%=B<:
	chr1 556675 C C 55 0 60 16 ,,..A..,...,.... CB%%5%,A/+,%....
	chr1 556676 C C 59 0 60 16 g,.....,...,.... .B%%.%.?.=/%...1
	chr1 556677 G G 75 0 60 16 ,$,.....,...,.... .B%%9%5A6?)%;?:<
	chr1 556678 G K 60 60 60 24 ,$.....,...,....^~t^~t^~t^~t^~t^~t^~t^~t^~t B%%B%<A;AA%??<=??;BA%B89
	chr1 556679 C C 61 0 60 23 .....a...a....,,,,,,,,, %%1%&?*:2%*&)(89/1A@B@@
	chr1 556680 G K 88 93 60 23 ..A..,..A,....ttttttttt %%)%7B:B0%55:7=>>A@B?B;
	chr1 556681 C C 102 0 60 25 .$....,...,....,,,,,,,,,^~,^~. %%3%.B*4.%.34.6./B=?@@>5.
	chr1 556682 A A 70 0 60 24 ...C,...,....,,,,,,,,,,. %:%(B:A4%7A?;A><<999=<<
	chr1 556683 G G 99 0 60 24 ....,...,....,,,,,,,,,,. %A%3B@%?%C?AB@BB/./-1A7?

The columns are chromosome, 1-based coordinate, reference base, consensus base, 
consensus quality, SNP quality, maximum mapping quality of the reads covering 
the sites, the number of reads covering the site, read bases and base qualities.

An example of pileup files for indels is shown below:

	seq2  156 *  +AG/+AG  71  252  99  11  +AG  *  3  8  0

ANNOVAR automatically recognizes both SNPs and indels in pileup file, and processes them correctly.

=item * B<GFF3-SOLiD format>

The SOLiD provides a GFF3-compatible format for SNPs, indels and structural 
variants. A typical example file is given below:

	##gff-version 3
	##solid-gff-version 0.3
	##source-version 2
	##type DNA
	##date 2009-03-13
	##time 0:0:0
	##feature-ontology http://song.cvs.sourceforge.net/*checkout*/song/ontology/sofa.obo?revision=1.141
	##reference-file 
	##input-files Yoruban_snp_10x.txt
	##run-path 
	chr_name        AB_SOLiD SNP caller     SNP     coord   coord   1       .       .       coverage=# cov;ref_base=ref;ref_score=score;ref_confi=confi;ref_single=Single;ref_paired=Paired;consen_base=consen;consen_score=score;consen_confi=conf;consen_single=Single;consen_paired=Paired;rs_id=rs_id,dbSNP129
	1       AB_SOLiD SNP caller     SNP     997     997     1       .       .       coverage=3;ref_base=A;ref_score=0.3284;ref_confi=0.9142;ref_single=0/0;ref_paired=1/1;consen_base=G;consen_score=0.6716;consen_confi=0.9349;consen_single=0/0;consen_paired=2/2
	1       AB_SOLiD SNP caller     SNP     2061    2061    1       .       .       coverage=2;ref_base=G;ref_score=0.0000;ref_confi=0.0000;ref_single=0/0;ref_paired=0/0;consen_base=C;consen_score=1.0000;consen_confi=0.8985;consen_single=0/0;consen_paired=2/2
	1       AB_SOLiD SNP caller     SNP     4770    4770    1       .       .       coverage=2;ref_base=A;ref_score=0.0000;ref_confi=0.0000;ref_single=0/0;ref_paired=0/0;consen_base=G;consen_score=1.0000;consen_confi=0.8854;consen_single=0/0;consen_paired=2/2
	1       AB_SOLiD SNP caller     SNP     4793    4793    1       .       .       coverage=14;ref_base=A;ref_score=0.0723;ref_confi=0.8746;ref_single=0/0;ref_paired=1/1;consen_base=G;consen_score=0.6549;consen_confi=0.8798;consen_single=0/0;consen_paired=9/9
	1       AB_SOLiD SNP caller     SNP     6241    6241    1       .       .       coverage=2;ref_base=T;ref_score=0.0000;ref_confi=0.0000;ref_single=0/0;ref_paired=0/0;consen_base=C;consen_score=1.0000;consen_confi=0.7839;consen_single=0/0;consen_paired=2/2
	
Newer versions of ABI BioScope now use the diBayes caller, and an example output file is given below:

	##gff-version 3
	##feature-ontology http://song.cvs.sourceforge.net/*checkout*/song/ontology/sofa.obo?revision=1.141
	##List of SNPs. Date Sat Dec 18 10:30:45 2010    Stringency: medium Mate Pair: 1 Read Length: 50 Polymorphism Rate: 0.003000 Bayes Coverage: 60 Bayes_Single_SNP: 1 Filter_Single_SNP: 1 Quick_P_Threshold: 0.997000 Bayes_P_Threshold: 0.040000 Minimum_Allele_Ratio: 0.150000 Minimum_Allele_Ratio_Multiple_of_Dicolor_Error: 100
	##1     chr1
	##2     chr2
	##3     chr3
	##4     chr4
	##5     chr5
	##6     chr6
	##7     chr7
	##8     chr8
	##9     chr9
	##10    chr10
	##11    chr11
	##12    chr12
	##13    chr13
	##14    chr14
	##15    chr15
	##16    chr16
	##17    chr17
	##18    chr18
	##19    chr19
	##20    chr20
	##21    chr21
	##22    chr22
	##23    chrX
	##24    chrY
	##25    chrM
	# source-version SOLiD BioScope diBayes(SNP caller)
	#Chr    Source  Type    Pos_Start       Pos_End Score   Strand  Phase   Attributes
	chr1    SOLiD_diBayes   SNP     221367  221367  0.091151        .       .       genotype=R;reference=G;coverage=3;refAlleleCounts=1;refAlleleStarts=1;refAlleleMeanQV=29;novelAlleleCounts=2;novelAlleleStarts=2;novelAlleleMeanQV=27;diColor1=11;diColor2=33;het=1;flag= 
	chr1    SOLiD_diBayes   SNP     555317  555317  0.095188        .       .       genotype=Y;reference=T;coverage=13;refAlleleCounts=11;refAlleleStarts=10;refAlleleMeanQV=23;novelAlleleCounts=2;novelAlleleStarts=2;novelAlleleMeanQV=29;diColor1=00;diColor2=22;het=1;flag= 
	chr1    SOLiD_diBayes   SNP     555327  555327  0.037582        .       .       genotype=Y;reference=T;coverage=12;refAlleleCounts=6;refAlleleStarts=6;refAlleleMeanQV=19;novelAlleleCounts=2;novelAlleleStarts=2;novelAlleleMeanQV=29;diColor1=12;diColor2=30;het=1;flag= 
	chr1    SOLiD_diBayes   SNP     559817  559817  0.094413        .       .       genotype=Y;reference=T;coverage=9;refAlleleCounts=5;refAlleleStarts=4;refAlleleMeanQV=23;novelAlleleCounts=2;novelAlleleStarts=2;novelAlleleMeanQV=14;diColor1=11;diColor2=33;het=1;flag= 
	chr1    SOLiD_diBayes   SNP     714068  714068  0.000000        .       .       genotype=M;reference=C;coverage=13;refAlleleCounts=7;refAlleleStarts=6;refAlleleMeanQV=25;novelAlleleCounts=6;novelAlleleStarts=4;novelAlleleMeanQV=22;diColor1=00;diColor2=11;het=1;flag= 

The file conforms to standard GFF3 specifications, but the last column is 
SOLiD-specific and it gives certain parameters for the SNP calls.

An example of the short indel format by GFF3-SOLiD is given below:

	##gff-version 3
	##solid-gff-version 0.3
	##source-version SOLiD Corona Lite v.4.0r2.0, find-small-indels.pl v 1.0.1, process-small-indels v 0.2.2, 2009-01-12 12:28:49
	##type DNA
	##date 2009-01-26
	##time 18:33:20
	##feature-ontology http://song.cvs.sourceforge.net/*checkout*/song/ontology/sofa.obo?revision=1.141
	##reference-file 
	##input-files ../../mp-results/JOAN_20080104_1.pas,../../mp-results/BARB_20071114_1.pas,../../mp-results/BARB_20080227_2.pas
	##run-path /data/results2/Yoruban-frag-indel/try.01.06/mp-w2x25-2x-4x-8x-10x/2x
	##Filter-settings: max-ave-read-pos=none,min-ave-from-end-pos=9.1,max-nonreds-4filt=2,min-insertion-size=none,min-deletion-size=none,max-insertion-size=none,max-deletion-size=none,require-called-indel-size?=T
	chr1    AB_SOLiD Small Indel Tool       deletion        824501  824501  1       .       .       del_len=1;tight_chrom_pos=824501-824502;loose_chrom_pos=824501-824502;no_nonred_reads=2;no_mismatches=1,0;read_pos=4,6;from_end_pos=21,19;strands=+,-;tags=R3,F3;indel_sizes=-1,-1;read_seqs=G3021212231123203300032223,T3321132212120222323222101;dbSNP=rs34941678,chr1:824502-824502(-),EXACT,1,/GG
	chr1    AB_SOLiD Small Indel Tool       insertion_site  1118641 1118641 1       .       .       ins_len=3;tight_chrom_pos=1118641-1118642;loose_chrom_pos=1118641-1118642;no_nonred_reads=2;no_mismatches=0,1;read_pos=17,6;from_end_pos=8,19;strands=+,+;tags=F3,R3;indel_sizes=3,3;read_seqs=T0033001100022331122033112,G3233112203311220000001002

The keyword deletion or insertion_site is used in the fourth column to indicate 
that file format.

An example of the medium CNV format by GFF3-SOLiD is given below:

	##gff-version 3
	##solid-gff-version 0.3
	##source-version SOLiD Corona Lite v.4.0r2.0, find-small-indels.pl v 1.0.1, process-small-indels v 0.2.2, 2009-01-12 12:28:49
	##type DNA
	##date 2009-01-27
	##time 15:54:36
	##feature-ontology http://song.cvs.sourceforge.net/*checkout*/song/ontology/sofa.obo?revision=1.141
	##reference-file 
	##input-files big_d20e5-del12n_up-ConsGrp-2nonred.pas.sum
	##run-path /data/results2/Yoruban-frag-indel/try.01.06/mp-results-lmp-e5/big_d20e5-indel_950_2050
	chr1    AB_SOLiD Small Indel Tool       deletion        3087770 3087831 1       .       .       del_len=62;tight_chrom_pos=none;loose_chrom_pos=3087768-3087773;no_nonred_reads=2;no_mismatches=2,2;read_pos=27,24;from_end_pos=23,26;strands=-,+;tags=F3,F3;indel_sizes=-62,-62;read_seqs=T11113022103331111130221213201111302212132011113022,T02203111102312122031111023121220311111333012203111
	chr1    AB_SOLiD Small Indel Tool       deletion        4104535 4104584 1       .       .       del_len=50;tight_chrom_pos=4104534-4104537;loose_chrom_pos=4104528-4104545;no_nonred_reads=3;no_mismatches=0,4,4;read_pos=19,19,27;from_end_pos=31,31,23;strands=+,+,-;tags=F3,R3,R3;indel_sizes=-50,-50,-50;read_seqs=T31011011013211110130332130332132110110132020312332,G21031011013211112130332130332132110132132020312332,G20321302023001101123123303103303101113231011011011
	chr1    AB_SOLiD Small Indel Tool       insertion_site  2044888 2044888 1       .       .       ins_len=18;tight_chrom_pos=2044887-2044888;loose_chrom_pos=2044887-2044889;no_nonred_reads=2;bead_ids=1217_1811_209,1316_908_1346;no_mismatches=0,2;read_pos=13,15;from_end_pos=37,35;strands=-,-;tags=F3,F3;indel_sizes=18,18;read_seqs=T31002301231011013121000101233323031121002301231011,T11121002301231011013121000101233323031121000101231;non_indel_no_mismatches=3,1;non_indel_seqs=NIL,NIL
	chr1    AB_SOLiD Small Indel Tool       insertion_site  74832565        74832565        1       .       .       ins_len=16;tight_chrom_pos=74832545-74832565;loose_chrom_pos=74832545-74832565;no_nonred_reads=2;bead_ids=1795_181_514,1651_740_519;no_mismatches=0,2;read_pos=13,13;from_end_pos=37,37;strands=-,-;tags=F3,R3;indel_sizes=16,16;read_seqs=T33311111111111111111111111111111111111111111111111,G23311111111111111111111111111111111111111311011111;non_indel_no_mismatches=1,0;non_indel_seqs=NIL,NIL

An example of the large indel format by GFF3-SOLiD is given below:

	##gff-version 3
	##solid-gff-version 0.3
	##source-version ???
	##type DNA
	##date 2009-03-13
	##time 0:0:0
	##feature-ontology http://song.cvs.sourceforge.net/*checkout*/song/ontology/sofa.obo?revision=1.141
	##reference-file 
	##input-files /data/results5/yoruban_strikes_back_large_indels/LMP/five_mm_unique_hits_no_rescue/5_point_6x_del_lib_1/results/NA18507_inter_read_indels_5_point_6x.dat
	##run-path 
	chr1    AB_SOLiD Large Indel Tool       insertion_site  1307279 1307791 1       .       .       deviation=-742;stddev=7.18;ref_clones=-;dev_clones=4
	chr1    AB_SOLiD Large Indel Tool       insertion_site  2042742 2042861 1       .       .       deviation=-933;stddev=8.14;ref_clones=-;dev_clones=3
	chr1    AB_SOLiD Large Indel Tool       insertion_site  2443482 2444342 1       .       .       deviation=-547;stddev=11.36;ref_clones=-;dev_clones=17
	chr1    AB_SOLiD Large Indel Tool       insertion_site  2932046 2932984 1       .       .       deviation=-329;stddev=6.07;ref_clones=-;dev_clones=14
	chr1    AB_SOLiD Large Indel Tool       insertion_site  3166925 3167584 1       .       .       deviation=-752;stddev=13.81;ref_clones=-;dev_clones=14

An example of the CNV format by GFF3-SOLiD is given below:

	##gff-version 3
	##solid-gff-version 0.3
	##source-version ???
	##type DNA
	##date 2009-03-13
	##time 0:0:0
	##feature-ontology http://song.cvs.sourceforge.net/*checkout*/song/ontology/sofa.obo?revision=1.141
	##reference-file 
	##input-files Yoruban_cnv.coords
	##run-path 
	chr1    AB_CNV_PIPELINE repeat_region   1062939 1066829 .       .       .       fraction_mappable=51.400002;logratio=-1.039300;copynum=1;numwindows=1
	chr1    AB_CNV_PIPELINE repeat_region   1073630 1078667 .       .       .       fraction_mappable=81.000000;logratio=-1.409500;copynum=1;numwindows=2
	chr1    AB_CNV_PIPELINE repeat_region   2148325 2150352 .       .       .       fraction_mappable=98.699997;logratio=-1.055000;copynum=1;numwindows=1
	chr1    AB_CNV_PIPELINE repeat_region   2245558 2248109 .       .       .       fraction_mappable=78.400002;logratio=-1.042900;copynum=1;numwindows=1
	chr1    AB_CNV_PIPELINE repeat_region   3489252 3492632 .       .       .       fraction_mappable=59.200001;logratio=-1.119900;copynum=1;numwindows=1
	chr1    AB_CNV_PIPELINE repeat_region   5654415 5657276 .       .       .       fraction_mappable=69.900002;logratio=1.114500;copynum=4;numwindows=1
	chr1    AB_CNV_PIPELINE repeat_region   9516165 9522726 .       .       .       fraction_mappable=65.850006;logratio=-1.316700;numwindows=2
	chr1    AB_CNV_PIPELINE repeat_region   16795117        16841025        .       .       .       fraction_mappable=44.600002;logratio=1.880778;copynum=7;numwindows=9

The keyword repeat_region is used here, although it actually refers to CNVs.

An example of the inversion format by GFF3-SOLiD is given below:

	##gff-version 3
	##solid-gff-version 0.2
	##generated by SOLiD inversion tool
	chr10   AB_SOLiD        inversion       46443107        46479585        268.9   .       .       left=chr10:46443107-46443146;right=chr10:46479583-46479585;leftscore=295.0;rightscore=247.0;count_AAA_further_left=117;count_AAA_left=3;count_AAA_right=3;count_AAA_further_right=97;left_min_count_AAA=chr10:46443107-46443112;count_AAA_min_left=0;count_AAA_max_left=3;right_min_count_AAA=chr10:46479585-46479585;count_AAA_min_right=1;count_AAA_max_right=3;homozygous=UNKNOWN
	chr4    AB_SOLiD        inversion       190822813       190850112       214.7   .       .       left=chr4:190822813-190822922;right=chr4:190850110-190850112;leftscore=140.0;rightscore=460.0;count_AAA_further_left=110;count_AAA_left=78;count_AAA_right=74;count_AAA_further_right=77;left_min_count_AAA=chr4:190822813-190822814;count_AAA_min_left=69;count_AAA_max_left=77;right_min_count_AAA=chr4:190850110-190850112;count_AAA_min_right=74;count_AAA_max_right=74;homozygous=NO
	chr6    AB_SOLiD        inversion       168834969       168837154       175.3   .       .       left=chr6:168834969-168835496;right=chr6:168836643-168837154;leftscore=185.4;rightscore=166.2;count_AAA_further_left=67;count_AAA_left=43;count_AAA_right=40;count_AAA_further_right=59;left_min_count_AAA=chr6:168835058-168835124,chr6:168835143-168835161,chr6:168835176-168835181,chr6:168835231-168835262;count_AAA_min_left=23;count_AAA_max_left=29;right_min_count_AAA=chr6:168836643-168836652;count_AAA_min_right=23;count_AAA_max_right=31;homozygous=NO

The program should be able to recognize all the above GFF3-SOLiD format 
automatically, and handle them accordingly.

=item * B<Complete Genomics format>

This format is provided by the Complete Genomics company to their customers. The 
file var-[ASM-ID].tsv.bz2 includes a description of all loci where the assembled 
genome differs from the reference genome.

An example of the Complete Genomics format is shown below:

	#BUILD  1.5.0.5
	#GENERATED_AT   2009-Nov-03 19:52:21.722927
	#GENERATED_BY   dbsnptool
	#TYPE   VAR-ANNOTATION
	#VAR_ANN_SET    /Proj/Pipeline/Production_Data/REF/HUMAN-F_06-REF/dbSNP.csv
	#VAR_ANN_TYPE   dbSNP
	#VERSION        0.3
	
	>locus  ploidy  haplotype       chromosome      begin   end     varType reference       alleleSeq       totalScore      hapLink xRef
	1       2       all     chr1    0       959     no-call =       ?                       
	2       2       all     chr1    959     972     =       =       =                       
	3       2       all     chr1    972     1001    no-call =       ?                       
	4       2       all     chr1    1001    1008    =       =       =                       
	5       2       all     chr1    1008    1114    no-call =       ?                       
	6       2       all     chr1    1114    1125    =       =       =                       
	7       2       all     chr1    1125    1191    no-call =       ?                       
	8       2       all     chr1    1191    1225    =       =       =                       
	9       2       all     chr1    1225    1258    no-call =       ?                       
	10      2       all     chr1    1258    1267    =       =       =                       
	12      2       all     chr1    1267    1275    no-call =       ?                       
	13      2       all     chr1    1275    1316    =       =       =                       
	14      2       all     chr1    1316    1346    no-call =       ?                       
	15      2       all     chr1    1346    1367    =       =       =                       
	16      2       all     chr1    1367    1374    no-call =       ?                       
	17      2       all     chr1    1374    1388    =       =       =                       
	18      2       all     chr1    1388    1431    no-call =       ?                       
	19      2       all     chr1    1431    1447    =       =       =                       
	20      2       all     chr1    1447    1454    no-call =       ?                       

The following information is provided in documentation from Complete Genomics, that describes the var-ASM format.

	1. locus. Identifier of a particular genomic locus
	2. ploidy. The ploidy of the reference genome at the locus (= 2 for autosomes, 2 for pseudoautosomal regions on the sex chromosomes, 1 for males on the non-pseudoautosomal parts of the sex chromosomes, 1 for mitochondrion, '?' if varType is 'no-ref' or 'PAR-called-in-X'). The reported ploidy is fully determined by gender, chromosome and location, and is not inferred from the sequence data.
	3. haplotype. Identifier for each haplotype at the variation locus. For diploid genomes, 1 or 2. Shorthand of 'all' is allowed where the varType field is one of 'ref', 'no-call', 'no-ref', or 'PAR-called-in-X'. Haplotype numbering does not imply phasing; haplotype 1 in locus 1 is not necessarily in phase with haplotype 1 in locus 2. See hapLink, below, for phasing information.
	4. chromosome. Chromosome name in text: 'chr1','chr2', ... ,'chr22','chrX','chrY'. The mitochondrion is represented as 'chrM'. The pseudoautosomal regions within the sex chromosomes X and Y are reported at their coordinates on chromosome X.
	5. begin. Reference coordinate specifying the start of the variation (not the locus) using the half-open zero-based coordinate system. See section 'Sequence Coordinate System' for more information.
	6. end. Reference coordinate specifying the end of the variation (not the locus) using the half-open zero-based coordinate system. See section 'Sequence Coordinate System' for more information.
	7. varType. Type of variation, currently one of:
		snp: single-nucleotide polymorphism
		ins: insertion
		del: deletion
		sub: Substitution of one or more reference bases with the bases in the allele column
		'ref' : no variation; the sequence is identical to the reference sequence on the indicated haplotype
		no-call-rc: 'no-call reference consistent' one or more bases are ambiguous, but the allele is potentially consistent with the reference
		no-call-ri: 'no-call reference inconsistent' one or more bases are ambiguous, but the allele is definitely inconsistent with the reference
		no-call: an allele is completely indeterminate in length and composition, i.e. alleleSeq = '?'
		no-ref: the reference sequence is unspecified at this locus.
		PAR-called-in-X: this locus overlaps one of the pseudoautosomal regions on the sex chromosomes. The called sequence is reported as diploid sequence on Chromosome X; on chromosome Y the sequence is reported as varType = 'PAR-called-in-X'.
	8. reference. The reference sequence for the locus of variation. Empty when varType is ins. A value of '=' indicates that the user must consult the reference for the sequence; this shorthand is only used in regions where no haplotype deviates from the reference sequence.
	9. alleleSeq. The observed sequence at the locus of variation. Empty when varType is del. '?' is used to indicate 0 or more unknown bases within the sequence; 'N' is used to indicate exactly one unknown base within the sequence. '=' is used as shorthand to indicate identity to the reference sequence for non-variant sequence, i.e. when varType is 'ref'.
	10. totalScore. A score corresponding to a single variation and haplotype, representing the confidence in the call.
	11. hapLink. Identifier that links a haplotype at one locus to haplotypes at other loci. Currently only populated for very proximate variations that were assembled together. Two calls that share a hapLink identifier are expected to be on the same haplotype.
	12. xRef. Field containing external variation identifiers, currently only populated for variations corroborated directly by dbSNP. Format: dbsnp:[rsID], with multiple entries separated by the semicolon (;).

In older versions of the format specification, the sub keyword used to be insdel 
keyword. ANNOVAR takes care of this.

=item * B<SOAPsnp format>

An example of the SOAP SNP caller format is shown below:

	chr8  35782  A  R  1  A  27  1  2  G  26  1  2  5   0.500000  2.00000  1  5   
	chr8  35787  G  R  0  G  25  4  6  A  17  2  4  10  0.266667  1.60000  0  5   

The following information is provided in documentation from BGI who developed 
SOAP suite. It differs slightly from the description at the SOAPsnp website, and 
presumably the website is outdated.

	Format description:(left to right)
	1. Chromosome name
	2. Position of locus
	3. Nucleotide at corresponding locus of reference sequence
	4. Genotype of sequencing sample
	5. Quality value
	6. nucleotide with the highest probability(first nucleotide)
	7. Quality value of the nucleotide with the highest probability
	8. Number of supported reads that can only be aligned to this locus 
	9. Number of all supported reads that can be aligned to this locus
	10. Nucleotide with higher probability 
	11. Quality value of nucleotide with higher probability 
	12. Number of supported reads that can only be aligned to this locus 
	13. Number of all supported reads that can be aligned to this locus 
	14. Total number of reads that can be aligned to this locus 
	15. Order and quality value
	16. Estimated copy number for this locus 
	17. Presence of this locus in the dbSNP database. 1 refers to presence and 0 refers to inexistence
	18. The distance between this locus and another closest SNP

Later SOAPsnp changed its output format to 17 columns. An example of the format is shown below:

	1	12837840	G	C	12	C	37	5	5	G	0	0	0	5	1.00000	1.00000	0
	1	12853805	T	K	0	T	39	1	1	G	35	1	1	2	1.00000	1.00000	0


The following information is provided on SOAPsnp website as of 16Apr2013,
and it is slightly different from the documentation with SOAPsnp, which only
has 14 columns.

	The result of SOAPsnp has 17 columns:
	1)  Chromosome ID
	2)  Coordinate on chromosome, start from 1
	3)  Reference genotype
	4)  Consensus genotype
	5)  Quality score of consensus genotype
	6)  Best base
	7)  Average quality score of best base
	8)  Count of uniquely mapped best base
	9)  Count of all mapped best base
	10) Second best bases
	11) Average quality score of second best base
	12) Count of uniquely mapped second best base
	13) Count of all mapped second best base
	14) Sequencing depth of the site
	15) Rank sum test p_value
	16) Average copy number of nearby region
	17) Whether the site is a dbSNP.

=item * B<SOAPindel format>

The current version of ANNOVAR handles SoapSNP and SoapIndel automatically via a 
single argument '--format soap'. An example of SOAP indel caller format is shown 
below:

	chr11   44061282        -       +2      CT      Hete
	chr11   45901572        +       +1      C       Hete
	chr11   48242562        *       -3      TTC     Homo
	chr11   57228723        *       +4      CTTT    Homo
	chr11   57228734        *       +4      CTTT    Homo
	chr11   57555685        *       -1      C       Hete
	chr11   61482191        -       +3      TCC     Hete
	chr11   64608031        *       -1      T       Homo
	chr11   64654936        *       +1      C       Homo
	chr11   71188303        +       -1      T       Hete
	chr11   75741034        +       +1      T       Hete
	chr11   76632438        *       +1      A       Hete
	chr11   89578266        *       -2      AG      Homo
	chr11   104383261       *       +1      T       Hete
	chr11   124125940       +       +4      CCCC    Hete
	chr12   7760052 *       +1      T       Homo
	chr12   8266049 *       +3      ACG     Homo

I have not seen any documentation describing this format as of September 2010.

=item * B<SOAPsv format>

An example is given below:

	Chr2 Deletion 42894 43832 43167 43555 388 0-0-0 FR 41

An explanation of the structural variation format is given below:

	Format description (from left to right)
	1. Chromosome name
	2. Type of structure variation
	3. Minimal value of start position in cluster
	4. Maximal value of end position in cluster
	5. Estimated start position of this structure variation
	6. Estimated end position of this structure variation
	7. Length of SV
	8. Breakpoint of SV (only for insertion)
	9. Unusual matching mode (F refers to align with forward sequence, R refers to align with reverse sequence)
	10. Number of paired-end reads which support this structure variation

=item * B<MAQ format>

MAQ can perform alignment and generate genotype calls, including SNP calls and 
indel calls. The format is described below:

For indel header: The output is TAB delimited with each line consisting of chromosome, start 
position, type of the indel, number of reads across the indel, size of the indel 
and inserted/deleted nucleotides (separated by colon), number of indels on the 
reverse strand, number of indels on the forward strand, 5' sequence ahead of the 
indel, 3' sequence following the indel, number of reads aligned without indels 
and three additional columns for filters.

An example is below:

	chr10   110583  -       2       -2:AG   0       1       GCGAGACTCAGTATCAAAAAAAAAAAAAAAAA        AGAAAGAAAGAAAAAGAAAAAAATAGAAAGAA        1       @2,     @72,   @0,
	chr10   120134  -       8       -2:CA   0       1       CTCTTGCCCGCTCACACATGTACACACACGCG        CACACACACACACACACATCAGCTACCTACCT        7       @65,62,61,61,45,22,7,   @9,12,13,13,29,52,67,   @0,0,0,0,0,0,0,
	chr10   129630  -       1       -1:T    1       0       ATGTTGTGACTCTTAATGGATAAGTTCAGTCA        TTTTTTTTTAGCTTTTAACCGGACAAAAAAAG        0       @       @      @
	chr10   150209  -       1       4:TTCC  1       0       GCATATAGGGATGGGCACTTTACCTTTCTTTT        TTCCTTCCTTCCTTCCTTCCCTTTCCTTTCCT        0       @       @      @
	chr10   150244  -       2       -4:TTCT 0       1       CTTCCTTCCTTCCTTCCCTTTCCTTTCCTTTC        TTCTTTCTTTCTTTCTTTCTTTTTTTTTTTTT        0       @       @      @
	chr10   159622  -       1       3:AGG   0       1       GAAGGAGGAAGGACGGAAGGAGGAAGGAAGGA        AGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGA        0       @       @      @
	chr10   206372  -       2       2:GT    1       0       ATAATAGTAACTGTGTATTTGATTATGTGTGC        GTGTGTGTGTGTGTGTGTGTGTGTGCGTGCTT        1       @37,    @37,   @8,
	chr10   245751  -       11      -1:C    0       1       CTCATAAATACAAGTCATAATGAAAGAAATTA        CCACCATTTTCTTATTTTCATTCATTTTTAGT        10      @69,64,53,41,30,25,22,14,5,4,   @5,10,21,33,44,49,52,60,69,70,  @0,0,0,0,0,0,0,0,0,0,
	chr10   253066  -       1       2:TT    0       1       TATTGATGAGGGTGGATTATACTTTAGAACAC        TATTCAAACAGTTCTTCCACATATCTCCCTTT        0       @       @      @
	chr10   253455  -       2       -3:AAA  1       0       GTTGCACTCCAGCCTGGCGAGATTCTGTCTCC        AAAAAAAAAAAAAAAAATTGTTGTGAAATACA        1       @55,    @19,   @4,

For the SNP output file: each line consists of chromosome, position, reference 
base, consensus base, Phred-like consensus quality, read depth, the average 
number of hits of reads covering this position, the highest mapping quality of 
the reads covering the position, the minimum consensus quality in the 3bp 
flanking regions at each side of the site (6bp in total), the second best call, 
log likelihood ratio of the second best and the third best call, and the third 
best call.

An example is below:

	chr10   83603   C       T       28      12      2.81    63      34      Y       26      C
	chr10   83945   G       R       59      61      4.75    63      62      A       47      G
	chr10   83978   G       R       47      40      3.31    63      62      A       21      G
	chr10   84026   G       R       89      22      2.44    63      62      G       49      A
	chr10   84545   C       T       54      9       1.69    63      30      N       135     N
	chr10   85074   G       A       42      5       1.19    63      38      N       108     N
	chr10   85226   A       T       42      5       1.00    63      42      N       107     N
	chr10   85229   C       T       42      5       1.00    63      42      N       112     N
	chr10   87518   A       G       39      4       3.25    63      38      N       9       N
	chr10   116402  T       C       39      4       1.00    63      38      N       76      N


=item * B<CASAVA format>

An example of Illumina CASAVA format is given below:

	#position       A       C       G       T       modified_call   total   used    score           reference       type
	14930   3       0       8       0       GA      11      11      29.10:11.10             A       SNP_het2
	14933   4       0       7       0       GA      11      11      23.50:13.70             G       SNP_het1
	14976   3       0       8       0       GA      11      11      24.09:9.10              G       SNP_het1
	15118   2       1       4       0       GA      8       7       10.84:6.30              A       SNP_het2

An example of the indels is given below:

	# ** CASAVA depth-filtered indel calls **
	#$ CMDLINE /illumina/pipeline/install/CASAVA_v1.7.0/libexec/CASAVA-1.7.0/filterIndelCalls.pl--meanReadDepth=2.60395068970547 --indelsCovCutoff=-1 --chrom=chr1.fa /data/Basecalls/100806_HARMONIAPILOT-H16_0338_A2065HABXX/Data/Intensities/BaseCalls/CASAVA_PE_L2/Parsed_14-08-10/chr1.fa/Indel/varling_indel_calls_0000.txt /data/Basecalls/100806_HARMONIAPILOT-H16_0338_A2065HABXX/Data/Intensities/BaseCalls/CASAVA_PE_L2/Parsed_14-08-10/chr1.fa/Indel/varling_indel_calls_0001.txt /data/Basecalls/100806_HARMONIAPILOT-H16_0338_A2065HABXX/Data/Intensities/BaseCalls/CASAVA_PE_L2/Parsed_14-08-10/chr1.fa/Indel/varling_indel_calls_0002.txt /data/Basecalls/100806_HARMONIAPILOT-H16_0338_A2065HABXX/Data/Intensities/BaseCalls/CASAVA_PE_L2/Parsed_14-08-10/chr1.fa/Indel/varling_indel_calls_0003.txt /data/Basecalls/100806_HARMONIAPILOT-H16_0338_A2065HABXX/Data/Intensities/BaseCalls/CASAVA_PE_L2/Parsed_14-08-10/chr1.fa/Indel/varling_indel_calls_0004.txt
	#$ CHROMOSOME chr1.fa
	#$ MAX_DEPTH undefined
	#
	#$ COLUMNS pos CIGAR ref_upstream ref/indel ref_downstream Q(indel) max_gtype Q(max_gtype) max2_gtype bp1_reads ref_reads indel_reads other_reads repeat_unit ref_repeat_count indel_repeat_count
	948847  1I      CCTCAGGCTT      -/A     ATAATAGGGC      969     hom     47      het     22      0       16      6       A       1       2
	978604  2D      CACTGAGCCC      CT/--   GTGTCCTTCC      251     hom     20      het     8       0       4       4       CT      1       0
	1276974 4I      CCTCATGCAG      ----/ACAC       ACACATGCAC      838     hom     39      het     18      0       14      4       AC      2       4
	1289368 2D      AGCCCGGGAC      TG/--   GGAGCCGCGC      1376    hom     83      het     33      0       25      9       TG      1       0

=item * B<VCF4 format>

VCF4 can be used to describe either population-level variation information or 
variation in reads derived from a single individual.

One example of the indel format for one individual is given below:

	##fileformat=VCFv4.0
	##IGv2_bam_file_used=MIAPACA2.alnReAln.bam
	##INFO=<ID=AC,Number=2,Type=Integer,Description="# of reads supporting consensus indel/any indel at the site">
	##INFO=<ID=DP,Number=1,Type=Integer,Description="total coverage at the site">
	##INFO=<ID=MM,Number=2,Type=Float,Description="average # of mismatches per consensus indel-supporting read/per reference-supporting read">
	##INFO=<ID=MQ,Number=2,Type=Float,Description="average mapping quality of consensus indel-supporting reads/reference-supporting reads">
	##INFO=<ID=NQSBQ,Number=2,Type=Float,Description="Within NQS window: average quality of bases from consensus indel-supporting reads/from reference-supporting reads">
	##INFO=<ID=NQSMM,Number=2,Type=Float,Description="Within NQS window: fraction of mismatching bases in consensus indel-supporting reads/in reference-supporting reads">
	##INFO=<ID=SC,Number=4,Type=Integer,Description="strandness: counts of forward-/reverse-aligned indel-supporting reads / forward-/reverse-aligned reference supporting reads">
	##IndelGenotyperV2=""
	##reference=hg18.fa
	##source=IndelGenotyperV2
	#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  Miapaca_trimmed_sorted.bam      
	chr1    439     .       AC      A       .       PASS    AC=5,5;DP=7;MM=7.0,3.0;MQ=23.4,1.0;NQSBQ=23.98,25.5;NQSMM=0.04,0.0;SC=2,3,0,2   GT      1/0
	chr1    714048  .       T       TCAAC   .       PASS    AC=3,3;DP=9;MM=3.0,7.1666665;MQ=1.0,10.833333;NQSBQ=23.266666,21.932203;NQSMM=0.0,0.15254237;SC=3,0,3,3 GT      0/1
	chr1    714049  .       G       GC      .       PASS    AC=3,3;DP=9;MM=3.0,7.1666665;MQ=1.0,10.833333;NQSBQ=23.233334,21.83051;NQSMM=0.0,0.15254237;SC=3,0,3,3  GT      0/1
	chr1    813675  .       A       AATAG   .       PASS    AC=5,5;DP=8;MM=0.4,1.0;MQ=5.0,67.0;NQSBQ=25.74,25.166666;NQSMM=0.0,0.033333335;SC=4,1,1,2       GT      0/1
	chr1    813687  .       AGAGAGAGAGAAG   A       .       PASS    AC=5,5;DP=8;MM=0.4,1.0;MQ=5.0,67.0;NQSBQ=24.54,25.2;NQSMM=0.02,0.06666667;SC=4,1,1,2    GT      1/0

=item * B<annovar2vcf format>

This is useful for converting certain ANNOVAR files to VCF format. These ANNOVAR 
input files MUST include zygosity, quality and filter information as the 3 extra 
columns after Chr, Start, End, Ref and Alt alleles.


=back

The code was written by Dr. Kai Wang and modified by Dr. Germán Gastón Leparc. 
Various users have provided sample input files for many SNP calling software 
tools, for the development of conversion subroutines. We thank these users for 
their continued support in improving the functionality of the script.

For questions or comments, please contact kai@openbioinformatics.org.

=cut