Merge pull request #44 from Illumina/release/1.2.0

Release/1.2.0
Illumina · Jun 6, 2019 · 2e4346d · 2e4346d
2 parents 3030b10 + 8e2ee98
commit 2e4346d
Show file tree

Hide file tree

Showing 25 changed files with 2,530 additions and 249 deletions.
diff --git a/BAlleleFreqFormat.py b/BAlleleFreqFormat.py
@@ -0,0 +1,130 @@
+from vcf.parser import _Format
+from IlluminaBeadArrayFiles import RefStrand
+import numpy as np
+
+REVERSE_COMPLEMENT = {"A": "T", "T": "A", "G": "C", "C": "G", "I": "I", "D": "D"}
+
+def extract_alleles_from_snp_string(snp_string):
+    """
+    Splits SNP string into tuple of individual alleles
+
+    Args: snp_string (string): allele string of the format
+        [T/C] or [G/C]
+
+    Returns:
+         allele1 (string), allele2 (string)
+    """
+    (allele1, allele2) = snp_string[1:4].split("/")
+    assert allele1 in REVERSE_COMPLEMENT, "allele %r is invalid (expected A,T,G,C,I,D)" % allele1
+    assert allele2 in REVERSE_COMPLEMENT, "allele %r is invalid (expected A,T,G,C,I,D)" % allele2
+    return allele1, allele2
+
+
+def extract_reverse_complement_alleles_from_snp_string(snp_string):
+    """
+    Splits SNP string into tuple of individual alleles and
+    gets takes the reverse compliment to match strand 1
+
+    Args: snp_string (string): allele string of the format
+        [T/C] or [G/C]
+
+    Returns:
+         allele1 (string), allele2 (string)
+    """
+    # get alleles as tuple
+    allele1, allele2 = extract_alleles_from_snp_string(snp_string)
+    # get reverse compliment of bases and return
+    return REVERSE_COMPLEMENT[allele1], REVERSE_COMPLEMENT[allele2]
+
+
+def convert_indel_alleles(snp_string, vcf_record):
+    """
+    Splits SNP string into tuple of individual alleles and
+    converts to actual insertion/deletion alleles from the reference
+
+    Args: snp_string (string): allele string of the format
+        [I/D] or [D/I]
+
+    Returns:
+         allele1 (string), allele2 (string)
+    """
+    # get alleles as tuple, we only need allele1 because if its "I" we can assume allele2 will be "D" and vice-versa
+    allele1, _ = extract_alleles_from_snp_string(snp_string)
+
+    assert len(vcf_record.ALT) == 1, "Cannot convert indel for BAF with multiple ALT alleles"
+    alt_allele = str(vcf_record.ALT[0])
+    ref_allele = vcf_record.REF
+    # Assuming whichever sequence is smaller is the deletion and the larger is the insertion
+    assert len(alt_allele) > len(ref_allele) or len(ref_allele) > len(alt_allele), "REF allele %r and ALT allele %r same length, cannot determine insertion or deletion" % (ref_allele, alt_allele)
+    deletion_allele, insertion_allele = (alt_allele, ref_allele) if len(alt_allele) < len(ref_allele) else (ref_allele, alt_allele)
+    if allele1 == "I":
+        return insertion_allele, deletion_allele
+    return deletion_allele, insertion_allele
+
+class BAlleleFreqFormat(object):
+    """
+    Generate b allele frequency format information for VCF
+    """
+    def __init__(self, logger, b_allele_freq):
+        self._b_allele_freq = b_allele_freq
+        self._logger = logger
+
+    @staticmethod
+    def get_id():
+        return "BAF"
+
+    @staticmethod
+    def get_description():
+        return "B Allele Frequency"
+
+    @staticmethod
+    def get_format_obj():
+        # arguments should be: ['id', 'num', 'type', 'desc']
+        return _Format(BAlleleFreqFormat.get_id(), 1, "Float", BAlleleFreqFormat.get_description())
+
+    def generate_sample_format_info(self, bpm_records, vcf_record, sample_name):
+        """
+        Get the sample B allele frequency
+
+        Args:
+
+        Returns:
+            float: B allele frequency
+        """
+        # if we have more than 1 BPMRecord, need to merge
+        b_allele_freq_list = []
+        # if we have more than 1 alt allele, return missing
+        if len(vcf_record.ALT) > 1:
+            return "."
+        for i in range(len(bpm_records)):
+            bpm_record = bpm_records[i]
+            snp = bpm_record.snp
+            strand = bpm_record.ref_strand
+            idx = bpm_record.index_num
+            b_allele_freq = self._b_allele_freq[idx]
+
+            # if indel, convert to actual ref and alt sequences
+            if bpm_record.is_indel():
+                allele1, allele2 = convert_indel_alleles(snp, vcf_record)
+            # if  2/minus strand, get rev comp
+            elif strand == RefStrand.Minus:
+                allele1, allele2 = extract_reverse_complement_alleles_from_snp_string(snp)
+            else:
+                allele1, allele2 = extract_alleles_from_snp_string(snp)
+
+            # normalize to reference allele
+            ref_allele = vcf_record.REF
+            assert allele1 == ref_allele or allele2 == ref_allele, "allele1: %r or allele2 %r do not match ref: %r" % (allele1, allele2, ref_allele)
+            # if allele1 is reference, then B allele is correct
+            if allele1 == ref_allele:
+                b_allele_freq_list.append(b_allele_freq)
+            # if allele2 is reference, then we need to do 1-freq
+            else:
+                b_allele_freq_list.append(1. - b_allele_freq)
+
+        # nanmedian ignores NaN values
+        final_b_allele_freq = np.nanmedian(b_allele_freq_list)
+        if np.isnan(final_b_allele_freq):
+            return "."
+        else:
+            return final_b_allele_freq
diff --git a/BPMReader.py b/BPMReader.py
@@ -35,7 +35,7 @@ def get_bpm_records(self):
             BPMRecord: Next BPMRecord in the file
         """
         bpm = self._bpm
-        for idx in xrange(len(bpm.addresses)):
+        for idx in range(len(bpm.addresses)):
             yield BPMRecord(bpm.names[idx], bpm.addresses[idx], None, bpm.chroms[idx], bpm.map_infos[idx], bpm.snps[idx], bpm.ref_strands[idx], bpm.assay_types[idx], None, None, None, None, idx, self._logger)
 
 class ManifestFilter(object):

diff --git a/BPMRecord.py b/BPMRecord.py
@@ -87,7 +87,7 @@ def __init__(self, name, address_a, probe_a, chromosome, pos, snp, ref_strand, a
 
         Args:
             name (string) : Name field from manifest
-            name (string) : AddressA_ID field from manifest
+            address_a (string) : AddressA_ID field from manifest
             probe_a (string) : AlleleA_ProbeSeq field from manifest
             chromosome (string) : Chr field from manifest
             pos (string, int) : MapInfo field from manifest

diff --git a/FormatFactory.py b/FormatFactory.py
@@ -1,27 +1,48 @@
 from GenotypeFormat import GenotypeFormat
 from GencallFormat import GencallFormat
+from BAlleleFreqFormat import BAlleleFreqFormat
+from LogRRatioFormat import LogRRatioFormat
+
 
 class FormatFactory(object):
     """FormatFactory is responsible for determining the fields
     that will be present in the FORMAT string and creating
     the appropriate corresponding format objects
     """
 
-    def __init__(self, no_samples, logger):
+    def __init__(self, no_samples, formats_to_include, logger):
         """
         Create a new FormatFactory
 
         Args:
             no_samples (bool): True iff no sample data is present (format string should be empty)
+            formats_to_include list(String): list of FORMAT IDs to include (default: GT,GC)
 
         Returns:
             FormatFactory
         """
         self._logger = logger
         self._format_classes = []
+        for fmt in formats_to_include:
+            assert fmt in FormatFactory.get_possible_formats(), "%r is not a valid FORMAT" % fmt
+        self._formats_to_include = formats_to_include
         if not no_samples:
-            self._format_classes.append(GenotypeFormat)
-            self._format_classes.append(GencallFormat)
+            if GenotypeFormat.get_id() in formats_to_include:
+                self._format_classes.append(GenotypeFormat)
+            if GencallFormat.get_id() in formats_to_include:
+                self._format_classes.append(GencallFormat)
+            if BAlleleFreqFormat.get_id() in formats_to_include:
+                self._format_classes.append(BAlleleFreqFormat)
+            if LogRRatioFormat.get_id() in formats_to_include:
+                self._format_classes.append(LogRRatioFormat)
+
+    @staticmethod
+    def get_possible_formats():
+        valid_classes = [GenotypeFormat.get_id(),
+                         GencallFormat.get_id(),
+                         BAlleleFreqFormat.get_id(),
+                         LogRRatioFormat.get_id()]
+        return valid_classes
 
     def get_format_id_string(self):
         """
@@ -70,8 +91,18 @@ def create_formats(self, gtc):
             None
         """
         result = []
-        result.append(GenotypeFormat(
-            self._logger, gtc.get_gender(), gtc.get_genotypes()))
-        result.append(GencallFormat(
-            self._logger, gtc.get_genotype_scores()))
+        # only append if ID is in command line args
+        if GenotypeFormat.get_id() in self._formats_to_include:
+            result.append(GenotypeFormat(
+                self._logger, gtc.get_gender(), gtc.get_genotypes()))
+        if GencallFormat.get_id() in self._formats_to_include:
+            result.append(GencallFormat(
+                self._logger, gtc.get_genotype_scores()))
+        if BAlleleFreqFormat.get_id() in self._formats_to_include:
+            result.append(BAlleleFreqFormat(
+                self._logger, gtc.get_ballele_freqs()))
+        if LogRRatioFormat.get_id() in self._formats_to_include:
+            result.append(LogRRatioFormat(
+                self._logger, gtc.get_logr_ratios()))
+
         return result
diff --git a/GenotypeFormat.py b/GenotypeFormat.py
@@ -13,9 +13,8 @@ def get_expected_ploidy(gender, chrom):
         Returns
             int: value of expected ploidy, currently set at 1 or 2
     """
-    if (gender == "M" and chrom == "X") or chrom == "Y" or chrom == "MT":
+    if (gender == b"M" and chrom == "X") or chrom == "Y" or chrom == "MT":
         return 1
-
     return 2
 
 
@@ -221,7 +220,7 @@ def _filter_inconsistent_genotypes(self, possible_genotypes):
                 length 2 where each string is a nucleotide on the plus strand
         """
         idx2inconsistent = [False] * len(possible_genotypes)
-        for idx in xrange(len(possible_genotypes)):
+        for idx in range(len(possible_genotypes)):
             for record in self._bpm_records:
                 if self._record_inconsistent_with_genotype(record, possible_genotypes[idx]):
                     idx2inconsistent[idx] = True

diff --git a/IlluminaBeadArrayFiles.py b/IlluminaBeadArrayFiles.py
@@ -139,7 +139,7 @@ def __init__(self, filename, ignore_version = False, check_write_complete = True
         self.filename = filename
         with open(self.filename, "rb") as gtc_handle:
             identifier = gtc_handle.read(3)
-            if identifier != "gtc":
+            if identifier != b'gtc':
                 raise Exception("GTC format error: bad format identifier")
             self.version = read_byte(gtc_handle)
             if self.version not in GenotypeCalls.supported_version and not ignore_version:
@@ -151,7 +151,7 @@ def __init__(self, filename, ignore_version = False, check_write_complete = True
             # to the lookup
             #
             self.toc_table = {}
-            for toc_idx in xrange(number_toc_entries):
+            for toc_idx in range(number_toc_entries):
                 (id, offset) = struct.unpack("<hI",gtc_handle.read(6))
                 self.toc_table[id] = offset
         if check_write_complete and not self.is_write_complete():
@@ -188,7 +188,7 @@ def __get_generic_array(self, toc_entry, parse_function):
             gtc_handle.seek(self.toc_table[toc_entry])
             num_entries = read_int(gtc_handle)
             result = []
-            for idx in xrange(num_entries):
+            for idx in range(num_entries):
                 result.append( parse_function( gtc_handle ) )
             return result
 
@@ -376,7 +376,7 @@ def get_base_calls(self):
             gtc_handle.seek(self.toc_table[GenotypeCalls.__ID_BASE_CALLS])
             num_entries = read_int(gtc_handle)
             result = []
-            for idx in xrange(num_entries):
+            for idx in range(num_entries):
                 if ploidy_type == 1:
                     result.append( gtc_handle.read(2) )
                 else:
@@ -507,7 +507,7 @@ def get_percentiles_x(self):
         with open(self.filename, "rb") as gtc_handle:
             gtc_handle.seek(self.toc_table[GenotypeCalls.__ID_PERCENTILES_X])
             result = []
-            for idx in xrange(3):
+            for idx in range(3):
                 result.append(read_ushort(gtc_handle))
             return result
 
@@ -522,7 +522,7 @@ def get_percentiles_y(self):
         with open(self.filename, "rb") as gtc_handle:
             gtc_handle.seek(self.toc_table[GenotypeCalls.__ID_PERCENTILES_Y])
             result = []
-            for idx in xrange(3):
+            for idx in range(3):
                 result.append(read_ushort(gtc_handle))
             return result
 
@@ -597,13 +597,14 @@ def read_normalization_transform(handle):
         return NormalizationTransform(handle.read(52))
 
     @staticmethod
-    def rect_to_polar((x,y)):
+    def rect_to_polar(x_y):
         """Converts normalized x,y intensities to (pseudo) polar co-ordinates (R, theta)
         Args:
-            x,y (float, float): Normalized x,y intensities for probe
+            x_y (float, float): Normalized x,y intensities for probe
         Returns:
             (R,theta) polar values as tuple of floats
         """
+        x,y = x_y
         if x == 0 and y == 0:
             return (nan, nan)
         return (x + y, arctan2(y,x) * 2.0 / pi)
@@ -704,7 +705,7 @@ def __parse_file(self, manifest_file):
         """
         with open(manifest_file, "rb") as manifest_handle:
             header = manifest_handle.read(3)
-            if len(header) != 3 or header != "BPM":
+            if len(header) != 3 or header != b"BPM":
                 raise Exception("Invalid BPM format")
             version = read_byte(manifest_handle)
             if version != 1:
@@ -724,11 +725,11 @@ def __parse_file(self, manifest_file):
             self.num_loci = read_int(manifest_handle)
             manifest_handle.seek(4 * self.num_loci, 1)
             name_lookup = {}
-            for idx in xrange(self.num_loci):
+            for idx in range(self.num_loci):
                 self.names.append(read_string(manifest_handle))
                 name_lookup[self.names[-1]] = idx
 
-            for idx in xrange(self.num_loci):
+            for idx in range(self.num_loci):
                 normalization_id = read_byte(manifest_handle)
                 if normalization_id >= 100:
                     raise Exception("Manifest format error: read invalid normalization ID")
@@ -741,7 +742,7 @@ def __parse_file(self, manifest_file):
             self.map_infos = [0] * self.num_loci
             self.ref_strands = [RefStrand.Unknown] * self.num_loci
             self.source_strands = [SourceStrand.Unknown] * self.num_loci
-            for idx in xrange(self.num_loci):
+            for idx in range(self.num_loci):
                 locus_entry = LocusEntry(manifest_handle)
                 self.assay_types[name_lookup[locus_entry.name]] = locus_entry.assay_type
                 self.addresses[name_lookup[locus_entry.name]] = locus_entry.address_a
@@ -755,12 +756,12 @@ def __parse_file(self, manifest_file):
                 raise Exception("Manifest format error: read invalid number of assay entries")
 
             all_norm_ids = set()
-            for locus_idx in xrange(self.num_loci):
+            for locus_idx in range(self.num_loci):
                 self.normalization_ids[locus_idx] += 100 * self.assay_types[locus_idx]
                 all_norm_ids.add(self.normalization_ids[locus_idx])
             sorted_norm_ids = sorted(all_norm_ids)
             lookup_dictionary = {}
-            for idx in xrange(len(sorted_norm_ids)):
+            for idx in range(len(sorted_norm_ids)):
                 lookup_dictionary[sorted_norm_ids[idx]] = idx
             self.normalization_lookups =  [lookup_dictionary[normalization_id] for normalization_id in self.normalization_ids]
 
@@ -913,21 +914,21 @@ def __parse_locus_version_6(self,handle):
         self.ilmn_id = read_string(handle)
         self.source_strand = SourceStrand.from_string(self.ilmn_id.split("_")[-2])
         self.name = read_string(handle)
-        for idx in xrange(3):
+        for idx in range(3):
             read_string(handle)
         handle.read(4)
-        for idx in xrange(2):
+        for idx in range(2):
             read_string(handle)
         self.snp = read_string(handle)
         self.chrom = read_string(handle)
-        for idx in xrange(2):
+        for idx in range(2):
             read_string(handle)
         self.map_info = int(read_string(handle))
-        for idx in xrange(2):
+        for idx in range(2):
             read_string(handle)
         self.address_a = read_int(handle)
         self.address_b = read_int(handle)
-        for idx in xrange(7):
+        for idx in range(7):
             read_string(handle)
         handle.read(3)
         self.assay_type = read_byte(handle)
@@ -1038,7 +1039,7 @@ def read_string(handle):
     if len(result) < total_length:
         raise Exception("Failed to read complete string")
     else:
-        return result
+        return result.decode()
 
 def read_scanner_data(handle):
     """Helper function to parse ScannerData object from file handle.