Skip to content

Commit

Permalink
Merge pull request #44 from Illumina/release/1.2.0
Browse files Browse the repository at this point in the history
Release/1.2.0
  • Loading branch information
jjzieve authored Jun 6, 2019
2 parents 3030b10 + 8e2ee98 commit 2e4346d
Show file tree
Hide file tree
Showing 25 changed files with 2,530 additions and 249 deletions.
130 changes: 130 additions & 0 deletions BAlleleFreqFormat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
from vcf.parser import _Format
from IlluminaBeadArrayFiles import RefStrand
import numpy as np

REVERSE_COMPLEMENT = {"A": "T", "T": "A", "G": "C", "C": "G", "I": "I", "D": "D"}

def extract_alleles_from_snp_string(snp_string):
"""
Splits SNP string into tuple of individual alleles
Args: snp_string (string): allele string of the format
[T/C] or [G/C]
Returns:
allele1 (string), allele2 (string)
"""
(allele1, allele2) = snp_string[1:4].split("/")
assert allele1 in REVERSE_COMPLEMENT, "allele %r is invalid (expected A,T,G,C,I,D)" % allele1
assert allele2 in REVERSE_COMPLEMENT, "allele %r is invalid (expected A,T,G,C,I,D)" % allele2
return allele1, allele2


def extract_reverse_complement_alleles_from_snp_string(snp_string):
"""
Splits SNP string into tuple of individual alleles and
gets takes the reverse compliment to match strand 1
Args: snp_string (string): allele string of the format
[T/C] or [G/C]
Returns:
allele1 (string), allele2 (string)
"""
# get alleles as tuple
allele1, allele2 = extract_alleles_from_snp_string(snp_string)
# get reverse compliment of bases and return
return REVERSE_COMPLEMENT[allele1], REVERSE_COMPLEMENT[allele2]


def convert_indel_alleles(snp_string, vcf_record):
"""
Splits SNP string into tuple of individual alleles and
converts to actual insertion/deletion alleles from the reference
Args: snp_string (string): allele string of the format
[I/D] or [D/I]
Returns:
allele1 (string), allele2 (string)
"""
# get alleles as tuple, we only need allele1 because if its "I" we can assume allele2 will be "D" and vice-versa
allele1, _ = extract_alleles_from_snp_string(snp_string)

assert len(vcf_record.ALT) == 1, "Cannot convert indel for BAF with multiple ALT alleles"
alt_allele = str(vcf_record.ALT[0])
ref_allele = vcf_record.REF
# Assuming whichever sequence is smaller is the deletion and the larger is the insertion
assert len(alt_allele) > len(ref_allele) or len(ref_allele) > len(alt_allele), "REF allele %r and ALT allele %r same length, cannot determine insertion or deletion" % (ref_allele, alt_allele)
deletion_allele, insertion_allele = (alt_allele, ref_allele) if len(alt_allele) < len(ref_allele) else (ref_allele, alt_allele)
if allele1 == "I":
return insertion_allele, deletion_allele
return deletion_allele, insertion_allele

class BAlleleFreqFormat(object):
"""
Generate b allele frequency format information for VCF
"""
def __init__(self, logger, b_allele_freq):
self._b_allele_freq = b_allele_freq
self._logger = logger

@staticmethod
def get_id():
return "BAF"

@staticmethod
def get_description():
return "B Allele Frequency"

@staticmethod
def get_format_obj():
# arguments should be: ['id', 'num', 'type', 'desc']
return _Format(BAlleleFreqFormat.get_id(), 1, "Float", BAlleleFreqFormat.get_description())

def generate_sample_format_info(self, bpm_records, vcf_record, sample_name):
"""
Get the sample B allele frequency
Args:
Returns:
float: B allele frequency
"""
# if we have more than 1 BPMRecord, need to merge
b_allele_freq_list = []
# if we have more than 1 alt allele, return missing
if len(vcf_record.ALT) > 1:
return "."
for i in range(len(bpm_records)):
bpm_record = bpm_records[i]
snp = bpm_record.snp
strand = bpm_record.ref_strand
idx = bpm_record.index_num
b_allele_freq = self._b_allele_freq[idx]

# if indel, convert to actual ref and alt sequences
if bpm_record.is_indel():
allele1, allele2 = convert_indel_alleles(snp, vcf_record)
# if 2/minus strand, get rev comp
elif strand == RefStrand.Minus:
allele1, allele2 = extract_reverse_complement_alleles_from_snp_string(snp)
else:
allele1, allele2 = extract_alleles_from_snp_string(snp)

# normalize to reference allele
ref_allele = vcf_record.REF
assert allele1 == ref_allele or allele2 == ref_allele, "allele1: %r or allele2 %r do not match ref: %r" % (allele1, allele2, ref_allele)
# if allele1 is reference, then B allele is correct
if allele1 == ref_allele:
b_allele_freq_list.append(b_allele_freq)
# if allele2 is reference, then we need to do 1-freq
else:
b_allele_freq_list.append(1. - b_allele_freq)

# nanmedian ignores NaN values
final_b_allele_freq = np.nanmedian(b_allele_freq_list)
if np.isnan(final_b_allele_freq):
return "."
else:
return final_b_allele_freq
2 changes: 1 addition & 1 deletion BPMReader.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def get_bpm_records(self):
BPMRecord: Next BPMRecord in the file
"""
bpm = self._bpm
for idx in xrange(len(bpm.addresses)):
for idx in range(len(bpm.addresses)):
yield BPMRecord(bpm.names[idx], bpm.addresses[idx], None, bpm.chroms[idx], bpm.map_infos[idx], bpm.snps[idx], bpm.ref_strands[idx], bpm.assay_types[idx], None, None, None, None, idx, self._logger)

class ManifestFilter(object):
Expand Down
2 changes: 1 addition & 1 deletion BPMRecord.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ def __init__(self, name, address_a, probe_a, chromosome, pos, snp, ref_strand, a
Args:
name (string) : Name field from manifest
name (string) : AddressA_ID field from manifest
address_a (string) : AddressA_ID field from manifest
probe_a (string) : AlleleA_ProbeSeq field from manifest
chromosome (string) : Chr field from manifest
pos (string, int) : MapInfo field from manifest
Expand Down
45 changes: 38 additions & 7 deletions FormatFactory.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,48 @@
from GenotypeFormat import GenotypeFormat
from GencallFormat import GencallFormat
from BAlleleFreqFormat import BAlleleFreqFormat
from LogRRatioFormat import LogRRatioFormat


class FormatFactory(object):
"""FormatFactory is responsible for determining the fields
that will be present in the FORMAT string and creating
the appropriate corresponding format objects
"""

def __init__(self, no_samples, logger):
def __init__(self, no_samples, formats_to_include, logger):
"""
Create a new FormatFactory
Args:
no_samples (bool): True iff no sample data is present (format string should be empty)
formats_to_include list(String): list of FORMAT IDs to include (default: GT,GC)
Returns:
FormatFactory
"""
self._logger = logger
self._format_classes = []
for fmt in formats_to_include:
assert fmt in FormatFactory.get_possible_formats(), "%r is not a valid FORMAT" % fmt
self._formats_to_include = formats_to_include
if not no_samples:
self._format_classes.append(GenotypeFormat)
self._format_classes.append(GencallFormat)
if GenotypeFormat.get_id() in formats_to_include:
self._format_classes.append(GenotypeFormat)
if GencallFormat.get_id() in formats_to_include:
self._format_classes.append(GencallFormat)
if BAlleleFreqFormat.get_id() in formats_to_include:
self._format_classes.append(BAlleleFreqFormat)
if LogRRatioFormat.get_id() in formats_to_include:
self._format_classes.append(LogRRatioFormat)

@staticmethod
def get_possible_formats():
valid_classes = [GenotypeFormat.get_id(),
GencallFormat.get_id(),
BAlleleFreqFormat.get_id(),
LogRRatioFormat.get_id()]
return valid_classes

def get_format_id_string(self):
"""
Expand Down Expand Up @@ -70,8 +91,18 @@ def create_formats(self, gtc):
None
"""
result = []
result.append(GenotypeFormat(
self._logger, gtc.get_gender(), gtc.get_genotypes()))
result.append(GencallFormat(
self._logger, gtc.get_genotype_scores()))
# only append if ID is in command line args
if GenotypeFormat.get_id() in self._formats_to_include:
result.append(GenotypeFormat(
self._logger, gtc.get_gender(), gtc.get_genotypes()))
if GencallFormat.get_id() in self._formats_to_include:
result.append(GencallFormat(
self._logger, gtc.get_genotype_scores()))
if BAlleleFreqFormat.get_id() in self._formats_to_include:
result.append(BAlleleFreqFormat(
self._logger, gtc.get_ballele_freqs()))
if LogRRatioFormat.get_id() in self._formats_to_include:
result.append(LogRRatioFormat(
self._logger, gtc.get_logr_ratios()))

return result
5 changes: 2 additions & 3 deletions GenotypeFormat.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,8 @@ def get_expected_ploidy(gender, chrom):
Returns
int: value of expected ploidy, currently set at 1 or 2
"""
if (gender == "M" and chrom == "X") or chrom == "Y" or chrom == "MT":
if (gender == b"M" and chrom == "X") or chrom == "Y" or chrom == "MT":
return 1

return 2


Expand Down Expand Up @@ -221,7 +220,7 @@ def _filter_inconsistent_genotypes(self, possible_genotypes):
length 2 where each string is a nucleotide on the plus strand
"""
idx2inconsistent = [False] * len(possible_genotypes)
for idx in xrange(len(possible_genotypes)):
for idx in range(len(possible_genotypes)):
for record in self._bpm_records:
if self._record_inconsistent_with_genotype(record, possible_genotypes[idx]):
idx2inconsistent[idx] = True
Expand Down
41 changes: 21 additions & 20 deletions IlluminaBeadArrayFiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ def __init__(self, filename, ignore_version = False, check_write_complete = True
self.filename = filename
with open(self.filename, "rb") as gtc_handle:
identifier = gtc_handle.read(3)
if identifier != "gtc":
if identifier != b'gtc':
raise Exception("GTC format error: bad format identifier")
self.version = read_byte(gtc_handle)
if self.version not in GenotypeCalls.supported_version and not ignore_version:
Expand All @@ -151,7 +151,7 @@ def __init__(self, filename, ignore_version = False, check_write_complete = True
# to the lookup
#
self.toc_table = {}
for toc_idx in xrange(number_toc_entries):
for toc_idx in range(number_toc_entries):
(id, offset) = struct.unpack("<hI",gtc_handle.read(6))
self.toc_table[id] = offset
if check_write_complete and not self.is_write_complete():
Expand Down Expand Up @@ -188,7 +188,7 @@ def __get_generic_array(self, toc_entry, parse_function):
gtc_handle.seek(self.toc_table[toc_entry])
num_entries = read_int(gtc_handle)
result = []
for idx in xrange(num_entries):
for idx in range(num_entries):
result.append( parse_function( gtc_handle ) )
return result

Expand Down Expand Up @@ -376,7 +376,7 @@ def get_base_calls(self):
gtc_handle.seek(self.toc_table[GenotypeCalls.__ID_BASE_CALLS])
num_entries = read_int(gtc_handle)
result = []
for idx in xrange(num_entries):
for idx in range(num_entries):
if ploidy_type == 1:
result.append( gtc_handle.read(2) )
else:
Expand Down Expand Up @@ -507,7 +507,7 @@ def get_percentiles_x(self):
with open(self.filename, "rb") as gtc_handle:
gtc_handle.seek(self.toc_table[GenotypeCalls.__ID_PERCENTILES_X])
result = []
for idx in xrange(3):
for idx in range(3):
result.append(read_ushort(gtc_handle))
return result

Expand All @@ -522,7 +522,7 @@ def get_percentiles_y(self):
with open(self.filename, "rb") as gtc_handle:
gtc_handle.seek(self.toc_table[GenotypeCalls.__ID_PERCENTILES_Y])
result = []
for idx in xrange(3):
for idx in range(3):
result.append(read_ushort(gtc_handle))
return result

Expand Down Expand Up @@ -597,13 +597,14 @@ def read_normalization_transform(handle):
return NormalizationTransform(handle.read(52))

@staticmethod
def rect_to_polar((x,y)):
def rect_to_polar(x_y):
"""Converts normalized x,y intensities to (pseudo) polar co-ordinates (R, theta)
Args:
x,y (float, float): Normalized x,y intensities for probe
x_y (float, float): Normalized x,y intensities for probe
Returns:
(R,theta) polar values as tuple of floats
"""
x,y = x_y
if x == 0 and y == 0:
return (nan, nan)
return (x + y, arctan2(y,x) * 2.0 / pi)
Expand Down Expand Up @@ -704,7 +705,7 @@ def __parse_file(self, manifest_file):
"""
with open(manifest_file, "rb") as manifest_handle:
header = manifest_handle.read(3)
if len(header) != 3 or header != "BPM":
if len(header) != 3 or header != b"BPM":
raise Exception("Invalid BPM format")
version = read_byte(manifest_handle)
if version != 1:
Expand All @@ -724,11 +725,11 @@ def __parse_file(self, manifest_file):
self.num_loci = read_int(manifest_handle)
manifest_handle.seek(4 * self.num_loci, 1)
name_lookup = {}
for idx in xrange(self.num_loci):
for idx in range(self.num_loci):
self.names.append(read_string(manifest_handle))
name_lookup[self.names[-1]] = idx

for idx in xrange(self.num_loci):
for idx in range(self.num_loci):
normalization_id = read_byte(manifest_handle)
if normalization_id >= 100:
raise Exception("Manifest format error: read invalid normalization ID")
Expand All @@ -741,7 +742,7 @@ def __parse_file(self, manifest_file):
self.map_infos = [0] * self.num_loci
self.ref_strands = [RefStrand.Unknown] * self.num_loci
self.source_strands = [SourceStrand.Unknown] * self.num_loci
for idx in xrange(self.num_loci):
for idx in range(self.num_loci):
locus_entry = LocusEntry(manifest_handle)
self.assay_types[name_lookup[locus_entry.name]] = locus_entry.assay_type
self.addresses[name_lookup[locus_entry.name]] = locus_entry.address_a
Expand All @@ -755,12 +756,12 @@ def __parse_file(self, manifest_file):
raise Exception("Manifest format error: read invalid number of assay entries")

all_norm_ids = set()
for locus_idx in xrange(self.num_loci):
for locus_idx in range(self.num_loci):
self.normalization_ids[locus_idx] += 100 * self.assay_types[locus_idx]
all_norm_ids.add(self.normalization_ids[locus_idx])
sorted_norm_ids = sorted(all_norm_ids)
lookup_dictionary = {}
for idx in xrange(len(sorted_norm_ids)):
for idx in range(len(sorted_norm_ids)):
lookup_dictionary[sorted_norm_ids[idx]] = idx
self.normalization_lookups = [lookup_dictionary[normalization_id] for normalization_id in self.normalization_ids]

Expand Down Expand Up @@ -913,21 +914,21 @@ def __parse_locus_version_6(self,handle):
self.ilmn_id = read_string(handle)
self.source_strand = SourceStrand.from_string(self.ilmn_id.split("_")[-2])
self.name = read_string(handle)
for idx in xrange(3):
for idx in range(3):
read_string(handle)
handle.read(4)
for idx in xrange(2):
for idx in range(2):
read_string(handle)
self.snp = read_string(handle)
self.chrom = read_string(handle)
for idx in xrange(2):
for idx in range(2):
read_string(handle)
self.map_info = int(read_string(handle))
for idx in xrange(2):
for idx in range(2):
read_string(handle)
self.address_a = read_int(handle)
self.address_b = read_int(handle)
for idx in xrange(7):
for idx in range(7):
read_string(handle)
handle.read(3)
self.assay_type = read_byte(handle)
Expand Down Expand Up @@ -1038,7 +1039,7 @@ def read_string(handle):
if len(result) < total_length:
raise Exception("Failed to read complete string")
else:
return result
return result.decode()

def read_scanner_data(handle):
"""Helper function to parse ScannerData object from file handle.
Expand Down
Loading

0 comments on commit 2e4346d

Please sign in to comment.