Skip to content

Commit

Permalink
swapped checking for multiple genes into info.py
Browse files Browse the repository at this point in the history
  • Loading branch information
Jeremy McRae committed Feb 23, 2015
1 parent b1c0fb7 commit 6b9b4c7
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 37 deletions.
16 changes: 0 additions & 16 deletions src/main/python/clinicalfilter/variant/cnv.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,22 +121,6 @@ def set_gene_from_info(self):
else:
self.gene = self.info["HGNC"]

def get_genes(self):
    """Split the variant's gene string into a list of gene names.

    ``self.gene`` is either None or a string holding one or more gene
    names separated by commas (eg "GENE1" or "GENE1,GENE2").

    Returns:
        list of gene IDs; an empty list when no gene has been set.
    """

    # None is a singleton, so test identity rather than equality.
    if self.gene is None:
        return []

    # str.split returns a single-element list when no comma is present, so
    # single-gene and multi-gene strings are handled by the same call.
    return self.gene.split(",")

def passes_filters(self):
"""Checks whether a VCF variant passes user defined criteria.
Expand Down
42 changes: 29 additions & 13 deletions src/main/python/clinicalfilter/variant/info.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ class VariantInfo(object):
"""

# Here are the VEP consequences, ranked in severity from the most severe to
# the least severe as defined at:
# the least severe as defined at:
# http://www.ensembl.org/info/genome/variation/predicted_data.html
severity = {"transcript_ablation": 0, "splice_donor_variant": 1, \
"splice_acceptor_variant": 2, "stop_gained": 3, "frameshift_variant": 4, \
Expand Down Expand Up @@ -98,8 +98,8 @@ def set_gene_from_info(self):
if "HGNC" in self.info:
self.gene = self.info["HGNC"]
# If we are not using a set of known genes, we still want to check
# variants that haven't been annotated with a HGNC, since some of these
# have a functional VEP annotation, presumably due to difficulties in
# variants that haven't been annotated with a HGNC, since some of these
# have a functional VEP annotation, presumably due to difficulties in
# identifying an HGNC symbol. We don't need to worry about this when
# using a set of known genes, since we check whether variants lie within
# the known genes chromosomal ranges.
Expand Down Expand Up @@ -150,6 +150,22 @@ def get_overlapping_known_genes(self):

return overlapping

def get_genes(self):
    """Split the variant's gene string into a list of gene names.

    ``self.gene`` is either None or a string holding one or more gene
    names separated by commas (eg "GENE1" or "GENE1,GENE2").

    Returns:
        list of gene IDs; an empty list when no gene has been set.
    """

    # None is a singleton, so test identity rather than equality.
    if self.gene is None:
        return []

    # str.split returns a single-element list when no comma is present, so
    # single-gene and multi-gene strings are handled by the same call.
    return self.gene.split(",")

def set_consequence(self):
""" makes sure a consequence field is available in the info dict
"""
Expand All @@ -162,7 +178,7 @@ def set_consequence(self):
if "," in self.alt_allele:
(cq, hgnc, enst) = self.correct_multiple_alt(cq)

if "HGNC" in self.info:
if "HGNC" in self.info:
self.info["HGNC"] = hgnc
self.info["ENST"] = enst

Expand All @@ -171,9 +187,9 @@ def set_consequence(self):
def correct_multiple_alt(self, cq):
""" gets correct consequence, HGNC and ensembl IDs for multiple alt vars
Some variants have multiple alts, so we need to select the alt with
the most severe consequence. However, in at least one version of the
VCFs, one of the alts could have zero depth, which I believe resulted
Some variants have multiple alts, so we need to select the alt with
the most severe consequence. However, in at least one version of the
VCFs, one of the alts could have zero depth, which I believe resulted
from the population based multi-sample calling. We need to drop the
consequences recorded for zero-depth alternate alleles before finding
the most severe.
Expand Down Expand Up @@ -212,7 +228,7 @@ def correct_multiple_alt(self, cq):

cq = self.get_most_severe_consequence(cq)

if "HGNC" in self.info:
if "HGNC" in self.info:
hgnc = ",".join(sorted(set(hgnc)))
enst = ",".join(sorted(set(enst)))

Expand Down Expand Up @@ -253,15 +269,15 @@ def get_allele_frequency(self, values):
""" extracts the allele frequency float from a VCF string
The allele frequency for a population can be encoded in several ways,
either as a single float (eg "0.01"), or as a missing value (eg "."),
either as a single float (eg "0.01"), or as a missing value (eg "."),
or there can be a list of allele frequencies for the different alternate
alleles for the variant (eg "0.01,0.05,0.06"), or list containing floats
and missing values. We need to return the allele frequency as a float,
but if there are multiple allele frequencies, we return the largest
float.
Args:
values: string for allele frequency eg "0.01" or ".", or
values: string for allele frequency eg "0.01" or ".", or
"0.01,.,0.06". Sometimes we might even get values passed in as
a float, or a None type.
Expand All @@ -286,9 +302,9 @@ def get_allele_frequency(self, values):
def is_number(self, value):
""" determines whether a value represents a number.
Sometimes the MAF reported for a variant is ".", or even ".,.", which
Sometimes the MAF reported for a variant is ".", or even ".,.", which
are not numbers and are in fact NA values, but would cause the variant
not to pass the MAF filter. instead check if the value can be
not to pass the MAF filter. instead check if the value can be
converted to a float.
Args:
Expand Down Expand Up @@ -324,7 +340,7 @@ def find_max_allele_frequency(self):
"""

max_freq = None
# check all the populations with MAF values recorded for the variant
# check all the populations with MAF values recorded for the variant
# (typically the 1000 Genomes populations (AFR_AF, EUR_AF etc), any
# internal population (e.g. DDD_AF), and a MAX_AF field)
for key in set(self.populations) & set(self.info):
Expand Down
14 changes: 6 additions & 8 deletions src/main/python/clinicalfilter/variant/variant.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def set_gender(self, gender):
if gender in self.male_codes:
self.gender = "male"
elif gender in self.female_codes:
self.gender = "female"
self.gender = "female"
else:
raise ValueError("unknown gender code")

Expand Down Expand Up @@ -126,11 +126,11 @@ def get_vcf_line(self):
def set_inheritance_type(self):
""" sets the chromosome type (eg autosomal, or X chromosome type).
provides the chromosome type for a chromosome (eg Autosomal, or
X-chrom male etc). This only does simple string matching. The
chromosome string is either the chromosome number, or in the case of
the sex-chromosomes, the chromosome character. This doesn't allow for
chromosomes to be specified as "chr1", and sex chromosomes have to be
provides the chromosome type for a chromosome (eg Autosomal, or
X-chrom male etc). This only does simple string matching. The
chromosome string is either the chromosome number, or in the case of
the sex-chromosomes, the chromosome character. This doesn't allow for
chromosomes to be specified as "chr1", and sex chromosomes have to be
specified as "X" or "Y", not "23" or "24".
"""

Expand Down Expand Up @@ -181,5 +181,3 @@ def get_genotype(self):
"""

return self.genotype


0 comments on commit 6b9b4c7

Please sign in to comment.