Skip to content

Commit

Permalink
Merge branch 'develop'
Browse files: browse the repository at this point in the history
  • Loading branch information
Michael Stromberg committed Apr 3, 2018
2 parents 9ad7596 + 74f0b00 commit 0738f88
Show file tree
Hide file tree
Showing 9 changed files with 190 additions and 112 deletions.
5 changes: 3 additions & 2 deletions Jasix/OnTheFlyIndexCreator.cs
Original file line number Diff line number Diff line change
Expand Up @@ -29,14 +29,15 @@ public void SetHeader(string header)

public void Add(IPosition position, long fileLocation)
{
var chromName = position.VcfFields[VcfCommon.ChromIndex];//we want to preserve the chrom name from input
var chromName = position.VcfFields[VcfCommon.ChromIndex];
var start = position.Start;
var end = position.InfoData.End;

if (chromName == _lastChromName && start < _lastPosition)
{
throw new UserErrorException($"the Json file is not sorted at {position.Chromosome.UcscName}: {start}");
throw new UserErrorException($"The Json file is not sorted at {position.Chromosome.UcscName}: {start}");
}

_lastPosition = start;
_lastChromName = chromName;

Expand Down
39 changes: 17 additions & 22 deletions Nirvana/Properties/launchSettings.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,45 +25,40 @@
"commandLineArgs": "-c Cache\\26\\GRCh37\\Ensembl -r References\\5\\Homo_sapiens.GRCh37.Nirvana.dat -i Data\\Mother\\Mother.vcf.gz -o mother",
"workingDirectory": "E:\\Data\\Nirvana"
},
"MS_ClinVar": {
"MS Test": {
"commandName": "Project",
"commandLineArgs": " -c Cache\\25\\GRCh37\\Ensembl90 -r References\\5\\Homo_sapiens.GRCh37.Nirvana.dat -i Data\\ClinVar\\ClinVar20150901_ShankarBugNIR1202-ClinVar_dbSNP-unknown-WG-hg19.vcf.gz -o clinvar",
"commandLineArgs": "-c Cache\\26\\GRCh37\\Both -r References\\5\\Homo_sapiens.GRCh37.Nirvana.dat -i test.vcf -o test",
"workingDirectory": "E:\\Data\\Nirvana"
},
"MS_Mother": {
"RR clinvar": {
"commandName": "Project",
"commandLineArgs": "-c Cache\\25\\GRCh37\\Ensembl90 -r References\\5\\Homo_sapiens.GRCh37.Nirvana.dat -i Data\\Mother\\Mother.vcf.gz -o mother --sd SupplementaryDatabase\\41\\GRCh37",
"workingDirectory": "E:\\Data\\Nirvana"
"commandLineArgs": " --cache C:\\Development\\Cache\\26\\GRCh37\\Ensembl --sd C:\\Development\\SupplementaryDatabase\\43\\GRCh37 --ref C:\\Development\\References\\5\\Homo_sapiens.GRCh37.Nirvana.dat --in ClinVar20150901_ShankarBugNIR1202-ClinVar_dbSNP-unknown-WG-hg19.vcf.gz --out clinvar --disable-recomposition",
"workingDirectory": "c:\\Development\\TestDatasets"
},
"SK Nirvana": {
"RR dq": {
"commandName": "Project",
"commandLineArgs": "-i E:\\Nirvana_resources\\test_runs\\test_Phantom\\MS_data\\NA12878_AH72T3CCXX-l2_S1.genome.vcf.gz -c E:\\Nirvana_resources\\Nirvana\\Development\\Cache\\25\\GRCh38\\Both90 -r E:\\Nirvana_resources\\Nirvana\\Development\\References\\5\\Homo_sapiens.GRCh38.Nirvana.dat -o NIR_NA12878 ",
"workingDirectory": "E:\\Nirvana_resources\\test_runs\\test_Phantom\\"
"commandLineArgs": " --cache C:\\Development\\Cache\\26\\GRCh38\\Both --ref C:\\Development\\References\\5\\Homo_sapiens.GRCh38.Nirvana.dat --disable-recomposition --in DQ-Strelka-Germline-chr22-hg38.vcf.gz --out dq22",
"workingDirectory": "c:\\Development\\TestDatasets"
},
"SK Pedigree": {
"RR platypus": {
"commandName": "Project",
"commandLineArgs": "-i E:\\Nirvana_resources\\test_runs\\test_Phantom\\test_data\\Pedigree.vcf.gz -c E:\\Nirvana_resources\\Nirvana\\Development\\Cache\\26\\GRCh38\\Ensembl -r E:\\Nirvana_resources\\Nirvana\\Development\\References\\5\\Homo_sapiens.GRCh38.Nirvana.dat --vcf -o Pedigree_NIR_Phan_out",
"workingDirectory": "E:\\Nirvana_resources\\test_runs\\test_Phantom\\"
"commandLineArgs": " --cache C:\\Development\\Cache\\26\\GRCh37\\Ensembl --sd C:\\Development\\SupplementaryDatabase\\43\\GRCh37 --ref C:\\Development\\References\\5\\Homo_sapiens.GRCh37.Nirvana.dat --in Platypus-Platypus-unknown-short-hg19.vcf.gz --out platypus --disable-recomposition",
"workingDirectory": "c:\\Development\\TestDatasets"
},
"SK bugfix": {
"commandName": "Project",
"commandLineArgs": "-i E:\\Nirvana_resources\\test_runs\\test_Phantom\\test_data\\Unsorted_Chromosomes.vcf -c E:\\Nirvana_resources\\Nirvana\\Development\\Cache\\26\\GRCh38\\Ensembl -r E:\\Nirvana_resources\\Nirvana\\Development\\References\\5\\Homo_sapiens.GRCh38.Nirvana.dat --vcf -o Pedigree_bugfix",
"workingDirectory": "E:\\Nirvana_resources\\test_runs\\test_Phantom"
},
"RR_clinvar": {
"commandName": "Project",
"commandLineArgs": " --cache C:\\Development\\Cache\\26\\GRCh37\\Ensembl --sd C:\\Development\\SupplementaryDatabase\\43\\GRCh37 --ref C:\\Development\\References\\5\\Homo_sapiens.GRCh37.Nirvana.dat --in ClinVar20150901_ShankarBugNIR1202-ClinVar_dbSNP-unknown-WG-hg19.vcf.gz --out clinvar --disable-recomposition",
"workingDirectory": "c:\\Development\\TestDatasets"
},
"RR_dq": {
"SK Nirvana": {
"commandName": "Project",
"commandLineArgs": " --cache C:\\Development\\Cache\\26\\GRCh38\\Both --ref C:\\Development\\References\\5\\Homo_sapiens.GRCh38.Nirvana.dat --disable-recomposition --in DQ-Strelka-Germline-chr22-hg38.vcf.gz --out dq22",
"workingDirectory": "c:\\Development\\TestDatasets"
"commandLineArgs": "-i E:\\Nirvana_resources\\test_runs\\test_Phantom\\MS_data\\NA12878_AH72T3CCXX-l2_S1.genome.vcf.gz -c E:\\Nirvana_resources\\Nirvana\\Development\\Cache\\25\\GRCh38\\Both90 -r E:\\Nirvana_resources\\Nirvana\\Development\\References\\5\\Homo_sapiens.GRCh38.Nirvana.dat -o NIR_NA12878 ",
"workingDirectory": "E:\\Nirvana_resources\\test_runs\\test_Phantom\\"
},
"RR_platypus": {
"SK Pedigree": {
"commandName": "Project",
"commandLineArgs": " --cache C:\\Development\\Cache\\26\\GRCh37\\Ensembl --sd C:\\Development\\SupplementaryDatabase\\43\\GRCh37 --ref C:\\Development\\References\\5\\Homo_sapiens.GRCh37.Nirvana.dat --in Platypus-Platypus-unknown-short-hg19.vcf.gz --out platypus --disable-recomposition",
"workingDirectory": "c:\\Development\\TestDatasets"
"commandLineArgs": "-i E:\\Nirvana_resources\\test_runs\\test_Phantom\\test_data\\Pedigree.vcf.gz -c E:\\Nirvana_resources\\Nirvana\\Development\\Cache\\26\\GRCh38\\Ensembl -r E:\\Nirvana_resources\\Nirvana\\Development\\References\\5\\Homo_sapiens.GRCh38.Nirvana.dat --vcf -o Pedigree_NIR_Phan_out",
"workingDirectory": "E:\\Nirvana_resources\\test_runs\\test_Phantom\\"
}
}
}
47 changes: 20 additions & 27 deletions SAUtils/InputFileParsers/TOPMed/TopMedItem.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,25 +5,24 @@

namespace SAUtils.InputFileParsers.TOPMed
{
public sealed class TopMedItem: SupplementaryDataItem
public sealed class TopMedItem : SupplementaryDataItem
{
private readonly int? _numSamples;
private readonly int? _alleleNum;
private readonly int? _alleleCount;
private readonly int? _homCount;
private readonly bool _hasFailedFilters;
private readonly bool _failedFilter;

public TopMedItem(IChromosome chrom, int position, string refAllele, string altAllele, int? numSamples, int? alleleNum, int? alleleCount, int? homCount, bool hasFailedFilters)
public TopMedItem(IChromosome chrom, int position, string refAllele, string altAllele, int? alleleNum,
int? alleleCount, int? homCount, bool failedFilter)
{
Chromosome = chrom;
Start = position;
ReferenceAllele = refAllele;
AlternateAllele = altAllele;
_numSamples = numSamples;
_alleleNum = alleleNum;
_alleleCount = alleleCount;
_homCount = homCount;
_hasFailedFilters = hasFailedFilters;
Chromosome = chrom;
Start = position;
ReferenceAllele = refAllele;
AlternateAllele = altAllele;
_alleleNum = alleleNum;
_alleleCount = alleleCount;
_homCount = homCount;
_failedFilter = failedFilter;
}

public override bool Equals(object other)
Expand All @@ -50,24 +49,18 @@ public override int GetHashCode()

public string GetJsonString()
{
var sb = new StringBuilder();
var sb = new StringBuilder();
var jsonObject = new JsonObject(sb);

if (_hasFailedFilters) jsonObject.AddBoolValue("hasFailedFilters", true);
jsonObject.AddIntValue("numSamples", _numSamples);
jsonObject.AddStringValue("alleleFreq", ComputingUtilities.ComputeFrequency(_alleleNum, _alleleCount), false);
jsonObject.AddIntValue("alleleNumber", _alleleNum);
jsonObject.AddIntValue("alleleCount", _alleleCount);
jsonObject.AddIntValue("homCount", _homCount);

return sb.ToString();
}
jsonObject.AddStringValue("allAf", ComputingUtilities.ComputeFrequency(_alleleNum, _alleleCount), false);
jsonObject.AddIntValue("allAn", _alleleNum);
jsonObject.AddIntValue("allAc", _alleleCount);
jsonObject.AddIntValue("allHc", _homCount);
if (_failedFilter) jsonObject.AddBoolValue("failedFilter", true);

public override SupplementaryIntervalItem GetSupplementaryInterval()
{
return null;
return sb.ToString();
}


public override SupplementaryIntervalItem GetSupplementaryInterval() => null;
}
}
73 changes: 20 additions & 53 deletions SAUtils/InputFileParsers/TOPMed/TopMedReader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -13,45 +13,32 @@ public sealed class TopMedReader : IDisposable

private int? _alleleNum;
private int? _alleleCount;
private bool _hasFailedFilters;
private int? _numSamples;
private bool _failedFilter;
private int? _homCount;
private int? _hetCount;
private double? _alleleFreq;

public TopMedReader(StreamReader streamReader, IDictionary<string, IChromosome> refChromDict)
{
_reader = streamReader;
_reader = streamReader;
_refChromDict = refChromDict;
}

private void Clear()
{
_alleleNum = null;
_alleleCount = null;
_numSamples = null;
_homCount = null;
_hetCount = null;
_alleleFreq = null;
_hasFailedFilters = false;
_alleleNum = null;
_alleleCount = null;
_homCount = null;
_failedFilter = false;
}

/// <summary>
/// Parses a source file and return an enumeration object containing
/// all the data objects that have been extracted.
/// </summary>
/// <returns></returns>
public IEnumerable<TopMedItem> GetGnomadItems()
{
using (_reader)
{
string line;
while ((line = _reader.ReadLine()) != null)
{
// Skip empty lines.
if (string.IsNullOrWhiteSpace(line)) continue;
// Skip comments.
if (line.StartsWith("#")) continue;
if (string.IsNullOrWhiteSpace(line) || line.StartsWith("#")) continue;

var topMedItem = ExtractItems(line);
if (topMedItem == null) continue;
yield return topMedItem;
Expand All @@ -62,7 +49,7 @@ public IEnumerable<TopMedItem> GetGnomadItems()
private TopMedItem ExtractItems(string vcfLine)
{
if (vcfLine == null) return null;
var splitLine = vcfLine.Split('\t');// we don't care about the many fields after info field
var splitLine = vcfLine.Split('\t');

if (splitLine.Length < 8) return null;

Expand All @@ -71,11 +58,11 @@ private TopMedItem ExtractItems(string vcfLine)
var chromosome = splitLine[VcfCommon.ChromIndex];
if (!_refChromDict.ContainsKey(chromosome)) return null;

//chr1 10169 TOPMed_freeze_5?chr1:10,169 T C 255 SVM VRT=1;NS=62784;AN=125568;AC=20;AF=0.000159276;Het=20;Hom=0 NA:FRQ 125568:0.000159276
// chr1 10169 TOPMed_freeze_5?chr1:10,169 T C 255 SVM VRT=1;NS=62784;AN=125568;AC=20;AF=0.000159276;Het=20;Hom=0 NA:FRQ 125568:0.000159276
var chrom = _refChromDict[chromosome];
var position = int.Parse(splitLine[VcfCommon.PosIndex]);//we have to get it from RSPOS in info
var refAllele = splitLine[VcfCommon.RefIndex];
var altAllele = splitLine[VcfCommon.AltIndex];
var altAllele = splitLine[VcfCommon.AltIndex];
var filters = splitLine[VcfCommon.FilterIndex];
var infoFields = splitLine[VcfCommon.InfoIndex];

Expand All @@ -85,14 +72,14 @@ private TopMedItem ExtractItems(string vcfLine)
throw new InvalidDataException("het site found!!");
}

_hasFailedFilters = !(filters.Equals("PASS") || filters.Equals("."));
_failedFilter = !(filters.Equals("PASS") || filters.Equals("."));

ParseInfoField(infoFields);

if (_alleleNum == 0) return null;

return new TopMedItem(chrom, position, refAllele, altAllele, _numSamples,
_alleleNum, _alleleCount, _homCount, _hasFailedFilters);
return new TopMedItem(chrom, position, refAllele, altAllele, _alleleNum, _alleleCount, _homCount,
_failedFilter);
}

private void ParseInfoField(string infoFields)
Expand All @@ -103,30 +90,22 @@ private void ParseInfoField(string infoFields)
foreach (var infoItem in infoItems)
{
var infoKeyValue = infoItem.Split('=');
if (infoKeyValue.Length == 2)//sanity check

if (infoKeyValue.Length == 2)
{
var key = infoKeyValue[0];
var key = infoKeyValue[0];
var value = infoKeyValue[1];

SetInfoField(key, value);
}
}
}

/// <summary>
/// Get a key value pair and using the key, set appropriate values
/// </summary>
/// <param name="vcfId"></param>
/// <param name="value"></param>
private void SetInfoField(string vcfId, string value)
private void SetInfoField(string vcfId, string value)
{
//VRT=1;NS=62784;AN=125568;AC=20;AF=0.000159276;Het=20;Hom=0

// VRT=1;NS=62784;AN=125568;AC=20;AF=0.000159276;Het=20;Hom=0
switch (vcfId)
{
case "NS":
_numSamples = Convert.ToInt32(value);
break;
case "AN":
_alleleNum = Convert.ToInt32(value);
break;
Expand All @@ -136,21 +115,9 @@ private void SetInfoField(string vcfId, string value)
case "Hom":
_homCount = Convert.ToInt32(value);
break;
case "Het":
_hetCount = Convert.ToInt32(value);
break;
case "AF":
_alleleFreq = Convert.ToDouble(value);
break;
}

}


public void Dispose()
{
_reader?.Dispose();
}
public void Dispose() => _reader?.Dispose();
}

}
Loading

0 comments on commit 0738f88

Please sign in to comment.