Skip to content

Commit

Permalink
Merge branch 'develop'
Browse files Browse the repository at this point in the history
  • Loading branch information
Michael Stromberg committed Apr 25, 2018
2 parents 122cf57 + 33ad999 commit 5944c5d
Show file tree
Hide file tree
Showing 210 changed files with 2,714 additions and 2,123 deletions.
Empty file removed CacheUtils/.exclude
Empty file.
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
using CommandLine.NDesk.Options;
using Compression.Algorithms;
using Compression.FileHandling;
using Compression.Utilities;
using ErrorHandling;
using VariantAnnotation.Caches;
using VariantAnnotation.Caches.DataStructures;
Expand All @@ -20,7 +21,6 @@
using VariantAnnotation.IO.Caches;
using VariantAnnotation.Logger;
using VariantAnnotation.Providers;
using VariantAnnotation.Utilities;

namespace CacheUtils.Commands.CombineCacheDirectories
{
Expand All @@ -47,8 +47,8 @@ private static ExitCodes ProgramExecution()
var siftPredictionsPerRef = new Prediction[numRefSeqs][];
var polyphenPredictionsPerRef = new Prediction[numRefSeqs][];

PredictionCacheReader.PredictionHeader siftHeader;
PredictionCacheReader.PredictionHeader polyphenHeader;
PredictionHeader siftHeader;
PredictionHeader polyphenHeader;

using (var siftReader = new PredictionCacheReader(FileUtilities.GetReadStream(CacheConstants.SiftPath(_inputPrefix)), PredictionCacheReader.SiftDescriptions))
using (var siftReader2 = new PredictionCacheReader(FileUtilities.GetReadStream(CacheConstants.SiftPath(_inputPrefix2)), PredictionCacheReader.SiftDescriptions))
Expand Down Expand Up @@ -83,7 +83,7 @@ private static ExitCodes ProgramExecution()
logger.WriteLine();
WritePredictions(logger, "SIFT", CacheConstants.SiftPath(_outputPrefix), siftHeader, siftPredictionsPerRef);
WritePredictions(logger, "PolyPhen", CacheConstants.PolyPhenPath(_outputPrefix), polyphenHeader, polyphenPredictionsPerRef);
WriteTranscripts(logger, GetHeader(caches.Cache.Header), combinedIntervalArrays,
WriteTranscripts(logger, CloneHeader(caches.Cache.Header), combinedIntervalArrays,
caches.Cache.RegulatoryRegionIntervalArrays);

return ExitCodes.Success;
Expand All @@ -101,14 +101,14 @@ private static void WriteTranscripts(ILogger logger, CacheHeader header,
}

private static void WritePredictions(ILogger logger, string description, string filePath,
PredictionCacheReader.PredictionHeader header, Prediction[][] predictionsPerRef)
PredictionHeader header, Prediction[][] predictionsPerRef)
{
logger.Write($"- writing {description} predictions... ");

using (var stream = new BlockStream(new Zstandard(), FileUtilities.GetCreateStream(filePath), CompressionMode.Compress))
using (var writer = new PredictionCacheWriter(stream, GetHeader(header.Header)))
using (var writer = new PredictionCacheWriter(stream, CloneHeader(header)))
{
writer.Write(header.Lut, predictionsPerRef);
writer.Write(header.LookupTable, predictionsPerRef);
}

logger.WriteLine("finished.");
Expand Down Expand Up @@ -161,9 +161,15 @@ private static Interval<ITranscript> GetUpdatedTranscript(Interval<ITranscript>
return new Interval<ITranscript>(transcript.Start, transcript.End, updatedTranscript);
}

/// <summary>
/// Builds a new <see cref="CacheHeader"/> from an existing one. The schema version,
/// data version, genome assembly and custom header are copied from
/// <paramref name="header"/>; the identifier is taken from
/// <c>CacheConstants.Identifier</c>, the source is always
/// <c>Source.BothRefSeqAndEnsembl</c>, and the current local time (in ticks) is
/// passed as the remaining constructor argument.
/// </summary>
/// <param name="header">The header whose version, assembly and custom fields are reused.</param>
/// <returns>A newly constructed <see cref="CacheHeader"/>.</returns>
private static CacheHeader GetHeader(CacheHeader header)
{
    var schemaVersion  = header.SchemaVersion;
    var dataVersion    = header.DataVersion;
    var nowTicks       = DateTime.Now.Ticks;
    var genomeAssembly = header.GenomeAssembly;
    var customHeader   = header.CustomHeader;

    return new CacheHeader(CacheConstants.Identifier, schemaVersion, dataVersion,
        Source.BothRefSeqAndEnsembl, nowTicks, genomeAssembly, customHeader);
}
/// <summary>
/// Builds a new base <c>VariantAnnotation.IO.Caches.Header</c> from an existing one.
/// The schema version, data version and genome assembly are copied from
/// <paramref name="header"/>; the identifier comes from <c>CacheConstants.Identifier</c>,
/// the source is always <c>Source.BothRefSeqAndEnsembl</c>, and the current local time
/// (in ticks) is passed as the fifth constructor argument.
/// </summary>
/// <param name="header">The header whose version and assembly fields are reused.</param>
/// <returns>A newly constructed base header.</returns>
private static VariantAnnotation.IO.Caches.Header CloneBaseHeader(VariantAnnotation.IO.Caches.Header header)
{
    var schemaVersion  = header.SchemaVersion;
    var dataVersion    = header.DataVersion;
    var nowTicks       = DateTime.Now.Ticks;
    var genomeAssembly = header.GenomeAssembly;

    return new VariantAnnotation.IO.Caches.Header(CacheConstants.Identifier, schemaVersion,
        dataVersion, Source.BothRefSeqAndEnsembl, nowTicks, genomeAssembly);
}

/// <summary>
/// Builds a new <see cref="PredictionHeader"/> from an existing one: the base header is
/// rebuilt via <c>CloneBaseHeader</c>, while the custom header and lookup table are
/// taken from <paramref name="header"/> as-is (the same references are passed through).
/// </summary>
/// <param name="header">The prediction header to rebuild.</param>
/// <returns>A newly constructed <see cref="PredictionHeader"/>.</returns>
private static PredictionHeader CloneHeader(PredictionHeader header)
{
    var baseHeader = CloneBaseHeader(header);
    return new PredictionHeader(baseHeader, header.Custom, header.LookupTable);
}

/// <summary>
/// Builds a new <see cref="CacheHeader"/> from an existing one: the base header is
/// rebuilt via <c>CloneBaseHeader</c>, while the custom header is taken from
/// <paramref name="header"/> as-is (the same reference is passed through).
/// </summary>
/// <param name="header">The cache header to rebuild.</param>
/// <returns>A newly constructed <see cref="CacheHeader"/>.</returns>
private static CacheHeader CloneHeader(CacheHeader header)
{
    var baseHeader = CloneBaseHeader(header);
    return new CacheHeader(baseHeader, header.Custom);
}

private static (Prediction[] Predictions, int Offset) CombinePredictions(ILogger logger, IChromosome chromosome,
string description, PredictionCacheReader reader, PredictionCacheReader reader2)
Expand Down
3 changes: 1 addition & 2 deletions CacheUtils/Commands/CreateCache/CreateNirvanaDatabaseMain.cs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
using VariantAnnotation.IO.Caches;
using VariantAnnotation.Logger;
using VariantAnnotation.Providers;
using VariantAnnotation.Utilities;

namespace CacheUtils.Commands.CreateCache
{
Expand All @@ -39,7 +38,7 @@ private static ExitCodes ProgramExecution()
string polyphenPath = _inputPrefix + ".polyphen.gz";
string regulatoryPath = _inputPrefix + ".regulatory.gz";

var (refIndexToChromosome, refNameToChromosome, numRefSeqs) = SequenceHelper.GetDictionaries(_inputReferencePath);
(var refIndexToChromosome, var refNameToChromosome, int numRefSeqs) = SequenceHelper.GetDictionaries(_inputReferencePath);

using (var transcriptReader = new MutableTranscriptReader(GZipUtilities.GetAppropriateReadStream(transcriptPath), refIndexToChromosome))
using (var regulatoryReader = new RegulatoryRegionReader(GZipUtilities.GetAppropriateReadStream(regulatoryPath), refIndexToChromosome))
Expand Down
4 changes: 2 additions & 2 deletions CacheUtils/Commands/Download/ExternalFiles.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@
using CacheUtils.IntermediateIO;
using CacheUtils.Utilities;
using Compression.Utilities;
using OptimizedCore;
using VariantAnnotation.Interface;
using VariantAnnotation.Interface.AnnotatedPositions;
using VariantAnnotation.Interface.Sequence;
using VariantAnnotation.Utilities;

namespace CacheUtils.Commands.Download
{
Expand Down Expand Up @@ -100,7 +100,7 @@ private static int GetNumGenbankFiles(ILogger logger)
string line = reader.ReadLine();
if (line == null) break;

string filename = line.Split('\t')[1];
string filename = line.OptimizedSplit('\t')[1];
if (!filename.EndsWith(".rna.gbff.gz")) continue;

int num = int.Parse(filename.Substring(6, filename.Length - 18));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
using CommandLine.Builders;
using CommandLine.NDesk.Options;
using CommonUtilities;
using Compression.Utilities;
using ErrorHandling;
using VariantAnnotation.Caches.DataStructures;
using VariantAnnotation.Interface;
Expand All @@ -19,7 +20,6 @@
using VariantAnnotation.Logger;
using VariantAnnotation.Providers;
using VariantAnnotation.Sequence;
using VariantAnnotation.Utilities;

namespace CacheUtils.Commands.ExtractTranscripts
{
Expand Down Expand Up @@ -114,9 +114,8 @@ private static (PredictionCacheStaging Staging, Prediction[] Predictions) GetPre
logger.Write($"- retrieving {description} predictions... ");

var indexSet = GetUniqueIndices(transcripts, indexFunc);
var lut = reader.Header.Lut;
var predictionsPerRef = GetPredictions(indexSet, chromosome, numRefSeqs, oldPredictions);
var staging = new PredictionCacheStaging(reader.Header.Header, lut, predictionsPerRef);
var staging = new PredictionCacheStaging(reader.Header, predictionsPerRef);

logger.WriteLine($"found {indexSet.Count} predictions.");
return (staging, predictionsPerRef[chromosome.Index]);
Expand Down
21 changes: 7 additions & 14 deletions CacheUtils/Commands/Header/HeaderMain.cs
Original file line number Diff line number Diff line change
@@ -1,14 +1,11 @@
using System;
using System.IO.Compression;
using CommandLine.Builders;
using CommandLine.NDesk.Options;
using Compression.Algorithms;
using Compression.FileHandling;
using Compression.Utilities;
using ErrorHandling;
using ErrorHandling.Exceptions;
using VariantAnnotation.IO.Caches;
using VariantAnnotation.Providers;
using VariantAnnotation.Utilities;

namespace CacheUtils.Commands.Header
{
Expand All @@ -18,8 +15,8 @@ public static class HeaderMain

private static ExitCodes ProgramExecution()
{
var cachePath = CacheConstants.TranscriptPath(_inputPrefix);
var header = GetHeaderInformation(cachePath);
string cachePath = CacheConstants.TranscriptPath(_inputPrefix);
var header = GetHeaderInformation(cachePath);

Console.WriteLine($"Versions: Schema: {header.Schema}, Data: {header.Data}, VEP: {header.Vep}");
return ExitCodes.Success;
Expand All @@ -28,18 +25,14 @@ private static ExitCodes ProgramExecution()
private static (ushort Schema, ushort Data, ushort Vep) GetHeaderInformation(string cachePath)
{
CacheHeader header;
TranscriptCacheCustomHeader customHeader = null;

using (var stream = FileUtilities.GetReadStream(cachePath))
using (var blockStream = new BlockStream(new Zstandard(), stream, CompressionMode.Decompress))
using (var stream = FileUtilities.GetReadStream(cachePath))
{
header = blockStream.ReadHeader(CacheHeader.Read, TranscriptCacheCustomHeader.Read) as CacheHeader;
if (header != null) customHeader = header.CustomHeader as TranscriptCacheCustomHeader;
header = CacheHeader.Read(stream);
}

if (header == null || customHeader == null) throw new InvalidFileFormatException($"Could not parse the header information correctly for {cachePath}");
if (header == null) throw new InvalidFileFormatException($"Could not parse the header information correctly for {cachePath}");

return (header.SchemaVersion, header.DataVersion, customHeader.VepVersion);
return (header.SchemaVersion, header.DataVersion, header.Custom.VepVersion);
}

public static ExitCodes Run(string command, string[] args)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
using VariantAnnotation.Logger;
using VariantAnnotation.Providers;
using VariantAnnotation.Sequence;
using VariantAnnotation.Utilities;

namespace CacheUtils.Commands.ParseVepCacheDirectory
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ public static IEnumerable<IRegulatoryRegion> Merge(IEnumerable<IRegulatoryRegion
{
if (currentRegion.Id.IsEmpty()) throw new InvalidOperationException("Found a regulatory region without an ID.");

var regulatoryKey = $"{currentRegion.Id}.{currentRegion.Start}.{currentRegion.End}";
string regulatoryKey = $"{currentRegion.Id}.{currentRegion.Start}.{currentRegion.End}";

if (regulatoryDict.TryGetValue(regulatoryKey, out var previousRegion))
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ public static List<MutableTranscript> Merge(ILogger logger, IEnumerable<MutableT
private static MutableTranscript Merge(ILogger logger, IReadOnlyList<MutableTranscript> transcripts,
Dictionary<string, GenbankEntry> idToGenbankEntry)
{
var transcriptId = transcripts[0].Id;
string transcriptId = transcripts[0].Id;

if (transcripts.Count == 1)
{
Expand Down
4 changes: 2 additions & 2 deletions CacheUtils/Commands/RegulatoryGFF/CreateRegulatoryGffMain.cs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ private static ExitCodes ProgramExecution()
{
using (var writer = GZipUtilities.GetStreamWriter(_outputFileName))
{
var cachePath = CacheConstants.TranscriptPath(_inputPrefix);
string cachePath = CacheConstants.TranscriptPath(_inputPrefix);
var sequenceData = SequenceHelper.GetDictionaries(_referencePath);

// load the cache
Expand Down Expand Up @@ -62,7 +62,7 @@ public static ExitCodes Run(string command, string[] args)
}
};

var commandLineExample = $"{command} --in <cache prefix> --out <GFF path>";
string commandLineExample = $"{command} --in <cache prefix> --out <GFF path>";

return new ConsoleAppBuilder(args, ops)
.UseVersionProvider(new VersionProvider())
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
using VariantAnnotation.Interface.Sequence;
using VariantAnnotation.Logger;
using VariantAnnotation.Providers;
using VariantAnnotation.Utilities;

namespace CacheUtils.Commands.UniversalGeneArchive
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ public MutableGene(IChromosome chromosome, int start, int end, bool onReverseStr

public override string ToString()
{
var strand = OnReverseStrand ? "R" : "F";
string strand = OnReverseStrand ? "R" : "F";
return $"{GeneId}: {Chromosome.UcscName} {Start}-{End} {strand} symbol: {Symbol} ({SymbolSource}), HGNC ID: {HgncId}";
}

Expand All @@ -53,7 +53,7 @@ public override int GetHashCode()
unchecked
{
// ReSharper disable NonReadonlyMemberInGetHashCode
var hashCode = Chromosome.Index.GetHashCode();
int hashCode = Chromosome.Index.GetHashCode();
hashCode = (hashCode * 397) ^ Start;
hashCode = (hashCode * 397) ^ End;
hashCode = (hashCode * 397) ^ OnReverseStrand.GetHashCode();
Expand All @@ -69,10 +69,10 @@ public override int GetHashCode()

public UgaGene ToUgaGene(bool isGrch37)
{
var (ensemblGeneId, entrezGeneId) = GeneId.StartsWith("ENSG") ? (GeneId, null as string) : (null as string, GeneId);
(string ensemblGeneId, string entrezGeneId) = GeneId.StartsWith("ENSG") ? (GeneId, null as string) : (null as string, GeneId);

IInterval interval = new Interval(Start, End);
var (grch37, grch38) = isGrch37 ? (interval, null as IInterval) : (null as IInterval, interval);
(IInterval grch37, IInterval grch38) = isGrch37 ? (interval, null as IInterval) : (null as IInterval, interval);

return new UgaGene(Chromosome, grch37, grch38, OnReverseStrand, entrezGeneId, ensemblGeneId, Symbol,
HgncId);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ public override int GetHashCode()
unchecked
{
// ReSharper disable NonReadonlyMemberInGetHashCode
var hashCode = Chromosome.Index.GetHashCode();
int hashCode = Chromosome.Index.GetHashCode();
hashCode = (hashCode * 397) ^ Start;
hashCode = (hashCode * 397) ^ End;
hashCode = (hashCode * 397) ^ Id.GetHashCode();
Expand Down
3 changes: 2 additions & 1 deletion CacheUtils/DataDumperImport/IO/DataDumperReader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
using System.Text;
using CacheUtils.DataDumperImport.DataStructures.Import;
using CacheUtils.DataDumperImport.FauxRegex;
using OptimizedCore;

namespace CacheUtils.DataDumperImport.IO
{
Expand Down Expand Up @@ -65,7 +66,7 @@ private StringKeyValueNode GetMultiLineKeyValue(string key, string value)
while (true)
{
string line = GetNextLine().Trim();
if (line.StartsWith("\'")) break;
if (line.OptimizedStartsWith('\'')) break;
_sb.Append(' ');
_sb.Append(line);
}
Expand Down
5 changes: 3 additions & 2 deletions CacheUtils/DataDumperImport/Import/Attribute.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
using System.Text.RegularExpressions;
using CacheUtils.DataDumperImport.DataStructures.Import;
using CacheUtils.DataDumperImport.Utilities;
using OptimizedCore;
using VariantAnnotation.Caches.DataStructures;
using VariantAnnotation.Interface.AnnotatedPositions;
using VariantAnnotation.Interface.Intervals;
Expand Down Expand Up @@ -46,7 +47,7 @@ public static (IInterval[] MicroRnas, IRnaEdit[] RnaEdits, bool CdsStartNotFound
if (!(node is ObjectValueNode objectValue))
throw new InvalidDataException($"Could not transform the AbstractData object into an ObjectValue: [{node.GetType()}]");

(var key, var value) = ParseKeyValue(objectValue);
(string key, string value) = ParseKeyValue(objectValue);
if (key == null) continue;

// ReSharper disable once SwitchStatementMissingSomeCases
Expand Down Expand Up @@ -85,7 +86,7 @@ private static IInterval GetInterval(string s)

private static RnaEdit GetRnaEdit(string s)
{
var cols = s.Split(' ');
var cols = s.OptimizedSplit(' ');
if (cols.Length != 3) throw new InvalidDataException($"Expected 3 columns but found {cols.Length} when parsing RNA edit");

int start = int.Parse(cols[0]);
Expand Down
5 changes: 3 additions & 2 deletions CacheUtils/Genbank/GenbankReader.cs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
using System;
using System.Collections.Generic;
using System.IO;
using OptimizedCore;
using VariantAnnotation.Interface.Intervals;
using VariantAnnotation.Utilities;

Expand Down Expand Up @@ -34,7 +35,7 @@ public GenbankEntry GetGenbankEntry()
// assert that the record starts with LOCUS
if (!HasLocus()) return null;

var (transcriptId, transcriptVersion) = ParseHeader();
(string transcriptId, byte transcriptVersion) = ParseHeader();
var featureData = ParseFeatures();
ParseOrigin();

Expand Down Expand Up @@ -150,7 +151,7 @@ private static IInterval GetInterval(string info)

private static IInterval GetJoinInterval(string info)
{
var cols = info.Substring(5, info.Length - 6).Split(',');
var cols = info.Substring(5, info.Length - 6).OptimizedSplit(',');
int start = int.Parse(cols[0].Split("..")[0]);
int end = int.Parse(cols[1].Split("..")[1]);
return new Interval(start, end);
Expand Down
6 changes: 3 additions & 3 deletions CacheUtils/Genes/Combiners/CombinerUtils.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@ public static class CombinerUtils
{
public static UgaGene Merge(UgaGene gene37, UgaGene gene38)
{
var ensemblId = CombineField(gene37.EnsemblId, gene38.EnsemblId);
var entrezGeneId = CombineField(gene37.EntrezGeneId, gene38.EntrezGeneId);
var hgncId = CombineField(gene37.HgncId, gene38.HgncId);
string ensemblId = CombineField(gene37.EnsemblId, gene38.EnsemblId);
string entrezGeneId = CombineField(gene37.EntrezGeneId, gene38.EntrezGeneId);
int hgncId = CombineField(gene37.HgncId, gene38.HgncId);
return new UgaGene(gene37.Chromosome, gene37.GRCh37, gene38.GRCh38, gene37.OnReverseStrand, entrezGeneId,
ensemblId, gene37.Symbol, hgncId);
}
Expand Down
2 changes: 1 addition & 1 deletion CacheUtils/Genes/Combiners/HgncIdCombiner.cs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ public void Combine(List<UgaGene> combinedGenes, HashSet<UgaGene> remainingGenes
var genesByHgnc37 = remainingGenes37.GetMultiValueDict(x => x.HgncId);
var genesByHgnc38 = remainingGenes38.GetMultiValueDict(x => x.HgncId);

foreach (var hgncId in hgncIds)
foreach (int hgncId in hgncIds)
{
var genes37 = GetGenesByHgncId(genesByHgnc37, hgncId);
var genes38 = GetGenesByHgncId(genesByHgnc38, hgncId);
Expand Down
6 changes: 3 additions & 3 deletions CacheUtils/Genes/Combiners/PartitionCombiner.cs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ private static void CombineSet(ICollection<UgaGene> combinedGenes, IEnumerable<U
var keyToGene38 = uga38.GetMultiValueDict(GetKey);
var keys = GetAllKeys(keyToGene37.Keys, keyToGene38.Keys);

foreach (var key in keys)
foreach (string key in keys)
{
var genes37 = GetGenesByKey(keyToGene37, key);
var genes38 = GetGenesByKey(keyToGene38, key);
Expand Down Expand Up @@ -57,8 +57,8 @@ private static List<UgaGene> GetGenesByKey(IReadOnlyDictionary<string, List<UgaG
private static IEnumerable<string> GetAllKeys(IEnumerable<string> keys37, IEnumerable<string> keys38)
{
var keys = new HashSet<string>();
foreach (var key in keys37) keys.Add(key);
foreach (var key in keys38) keys.Add(key);
foreach (string key in keys37) keys.Add(key);
foreach (string key in keys38) keys.Add(key);
return keys;
}

Expand Down
Loading

0 comments on commit 5944c5d

Please sign in to comment.