Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update SAUtils for Cosmic #63

Open
wants to merge 19 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
111 changes: 73 additions & 38 deletions SAUtils/DataStructures/CosmicItem.cs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ public sealed class CosmicItem : ISupplementaryDataItem
private string Id { get; }
private string Gene { get; }
private int? SampleCount { get; }
public HashSet<CosmicStudy> Studies { get; }
public HashSet<CosmicTumor> Tumors { get; }

public CosmicItem(
Chromosome chromosome,
Expand All @@ -28,48 +28,69 @@ public CosmicItem(
string refAllele,
string altAllele,
string gene,
HashSet<CosmicStudy> studies, int? sampleCount)
HashSet<CosmicTumor> tumors,
int? sampleCount)
{
Chromosome = chromosome;
Position = position;
Id = id;
RefAllele = refAllele;
AltAllele = altAllele;
RefAllele = refAllele;
AltAllele = altAllele;
Gene = gene;
Studies = studies;
Tumors = tumors;
SampleCount = sampleCount;
}

/// <summary>
/// Hashes the item by its <c>Id</c> alone; an item whose <c>Id</c> is null hashes to 0.
/// </summary>
/// <returns>The hash code of <c>Id</c>, or 0 when <c>Id</c> is null.</returns>
public override int GetHashCode() => Id?.GetHashCode() ?? 0;

public sealed class CosmicStudy : IEquatable<CosmicStudy>
public sealed class CosmicTumor : IEquatable<CosmicTumor>
{
#region members

public string Id { get; }
public IEnumerable<string> Histologies { get; }
public IEnumerable<string> Sites { get; }
public string Histology { get; }
public string Site { get; }
public string Tier { get; }

#endregion

public CosmicStudy(string studyId, IEnumerable<string> histologies, IEnumerable<string> sites)
public CosmicTumor(string tumorId,
string histology,
string site,
string tier)
{
Id = studyId;
Sites = sites;
Histologies = histologies;
Id = tumorId;
Site = site;
Histology = histology;
Tier = tier;
}

public bool Equals(CosmicStudy other)
public bool Equals(CosmicTumor other)
{
if (other == null) return false;

return Id.Equals(other.Id)
&& Histologies.SequenceEqual(other.Histologies)
&& Sites.SequenceEqual(other.Sites);
&& StringsEqual(Histology, other.Histology)
&& StringsEqual(Site, other.Site)
&& StringsEqual(Tier, other.Tier);
}

/// <summary>
/// Null-safe, ordinal equality check for two strings.
/// </summary>
/// <param name="s1">First string; may be null.</param>
/// <param name="s2">Second string; may be null.</param>
/// <returns>
/// True when both arguments are null or both hold the same character sequence;
/// false when exactly one is null or the sequences differ.
/// </returns>
private static bool StringsEqual(string s1, string s2)
{
    // The static string.Equals already treats (null, null) as equal and
    // (null, non-null) as unequal, so no explicit null checks are needed.
    return string.Equals(s1, s2);
}

/// <summary>
/// Hashes the tumor by its <c>Id</c> only; a null <c>Id</c> hashes to 0.
/// </summary>
/// <returns>The hash code of <c>Id</c>, or 0 when <c>Id</c> is null.</returns>
public override int GetHashCode()
{
    // Histology and Site are deliberately not mixed into the hash (that variant is disabled).
    return Id?.GetHashCode() ?? 0;
}
}
Expand All @@ -86,51 +107,65 @@ public string GetJsonString()
jsonObject.AddStringValue("gene", Gene);
jsonObject.AddIntValue("sampleCount", SampleCount);

jsonObject.AddStringValue("cancerTypesAndCounts", GetJsonStringFromDict("cancerType",GetCancerTypeCounts()), false);
jsonObject.AddStringValue("cancerSitesAndCounts", GetJsonStringFromDict("cancerSite",GetTissueCounts()), false);
jsonObject.AddStringValue("cancerTypesAndCounts", GetJsonStringFromDict("cancerType", GetCancerTypeCounts()), false);
jsonObject.AddStringValue("cancerSitesAndCounts", GetJsonStringFromDict("cancerSite", GetTissueCounts()), false);
jsonObject.AddStringValue("tiersAndCounts", GetJsonStringFromDict("tier", GetTierCounts()), false);

return StringBuilderPool.GetStringAndReturn(sb);
}

internal Dictionary<string,int> GetTissueCounts()
{
if (Studies == null) return null;
if (Tumors == null) return null;
var tissueCounts = new Dictionary<string, int>();
foreach (var study in Studies)
foreach (var tumor in Tumors)
{
if (study.Sites == null) return null;
if (string.IsNullOrEmpty(tumor.Site)) continue;

foreach (var site in study.Sites)
if (tissueCounts.TryGetValue(tumor.Site, out _))
{
if (tissueCounts.TryGetValue(site, out _))
{
tissueCounts[site]++;
}
else tissueCounts[site] = 1;
tissueCounts[tumor.Site]++;
}
else tissueCounts[tumor.Site] = 1;
}

return tissueCounts;
}

internal Dictionary<string,int> GetCancerTypeCounts()
{
if (Studies == null) return null;
var cancerTypeCounts = new Dictionary<string, int>();
foreach (var study in Studies)
if (Tumors == null) return null;
var histologyCounts = new Dictionary<string, int>();
foreach (var tumor in Tumors)
{
if (string.IsNullOrEmpty(tumor.Histology)) continue;

if (histologyCounts.TryGetValue(tumor.Histology, out _))
{
histologyCounts[tumor.Histology]++;
}
else histologyCounts[tumor.Histology] = 1;
}

return histologyCounts;
}

internal Dictionary<string,int> GetTierCounts()
{
if (Tumors == null) return null;
var tierCounts = new Dictionary<string, int>();
foreach (var tumor in Tumors)
{
if (study.Histologies == null) return null;
foreach (var histology in study.Histologies)
if (string.IsNullOrEmpty(tumor.Tier)) continue;

if (tierCounts.TryGetValue(tumor.Tier, out _))
{
if (cancerTypeCounts.TryGetValue(histology, out _))
{
cancerTypeCounts[histology]++;
}
else cancerTypeCounts[histology] = 1;
tierCounts[tumor.Tier]++;
}
else tierCounts[tumor.Tier] = 1;
}

return cancerTypeCounts;
return tierCounts;
}

private static string GetJsonStringFromDict(string dataType, Dictionary<string, int> dictionary)
Expand Down
95 changes: 43 additions & 52 deletions SAUtils/InputFileParsers/Cosmic/MergedCosmicReader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -19,37 +19,38 @@ public sealed class MergedCosmicReader
private string _geneName;
private int? _sampleCount;

private int _mutationIdIndex = -1;
private int _cosmicIdIndex = -1;
private int _primarySiteIndex = -1;
private int _primaryHistologyIndex = -1;
private int _studyIdIndex = -1;
private int _tumorIdIndex = -1;
private int _tierIndex = -1;

private const string StudyIdTag = "ID_STUDY";
private const string TumorIdTag = "ID_tumour";

private readonly Dictionary<string, Chromosome> _refChromDict;
private readonly ISequenceProvider _sequenceProvider;
private readonly Dictionary<string, HashSet<CosmicItem.CosmicStudy>> _studies;
private readonly Dictionary<string, HashSet<CosmicItem.CosmicTumor>> _tumors;

public MergedCosmicReader(string vcfFile, string tsvFile, ISequenceProvider sequenceProvider)
{
_vcfFileReader = GZipUtilities.GetAppropriateStreamReader(vcfFile);
_tsvFileReader = GZipUtilities.GetAppropriateStreamReader(tsvFile);
_sequenceProvider = sequenceProvider;
_refChromDict = _sequenceProvider.RefNameToChromosome;
_studies = new Dictionary<string, HashSet<CosmicItem.CosmicStudy>>();
_tumors = new Dictionary<string, HashSet<CosmicItem.CosmicTumor>>();
}

public IEnumerable<CosmicItem> GetItems()
{
// load all studies into the dictionary
// load all tumors into the dictionary
using (_tsvFileReader)
{
string line;
while ((line = _tsvFileReader.ReadLine()) != null)
{
if (IsHeaderLine(line))
GetColumnIndexes(line); // the first line is supposed to be the header line
else AddCosmicStudy(line);
else AddCosmicTumor(line);
}
}

Expand All @@ -74,87 +75,77 @@ public IEnumerable<CosmicItem> GetItems()
}
}

private void AddCosmicStudy(string line)
private void AddCosmicTumor(string line)
{
var columns = line.OptimizedSplit('\t');

string mutationId = columns[_mutationIdIndex];
string studyId = columns[_studyIdIndex];
var sites = GetSites(columns);
var histologies = GetHistologies(columns);

if (string.IsNullOrEmpty(mutationId)) return;

var study = new CosmicItem.CosmicStudy(studyId, histologies, sites);
if (_studies.TryGetValue(mutationId, out var studySet))
studySet.Add(study);
else _studies[mutationId] = new HashSet<CosmicItem.CosmicStudy> { study };
}

private List<string> GetHistologies(string[] columns)
{
var histologies = new HashSet<string>();
var primaryHistology = columns[_primaryHistologyIndex].Replace('_', ' ');
TryAddValue(primaryHistology, histologies);

return histologies.ToList();
}

private List<string> GetSites(string[] columns)
{
var sites = new HashSet<string>();
string cosmicId = columns[_cosmicIdIndex];
string tumorId = columns[_tumorIdIndex];
string site = GetString(columns[_primarySiteIndex]);
string histology = GetString(columns[_primaryHistologyIndex]);
string tier = GetString(columns[_tierIndex]);

var primarySite = columns[_primarySiteIndex].Replace('_', ' ');
TryAddValue(primarySite, sites);
if (string.IsNullOrEmpty(cosmicId)) return;

return sites.ToList();
var tumor = new CosmicItem.CosmicTumor(tumorId, histology, site, tier);
if (_tumors.TryGetValue(cosmicId, out var tumorSet))
tumorSet.Add(tumor);
else _tumors[cosmicId] = new HashSet<CosmicItem.CosmicTumor> { tumor };
}

private static void TryAddValue(string value, ISet<string> sites)
private string GetString(string value)
{
if (!string.IsNullOrEmpty(value) && value != "NS")
sites.Add(value);
if (string.IsNullOrEmpty(value) || value == "NS")
return null;
value = value.Replace('_', ' ');
return value;
}

private static bool IsHeaderLine(string line) => line.Contains(StudyIdTag);
private static bool IsHeaderLine(string line) => line.Contains(TumorIdTag);

private void GetColumnIndexes(string headerLine)
{
//Gene name Accession Number Gene CDS length HGNC ID Sample name ID_sample ID_tumour Primary site Site subtype 1 Site subtype 2 Site subtype 3 Primary histology Histology subtype 1 Histology subtype 2 Histology subtype 3 Genome-wide screen Mutation ID Mutation CDS Mutation AA Mutation Description Mutation zygosity LOH GRCh Mutation genome position Mutation strand SNP FATHMM prediction FATHMM score Mutation somatic status Pubmed_PMID ID_STUDY Sample source Tumour origin Age
//Gene name Accession Number Gene CDS length HGNC ID Sample name ID_sample ID_tumour Primary site Site subtype 1 Site subtype 2 Site subtype 3 Primary histology Histology subtype 1 Histology subtype 2 Histology subtype 3 Genome-wide screen GENOMIC_MUTATION_ID LEGACY_MUTATION_ID MUTATION_ID Mutation CDS Mutation AA Mutation Description Mutation zygosity LOH GRCh Mutation genome position Mutation strand SNP Resistance Mutation FATHMM prediction FATHMM score Mutation somatic status Pubmed_PMID ID_STUDY Sample Type Tumour origin Age Tier HGVSP HGVSC HGVSG

_mutationIdIndex = -1;
_studyIdIndex = -1;
_cosmicIdIndex = -1;
_tumorIdIndex = -1;
_primarySiteIndex = -1;
_primaryHistologyIndex = -1;
_tierIndex = -1;

var columns = headerLine.OptimizedSplit('\t');
for (int i = 0; i < columns.Length; i++)
{
switch (columns[i])
{
case "Mutation ID":
_mutationIdIndex = i;
case "GENOMIC_MUTATION_ID":
_cosmicIdIndex = i;
break;
case StudyIdTag:
_studyIdIndex = i;
case TumorIdTag:
_tumorIdIndex = i;
break;
case "Primary site":
_primarySiteIndex = i;
break;
case "Primary histology":
_primaryHistologyIndex = i;
break;
case "Tier":
_tierIndex = i;
break;
}
}

if (_mutationIdIndex == -1)
throw new InvalidDataException("Column for mutation Id could not be detected");
if (_studyIdIndex == -1)
throw new InvalidDataException("Column for study Id could not be detected");
if (_cosmicIdIndex == -1)
throw new InvalidDataException("Column for Cosmic Id could not be detected");
if (_tumorIdIndex == -1)
throw new InvalidDataException("Column for tumor Id could not be detected");
if (_primarySiteIndex == -1)
throw new InvalidDataException("Column for primary site could not be detected");
if (_primaryHistologyIndex == -1)
throw new InvalidDataException("Column for primary histology could not be detected");
if (_tierIndex == -1)
throw new InvalidDataException("Column for tier could not be detected");
}

private const int MaxVariantLength= 1000;
Expand Down Expand Up @@ -185,7 +176,7 @@ internal List<CosmicItem> ExtractCosmicItems(string vcfLine)
var (shiftedPos, shiftedRef, shiftedAlt) = VariantUtils.TrimAndLeftAlign(position, refAllele,
altAllele, _sequenceProvider.Sequence);

cosmicItems.Add(_studies.TryGetValue(cosmicId, out var studies)
cosmicItems.Add(_tumors.TryGetValue(cosmicId, out var studies)
? new CosmicItem(chromosome, shiftedPos, cosmicId, shiftedRef, shiftedAlt, _geneName, studies,
_sampleCount)
: new CosmicItem(chromosome, shiftedPos, cosmicId, shiftedRef, shiftedAlt, _geneName, null,
Expand Down
Loading