Skip to content

Commit

Permalink
Optimize bins filtering by merging bins (#11248)
Browse files Browse the repository at this point in the history
* Merge genomic data bins working
* Workaround for clickhouse bug in numerical data parsing

---------

Co-authored-by: alisman <[email protected]>
  • Loading branch information
fuzhaoyuan and alisman authored Dec 7, 2024
1 parent a8d1e95 commit b31fe9e
Show file tree
Hide file tree
Showing 5 changed files with 193 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,15 @@
import org.cbioportal.web.parameter.ClinicalDataFilter;
import org.cbioportal.web.parameter.CategorizedGenericAssayDataCountFilter;
import org.cbioportal.web.parameter.CustomSampleIdentifier;
import org.cbioportal.web.parameter.DataFilter;
import org.cbioportal.web.parameter.DataFilterValue;
import org.cbioportal.web.parameter.GenericAssayDataFilter;
import org.cbioportal.web.parameter.GenomicDataFilter;
import org.cbioportal.web.parameter.StudyViewFilter;
import org.springframework.lang.NonNull;
import org.springframework.lang.Nullable;

import java.math.BigDecimal;
import java.util.ArrayList;
import java.util.EnumMap;
import java.util.List;
Expand All @@ -28,6 +33,18 @@ public static StudyViewFilterHelper build(@Nullable StudyViewFilter studyViewFil
if (Objects.isNull(customDataSamples)) {
customDataSamples = new ArrayList<>();
}
if (studyViewFilter.getGenomicDataFilters() != null && !studyViewFilter.getGenomicDataFilters().isEmpty()) {
List<GenomicDataFilter> mergedGenomicDataFilters = mergeDataFilters(studyViewFilter.getGenomicDataFilters());
studyViewFilter.setGenomicDataFilters(mergedGenomicDataFilters);
}
if (studyViewFilter.getClinicalDataFilters() != null && !studyViewFilter.getClinicalDataFilters().isEmpty()) {
List<ClinicalDataFilter> mergedClinicalDataFilters = mergeDataFilters(studyViewFilter.getClinicalDataFilters());
studyViewFilter.setClinicalDataFilters(mergedClinicalDataFilters);
}
if (studyViewFilter.getGenericAssayDataFilters() != null && !studyViewFilter.getGenericAssayDataFilters().isEmpty()) {
List<GenericAssayDataFilter> mergedGenericAssayDataFilters = mergeDataFilters(studyViewFilter.getGenericAssayDataFilters());
studyViewFilter.setGenericAssayDataFilters(mergedGenericAssayDataFilters);
}
return new StudyViewFilterHelper(studyViewFilter, genericAssayProfilesMap, customDataSamples);
}

Expand Down Expand Up @@ -93,4 +110,52 @@ public boolean isCategoricalClinicalDataFilter(ClinicalDataFilter clinicalDataFi
return filterValue.getValue() != null;
}

/**
* Merge the range of numerical bins in DataFilters to reduce the number of scans that runs on the database when filtering.
*/
public static <T extends DataFilter> List<T> mergeDataFilters(List<T> filters) {
boolean isNonNumericalOnly = true;
List<T> mergedDataFilters = new ArrayList<>();

for (T filter : filters) {
List<DataFilterValue> mergedValues = new ArrayList<>();
List<DataFilterValue> nonNumericalValues = new ArrayList<>();

BigDecimal mergedStart = null;
BigDecimal mergedEnd = null;
for (DataFilterValue dataFilterValue : filter.getValues()) {
// leave non-numerical values as they are
if (dataFilterValue.getValue() != null) {
nonNumericalValues.add(dataFilterValue);
}
// merge adjacent numerical bins
else {
isNonNumericalOnly = false;
BigDecimal start = dataFilterValue.getStart();
BigDecimal end = dataFilterValue.getEnd();

if (mergedStart == null && mergedEnd == null) {
mergedStart = start;
mergedEnd = end;
}
else if (mergedEnd.equals(start)) {
mergedEnd = end;
} else {
mergedValues.add(new DataFilterValue(mergedStart, mergedEnd, null));
mergedStart = start;
mergedEnd = end;
}
}
}

if (!isNonNumericalOnly) {
mergedValues.add(new DataFilterValue(mergedStart, mergedEnd, null));
}
mergedValues.addAll(nonNumericalValues);
filter.setValues(mergedValues);
mergedDataFilters.add(filter);
}

return mergedDataFilters;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,14 @@ public class DataFilterValue implements Serializable {
private BigDecimal end;
private String value;

public DataFilterValue() {}

public DataFilterValue(BigDecimal start, BigDecimal end, String value) {
this.start = start;
this.end = end;
this.value = value;
}

public BigDecimal getStart() {
return start;
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package org.cbioportal.web.parameter;

import java.io.Serializable;
import java.util.List;

public class GenomicDataFilter extends DataFilter implements Serializable {
private String hugoGeneSymbol;
Expand All @@ -13,6 +14,12 @@ public GenomicDataFilter(String hugoGeneSymbol, String profileType) {
this.profileType = profileType;
}

public GenomicDataFilter(String hugoGeneSymbol, String profileType, List<DataFilterValue> values) {
this.hugoGeneSymbol = hugoGeneSymbol;
this.profileType = profileType;
this.setValues(values);
}

public String getHugoGeneSymbol() {
return hugoGeneSymbol;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -880,6 +880,9 @@
-->
<sql id="castStringValueToFloat">
multiIf(
-- This condition is to prevent casting non numerical values to float
NOT match(${attribute_value}, '^[>&lt;]?=?[-+]?[0-9]*[.,]?[0-9]+$'),
NULL,
(startsWith(${attribute_value}, '&lt;=') OR startsWith(${attribute_value}, '>=')),
cast(substr(${attribute_value}, 3) as float),
startsWith(${attribute_value}, '&lt;'),
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
package org.cbioportal.persistence.helper;

import org.cbioportal.web.parameter.DataFilterValue;
import org.cbioportal.web.parameter.GenomicDataFilter;
import org.junit.Test;

import java.math.BigDecimal;
import java.util.ArrayList;
import java.util.List;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNull;

public class StudyViewFilterHelperTest {

// (-5, -1], (-1, 3], (3, 7] -> (-5, 7]
@Test
public void testMergeDataFilterNumericalContinuousValues() {
List<GenomicDataFilter> genomicDataFilters = new ArrayList<>();
List<DataFilterValue> values = new ArrayList<>();
values.add(new DataFilterValue(BigDecimal.valueOf(-5), BigDecimal.valueOf(-1), null));
values.add(new DataFilterValue(BigDecimal.valueOf(-1), BigDecimal.valueOf(3), null));
values.add(new DataFilterValue(BigDecimal.valueOf(3), BigDecimal.valueOf(7), null));
genomicDataFilters.add(new GenomicDataFilter(null, null, values));

List<GenomicDataFilter> mergedGenomicDataFilters = StudyViewFilterHelper.mergeDataFilters(genomicDataFilters);
List<DataFilterValue> mergedDataFilterValues = mergedGenomicDataFilters.getFirst().getValues();
BigDecimal start = mergedDataFilterValues.getFirst().getStart();
BigDecimal end = mergedDataFilterValues.getFirst().getEnd();
assertEquals(0, BigDecimal.valueOf(-5).compareTo(start));
assertEquals(0, BigDecimal.valueOf(7).compareTo(end));
}

// (-2.5, -2.25], (-2.25, -2], (-1.75, -1.5], (-1.5, -1.25] -> (-2.5, -2], (-1.75, -1.25]
@Test
public void testMergeDataFilterNumericalDiscontinuousValues() {
List<GenomicDataFilter> genomicDataFilters = new ArrayList<>();
List<DataFilterValue> values = new ArrayList<>();
values.add(new DataFilterValue(BigDecimal.valueOf(-2.5), BigDecimal.valueOf(-2.25), null));
values.add(new DataFilterValue(BigDecimal.valueOf(-2.25), BigDecimal.valueOf(-2), null));
values.add(new DataFilterValue(BigDecimal.valueOf(-1.75), BigDecimal.valueOf(-1.5), null));
values.add(new DataFilterValue(BigDecimal.valueOf(-1.5), BigDecimal.valueOf(-1.25), null));
genomicDataFilters.add(new GenomicDataFilter(null, null, values));

List<GenomicDataFilter> mergedGenomicDataFilters = StudyViewFilterHelper.mergeDataFilters(genomicDataFilters);
List<DataFilterValue> mergedDataFilterValues = mergedGenomicDataFilters.getFirst().getValues();
BigDecimal firstStart = mergedDataFilterValues.getFirst().getStart();
BigDecimal firstEnd = mergedDataFilterValues.getFirst().getEnd();
assertEquals(0, BigDecimal.valueOf(-2.5).compareTo(firstStart));
assertEquals(0, BigDecimal.valueOf(-2).compareTo(firstEnd));

BigDecimal secondStart = mergedDataFilterValues.get(1).getStart();
BigDecimal secondEnd = mergedDataFilterValues.get(1).getEnd();
assertEquals(0, BigDecimal.valueOf(-1.75).compareTo(secondStart));
assertEquals(0, BigDecimal.valueOf(-1.25).compareTo(secondEnd));
}

// (null, -2.25], (-2.25, -2], (-2, null] -> (null, null]
@Test
public void testMergeDataFilterNumericalInfiniteValues() {
List<GenomicDataFilter> genomicDataFilters = new ArrayList<>();
List<DataFilterValue> values = new ArrayList<>();
values.add(new DataFilterValue(null, BigDecimal.valueOf(-2.25), null));
values.add(new DataFilterValue(BigDecimal.valueOf(-2.25), BigDecimal.valueOf(-2), null));
values.add(new DataFilterValue(BigDecimal.valueOf(-2), null, null));
genomicDataFilters.add(new GenomicDataFilter(null, null, values));

List<GenomicDataFilter> mergedGenomicDataFilters = StudyViewFilterHelper.mergeDataFilters(genomicDataFilters);
List<DataFilterValue> mergedDataFilterValues = mergedGenomicDataFilters.getFirst().getValues();
BigDecimal start = mergedDataFilterValues.getFirst().getStart();
BigDecimal end = mergedDataFilterValues.getFirst().getEnd();
assertNull(start);
assertNull(end);
}

// (-2.5, -2.25], (-2.25, -2], "NA" -> (-2.5, -1.75], "NA"
// This test also ensures the non-numerical values gets moved to the end
@Test
public void testMergeDataFilterNumericalNonNumericalValues() {
List<GenomicDataFilter> genomicDataFilters = new ArrayList<>();
List<DataFilterValue> values = new ArrayList<>();
values.add(new DataFilterValue(BigDecimal.valueOf(-2.5), BigDecimal.valueOf(-2.25), null));
values.add(new DataFilterValue(BigDecimal.valueOf(-2.25), BigDecimal.valueOf(-2), null));
values.add(new DataFilterValue(null, null, "NA"));
genomicDataFilters.add(new GenomicDataFilter(null, null, values));

List<GenomicDataFilter> mergedGenomicDataFilters = StudyViewFilterHelper.mergeDataFilters(genomicDataFilters);
List<DataFilterValue> mergedDataFilterValues = mergedGenomicDataFilters.getFirst().getValues();
BigDecimal start = mergedDataFilterValues.getFirst().getStart();
BigDecimal end = mergedDataFilterValues.getFirst().getEnd();
String value = mergedDataFilterValues.get(1).getValue();
assertEquals(0, BigDecimal.valueOf(-2.5).compareTo(start));
assertEquals(0, BigDecimal.valueOf(-2).compareTo(end));
assertEquals("NA", value);
}

// "NA" -> "NA"
@Test
public void testMergeDataFilterNonNumericalOnlyValues() {
List<GenomicDataFilter> genomicDataFilters = new ArrayList<>();
List<DataFilterValue> values = new ArrayList<>();
values.add(new DataFilterValue(null, null, "NA"));
genomicDataFilters.add(new GenomicDataFilter(null, null, values));

List<GenomicDataFilter> mergedGenomicDataFilters = StudyViewFilterHelper.mergeDataFilters(genomicDataFilters);
List<DataFilterValue> mergedDataFilterValues = mergedGenomicDataFilters.getFirst().getValues();
String value = mergedDataFilterValues.getFirst().getValue();
assertEquals("NA", value);
}
}

0 comments on commit b31fe9e

Please sign in to comment.