From b31fe9ec7aad5e2e1be40845fbd5bfb371345353 Mon Sep 17 00:00:00 2001 From: "Zhaoyuan (Ryan) Fu" Date: Fri, 6 Dec 2024 19:53:23 -0500 Subject: [PATCH] Optimize bins filtering by merging bins (#11248) * Merge genomic data bins working * Workaround for clickhouse bug in numerical data parsing --------- Co-authored-by: alisman --- .../helper/StudyViewFilterHelper.java | 65 +++++++++++ .../web/parameter/DataFilterValue.java | 8 ++ .../web/parameter/GenomicDataFilter.java | 7 ++ .../mybatisclickhouse/StudyViewMapper.xml | 3 + .../helper/StudyViewFilterHelperTest.java | 110 ++++++++++++++++++ 5 files changed, 193 insertions(+) create mode 100644 src/test/java/org/cbioportal/persistence/helper/StudyViewFilterHelperTest.java diff --git a/src/main/java/org/cbioportal/persistence/helper/StudyViewFilterHelper.java b/src/main/java/org/cbioportal/persistence/helper/StudyViewFilterHelper.java index 506c16118eb..b495f2c5976 100644 --- a/src/main/java/org/cbioportal/persistence/helper/StudyViewFilterHelper.java +++ b/src/main/java/org/cbioportal/persistence/helper/StudyViewFilterHelper.java @@ -5,10 +5,15 @@ import org.cbioportal.web.parameter.ClinicalDataFilter; import org.cbioportal.web.parameter.CategorizedGenericAssayDataCountFilter; import org.cbioportal.web.parameter.CustomSampleIdentifier; +import org.cbioportal.web.parameter.DataFilter; +import org.cbioportal.web.parameter.DataFilterValue; +import org.cbioportal.web.parameter.GenericAssayDataFilter; +import org.cbioportal.web.parameter.GenomicDataFilter; import org.cbioportal.web.parameter.StudyViewFilter; import org.springframework.lang.NonNull; import org.springframework.lang.Nullable; +import java.math.BigDecimal; import java.util.ArrayList; import java.util.EnumMap; import java.util.List; @@ -28,6 +33,18 @@ public static StudyViewFilterHelper build(@Nullable StudyViewFilter studyViewFil if (Objects.isNull(customDataSamples)) { customDataSamples = new ArrayList<>(); } + if (studyViewFilter.getGenomicDataFilters() != null && !studyViewFilter.getGenomicDataFilters().isEmpty()) { + List mergedGenomicDataFilters = mergeDataFilters(studyViewFilter.getGenomicDataFilters()); + studyViewFilter.setGenomicDataFilters(mergedGenomicDataFilters); + } + if (studyViewFilter.getClinicalDataFilters() != null && !studyViewFilter.getClinicalDataFilters().isEmpty()) { + List mergedClinicalDataFilters = mergeDataFilters(studyViewFilter.getClinicalDataFilters()); + studyViewFilter.setClinicalDataFilters(mergedClinicalDataFilters); + } + if (studyViewFilter.getGenericAssayDataFilters() != null && !studyViewFilter.getGenericAssayDataFilters().isEmpty()) { + List mergedGenericAssayDataFilters = mergeDataFilters(studyViewFilter.getGenericAssayDataFilters()); + studyViewFilter.setGenericAssayDataFilters(mergedGenericAssayDataFilters); + } return new StudyViewFilterHelper(studyViewFilter, genericAssayProfilesMap, customDataSamples); } @@ -93,4 +110,52 @@ public boolean isCategoricalClinicalDataFilter(ClinicalDataFilter clinicalDataFi return filterValue.getValue() != null; } + /** + * Merge the range of numerical bins in DataFilters to reduce the number of scans that runs on the database when filtering. + */ + public static List mergeDataFilters(List filters) { + boolean isNonNumericalOnly = true; + List mergedDataFilters = new ArrayList<>(); + + for (T filter : filters) { + List mergedValues = new ArrayList<>(); + List nonNumericalValues = new ArrayList<>(); + + BigDecimal mergedStart = null; + BigDecimal mergedEnd = null; + for (DataFilterValue dataFilterValue : filter.getValues()) { + // leave non-numerical values as they are + if (dataFilterValue.getValue() != null) { + nonNumericalValues.add(dataFilterValue); + } + // merge adjacent numerical bins + else { + isNonNumericalOnly = false; + BigDecimal start = dataFilterValue.getStart(); + BigDecimal end = dataFilterValue.getEnd(); + + if (mergedStart == null && mergedEnd == null) { + mergedStart = start; + mergedEnd = end; + } + else if (mergedEnd.equals(start)) { + mergedEnd = end; + } else { + mergedValues.add(new DataFilterValue(mergedStart, mergedEnd, null)); + mergedStart = start; + mergedEnd = end; + } + } + } + + if (!isNonNumericalOnly) { + mergedValues.add(new DataFilterValue(mergedStart, mergedEnd, null)); + } + mergedValues.addAll(nonNumericalValues); + filter.setValues(mergedValues); + mergedDataFilters.add(filter); + } + + return mergedDataFilters; + } } diff --git a/src/main/java/org/cbioportal/web/parameter/DataFilterValue.java b/src/main/java/org/cbioportal/web/parameter/DataFilterValue.java index 184aee40f29..af434cfa55d 100644 --- a/src/main/java/org/cbioportal/web/parameter/DataFilterValue.java +++ b/src/main/java/org/cbioportal/web/parameter/DataFilterValue.java @@ -9,6 +9,14 @@ public class DataFilterValue implements Serializable { private BigDecimal end; private String value; + public DataFilterValue() {} + + public DataFilterValue(BigDecimal start, BigDecimal end, String value) { + this.start = start; + this.end = end; + this.value = value; + } + public BigDecimal getStart() { return start; } diff --git a/src/main/java/org/cbioportal/web/parameter/GenomicDataFilter.java b/src/main/java/org/cbioportal/web/parameter/GenomicDataFilter.java index edf95943436..34aca9c0894 100644 --- a/src/main/java/org/cbioportal/web/parameter/GenomicDataFilter.java +++ b/src/main/java/org/cbioportal/web/parameter/GenomicDataFilter.java @@ -1,6 +1,7 @@ package org.cbioportal.web.parameter; import java.io.Serializable; +import java.util.List; public class GenomicDataFilter extends DataFilter implements Serializable { private String hugoGeneSymbol; @@ -13,6 +14,12 @@ public GenomicDataFilter(String hugoGeneSymbol, String profileType) { this.profileType = profileType; } + public GenomicDataFilter(String hugoGeneSymbol, String profileType, List values) { + this.hugoGeneSymbol = hugoGeneSymbol; + this.profileType = profileType; + this.setValues(values); + } + public String getHugoGeneSymbol() { return hugoGeneSymbol; } diff --git a/src/main/resources/org/cbioportal/persistence/mybatisclickhouse/StudyViewMapper.xml b/src/main/resources/org/cbioportal/persistence/mybatisclickhouse/StudyViewMapper.xml index cf1b0258792..d91ff582ffe 100644 --- a/src/main/resources/org/cbioportal/persistence/mybatisclickhouse/StudyViewMapper.xml +++ b/src/main/resources/org/cbioportal/persistence/mybatisclickhouse/StudyViewMapper.xml @@ -880,6 +880,9 @@ --> multiIf( + -- This condition is to prevent casting non numerical values to float + NOT match(${attribute_value}, '^[><]?=?[-+]?[0-9]*[.,]?[0-9]+$'), + NULL, (startsWith(${attribute_value}, '<=') OR startsWith(${attribute_value}, '>=')), cast(substr(${attribute_value}, 3) as float), startsWith(${attribute_value}, '<'), diff --git a/src/test/java/org/cbioportal/persistence/helper/StudyViewFilterHelperTest.java b/src/test/java/org/cbioportal/persistence/helper/StudyViewFilterHelperTest.java new file mode 100644 index 00000000000..f9044f09d37 --- /dev/null +++ b/src/test/java/org/cbioportal/persistence/helper/StudyViewFilterHelperTest.java @@ -0,0 +1,110 @@ +package org.cbioportal.persistence.helper; + +import org.cbioportal.web.parameter.DataFilterValue; +import org.cbioportal.web.parameter.GenomicDataFilter; +import org.junit.Test; + +import java.math.BigDecimal; +import java.util.ArrayList; +import java.util.List; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNull; + +public class StudyViewFilterHelperTest { + + // (-5, -1], (-1, 3], (3, 7] -> (-5, 7] + @Test + public void testMergeDataFilterNumericalContinuousValues() { + List genomicDataFilters = new ArrayList<>(); + List values = new ArrayList<>(); + values.add(new DataFilterValue(BigDecimal.valueOf(-5), BigDecimal.valueOf(-1), null)); + values.add(new DataFilterValue(BigDecimal.valueOf(-1), BigDecimal.valueOf(3), null)); + values.add(new DataFilterValue(BigDecimal.valueOf(3), BigDecimal.valueOf(7), null)); + genomicDataFilters.add(new GenomicDataFilter(null, null, values)); + + List mergedGenomicDataFilters = StudyViewFilterHelper.mergeDataFilters(genomicDataFilters); + List mergedDataFilterValues = mergedGenomicDataFilters.getFirst().getValues(); + BigDecimal start = mergedDataFilterValues.getFirst().getStart(); + BigDecimal end = mergedDataFilterValues.getFirst().getEnd(); + assertEquals(0, BigDecimal.valueOf(-5).compareTo(start)); + assertEquals(0, BigDecimal.valueOf(7).compareTo(end)); + } + + // (-2.5, -2.25], (-2.25, -2], (-1.75, -1.5], (-1.5, -1.25] -> (-2.5, -2], (-1.75, -1.25] + @Test + public void testMergeDataFilterNumericalDiscontinuousValues() { + List genomicDataFilters = new ArrayList<>(); + List values = new ArrayList<>(); + values.add(new DataFilterValue(BigDecimal.valueOf(-2.5), BigDecimal.valueOf(-2.25), null)); + values.add(new DataFilterValue(BigDecimal.valueOf(-2.25), BigDecimal.valueOf(-2), null)); + values.add(new DataFilterValue(BigDecimal.valueOf(-1.75), BigDecimal.valueOf(-1.5), null)); + values.add(new DataFilterValue(BigDecimal.valueOf(-1.5), BigDecimal.valueOf(-1.25), null)); + genomicDataFilters.add(new GenomicDataFilter(null, null, values)); + + List mergedGenomicDataFilters = StudyViewFilterHelper.mergeDataFilters(genomicDataFilters); + List mergedDataFilterValues = mergedGenomicDataFilters.getFirst().getValues(); + BigDecimal firstStart = mergedDataFilterValues.getFirst().getStart(); + BigDecimal firstEnd = mergedDataFilterValues.getFirst().getEnd(); + assertEquals(0, BigDecimal.valueOf(-2.5).compareTo(firstStart)); + assertEquals(0, BigDecimal.valueOf(-2).compareTo(firstEnd)); + + BigDecimal secondStart = mergedDataFilterValues.get(1).getStart(); + BigDecimal secondEnd = mergedDataFilterValues.get(1).getEnd(); + assertEquals(0, BigDecimal.valueOf(-1.75).compareTo(secondStart)); + assertEquals(0, BigDecimal.valueOf(-1.25).compareTo(secondEnd)); + } + + // (null, -2.25], (-2.25, -2], (-2, null] -> (null, null] + @Test + public void testMergeDataFilterNumericalInfiniteValues() { + List genomicDataFilters = new ArrayList<>(); + List values = new ArrayList<>(); + values.add(new DataFilterValue(null, BigDecimal.valueOf(-2.25), null)); + values.add(new DataFilterValue(BigDecimal.valueOf(-2.25), BigDecimal.valueOf(-2), null)); + values.add(new DataFilterValue(BigDecimal.valueOf(-2), null, null)); + genomicDataFilters.add(new GenomicDataFilter(null, null, values)); + + List mergedGenomicDataFilters = StudyViewFilterHelper.mergeDataFilters(genomicDataFilters); + List mergedDataFilterValues = mergedGenomicDataFilters.getFirst().getValues(); + BigDecimal start = mergedDataFilterValues.getFirst().getStart(); + BigDecimal end = mergedDataFilterValues.getFirst().getEnd(); + assertNull(start); + assertNull(end); + } + + // (-2.5, -2.25], (-2.25, -2], "NA" -> (-2.5, -1.75], "NA" + // This test also ensures the non-numerical values gets moved to the end + @Test + public void testMergeDataFilterNumericalNonNumericalValues() { + List genomicDataFilters = new ArrayList<>(); + List values = new ArrayList<>(); + values.add(new DataFilterValue(BigDecimal.valueOf(-2.5), BigDecimal.valueOf(-2.25), null)); + values.add(new DataFilterValue(BigDecimal.valueOf(-2.25), BigDecimal.valueOf(-2), null)); + values.add(new DataFilterValue(null, null, "NA")); + genomicDataFilters.add(new GenomicDataFilter(null, null, values)); + + List mergedGenomicDataFilters = StudyViewFilterHelper.mergeDataFilters(genomicDataFilters); + List mergedDataFilterValues = mergedGenomicDataFilters.getFirst().getValues(); + BigDecimal start = mergedDataFilterValues.getFirst().getStart(); + BigDecimal end = mergedDataFilterValues.getFirst().getEnd(); + String value = mergedDataFilterValues.get(1).getValue(); + assertEquals(0, BigDecimal.valueOf(-2.5).compareTo(start)); + assertEquals(0, BigDecimal.valueOf(-2).compareTo(end)); + assertEquals("NA", value); + } + + // "NA" -> "NA" + @Test + public void testMergeDataFilterNonNumericalOnlyValues() { + List genomicDataFilters = new ArrayList<>(); + List values = new ArrayList<>(); + values.add(new DataFilterValue(null, null, "NA")); + genomicDataFilters.add(new GenomicDataFilter(null, null, values)); + + List mergedGenomicDataFilters = StudyViewFilterHelper.mergeDataFilters(genomicDataFilters); + List mergedDataFilterValues = mergedGenomicDataFilters.getFirst().getValues(); + String value = mergedDataFilterValues.getFirst().getValue(); + assertEquals("NA", value); + } +}