Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add interval logic for l2g features #812

Open
wants to merge 43 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 9 commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
9c31f43
feat: add interval logic for l2g features
xyg123 Oct 3, 2024
330b79e
chore: fix docstrings
xyg123 Oct 3, 2024
183c827
chore: fix attribute errors
xyg123 Oct 3, 2024
500bae8
Merge branch 'dev' of https://github.com/opentargets/gentropy into xg…
xyg123 Oct 3, 2024
7cb4b5f
Merge branch 'dev' of https://github.com/opentargets/gentropy into xg…
xyg123 Oct 7, 2024
2035a52
fix: multiple input lines from merge
xyg123 Oct 7, 2024
985a901
fix: change to mean comparison, add additional interval features
xyg123 Oct 7, 2024
b01b4e8
fix: change to mean comparison, add additional interval features
xyg123 Oct 7, 2024
688c73a
Merge branch 'dev' of https://github.com/opentargets/gentropy into xg…
xyg123 Oct 7, 2024
6837df3
Merge branch 'dev' of https://github.com/opentargets/gentropy into xg…
xyg123 Oct 15, 2024
f194098
Merge branch 'dev' of https://github.com/opentargets/gentropy into xg…
xyg123 Oct 16, 2024
a9c0f6b
fix: change interval schema, reorganise interval processing, begin ad…
xyg123 Oct 17, 2024
63d6db6
Merge branch 'dev' of https://github.com/opentargets/gentropy into xg…
xyg123 Oct 17, 2024
374a7c3
Merge branch 'dev' of https://github.com/opentargets/gentropy into xg…
xyg123 Oct 18, 2024
29ad08b
Merge branch 'dev' of https://github.com/opentargets/gentropy into xg…
xyg123 Oct 21, 2024
42e4ce9
Merge branch 'dev' of https://github.com/opentargets/gentropy into xg…
xyg123 Nov 21, 2024
55f947f
fix: schema fixes
xyg123 Nov 22, 2024
1de5fcf
Merge branch 'dev' of https://github.com/opentargets/gentropy into xg…
xyg123 Dec 10, 2024
c332d93
Added working tests for interval + nbh features
xyg123 Dec 11, 2024
ee8c4f2
chore: pre-commit auto fixes [...]
pre-commit-ci[bot] Dec 11, 2024
737a827
fix: l2g_feature_matrix tests
xyg123 Dec 11, 2024
921c820
Merge branch 'xg1_l2g_intervals' of https://github.com/opentargets/ge…
xyg123 Dec 11, 2024
0e23427
chore: pre-commit auto fixes [...]
pre-commit-ci[bot] Dec 11, 2024
6ac2d12
fix l2g_feature_matrix tests
xyg123 Dec 11, 2024
b1b2aa5
Merge branch 'xg1_l2g_intervals' of https://github.com/opentargets/ge…
xyg123 Dec 11, 2024
4f893fb
fix l2g_feature_matrix tests
xyg123 Dec 11, 2024
2bbf69c
Merge branch 'dev' of https://github.com/opentargets/gentropy into xg…
xyg123 Dec 16, 2024
aed12ec
fix l2g step for intervals
xyg123 Dec 17, 2024
37109e3
Merge branch 'dev' of https://github.com/opentargets/gentropy into xg…
xyg123 Dec 17, 2024
054eaa3
chore: pre-commit auto fixes [...]
pre-commit-ci[bot] Dec 17, 2024
ad934c4
generate features by overlapping studyLocus variants
xyg123 Dec 17, 2024
0eea3aa
Merge branch 'xg1_l2g_intervals' of https://github.com/opentargets/ge…
xyg123 Dec 17, 2024
8140d5a
fix on l2g step mypy
xyg123 Dec 17, 2024
24dc8c3
type hint issue
xyg123 Dec 17, 2024
9aeb302
add datasource step to process intervals
xyg123 Dec 17, 2024
155fcdb
chore: pre-commit auto fixes [...]
pre-commit-ci[bot] Dec 17, 2024
53a6ff3
add interval doc .md
xyg123 Dec 19, 2024
b8914a7
Merge branch 'dev' of https://github.com/opentargets/gentropy into xg…
xyg123 Dec 19, 2024
78f661b
changes to config
xyg123 Dec 19, 2024
cf8b260
Merge branch 'xg1_l2g_intervals' of https://github.com/opentargets/ge…
xyg123 Dec 19, 2024
880cacf
chore: pre-commit auto fixes [...]
pre-commit-ci[bot] Dec 19, 2024
c076e17
address feature name comments and tests
xyg123 Dec 19, 2024
b074bc4
Merge branch 'xg1_l2g_intervals' of https://github.com/opentargets/ge…
xyg123 Dec 19, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions src/gentropy/config.py
xyg123 marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,13 @@ class LocusToGeneConfig(StepConfig):
"vepMaximumNeighbourhood",
"vepMean",
"vepMeanNeighbourhood",
# intervals
"pchicMean",
"pchicMeanNeighbourhood",
"enhTssMean",
xyg123 marked this conversation as resolved.
Show resolved Hide resolved
"enhTssMeanNeighbourhood",
xyg123 marked this conversation as resolved.
Show resolved Hide resolved
"dhsPmtrMean",
xyg123 marked this conversation as resolved.
Show resolved Hide resolved
"dhsPmtrMeanNeighbourhood",
xyg123 marked this conversation as resolved.
Show resolved Hide resolved
]
)
hyperparameters: dict[str, Any] = field(
Expand All @@ -282,6 +289,11 @@ class LocusToGeneConfig(StepConfig):
wandb_run_name: str | None = None
hf_hub_repo_id: str | None = "opentargets/locus_to_gene"
download_from_hub: bool = True
# interval_sources: dict[str, str] | None = {
xyg123 marked this conversation as resolved.
Show resolved Hide resolved
# "javierre": "gs://genetics_etl_python_playground/static_assets/javierre_2016_preprocessed",
# "thurman": "gs://genetics_etl_python_playground/static_assets/thurman_2012/genomewideCorrs_above0.7_promoterPlusMinus500kb_withGeneNames_32celltypeCategories.bed8.gz",
# "andersson": "gs://genetics_etl_python_playground/static_assets/andersson2014/enhancer_tss_associations.bed",
# }
write_feature_matrix: bool = True
_target_: str = "gentropy.l2g.LocusToGeneStep"

Expand Down
348 changes: 348 additions & 0 deletions src/gentropy/dataset/l2g_features/intervals.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,348 @@
"""Collection of methods that extract features from the interval datasets."""

from __future__ import annotations

from typing import TYPE_CHECKING, Any

import pyspark.sql.functions as f
from pyspark.sql import Window

from gentropy.common.spark_helpers import convert_from_wide_to_long

# from gentropy.dataset.colocalisation import Colocalisation
xyg123 marked this conversation as resolved.
Show resolved Hide resolved
from gentropy.dataset.intervals import Intervals
from gentropy.dataset.l2g_features.l2g_feature import L2GFeature
from gentropy.dataset.l2g_gold_standard import L2GGoldStandard

# from gentropy.dataset.study_index import StudyIndex
from gentropy.dataset.study_locus import StudyLocus

if TYPE_CHECKING:
from pyspark.sql import DataFrame


def common_interval_feature_logic(
    study_loci_to_annotate: StudyLocus | L2GGoldStandard,
    *,
    intervals: Intervals,
    feature_name: str,
    interval_source: str,
) -> DataFrame:
    """Compute the mean posterior-probability-weighted interval score per study locus and gene.

    The `locus` array of every study locus is exploded into one row per variant. Each
    variant is matched by variant ID against the intervals of a single datasource, the
    matching `resourceScore` is multiplied by the variant's posterior probability, and
    the weighted scores are averaged per (studyLocusId, geneId).

    Args:
        study_loci_to_annotate (StudyLocus | L2GGoldStandard): The dataset containing study loci
            that will be used for annotation
        intervals (Intervals): The dataset containing interval information
        feature_name (str): The name of the feature; used as the name of the output score column
        interval_source (str): The `datasourceId` value used to select the intervals

    Returns:
        DataFrame: One row per (studyLocusId, geneId) holding the mean weighted score in a column named after `feature_name`
    """
    # One row per variant of each locus, keeping only what the weighting needs.
    variants_in_loci = study_loci_to_annotate.df.withColumn(
        "variantInLocus", f.explode_outer("locus")
    ).select(
        "studyLocusId",
        f.col("variantInLocus.variantId").alias("variantInLocusId"),
        f.col("variantInLocus.posteriorProbability").alias(
            "variantInLocusPosteriorProbability"
        ),
    )
    # Restrict the intervals to the requested datasource and align column names.
    source_intervals = (
        intervals.df.filter(f.col("datasourceId") == interval_source)
        .withColumnRenamed("variantId", "variantInLocusId")
        .withColumnRenamed("targetId", "geneId")
    )
    return (
        # The variant rows carry no gene column, so the variant ID is the only
        # shared key; geneId is contributed by the interval side of the join.
        variants_in_loci.join(source_intervals, on="variantInLocusId", how="inner")
        .withColumn(
            "weightedIntervalScore",
            f.col("resourceScore") * f.col("variantInLocusPosteriorProbability"),
        )
        .groupBy("studyLocusId", "geneId")
        # Only mean interval features are implemented.
        .agg(f.mean(f.col("weightedIntervalScore")).alias(feature_name))
    )


def common_neighbourhood_interval_feature_logic(
    study_loci_to_annotate: StudyLocus | L2GGoldStandard,
    *,
    intervals: Intervals,
    feature_name: str,
    interval_source: str,
) -> DataFrame:
    """Compute a gene's mean weighted interval score relative to the other genes of the same study locus.

    The local (per gene) mean is computed with `common_interval_feature_logic` under the
    feature name with "Neighbourhood" removed. The feature value is that local mean minus
    the average of the local means over all rows sharing the same studyLocusId.

    Args:
        study_loci_to_annotate (StudyLocus | L2GGoldStandard): The dataset containing study loci that will be used for annotation
        intervals (Intervals): The dataset containing interval information
        feature_name (str): The name of the feature; used as the name of the output score column
        interval_source (str): The datasource of the interval input

    Returns:
        DataFrame: One row per (studyLocusId, geneId) with the neighbourhood score in a column named after `feature_name`
    """
    local_feature_name = feature_name.replace("Neighbourhood", "")
    # First compute mean interval scores to a gene
    local_mean = common_interval_feature_logic(
        study_loci_to_annotate,
        feature_name=local_feature_name,
        intervals=intervals,
        interval_source=interval_source,
    )
    # Intermediate columns must not reach the caller: the callers in this module
    # reshape every non-id column into a feature row, so a leftover local-mean
    # column would presumably surface as an extra feature.
    helper_columns = ["regional_mean"]
    if local_feature_name != feature_name:
        # Guard: if the name held no "Neighbourhood" the local column IS the feature.
        helper_columns.append(local_feature_name)
    return (
        # The regional mean is shared by every gene of a study locus; the feature
        # itself is each gene's deviation from it.
        local_mean.withColumn(
            "regional_mean",
            f.mean(local_feature_name).over(Window.partitionBy("studyLocusId")),
        )
        .withColumn(feature_name, f.col(local_feature_name) - f.col("regional_mean"))
        .drop(*helper_columns)
    )


class PchicMeanFeature(L2GFeature):
"""Average weighted CHiCAGO scores from studylocus to gene TSS."""
xyg123 marked this conversation as resolved.
Show resolved Hide resolved

fill_na_value = 0 # would be 0 if implemented
xyg123 marked this conversation as resolved.
Show resolved Hide resolved
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not an attribute of L2GFeature, wouldn't it fail?

feature_dependency_type = Intervals
feature_name = "pchicMean"

@classmethod
def compute(
    cls: type[PchicMeanFeature],
    study_loci_to_annotate: StudyLocus | L2GGoldStandard,
    feature_dependency: dict[str, Any],
) -> PchicMeanFeature:
    """Build the feature from the "javierre2016" interval datasource.

    Args:
        study_loci_to_annotate (StudyLocus | L2GGoldStandard): The dataset containing study loci that will be used for annotation
        feature_dependency (dict[str, Any]): Keyword arguments unpacked into `common_interval_feature_logic` (it requires an `intervals` keyword)

    Returns:
        PchicMeanFeature: Feature dataset
    """
    # Wide table: one score column named after the feature.
    wide_scores = common_interval_feature_logic(
        study_loci_to_annotate,
        feature_name=cls.feature_name,
        interval_source="javierre2016",
        **feature_dependency,
    )
    # Reshape to featureName / featureValue pairs keyed by locus and gene.
    long_scores = convert_from_wide_to_long(
        wide_scores,
        id_vars=("studyLocusId", "geneId"),
        var_name="featureName",
        value_name="featureValue",
    )
    return cls(_df=long_scores, _schema=cls.get_schema())


class PchicMeanNeighbourhoodFeature(L2GFeature):
"""Average weighted CHiCAGO scores from studylocus to gene TSS.
xyg123 marked this conversation as resolved.
Show resolved Hide resolved

In comparison to the Mean weighted CHiCAGO scores for all genes in the vicinity.
"""

fill_na_value = 0 # would be 0 if implemented
xyg123 marked this conversation as resolved.
Show resolved Hide resolved
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same as above

feature_dependency_type = Intervals
feature_name = "pchicMeanNeighbourhood"

@classmethod
def compute(
    cls: type[PchicMeanNeighbourhoodFeature],
    study_loci_to_annotate: StudyLocus | L2GGoldStandard,
    feature_dependency: dict[str, Any],
) -> PchicMeanNeighbourhoodFeature:
    """Build the neighbourhood feature from the "javierre2016" interval datasource.

    Args:
        study_loci_to_annotate (StudyLocus | L2GGoldStandard): The dataset containing study loci that will be used for annotation
        feature_dependency (dict[str, Any]): Keyword arguments unpacked into `common_neighbourhood_interval_feature_logic` (it requires an `intervals` keyword)

    Returns:
        PchicMeanNeighbourhoodFeature: Feature dataset
    """
    # Wide table produced by the shared neighbourhood logic.
    wide_scores = common_neighbourhood_interval_feature_logic(
        study_loci_to_annotate,
        feature_name=cls.feature_name,
        interval_source="javierre2016",
        **feature_dependency,
    )
    # Reshape to featureName / featureValue pairs keyed by locus and gene.
    long_scores = convert_from_wide_to_long(
        wide_scores,
        id_vars=("studyLocusId", "geneId"),
        var_name="featureName",
        value_name="featureValue",
    )
    return cls(_df=long_scores, _schema=cls.get_schema())


class EnhTssMeanFeature(L2GFeature):
xyg123 marked this conversation as resolved.
Show resolved Hide resolved
"""Average weighted Enhancer-TSS correlation between studylocus and gene TSS."""
xyg123 marked this conversation as resolved.
Show resolved Hide resolved

fill_na_value = 0 # would be 0 if implemented
xyg123 marked this conversation as resolved.
Show resolved Hide resolved
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same as above

feature_dependency_type = Intervals
feature_name = "enhTssMean"
xyg123 marked this conversation as resolved.
Show resolved Hide resolved

@classmethod
def compute(
    cls: type[EnhTssMeanFeature],
    study_loci_to_annotate: StudyLocus | L2GGoldStandard,
    feature_dependency: dict[str, Any],
) -> EnhTssMeanFeature:
    """Build the feature from the "andersson2014" interval datasource.

    Args:
        study_loci_to_annotate (StudyLocus | L2GGoldStandard): The dataset containing study loci that will be used for annotation
        feature_dependency (dict[str, Any]): Keyword arguments unpacked into `common_interval_feature_logic` (it requires an `intervals` keyword)

    Returns:
        EnhTssMeanFeature: Feature dataset
    """
    # Wide table: one score column named after the feature.
    wide_scores = common_interval_feature_logic(
        study_loci_to_annotate,
        feature_name=cls.feature_name,
        interval_source="andersson2014",
        **feature_dependency,
    )
    # Reshape to featureName / featureValue pairs keyed by locus and gene.
    long_scores = convert_from_wide_to_long(
        wide_scores,
        id_vars=("studyLocusId", "geneId"),
        var_name="featureName",
        value_name="featureValue",
    )
    return cls(_df=long_scores, _schema=cls.get_schema())


class EnhTssMeanNeighbourhoodFeature(L2GFeature):
    """Average weighted Enhancer-TSS correlation from study locus to gene TSS.

    Compared to the mean weighted Enhancer-TSS correlation for all genes in the vicinity.
    """

    fill_na_value = 0  # would be 0 if implemented
    feature_dependency_type = Intervals
    # The emitted feature name must be the bare feature name (no "Feature" class
    # suffix) so it matches the "enhTssMeanNeighbourhood" entry of the L2G
    # features list and follows the naming used by the sibling feature classes.
    feature_name = "enhTssMeanNeighbourhood"

    @classmethod
    def compute(
        cls: type[EnhTssMeanNeighbourhoodFeature],
        study_loci_to_annotate: StudyLocus | L2GGoldStandard,
        feature_dependency: dict[str, Any],
    ) -> EnhTssMeanNeighbourhoodFeature:
        """Build the neighbourhood feature from the "andersson2014" interval datasource.

        Args:
            study_loci_to_annotate (StudyLocus | L2GGoldStandard): The dataset containing study loci that will be used for annotation
            feature_dependency (dict[str, Any]): Keyword arguments unpacked into `common_neighbourhood_interval_feature_logic` (it requires an `intervals` keyword)

        Returns:
            EnhTssMeanNeighbourhoodFeature: Feature dataset
        """
        interval_source = "andersson2014"
        return cls(
            _df=convert_from_wide_to_long(
                common_neighbourhood_interval_feature_logic(
                    study_loci_to_annotate,
                    feature_name=cls.feature_name,
                    interval_source=interval_source,
                    **feature_dependency,
                ),
                id_vars=("studyLocusId", "geneId"),
                var_name="featureName",
                value_name="featureValue",
            ),
            _schema=cls.get_schema(),
        )


class DhsPmtrMeanFeature(L2GFeature):
xyg123 marked this conversation as resolved.
Show resolved Hide resolved
"""Average weighted DHS-promoter correlation between studylocus and gene TSS."""
xyg123 marked this conversation as resolved.
Show resolved Hide resolved

fill_na_value = 0 # would be 0 if implemented
xyg123 marked this conversation as resolved.
Show resolved Hide resolved
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same as above

feature_dependency_type = Intervals
feature_name = "dhsPmtrMean"

@classmethod
def compute(
    cls: type[DhsPmtrMeanFeature],
    study_loci_to_annotate: StudyLocus | L2GGoldStandard,
    feature_dependency: dict[str, Any],
) -> DhsPmtrMeanFeature:
    """Build the feature from the "thurman2012" interval datasource.

    Args:
        study_loci_to_annotate (StudyLocus | L2GGoldStandard): The dataset containing study loci that will be used for annotation
        feature_dependency (dict[str, Any]): Keyword arguments unpacked into `common_interval_feature_logic` (it requires an `intervals` keyword)

    Returns:
        DhsPmtrMeanFeature: Feature dataset
    """
    # Wide table: one score column named after the feature.
    wide_scores = common_interval_feature_logic(
        study_loci_to_annotate,
        feature_name=cls.feature_name,
        interval_source="thurman2012",
        **feature_dependency,
    )
    # Reshape to featureName / featureValue pairs keyed by locus and gene.
    long_scores = convert_from_wide_to_long(
        wide_scores,
        id_vars=("studyLocusId", "geneId"),
        var_name="featureName",
        value_name="featureValue",
    )
    return cls(_df=long_scores, _schema=cls.get_schema())


class DhsPmtrMeanNeighbourhoodFeature(L2GFeature):
xyg123 marked this conversation as resolved.
Show resolved Hide resolved
"""Average weighted DHS-promoter correlation from studylocus to gene TSS.
xyg123 marked this conversation as resolved.
Show resolved Hide resolved

Compared to the Mean weighted DHS-promoter correlation for all genes in the vicinity.
"""

fill_na_value = 0 # would be 0 if implemented
xyg123 marked this conversation as resolved.
Show resolved Hide resolved
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same as above

feature_dependency_type = Intervals
feature_name = "dhsPmtrMeanNeighbourhood"

@classmethod
def compute(
    cls: type[DhsPmtrMeanNeighbourhoodFeature],
    study_loci_to_annotate: StudyLocus | L2GGoldStandard,
    feature_dependency: dict[str, Any],
) -> DhsPmtrMeanNeighbourhoodFeature:
    """Build the neighbourhood feature from the "thurman2012" interval datasource.

    Args:
        study_loci_to_annotate (StudyLocus | L2GGoldStandard): The dataset containing study loci that will be used for annotation
        feature_dependency (dict[str, Any]): Keyword arguments unpacked into `common_neighbourhood_interval_feature_logic` (it requires an `intervals` keyword)

    Returns:
        DhsPmtrMeanNeighbourhoodFeature: Feature dataset
    """
    # Wide table produced by the shared neighbourhood logic.
    wide_scores = common_neighbourhood_interval_feature_logic(
        study_loci_to_annotate,
        feature_name=cls.feature_name,
        interval_source="thurman2012",
        **feature_dependency,
    )
    # Reshape to featureName / featureValue pairs keyed by locus and gene.
    long_scores = convert_from_wide_to_long(
        wide_scores,
        id_vars=("studyLocusId", "geneId"),
        var_name="featureName",
        value_name="featureValue",
    )
    return cls(_df=long_scores, _schema=cls.get_schema())
Loading
Loading