v0.5.2 #81

Merged: 5 commits, Mar 7, 2024
pgscatalog_utils/__init__.py (2 changes: 1 addition & 1 deletion)

```diff
@@ -1 +1 @@
-__version__ = '0.4.2'
+__version__ = "0.5.2"
```
pgscatalog_utils/ancestry/ancestry_analysis.py (19 changes: 8 additions & 11 deletions)

```diff
@@ -55,18 +55,15 @@ def ancestry_analysis():
     scorecols = list(pgs.columns)

     ## There should be perfect target sample overlap
-    assert all(
-        [
-            x in pgs.loc["reference"].index
-            for x in reference_df.index.get_level_values(1)
-        ]
-    ), "Error: PGS data missing for reference samples with PCA data."
-    reference_df = pd.merge(reference_df, pgs, left_index=True, right_index=True)
+    assert set(reference_df.index.get_level_values(1)).issubset(pgs.loc["reference"].index),\
+        "Error: PGS data missing for reference samples with PCA data."
+    reference_df = reference_df.sort_index()
+    reference_df = pd.concat([reference_df, pgs.loc[reference_df.index]], axis=1)

-    assert all(
-        [x in pgs.loc[args.d_target].index for x in target_df.index.get_level_values(1)]
-    ), "Error: PGS data missing for reference samples with PCA data."
-    target_df = pd.merge(target_df, pgs, left_index=True, right_index=True)
+    assert set(target_df.index.get_level_values(1)).issubset(pgs.loc[args.d_target].index), \
+        "Error: PGS data missing for target samples with PCA data."
+    target_df = target_df.sort_index()
+    target_df = pd.concat([target_df, pgs.loc[target_df.index]], axis=1)
     del pgs  # clear raw PGS from memory

     # Compare target sample ancestry/PCs to reference panel
```
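Note: the merge-to-concat change keeps sample alignment explicit. Membership is asserted up front instead of letting an inner join silently drop samples, and `pd.concat` joins on the already-validated index. A minimal sketch of the pattern, assuming a (sampleset, IID) MultiIndex like the frames above (toy data, illustrative only):

```python
import pandas as pd

# Toy frames indexed by (sampleset, IID), mirroring the layout in ancestry_analysis
idx = pd.MultiIndex.from_tuples(
    [("reference", "s1"), ("reference", "s2")], names=["sampleset", "IID"]
)
pca_df = pd.DataFrame({"PC1": [0.1, -0.2]}, index=idx)  # PCA projections
pgs = pd.DataFrame({"PGS000001_SUM": [1.5, 0.7]}, index=idx)  # raw score sums

# pd.merge (inner join) would silently drop samples missing from `pgs`;
# asserting subset membership first turns that into a loud failure instead
assert set(pca_df.index.get_level_values(1)).issubset(
    pgs.loc["reference"].index
), "PGS data missing for reference samples with PCA data"

pca_df = pca_df.sort_index()
combined = pd.concat([pca_df, pgs.loc[pca_df.index]], axis=1)  # index-aligned join
print(combined)
```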
pgscatalog_utils/ancestry/tools.py (175 changes: 96 additions & 79 deletions)

```diff
@@ -61,6 +61,7 @@ def compare_ancestry(ref_df: pd.DataFrame, ref_pop_col: str, target_df: pd.DataF
     :param p_threshold: used to define LowConfidence population assignments
     :return: dataframes for reference (predictions on training set) and target (predicted labels) datasets
     """
+    logger.debug("Starting ancestry comparison")
     # Check that datasets have the correct columns
     assert method in comparison_method_threshold.keys(), 'comparison method parameter must be Mahalanobis or RF'
     if method == 'Mahalanobis':
```
```diff
@@ -238,13 +239,19 @@ def pgs_adjust(ref_df, target_df, scorecols: list, ref_pop_col, target_pop_col,
     results_ref = {}
     results_target = {}
     results_models = {}  # used to store regression information
+    scorecols_drop = set()
     for c_pgs in scorecols:
         # Makes melting easier later
         sum_col = 'SUM|{}'.format(c_pgs)
         results_ref[sum_col] = ref_df[c_pgs]
         results_target[sum_col] = target_df[c_pgs]
         results_models = {}
+
+        # Check that PGS has variance (e.g. not all 0)
+        if 0 in [np.var(results_ref[sum_col]), np.var(results_target[sum_col])]:
+            scorecols_drop.add(c_pgs)
+            logger.warning("Skipping adjustment: {} has 0 variance in PGS SUM".format(c_pgs))

     # Report PGS values with respect to distribution of PGS in the most similar reference population
     if 'empirical' in use_method:
         logger.debug("Adjusting PGS using most similar reference population distribution.")
```
```diff
@@ -258,35 +265,36 @@
             z_col = 'Z_MostSimilarPop|{}'.format(c_pgs)
             results_ref[z_col] = pd.Series(index=ref_df.index, dtype='float64')
             results_target[z_col] = pd.Series(index=target_df.index, dtype='float64')

-            r_model = {}
-
-            # Adjust for each population
-            for pop in ref_populations:
-                r_pop = {}
-                i_ref_pop = (ref_df[ref_pop_col] == pop)
-                i_target_pop = (target_df[target_pop_col] == pop)
+            if c_pgs not in scorecols_drop:
+                r_model = {}
+
+                # Adjust for each population
+                for pop in ref_populations:
+                    r_pop = {}
+                    i_ref_pop = (ref_df[ref_pop_col] == pop)
+                    i_target_pop = (target_df[target_pop_col] == pop)

-                # Reference Score Distribution
-                c_pgs_pop_dist = ref_train_df.loc[ref_train_df[ref_pop_col] == pop, c_pgs]
+                    # Reference Score Distribution
+                    c_pgs_pop_dist = ref_train_df.loc[ref_train_df[ref_pop_col] == pop, c_pgs]

-                # Calculate Percentile
-                results_ref[percentile_col].loc[i_ref_pop] = percentileofscore(c_pgs_pop_dist, ref_df.loc[i_ref_pop, c_pgs])
-                results_target[percentile_col].loc[i_target_pop] = percentileofscore(c_pgs_pop_dist, target_df.loc[i_target_pop, c_pgs])
-                r_pop['percentiles'] = np.percentile(c_pgs_pop_dist, range(0,101,1))
+                    # Calculate Percentile
+                    results_ref[percentile_col].loc[i_ref_pop] = percentileofscore(c_pgs_pop_dist, ref_df.loc[i_ref_pop, c_pgs])
+                    results_target[percentile_col].loc[i_target_pop] = percentileofscore(c_pgs_pop_dist, target_df.loc[i_target_pop, c_pgs])
+                    r_pop['percentiles'] = np.percentile(c_pgs_pop_dist, range(0,101,1))

-                # Calculate Z
-                r_pop['mean'] = c_pgs_pop_dist.mean()
-                r_pop['std'] = c_pgs_pop_dist.std(ddof=0)
+                    # Calculate Z
+                    r_pop['mean'] = c_pgs_pop_dist.mean()
+                    r_pop['std'] = c_pgs_pop_dist.std(ddof=0)

-                results_ref[z_col].loc[i_ref_pop] = (ref_df.loc[i_ref_pop, c_pgs] - r_pop['mean'])/r_pop['std']
-                results_target[z_col].loc[i_target_pop] = (target_df.loc[i_target_pop, c_pgs] - r_pop['mean'])/r_pop['std']
+                    results_ref[z_col].loc[i_ref_pop] = (ref_df.loc[i_ref_pop, c_pgs] - r_pop['mean'])/r_pop['std']
+                    results_target[z_col].loc[i_target_pop] = (target_df.loc[i_target_pop, c_pgs] - r_pop['mean'])/r_pop['std']

-                r_model[pop] = r_pop
+                    r_model[pop] = r_pop

-            results_models['dist_empirical'][c_pgs] = r_model
-            # ToDo: explore handling of individuals who have low-confidence population labels
-            #  -> Possible Soln: weighted average based on probabilities? Small Mahalanobis P-values will complicate this
+                results_models['dist_empirical'][c_pgs] = r_model
+                # ToDo: explore handling of individuals who have low-confidence population labels
+                #  -> Possible Soln: weighted average based on probabilities? Small Mahalanobis P-values will complicate this
     # PCA-based adjustment
     if any([x in use_method for x in ['mean', 'mean+var']]):
         logger.debug("Adjusting PGS using PCA projections")
```
```diff
@@ -312,63 +320,72 @@
             target_norm[pc_col] = (target_norm[pc_col] - pc_mean) / pc_std

         for c_pgs in scorecols:
-            results_models['adjust_pcs']['PGS'][c_pgs] = {}
-            if norm_centerpgs:
-                pgs_mean = ref_train_df[c_pgs].mean()
-                ref_train_df[c_pgs] = (ref_train_df[c_pgs] - pgs_mean)
-                ref_norm[c_pgs] = (ref_norm[c_pgs] - pgs_mean)
-                target_norm[c_pgs] = (target_norm[c_pgs] - pgs_mean)
-                results_models['adjust_pcs']['PGS'][c_pgs]['pgs_offset'] = pgs_mean
-
-            # Method 1 (Khera et al. Circulation (2019): normalize mean (doi:10.1161/CIRCULATIONAHA.118.035658)
-            adj_col = 'Z_norm1|{}'.format(c_pgs)
-            # Fit to Reference Data
-            pcs2pgs_fit = LinearRegression().fit(ref_train_df[cols_pcs], ref_train_df[c_pgs])
-            ref_train_pgs_pred = pcs2pgs_fit.predict(ref_train_df[cols_pcs])
-            ref_train_pgs_resid = ref_train_df[c_pgs] - ref_train_pgs_pred
-            ref_train_pgs_resid_mean = ref_train_pgs_resid.mean()
-            ref_train_pgs_resid_std = ref_train_pgs_resid.std(ddof=0)
-
-            ref_pgs_resid = ref_norm[c_pgs] - pcs2pgs_fit.predict(ref_norm[cols_pcs])
-            results_ref[adj_col] = ref_pgs_resid / ref_train_pgs_resid_std
-            # Apply to Target Data
-            target_pgs_pred = pcs2pgs_fit.predict(target_norm[cols_pcs])
-            target_pgs_resid = target_norm[c_pgs] - target_pgs_pred
-            results_target[adj_col] = target_pgs_resid / ref_train_pgs_resid_std
-            results_models['adjust_pcs']['PGS'][c_pgs]['Z_norm1'] = package_skl_regression(pcs2pgs_fit)
-
-            if 'mean+var' in use_method:
-                # Method 2 (Khan et al. Nature Medicine (2022)): normalize variance (doi:10.1038/s41591-022-01869-1)
-                # Normalize based on residual deviation from mean of the distribution [equalize population sds]
-                # (e.g. reduce the correlation between genetic ancestry and how far away you are from the mean)
-                # USE gamma distribution for predicted variance to constrain it to be positive (b/c using linear
-                # regression we can get negative predictions for the sd)
-                adj_col = 'Z_norm2|{}'.format(c_pgs)
-                pcs2var_fit_gamma = GammaRegressor(max_iter=1000).fit(ref_train_df[cols_pcs], (
-                        ref_train_pgs_resid - ref_train_pgs_resid_mean) ** 2)
-                if norm2_2step:
-                    # Return 2-step adjustment
-                    results_ref[adj_col] = ref_pgs_resid / np.sqrt(pcs2var_fit_gamma.predict(ref_norm[cols_pcs]))
-                    results_target[adj_col] = target_pgs_resid / np.sqrt(
-                        pcs2var_fit_gamma.predict(target_norm[cols_pcs]))
-                    results_models['adjust_pcs']['PGS'][c_pgs]['Z_norm2'] = package_skl_regression(pcs2var_fit_gamma)
-                else:
-                    # Return full-likelihood adjustment model
-                    # This jointly re-fits the regression parameters from the mean and variance prediction to better
-                    # fit the observed PGS distribution. It seems to mostly change the intercepts. This implementation is
-                    # adapted from https://github.com/broadinstitute/palantir-workflows/blob/v0.14/ImputationPipeline/ScoringTasks.wdl,
-                    # which is distributed under a BDS-3 license.
-                    params_initial = np.concatenate([[pcs2pgs_fit.intercept_], pcs2pgs_fit.coef_,
-                                                     [pcs2var_fit_gamma.intercept_], pcs2var_fit_gamma.coef_])
-                    pcs2full_fit = fullLL_fit(df_score=ref_train_df, scorecol=c_pgs,
-                                              predictors=cols_pcs, initial_params=params_initial)
-
-                    results_ref[adj_col] = fullLL_adjust(pcs2full_fit, ref_norm, c_pgs)
-                    results_target[adj_col] = fullLL_adjust(pcs2full_fit, target_norm, c_pgs)
-
-                    if pcs2full_fit['params']['success'] is False:
-                        logger.warning("{} full-likelihood: {} {}".format(c_pgs, pcs2full_fit['params']['status'], pcs2full_fit['params']['message']))
-                    results_models['adjust_pcs']['PGS'][c_pgs]['Z_norm2'] = pcs2full_fit
+            if c_pgs in scorecols_drop:
+                # fill the output with NAs
+                adj_cols = ['Z_norm1|{}'.format(c_pgs)]
+                if 'mean+var' in use_method:
+                    adj_cols.append('Z_norm2|{}'.format(c_pgs))
+                for adj_col in adj_cols:
+                    results_ref[adj_col] = pd.Series(index=ref_df.index, dtype='float64')  # fill na
+                    results_target[adj_col] = pd.Series(index=target_df.index, dtype='float64')  # fill na
+            else:
+                results_models['adjust_pcs']['PGS'][c_pgs] = {}
+                if norm_centerpgs:
+                    pgs_mean = ref_train_df[c_pgs].mean()
+                    ref_train_df[c_pgs] = (ref_train_df[c_pgs] - pgs_mean)
+                    ref_norm[c_pgs] = (ref_norm[c_pgs] - pgs_mean)
+                    target_norm[c_pgs] = (target_norm[c_pgs] - pgs_mean)
+                    results_models['adjust_pcs']['PGS'][c_pgs]['pgs_offset'] = pgs_mean
+
+                # Method 1 (Khera et al. Circulation (2019): normalize mean (doi:10.1161/CIRCULATIONAHA.118.035658)
+                adj_col = 'Z_norm1|{}'.format(c_pgs)
+                # Fit to Reference Data
+                pcs2pgs_fit = LinearRegression().fit(ref_train_df[cols_pcs], ref_train_df[c_pgs])
+                ref_train_pgs_pred = pcs2pgs_fit.predict(ref_train_df[cols_pcs])
+                ref_train_pgs_resid = ref_train_df[c_pgs] - ref_train_pgs_pred
+                ref_train_pgs_resid_mean = ref_train_pgs_resid.mean()
+                ref_train_pgs_resid_std = ref_train_pgs_resid.std(ddof=0)
+
+                ref_pgs_resid = ref_norm[c_pgs] - pcs2pgs_fit.predict(ref_norm[cols_pcs])
+                results_ref[adj_col] = ref_pgs_resid / ref_train_pgs_resid_std
+                # Apply to Target Data
+                target_pgs_pred = pcs2pgs_fit.predict(target_norm[cols_pcs])
+                target_pgs_resid = target_norm[c_pgs] - target_pgs_pred
+                results_target[adj_col] = target_pgs_resid / ref_train_pgs_resid_std
+                results_models['adjust_pcs']['PGS'][c_pgs]['Z_norm1'] = package_skl_regression(pcs2pgs_fit)
+
+                if 'mean+var' in use_method:
+                    # Method 2 (Khan et al. Nature Medicine (2022)): normalize variance (doi:10.1038/s41591-022-01869-1)
+                    # Normalize based on residual deviation from mean of the distribution [equalize population sds]
+                    # (e.g. reduce the correlation between genetic ancestry and how far away you are from the mean)
+                    # USE gamma distribution for predicted variance to constrain it to be positive (b/c using linear
+                    # regression we can get negative predictions for the sd)
+                    adj_col = 'Z_norm2|{}'.format(c_pgs)
+                    pcs2var_fit_gamma = GammaRegressor(max_iter=1000).fit(ref_train_df[cols_pcs], (
+                            ref_train_pgs_resid - ref_train_pgs_resid_mean) ** 2)
+                    if norm2_2step:
+                        # Return 2-step adjustment
+                        results_ref[adj_col] = ref_pgs_resid / np.sqrt(pcs2var_fit_gamma.predict(ref_norm[cols_pcs]))
+                        results_target[adj_col] = target_pgs_resid / np.sqrt(
+                            pcs2var_fit_gamma.predict(target_norm[cols_pcs]))
+                        results_models['adjust_pcs']['PGS'][c_pgs]['Z_norm2'] = package_skl_regression(pcs2var_fit_gamma)
+                    else:
+                        # Return full-likelihood adjustment model
+                        # This jointly re-fits the regression parameters from the mean and variance prediction to better
+                        # fit the observed PGS distribution. It seems to mostly change the intercepts. This implementation is
+                        # adapted from https://github.com/broadinstitute/palantir-workflows/blob/v0.14/ImputationPipeline/ScoringTasks.wdl,
+                        # which is distributed under a BDS-3 license.
+                        params_initial = np.concatenate([[pcs2pgs_fit.intercept_], pcs2pgs_fit.coef_,
+                                                         [pcs2var_fit_gamma.intercept_], pcs2var_fit_gamma.coef_])
+                        pcs2full_fit = fullLL_fit(df_score=ref_train_df, scorecol=c_pgs,
+                                                  predictors=cols_pcs, initial_params=params_initial)
+
+                        results_ref[adj_col] = fullLL_adjust(pcs2full_fit, ref_norm, c_pgs)
+                        results_target[adj_col] = fullLL_adjust(pcs2full_fit, target_norm, c_pgs)
+
+                        if pcs2full_fit['params']['success'] is False:
+                            logger.warning("{} full-likelihood: {} {}".format(c_pgs, pcs2full_fit['params']['status'], pcs2full_fit['params']['message']))
+                        results_models['adjust_pcs']['PGS'][c_pgs]['Z_norm2'] = pcs2full_fit
     # Only return results
     logger.debug("Outputting adjusted PGS & models")
     results_ref = pd.DataFrame(results_ref)
```
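Note: the Z_norm1 branch is the Khera et al. (2019) mean adjustment: regress the score on the reference PCs, then report each individual's residual scaled by the training residual SD. Z_norm2 additionally models the residual variance as a function of the PCs (via GammaRegressor, or the full-likelihood re-fit). A standalone sketch of just the Z_norm1 step, with a hypothetical helper name rather than the package API:

```python
import pandas as pd
from sklearn.linear_model import LinearRegression

def znorm1(ref_train: pd.DataFrame, df: pd.DataFrame, pcs: list, score: str) -> pd.Series:
    """Regress `score` on `pcs` in the training set; return PC-adjusted Z-scores for `df`."""
    fit = LinearRegression().fit(ref_train[pcs], ref_train[score])
    train_resid = ref_train[score] - fit.predict(ref_train[pcs])
    resid_std = train_resid.std(ddof=0)  # population SD, matching the code above
    return (df[score] - fit.predict(df[pcs])) / resid_std

# e.g. znorm1(ref_train_df, target_norm, ['PC1', 'PC2'], 'SUM|PGS000001')
```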
pgscatalog_utils/scorefile/qc.py (6 changes: 3 additions & 3 deletions)

```diff
@@ -137,11 +137,11 @@ def assign_effect_type(
 ) -> typing.Generator[ScoreVariant, None, None]:
     for variant in variants:
         match (variant.is_recessive, variant.is_dominant):
-            case (None, None) | ("FALSE", "FALSE"):
+            case (None, None) | (False, False):
                 pass  # default value is additive, pass to break match and yield
-            case ("FALSE", "TRUE"):
+            case (False, True):
                 variant.effect_type = EffectType.DOMINANT
-            case ("TRUE", "FALSE"):
+            case (True, False):
                 variant.effect_type = EffectType.RECESSIVE
             case _:
                 logger.critical(f"Bad effect type setting: {variant}")
```
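Note: this fix pairs with the scorevariant.py change below. Once is_recessive/is_dominant arrive as real booleans (or None), the old string patterns like ("FALSE", "TRUE") can never match, so every flagged variant would have fallen through to the error case. A minimal reproduction of the corrected dispatch, with a stand-in EffectType enum rather than the package's:

```python
from enum import Enum
from typing import Optional

class EffectType(Enum):  # stand-in for pgscatalog_utils' EffectType
    ADDITIVE = "additive"
    DOMINANT = "dominant"
    RECESSIVE = "recessive"

def effect_type(is_recessive: Optional[bool], is_dominant: Optional[bool]) -> EffectType:
    match (is_recessive, is_dominant):
        case (None, None) | (False, False):
            return EffectType.ADDITIVE  # default
        case (False, True):
            return EffectType.DOMINANT
        case (True, False):
            return EffectType.RECESSIVE
        case _:
            raise ValueError("bad effect type setting")

assert effect_type(None, None) is EffectType.ADDITIVE
assert effect_type(False, True) is EffectType.DOMINANT
# string inputs such as ("FALSE", "TRUE") now fall through to the error case
```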
pgscatalog_utils/scorefile/scorevariant.py (10 changes: 8 additions & 2 deletions)

```diff
@@ -106,8 +106,14 @@ def __init__(

         self.hm_inferOtherAllele: Optional[str] = hm_inferOtherAllele
         self.hm_source: Optional[str] = hm_source
-        self.is_dominant: Optional[bool] = is_dominant
-        self.is_recessive: Optional[bool] = is_recessive
+
+        self.is_dominant: Optional[bool] = (
+            is_dominant == "True" if is_dominant is not None else None
+        )
+        self.is_recessive: Optional[bool] = (
+            is_recessive == "True" if is_recessive is not None else None
+        )
+
         self.hm_rsID: Optional[str] = hm_rsID
         self.hm_match_chr: Optional[str] = hm_match_chr
         self.hm_match_pos: Optional[str] = hm_match_pos
```
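Note: the constructor now coerces the raw text fields from scoring files into booleans at parse time, preserving None for missing values. The comparison maps the literal string "True" to True and any other string to False. A focused sketch of that coercion:

```python
from typing import Optional

def parse_flag(value: Optional[str]) -> Optional[bool]:
    """Mirror of the constructor logic: None stays None, 'True' -> True, anything else -> False."""
    return value == "True" if value is not None else None

assert parse_flag(None) is None
assert parse_flag("True") is True
assert parse_flag("False") is False
assert parse_flag("TRUE") is False  # case-sensitive: only the exact string "True" is truthy
```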
pyproject.toml (2 changes: 1 addition & 1 deletion)

```diff
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "pgscatalog_utils"
-version = "0.5.1"
+version = "0.5.2"
 description = "Utilities for working with PGS Catalog API and scoring files"
 homepage = "https://github.com/PGScatalog/pgscatalog_utils"
 authors = ["Benjamin Wingfield <[email protected]>", "Samuel Lambert <[email protected]>", "Laurent Gil <[email protected]>"]
```