From 1d1b9bffe5f4113810fb40d97b3a59b03b8d348a Mon Sep 17 00:00:00 2001
From: Anna Cuomo <annacuomo92@gmail.com>
Date: Wed, 25 Sep 2024 09:59:40 +1000
Subject: [PATCH] Address Katie's get_anndata.py issue (#73) (#145)

* note on missing sex value

* lowly expressed genes comment in readme

* make notes unique

* proper headings

* move fill sex age to sample cov script

* comment out from get anndata

* remove list type

* delete commented out lines
---
 README.md                | 19 ++++++++++++++-----
 get_anndata.py           | 11 +----------
 get_sample_covariates.py | 24 ++++++++++++++++++------
 3 files changed, 33 insertions(+), 21 deletions(-)

diff --git a/README.md b/README.md
index 2a1b1929..58ba76f6 100644
--- a/README.md
+++ b/README.md
@@ -41,7 +41,9 @@ Outputs:
 * VCF files containing all retained rare variants (one per chromosome) + corresponding index files (`.csi`)
 * plink object for only 2,000 variants (minor allele count > 20), after LD pruning - this is for the estimation of the variance ratio (VRE plink files)
 
-Notes: SAIGE-QTL allows numeric chromosomes only, so both the .bim and the VCF files are modified in this script to remove the 'chr' notation (so that e.g., 'chr1' becomes '1').
+### Notes (get_genotype_vcf.py)
+
+SAIGE-QTL allows numeric chromosomes only, so both the .bim and the VCF files are modified in this script to remove the 'chr' notation (so that e.g., 'chr1' becomes '1').
 
 ## Get sample covariates
 
@@ -57,7 +59,9 @@ Outputs:
 
 * TSV sample covariate file (one per cohort)
 
-Notes: option to fill in missing values for sex (0, where 1 is male, 2 is female) and age (average age across the cohort).
+### Notes (get_sample_covariates.py)
+
+There is the option to fill in missing values for sex (0 for unknown, where 1 is male, 2 is female) and age (average age across the cohort).
 Additionally, add a user-specified (default: 10) number of permuted IDs, where the individual ID is permuted at random, to assess calibration (by shuffling the individual IDs we disrupt any real association between genotype and phenotype, so we expect no significant associations left when testing).
 
 ## Gene expression preprocessing
@@ -75,9 +79,12 @@ Outputs:
 * TSV phenotype covariate files (one per gene, cell type)
 * TSV gene cis window file (one per gene)
 
-Notes: as before, we remove 'chr' from the chromosome name in the gene cis window file.
+### Notes (get_anndata.py)
+
+As in `get_genotype_vcf.py`, we remove 'chr' from the chromosome name in the gene cis window file.
 Additionally, we turn hyphens ('-') into underscores ('_') in the gene names.
 Both the AnnData objects and cell covariate files are generated on Garvan's HPC and copied over to GCP.
+A note that the `filter_lowly_expressed_genes` method will remove lowly-expressed genes that will not even get tested, which should be kept in mind when interpreting the results.
 
 ## Make group file
 
@@ -92,7 +99,9 @@ Outputs
 
 * group files (one per gene)
 
-Notes: option to include no weights or to compute weights that reflect the distance of each variant from the gene's transcription start site (`dTSS`).
+### Notes (make_group_file.py)
+
+Option to include no weights or to compute weights that reflect the distance of each variant from the gene's transcription start site (`dTSS`).
 Using one of the flags below it is possible to additionally test using equal weights.
 We use no annotations for now (set to `null`).
 
@@ -134,7 +143,7 @@ Outputs:
 * if set to true, single-variant test raw p-values for all variants in the group also (one per gene, cell type)
 * set-based association summary statistics (gene-level p-values summarised, one per cell type)
 
-### SAIGE-QTL parameters explanation
+## SAIGE-QTL parameters explanation
 
 Clarifying the reasoning behind the parameters / flags used to run SAIGE-QTL.
 Most of these are (or will be) included in the official [documentation](https://weizhou0.github.io/SAIGE-QTL-doc/).
diff --git a/get_anndata.py b/get_anndata.py
index e96dfd71..e355772f 100644
--- a/get_anndata.py
+++ b/get_anndata.py
@@ -109,8 +109,6 @@ def make_pheno_cov(
     sample_covs_file: str,
     celltype_covs_file: str,
     out_path: str,
-    fill_in_sex: bool = False,
-    fill_in_age: bool = False,
 ):
     """
     Combine expression and covariates into a single file
@@ -138,9 +136,6 @@ def make_pheno_cov(
     celltype_covs_df = pd.read_csv(celltype_covs_file, index_col=0)
     logging.info('cell covariate file opened')
 
-    # determine average age to fill in later
-    if fill_in_age:
-        mean_age = sample_covs_df['age'].mean()
     cell_ind_df = expression_adata.obs.loc[
         :, ['cell', 'individual', 'total_counts', 'sequencing_library', 'cohort']
     ]
@@ -154,11 +149,6 @@ def make_pheno_cov(
         sample_covs_df, on='individual', how='inner'
     )
     sample_covs_cells_df.index = sample_covs_cells_df['cell']
-    # fill missing values for sex and age
-    if fill_in_sex:
-        sample_covs_cells_df['sex'] = sample_covs_cells_df['sex'].fillna(0)
-    if fill_in_age:
-        sample_covs_cells_df['age'] = sample_covs_cells_df['age'].fillna(mean_age)
     # drop rows with missing values (SAIGE throws an error otherwise:  https://batch.hail.populationgenomics.org.au/batches/435978/jobs/91)
     sample_covs_cells_df = sample_covs_cells_df.dropna()
     gene_adata = expression_adata[:, expression_adata.var.index == gene]
@@ -167,6 +157,7 @@ def make_pheno_cov(
         data=gene_adata.X.todense(), index=gene_adata.obs.index, columns=[gene_name]
     )
     # move index (barcode) into a 'cell' column and reset the index - required prior to merging
+    # TO DO adjust when final data comes (see issue #97)
     expr_df['cell'] = expr_df.index
     expr_df = expr_df.reset_index(drop=True)
     sample_covs_cells_df = sample_covs_cells_df.reset_index(drop=True)
diff --git a/get_sample_covariates.py b/get_sample_covariates.py
index 6d244a21..3497774a 100644
--- a/get_sample_covariates.py
+++ b/get_sample_covariates.py
@@ -79,13 +79,17 @@
 @click.option('--vds-version', help=' e.g. 1-0 ')
 @click.option('--project-names', default='tob-wgs,bioheart')
 @click.option('--number-of-sample-perms', default=10)
+@click.option('--fill-in-sex', default=False)
+@click.option('--fill-in-age', default=False)
 @click.command()
 def main(
-    tob_sex_file_path,
-    bioheart_sex_file_path,
-    vds_version,
-    project_names,
-    number_of_sample_perms,
+    tob_sex_file_path: str,
+    bioheart_sex_file_path: str,
+    vds_version: str,
+    project_names: str,
+    number_of_sample_perms: int,
+    fill_in_sex: bool,
+    fill_in_age: bool,
 ):
     """
     Get sex, age and genotype PCs for TOB and BioHEART individuals
@@ -112,14 +116,18 @@ def main(
     # rename s as sample id to match bioheart file
     tob_meta['sample_id'] = tob_meta['new_CPG_id']
     tob_sex = tob_meta.loc[:, ["sample_id", "sex"]]
+
     # extract sex for BioHEART
     bioheart_sex = bioheart_meta.loc[:, ["sample_id", "sex"]]
     # combine_info
     sex_df = pd.concat([tob_sex, bioheart_sex], axis=0)
+    # add sex as unknown (0) if missing
+    if fill_in_sex:
+        sex_df['sex'] = sex_df['sex'].fillna(0)
 
     # age
     # create a list from dictionary to populate
-    age_dict_list: list(dict) = []
+    age_dict_list = []
     # loop over projects (tob-wgs, bioheart)
     for project_name in project_names.split(','):
         query_vars = {'project_name': project_name}
@@ -134,6 +142,10 @@ def main(
                 age = 'NA'
             age_dict_list.append({'sample_id': cpg_id, 'age': age})
     age_df = pd.DataFrame(age_dict_list)
+    # add average age if missing
+    if fill_in_age:
+        mean_age = age_df['age'].mean()
+        age_df['age'].fillna(mean_age)
 
     # genotype PCs
     pcs_ht_path = dataset_path(f'large_cohort/{vds_version}/ancestry/scores.ht')