Skip to content

Commit

Permalink
Merge pull request #140 from tcezard/EVA-3207_limit_clustering
Browse files Browse the repository at this point in the history
EVA-3207 - Prevent remapping/clustering when the target genome is from a different species
  • Loading branch information
tcezard authored Mar 22, 2023
2 parents 546e52c + f281109 commit 6674963
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 1 deletion.
13 changes: 12 additions & 1 deletion eva_submission/eload_ingestion.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from ebi_eva_common_pyutils.config import cfg
from ebi_eva_common_pyutils.config_utils import get_mongo_uri_for_eva_profile, get_primary_mongo_creds_for_profile, \
get_accession_pg_creds_for_profile, get_count_service_creds_for_profile
from ebi_eva_common_pyutils.ena_utils import get_assembly_name_and_taxonomy_id
from ebi_eva_common_pyutils.metadata_utils import resolve_variant_warehouse_db_name, insert_new_assembly_and_taxonomy, \
get_assembly_set_from_metadata
from ebi_eva_common_pyutils.pg_utils import get_all_results_for_query, execute_query
Expand Down Expand Up @@ -90,7 +91,8 @@ def ingest(

if 'optional_remap_and_cluster' in tasks:
target_assembly = self._get_target_assembly()
if target_assembly:
# EVA-3207: Temporary limitation while we sort out the remapping across species
if target_assembly and self._target_assembly_from_same_taxonomy(target_assembly):
self.run_remap_and_cluster_workflow(target_assembly, resume=resume)

if do_variant_load or annotation_only:
Expand Down Expand Up @@ -399,6 +401,15 @@ def _get_target_assembly(self):
self.warning(f'Could not find any current supported assembly for {self.taxonomy}, skipping clustering')
return None

def _target_assembly_from_same_taxonomy(self, target_assembly):
# Find taxonomy of the target assembly
_, taxonomy_id_from_target = get_assembly_name_and_taxonomy_id(target_assembly)
if int(taxonomy_id_from_target) != int(self.taxonomy):
self.warning(f'Target assembly {target_assembly} is from a different taxonomy {taxonomy_id_from_target} '
f'compared to the current project {self.taxonomy}. Therefore remapping will not be carried out!')
return False
return True

def create_extraction_properties(self, output_file_path, taxonomy):
properties = self.properties_generator.get_remapping_extraction_properties(
taxonomy=taxonomy,
Expand Down
8 changes: 8 additions & 0 deletions tests/test_eload_ingestion.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,11 +151,13 @@ def test_ingest_all_tasks(self):
patch('eva_submission.eload_utils.get_all_results_for_query') as m_get_alias_results, \
patch('eva_submission.eload_ingestion.get_vep_and_vep_cache_version') as m_get_vep_versions, \
patch('eva_submission.eload_utils.requests.post') as m_post, \
patch('eva_submission.eload_ingestion.get_assembly_name_and_taxonomy_id') as m_get_tax, \
self._patch_mongo_database():
m_get_alias_results.return_value = [['alias']]
m_get_vep_versions.return_value = (100, 100, 'homo_sapiens')
m_post.return_value.text = self.get_mock_result_for_ena_date()
m_get_results.side_effect = default_db_results_for_ingestion()
m_get_tax.return_value = ('name', '9090')
self.eload.ingest(1)

def test_ingest_metadata_load(self):
Expand Down Expand Up @@ -370,8 +372,10 @@ def test_ingest_clustering(self):
with self._patch_metadata_handle(), \
patch('eva_submission.eload_ingestion.get_all_results_for_query') as m_get_results, \
patch('eva_submission.eload_ingestion.command_utils.run_command_with_output', autospec=True) as m_run_command, \
patch('eva_submission.eload_ingestion.get_assembly_name_and_taxonomy_id') as m_get_tax, \
self._patch_mongo_database():
m_get_results.side_effect = default_db_results_for_clustering()
m_get_tax.return_value = ('name', '9796')
self.eload.ingest(tasks=['optional_remap_and_cluster'])
assert self.eload.eload_cfg.query('ingestion', 'remap_and_cluster', 'target_assembly') == 'GCA_123'
assert m_run_command.call_count == 1
Expand All @@ -394,6 +398,7 @@ def test_resume_when_step_fails(self):
patch('eva_submission.eload_utils.get_all_results_for_query') as m_get_alias_results, \
patch('eva_submission.eload_ingestion.get_vep_and_vep_cache_version') as m_get_vep_versions, \
patch('eva_submission.eload_utils.requests.post') as m_post, \
patch('eva_submission.eload_ingestion.get_assembly_name_and_taxonomy_id') as m_get_tax, \
self._patch_mongo_database():
m_get_alias_results.return_value = [['alias']]
m_get_vep_versions.return_value = (100, 100, 'homo_sapiens')
Expand All @@ -408,6 +413,7 @@ def test_resume_when_step_fails(self):
None, # clustering
None, # variant load
]
m_get_tax.return_value = ('name', '9090')

with self.assertRaises(subprocess.CalledProcessError):
self.eload.ingest()
Expand All @@ -425,11 +431,13 @@ def test_resume_completed_job(self):
patch('eva_submission.eload_utils.get_all_results_for_query') as m_get_alias_results, \
patch('eva_submission.eload_ingestion.get_vep_and_vep_cache_version') as m_get_vep_versions, \
patch('eva_submission.eload_utils.requests.post') as m_post, \
patch('eva_submission.eload_ingestion.get_assembly_name_and_taxonomy_id') as m_get_tax, \
self._patch_mongo_database():
m_get_alias_results.return_value = [['alias']]
m_get_vep_versions.return_value = (100, 100, 'homo_sapiens')
m_post.return_value.text = self.get_mock_result_for_ena_date()
m_get_results.side_effect = default_db_results_for_ingestion() + default_db_results_for_ingestion()
m_get_tax.return_value = ('name', '9796')

# Resuming with no existing job execution is fine
self.eload.ingest(resume=True)
Expand Down

0 comments on commit 6674963

Please sign in to comment.