From 4e92cbef07488f9d787346632934569aab0d7b31 Mon Sep 17 00:00:00 2001 From: tcezard Date: Wed, 22 Mar 2023 11:46:25 +0000 Subject: [PATCH 1/4] Prevent remapping/clustering when the target genome is from a different species --- eva_submission/eload_ingestion.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/eva_submission/eload_ingestion.py b/eva_submission/eload_ingestion.py index b2d64010..ebf08006 100644 --- a/eva_submission/eload_ingestion.py +++ b/eva_submission/eload_ingestion.py @@ -10,10 +10,13 @@ from ebi_eva_common_pyutils.config import cfg from ebi_eva_common_pyutils.config_utils import get_mongo_uri_for_eva_profile, get_primary_mongo_creds_for_profile, \ get_accession_pg_creds_for_profile, get_count_service_creds_for_profile +from ebi_eva_common_pyutils.ena_utils import get_assembly_name_and_taxonomy_id from ebi_eva_common_pyutils.metadata_utils import resolve_variant_warehouse_db_name, insert_new_assembly_and_taxonomy, \ get_assembly_set_from_metadata from ebi_eva_common_pyutils.pg_utils import get_all_results_for_query, execute_query from ebi_eva_common_pyutils.spring_properties import SpringPropertiesGenerator +from ebi_eva_common_pyutils.taxonomy import taxonomy +from ebi_eva_common_pyutils.taxonomy.taxonomy import get_scientific_name_from_taxonomy from eva_submission import NEXTFLOW_DIR from eva_submission.eload_submission import Eload @@ -90,7 +93,8 @@ def ingest( if 'optional_remap_and_cluster' in tasks: target_assembly = self._get_target_assembly() - if target_assembly: + # EVA-3207: Temporary limitation while we sort out the remapping across species + if target_assembly and self._target_assembly_from_same_taxonomy(target_assembly): self.run_remap_and_cluster_workflow(target_assembly, resume=resume) if do_variant_load or annotation_only: @@ -399,6 +403,15 @@ def _get_target_assembly(self): self.warning(f'Could not find any current supported assembly for {self.taxonomy}, skipping clustering') return None + def _target_assembly_from_same_taxonomy(self, target_assembly): + # Find taxonomy of the target assembly + _, taxonomy_id_from_target = get_assembly_name_and_taxonomy_id(target_assembly) + if int(taxonomy_id_from_target) != int(self.taxonomy): + self.warning(f'Target assembly {target_assembly} is from a different taxonomy {taxonomy_id_from_target} ' + f'compare to the current project {self.taxonomy}') + return False + return True + def create_extraction_properties(self, output_file_path, taxonomy): properties = self.properties_generator.get_remapping_extraction_properties( taxonomy=taxonomy, From 3a8a75b3cc93f0cc46e9aa03164c87db8bb248e4 Mon Sep 17 00:00:00 2001 From: tcezard Date: Wed, 22 Mar 2023 11:47:44 +0000 Subject: [PATCH 2/4] remove unused imports --- eva_submission/eload_ingestion.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/eva_submission/eload_ingestion.py b/eva_submission/eload_ingestion.py index ebf08006..9859daec 100644 --- a/eva_submission/eload_ingestion.py +++ b/eva_submission/eload_ingestion.py @@ -15,8 +15,6 @@ get_assembly_set_from_metadata from ebi_eva_common_pyutils.pg_utils import get_all_results_for_query, execute_query from ebi_eva_common_pyutils.spring_properties import SpringPropertiesGenerator -from ebi_eva_common_pyutils.taxonomy import taxonomy -from ebi_eva_common_pyutils.taxonomy.taxonomy import get_scientific_name_from_taxonomy from eva_submission import NEXTFLOW_DIR from eva_submission.eload_submission import Eload From cccd1be91b29f11bbf2795510632aa2c36ba459c Mon Sep 17 00:00:00 2001 From: tcezard Date: Wed, 22 Mar 2023 12:16:05 +0000 Subject: [PATCH 3/4] Update tests --- tests/test_eload_ingestion.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/test_eload_ingestion.py b/tests/test_eload_ingestion.py index a7a23d10..18172a8b 100644 --- a/tests/test_eload_ingestion.py +++ b/tests/test_eload_ingestion.py @@ -151,11 +151,13 @@ def test_ingest_all_tasks(self): patch('eva_submission.eload_utils.get_all_results_for_query') as m_get_alias_results, \ patch('eva_submission.eload_ingestion.get_vep_and_vep_cache_version') as m_get_vep_versions, \ patch('eva_submission.eload_utils.requests.post') as m_post, \ + patch('eva_submission.eload_ingestion.get_assembly_name_and_taxonomy_id') as m_get_tax, \ self._patch_mongo_database(): m_get_alias_results.return_value = [['alias']] m_get_vep_versions.return_value = (100, 100, 'homo_sapiens') m_post.return_value.text = self.get_mock_result_for_ena_date() m_get_results.side_effect = default_db_results_for_ingestion() + m_get_tax.return_value = ('name', '9090') self.eload.ingest(1) def test_ingest_metadata_load(self): @@ -370,8 +372,10 @@ def test_ingest_clustering(self): with self._patch_metadata_handle(), \ patch('eva_submission.eload_ingestion.get_all_results_for_query') as m_get_results, \ patch('eva_submission.eload_ingestion.command_utils.run_command_with_output', autospec=True) as m_run_command, \ + patch('eva_submission.eload_ingestion.get_assembly_name_and_taxonomy_id') as m_get_tax, \ self._patch_mongo_database(): m_get_results.side_effect = default_db_results_for_clustering() + m_get_tax.return_value = ('name', '9796') self.eload.ingest(tasks=['optional_remap_and_cluster']) assert self.eload.eload_cfg.query('ingestion', 'remap_and_cluster', 'target_assembly') == 'GCA_123' assert m_run_command.call_count == 1 @@ -394,6 +398,7 @@ def test_resume_when_step_fails(self): patch('eva_submission.eload_utils.get_all_results_for_query') as m_get_alias_results, \ patch('eva_submission.eload_ingestion.get_vep_and_vep_cache_version') as m_get_vep_versions, \ patch('eva_submission.eload_utils.requests.post') as m_post, \ + patch('eva_submission.eload_ingestion.get_assembly_name_and_taxonomy_id') as m_get_tax, \ self._patch_mongo_database(): m_get_alias_results.return_value = [['alias']] m_get_vep_versions.return_value = (100, 100, 'homo_sapiens') @@ -408,6 +413,7 @@ def test_resume_when_step_fails(self): None, # clustering None, # variant load ] + m_get_tax.return_value = ('name', '9090') with self.assertRaises(subprocess.CalledProcessError): self.eload.ingest() @@ -425,11 +431,13 @@ def test_resume_completed_job(self): patch('eva_submission.eload_utils.get_all_results_for_query') as m_get_alias_results, \ patch('eva_submission.eload_ingestion.get_vep_and_vep_cache_version') as m_get_vep_versions, \ patch('eva_submission.eload_utils.requests.post') as m_post, \ + patch('eva_submission.eload_ingestion.get_assembly_name_and_taxonomy_id') as m_get_tax, \ self._patch_mongo_database(): m_get_alias_results.return_value = [['alias']] m_get_vep_versions.return_value = (100, 100, 'homo_sapiens') m_post.return_value.text = self.get_mock_result_for_ena_date() m_get_results.side_effect = default_db_results_for_ingestion() + default_db_results_for_ingestion() + m_get_tax.return_value = ('name', '9796') # Resuming with no existing job execution is fine self.eload.ingest(resume=True) From f281109ffbb8125b5d63cf2c229299e27a6bd97c Mon Sep 17 00:00:00 2001 From: Timothee Cezard Date: Wed, 22 Mar 2023 21:15:31 +0000 Subject: [PATCH 4/4] Update eva_submission/eload_ingestion.py Co-authored-by: sundarvenkata-EBI --- eva_submission/eload_ingestion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eva_submission/eload_ingestion.py b/eva_submission/eload_ingestion.py index 9859daec..50703891 100644 --- a/eva_submission/eload_ingestion.py +++ b/eva_submission/eload_ingestion.py @@ -406,7 +406,7 @@ def _target_assembly_from_same_taxonomy(self, target_assembly): _, taxonomy_id_from_target = get_assembly_name_and_taxonomy_id(target_assembly) if int(taxonomy_id_from_target) != int(self.taxonomy): self.warning(f'Target assembly {target_assembly} is from a different taxonomy {taxonomy_id_from_target} ' - f'compare to the current project {self.taxonomy}') + f'compared to the current project {self.taxonomy}. Therefore remapping will not be carried out!') return False return True