-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #30 from apriltuesday/EVA-2386
EVA-2386 - Automate backlog processing
- Loading branch information
Showing
11 changed files
with
335 additions
and
64 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
#!/usr/bin/env python | ||
|
||
# Copyright 2021 EMBL - European Bioinformatics Institute | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
import logging
import os
import sys
from argparse import ArgumentParser

from ebi_eva_common_pyutils.logger import logging_config as log_cfg

# Put the parent of this script's directory on the import path so the
# eva_submission package can be imported when the script is run straight from
# a checkout. This has to happen BEFORE any eva_submission import, otherwise
# the first such import fails when the package is not installed.
sys.path.append(os.path.dirname(os.path.dirname(__file__)))
from eva_submission.eload_backlog import EloadBacklog
from eva_submission.eload_validation import EloadValidation
from eva_submission.submission_config import load_config

logger = log_cfg.get_logger(__name__)
|
||
|
||
def main():
    """Prepare a backlog study for processing and validate its VCFs.

    Reads ``--eload`` (required integer) and ``--debug`` (flag) from the
    command line, loads the submission config from its default location, fills
    in the ELOAD config through ``EloadBacklog`` and finally runs the
    ``assembly_check`` and ``vcf_check`` validation tasks.
    """
    parser = ArgumentParser(description='Prepare to process backlog study and validate VCFs.')
    parser.add_argument('--eload', required=True, type=int, help='The ELOAD number for this submission')
    parser.add_argument(
        '--debug',
        action='store_true',
        help='Set the script to output logging information at debug level',
    )
    options = parser.parse_args()

    # Always log to stdout; only raise verbosity when explicitly requested.
    log_cfg.add_stdout_handler()
    if options.debug:
        log_cfg.set_log_level(logging.DEBUG)

    # Load the config_file from default location
    load_config()

    eload_number = options.eload

    # Step 1: populate the ELOAD config so the later pipeline stages can run.
    backlog = EloadBacklog(eload_number)
    backlog.fill_in_config()

    # Step 2: validate the VCFs that step 1 registered in the config.
    validator = EloadValidation(eload_number)
    validator.validate(['assembly_check', 'vcf_check'])

    logger.info('Preparation complete, if files are valid please run ingestion as normal.')


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,119 @@ | ||
import os | ||
from xml.etree import ElementTree as ET | ||
|
||
from cached_property import cached_property | ||
from ebi_eva_common_pyutils.config import cfg | ||
from ebi_eva_common_pyutils.pg_utils import get_all_results_for_query | ||
import requests | ||
from requests.auth import HTTPBasicAuth | ||
|
||
from eva_submission.eload_submission import Eload | ||
from eva_submission.eload_utils import get_metadata_conn, get_genome_fasta_and_report | ||
|
||
|
||
class EloadBacklog(Eload):
    """Fills in an ELOAD config for a backlog study from the metadata DB and ENA.

    The attributes ``eload_num``, ``eload``, ``eload_cfg`` and the helpers
    ``_get_dir`` and ``error`` are not defined here; presumably they are
    inherited from ``Eload`` (verify against that class).
    """

    def fill_in_config(self):
        """Fills in config params from metadata DB and ENA, enabling later parts of pipeline to run."""
        self.eload_cfg.set('brokering', 'ena', 'PROJECT', value=self.project_accession)
        self.get_analysis_info()
        self.get_species_info()
        self.get_hold_date()
        self.eload_cfg.write()

    @cached_property
    def project_accession(self):
        """Project accession linked to this ELOAD in ``evapro.project_eva_submission``.

        Raises:
            ValueError: if the query does not return exactly one row.
        """
        with get_metadata_conn() as conn:
            query = f"select project_accession from evapro.project_eva_submission where eload_id={self.eload_num};"
            rows = get_all_results_for_query(conn, query)
        if len(rows) != 1:
            raise ValueError(f'No project accession for {self.eload} found in metadata DB.')
        return rows[0][0]

    @cached_property
    def project_alias(self):
        """Alias of the project as stored in ``evapro.project``.

        Raises:
            ValueError: if the query does not return exactly one row.
        """
        with get_metadata_conn() as conn:
            # The accession is a string, so it must be quoted as a SQL literal
            # (as the other queries in this class already do).
            query = f"select alias from evapro.project where project_accession='{self.project_accession}';"
            rows = get_all_results_for_query(conn, query)
        if len(rows) != 1:
            raise ValueError(f'No project alias for {self.project_accession} found in metadata DB.')
        return rows[0][0]

    def get_species_info(self):
        """Adds species info into the config: taxonomy id and scientific name,
        and assembly accession, fasta, and report.

        Raises:
            ValueError: if the metadata DB does not return exactly one
                taxonomy/assembly row for the project.
        """
        with get_metadata_conn() as conn:
            query = f"select a.taxonomy_id, b.scientific_name, d.assembly_accession " \
                    f"from project_taxonomy a " \
                    f"join taxonomy b on a.taxonomy_id=b.taxonomy_id " \
                    f"join assembly_set c on b.taxonomy_id=c.taxonomy_id " \
                    f"join accessioned_assembly d on c.assembly_set_id=d.assembly_set_id " \
                    f"where a.project_accession='{self.project_accession}';"
            rows = get_all_results_for_query(conn, query)
        if len(rows) != 1:
            raise ValueError(f'No taxonomy for {self.project_accession} found in metadata DB.')
        tax_id, sci_name, asm_accession = rows[0]
        self.eload_cfg.set('submission', 'taxonomy_id', value=tax_id)
        self.eload_cfg.set('submission', 'scientific_name', value=sci_name)
        self.eload_cfg.set('submission', 'assembly_accession', value=asm_accession)

        fasta_path, report_path = get_genome_fasta_and_report(sci_name, asm_accession)
        self.eload_cfg.set('submission', 'assembly_fasta', value=fasta_path)
        self.eload_cfg.set('submission', 'assembly_report', value=report_path)

    def get_analysis_info(self):
        """Adds analysis info into the config: analysis accession(s), and vcf and index files.

        Raises:
            ValueError: if the project has no analyses, or an analysis lacks
                either a VCF or an index file in the metadata DB.
            FileNotFoundError: if a file listed in the metadata DB is not
                present in the ELOAD's vcf directory.
        """
        with get_metadata_conn() as conn:
            query = f"select a.analysis_accession, array_agg(c.filename) " \
                    f"from project_analysis a " \
                    f"join analysis_file b on a.analysis_accession=b.analysis_accession " \
                    f"join file c on b.file_id=c.file_id " \
                    f"where a.project_accession='{self.project_accession}' " \
                    f"group by a.analysis_accession;"
            rows = get_all_results_for_query(conn, query)
        if len(rows) == 0:
            raise ValueError(f'No analyses for {self.project_accession} found in metadata DB.')

        submitted_vcfs = []
        for analysis_accession, filenames in rows:
            # TODO for now we assume a single analysis per project as that's what the eload config supports
            self.eload_cfg.set('brokering', 'ena', 'ANALYSIS', value=analysis_accession)

            # Reset for every analysis: without this a missing file would raise
            # UnboundLocalError (first analysis) or silently reuse the file of
            # the previous analysis instead of reaching the check below.
            index_file = None
            vcf_file = None
            for fn in filenames:
                full_path = os.path.join(self._get_dir('vcf'), fn)
                if not os.path.exists(full_path):
                    self.error(f'File not found: {full_path}')
                    self.error('Please check that all VCF and index files are present before retrying.')
                    raise FileNotFoundError(f'File not found: {full_path}')
                # Anything that is not a tabix index is treated as the VCF.
                if full_path.endswith('tbi'):
                    index_file = full_path
                else:
                    vcf_file = full_path
            if not index_file or not vcf_file:
                raise ValueError(f'VCF or index file is missing from metadata DB for analysis {analysis_accession}')
            submitted_vcfs.append(vcf_file)
            self.eload_cfg.set('brokering', 'vcf_files', vcf_file, 'index', value=index_file)
        self.eload_cfg.set('submission', 'vcf_files', value=submitted_vcfs)

    def get_hold_date(self):
        """Gets hold date from ENA and adds to the config.

        Posts a RECEIPT request for the project alias to the configured ENA
        submit URL and reads ``holdUntilDate`` from the first ``PROJECT``
        element of the response.

        Raises:
            ValueError: if the response has no ``PROJECT`` element or that
                element has no ``holdUntilDate`` attribute.
        """
        xml_request = f'''<SUBMISSION_SET>
<SUBMISSION>
<ACTIONS>
<ACTION>
<RECEIPT target="{self.project_alias}"/>
</ACTION>
</ACTIONS>
</SUBMISSION>
</SUBMISSION_SET>'''
        response = requests.post(
            cfg.query('ena', 'submit_url'),
            auth=HTTPBasicAuth(cfg.query('ena', 'username'), cfg.query('ena', 'password')),
            files={'SUBMISSION': xml_request}
        )
        receipt = ET.fromstring(response.text)
        try:
            hold_date = receipt.findall('PROJECT')[0].attrib['holdUntilDate']
            self.eload_cfg.set('brokering', 'ena', 'hold_date', value=hold_date)
        except (IndexError, KeyError):
            raise ValueError(f"Couldn't get hold date from ENA for {self.project_accession} ({self.project_alias})")
        # TODO if there's no hold date because the study is already public, this should be okay
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.