Skip to content

Commit

Permalink
Merge pull request #120 from tcezard/retryable_ENA_FTP_upload
Browse files Browse the repository at this point in the history
Retry-able ENA FTP upload
  • Loading branch information
tcezard authored Aug 2, 2022
2 parents 40474d2 + eca5bc7 commit 2d492b5
Show file tree
Hide file tree
Showing 4 changed files with 85 additions and 28 deletions.
43 changes: 26 additions & 17 deletions eva_submission/ENA_submission/upload_to_ENA.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,25 +35,34 @@ def __init__(self, eload, ena_spreadsheet, output_dir):
self.converter = EnaXlsxConverter(ena_spreadsheet, output_dir, self.eload)
self.ena_auth = HTTPBasicAuth(cfg.query('ena', 'username'), cfg.query('ena', 'password'))

@retry(exceptions=ftplib.all_errors, tries=3, delay=2, backoff=1.2, jitter=(1, 3))
def upload_vcf_files_to_ena_ftp(self, files_to_upload):
    """Upload files to this ELOAD's directory on the ENA FTP server.

    Connects over FTP-TLS, creates the ELOAD directory when it is missing and
    uploads every file that is not already present with the same size.
    Any ``ftplib`` error makes the decorator retry the whole method; files
    fully uploaded by a previous attempt are then skipped by the size check.

    :param files_to_upload: paths of the local files to upload.
    """
    host = cfg.query('ena', 'ftphost')
    # Heuristic to set the expected timeout assuming 10,000,000 bytes/s upload speed
    # but no less than 30 sec and no more than an hour.
    # default=0 keeps an empty file list from raising ValueError in max().
    max_file_size = max((os.path.getsize(f) for f in files_to_upload), default=0)
    timeout = min(max(int(max_file_size / 10000000), 30), 3600)
    self.info(f'Connect to {host} with timeout: {timeout}')
    with HackFTP_TLS() as ftps:
        # Set a weak cipher to enable connection
        # https://stackoverflow.com/questions/38015537/python-requests-exceptions-sslerror-dh-key-too-small
        ftps.context.set_ciphers('DEFAULT:@SECLEVEL=1')
        ftps.connect(host, port=int(cfg.query('ena', 'ftpport', ret_default=21)), timeout=timeout)
        ftps.login(cfg.query('ena', 'username'), cfg.query('ena', 'password'))
        ftps.prot_p()
        if self.eload not in ftps.nlst():
            self.info(f'Create {self.eload} directory')
            ftps.mkd(self.eload)
        ftps.cwd(self.eload)
        previous_content = ftps.nlst()
        # nlst() leaves the session in ASCII type; switch to binary so that SIZE
        # reports the raw byte count that os.path.getsize() is compared against.
        ftps.voidcmd('TYPE I')
        for file_to_upload in files_to_upload:
            file_name = os.path.basename(file_to_upload)
            if file_name in previous_content and ftps.size(file_name) == os.path.getsize(file_to_upload):
                self.warning(f'{file_name} Already exist and has the same size on the FTP. Skip upload.')
                continue
            self.info(f'Upload {file_name} to FTP')
            with open(file_to_upload, 'rb') as open_file:
                ftps.storbinary('STOR %s' % file_name, open_file)

@retry(requests.exceptions.ConnectionError, tries=3, delay=2, backoff=1.2, jitter=(1, 3))
def _post_xml_file_to_ena(self, url, file_dict):
Expand Down
7 changes: 5 additions & 2 deletions eva_submission/eload_brokering.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def prepare_brokering(self, force=False):
self.info('Preparation has already been run, Skip!')

def broker_to_ena(self, force=False, existing_project=None, async_upload=False, dry_ena_upload=False):
if not self.eload_cfg.query('brokering', 'ena', 'PROJECT') or force:
if not self.eload_cfg.query('brokering', 'ena', 'pass') or force:
ena_spreadsheet = os.path.join(self._get_dir('ena'), 'metadata_spreadsheet.xlsx')
# Set the project in the metadata sheet which is then converted to XML
self.update_metadata_spreadsheet(self.eload_cfg['validation']['valid']['metadata_spreadsheet'],
Expand Down Expand Up @@ -82,7 +82,10 @@ def broker_to_ena(self, force=False, existing_project=None, async_upload=False,
# Upload XML to ENA
ena_uploader.upload_xml_files_to_ena(dry_ena_upload)
if not dry_ena_upload:
self.eload_cfg.set('brokering', 'ena', value=ena_uploader.results)
# Update the accessions in case we're working with existing project
accessions = ena_uploader.results
accessions.update(self.eload_cfg.query('brokering', 'ena', ret_default={}))
self.eload_cfg.set('brokering', 'ena', value=accessions)
self.eload_cfg.set('brokering', 'ena', 'date', value=self.now)
self.eload_cfg.set('brokering', 'ena', 'hold_date', value=ena_uploader.converter.hold_date)
self.eload_cfg.set('brokering', 'ena', 'pass', value=not bool(ena_uploader.results['errors']))
Expand Down
3 changes: 3 additions & 0 deletions tests/resources/brokering/test_vcf_file.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
##fileformat=VCFv4.2
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT S1 S2 S3 S4 S5 S6 S7 S8 S9 S10
1 721105 . A T . PASS DP=40 GT:DP 1/1:30 1/1:30 1/1:30 1/1:30 1/1:30 1/1:30 1/1:30 1/1:30 1/1:30 1/1:30
60 changes: 51 additions & 9 deletions tests/test_upload_to_ENA.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
import shutil
from unittest import TestCase
import os
from unittest.mock import patch, Mock

import pytest
from ebi_eva_common_pyutils.config import cfg

from eva_submission import ROOT_DIR
from eva_submission.ENA_submission.upload_to_ENA import ENAUploader, ENAUploaderAsync
from eva_submission.ENA_submission.upload_to_ENA import ENAUploader, ENAUploaderAsync, HackFTP_TLS
from eva_submission.eload_utils import get_file_content
from eva_submission.submission_config import load_config

Expand All @@ -25,14 +27,16 @@ class TestENAUploader(TestCase):
<ACTIONS>ADD</ACTIONS>
</RECEIPT>'''


def setUp(self) -> None:
    """Load the test configuration and build the uploaders shared by the tests.

    The resources and brokering folders are kept on the instance so that
    individual tests can locate fixture files.
    """
    self.resources_folder = os.path.join(ROOT_DIR, 'tests', 'resources')
    self.brokering_folder = os.path.join(self.resources_folder, 'brokering')

    config_file = os.path.join(self.resources_folder, 'submission_config.yml')
    load_config(config_file)
    metadata_file = os.path.join(self.brokering_folder, 'metadata_sheet.xlsx')
    self.uploader = ENAUploader('ELOAD_1', metadata_file, self.brokering_folder)
    self.uploader_async = ENAUploaderAsync('ELOAD_1', metadata_file, self.brokering_folder)

def tearDown(self) -> None:
if os.path.exists(self.uploader_async.converter.single_submission_file):
Expand Down Expand Up @@ -130,4 +134,42 @@ def test_single_dry_upload_xml_files_to_ena(self):
'endpoint:')
mock_info.assert_any_call('file: ELOAD_1.SingleSubmission.xml')
mock_post.assert_not_called()
mock_get.assert_not_called()
mock_get.assert_not_called()

@pytest.mark.skip(reason="Need to use real ENA credentials in submission_config.yml for this test to work")
def test_upload_FTP(self):
    """Upload a small VCF to the FTP server, then check a second upload is skipped.

    Steps: remove any copy of the file left by a previous run, upload it,
    verify it is listed in the ELOAD directory, then upload again and expect
    the "Skip upload" warning instead of a transfer.
    """
    filename = 'test_vcf_file.vcf'
    with HackFTP_TLS() as ftps:
        # Connect to delete the file
        ftps.context.set_ciphers('DEFAULT:@SECLEVEL=1')
        ftps.connect(cfg.query('ena', 'ftphost'), port=int(cfg.query('ena', 'ftpport', ret_default=21)))
        ftps.login(cfg.query('ena', 'username'), cfg.query('ena', 'password'))
        ftps.prot_p()
        list_files = ftps.nlst()
        if self.uploader.eload in list_files:
            ftps.cwd(self.uploader.eload)
            list_files = ftps.nlst()
            # Inside the ELOAD directory the listing holds file names, so the
            # check must look for the file itself, not the directory name.
            if filename in list_files:
                ftps.delete(filename)

    files_to_upload = os.path.join(self.brokering_folder, filename)
    self.uploader.upload_vcf_files_to_ena_ftp([files_to_upload])

    with HackFTP_TLS() as ftps:
        # Connect to check that the file has been uploaded
        ftps.context.set_ciphers('DEFAULT:@SECLEVEL=1')
        ftps.connect(cfg.query('ena', 'ftphost'), port=int(cfg.query('ena', 'ftpport', ret_default=21)))
        ftps.login(cfg.query('ena', 'username'), cfg.query('ena', 'password'))
        ftps.prot_p()
        list_files = ftps.nlst()
        assert self.uploader.eload in list_files
        ftps.cwd(self.uploader.eload)
        list_files = ftps.nlst()
        assert filename in list_files

    # Attempt to load again: It should not load the file
    with patch.object(ENAUploader, 'warning') as mock_warning:
        self.uploader.upload_vcf_files_to_ena_ftp([files_to_upload])
    mock_warning.assert_called_once_with(
        'test_vcf_file.vcf Already exist and has the same size on the FTP. Skip upload.'
    )

0 comments on commit 2d492b5

Please sign in to comment.