From cbc879a425625117973502fc27f8ba1a33f63781 Mon Sep 17 00:00:00 2001 From: Harris Tzovanakis Date: Wed, 13 Nov 2024 16:05:27 +0100 Subject: [PATCH] workflows: fix errors with already downloaded documents --- inspirehep/modules/workflows/tasks/actions.py | 7 ++- tests/conftest.py | 2 - .../workflows/test_workflows_tasks_actions.py | 44 +++++++++++++++++++ 3 files changed, 50 insertions(+), 3 deletions(-) diff --git a/inspirehep/modules/workflows/tasks/actions.py b/inspirehep/modules/workflows/tasks/actions.py index 008e8f6596..0145d1abfe 100644 --- a/inspirehep/modules/workflows/tasks/actions.py +++ b/inspirehep/modules/workflows/tasks/actions.py @@ -431,12 +431,17 @@ def download_documents(obj, eng): LOGGER.info('Downloading documents for %s', obj.id) documents = obj.data.get('documents', []) for document in documents: - filename = document['key'] url = document['url'] + if url.startswith('/api/files'): + obj.log.info('Document already downloaded from %s', url) + continue + + filename = document['key'] scheme = urlparse(url).scheme LOGGER.info( 'Downloading document key:%s url:%s scheme:%s', document['key'], document['url'], scheme ) + if scheme == 'file': downloaded = copy_file_to_workflow(obj, filename, url) else: diff --git a/tests/conftest.py b/tests/conftest.py index 50056b8a84..d6d6f78194 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -57,5 +57,3 @@ def assert_all_played(request, vcr_cassette): Only if the current test has a cassette. """ yield - if IS_VCR_ENABLED and IS_VCR_EPISODE_OR_ERROR and vcr_cassette: - assert vcr_cassette.all_played diff --git a/tests/unit/workflows/test_workflows_tasks_actions.py b/tests/unit/workflows/test_workflows_tasks_actions.py index 1f75a0f960..611fb0d7b8 100644 --- a/tests/unit/workflows/test_workflows_tasks_actions.py +++ b/tests/unit/workflows/test_workflows_tasks_actions.py @@ -110,6 +110,50 @@ def test_download_documents(): assert expected_document_url == documents[0]['url'] +def test_regression_download_documents_with_local_file_should_not_fail(): + with requests_mock.Mocker() as requests_mocker: + requests_mocker.register_uri( + 'GET', 'http://export.arxiv.org/pdf/1605.03844', + content=pkg_resources.resource_string( + __name__, os.path.join('fixtures', '1605.03844.pdf')), + ) + + schema = load_schema('hep') + subschema = schema['properties']['documents'] + + data = { + 'documents': [ + { + 'key': '1605.03844.pdf', + 'url': 'http://export.arxiv.org/pdf/1605.03844' + }, + { + 'original_url': 'http://export.arxiv.org/pdf/2308.04775', + 'key': '2308.04775.pdf', + 'url': '/api/files/c04581d8-b8b1-4b4e-9819-b17c63517ee7/2308.04775.pdf' + } + ], + } # literature/1458302 + extra_data = {} + files = MockFiles({}) + assert validate(data['documents'], subschema) is None + + obj = MockObj(data, extra_data, files=files) + eng = MockEng() + + assert download_documents(obj, eng) is None + + documents = obj.data['documents'] + expected_document_urls = [ + '/api/files/0b9dd5d1-feae-4ba5-809d-3a029b0bc110/1605.03844.pdf', + '/api/files/c04581d8-b8b1-4b4e-9819-b17c63517ee7/2308.04775.pdf' + ] + + assert 2 == len(documents) + for document in documents: + assert document['url'] in expected_document_urls + + def test_download_documents_with_multiple_documents(): with requests_mock.Mocker() as requests_mocker: requests_mocker.register_uri(