From af64b798e75b0287ac64cd755b4a2a1404d7736a Mon Sep 17 00:00:00 2001 From: Yohanna Lisnichuk Date: Thu, 9 Nov 2023 16:14:26 -0300 Subject: [PATCH] feat: merge exporter and data_registry apps --- core/settings.py | 3 +- core/urls.py | 1 - {exporter => data_registry}/middleware.py | 0 data_registry/urls.py | 1 + data_registry/views.py | 36 ++++++++++++++++++-- exporter/apps.py | 5 --- exporter/urls.py | 7 ---- exporter/views.py | 40 ----------------------- tests/data_registry/test_views.py | 2 +- tests/exporter/test_views.py | 33 ++++++++++++++----- 10 files changed, 62 insertions(+), 66 deletions(-) rename {exporter => data_registry}/middleware.py (100%) delete mode 100644 exporter/apps.py delete mode 100644 exporter/urls.py delete mode 100644 exporter/views.py diff --git a/core/settings.py b/core/settings.py index d639b0eb..6a6f7f17 100644 --- a/core/settings.py +++ b/core/settings.py @@ -54,13 +54,12 @@ "django.contrib.humanize", "data_registry", "markdownx", - "exporter", ] MIDDLEWARE = [ "django.middleware.security.SecurityMiddleware", # Add before GZipMiddleware to modify its response. - "exporter.middleware.ContentEncodingMiddleware", + "data_registry.middleware.ContentEncodingMiddleware", # This site is not affected by BREACH. # https://docs.djangoproject.com/en/4.2/ref/middleware/#django.middleware.gzip.GZipMiddleware "django.middleware.gzip.GZipMiddleware", diff --git a/core/urls.py b/core/urls.py index 73a61136..8ba19bde 100644 --- a/core/urls.py +++ b/core/urls.py @@ -4,7 +4,6 @@ urlpatterns = [ path("", include("data_registry.urls"), name="data-registry"), - path("", include("exporter.urls"), name="exporter"), path("admin/", admin.site.urls), path("markdownx/", include("markdownx.urls")), ] diff --git a/exporter/middleware.py b/data_registry/middleware.py similarity index 100% rename from exporter/middleware.py rename to data_registry/middleware.py diff --git a/data_registry/urls.py b/data_registry/urls.py index 7e9b21c1..2d6ac3de 100644 --- a/data_registry/urls.py +++ b/data_registry/urls.py @@ -6,6 +6,7 @@ path("", views.index, name="index"), path("search/", views.search, name="search"), path("publication/", views.detail, name="detail"), + path("publication//download", views.download_export, name="download"), # https://code.djangoproject.com/ticket/26556 path("i18n/setlang/", i18n.set_language, name="set-language"), # Uncomment after re-integrating Spoonbill. diff --git a/data_registry/views.py b/data_registry/views.py index b5bbe736..2d56a6d8 100644 --- a/data_registry/views.py +++ b/data_registry/views.py @@ -1,4 +1,5 @@ import logging +import re import string from collections import defaultdict from datetime import date, datetime, timedelta @@ -11,20 +12,21 @@ from django.contrib.postgres.aggregates import ArrayAgg from django.db.models import Count, OuterRef, Q, Subquery from django.db.models.functions import Substr -from django.http.response import HttpResponse, JsonResponse +from django.http.response import FileResponse, HttpResponse, HttpResponseBadRequest, HttpResponseNotFound, JsonResponse from django.shortcuts import get_object_or_404, redirect, render from django.utils.translation import get_language, get_language_from_request from django.utils.translation import gettext as _ from data_registry.models import Collection, Job from data_registry.util import collection_queryset -from exporter.util import Export +from exporter.util import Export, TaskStatus logger = logging.getLogger(__name__) alphabets = defaultdict(lambda: string.ascii_uppercase) # https://en.wikipedia.org/wiki/Cyrillic_script_in_Unicode#Basic_Cyrillic_alphabet alphabets["ru"] = "АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ" +EXPORT_PATTERN = re.compile(r"\A(full|\d{4})\.(jsonl\.gz|csv\.tar\.gz|xlsx)\Z") def index(request): @@ -174,6 +176,36 @@ def spiders(request): return JsonResponse(json.get("spiders"), safe=False) +def download_export(request, id): + """ + Returns an exported file as a FileResponse object. + """ + name = request.GET.get("name") + + # Guard against path traversal. + if not EXPORT_PATTERN.match(name): + return HttpResponseBadRequest("The name query string parameter is invalid") + + collection = get_object_or_404(collection_queryset(request), id=id) + + active_job = collection.job.filter(active=True).first() + if not active_job: + return HttpResponseNotFound("This OCDS dataset is not available for download") + + export = Export(active_job.id, basename=name) + if export.status != TaskStatus.COMPLETED: + return HttpResponseNotFound("File not found") + + return FileResponse( + export.path.open("rb"), + as_attachment=True, + filename=f"{collection.source_id}_{name}", + # Set Content-Encoding to skip GZipMiddleware. (ContentEncodingMiddleware removes the empty header.) + # https://docs.djangoproject.com/en/4.2/ref/middleware/#module-django.middleware.gzip + headers={"Content-Encoding": ""}, + ) + + def excel_data(request, job_id, job_range=None): job = Job.objects.get(id=job_id) export = Export(job_id) diff --git a/exporter/apps.py b/exporter/apps.py deleted file mode 100644 index e8fed610..00000000 --- a/exporter/apps.py +++ /dev/null @@ -1,5 +0,0 @@ -from django.apps import AppConfig - - -class ExporterConfig(AppConfig): - name = "exporter" diff --git a/exporter/urls.py b/exporter/urls.py deleted file mode 100644 index 39213bf2..00000000 --- a/exporter/urls.py +++ /dev/null @@ -1,7 +0,0 @@ -from django.urls import path - -from exporter import views - -urlpatterns = [ - path("publication//download", views.download_export, name="download"), -] diff --git a/exporter/views.py b/exporter/views.py deleted file mode 100644 index b40a8448..00000000 --- a/exporter/views.py +++ /dev/null @@ -1,40 +0,0 @@ -import re - -from django.http import FileResponse -from django.http.response import HttpResponseBadRequest, HttpResponseNotFound -from django.shortcuts import get_object_or_404 - -from data_registry.util import collection_queryset -from exporter.util import Export, TaskStatus - -EXPORT_PATTERN = re.compile(r"\A(full|\d{4})\.(jsonl\.gz|csv\.tar\.gz|xlsx)\Z") - - -def download_export(request, id): - """ - Returns an exported file as a FileResponse object. - """ - name = request.GET.get("name") - - # Guard against path traversal. - if not EXPORT_PATTERN.match(name): - return HttpResponseBadRequest("The name query string parameter is invalid") - - collection = get_object_or_404(collection_queryset(request), id=id) - - active_job = collection.job.filter(active=True).first() - if not active_job: - return HttpResponseNotFound("This OCDS dataset is not available for download") - - export = Export(active_job.id, basename=name) - if export.status != TaskStatus.COMPLETED: - return HttpResponseNotFound("File not found") - - return FileResponse( - export.path.open("rb"), - as_attachment=True, - filename=f"{collection.source_id}_{name}", - # Set Content-Encoding to skip GZipMiddleware. (ContentEncodingMiddleware removes the empty header.) - # https://docs.djangoproject.com/en/4.2/ref/middleware/#module-django.middleware.gzip - headers={"Content-Encoding": ""}, - ) diff --git a/tests/data_registry/test_views.py b/tests/data_registry/test_views.py index d156b809..6112406a 100644 --- a/tests/data_registry/test_views.py +++ b/tests/data_registry/test_views.py @@ -21,7 +21,7 @@ def setUpTestData(cls): @patch("exporter.util.Export.get_files") def test_detail(self, get_files): get_files.return_value = {"jsonl": {"by_year": [{"year": 2022, "size": 1}]}} - url = f"/publication/{self.collection.id}/download?name=2022.jsonl.gz" + url = f"/en/publication/{self.collection.id}/download?name=2022.jsonl.gz" with self.assertNumQueries(2): response = Client().get(f"/en/publication/{self.collection.id}") diff --git a/tests/exporter/test_views.py b/tests/exporter/test_views.py index 5d8af146..4b3bd086 100644 --- a/tests/exporter/test_views.py +++ b/tests/exporter/test_views.py @@ -18,32 +18,49 @@ def setUp(cls): ) cls.job = cls.collection.job.create( active=True, + id=2, + ) + cls.collection_no_job = Collection.objects.create( + id=3, + title="Test", + source_id="abc", + public=True, ) + cls.collection_no_job.job.create( + active=True, + id=4, + ) + + def test_collection_not_found(self): + with self.assertNumQueries(1): + response = Client().get("/en/publication/10/download?name=2000.jsonl.gz") + + self.assertEqual(response.status_code, 404) def test_download_export_invalid_suffix(self): with self.assertNumQueries(0): - response = Client().get("/publication/2/download?name=invalid") + response = Client().get(f"/en/publication/{self.collection.id}/download?name=invalid") self.assertEqual(response.status_code, 400) self.assertEqual(response.content, b"The name query string parameter is invalid") def test_download_export_empty_parameter(self): with self.assertNumQueries(0): - response = Client().get("/publication/2/download?name=") + response = Client().get(f"/en/publication/{self.collection.id}/download?name=") self.assertEqual(response.status_code, 400) self.assertEqual(response.content, b"The name query string parameter is invalid") def test_download_export_waiting(self): - with self.assertNumQueries(1): - response = Client().get("/publication/1/download?name=2000.jsonl.gz") - + with self.assertNumQueries(2): + response = Client().get(f"/en/publication/{self.collection_no_job.id}/download?name=2000.jsonl.gz") self.assertEqual(response.status_code, 404) + self.assertEqual(response.content, b"File not found") @patch("exporter.util.Export.lockfile", new_callable=PropertyMock) def test_download_export_running(self, exists): with self.assertNumQueries(2): - response = Client().get("/publication/2/download?name=2000.jsonl.gz") + response = Client().get(f"/en/publication/{self.collection.id}/download?name=2000.jsonl.gz") self.assertEqual(response.status_code, 404) self.assertEqual(response.content, b"File not found") @@ -57,7 +74,7 @@ def test_download_export_completed(self): with self.subTest(suffix=suffix): with self.assertNumQueries(2): response = Client().get( - f"/publication/2/download?name=2000.{suffix}", + f"/en/publication/{self.collection.id}/download?name=2000.{suffix}", HTTP_ACCEPT_ENCODING="gzip", ) self.assertEqual(response.status_code, 200) @@ -70,7 +87,7 @@ def test_download_export_completed(self): "Content-Type": content_type, "Cross-Origin-Opener-Policy": "same-origin", "Referrer-Policy": "same-origin", - "Vary": "Accept-Language, Cookie", + "Vary": "Cookie", "X-Content-Type-Options": "nosniff", "X-Frame-Options": "DENY", },