From b8109efde4fd91504cc36a63932fa20442141a00 Mon Sep 17 00:00:00 2001
From: pkdash
Date: Fri, 7 Jun 2024 15:02:26 -0400
Subject: [PATCH 1/3] [#159] compute content location from submission data

---
 api/models/user.py | 106 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 106 insertions(+)

diff --git a/api/models/user.py b/api/models/user.py
index 94b3f8b..890003f 100644
--- a/api/models/user.py
+++ b/api/models/user.py
@@ -1,3 +1,4 @@
+from dataclasses import dataclass
 from datetime import datetime
 from enum import Enum
 from typing import List, Optional, TYPE_CHECKING
@@ -16,6 +17,56 @@ class SubmissionType(str, Enum):
     IGUIDE_FORM = 'IGUIDE_FORM'
 
 
+class StorageProvider(str, Enum):
+    AWS = "AWS"
+    GCP = "GCP"
+    Azure = "Azure"
+    GoogleDrive = "Google Drive"
+    Dropbox = "Dropbox"
+    OneDrive = "OneDrive"
+    Box = "Box"
+    CUAHSI = "CUAHSI"
+
+
+@dataclass
+class ContentStorage:
+    url_pattern: str
+    storage_name: str
+
+    @classmethod
+    def get_storage(cls, storage_provider: StorageProvider):
+        if storage_provider == StorageProvider.AWS:
+            return cls("amazonaws.com", "AWS")
+
+        if storage_provider == StorageProvider.GCP:
+            return cls("storage.googleapis.com", "GCP")
+
+        if storage_provider == StorageProvider.Azure:
+            return cls("blob.core.windows.net", "Azure")
+
+        if storage_provider == StorageProvider.GoogleDrive:
+            return cls("drive.google.com", "Google Drive")
+
+        if storage_provider == StorageProvider.Dropbox:
+            return cls("dropbox.com", "Dropbox")
+
+        if storage_provider == StorageProvider.OneDrive:
+            return cls("onedrive.live.com", "OneDrive")
+
+        if storage_provider == StorageProvider.Box:
+            return cls("app.box.com", "Box")
+
+        if storage_provider == StorageProvider.CUAHSI:
+            return cls("minio.cuahsi.io", "CUAHSI")
+
+    def get_storage_name(self, url: Optional[str], repository_identifier: Optional[str]):
+        if repository_identifier and self.url_pattern in repository_identifier:
+            return self.storage_name
+        if url and self.url_pattern in url:
+            return self.storage_name
+        return None
+
+
 class S3Path(BaseModel):
     path: str
     bucket: str
@@ -41,6 +92,61 @@ class Submission(Document):
     repository_identifier: Optional[str]
     s3_path: Optional[S3Path]
 
+    @property
+    def content_location(self):
+        # determine the content location based on the repository type
+        if self.repository == SubmissionType.HYDROSHARE:
+            return self.repository
+        elif self.repository == SubmissionType.S3:
+            endpoint_url = self.s3_path.endpoint_url.rstrip("/")
+            storage = ContentStorage.get_storage(StorageProvider.AWS)
+            if endpoint_url.endswith(storage.url_pattern):
+                return storage.storage_name
+            storage = ContentStorage.get_storage(StorageProvider.CUAHSI)
+            if endpoint_url.endswith(storage.url_pattern):
+                return storage.storage_name
+            return self.repository
+
+        # determine the content location based on the URL or repository identifier
+
+        # check for GCP
+        storage = ContentStorage.get_storage(StorageProvider.GCP)
+        storage_name = storage.get_storage_name(self.url, self.repository_identifier)
+        if storage_name:
+            return storage_name
+
+        # check for Azure
+        storage = ContentStorage.get_storage(StorageProvider.Azure)
+        storage_name = storage.get_storage_name(self.url, self.repository_identifier)
+        if storage_name:
+            return storage_name
+
+        # check for Google Drive
+        storage = ContentStorage.get_storage(StorageProvider.GoogleDrive)
+        storage_name = storage.get_storage_name(self.url, self.repository_identifier)
+        if storage_name:
+            return storage_name
+
+        # check for Dropbox
+        storage = ContentStorage.get_storage(StorageProvider.Dropbox)
+        storage_name = storage.get_storage_name(self.url, self.repository_identifier)
+        if storage_name:
+            return storage_name
+
+        # check for OneDrive
+        storage = ContentStorage.get_storage(StorageProvider.OneDrive)
+        storage_name = storage.get_storage_name(self.url, self.repository_identifier)
+        if storage_name:
+            return storage_name
+
+        # check for Box
+        storage = ContentStorage.get_storage(StorageProvider.Box)
+        storage_name = storage.get_storage_name(self.url, self.repository_identifier)
+        if storage_name:
+            return storage_name
+
+        return self.repository
+
 
 class User(Document):
     access_token: str

From 871fccb73dbe731e2d42341da9225523467be01b Mon Sep 17 00:00:00 2001
From: pkdash
Date: Fri, 7 Jun 2024 15:03:47 -0400
Subject: [PATCH 2/3] [#159] store content location in discovery collection

---
 triggers/update_catalog.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/triggers/update_catalog.py b/triggers/update_catalog.py
index 135f837..c74ddf5 100644
--- a/triggers/update_catalog.py
+++ b/triggers/update_catalog.py
@@ -46,6 +46,9 @@ async def watch_catalog(db: AsyncIOMotorClient):
         submission: Submission = await Submission.find_one({"identifier": document["_id"]})
         catalog_entry["registrationDate"] = submission.submitted
         catalog_entry["name_for_sorting"] = str.lower(catalog_entry["name"])
+        catalog_entry["submission_type"] = submission.repository
+        # location of the dataset files, e.g. AWS, GCP, Azure, HydroShare, CUAHSI, etc.
+        catalog_entry["content_location"] = submission.content_location
         await db["discovery"].find_one_and_replace(
             {"_id": document["_id"]}, catalog_entry, upsert=True
         )

From 704fafbc7e1573399f152b25bc64c80b2bd59b6a Mon Sep 17 00:00:00 2001
From: pkdash
Date: Fri, 7 Jun 2024 15:04:31 -0400
Subject: [PATCH 3/3] [#159] updating tests for content location

---
 tests/test_dataset_routes.py | 30 +++++++++++++++++++++++++++++-
 1 file changed, 29 insertions(+), 1 deletion(-)

diff --git a/tests/test_dataset_routes.py b/tests/test_dataset_routes.py
index ef29b89..d7c90af 100644
--- a/tests/test_dataset_routes.py
+++ b/tests/test_dataset_routes.py
@@ -2,7 +2,7 @@
 
 from api.adapters.utils import RepositoryType
 from api.models.catalog import Submission
-from api.models.user import SubmissionType, User
+from api.models.user import SubmissionType, User, S3Path
 
 pytestmark = pytest.mark.asyncio
 
@@ -105,6 +105,11 @@ async def test_create_dataset_s3(client_test, dataset_data, test_user_access_tok
     assert len(submission_response_data) == 1
     assert submission_response_data[0]['repository'] == RepositoryType.S3
     assert submission_response_data[0]['s3_path'] == s3_path
+    if object_store_type == "minio":
+        content_location = "CUAHSI"
+    else:
+        content_location = "AWS"
+    await _check_s3_submission(test_user_access_token, s3_path, content_location)
 
 
 @pytest.mark.parametrize('object_store_type', ['s3', 'minio'])
@@ -193,6 +198,11 @@ async def test_update_dataset_s3(client_test, dataset_data, test_user_access_tok
     assert len(submission_response_data) == 1
     assert submission_response_data[0]['repository'] == RepositoryType.S3
     assert submission_response_data[0]['s3_path'] == s3_path
+    if object_store_type == "minio":
+        content_location = "CUAHSI"
+    else:
+        content_location = "AWS"
+    await _check_s3_submission(test_user_access_token, s3_path, content_location)
 
 
 @pytest.mark.asyncio
@@ -501,3 +511,21 @@ async def _check_hs_submission(hs_dataset, user_access_token, hs_published_res_i
     assert user.submission(submission_id) is not None
     assert user.submission(submission_id).repository == "HYDROSHARE"
     assert user.submission(submission_id).repository_identifier == hs_published_res_id
+    assert user.submission(submission_id).content_location == "HYDROSHARE"
+
+
+async def _check_s3_submission(user_access_token, s3_path, content_location="AWS"):
+    s3_path = S3Path(**s3_path)
+    # there should be one related submission record in the db
+    submissions = await Submission.find().to_list()
+    assert len(submissions) == 1
+    user = await User.find_one(User.access_token == user_access_token, fetch_links=True)
+    assert len(user.submissions) == 1
+    submission = user.submissions[0]
+    submission_id = submission.identifier
+    assert submission_id == user.submissions[0].identifier
+    assert user.submission(submission_id) is not None
+    assert user.submission(submission_id).repository == "S3"
+    assert user.submission(submission_id).s3_path == s3_path
+    assert user.submission(submission_id).repository_identifier == s3_path.identifier
+    assert submission.content_location == content_location
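
Note (a usage sketch, not part of the patches): the provider matching that
PATCH 1/3 adds works by substring containment, checking the repository
identifier before the URL and returning None when neither matches. A minimal
example, assuming the patched api.models.user module is importable; the URLs
below are made-up illustrations, not real datasets:

    from api.models.user import ContentStorage, StorageProvider

    # Each provider maps to a host substring; get_storage_name() matches the
    # repository identifier first, then the URL.
    gcp = ContentStorage.get_storage(StorageProvider.GCP)
    assert gcp.get_storage_name("https://storage.googleapis.com/bucket/data.csv", None) == "GCP"
    assert gcp.get_storage_name("https://example.org/data.csv", None) is None

    box = ContentStorage.get_storage(StorageProvider.Box)
    assert box.get_storage_name(None, "https://app.box.com/s/abc123") == "Box"

Submission.content_location layers on top of this: HydroShare submissions
report the repository itself, S3 submissions are split into AWS vs. CUAHSI by
the endpoint URL suffix, and everything else falls back to the per-provider
checks above before defaulting to the repository name.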