V1.6.0 features (#565)
### Feature or Bugfix
Release PR with the following list of features. Refer to each PR for details.

### Detail
- #498 
- #482 
- #543
- #524 (which also solves #531)
- #532 
- #535 
- #497 
- #515
- #529 
- #562 
- #455 
- #572 
- #567 
- #573 
- #579 
- #578 
- #582 

### Breaking changes - release notes
- ⚠️ IMPORTANT: upgrade to a version >V1.5.0 before upgrading to V1.6.0 to
avoid deletion of resources during custom resource deletion
- ⚠️ IMPORTANT: environments and then datasets need to be updated after
upgrading. Either set the cdk.json parameter
`enable_update_dataall_stacks_in_cicd_pipeline` (see the sketch after this
list), wait for the overnight update-stack task, or manually update first the
environments and then the datasets
- CloudFront distribution replacement for #529
- Additional EC2 permissions in the CDK Synth CodeBuild stage for #543; this
can be avoided by upgrading to v1.5.6 before upgrading to v1.6.0
- Local development is affected by the more restrictive pivotRole trust policy
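
For reference, below is a minimal sketch of reading such a flag through CDK context. It is illustrative only: it assumes a CDK v2 app and the flag being visible as cdk.json context; in data.all the parameter lives in the cdk.json deployment configuration, so this shows the lookup mechanism rather than the actual pipeline code.

```python
# Illustrative sketch only; everything except the flag name is hypothetical.
from aws_cdk import App

app = App()

# The flag would surface here if set in the cdk.json "context" block or passed
# with `cdk synth -c enable_update_dataall_stacks_in_cicd_pipeline=true`.
enabled = app.node.try_get_context("enable_update_dataall_stacks_in_cicd_pipeline")

if str(enabled).lower() == "true":
    # In data.all this corresponds to an extra CICD stage that updates
    # environment stacks first and dataset stacks afterwards.
    print("Stack update stage enabled: environments are updated before datasets")
else:
    print("Stack update stage disabled: update manually or wait for the overnight task")

app.synth()
```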


### Relates
V1.6.0 release

By submitting this pull request, I confirm that my contribution is made
under the terms of the Apache 2.0 license.

---------

Co-authored-by: Gezim Musliaj <[email protected]>
Co-authored-by: Noah Paige <[email protected]>
Co-authored-by: nikpodsh <[email protected]>
Co-authored-by: chamcca <[email protected]>
Co-authored-by: Nikita Podshivalov <[email protected]>
Co-authored-by: dbalintx <[email protected]>
Co-authored-by: mourya-33 <[email protected]>
8 people authored Jul 19, 2023
1 parent 45c5cfb commit 84c555e
Showing 114 changed files with 4,379 additions and 2,646 deletions.
Binary file modified UserGuide.pdf
Binary file not shown.
9 changes: 7 additions & 2 deletions backend/cdkproxymain.py
@@ -52,7 +52,12 @@ def up(response: Response):
def check_creds(response: Response):
logger.info('GET /awscreds')
try:
sts = boto3.client('sts', region_name=os.getenv('AWS_REGION', 'eu-west-1'))
region = os.getenv('AWS_REGION', 'eu-west-1')
sts = boto3.client(
'sts',
region_name=region,
endpoint_url=f"https://sts.{region}.amazonaws.com"
)
data = sts.get_caller_identity()
return {
'DH_DOCKER_VERSION': os.environ.get('DH_DOCKER_VERSION'),
@@ -84,7 +89,7 @@ def check_connect(response: Response):
return {
'DH_DOCKER_VERSION': os.environ.get('DH_DOCKER_VERSION'),
'_ts': datetime.now().isoformat(),
'message': f"Connected to database for environment {ENVNAME}({engine.dbconfig.params['host']}:{engine.dbconfig.params['port']})",
'message': f"Connected to database for environment {ENVNAME}({engine.dbconfig.host})",
}
except Exception as e:
logger.exception('DBCONNECTIONERROR')
3 changes: 2 additions & 1 deletion backend/dataall/api/Objects/Dataset/input_types.py
@@ -42,6 +42,7 @@ class DatasetSortField(GraphQLEnumMapper):
gql.Argument('language', gql.Ref('Language')),
gql.Argument('confidentiality', gql.Ref('ConfidentialityClassification')),
gql.Argument(name='stewards', type=gql.String),
gql.Argument('KmsAlias', gql.NonNullableType(gql.String)),
],
)

@@ -94,7 +95,7 @@ class DatasetSortField(GraphQLEnumMapper):
gql.Argument('description', gql.String),
gql.Argument('bucketName', gql.NonNullableType(gql.String)),
gql.Argument('glueDatabaseName', gql.String),
gql.Argument('KmsKeyId', gql.String),
gql.Argument('KmsKeyAlias', gql.NonNullableType(gql.String)),
gql.Argument('adminRoleName', gql.String),
gql.Argument('tags', gql.ArrayType(gql.String)),
gql.Argument('owner', gql.NonNullableType(gql.String)),
24 changes: 21 additions & 3 deletions backend/dataall/api/Objects/Dataset/resolvers.py
@@ -13,7 +13,8 @@
from ....aws.handlers.glue import Glue
from ....aws.handlers.service_handlers import Worker
from ....aws.handlers.sts import SessionHelper
from ....aws.handlers.sns import Sns
from ....aws.handlers.kms import KMS

from ....aws.handlers.quicksight import Quicksight
from ....db import paginate, exceptions, permissions, models
from ....db.api import Dataset, Environment, ShareObject, ResourcePolicy
@@ -32,6 +33,21 @@ def check_dataset_account(environment):
return True


def check_imported_resources(environment, kmsAlias):
if kmsAlias not in ["Undefined", "", "SSE-S3"]:
key_id = KMS.get_key_id(
account_id=environment.AwsAccountId,
region=environment.region,
key_alias=f"alias/{kmsAlias}"
)
if not key_id:
raise exceptions.AWSResourceNotFound(
action=permissions.IMPORT_DATASET,
message=f'KMS key with alias={kmsAlias} cannot be found',
)
return True


def create_dataset(context: Context, source, input=None):
with context.engine.scoped_session() as session:
environment = Environment.get_environment_by_uri(session, input.get('environmentUri'))
@@ -71,6 +87,7 @@ def import_dataset(context: Context, source, input=None):
with context.engine.scoped_session() as session:
environment = Environment.get_environment_by_uri(session, input.get('environmentUri'))
check_dataset_account(environment=environment)
check_imported_resources(environment=environment, kmsAlias=input.get('KmsKeyAlias', ""))

dataset = Dataset.create_dataset(
session=session,
@@ -83,9 +100,9 @@ def import_dataset(context: Context, source, input=None):
dataset.imported = True
dataset.importedS3Bucket = True if input['bucketName'] else False
dataset.importedGlueDatabase = True if input.get('glueDatabaseName') else False
dataset.importedKmsKey = True if input.get('KmsKeyId') else False
dataset.importedKmsKey = True if input.get('KmsKeyAlias') else False
dataset.importedAdminRole = True if input.get('adminRoleName') else False

dataset.KmsAlias = "SSE-S3" if input.get('KmsKeyAlias') == "" else input.get('KmsKeyAlias')
Dataset.create_dataset_stack(session, dataset)

indexers.upsert_dataset(
@@ -231,6 +248,7 @@ def update_dataset(context, source, datasetUri: str = None, input: dict = None):
dataset = Dataset.get_dataset_by_uri(session, datasetUri)
environment = Environment.get_environment_by_uri(session, dataset.environmentUri)
check_dataset_account(environment=environment)
check_imported_resources(environment=environment, kmsAlias=input.get('KmsAlias', ""))
updated_dataset = Dataset.update_dataset(
session=session,
username=context.username,
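For context, `check_imported_resources` above delegates alias resolution to a KMS handler. A minimal sketch of such a helper follows; its shape is assumed, and the real `KMS.get_key_id` lives in `backend/dataall/aws/handlers/kms.py` and presumably resolves the key through the environment's pivot-role session rather than default credentials.

```python
from typing import Optional

import boto3


def get_key_id(account_id: str, region: str, key_alias: str) -> Optional[str]:
    """Resolve a KMS key id from an alias (e.g. 'alias/my-imported-key').

    Sketch only: the production handler would first assume a role in
    `account_id` (e.g. via SessionHelper.remote_session) before calling KMS.
    """
    kms = boto3.client('kms', region_name=region)
    try:
        response = kms.describe_key(KeyId=key_alias)
        return response['KeyMetadata']['KeyId']
    except kms.exceptions.NotFoundException:
        # Mirrors the resolver above: a missing alias is reported as not found.
        return None
```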
10 changes: 0 additions & 10 deletions backend/dataall/api/Objects/DatasetProfiling/mutations.py
@@ -7,13 +7,3 @@
type=gql.Ref('DatasetProfilingRun'),
resolver=start_profiling_run,
)

updateDatasetProfilingRunResults = gql.MutationField(
name='updateDatasetProfilingRunResults',
args=[
gql.Argument(name='profilingRunUri', type=gql.NonNullableType(gql.String)),
gql.Argument(name='results', type=gql.NonNullableType(gql.String)),
],
type=gql.Ref('DatasetProfilingRun'),
resolver=update_profiling_run_results,
)
20 changes: 1 addition & 19 deletions backend/dataall/api/Objects/DatasetProfiling/queries.py
@@ -2,24 +2,6 @@
from .resolvers import *


getDatasetProfilingRun = gql.QueryField(
name='getDatasetProfilingRun',
args=[gql.Argument(name='profilingRunUri', type=gql.NonNullableType(gql.String))],
type=gql.Ref('DatasetProfilingRun'),
resolver=get_profiling_run,
)


listDatasetProfilingRuns = gql.QueryField(
name='listDatasetProfilingRuns',
args=[
gql.Argument(name='datasetUri', type=gql.NonNullableType(gql.String)),
gql.Argument(name='filter', type=gql.Ref('DatasetProfilingRunFilter')),
],
type=gql.Ref('DatasetProfilingRunSearchResults'),
resolver=list_profiling_runs,
)

listDatasetTableProfilingRuns = gql.QueryField(
name='listDatasetTableProfilingRuns',
args=[gql.Argument(name='tableUri', type=gql.NonNullableType(gql.String))],
@@ -31,5 +13,5 @@
name='getDatasetTableProfilingRun',
args=[gql.Argument(name='tableUri', type=gql.NonNullableType(gql.String))],
type=gql.Ref('DatasetProfilingRun'),
resolver=get_last_table_profiling_run,
resolver=get_dataset_table_profiling_run,
)
100 changes: 58 additions & 42 deletions backend/dataall/api/Objects/DatasetProfiling/resolvers.py
@@ -1,6 +1,7 @@
import json
import logging

from .... import db
from ....api.context import Context
from ....aws.handlers.service_handlers import Worker
from ....aws.handlers.sts import SessionHelper
@@ -19,7 +20,30 @@ def resolve_dataset(context, source: models.DatasetProfilingRun):
)


def resolve_profiling_run_status(context: Context, source: models.DatasetProfilingRun):
if not source:
return None
with context.engine.scoped_session() as session:
task = models.Task(
targetUri=source.profilingRunUri, action='glue.job.profiling_run_status'
)
session.add(task)
Worker.queue(engine=context.engine, task_ids=[task.taskUri])
return source.status


def resolve_profiling_results(context: Context, source: models.DatasetProfilingRun):
if not source or source.results == {}:
return None
else:
return json.dumps(source.results)


def start_profiling_run(context: Context, source, input: dict = None):
"""
Triggers profiling jobs on a Table.
Only Dataset owners with PROFILE_DATASET_TABLE can perform this action
"""
with context.engine.scoped_session() as session:

ResourcePolicy.check_user_resource_permission(
@@ -48,47 +72,14 @@ def start_profiling_run(context: Context, source, input: dict = None):
return run


def get_profiling_run_status(context: Context, source: models.DatasetProfilingRun):
if not source:
return None
with context.engine.scoped_session() as session:
task = models.Task(
targetUri=source.profilingRunUri, action='glue.job.profiling_run_status'
)
session.add(task)
Worker.queue(engine=context.engine, task_ids=[task.taskUri])
return source.status


def get_profiling_results(context: Context, source: models.DatasetProfilingRun):
if not source or source.results == {}:
return None
else:
return json.dumps(source.results)


def update_profiling_run_results(context: Context, source, profilingRunUri, results):
with context.engine.scoped_session() as session:
run = api.DatasetProfilingRun.update_run(
session=session, profilingRunUri=profilingRunUri, results=results
)
return run


def list_profiling_runs(context: Context, source, datasetUri=None):
with context.engine.scoped_session() as session:
return api.DatasetProfilingRun.list_profiling_runs(session, datasetUri)


def get_profiling_run(context: Context, source, profilingRunUri=None):
with context.engine.scoped_session() as session:
return api.DatasetProfilingRun.get_profiling_run(
session=session, profilingRunUri=profilingRunUri
)


def get_last_table_profiling_run(context: Context, source, tableUri=None):
def get_dataset_table_profiling_run(context: Context, source, tableUri=None):
"""
Shows the results of the last profiling job on a Table.
For datasets "Unclassified" all users can perform this action.
For datasets "Secret" or "Official", only users with PREVIEW_DATASET_TABLE permissions can perform this action.
"""
with context.engine.scoped_session() as session:
_check_preview_permissions_if_needed(context=context, session=session, tableUri=tableUri)
run: models.DatasetProfilingRun = (
api.DatasetProfilingRun.get_table_last_profiling_run(
session=session, tableUri=tableUri
@@ -102,7 +93,7 @@ def get_last_table_profiling_run(context: Context, source, tableUri=None):
environment = api.Environment.get_environment_by_uri(
session, dataset.environmentUri
)
content = get_profiling_results_from_s3(
content = _get_profiling_results_from_s3(
environment, dataset, table, run
)
if content:
@@ -121,7 +112,7 @@ def get_profiling_results_from_s3(environment, dataset, table, run):
return run


def get_profiling_results_from_s3(environment, dataset, table, run):
def _get_profiling_results_from_s3(environment, dataset, table, run):
s3 = SessionHelper.remote_session(environment.AwsAccountId).client(
's3', region_name=environment.region
)
@@ -141,7 +132,32 @@ def get_profiling_results_from_s3(environment, dataset, table, run):


def list_table_profiling_runs(context: Context, source, tableUri=None):
"""
Lists the runs of a profiling job on a Table.
For datasets "Unclassified" all users can perform this action.
For datasets "Secret" or "Official", only users with PREVIEW_DATASET_TABLE permissions can perform this action.
"""
with context.engine.scoped_session() as session:
_check_preview_permissions_if_needed(context=context, session=session, tableUri=tableUri)
return api.DatasetProfilingRun.list_table_profiling_runs(
session=session, tableUri=tableUri, filter={}
)


def _check_preview_permissions_if_needed(context, session, tableUri):
table: models.DatasetTable = db.api.DatasetTable.get_dataset_table_by_uri(
session, tableUri
)
dataset = db.api.Dataset.get_dataset_by_uri(session, table.datasetUri)
if (
dataset.confidentiality
!= models.ConfidentialityClassification.Unclassified.value
):
ResourcePolicy.check_user_resource_permission(
session=session,
username=context.username,
groups=context.groups,
resource_uri=table.tableUri,
permission_name=permissions.PREVIEW_DATASET_TABLE,
)
return True
8 changes: 4 additions & 4 deletions backend/dataall/api/Objects/DatasetProfiling/schema.py
@@ -1,8 +1,8 @@
from ... import gql
from .resolvers import (
resolve_dataset,
get_profiling_run_status,
get_profiling_results,
resolve_profiling_run_status,
resolve_profiling_results,
)

DatasetProfilingRun = gql.ObjectType(
@@ -16,11 +16,11 @@
gql.Field(name='GlueTriggerName', type=gql.String),
gql.Field(name='GlueTableName', type=gql.String),
gql.Field(name='AwsAccountId', type=gql.String),
gql.Field(name='results', type=gql.String, resolver=get_profiling_results),
gql.Field(name='results', type=gql.String, resolver=resolve_profiling_results),
gql.Field(name='created', type=gql.String),
gql.Field(name='updated', type=gql.String),
gql.Field(name='owner', type=gql.String),
gql.Field('status', type=gql.String, resolver=get_profiling_run_status),
gql.Field('status', type=gql.String, resolver=resolve_profiling_run_status),
gql.Field(name='dataset', type=gql.Ref('Dataset'), resolver=resolve_dataset),
],
)
8 changes: 8 additions & 0 deletions backend/dataall/api/Objects/Environment/queries.py
@@ -175,6 +175,14 @@
test_scope='Environment',
)

getCDKExecPolicyPresignedUrl = gql.QueryField(
name='getCDKExecPolicyPresignedUrl',
args=[gql.Argument(name='organizationUri', type=gql.NonNullableType(gql.String))],
type=gql.String,
resolver=get_cdk_exec_policy_template,
test_scope='Environment',
)


getPivotRoleExternalId = gql.QueryField(
name='getPivotRoleExternalId',
48 changes: 47 additions & 1 deletion backend/dataall/api/Objects/Environment/resolvers.py
@@ -700,6 +700,52 @@ def get_pivot_role_template(context: Context, source, organizationUri=None):
raise e


def get_cdk_exec_policy_template(context: Context, source, organizationUri=None):
from ....utils import Parameter

with context.engine.scoped_session() as session:
ResourcePolicy.check_user_resource_permission(
session=session,
username=context.username,
groups=context.groups,
resource_uri=organizationUri,
permission_name=permissions.GET_ORGANIZATION,
)
cdk_exec_policy_bucket = Parameter().get_parameter(
env=os.getenv('envname', 'local'), path='s3/resources_bucket_name'
)
cdk_exec_policy_bucket_key = Parameter().get_parameter(
env=os.getenv('envname', 'local'), path='s3/cdk_exec_policy_prefix'
)
if not cdk_exec_policy_bucket or not cdk_exec_policy_bucket_key:
raise exceptions.AWSResourceNotFound(
action='GET_CDK_EXEC_POLICY_TEMPLATE',
message='CDK Exec Yaml template file could not be found on Amazon S3 bucket',
)
try:
s3_client = boto3.client(
's3',
region_name=os.getenv('AWS_REGION', 'eu-central-1'),
config=Config(
signature_version='s3v4', s3={'addressing_style': 'virtual'}
),
)
presigned_url = s3_client.generate_presigned_url(
'get_object',
Params=dict(
Bucket=cdk_exec_policy_bucket,
Key=cdk_exec_policy_bucket_key,
),
ExpiresIn=15 * 60,
)
return presigned_url
except ClientError as e:
log.error(
f'Failed to get presigned URL for CDK Exec role template due to: {e}'
)
raise e


def get_external_id(context: Context, source, organizationUri=None):
with context.engine.scoped_session() as session:
ResourcePolicy.check_user_resource_permission(
@@ -731,6 +777,6 @@ def get_pivot_role_name(context: Context, source, organizationUri=None):
if not pivot_role_name:
raise exceptions.AWSResourceNotFound(
action='GET_PIVOT_ROLE_NAME',
message='Pivot role name could not be found on AWS Secretsmanager',
message='Pivot role name could not be found on AWS Systems Manager - Parameter Store',
)
return pivot_role_name
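
As a usage note for the new `getCDKExecPolicyPresignedUrl` query above, here is a minimal, hypothetical client-side sketch of consuming the returned URL; the function and file names are illustrative, and the 15-minute validity comes from `ExpiresIn=15 * 60` in the resolver.

```python
# Hypothetical helper: download the CDK exec policy template via the presigned
# URL returned by the getCDKExecPolicyPresignedUrl GraphQL query.
import urllib.request


def download_cdk_exec_policy(presigned_url: str, target_path: str = "cdk-exec-policy.yaml") -> None:
    # The URL is only valid for 15 minutes, so download it promptly.
    with urllib.request.urlopen(presigned_url, timeout=30) as response:
        with open(target_path, "wb") as f:
            f.write(response.read())
```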
1 change: 1 addition & 0 deletions backend/dataall/api/Objects/ShareObject/input_types.py
@@ -8,6 +8,7 @@
gql.Argument(name='groupUri', type=gql.NonNullableType(gql.String)),
gql.Argument(name='principalId', type=gql.NonNullableType(gql.String)),
gql.Argument(name='principalType', type=gql.NonNullableType(gql.String)),
gql.Argument(name='requestPurpose', type=gql.String),
],
)
