From f6a7f309f69c2d3429ef2561940cba3415a33db3 Mon Sep 17 00:00:00 2001 From: Markus Cozowicz Date: Sat, 8 Jun 2024 09:40:27 +0000 Subject: [PATCH] cleanup for docs --- docs/requirements.txt | 4 +- sempy_labs/AI.py | 409 ++-- sempy_labs/Connections.py | 212 +- sempy_labs/CreatePQTFile.py | 191 -- sempy_labs/DirectLakeSchemaCompare.py | 87 - sempy_labs/GenerateReport.py | 255 --- sempy_labs/GetLakehouseTables.py | 159 -- sempy_labs/GetMeasureDependencies.py | 203 +- sempy_labs/GetSemanticModelBim.py | 59 +- sempy_labs/Guardrails.py | 37 +- sempy_labs/LogAnalytics.py | 46 +- sempy_labs/MeasureDependencyTree.py | 39 +- sempy_labs/MigrateCalcTablesToLakehouse.py | 311 --- .../MigrateCalcTablesToSemanticModel.py | 123 -- .../MigrateModelObjectsToSemanticModel.py | 324 --- .../MigrateTablesColumnsToSemanticModel.py | 135 -- sempy_labs/MigrationValidation.py | 133 -- sempy_labs/ModelAutoBuild.py | 150 +- sempy_labs/ModelBPA.py | 1599 +++++++++----- sempy_labs/OneLakeIntegration.py | 124 +- sempy_labs/QSO.py | 289 ++- sempy_labs/RefreshCalcTables.py | 107 +- sempy_labs/RefreshSemanticModel.py | 122 +- sempy_labs/ReportFunctions.py | 742 ------- .../ShowUnsupportedDirectLakeObjects.py | 68 - sempy_labs/TOM.py | 1926 +++++++++++------ sempy_labs/Translations.py | 385 +++- sempy_labs/Vertipaq.py | 976 ++++++--- sempy_labs/WarmCache.py | 187 +- sempy_labs/__init__.py | 28 +- sempy_labs/_clear_cache.py | 22 +- ...del.py => _create_blank_semantic_model.py} | 40 +- sempy_labs/_create_pqt_file.py | 238 ++ sempy_labs/{Fallback.py => _fallback.py} | 45 +- ...icModel.py => _generate_semantic_model.py} | 105 +- ...elperFunctions.py => _helper_functions.py} | 243 ++- sempy_labs/_icons.py | 4 + .../{ListFunctions.py => _list_functions.py} | 925 +++++--- sempy_labs/directlake/__init__.py | 24 + .../directlake/_directlake_schema_compare.py | 108 + .../_directlake_schema_sync.py} | 97 +- .../_get_directlake_lakehouse.py} | 34 +- .../_get_shared_expression.py} | 39 +- .../_list_directlake_model_calc_tables.py} | 26 +- .../_show_unsupported_directlake_objects.py | 88 + ..._directlake_model_lakehouse_connection.py} | 53 +- .../_update_directlake_partition_entity.py} | 46 +- sempy_labs/lakehouse/__init__.py | 10 + .../_get_lakehouse_columns.py} | 48 +- sempy_labs/lakehouse/_get_lakehouse_tables.py | 248 +++ .../{Lakehouse.py => lakehouse/_lakehouse.py} | 49 +- sempy_labs/migration/__init__.py | 16 + .../_migrate_calctables_to_lakehouse.py | 433 ++++ .../_migrate_calctables_to_semantic_model.py | 153 ++ ...migrate_model_objects_to_semantic_model.py | 524 +++++ ...igrate_tables_columns_to_semantic_model.py | 169 ++ sempy_labs/migration/_migration_validation.py | 230 ++ sempy_labs/report/__init__.py | 15 + sempy_labs/report/_generate_report.py | 260 +++ sempy_labs/report/_report_functions.py | 869 ++++++++ .../_report_rebind.py} | 69 +- sempy_labs/shortcuts.py | 237 +- 62 files changed, 9307 insertions(+), 5590 deletions(-) delete mode 100644 sempy_labs/CreatePQTFile.py delete mode 100644 sempy_labs/DirectLakeSchemaCompare.py delete mode 100644 sempy_labs/GenerateReport.py delete mode 100644 sempy_labs/GetLakehouseTables.py delete mode 100644 sempy_labs/MigrateCalcTablesToLakehouse.py delete mode 100644 sempy_labs/MigrateCalcTablesToSemanticModel.py delete mode 100644 sempy_labs/MigrateModelObjectsToSemanticModel.py delete mode 100644 sempy_labs/MigrateTablesColumnsToSemanticModel.py delete mode 100644 sempy_labs/MigrationValidation.py delete mode 100644 sempy_labs/ReportFunctions.py delete mode 100644 
sempy_labs/ShowUnsupportedDirectLakeObjects.py rename sempy_labs/{CreateBlankSemanticModel.py => _create_blank_semantic_model.py} (57%) create mode 100644 sempy_labs/_create_pqt_file.py rename sempy_labs/{Fallback.py => _fallback.py} (55%) rename sempy_labs/{GenerateSemanticModel.py => _generate_semantic_model.py} (56%) rename sempy_labs/{HelperFunctions.py => _helper_functions.py} (64%) create mode 100644 sempy_labs/_icons.py rename sempy_labs/{ListFunctions.py => _list_functions.py} (55%) create mode 100644 sempy_labs/directlake/__init__.py create mode 100644 sempy_labs/directlake/_directlake_schema_compare.py rename sempy_labs/{DirectLakeSchemaSync.py => directlake/_directlake_schema_sync.py} (50%) rename sempy_labs/{GetDirectLakeLakehouse.py => directlake/_get_directlake_lakehouse.py} (69%) rename sempy_labs/{GetSharedExpression.py => directlake/_get_shared_expression.py} (54%) rename sempy_labs/{ListDirectLakeModelCalcTables.py => directlake/_list_directlake_model_calc_tables.py} (68%) create mode 100644 sempy_labs/directlake/_show_unsupported_directlake_objects.py rename sempy_labs/{UpdateDirectLakeModelLakehouseConnection.py => directlake/_update_directlake_model_lakehouse_connection.py} (51%) rename sempy_labs/{UpdateDirectLakePartitionEntity.py => directlake/_update_directlake_partition_entity.py} (58%) create mode 100644 sempy_labs/lakehouse/__init__.py rename sempy_labs/{GetLakehouseColumns.py => lakehouse/_get_lakehouse_columns.py} (58%) create mode 100644 sempy_labs/lakehouse/_get_lakehouse_tables.py rename sempy_labs/{Lakehouse.py => lakehouse/_lakehouse.py} (67%) create mode 100644 sempy_labs/migration/__init__.py create mode 100644 sempy_labs/migration/_migrate_calctables_to_lakehouse.py create mode 100644 sempy_labs/migration/_migrate_calctables_to_semantic_model.py create mode 100644 sempy_labs/migration/_migrate_model_objects_to_semantic_model.py create mode 100644 sempy_labs/migration/_migrate_tables_columns_to_semantic_model.py create mode 100644 sempy_labs/migration/_migration_validation.py create mode 100644 sempy_labs/report/__init__.py create mode 100644 sempy_labs/report/_generate_report.py create mode 100644 sempy_labs/report/_report_functions.py rename sempy_labs/{ReportRebind.py => report/_report_rebind.py} (67%) diff --git a/docs/requirements.txt b/docs/requirements.txt index e42cbf45..75a5603a 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -5,4 +5,6 @@ pandas==2.0.3 azure-identity==1.7.1 azure-keyvault-secrets azure-storage-file-datalake==12.3.1 -azure-storage-blob>=12.9.0 \ No newline at end of file +azure-storage-blob>=12.9.0 +anytree +IPython \ No newline at end of file diff --git a/sempy_labs/AI.py b/sempy_labs/AI.py index e70aaa30..c6ecd9fd 100644 --- a/sempy_labs/AI.py +++ b/sempy_labs/AI.py @@ -6,66 +6,94 @@ from pyspark.sql import SparkSession from typing import List, Optional, Union + def optimize_semantic_model(dataset: str, workspace: Optional[str] = None): from .ModelBPA import run_model_bpa - from .Fallback import check_fallback_reason - from .HelperFunctions import format_dax_object_name - - modelBPA = run_model_bpa(dataset = dataset, workspace = workspace, return_dataframe = True) - dfC = fabric.list_columns(dataset = dataset, workspace = workspace, extended = True) - dfC['Column Object'] = format_dax_object_name(dfC['Table Name'], dfC['Column Name']) - dfC['Total Size'] = dfC['Total Size'].astype('int') - dfP = fabric.list_partitions(dataset = dataset, workspace = workspace) + from ._fallback import check_fallback_reason + 
from ._helper_functions import format_dax_object_name - modelBPA_col = modelBPA[modelBPA['Object Type'] == 'Column'] - modelBPA_col = pd.merge(modelBPA_col, dfC[['Column Object', 'Total Size']], left_on = 'Object Name', right_on = 'Column Object', how = 'left') + modelBPA = run_model_bpa( + dataset=dataset, workspace=workspace, return_dataframe=True + ) + dfC = fabric.list_columns(dataset=dataset, workspace=workspace, extended=True) + dfC["Column Object"] = format_dax_object_name(dfC["Table Name"], dfC["Column Name"]) + dfC["Total Size"] = dfC["Total Size"].astype("int") + dfP = fabric.list_partitions(dataset=dataset, workspace=workspace) + + modelBPA_col = modelBPA[modelBPA["Object Type"] == "Column"] + modelBPA_col = pd.merge( + modelBPA_col, + dfC[["Column Object", "Total Size"]], + left_on="Object Name", + right_on="Column Object", + how="left", + ) - isDirectLake = any(r['Mode'] == 'DirectLake' for i, r in dfP.iterrows()) + isDirectLake = any(r["Mode"] == "DirectLake" for i, r in dfP.iterrows()) if isDirectLake: - fallback = check_fallback_reason(dataset = dataset, workspace = workspace) - fallback_filt = fallback[fallback['FallbackReasonID']== 2] + fallback = check_fallback_reason(dataset=dataset, workspace=workspace) + fallback_filt = fallback[fallback["FallbackReasonID"] == 2] if len(fallback_filt) > 0: - print(f"The '{dataset}' semantic model is a Direct Lake semantic model which contains views. Since views always fall back to DirectQuery, it is recommended to only use lakehouse tables and not views.") + print( + f"The '{dataset}' semantic model is a Direct Lake semantic model which contains views. Since views always fall back to DirectQuery, it is recommended to only use lakehouse tables and not views." + ) # Potential model reduction estimate - ruleNames = ['Remove unnecessary columns','Set IsAvailableInMdx to false on non-attribute columns'] + ruleNames = [ + "Remove unnecessary columns", + "Set IsAvailableInMdx to false on non-attribute columns", + ] for rule in ruleNames: - df = modelBPA_col[modelBPA_col['Rule Name'] == rule] - df_filt = df[['Object Name', 'Total Size']].sort_values(by='Total Size', ascending=False) - totSize = df['Total Size'].sum() + df = modelBPA_col[modelBPA_col["Rule Name"] == rule] + df_filt = df[["Object Name", "Total Size"]].sort_values( + by="Total Size", ascending=False + ) + totSize = df["Total Size"].sum() if len(df_filt) > 0: - print(f"Potential savings of {totSize} bytes from following the '{rule}' rule.") + print( + f"Potential savings of {totSize} bytes from following the '{rule}' rule." + ) display(df_filt) else: print(f"The '{rule}' rule has been followed.") -def generate_measure_descriptions(dataset: str, measures: Union[str,List[str]], gpt_model: Optional[str] = 'gpt-35-turbo', workspace: Optional[str] = None): +def generate_measure_descriptions( + dataset: str, + measures: Union[str, List[str]], + gpt_model: Optional[str] = "gpt-35-turbo", + workspace: Optional[str] = None, +): - service_name = 'synapseml-openai' + service_name = "synapseml-openai" if isinstance(measures, str): measures = [measures] - validModels = ['gpt-35-turbo', 'gpt-35-turbo-16k', 'gpt-4'] + validModels = ["gpt-35-turbo", "gpt-35-turbo-16k", "gpt-4"] if gpt_model not in validModels: - print(f"The '{gpt_model}' model is not a valid model. Enter a gpt_model from this list: {validModels}.") + print( + f"The '{gpt_model}' model is not a valid model. Enter a gpt_model from this list: {validModels}." 
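# A minimal usage sketch for optimize_semantic_model (not part of the patch), assuming the
# function remains importable from sempy_labs.AI after this cleanup; the dataset and
# workspace names below are placeholders.
#
# from sempy_labs.AI import optimize_semantic_model
#
# # Runs the model Best Practice Analyzer, joins the results with column sizes, and prints
# # the potential byte savings for the "Remove unnecessary columns" and
# # "Set IsAvailableInMdx to false on non-attribute columns" rules. For Direct Lake models
# # it also warns when the model is built on views, which always fall back to DirectQuery.
# optimize_semantic_model(dataset="AdventureWorks", workspace="Sales Analytics")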
+ ) return - dfM = fabric.list_measures(dataset = dataset, workspace = workspace) + dfM = fabric.list_measures(dataset=dataset, workspace=workspace) if measures is not None: - dfM_filt = dfM[dfM['Measure Name'].isin(measures)] + dfM_filt = dfM[dfM["Measure Name"].isin(measures)] else: dfM_filt = dfM - df = dfM_filt[['Table Name', 'Measure Name', 'Measure Expression']] + df = dfM_filt[["Table Name", "Measure Name", "Measure Expression"]] - df['prompt'] = f"The following is DAX code used by Microsoft Power BI. Please explain this code in simple terms:" +df['Measure Expression'] + df["prompt"] = ( + f"The following is DAX code used by Microsoft Power BI. Please explain this code in simple terms:" + + df["Measure Expression"] + ) # Generate new column in df dataframe which has the AI-generated descriptions completion = { @@ -73,43 +101,55 @@ def generate_measure_descriptions(dataset: str, measures: Union[str,List[str]], .setDeploymentName(gpt_model) .setMaxTokens(200) .setCustomServiceName(service_name) - .setPromptCol('prompt') - .setErrorCol('error') - .setOutputCol('completions') + .setPromptCol("prompt") + .setErrorCol("error") + .setOutputCol("completions") } completed_df = completion.transform(df).cache() completed_df.select( - col('prompt'), - col('error'), - col('completions.choices.text').getItem(0).alias('text'), + col("prompt"), + col("error"), + col("completions.choices.text").getItem(0).alias("text"), ) # Update the model to use the new descriptions tom_server = fabric.create_tom_server(readonly=False, workspace=workspace) m = tom_server.Databases.GetByName(dataset).Model - #for t in m.Tables: - #tName = t.Name - #for ms in t.Measures: - #mName = ms.Name - #mDesc = promptValue + # for t in m.Tables: + # tName = t.Name + # for ms in t.Measures: + # mName = ms.Name + # mDesc = promptValue + + # m.SaveChanges() - #m.SaveChanges() -def generate_aggs(dataset: str, table_name: str, columns: Union[str,List[str]], workspace: Optional[str] = None, lakehouse_workspace: Optional[str] = None): +def generate_aggs( + dataset: str, + table_name: str, + columns: Union[str, List[str]], + workspace: Optional[str] = None, + lakehouse_workspace: Optional[str] = None, +): + + from ._helper_functions import ( + get_direct_lake_sql_endpoint, + create_abfss_path, + format_dax_object_name, + resolve_lakehouse_id, + ) - from .HelperFunctions import get_direct_lake_sql_endpoint, create_abfss_path, format_dax_object_name, resolve_lakehouse_id - sempy.fabric._client._utils._init_analysis_services() import Microsoft.AnalysisServices.Tabular as TOM import System - #columns = { + # columns = { #'SalesAmount': 'Sum', #'ProductKey': 'GroupBy', #'OrderDateKey': 'GroupBy' - #} + # } if workspace == None: workspace_id = fabric.get_workspace_id() @@ -125,71 +165,87 @@ def generate_aggs(dataset: str, table_name: str, columns: Union[str,List[str]], columns = [columns] columnValues = columns.keys() - - aggTypes = ['Sum', 'Count', 'Min', 'Max', 'GroupBy'] - aggTypesAggregate = ['Sum', 'Count', 'Min', 'Max'] - numericTypes = ['Int64', 'Double', 'Decimal'] + + aggTypes = ["Sum", "Count", "Min", "Max", "GroupBy"] + aggTypesAggregate = ["Sum", "Count", "Min", "Max"] + numericTypes = ["Int64", "Double", "Decimal"] if any(value not in aggTypes for value in columns.values()): - print(f"Invalid aggregation type(s) have been specified in the 'columns' parameter. Valid aggregation types: {aggTypes}.") + print( + f"Invalid aggregation type(s) have been specified in the 'columns' parameter. Valid aggregation types: {aggTypes}." 
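# A usage sketch for generate_measure_descriptions (not part of the patch), assuming a
# SynapseML Azure OpenAI deployment is reachable under the 'synapseml-openai' service name
# the function uses; the dataset, workspace, and measure names below are placeholders.
#
# from sempy_labs.AI import generate_measure_descriptions
#
# generate_measure_descriptions(
#     dataset="AdventureWorks",
#     measures=["Total Sales", "Profit Margin"],  # a single measure name (str) also works
#     gpt_model="gpt-35-turbo",                   # or gpt-35-turbo-16k, gpt-4
#     workspace="Sales Analytics",
# )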
+ ) return - dfC = fabric.list_columns(dataset = dataset, workspace = workspace) - dfP = fabric.list_partitions(dataset = dataset, workspace = workspace) - dfM = fabric.list_measures(dataset = dataset, workspace = workspace) - dfR = fabric.list_relationships(dataset = dataset, workspace = workspace) - if not any(r['Mode'] == 'DirectLake' for i, r in dfP.iterrows()): - print(f"The '{dataset}' semantic model within the '{workspace}' workspace is not in Direct Lake mode. This function is only relevant for Direct Lake semantic models.") + dfC = fabric.list_columns(dataset=dataset, workspace=workspace) + dfP = fabric.list_partitions(dataset=dataset, workspace=workspace) + dfM = fabric.list_measures(dataset=dataset, workspace=workspace) + dfR = fabric.list_relationships(dataset=dataset, workspace=workspace) + if not any(r["Mode"] == "DirectLake" for i, r in dfP.iterrows()): + print( + f"The '{dataset}' semantic model within the '{workspace}' workspace is not in Direct Lake mode. This function is only relevant for Direct Lake semantic models." + ) return - - dfC_filtT = dfC[dfC['Table Name'] == table_name] + + dfC_filtT = dfC[dfC["Table Name"] == table_name] if len(dfC_filtT) == 0: - print(f"The '{table_name}' table does not exist in the '{dataset}' semantic model within the '{workspace}' workspace.") + print( + f"The '{table_name}' table does not exist in the '{dataset}' semantic model within the '{workspace}' workspace." + ) return - - dfC_filt = dfC[(dfC['Table Name'] == table_name) & (dfC['Column Name'].isin(columnValues))] + + dfC_filt = dfC[ + (dfC["Table Name"] == table_name) & (dfC["Column Name"].isin(columnValues)) + ] if len(columns) != len(dfC_filt): - print(f"Columns listed in '{columnValues}' do not exist in the '{table_name}' table in the '{dataset}' semantic model within the '{workspace}' workspace.") + print( + f"Columns listed in '{columnValues}' do not exist in the '{table_name}' table in the '{dataset}' semantic model within the '{workspace}' workspace." + ) return - + # Check if doing sum/count/min/max etc. on a non-number column - for col,agg in columns.items(): - dfC_col = dfC_filt[dfC_filt['Column Name'] == col] - dataType = dfC_col['Data Type'].iloc[0] + for col, agg in columns.items(): + dfC_col = dfC_filt[dfC_filt["Column Name"] == col] + dataType = dfC_col["Data Type"].iloc[0] if agg in aggTypesAggregate and dataType not in numericTypes: - print(f"The '{col}' column in the '{table_name}' table is of '{dataType}' data type. Only columns of '{numericTypes}' data types can be aggregated as '{aggTypesAggregate}' aggregation types.") + print( + f"The '{col}' column in the '{table_name}' table is of '{dataType}' data type. Only columns of '{numericTypes}' data types can be aggregated as '{aggTypesAggregate}' aggregation types." 
+ ) return # Create/update lakehouse delta agg table - aggSuffix = '_agg' + aggSuffix = "_agg" aggTableName = f"{table_name}{aggSuffix}" - aggLakeTName = aggTableName.lower().replace(' ','_') - dfP = fabric.list_partitions(dataset = dataset, workspace = workspace) - dfP_filt = dfP[dfP['Table Name'] == table_name] - lakeTName = dfP_filt['Query'].iloc[0] + aggLakeTName = aggTableName.lower().replace(" ", "_") + dfP = fabric.list_partitions(dataset=dataset, workspace=workspace) + dfP_filt = dfP[dfP["Table Name"] == table_name] + lakeTName = dfP_filt["Query"].iloc[0] - sqlEndpointId = get_direct_lake_sql_endpoint(dataset = dataset, workspace = workspace) + sqlEndpointId = get_direct_lake_sql_endpoint(dataset=dataset, workspace=workspace) - dfI = fabric.list_items(workspace = lakehouse_workspace, type = 'SQLEndpoint') - dfI_filt = dfI[(dfI['Id'] == sqlEndpointId)] + dfI = fabric.list_items(workspace=lakehouse_workspace, type="SQLEndpoint") + dfI_filt = dfI[(dfI["Id"] == sqlEndpointId)] if len(dfI_filt) == 0: - print(f"The lakehouse (SQL Endpoint) used by the '{dataset}' semantic model does not reside in the '{lakehouse_workspace}' workspace. Please update the lakehouse_workspace parameter.") + print( + f"The lakehouse (SQL Endpoint) used by the '{dataset}' semantic model does not reside in the '{lakehouse_workspace}' workspace. Please update the lakehouse_workspace parameter." + ) return - - lakehouseName = dfI_filt['Display Name'].iloc[0] - lakehouse_id = resolve_lakehouse_id(lakehouse = lakehouseName, workspace = lakehouse_workspace) + + lakehouseName = dfI_filt["Display Name"].iloc[0] + lakehouse_id = resolve_lakehouse_id( + lakehouse=lakehouseName, workspace=lakehouse_workspace + ) # Generate SQL query - query = 'SELECT' - groupBy = '\nGROUP BY' + query = "SELECT" + groupBy = "\nGROUP BY" for col, agg in columns.items(): - colFilt = dfC_filt[dfC_filt['Column Name'] == col] - sourceCol = colFilt['Source'].iloc[0] + colFilt = dfC_filt[dfC_filt["Column Name"] == col] + sourceCol = colFilt["Source"].iloc[0] - if agg == 'GroupBy': + if agg == "GroupBy": query = f"{query}\n{sourceCol}," groupBy = f"{groupBy}\n{sourceCol}," else: @@ -198,12 +254,16 @@ def generate_aggs(dataset: str, table_name: str, columns: Union[str,List[str]], query = query[:-1] spark = SparkSession.builder.getOrCreate() - fromTablePath = create_abfss_path(lakehouse_id=lakehouse_id, lakehouse_workspace_id=lakehouse_workspace_id, delta_table_name=lakeTName) + fromTablePath = create_abfss_path( + lakehouse_id=lakehouse_id, + lakehouse_workspace_id=lakehouse_workspace_id, + delta_table_name=lakeTName, + ) df = spark.read.format("delta").load(fromTablePath) - tempTableName = 'delta_table_' + lakeTName + tempTableName = "delta_table_" + lakeTName df.createOrReplaceTempView(tempTableName) sqlQuery = f"{query} \n FROM {tempTableName} {groupBy}" - + sqlQuery = sqlQuery[:-1] print(sqlQuery) @@ -211,20 +271,24 @@ def generate_aggs(dataset: str, table_name: str, columns: Union[str,List[str]], spark_df = spark.sql(sqlQuery) f"\nCreating/updating the '{aggLakeTName}' table in the lakehouse..." 
# Write spark dataframe to delta table - aggFilePath = create_abfss_path(lakehouse_id = lakehouse_id, lakehouse_workspace_id = lakehouse_workspace_id, delta_table_name = aggLakeTName) - spark_df.write.mode('overwrite').format('delta').save(aggFilePath) + aggFilePath = create_abfss_path( + lakehouse_id=lakehouse_id, + lakehouse_workspace_id=lakehouse_workspace_id, + delta_table_name=aggLakeTName, + ) + spark_df.write.mode("overwrite").format("delta").save(aggFilePath) f"The '{aggLakeTName}' table has been created/updated in the lakehouse." # Create/update semantic model agg table tom_server = fabric.create_tom_server(readonly=False, workspace=workspace) m = tom_server.Databases.GetByName(dataset).Model f"\nUpdating the '{dataset}' semantic model..." - dfC_agg = dfC[dfC['Table Name'] == aggTableName] + dfC_agg = dfC[dfC["Table Name"] == aggTableName] if len(dfC_agg) == 0: print(f"Creating the '{aggTableName}' table...") - exp = m.Expressions['DatabaseQuery'] - tbl = TOM.Table() + exp = m.Expressions["DatabaseQuery"] + tbl = TOM.Table() tbl.Name = aggTableName tbl.IsHidden = True @@ -241,9 +305,9 @@ def generate_aggs(dataset: str, table_name: str, columns: Union[str,List[str]], tbl.Partitions.Add(part) for i, r in dfC_filt.iterrows(): - scName = r['Source'] - cName = r['Column Name'] - dType = r['Data Type'] + scName = r["Source"] + cName = r["Column Name"] + dType = r["Data Type"] col = TOM.DataColumn() col.Name = cName @@ -252,10 +316,14 @@ def generate_aggs(dataset: str, table_name: str, columns: Union[str,List[str]], col.DataType = System.Enum.Parse(TOM.DataType, dType) tbl.Columns.Add(col) - print(f"The '{aggTableName}'[{cName}] column has been added to the '{dataset}' semantic model.") + print( + f"The '{aggTableName}'[{cName}] column has been added to the '{dataset}' semantic model." + ) m.Tables.Add(tbl) - print(f"The '{aggTableName}' table has been added to the '{dataset}' semantic model.") + print( + f"The '{aggTableName}' table has been added to the '{dataset}' semantic model." 
+ ) else: print(f"Updating the '{aggTableName}' table's columns...") # Remove existing columns @@ -267,9 +335,9 @@ def generate_aggs(dataset: str, table_name: str, columns: Union[str,List[str]], m.Tables[tName].Columns.Remove(cName) # Add columns for i, r in dfC_filt.iterrows(): - scName = r['Source'] - cName = r['Column Name'] - dType = r['Data Type'] + scName = r["Source"] + cName = r["Column Name"] + dType = r["Data Type"] col = TOM.DataColumn() col.Name = cName @@ -281,58 +349,68 @@ def generate_aggs(dataset: str, table_name: str, columns: Union[str,List[str]], print(f"The '{aggTableName}'[{cName}] column has been added.") # Create relationships - relMap = { - 'm': 'Many', - '1': 'One', - '0': 'None' - } + relMap = {"m": "Many", "1": "One", "0": "None"} print(f"\nGenerating necessary relationships...") for i, r in dfR.iterrows(): - fromTable = r['From Table'] - fromColumn = r['From Column'] - toTable = r['To Table'] - toColumn = r['To Column'] - cfb = r['Cross Filtering Behavior'] - sfb = r['Security Filtering Behavior'] - mult = r['Multiplicity'] - - crossFB = System.Enum.Parse(TOM.CrossFilteringBehavior,cfb) - secFB = System.Enum.Parse(TOM.SecurityFilteringBehavior,sfb) - fromCardinality = System.Enum.Parse(TOM.RelationshipEndCardinality, relMap.get(mult[0])) - toCardinality = System.Enum.Parse(TOM.RelationshipEndCardinality, relMap.get(mult[-1])) - + fromTable = r["From Table"] + fromColumn = r["From Column"] + toTable = r["To Table"] + toColumn = r["To Column"] + cfb = r["Cross Filtering Behavior"] + sfb = r["Security Filtering Behavior"] + mult = r["Multiplicity"] + + crossFB = System.Enum.Parse(TOM.CrossFilteringBehavior, cfb) + secFB = System.Enum.Parse(TOM.SecurityFilteringBehavior, sfb) + fromCardinality = System.Enum.Parse( + TOM.RelationshipEndCardinality, relMap.get(mult[0]) + ) + toCardinality = System.Enum.Parse( + TOM.RelationshipEndCardinality, relMap.get(mult[-1]) + ) + rel = TOM.SingleColumnRelationship() rel.FromCardinality = fromCardinality rel.ToCardinality = toCardinality - rel.IsActive = r['Active'] + rel.IsActive = r["Active"] rel.CrossFilteringBehavior = crossFB rel.SecurityFilteringBehavior = secFB - rel.RelyOnReferentialIntegrity = r['Rely On Referential Integrity'] + rel.RelyOnReferentialIntegrity = r["Rely On Referential Integrity"] if fromTable == table_name: try: rel.FromColumn = m.Tables[aggTableName].Columns[fromColumn] m.Relationships.Add(rel) - print(f"'{aggTableName}'[{fromColumn}] -> '{toTable}'[{toColumn}] relationship has been added.") + print( + f"'{aggTableName}'[{fromColumn}] -> '{toTable}'[{toColumn}] relationship has been added." + ) except: - print(f"'{aggTableName}'[{fromColumn}] -> '{toTable}'[{toColumn}] relationship has not been created.") - elif toTable == table_name: + print( + f"'{aggTableName}'[{fromColumn}] -> '{toTable}'[{toColumn}] relationship has not been created." + ) + elif toTable == table_name: try: rel.ToColumn = m.Tables[aggTableName].Columns[toColumn] m.Relationships.Add(rel) - print(f"'{fromTable}'[{fromColumn}] -> '{aggTableName}'[{toColumn}] relationship has been added.") + print( + f"'{fromTable}'[{fromColumn}] -> '{aggTableName}'[{toColumn}] relationship has been added." + ) except: - print(f"'{fromTable}'[{fromColumn}] -> '{aggTableName}'[{toColumn}] relationship has not been created.") + print( + f"'{fromTable}'[{fromColumn}] -> '{aggTableName}'[{toColumn}] relationship has not been created." + ) f"Relationship creation is complete." # Create IF measure f"\nCreating measure to check if the agg table can be used..." 
- aggChecker = 'IF(' - dfR_filt = dfR[(dfR['From Table'] == table_name) & (~dfR['From Column'].isin(columnValues))] + aggChecker = "IF(" + dfR_filt = dfR[ + (dfR["From Table"] == table_name) & (~dfR["From Column"].isin(columnValues)) + ] for i, r in dfR_filt.iterrows(): - toTable = r['To Table'] + toTable = r["To Table"] aggChecker = f"{aggChecker}\nISCROSSFILTERED('{toTable}') ||" aggChecker = aggChecker[:-3] @@ -342,7 +420,10 @@ def generate_aggs(dataset: str, table_name: str, columns: Union[str,List[str]], # Todo: add IFISFILTERED clause for columns f"\n Creating the base measures in the agg table..." # Create base agg measures - dep = fabric.evaluate_dax(dataset = dataset, workspace = workspace, dax_string = """ + dep = fabric.evaluate_dax( + dataset=dataset, + workspace=workspace, + dax_string=""" SELECT [TABLE] AS [Table Name] ,[OBJECT] AS [Object Name] @@ -352,27 +433,32 @@ def generate_aggs(dataset: str, table_name: str, columns: Union[str,List[str]], ,[REFERENCED_OBJECT_TYPE] AS [Referenced Object Type] FROM $SYSTEM.DISCOVER_CALC_DEPENDENCY WHERE [OBJECT_TYPE] = 'MEASURE' - """) - - baseMeasures = dep[(dep['Referenced Object Type'] == 'COLUMN') & (dep['Referenced Table'] == table_name) & (dep['Referenced Object'].isin(columnValues))] + """, + ) + + baseMeasures = dep[ + (dep["Referenced Object Type"] == "COLUMN") + & (dep["Referenced Table"] == table_name) + & (dep["Referenced Object"].isin(columnValues)) + ] for i, r in baseMeasures.iterrows(): - tName = r['Table Name'] - mName = r['Object Name'] - cName = r['Referenced Object'] - dfM_filt = dfM[dfM['Measure Name'] == mName] - expr = dfM_filt['Measure Expression'].iloc[0] + tName = r["Table Name"] + mName = r["Object Name"] + cName = r["Referenced Object"] + dfM_filt = dfM[dfM["Measure Name"] == mName] + expr = dfM_filt["Measure Expression"].iloc[0] colFQNonAgg = format_dax_object_name(tName, cName) colFQAgg = format_dax_object_name(aggTableName, cName) colNQNonAgg = f"{tName}[{cName}]" - if ' ' in tName: - newExpr = expr.replace(colFQNonAgg,colFQAgg) + if " " in tName: + newExpr = expr.replace(colFQNonAgg, colFQAgg) else: - newExpr = expr.replace(colFQNonAgg, colFQAgg).replace(colNQNonAgg,colFQAgg) + newExpr = expr.replace(colFQNonAgg, colFQAgg).replace(colNQNonAgg, colFQAgg) print(expr) print(newExpr) - + aggMName = mName + aggSuffix measure = TOM.Measure() measure.Name = aggMName @@ -380,39 +466,30 @@ def generate_aggs(dataset: str, table_name: str, columns: Union[str,List[str]], measure.Expression = newExpr m.Tables[aggTableName].Measures.Add(measure) f"The '{aggMName}' measure has been created in the '{aggTableName}' table." 
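# A usage sketch for generate_aggs (not part of the patch). Although the signature hints at
# Union[str, List[str]], the body calls columns.keys()/.values()/.items(), i.e. it expects a
# mapping of column name -> aggregation type ('Sum', 'Count', 'Min', 'Max', 'GroupBy'), as in
# the commented-out example inside the function. The names below are placeholders; the target
# model must be in Direct Lake mode.
#
# from sempy_labs.AI import generate_aggs
#
# generate_aggs(
#     dataset="AdventureWorks",
#     table_name="FactInternetSales",
#     columns={"SalesAmount": "Sum", "ProductKey": "GroupBy", "OrderDateKey": "GroupBy"},
#     workspace="Sales Analytics",
# )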
- - # Update base detail measures - - #m.SaveChanges() - - - - - - + # Update base detail measures + # m.SaveChanges() # Identify views used within Direct Lake model -#workspace = 'MK Demo 6' -#lakehouse = 'MyLakehouse' -#dataset = 'MigrationTest' -#lakehouse_workspace = workspace - -#dfView = pd.DataFrame(columns=['Workspace Name', 'Lakehouse Name', 'View Name']) -#dfP = fabric.list_partitions(dataset = dataset, workspace = workspace) -#isDirectLake = any(r['Mode'] == 'DirectLake' for i, r in dfP.iterrows()) - -#spark = SparkSession.builder.getOrCreate() -#views = spark.sql(f"SHOW VIEWS IN {lakehouse}").collect() -#for view in views: +# workspace = 'MK Demo 6' +# lakehouse = 'MyLakehouse' +# dataset = 'MigrationTest' +# lakehouse_workspace = workspace + +# dfView = pd.DataFrame(columns=['Workspace Name', 'Lakehouse Name', 'View Name']) +# dfP = fabric.list_partitions(dataset = dataset, workspace = workspace) +# isDirectLake = any(r['Mode'] == 'DirectLake' for i, r in dfP.iterrows()) + +# spark = SparkSession.builder.getOrCreate() +# views = spark.sql(f"SHOW VIEWS IN {lakehouse}").collect() +# for view in views: # viewName = view['viewName'] # isTemporary = view['isTemporary'] # new_data = {'Workspace Name': workspace, 'Lakehouse Name': lakehouse, 'View Name': viewName} # dfView = pd.concat([dfView, pd.DataFrame(new_data, index=[0])], ignore_index=True) -#dfView -#lakeT = get_lakehouse_tables(lakehouse, lakehouse_workspace) -#if not dfP['Query'].isin(lakeT['Table Name'].values): +# dfView +# lakeT = get_lakehouse_tables(lakehouse, lakehouse_workspace) +# if not dfP['Query'].isin(lakeT['Table Name'].values): # if - diff --git a/sempy_labs/Connections.py b/sempy_labs/Connections.py index fe97202f..de310ee6 100644 --- a/sempy_labs/Connections.py +++ b/sempy_labs/Connections.py @@ -3,11 +3,32 @@ import pandas as pd from typing import List, Optional, Union -def create_connection_cloud(name: str, server_name: str, database_name: str, user_name: str, password: str, privacy_level: str): - #https://review.learn.microsoft.com/en-us/rest/api/fabric/core/connections/create-connection?branch=features%2Fdmts&tabs=HTTP - - df = pd.DataFrame(columns=['Connection ID', 'Connection Name', 'Connectivity Type', 'Connection Type', 'Connection Path', 'Privacy Level', 'Credential Type', 'Single Sign On Type', 'Connection Encryption', 'Skip Test Connection']) +def create_connection_cloud( + name: str, + server_name: str, + database_name: str, + user_name: str, + password: str, + privacy_level: str, +): + + # https://review.learn.microsoft.com/en-us/rest/api/fabric/core/connections/create-connection?branch=features%2Fdmts&tabs=HTTP + + df = pd.DataFrame( + columns=[ + "Connection ID", + "Connection Name", + "Connectivity Type", + "Connection Type", + "Connection Path", + "Privacy Level", + "Credential Type", + "Single Sign On Type", + "Connection Encryption", + "Skip Test Connection", + ] + ) client = fabric.FabricRestClient() @@ -17,15 +38,9 @@ def create_connection_cloud(name: str, server_name: str, database_name: str, use "connectionDetails": { "type": "SQL", "parameters": [ - { - "name": "server", - "value": server_name - }, - { - "name": "database", - "value": database_name - } - ] + {"name": "server", "value": server_name}, + {"name": "database", "value": database_name}, + ], }, "privacyLevel": privacy_level, "credentialDetails": { @@ -33,33 +48,62 @@ def create_connection_cloud(name: str, server_name: str, database_name: str, use "connectionEncryption": "NotEncrypted", "skipTestConnection": False, 
"credentials": { - "credentialType": "Basic", - "username": user_name, - "password": password - } - } + "credentialType": "Basic", + "username": user_name, + "password": password, + }, + }, } - response = client.post(f"/v1/connections",json=request_body) + response = client.post(f"/v1/connections", json=request_body) if response.status_code == 200: o = response.json() - new_data = {'Connection Id': o['id'], 'Connection Name': o['name'], 'Connectivity Type': o['connectivityType'], - 'Connection Type': o['connectionDetails']['type'], 'Connection Path': o['connectionDetails']['path'], 'Privacy Level': o['privacyLevel'], - 'Credential Type': o['credentialDetails']['credentialType'], 'Single Sign On Type': o['credentialDetails']['singleSignOnType'], - 'Connection Encryption': o['credentialDetails']['connectionEncryption'], 'Skip Test Connection': o['credentialDetails']['skipTestConnection'] + new_data = { + "Connection Id": o["id"], + "Connection Name": o["name"], + "Connectivity Type": o["connectivityType"], + "Connection Type": o["connectionDetails"]["type"], + "Connection Path": o["connectionDetails"]["path"], + "Privacy Level": o["privacyLevel"], + "Credential Type": o["credentialDetails"]["credentialType"], + "Single Sign On Type": o["credentialDetails"]["singleSignOnType"], + "Connection Encryption": o["credentialDetails"]["connectionEncryption"], + "Skip Test Connection": o["credentialDetails"]["skipTestConnection"], } df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) - df['Skip Test Connection'] = df['Skip Test Connection'].astype(bool) + df["Skip Test Connection"] = df["Skip Test Connection"].astype(bool) return df else: print(response.status_code) -def create_connection_on_prem(name: str, gateway_id: str, server_name: str, database_name: str, credentials: str, privacy_level: str): - df = pd.DataFrame(columns=['Connection ID', 'Connection Name', 'Gateway ID', 'Connectivity Type', 'Connection Type', 'Connection Path', 'Privacy Level', 'Credential Type', 'Single Sign On Type', 'Connection Encryption', 'Skip Test Connection']) +def create_connection_on_prem( + name: str, + gateway_id: str, + server_name: str, + database_name: str, + credentials: str, + privacy_level: str, +): + + df = pd.DataFrame( + columns=[ + "Connection ID", + "Connection Name", + "Gateway ID", + "Connectivity Type", + "Connection Type", + "Connection Path", + "Privacy Level", + "Credential Type", + "Single Sign On Type", + "Connection Encryption", + "Skip Test Connection", + ] + ) client = fabric.FabricRestClient() @@ -70,15 +114,9 @@ def create_connection_on_prem(name: str, gateway_id: str, server_name: str, data "connectionDetails": { "type": "SQL", "parameters": [ - { - "name": "server", - "value": server_name - }, - { - "name": "database", - "value": database_name - } - ] + {"name": "server", "value": server_name}, + {"name": "database", "value": database_name}, + ], }, "privacyLevel": privacy_level, "credentialDetails": { @@ -86,37 +124,63 @@ def create_connection_on_prem(name: str, gateway_id: str, server_name: str, data "connectionEncryption": "NotEncrypted", "skipTestConnection": False, "credentials": { - "credentialType": "Windows", - "values": [ - { - "gatewayId": gateway_id, - "credentials": credentials - } - ] - } - } + "credentialType": "Windows", + "values": [{"gatewayId": gateway_id, "credentials": credentials}], + }, + }, } - response = client.post(f"/v1/connections",json=request_body) + response = client.post(f"/v1/connections", json=request_body) if response.status_code == 
200: o = response.json() - new_data = {'Connection Id': o['id'], 'Connection Name': o['name'], 'Gateway ID': o['gatewayId'], 'Connectivity Type': o['connectivityType'], - 'Connection Type': o['connectionDetails']['type'], 'Connection Path': o['connectionDetails']['path'], 'Privacy Level': o['privacyLevel'], - 'Credential Type': o['credentialDetails']['credentialType'], 'Single Sign On Type': o['credentialDetails']['singleSignOnType'], - 'Connection Encryption': o['credentialDetails']['connectionEncryption'], 'Skip Test Connection': o['credentialDetails']['skipTestConnection'] + new_data = { + "Connection Id": o["id"], + "Connection Name": o["name"], + "Gateway ID": o["gatewayId"], + "Connectivity Type": o["connectivityType"], + "Connection Type": o["connectionDetails"]["type"], + "Connection Path": o["connectionDetails"]["path"], + "Privacy Level": o["privacyLevel"], + "Credential Type": o["credentialDetails"]["credentialType"], + "Single Sign On Type": o["credentialDetails"]["singleSignOnType"], + "Connection Encryption": o["credentialDetails"]["connectionEncryption"], + "Skip Test Connection": o["credentialDetails"]["skipTestConnection"], } df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) - df['Skip Test Connection'] = df['Skip Test Connection'].astype(bool) + df["Skip Test Connection"] = df["Skip Test Connection"].astype(bool) return df else: print(response.status_code) -def create_connection_vnet(name: str, gateway_id: str, server_name: str, database_name: str, user_name: str, password: str, privacy_level: str): - df = pd.DataFrame(columns=['Connection ID', 'Connection Name', 'Gateway ID', 'Connectivity Type', 'Connection Type', 'Connection Path', 'Privacy Level', 'Credential Type', 'Single Sign On Type', 'Connection Encryption', 'Skip Test Connection']) +def create_connection_vnet( + name: str, + gateway_id: str, + server_name: str, + database_name: str, + user_name: str, + password: str, + privacy_level: str, +): + + df = pd.DataFrame( + columns=[ + "Connection ID", + "Connection Name", + "Gateway ID", + "Connectivity Type", + "Connection Type", + "Connection Path", + "Privacy Level", + "Credential Type", + "Single Sign On Type", + "Connection Encryption", + "Skip Test Connection", + ] + ) client = fabric.FabricRestClient() @@ -127,15 +191,9 @@ def create_connection_vnet(name: str, gateway_id: str, server_name: str, databas "connectionDetails": { "type": "SQL", "parameters": [ - { - "name": "server", - "value": server_name - }, - { - "name": "database", - "value": database_name - } - ] + {"name": "server", "value": server_name}, + {"name": "database", "value": database_name}, + ], }, "privacyLevel": privacy_level, "credentialDetails": { @@ -143,26 +201,34 @@ def create_connection_vnet(name: str, gateway_id: str, server_name: str, databas "connectionEncryption": "Encrypted", "skipTestConnection": False, "credentials": { - "credentialType": "Basic", - "username": user_name, - "password": password - } - } + "credentialType": "Basic", + "username": user_name, + "password": password, + }, + }, } - response = client.post(f"/v1/connections",json=request_body) + response = client.post(f"/v1/connections", json=request_body) if response.status_code == 200: o = response.json() - new_data = {'Connection Id': o['id'], 'Connection Name': o['name'], 'Gateway ID': o['gatewayId'], 'Connectivity Type': o['connectivityType'], - 'Connection Type': o['connectionDetails']['type'], 'Connection Path': o['connectionDetails']['path'], 'Privacy Level': o['privacyLevel'], - 'Credential 
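# A usage sketch for the connection helpers (not part of the patch), assuming they remain
# importable from sempy_labs.Connections; the server, database, credential, and privacy-level
# values below are placeholders (the connections API accepts levels such as "Organizational").
#
# from sempy_labs.Connections import create_connection_cloud
#
# df = create_connection_cloud(
#     name="ContosoSQL",
#     server_name="contoso.database.windows.net",
#     database_name="SalesDW",
#     user_name="sqladmin",
#     password="<secret>",
#     privacy_level="Organizational",
# )
# # On HTTP 200 a one-row dataframe describing the new connection is returned;
# # create_connection_on_prem and create_connection_vnet follow the same pattern but
# # additionally take a gateway_id.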
Type': o['credentialDetails']['credentialType'], 'Single Sign On Type': o['credentialDetails']['singleSignOnType'], - 'Connection Encryption': o['credentialDetails']['connectionEncryption'], 'Skip Test Connection': o['credentialDetails']['skipTestConnection'] + new_data = { + "Connection Id": o["id"], + "Connection Name": o["name"], + "Gateway ID": o["gatewayId"], + "Connectivity Type": o["connectivityType"], + "Connection Type": o["connectionDetails"]["type"], + "Connection Path": o["connectionDetails"]["path"], + "Privacy Level": o["privacyLevel"], + "Credential Type": o["credentialDetails"]["credentialType"], + "Single Sign On Type": o["credentialDetails"]["singleSignOnType"], + "Connection Encryption": o["credentialDetails"]["connectionEncryption"], + "Skip Test Connection": o["credentialDetails"]["skipTestConnection"], } df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) - df['Skip Test Connection'] = df['Skip Test Connection'].astype(bool) + df["Skip Test Connection"] = df["Skip Test Connection"].astype(bool) return df else: - print(response.status_code) \ No newline at end of file + print(response.status_code) diff --git a/sempy_labs/CreatePQTFile.py b/sempy_labs/CreatePQTFile.py deleted file mode 100644 index f3303b2f..00000000 --- a/sempy_labs/CreatePQTFile.py +++ /dev/null @@ -1,191 +0,0 @@ -import sempy -import sempy.fabric as fabric -import pandas as pd -import json, os, shutil -import xml.etree.ElementTree as ET -from .ListFunctions import list_tables -from .Lakehouse import lakehouse_attached -from sempy._utils._log import log -from typing import List, Optional, Union - -green_dot = '\U0001F7E2' -yellow_dot = '\U0001F7E1' -red_dot = '\U0001F534' -in_progress = '⌛' - -@log -def create_pqt_file(dataset: str, workspace: Optional[str] = None, file_name: Optional[str] = None): - - """ - Dynamically generates a [Power Query Template](https://learn.microsoft.com/power-query/power-query-template) file based on the semantic model. The .pqt file is saved within the Files section of your lakehouse. - - Parameters - ---------- - dataset : str - Name of the semantic model. - workspace : str, default=None - The Fabric workspace name. - Defaults to None which resolves to the workspace of the attached lakehouse - or if no lakehouse attached, resolves to the workspace of the notebook. - file_name : str, default=None - The name of the Power Query Template file to be generated. - Defaults to None which resolves to 'PowerQueryTemplate'. - - Returns - ------- - - """ - - if file_name is None: - file_name = 'PowerQueryTemplate' - - lakeAttach = lakehouse_attached() - - if lakeAttach == False: - print(f"{red_dot} In order to run the 'create_pqt_file' function, a lakehouse must be attached to the notebook. 
Please attach a lakehouse to this notebook.") - return - - if workspace == None: - workspace_id = fabric.get_workspace_id() - workspace = fabric.resolve_workspace_name(workspace_id) - - folderPath = '/lakehouse/default/Files' - subFolderPath = os.path.join(folderPath, 'pqtnewfolder') - os.makedirs(subFolderPath, exist_ok=True) - - dfP = fabric.list_partitions(dataset = dataset, workspace = workspace) - dfT = list_tables(dataset, workspace) - dfE = fabric.list_expressions(dataset = dataset, workspace = workspace) - - # Check if M-partitions are used - if any(dfP['Source Type'] == 'M'): - class QueryMetadata: - def __init__(self, QueryName, QueryGroupId=None, LastKnownIsParameter=None, LastKnownResultTypeName=None, LoadEnabled=True, IsHidden=False): - self.QueryName = QueryName - self.QueryGroupId = QueryGroupId - self.LastKnownIsParameter = LastKnownIsParameter - self.LastKnownResultTypeName = LastKnownResultTypeName - self.LoadEnabled = LoadEnabled - self.IsHidden = IsHidden - - class RootObject: - def __init__(self, DocumentLocale, EngineVersion, QueriesMetadata, QueryGroups=None): - if QueryGroups is None: - QueryGroups = [] - self.DocumentLocale = DocumentLocale - self.EngineVersion = EngineVersion - self.QueriesMetadata = QueriesMetadata - self.QueryGroups = QueryGroups - - # STEP 1: Create MashupDocument.pq - mdfileName = 'MashupDocument.pq' - mdFilePath = os.path.join(subFolderPath, mdfileName) - sb = 'section Section1;' - for table_name in dfP['Table Name'].unique(): - tName = '#\"' + table_name + '"' - sourceExpression = dfT.loc[(dfT['Name'] == table_name), 'Source Expression'].iloc[0] - refreshPolicy = dfT.loc[(dfT['Name'] == table_name), 'Refresh Policy'].iloc[0] - sourceType = dfP.loc[(dfP['Table Name'] == table_name), 'Source Type'].iloc[0] - - if sourceType == 'M' or refreshPolicy: - sb = sb + '\n' + 'shared ' + tName + ' = ' - - partitions_in_table = dfP.loc[dfP['Table Name'] == table_name, 'Partition Name'].unique() - - i=1 - for partition_name in partitions_in_table: - pSourceType = dfP.loc[(dfP['Table Name'] == table_name) & (dfP['Partition Name'] == partition_name), 'Source Type'].iloc[0] - pQuery = dfP.loc[(dfP['Table Name'] == table_name) & (dfP['Partition Name'] == partition_name), 'Query'].iloc[0] - - if pQuery is not None: - pQueryNoSpaces = pQuery.replace(' ','').replace('\n','').replace('\t','').replace('\r','') - if pQueryNoSpaces.startswith('letSource=""'): - pQuery = 'let\n\tSource = ""\nin\n\tSource' - - if pSourceType == 'M' and i==1: - sb = sb + pQuery + ';' - elif refreshPolicy and i==1: - sb = sb + sourceExpression + ';' - i+=1 - - for index, row in dfE.iterrows(): - expr = row['Expression'] - eName = row['Name'] - eName = '#"' + eName + '"' - sb = sb + '\n' + "shared " + eName + " = " + expr + ";" - - with open(mdFilePath, 'w') as file: - file.write(sb) - - # STEP 2: Create the MashupMetadata.json file - mmfileName = 'MashupMetadata.json' - mmFilePath = os.path.join(subFolderPath, mmfileName) - queryMetadata = [] - - for tName in dfP['Table Name'].unique(): - sourceType = dfP.loc[(dfP['Table Name'] == tName), 'Source Type'].iloc[0] - refreshPolicy = dfT.loc[(dfT['Name'] == tName), 'Refresh Policy'].iloc[0] - if sourceType == 'M' or refreshPolicy: - queryMetadata.append(QueryMetadata(tName, None, None, None, True, False)) - - for i, r in dfE.iterrows(): - eName = r['Name'] - eKind = r['Kind'] - if eKind == 'M': - queryMetadata.append(QueryMetadata(eName, None, None, None, True, False)) - else: - queryMetadata.append(QueryMetadata(eName, None, None, None, 
False, False)) - - rootObject = RootObject("en-US", "2.126.453.0", queryMetadata) - - def obj_to_dict(obj): - if isinstance(obj, list): - return [obj_to_dict(e) for e in obj] - elif hasattr(obj, "__dict__"): - return {k: obj_to_dict(v) for k, v in obj.__dict__.items()} - else: - return obj - jsonContent = json.dumps(obj_to_dict(rootObject), indent=4) - - with open(mmFilePath, 'w') as json_file: - json_file.write(jsonContent) - - # STEP 3: Create Metadata.json file - mFileName = 'Metadata.json' - mFilePath = os.path.join(subFolderPath, mFileName) - metaData = {"Name": "fileName", "Description": "", "Version": "1.0.0.0"} - jsonContent = json.dumps(metaData, indent=4) - - with open(mFilePath, 'w') as json_file: - json_file.write(jsonContent) - - # STEP 4: Create [Content_Types].xml file: - ns = 'http://schemas.openxmlformats.org/package/2006/content-types' - ET.register_namespace('', ns) - types = ET.Element("{%s}Types" % ns) - default1 = ET.SubElement(types, "{%s}Default" % ns, {"Extension": "json", "ContentType": "application/json"}) - default2 = ET.SubElement(types, "{%s}Default" % ns, {"Extension": "pq", "ContentType": "application/x-ms-m"}) - xmlDocument = ET.ElementTree(types) - xmlFileName = '[Content_Types].xml' - xmlFilePath = os.path.join(subFolderPath, xmlFileName) - xmlDocument.write(xmlFilePath, xml_declaration=True, encoding='utf-8', method='xml') - - # STEP 5: Zip up the 4 files - zipFileName = file_name + '.zip' - zipFilePath = os.path.join(folderPath, zipFileName) - shutil.make_archive(zipFilePath[:-4], 'zip', subFolderPath) - - # STEP 6: Convert the zip file back into a .pqt file - newExt = '.pqt' - directory = os.path.dirname(zipFilePath) - fileNameWithoutExtension = os.path.splitext(os.path.basename(zipFilePath))[0] - newFilePath = os.path.join(directory, fileNameWithoutExtension + newExt) - shutil.move(zipFilePath, newFilePath) - - #STEP 7: Delete subFolder directory which is no longer needed - shutil.rmtree(subFolderPath, ignore_errors=True) - - print(f"{green_dot} '{file_name}.pqt' has been created based on the '{dataset}' semantic model in the '{workspace}' workspace within the Files section of your lakehouse.") - - else: - print(f"{yellow_dot} The '{dataset}' semantic model in the '{workspace}' workspace does not use Power Query so a Power Query Template file cannot be generated.") \ No newline at end of file diff --git a/sempy_labs/DirectLakeSchemaCompare.py b/sempy_labs/DirectLakeSchemaCompare.py deleted file mode 100644 index 66e1fd0e..00000000 --- a/sempy_labs/DirectLakeSchemaCompare.py +++ /dev/null @@ -1,87 +0,0 @@ -import sempy -import sempy.fabric as fabric -import pandas as pd -from .HelperFunctions import format_dax_object_name, resolve_lakehouse_name, get_direct_lake_sql_endpoint -from .GetLakehouseColumns import get_lakehouse_columns -from .ListFunctions import list_tables -from typing import List, Optional, Union - -def direct_lake_schema_compare(dataset: str, workspace: Optional[str] = None, lakehouse: Optional[str] = None, lakehouse_workspace: Optional[str] = None): - - """ - Checks that all the tables in a Direct Lake semantic model map to tables in their corresponding lakehouse and that the columns in each table exist. - - Parameters - ---------- - dataset : str - Name of the semantic model. - workspace : str, default=None - The Fabric workspace name. - Defaults to None which resolves to the workspace of the attached lakehouse - or if no lakehouse attached, resolves to the workspace of the notebook. 
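# A usage sketch for create_pqt_file (not part of the patch). Per the file list above it now
# lives in sempy_labs/_create_pqt_file.py; the import below assumes it is re-exported from the
# sempy_labs package, and the names are placeholders. A lakehouse must be attached to the
# notebook, since the .pqt file is written to its Files section.
#
# import sempy_labs as labs
#
# labs.create_pqt_file(
#     dataset="AdventureWorks",
#     workspace="Sales Analytics",
#     file_name="PowerQueryTemplate",  # the default when omitted
# )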
- lakehouse : str, default=None - The Fabric lakehouse used by the Direct Lake semantic model. - Defaults to None which resolves to the lakehouse attached to the notebook. - lakehouse_workspace : str, default=None - The Fabric workspace used by the lakehouse. - Defaults to None which resolves to the workspace of the attached lakehouse - or if no lakehouse attached, resolves to the workspace of the notebook. - - Returns - ------- - - """ - - if workspace == None: - workspace_id = fabric.get_workspace_id() - workspace = fabric.resolve_workspace_name(workspace_id) - - if lakehouse_workspace is None: - lakehouse_workspace = workspace - - if lakehouse == None: - lakehouse_id = fabric.get_lakehouse_id() - lakehouse = resolve_lakehouse_name(lakehouse_id, lakehouse_workspace) - - dfP = fabric.list_partitions(dataset = dataset, workspace = workspace) - sqlEndpointId = get_direct_lake_sql_endpoint(dataset, workspace) - dfI = fabric.list_items(workspace = lakehouse_workspace, type = 'SQLEndpoint') - dfI_filt = dfI[(dfI['Id'] == sqlEndpointId)] - - if len(dfI_filt) == 0: - print(f"The SQL Endpoint in the '{dataset}' semantic model in the '{workspace} workspace does not point to the '{lakehouse}' lakehouse in the '{lakehouse_workspace}' workspace as specified.") - return - - if not any(r['Mode'] == 'DirectLake' for i, r in dfP.iterrows()): - print(f"The '{dataset}' semantic model is not in Direct Lake mode.") - return - - dfT = list_tables(dataset, workspace) - dfC = fabric.list_columns(dataset = dataset, workspace = workspace) - lc = get_lakehouse_columns(lakehouse, lakehouse_workspace) - - dfT.rename(columns={'Type': 'Table Type'}, inplace=True) - dfP_filt = dfP[dfP['Mode'] == 'DirectLake'] - dfC = pd.merge(dfC,dfP[['Table Name', 'Query']], on='Table Name', how='inner') - dfC = pd.merge(dfC,dfT[['Name', 'Table Type']], left_on='Table Name', right_on='Name', how='inner') - dfC['Full Column Name'] = format_dax_object_name(dfC['Query'], dfC['Source']) - dfC_filt = dfC[dfC['Table Type'] == 'Table'] - # Schema compare - missingtbls = dfP_filt[~dfP_filt['Query'].isin(lc['Table Name'])] - missingtbls = missingtbls[['Table Name', 'Query']] - missingtbls.rename(columns={'Query': 'Source Table'}, inplace=True) - missingcols = dfC_filt[~dfC_filt['Full Column Name'].isin(lc['Full Column Name'])] - missingcols = missingcols[['Table Name', 'Column Name', 'Type', 'Data Type', 'Source']] - missingcols.rename(columns={'Source': 'Source Column'}, inplace=True) - - if len(missingtbls) == 0: - print(f"All tables exist in the '{lakehouse}' lakehouse within the '{lakehouse_workspace}' workspace.") - else: - print(f"The following tables exist in the '{dataset}' semantic model within the '{workspace}' workspace but do not exist in the '{lakehouse}' lakehouse within the '{lakehouse_workspace}' workspace.") - display(missingtbls) - if len(missingcols) == 0: - print(f"All columns exist in the '{lakehouse}' lakehouse within the '{lakehouse_workspace}' workspace.") - else: - print(f"The following columns exist in the '{dataset}' semantic model within the '{workspace}' workspace but do not exist in the '{lakehouse}' lakehouse within the '{lakehouse_workspace}' workspace.") - display(missingcols) - \ No newline at end of file diff --git a/sempy_labs/GenerateReport.py b/sempy_labs/GenerateReport.py deleted file mode 100644 index d6de2531..00000000 --- a/sempy_labs/GenerateReport.py +++ /dev/null @@ -1,255 +0,0 @@ -import sempy -import sempy.fabric as fabric -import pandas as pd -import json, base64, time -from typing import List, 
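# A usage sketch for direct_lake_schema_compare (not part of the patch). Per the file list
# above it moves to sempy_labs/directlake/_directlake_schema_compare.py; the import path below
# assumes the new directlake subpackage re-exports it, and the names are placeholders.
#
# from sempy_labs.directlake import direct_lake_schema_compare
#
# # Lists any Direct Lake model tables/columns with no matching table/column in the lakehouse.
# direct_lake_schema_compare(
#     dataset="AdventureWorks",
#     workspace="Sales Analytics",
#     lakehouse="SalesLakehouse",
#     lakehouse_workspace="Sales Analytics",
# )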
Optional, Union - -def create_report_from_reportjson(report: str, dataset: str, report_json: str, theme_json: Optional[str] = None, workspace: Optional[str] = None): - - """ - Creates a report based on a report.json file (and an optional themes.json file). - - Parameters - ---------- - report : str - Name of the report. - dataset : str - Name of the semantic model to connect to the report. - report_json : str - The report.json file to be used to create the report. - theme_json : str, default=None - The theme.json file to be used for the theme of the report. - workspace : str, default=None - The Fabric workspace name. - Defaults to None which resolves to the workspace of the attached lakehouse - or if no lakehouse attached, resolves to the workspace of the notebook. - - Returns - ------- - - """ - - if workspace == None: - workspace_id = fabric.get_workspace_id() - workspace = fabric.resolve_workspace_name(workspace_id) - else: - workspace_id = fabric.resolve_workspace_id(workspace) - - objectType = 'Report' - - dfI_m = fabric.list_items(workspace = workspace, type = 'SemanticModel') - dfI_model = dfI_m[(dfI_m['Display Name'] == dataset)] - - if len(dfI_model) == 0: - print(f"ERROR: The '{dataset}' semantic model does not exist in the '{workspace}' workspace.") - return - - datasetId = dfI_model['Id'].iloc[0] - - dfI_r = fabric.list_items(workspace = workspace, type = 'Report') - dfI_rpt = dfI_r[(dfI_r['Display Name'] == report)] - - if len(dfI_rpt) > 0: - print(f"WARNING: '{report}' already exists as a report in the '{workspace}' workspace.") - return - - client = fabric.FabricRestClient() - defPBIR = { - "version": "1.0", - "datasetReference": { - "byPath": None, - "byConnection": { - "connectionString": None, - "pbiServiceModelId": None, - "pbiModelVirtualServerName": "sobe_wowvirtualserver", - "pbiModelDatabaseName": datasetId, - "name": "EntityDataSource", - "connectionType": "pbiServiceXmlaStyleLive" - } - } -} - - def conv_b64(file): - - loadJson = json.dumps(file) - f = base64.b64encode(loadJson.encode('utf-8')).decode('utf-8') - - return f - - definitionPBIR = conv_b64(defPBIR) - payloadReportJson = conv_b64(report_json) - - if theme_json == None: - request_body = { - 'displayName': report, - 'type': objectType, - 'definition': { - "parts": [ - { - "path": "report.json", - "payload": payloadReportJson, - "payloadType": "InlineBase64" - }, - { - "path": "definition.pbir", - "payload": definitionPBIR, - "payloadType": "InlineBase64" - } - ] - - } - } - else: - payloadThemeJson = conv_b64(theme_json) - themeID = theme_json['payload']['blob']['displayName'] - themePath = 'StaticResources/SharedResources/BaseThemes/' + themeID + '.json' - request_body = { - 'displayName': report, - 'type': objectType, - 'definition': { - "parts": [ - { - "path": "report.json", - "payload": payloadReportJson, - "payloadType": "InlineBase64" - }, - { - "path": themePath, - "payload": payloadThemeJson, - "payloadType": "InlineBase64" - }, - { - "path": "definition.pbir", - "payload": definitionPBIR, - "payloadType": "InlineBase64" - } - ] - - } - } - - response = client.post(f"/v1/workspaces/{workspace_id}/items",json=request_body) - - if response.status_code == 201: - print('Report creation succeeded') - print(response.json()) - elif response.status_code == 202: - operationId = response.headers['x-ms-operation-id'] - response = client.get(f"/v1/operations/{operationId}") - response_body = json.loads(response.content) - while response_body['status'] != 'Succeeded': - time.sleep(3) - response = 
client.get(f"/v1/operations/{operationId}") - response_body = json.loads(response.content) - response = client.get(f"/v1/operations/{operationId}/result") - print('Report creation succeeded') - print(response.json()) - -def update_report_from_reportjson(report: str, report_json: str, workspace: Optional[str] = None): - - """ - Updates a report based on a report.json file. - - Parameters - ---------- - report : str - Name of the report. - report_json : str - The report.json file to be used to update the report. - workspace : str, default=None - The Fabric workspace name in which the report resides. - Defaults to None which resolves to the workspace of the attached lakehouse - or if no lakehouse attached, resolves to the workspace of the notebook. - - Returns - ------- - - """ - - if workspace == None: - workspace_id = fabric.get_workspace_id() - workspace = fabric.resolve_workspace_name(workspace_id) - else: - workspace_id = fabric.resolve_workspace_id(workspace) - - objectType = 'Report' - - dfR = fabric.list_reports(workspace = workspace) - dfR_filt = dfR[(dfR['Name'] == report) & (dfR['Report Type'] == 'PowerBIReport')] - - if len(dfR_filt) == 0: - print(f"The '{report}' report does not exist in the '{workspace}' workspace.") - return - - reportId = dfR_filt['Id'].iloc[0] - client = fabric.FabricRestClient() - - response = client.post(f"/v1/workspaces/{workspace_id}/items/{reportId}/getDefinition") - df_items = pd.json_normalize(response.json()['definition']['parts']) - df_items_filt = df_items[df_items['path'] == 'definition.pbir'] - rptDefFile = df_items_filt['payload'].iloc[0] - #datasetId = dfR_filt['Dataset Id'].iloc[0] - #datasetWorkspaceId = dfR_filt['Dataset Workspace Id'].iloc[0] - - - #defPBIR = { - #"version": "1.0", - #"datasetReference": { - # "byPath": None, - # "byConnection": { - # "connectionString": None, - # "pbiServiceModelId": None, - # "pbiModelVirtualServerName": "sobe_wowvirtualserver", - # "pbiModelDatabaseName": datasetId, - # "name": "EntityDataSource", - # "connectionType": "pbiServiceXmlaStyleLive" - # } - #} -#} - - def conv_b64(file): - - loadJson = json.dumps(file) - f = base64.b64encode(loadJson.encode('utf-8')).decode('utf-8') - - return f - - #definitionPBIR = conv_b64(defPBIR) - payloadReportJson = conv_b64(report_json) - - request_body = { - 'displayName': report, - 'type': objectType, - 'definition': { - "parts": [ - { - "path": "report.json", - "payload": payloadReportJson, - "payloadType": "InlineBase64" - }, - { - "path": "definition.pbir", - "payload": rptDefFile, - "payloadType": "InlineBase64" - } - ] - - } - } - - response = client.post(f"/v1/workspaces/{workspace_id}/reports/{reportId}/updateDefinition",json=request_body) - - if response.status_code == 201: - print(f"The '{report}' report has been successfully updated.") - #print(response.json()) - elif response.status_code == 202: - operationId = response.headers['x-ms-operation-id'] - response = client.get(f"/v1/operations/{operationId}") - response_body = json.loads(response.content) - while response_body['status'] != 'Succeeded': - time.sleep(3) - response = client.get(f"/v1/operations/{operationId}") - response_body = json.loads(response.content) - response = client.get(f"/v1/operations/{operationId}/result") - print(f"The '{report}' report has been successfully updated.") - #print(response.json()) \ No newline at end of file diff --git a/sempy_labs/GetLakehouseTables.py b/sempy_labs/GetLakehouseTables.py deleted file mode 100644 index f14f60fa..00000000 --- 
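# A usage sketch for the report.json helpers (not part of the patch). Per the file list above
# they move to sempy_labs/report/_generate_report.py; the import path below assumes the new
# report subpackage re-exports them, and the file, report, dataset, and workspace names are
# placeholders.
#
# import json
# from sempy_labs.report import create_report_from_reportjson, update_report_from_reportjson
#
# with open("/lakehouse/default/Files/report.json", "r") as f:
#     report_json = json.load(f)
#
# create_report_from_reportjson(
#     report="Sales Overview",
#     dataset="AdventureWorks",
#     report_json=report_json,
#     workspace="Sales Analytics",
# )
# # Later, push an edited report.json back to the same report:
# update_report_from_reportjson(
#     report="Sales Overview", report_json=report_json, workspace="Sales Analytics"
# )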
a/sempy_labs/GetLakehouseTables.py +++ /dev/null @@ -1,159 +0,0 @@ -import sempy -import sempy.fabric as fabric -import pandas as pd -from pyspark.sql import SparkSession -import pyarrow.parquet as pq -import datetime -from .HelperFunctions import resolve_lakehouse_id, resolve_lakehouse_name -from .Guardrails import get_sku_size, get_directlake_guardrails_for_sku -from .Lakehouse import lakehouse_attached -from typing import List, Optional, Union - -def get_lakehouse_tables(lakehouse: Optional[str] = None, workspace: Optional[str] = None, extended: Optional[bool] = False, count_rows: Optional[bool] = False, export: Optional[bool] = False): - - """ - Shows the tables of a lakehouse and their respective properties. Option to include additional properties relevant to Direct Lake guardrails. - - Parameters - ---------- - lakehouse : str, default=None - The Fabric lakehouse. - Defaults to None which resolves to the lakehouse attached to the notebook. - lakehouse_workspace : str, default=None - The Fabric workspace used by the lakehouse. - Defaults to None which resolves to the workspace of the attached lakehouse - or if no lakehouse attached, resolves to the workspace of the notebook. - extended : bool, default=False - Obtains additional columns relevant to the size of each table. - count_rows : bool, default=False - Obtains a row count for each lakehouse table. - export : bool, default=False - Exports the resulting dataframe to a delta table in the lakehouse. - - Returns - ------- - pandas.DataFrame - Shows the tables/columns within a lakehouse and their properties. - """ - - df = pd.DataFrame(columns=['Workspace Name', 'Lakehouse Name', 'Table Name', 'Format', 'Type', 'Location']) - - if workspace == None: - workspace_id = fabric.get_workspace_id() - workspace = fabric.resolve_workspace_name(workspace_id) - else: - workspace_id = fabric.resolve_workspace_id(workspace) - - if lakehouse == None: - lakehouse_id = fabric.get_lakehouse_id() - lakehouse = resolve_lakehouse_name(lakehouse_id, workspace) - else: - lakehouse_id = resolve_lakehouse_id(lakehouse, workspace) - - if count_rows: #Setting countrows defaults to extended=True - extended=True - - client = fabric.FabricRestClient() - response = client.get(f"/v1/workspaces/{workspace_id}/lakehouses/{lakehouse_id}/tables") - - for i in response.json()['data']: - tName = i['name'] - tType = i['type'] - tFormat = i['format'] - tLocation = i['location'] - if extended == False: - new_data = {'Workspace Name': workspace, 'Lakehouse Name': lakehouse, 'Table Name': tName, 'Format': tFormat, 'Type': tType, 'Location': tLocation } - df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) - else: - sku_value = get_sku_size(workspace) - guardrail = get_directlake_guardrails_for_sku(sku_value) - - spark = SparkSession.builder.getOrCreate() - - intColumns = ['Files', 'Row Groups', 'Table Size'] - if tType == 'Managed' and tFormat == 'delta': - detail_df = spark.sql(f"DESCRIBE DETAIL `{tName}`").collect()[0] - num_files = detail_df.numFiles - size_in_bytes = detail_df.sizeInBytes - - delta_table_path = f"Tables/{tName}" - latest_files = spark.read.format('delta').load(delta_table_path).inputFiles() - file_paths = [f.split("/")[-1] for f in latest_files] - - # Handle FileNotFoundError - num_rowgroups = 0 - for filename in file_paths: - try: - num_rowgroups += pq.ParquetFile(f"/lakehouse/default/{delta_table_path}/{filename}").num_row_groups - except FileNotFoundError: - continue - - if count_rows: - num_rows = spark.table(tName).count() - 
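The row-group tally in the get_lakehouse_tables function removed above is what feeds the Direct Lake guardrail comparison: every parquet file backing a delta table contributes its row groups to the total. A minimal standalone sketch of that counting step, using pyarrow as the function does (the directory path is a placeholder, not the actual lakehouse mount):

import glob
import pyarrow.parquet as pq

def count_row_groups(parquet_dir: str) -> int:
    # Sum parquet row groups across all files in a directory, skipping files
    # that disappear between listing and reading (mirrors the
    # FileNotFoundError handling in the function above).
    total = 0
    for path in glob.glob(f"{parquet_dir}/*.parquet"):
        try:
            total += pq.ParquetFile(path).num_row_groups
        except FileNotFoundError:
            continue
    return total

# Hypothetical usage against a locally mounted delta table folder:
# print(count_row_groups("/lakehouse/default/Tables/my_table"))
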
intColumns.append('Row Count') - new_data = {'Workspace Name': workspace, 'Lakehouse Name': lakehouse, 'Table Name': tName, 'Format': tFormat, 'Type': tType, 'Location': tLocation, 'Files': num_files, 'Row Groups': num_rowgroups, 'Row Count': num_rows, 'Table Size': size_in_bytes } - else: - new_data = {'Workspace Name': workspace, 'Lakehouse Name': lakehouse, 'Table Name': tName, 'Format': tFormat, 'Type': tType, 'Location': tLocation, 'Files': num_files, 'Row Groups': num_rowgroups, 'Table Size': size_in_bytes } - - df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) - df[intColumns] = df[intColumns].astype(int) - - df['SKU'] = guardrail['Fabric SKUs'].iloc[0] - df['Parquet File Guardrail'] = guardrail['Parquet files per table'].iloc[0] - df['Row Group Guardrail'] = guardrail['Row groups per table'].iloc[0] - df['Row Count Guardrail'] = guardrail['Rows per table (millions)'].iloc[0] * 1000000 - - df['Parquet File Guardrail Hit'] = df['Files'] > df['Parquet File Guardrail'] - df['Row Group Guardrail Hit'] = df['Row Groups'] > df['Row Group Guardrail'] - - if count_rows: - df['Row Count Guardrail Hit'] = df['Row Count'] > df['Row Count Guardrail'] - - if export: - lakeAttach = lakehouse_attached() - if lakeAttach == False: - print(f"In order to save the report.json file, a lakehouse must be attached to the notebook. Please attach a lakehouse to this notebook.") - return - spark = SparkSession.builder.getOrCreate() - - lakehouse_id = fabric.get_lakehouse_id() - lakehouse = resolve_lakehouse_name(lakehouse_id = lakehouse_id, workspace = workspace) - lakeTName = 'lakehouse_table_details' - lakeT_filt = df[df['Table Name'] == lakeTName] - - query = f"SELECT MAX(RunId) FROM {lakehouse}.{lakeTName}" - - if len(lakeT_filt) == 0: - runId = 1 - else: - dfSpark = spark.sql(query) - maxRunId = dfSpark.collect()[0][0] - runId = maxRunId + 1 - - export_df = df.copy() - - cols = ['Files', 'Row Groups', 'Row Count', 'Table Size', 'SKU', 'Parquet File Guardrail', 'Row Group Guardrail', 'Row Count Guardrail', 'Parquet File Guardrail Hit', 'Row Group Guardrail Hit', 'Row Count Guardrail Hit'] - - for c in cols: - if c not in export_df: - if c in ['Files', 'Row Groups', 'Row Count', 'Table Size', 'Parquet File Guardrail', 'Row Group Guardrail', 'Row Count Guardrail']: - export_df[c] = 0 - export_df[c] = export_df[c].astype(int) - elif c in ['SKU']: - export_df[c] = None - export_df[c] = export_df[c].astype(str) - elif c in ['Parquet File Guardrail Hit', 'Row Group Guardrail Hit', 'Row Count Guardrail Hit']: - export_df[c] = False - export_df[c] = export_df[c].astype(bool) - - print(f"Saving Lakehouse table properties to the '{lakeTName}' table in the lakehouse...\n") - now = datetime.datetime.now() - export_df['Timestamp'] = now - export_df['RunId'] = runId - - export_df.columns = export_df.columns.str.replace(' ', '_') - spark_df = spark.createDataFrame(export_df) - spark_df.write.mode('append').format('delta').saveAsTable(lakeTName) - print(f"\u2022 Lakehouse table properties have been saved to the '{lakeTName}' delta table.") - - return df \ No newline at end of file diff --git a/sempy_labs/GetMeasureDependencies.py b/sempy_labs/GetMeasureDependencies.py index 6a1ba50d..ecb2a28a 100644 --- a/sempy_labs/GetMeasureDependencies.py +++ b/sempy_labs/GetMeasureDependencies.py @@ -1,11 +1,11 @@ import sempy import sempy.fabric as fabric import pandas as pd -from .HelperFunctions import format_dax_object_name +from ._helper_functions import format_dax_object_name from typing import List, 
Optional, Union -def get_measure_dependencies(dataset: str, workspace: Optional[str] = None): +def get_measure_dependencies(dataset: str, workspace: Optional[str] = None): """ Shows all dependencies for all measures in a semantic model. @@ -28,8 +28,10 @@ def get_measure_dependencies(dataset: str, workspace: Optional[str] = None): workspace_id = fabric.get_workspace_id() workspace = fabric.resolve_workspace_name(workspace_id) - dep = fabric.evaluate_dax(dataset = dataset, workspace = workspace, dax_string = - """ + dep = fabric.evaluate_dax( + dataset=dataset, + workspace=workspace, + dax_string=""" SELECT [TABLE] AS [Table Name] ,[OBJECT] AS [Object Name] @@ -39,44 +41,96 @@ def get_measure_dependencies(dataset: str, workspace: Optional[str] = None): ,[REFERENCED_OBJECT_TYPE] AS [Referenced Object Type] FROM $SYSTEM.DISCOVER_CALC_DEPENDENCY WHERE [OBJECT_TYPE] = 'MEASURE' - """) + """, + ) - dep['Object Type'] = dep['Object Type'].str.capitalize() - dep['Referenced Object Type'] = dep['Referenced Object Type'].str.capitalize() + dep["Object Type"] = dep["Object Type"].str.capitalize() + dep["Referenced Object Type"] = dep["Referenced Object Type"].str.capitalize() - dep['Full Object Name'] = format_dax_object_name(dep['Table Name'], dep['Object Name']) - dep['Referenced Full Object Name'] = format_dax_object_name(dep['Referenced Table'], dep['Referenced Object']) - dep['Parent Node'] = dep['Object Name'] + dep["Full Object Name"] = format_dax_object_name( + dep["Table Name"], dep["Object Name"] + ) + dep["Referenced Full Object Name"] = format_dax_object_name( + dep["Referenced Table"], dep["Referenced Object"] + ) + dep["Parent Node"] = dep["Object Name"] df = dep - df['Done'] = df.apply(lambda row: False if row['Referenced Object Type'] == 'Measure' else True, axis=1) + df["Done"] = df.apply( + lambda row: False if row["Referenced Object Type"] == "Measure" else True, + axis=1, + ) - while(any(df['Done'] == False)): + while any(df["Done"] == False): for i, r in df.iterrows(): - rObjFull = r['Referenced Full Object Name'] - rObj = r['Referenced Object'] - if r['Done'] == False: - dep_filt = dep[dep['Full Object Name'] == rObjFull] + rObjFull = r["Referenced Full Object Name"] + rObj = r["Referenced Object"] + if r["Done"] == False: + dep_filt = dep[dep["Full Object Name"] == rObjFull] for index, dependency in dep_filt.iterrows(): d = True - if dependency[5] == 'Measure': + if dependency[5] == "Measure": d = False - df = pd.concat([df, pd.DataFrame([{'Table Name': r['Table Name'], 'Object Name': r['Object Name'], 'Object Type': r['Object Type'] - , 'Referenced Object': dependency[4], 'Referenced Table': dependency[3], 'Referenced Object Type': dependency[5], 'Done': d, 'Full Object Name': r['Full Object Name'], 'Referenced Full Object Name': dependency[7],'Parent Node': rObj }])], ignore_index=True) + df = pd.concat( + [ + df, + pd.DataFrame( + [ + { + "Table Name": r["Table Name"], + "Object Name": r["Object Name"], + "Object Type": r["Object Type"], + "Referenced Object": dependency[4], + "Referenced Table": dependency[3], + "Referenced Object Type": dependency[5], + "Done": d, + "Full Object Name": r["Full Object Name"], + "Referenced Full Object Name": dependency[ + 7 + ], + "Parent Node": rObj, + } + ] + ), + ], + ignore_index=True, + ) else: - df = pd.concat([df, pd.DataFrame([{'Table Name': r['Table Name'], 'Object Name': r['Object Name'], 'Object Type': r['Object Type'] - , 'Referenced Object': dependency[5], 'Referenced Table': dependency[4], 'Referenced Object Type': 
dependency[6], 'Done': d, 'Full Object Name': r['Full Object Name'], 'Referenced Full Object Name': dependency[7],'Parent Node': rObj }])], ignore_index=True) + df = pd.concat( + [ + df, + pd.DataFrame( + [ + { + "Table Name": r["Table Name"], + "Object Name": r["Object Name"], + "Object Type": r["Object Type"], + "Referenced Object": dependency[5], + "Referenced Table": dependency[4], + "Referenced Object Type": dependency[6], + "Done": d, + "Full Object Name": r["Full Object Name"], + "Referenced Full Object Name": dependency[ + 7 + ], + "Parent Node": rObj, + } + ] + ), + ], + ignore_index=True, + ) - df.loc[i, 'Done'] = True + df.loc[i, "Done"] = True - df = df.drop(['Done','Full Object Name','Referenced Full Object Name'], axis=1) + df = df.drop(["Done", "Full Object Name", "Referenced Full Object Name"], axis=1) return df -def get_model_calc_dependencies(dataset: str, workspace: Optional[str] = None): +def get_model_calc_dependencies(dataset: str, workspace: Optional[str] = None): """ Shows all dependencies for all objects in a semantic model. @@ -99,8 +153,10 @@ def get_model_calc_dependencies(dataset: str, workspace: Optional[str] = None): workspace_id = fabric.get_workspace_id() workspace = fabric.resolve_workspace_name(workspace_id) - dep = fabric.evaluate_dax(dataset = dataset, workspace = workspace, dax_string = - """ + dep = fabric.evaluate_dax( + dataset=dataset, + workspace=workspace, + dax_string=""" SELECT [TABLE] AS [Table Name] ,[OBJECT] AS [Object Name] @@ -110,40 +166,93 @@ def get_model_calc_dependencies(dataset: str, workspace: Optional[str] = None): ,[REFERENCED_OBJECT] AS [Referenced Object] ,[REFERENCED_OBJECT_TYPE] AS [Referenced Object Type] FROM $SYSTEM.DISCOVER_CALC_DEPENDENCY - """) + """, + ) - dep['Object Type'] = dep['Object Type'].str.replace('_',' ').str.title() - dep['Referenced Object Type'] = dep['Referenced Object Type'].str.replace('_',' ').str.title() + dep["Object Type"] = dep["Object Type"].str.replace("_", " ").str.title() + dep["Referenced Object Type"] = ( + dep["Referenced Object Type"].str.replace("_", " ").str.title() + ) - dep['Full Object Name'] = format_dax_object_name(dep['Table Name'], dep['Object Name']) - dep['Referenced Full Object Name'] = format_dax_object_name(dep['Referenced Table'], dep['Referenced Object']) - dep['Parent Node'] = dep['Object Name'] + dep["Full Object Name"] = format_dax_object_name( + dep["Table Name"], dep["Object Name"] + ) + dep["Referenced Full Object Name"] = format_dax_object_name( + dep["Referenced Table"], dep["Referenced Object"] + ) + dep["Parent Node"] = dep["Object Name"] df = dep - objs = ['Measure','Calc Column', 'Calculation Item', 'Calc Table'] + objs = ["Measure", "Calc Column", "Calculation Item", "Calc Table"] - df['Done'] = df.apply(lambda row: False if row['Referenced Object Type'] in objs else True, axis=1) + df["Done"] = df.apply( + lambda row: False if row["Referenced Object Type"] in objs else True, axis=1 + ) - while(any(df['Done'] == False)): + while any(df["Done"] == False): for i, r in df.iterrows(): - rObjFull = r['Referenced Full Object Name'] - rObj = r['Referenced Object'] - if r['Done'] == False: - dep_filt = dep[dep['Full Object Name'] == rObjFull] + rObjFull = r["Referenced Full Object Name"] + rObj = r["Referenced Object"] + if r["Done"] == False: + dep_filt = dep[dep["Full Object Name"] == rObjFull] for index, dependency in dep_filt.iterrows(): - d = True + d = True if dependency[5] in objs: d = False - df = pd.concat([df, pd.DataFrame([{'Table Name': r['Table Name'], 
'Object Name': r['Object Name'], 'Object Type': r['Object Type'] - , 'Referenced Object': dependency[4], 'Referenced Table': dependency[3], 'Referenced Object Type': dependency[5], 'Done': d, 'Full Object Name': r['Full Object Name'], 'Referenced Full Object Name': dependency[7],'Parent Node': rObj }])], ignore_index=True) + df = pd.concat( + [ + df, + pd.DataFrame( + [ + { + "Table Name": r["Table Name"], + "Object Name": r["Object Name"], + "Object Type": r["Object Type"], + "Referenced Object": dependency[4], + "Referenced Table": dependency[3], + "Referenced Object Type": dependency[5], + "Done": d, + "Full Object Name": r["Full Object Name"], + "Referenced Full Object Name": dependency[ + 7 + ], + "Parent Node": rObj, + } + ] + ), + ], + ignore_index=True, + ) else: - df = pd.concat([df, pd.DataFrame([{'Table Name': r['Table Name'], 'Object Name': r['Object Name'], 'Object Type': r['Object Type'] - , 'Referenced Object': dependency[5], 'Referenced Table': dependency[4], 'Referenced Object Type': dependency[6], 'Done': d, 'Full Object Name': r['Full Object Name'], 'Referenced Full Object Name': dependency[7],'Parent Node': rObj }])], ignore_index=True) + df = pd.concat( + [ + df, + pd.DataFrame( + [ + { + "Table Name": r["Table Name"], + "Object Name": r["Object Name"], + "Object Type": r["Object Type"], + "Referenced Object": dependency[5], + "Referenced Table": dependency[4], + "Referenced Object Type": dependency[6], + "Done": d, + "Full Object Name": r["Full Object Name"], + "Referenced Full Object Name": dependency[ + 7 + ], + "Parent Node": rObj, + } + ] + ), + ], + ignore_index=True, + ) - df.loc[i, 'Done'] = True + df.loc[i, "Done"] = True - df = df.drop(['Done'], axis=1) + df = df.drop(["Done"], axis=1) - return df \ No newline at end of file + return df diff --git a/sempy_labs/GetSemanticModelBim.py b/sempy_labs/GetSemanticModelBim.py index 0959dadb..f60526e5 100644 --- a/sempy_labs/GetSemanticModelBim.py +++ b/sempy_labs/GetSemanticModelBim.py @@ -1,13 +1,16 @@ -import sempy import sempy.fabric as fabric import pandas as pd import json, os, time, base64 -from .HelperFunctions import resolve_lakehouse_name -from .Lakehouse import lakehouse_attached +from sempy_labs._helper_functions import resolve_lakehouse_name +from sempy_labs.lakehouse import lakehouse_attached from typing import List, Optional, Union -def get_semantic_model_bim(dataset: str, workspace: Optional[str] = None, save_to_file_name: Optional[str] = None): +def get_semantic_model_bim( + dataset: str, + workspace: Optional[str] = None, + save_to_file_name: Optional[str] = None, +): """ Extracts the Model.bim file for a given semantic model. 
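The expansion loop in get_measure_dependencies and get_model_calc_dependencies above keeps appending rows until no referenced object is itself an unresolved measure (or calculated object), which amounts to taking the transitive closure of the dependency graph returned by DISCOVER_CALC_DEPENDENCY. A simplified, pandas-only sketch of that idea over a toy edge list; the column names and toy measures are invented for the example, and it assumes an acyclic graph, which a valid model guarantees:

import pandas as pd

def transitive_dependencies(edges: pd.DataFrame) -> pd.DataFrame:
    # Expand direct references into the full dependency closure.
    # `edges` columns: 'object', 'referenced', 'referenced_is_measure'.
    result = edges.copy()
    frontier = edges
    while True:
        # Follow edges whose target is itself a measure with its own dependencies.
        step = frontier[frontier["referenced_is_measure"]].merge(
            edges, left_on="referenced", right_on="object", suffixes=("", "_next")
        )
        if step.empty:
            break
        frontier = pd.DataFrame(
            {
                "object": step["object"],
                "referenced": step["referenced_next"],
                "referenced_is_measure": step["referenced_is_measure_next"],
            }
        )
        result = pd.concat([result, frontier], ignore_index=True).drop_duplicates()
    return result

# Toy model: [Profit] references [Sales] and 'Fact'[Cost]; [Sales] references 'Fact'[Amount].
toy = pd.DataFrame(
    {
        "object": ["Profit", "Profit", "Sales"],
        "referenced": ["Sales", "Fact[Cost]", "Fact[Amount]"],
        "referenced_is_measure": [True, False, False],
    }
)
print(transitive_dependencies(toy))  # adds the indirect Profit -> Fact[Amount] edge
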
@@ -33,47 +36,53 @@ def get_semantic_model_bim(dataset: str, workspace: Optional[str] = None, save_t workspace = fabric.resolve_workspace_name(workspace_id) else: workspace_id = fabric.resolve_workspace_id(workspace) - - objType = 'SemanticModel' + + objType = "SemanticModel" client = fabric.FabricRestClient() - itemList = fabric.list_items(workspace = workspace, type = objType) - itemListFilt = itemList[(itemList['Display Name'] == dataset)] - itemId = itemListFilt['Id'].iloc[0] - response = client.post(f"/v1/workspaces/{workspace_id}/items/{itemId}/getDefinition") - + itemList = fabric.list_items(workspace=workspace, type=objType) + itemListFilt = itemList[(itemList["Display Name"] == dataset)] + itemId = itemListFilt["Id"].iloc[0] + response = client.post( + f"/v1/workspaces/{workspace_id}/items/{itemId}/getDefinition" + ) + if response.status_code == 200: res = response.json() elif response.status_code == 202: - operationId = response.headers['x-ms-operation-id'] + operationId = response.headers["x-ms-operation-id"] response = client.get(f"/v1/operations/{operationId}") - response_body = json.loads(response.content) - while response_body['status'] != 'Succeeded': + response_body = json.loads(response.content) + while response_body["status"] != "Succeeded": time.sleep(3) response = client.get(f"/v1/operations/{operationId}") response_body = json.loads(response.content) response = client.get(f"/v1/operations/{operationId}/result") res = response.json() - df_items = pd.json_normalize(res['definition']['parts']) - df_items_filt = df_items[df_items['path'] == 'model.bim'] - payload = df_items_filt['payload'].iloc[0] - bimFile = base64.b64decode(payload).decode('utf-8') + df_items = pd.json_normalize(res["definition"]["parts"]) + df_items_filt = df_items[df_items["path"] == "model.bim"] + payload = df_items_filt["payload"].iloc[0] + bimFile = base64.b64decode(payload).decode("utf-8") bimJson = json.loads(bimFile) - if save_to_file_name is not None: + if save_to_file_name is not None: lakeAttach = lakehouse_attached() if lakeAttach == False: - print(f"In order to save the model.bim file, a lakehouse must be attached to the notebook. Please attach a lakehouse to this notebook.") + print( + f"In order to save the model.bim file, a lakehouse must be attached to the notebook. Please attach a lakehouse to this notebook." 
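get_semantic_model_bim above, like the report helpers earlier in this patch, follows the same Fabric long-running-operation pattern: a 202 response carries an x-ms-operation-id header that is polled until its status reaches 'Succeeded', after which the /result endpoint returns the item definition whose parts are InlineBase64 payloads. A generic sketch of that pattern, reusing only the endpoints and client object already shown here; error handling for failed operations is omitted, and the usage lines are hypothetical:

import base64
import json
import time

def poll_definition(client, response, interval: int = 3) -> dict:
    # Resolve a getDefinition call that may complete asynchronously (202).
    if response.status_code == 200:
        return response.json()
    operation_id = response.headers["x-ms-operation-id"]
    while True:
        status = json.loads(client.get(f"/v1/operations/{operation_id}").content)
        if status["status"] == "Succeeded":
            break
        time.sleep(interval)
    return client.get(f"/v1/operations/{operation_id}/result").json()

def decode_part(parts: list, path: str) -> dict:
    # Decode one InlineBase64 definition part (e.g. 'model.bim') back to JSON.
    payload = next(p["payload"] for p in parts if p["path"] == path)
    return json.loads(base64.b64decode(payload).decode("utf-8"))

# Hypothetical usage, following the calls above:
# res = poll_definition(client, client.post(f"/v1/workspaces/{workspace_id}/items/{item_id}/getDefinition"))
# bim = decode_part(res["definition"]["parts"], "model.bim")
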
+ ) return - + lakehouse_id = fabric.get_lakehouse_id() lakehouse = resolve_lakehouse_name(lakehouse_id, workspace) - folderPath = '/lakehouse/default/Files' - fileExt = '.bim' + folderPath = "/lakehouse/default/Files" + fileExt = ".bim" if not save_to_file_name.endswith(fileExt): save_to_file_name = save_to_file_name + fileExt filePath = os.path.join(folderPath, save_to_file_name) with open(filePath, "w") as json_file: json.dump(bimJson, json_file, indent=4) - print(f"The .bim file for the '{dataset}' semantic model has been saved to the '{lakehouse}' in this location: '{filePath}'.\n\n") + print( + f"The .bim file for the '{dataset}' semantic model has been saved to the '{lakehouse}' in this location: '{filePath}'.\n\n" + ) - return bimJson \ No newline at end of file + return bimJson diff --git a/sempy_labs/Guardrails.py b/sempy_labs/Guardrails.py index 3826cb45..1849289b 100644 --- a/sempy_labs/Guardrails.py +++ b/sempy_labs/Guardrails.py @@ -3,8 +3,8 @@ import pandas as pd from typing import List, Optional, Union -def get_direct_lake_guardrails(): +def get_direct_lake_guardrails(): """ Shows the guardrails for when Direct Lake semantic models will fallback to Direct Query based on Microsoft's online documentation. @@ -17,17 +17,17 @@ def get_direct_lake_guardrails(): A table showing the Direct Lake guardrails by SKU. """ - url = 'https://learn.microsoft.com/power-bi/enterprise/directlake-overview' + url = "https://learn.microsoft.com/power-bi/enterprise/directlake-overview" tables = pd.read_html(url) df = tables[0] - df['Fabric SKUs'] = df['Fabric SKUs'].str.split('/') - df = df.explode('Fabric SKUs', ignore_index=True) - + df["Fabric SKUs"] = df["Fabric SKUs"].str.split("/") + df = df.explode("Fabric SKUs", ignore_index=True) + return df -def get_sku_size(workspace: Optional[str] = None): +def get_sku_size(workspace: Optional[str] = None): """ Shows the SKU size for a workspace. @@ -49,15 +49,20 @@ def get_sku_size(workspace: Optional[str] = None): workspace = fabric.resolve_workspace_name(workspace_id) dfC = fabric.list_capacities() - dfW = fabric.list_workspaces().sort_values(by='Name', ascending=True) - dfC.rename(columns={'Id': 'Capacity Id'}, inplace=True) - dfCW = pd.merge(dfW, dfC[['Capacity Id', 'Sku', 'Region', 'State']], on='Capacity Id', how='inner') - sku_value = dfCW.loc[dfCW['Name'] == workspace, 'Sku'].iloc[0] - + dfW = fabric.list_workspaces().sort_values(by="Name", ascending=True) + dfC.rename(columns={"Id": "Capacity Id"}, inplace=True) + dfCW = pd.merge( + dfW, + dfC[["Capacity Id", "Sku", "Region", "State"]], + on="Capacity Id", + how="inner", + ) + sku_value = dfCW.loc[dfCW["Name"] == workspace, "Sku"].iloc[0] + return sku_value -def get_directlake_guardrails_for_sku(sku_size: str): +def get_directlake_guardrails_for_sku(sku_size: str): """ Shows the guardrails for Direct Lake based on the SKU used by your workspace's capacity. 
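get_direct_lake_guardrails above relies on pandas.read_html to scrape the guardrail table from the Direct Lake documentation page, then splits the 'Fabric SKUs' column (several SKUs separated by '/') and explodes it into one row per SKU so that a single SKU can be matched later by get_directlake_guardrails_for_sku. The same split-and-explode idiom on a toy frame, with no network call; the numbers are illustrative, not the documented limits:

import pandas as pd

# Toy stand-in for the scraped guardrail table.
guardrails = pd.DataFrame(
    {
        "Fabric SKUs": ["F2/F4/F8", "F16/F32"],
        "Parquet files per table": [1000, 5000],
    }
)

# One row per SKU, so a lookup like guardrails[guardrails["Fabric SKUs"] == "F8"] works directly.
guardrails["Fabric SKUs"] = guardrails["Fabric SKUs"].str.split("/")
guardrails = guardrails.explode("Fabric SKUs", ignore_index=True)
print(guardrails)
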
*Use the result of the 'get_sku_size' function as an input for this function's skuSize parameter.* @@ -65,7 +70,7 @@ def get_directlake_guardrails_for_sku(sku_size: str): Parameters ---------- sku_size : str - Sku size of a workspace/capacity + Sku size of a workspace/capacity Returns ------- @@ -74,6 +79,6 @@ def get_directlake_guardrails_for_sku(sku_size: str): """ df = get_direct_lake_guardrails() - filtered_df = df[df['Fabric SKUs'] == sku_size] - - return filtered_df \ No newline at end of file + filtered_df = df[df["Fabric SKUs"] == sku_size] + + return filtered_df diff --git a/sempy_labs/LogAnalytics.py b/sempy_labs/LogAnalytics.py index 8b4cacad..14d7197d 100644 --- a/sempy_labs/LogAnalytics.py +++ b/sempy_labs/LogAnalytics.py @@ -1,13 +1,18 @@ import sempy import sempy.fabric as fabric import pandas as pd -from .HelperFunctions import resolve_dataset_id +from ._helper_functions import resolve_dataset_id from typing import List, Optional, Union from sempy._utils._log import log -@log -def run_dax(dataset: str, dax_query: str, user_name: Optional[str] = None, workspace: Optional[str] = None): +@log +def run_dax( + dataset: str, + dax_query: str, + user_name: Optional[str] = None, + workspace: Optional[str] = None, +): """ Runs a DAX query against a semantic model. @@ -30,7 +35,7 @@ def run_dax(dataset: str, dax_query: str, user_name: Optional[str] = None, works A pandas dataframe holding the result of the DAX query. """ - #https://learn.microsoft.com/en-us/rest/api/power-bi/datasets/execute-queries-in-group + # https://learn.microsoft.com/en-us/rest/api/power-bi/datasets/execute-queries-in-group if workspace is None: workspace_id = fabric.get_workspace_id() @@ -38,31 +43,24 @@ def run_dax(dataset: str, dax_query: str, user_name: Optional[str] = None, works else: workspace_id = fabric.resolve_workspace_id(workspace) - dataset_id = resolve_dataset_id(dataset = dataset, workspace = workspace) + dataset_id = resolve_dataset_id(dataset=dataset, workspace=workspace) if user_name is None: - request_body = { - "queries": [ - { - "query": dax_query - } - ] - } + request_body = {"queries": [{"query": dax_query}]} else: request_body = { - "queries": [ - { - "query": dax_query + "queries": [{"query": dax_query}], + "impersonatedUserName": user_name, } - ], - "impersonatedUserName": user_name - } client = fabric.PowerBIRestClient() - response = client.post(f"/v1.0/myorg/groups/{workspace_id}/datasets/{dataset_id}/executeQueries", json = request_body) - data = response.json()['results'][0]['tables'] - column_names = data[0]['rows'][0].keys() - data_rows = [row.values() for item in data for row in item['rows']] + response = client.post( + f"/v1.0/myorg/groups/{workspace_id}/datasets/{dataset_id}/executeQueries", + json=request_body, + ) + data = response.json()["results"][0]["tables"] + column_names = data[0]["rows"][0].keys() + data_rows = [row.values() for item in data for row in item["rows"]] df = pd.DataFrame(data_rows, columns=column_names) - - return df \ No newline at end of file + + return df diff --git a/sempy_labs/MeasureDependencyTree.py b/sempy_labs/MeasureDependencyTree.py index 17a3b649..32000041 100644 --- a/sempy_labs/MeasureDependencyTree.py +++ b/sempy_labs/MeasureDependencyTree.py @@ -5,9 +5,11 @@ from typing import List, Optional, Union from sempy._utils._log import log -@log -def measure_dependency_tree(dataset: str, measure_name: str, workspace: Optional[str] = None): +@log +def measure_dependency_tree( + dataset: str, measure_name: str, workspace: Optional[str] = None +): 
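run_dax above is a thin wrapper over the Power BI executeQueries REST endpoint: the request body is a list of queries, optionally carrying impersonatedUserName, and the response nests each row as a dictionary keyed by column name. A sketch of how such a response is flattened into a DataFrame, using a hard-coded response in place of a live call (the measure names and values are made up):

import pandas as pd

# Reduced to the fields the function above actually reads.
response_json = {
    "results": [
        {
            "tables": [
                {
                    "rows": [
                        {"[Sales Amount]": 100.0, "[Order Count]": 3},
                        {"[Sales Amount]": 250.5, "[Order Count]": 7},
                    ]
                }
            ]
        }
    ]
}

tables = response_json["results"][0]["tables"]
column_names = list(tables[0]["rows"][0].keys())
data_rows = [list(row.values()) for table in tables for row in table["rows"]]
df = pd.DataFrame(data_rows, columns=column_names)
print(df)
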
""" Prints a measure dependency tree of all dependent objects for a measure in a semantic model. @@ -27,20 +29,21 @@ def measure_dependency_tree(dataset: str, measure_name: str, workspace: Optional """ - if workspace == None: workspace_id = fabric.get_workspace_id() workspace = fabric.resolve_workspace_name(workspace_id) - dfM = fabric.list_measures(dataset = dataset, workspace = workspace) - dfM_filt = dfM[dfM['Measure Name'] == measure_name] + dfM = fabric.list_measures(dataset=dataset, workspace=workspace) + dfM_filt = dfM[dfM["Measure Name"] == measure_name] if len(dfM_filt) == 0: - print(f"The '{measure_name}' measure does not exist in the '{dataset}' semantic model in the '{workspace}' workspace.") + print( + f"The '{measure_name}' measure does not exist in the '{dataset}' semantic model in the '{workspace}' workspace." + ) return md = get_measure_dependencies(dataset, workspace) - df_filt = md[md['Object Name'] == measure_name] + df_filt = md[md["Object Name"] == measure_name] # Create a dictionary to hold references to nodes node_dict = {} @@ -50,27 +53,27 @@ def measure_dependency_tree(dataset: str, measure_name: str, workspace: Optional # Populate the tree for _, row in df_filt.iterrows(): - #measure_name = row['Object Name'] - ref_obj_table_name = row['Referenced Table'] - ref_obj_name = row['Referenced Object'] - ref_obj_type = row['Referenced Object Type'] - parent_node_name = row['Parent Node'] - + # measure_name = row['Object Name'] + ref_obj_table_name = row["Referenced Table"] + ref_obj_name = row["Referenced Object"] + ref_obj_type = row["Referenced Object Type"] + parent_node_name = row["Parent Node"] + # Create or get the parent node parent_node = node_dict.get(parent_node_name) if parent_node is None: - parent_node = Node(parent_node_name) + parent_node = Node(parent_node_name) node_dict[parent_node_name] = parent_node parent_node.custom_property = measureIcon + " " # Create the child node child_node_name = ref_obj_name child_node = Node(child_node_name, parent=parent_node) - if ref_obj_type == 'Column': + if ref_obj_type == "Column": child_node.custom_property = columnIcon + " '" + ref_obj_table_name + "'" - elif ref_obj_type == 'Table': + elif ref_obj_type == "Table": child_node.custom_property = tableIcon + " " - elif ref_obj_type == 'Measure': + elif ref_obj_type == "Measure": child_node.custom_property = measureIcon + " " # Update the dictionary with the child node @@ -81,4 +84,4 @@ def measure_dependency_tree(dataset: str, measure_name: str, workspace: Optional if tableIcon in node.custom_property: print(f"{pre}{node.custom_property}'{node.name}'") else: - print(f"{pre}{node.custom_property}[{node.name}]") \ No newline at end of file + print(f"{pre}{node.custom_property}[{node.name}]") diff --git a/sempy_labs/MigrateCalcTablesToLakehouse.py b/sempy_labs/MigrateCalcTablesToLakehouse.py deleted file mode 100644 index 67317272..00000000 --- a/sempy_labs/MigrateCalcTablesToLakehouse.py +++ /dev/null @@ -1,311 +0,0 @@ -import sempy -import sempy.fabric as fabric -import pandas as pd -import re, datetime, time -from .GetLakehouseTables import get_lakehouse_tables -from .HelperFunctions import resolve_lakehouse_name, resolve_lakehouse_id, create_abfss_path -from .TOM import connect_semantic_model -from pyspark.sql import SparkSession -from typing import List, Optional, Union -from sempy._utils._log import log - -green_dot = '\U0001F7E2' -yellow_dot = '\U0001F7E1' -red_dot = '\U0001F534' -in_progress = '⌛' - -@log -def migrate_calc_tables_to_lakehouse(dataset: str, 
new_dataset: str, workspace: Optional[str] = None, new_dataset_workspace: Optional[str] = None, lakehouse: Optional[str] = None, lakehouse_workspace: Optional[str] = None): - - """ - Creates delta tables in your lakehouse based on the DAX expression of a calculated table in an import/DirectQuery semantic model. The DAX expression encapsulating the calculated table logic is stored in the new Direct Lake semantic model as model annotations. - - Parameters - ---------- - dataset : str - Name of the import/DirectQuery semantic model. - new_dataset : str - Name of the Direct Lake semantic model. - workspace : str, default=None - The Fabric workspace name in which the import/DirectQuery semantic model exists. - Defaults to None which resolves to the workspace of the attached lakehouse - or if no lakehouse attached, resolves to the workspace of the notebook. - new_dataset_workspace : str - The Fabric workspace name in which the Direct Lake semantic model will be created. - Defaults to None which resolves to the workspace of the attached lakehouse - or if no lakehouse attached, resolves to the workspace of the notebook. - lakehouse : str, default=None - The Fabric lakehouse used by the Direct Lake semantic model. - Defaults to None which resolves to the lakehouse attached to the notebook. - lakehouse_workspace : str, default=None - The Fabric workspace used by the lakehouse. - Defaults to None which resolves to the workspace of the attached lakehouse - or if no lakehouse attached, resolves to the workspace of the notebook. - - Returns - ------- - - """ - - if workspace == None: - workspace_id = fabric.get_workspace_id() - workspace = fabric.resolve_workspace_name(workspace_id) - else: - workspace_id = fabric.resolve_workspace_id(workspace) - - if new_dataset_workspace == None: - new_dataset_workspace = workspace - - if lakehouse_workspace == None: - lakehouse_workspace = new_dataset_workspace - lakehouse_workspace_id = fabric.resolve_workspace_id(lakehouse_workspace) - else: - lakehouse_workspace_id = fabric.resolve_workspace_id(lakehouse_workspace) - - if lakehouse == None: - lakehouse_id = fabric.get_lakehouse_id() - lakehouse = resolve_lakehouse_name(lakehouse_id, lakehouse_workspace) - else: - lakehouse_id = resolve_lakehouse_id(lakehouse, lakehouse_workspace) - - dfC = fabric.list_columns(dataset = dataset, workspace = workspace) - #dfC['Column Object'] = "'" + dfC['Table Name'] + "'[" + dfC['Column Name'] + "]" - dfP = fabric.list_partitions(dataset = dataset, workspace = workspace) - dfP_filt = dfP[(dfP['Source Type'] == 'Calculated')] - dfP_filt = dfP_filt[~dfP_filt['Query'].str.contains('NAMEOF')] #Remove field parameters - #dfC_CalcColumn = dfC[dfC['Type'] == 'Calculated'] - lakeTables = get_lakehouse_tables(lakehouse, lakehouse_workspace) - - # Do not execute the function if lakehouse tables already exist with the same name - killFunction = False - for i, r in dfP_filt.iterrows(): - tName = r['Table Name'] - dtName = tName.replace(' ', '_') - - if dtName in lakeTables['Table Name'].values: - print(f"{red_dot} The '{tName}' table already exists as '{dtName}' in the '{lakehouse}' lakehouse in the '{workspace}' workspace.") - killFunction = True - - if killFunction: - return - - spark = SparkSession.builder.getOrCreate() - - if len(dfP_filt) == 0: - print(f"{yellow_dot} The '{dataset}' semantic model in the '{workspace}' workspace has no calculated tables.") - return - - start_time = datetime.datetime.now() - timeout = datetime.timedelta(minutes=1) - success = False - - while not success: 
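Each migration function in this patch wraps its TOM work in the same guard: a while-not-success loop around connect_semantic_model with a one-minute timeout, retrying every second, because connections to a freshly (re)created semantic model can fail transiently. The bare pattern as a standalone helper; unlike the code above, this sketch re-raises the last error when the timeout elapses instead of silently breaking out, and do_work is a placeholder for the TOM block:

import datetime
import time

def retry_until_timeout(do_work, timeout_minutes: int = 1, wait_seconds: int = 1):
    # Keep calling do_work() until it succeeds or the timeout elapses.
    start_time = datetime.datetime.now()
    timeout = datetime.timedelta(minutes=timeout_minutes)
    while True:
        try:
            return do_work()
        except Exception:
            if datetime.datetime.now() - start_time > timeout:
                raise
            time.sleep(wait_seconds)

# Hypothetical usage:
# retry_until_timeout(lambda: print("connected"))
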
- try: - with connect_semantic_model(dataset=dataset, workspace = workspace, readonly=True) as tom: - success = True - for t in tom.model.Tables: - if tom.is_auto_date_table(table_name = t.Name): - print(f"{yellow_dot} The '{t.Name}' table is an auto-datetime table and is not supported in the Direct Lake migration process. Please create a proper Date/Calendar table in your lakehoues and use it in your Direct Lake model.") - else: - for p in t.Partitions: - if str(p.SourceType) == 'Calculated': - query = p.Source.Expression - if 'NAMEOF' not in query: # exclude field parameters - daxQuery = '' - if query.lower().startswith('calendar') and any(str(c.Type) == 'Calculated' for c in t.Columns): - daxQuery = f"ADDCOLUMNS(\n{query}," - for c in t.Columns: - if str(c.Type) == 'Calculated': - expr = c.Expression - expr = expr.replace(f"'{t.Name}'",'').replace(f"{t.Name}[Date]",'[Date]') - expr = expr.replace('[MonthNo]','MONTH([Date])').replace('[QuarterNo]','INT((MONTH([Date]) + 2) / 3)') - daxQuery = f"{daxQuery}\n\"{c.Name}\",{expr}," - daxQuery = 'EVALUATE\n' + daxQuery.rstrip(',') + '\n)' - else: - daxQuery = f"EVALUATE\n{query}" - daxQueryTopN = daxQuery.replace('EVALUATE\n', 'EVALUATE\nTOPN(1,') + ')' - - try: - df = fabric.evaluate_dax(dataset = dataset, dax_string = daxQueryTopN, workspace = workspace) - - for col in df.columns: - pattern = r"\[([^\]]+)\]" - - matches = re.findall(pattern, col) - new_column_name = matches[0].replace(' ','') - - df.rename(columns={col: new_column_name}, inplace=True) - - try: - dataType = next(str(c.DataType) for c in tom.model.Tables[t.Name].Columns if str(c.Type) == 'CalculatedTableColumn' and c.SourceColumn == col) - except: - dataType = next(str(c.DataType) for c in tom.model.Tables[t.Name].Columns if str(c.Type) == 'Calculated' and c.Name == new_column_name) - - if dataType == 'Int64': - df[new_column_name] = df[new_column_name].astype(int) - elif dataType in ['Decimal', 'Double']: - df[new_column_name] = df[new_column_name].astype(float) - elif dataType == 'Boolean': - df[new_column_name] = df[new_column_name].astype(bool) - elif dataType == 'DateTime': - df[new_column_name] = pd.to_datetime(df[new_column_name]) - - delta_table_name = t.Name.replace(' ','_').lower() - - spark_df = spark.createDataFrame(df) - filePath = create_abfss_path(lakehouse_id = lakehouse_id, lakehouse_workspace_id = lakehouse_workspace_id, delta_table_name = delta_table_name) - spark_df.write.mode('overwrite').format('delta').save(filePath) - - start_time2 = datetime.datetime.now() - timeout2 = datetime.timedelta(minutes=1) - success2 = False - - while not success2: - try: - with connect_semantic_model(dataset=new_dataset, readonly=False, workspace=new_dataset_workspace) as tom2: - success2 = True - tom2.set_annotation(object = tom2.model, name = t.Name, value = daxQuery) - except Exception as e: - if datetime.datetime.now() - start_time2 > timeout2: - break - time.sleep(1) - - print(f"{green_dot} Calculated table '{t.Name}' has been created as delta table '{delta_table_name.lower()}' in the '{lakehouse}' lakehouse within the '{lakehouse_workspace}' workspace.") - except: - print(f"{red_dot} Failed to create calculated table '{t.Name}' as a delta table in the lakehouse.") - except Exception as e: - if datetime.datetime.now() - start_time > timeout: - break - time.sleep(1) - -@log -def migrate_field_parameters(dataset: str, new_dataset: str, workspace: Optional[str] = None, new_dataset_workspace: Optional[str] = None): - - """ - Migrates field parameters from one semantic model 
to another. - - Parameters - ---------- - dataset : str - Name of the import/DirectQuery semantic model. - new_dataset : str - Name of the Direct Lake semantic model. - workspace : str, default=None - The Fabric workspace name in which the import/DirectQuery semantic model exists. - Defaults to None which resolves to the workspace of the attached lakehouse - or if no lakehouse attached, resolves to the workspace of the notebook. - new_dataset_workspace : str - The Fabric workspace name in which the Direct Lake semantic model will be created. - Defaults to None which resolves to the workspace of the attached lakehouse - or if no lakehouse attached, resolves to the workspace of the notebook. - - Returns - ------- - - """ - - from .HelperFunctions import format_dax_object_name - sempy.fabric._client._utils._init_analysis_services() - import Microsoft.AnalysisServices.Tabular as TOM - - if workspace == None: - workspace_id = fabric.get_workspace_id() - workspace = fabric.resolve_workspace_name(workspace_id) - - if new_dataset_workspace == None: - new_dataset_workspace = workspace - - dfC = fabric.list_columns(dataset = dataset, workspace = workspace) - dfC['Column Object'] = format_dax_object_name(dfC['Table Name'], dfC['Column Name']) - dfP = fabric.list_partitions(dataset = dataset, workspace = workspace) - dfP_filt = dfP[(dfP['Source Type'] == 'Calculated')] - dfP_filt = dfP_filt[dfP_filt['Query'].str.contains('NAMEOF')] # Only field parameters - dfC_CalcColumn = dfC[dfC['Type'] == 'Calculated'] - - if len(dfP_filt) == 0: - print(f"{green_dot} The '{dataset}' semantic model in the '{workspace}' workspace has no field parameters.") - return - - start_time = datetime.datetime.now() - timeout = datetime.timedelta(minutes=1) - success = False - - while not success: - try: - with connect_semantic_model(dataset=new_dataset, workspace=new_dataset_workspace, readonly=False) as tom: - success = True - - for i,r in dfP_filt.iterrows(): - tName = r['Table Name'] - query = r['Query'] - - # For field parameters, remove calc columns from the query - rows = query.strip().split('\n') - filtered_rows = [row for row in rows if not any(value in row for value in dfC_CalcColumn['Column Object'].values)] - updated_query_string = '\n'.join(filtered_rows) - - # Remove extra comma - lines = updated_query_string.strip().split('\n') - lines[-2] = lines[-2].rstrip(',') - expr = '\n'.join(lines) - - try: - par = TOM.Partition() - par.Name = tName - - parSource = TOM.CalculatedPartitionSource() - par.Source = parSource - parSource.Expression = expr - - tbl = TOM.Table() - tbl.Name = tName - tbl.Partitions.Add(par) - - columns = ['Value1', 'Value2', 'Value3'] - - for colName in columns: - col = TOM.CalculatedTableColumn() - col.Name = colName - col.SourceColumn = '[' + colName + ']' - col.DataType = TOM.DataType.String - - tbl.Columns.Add(col) - - tom.model.Tables.Add(tbl) - - ep = TOM.JsonExtendedProperty() - ep.Name = 'ParameterMetadata' - ep.Value = '{"version":3,"kind":2}' - - rcd = TOM.RelatedColumnDetails() - gpc = TOM.GroupByColumn() - gpc.GroupingColumn = tom.model.Tables[tName].Columns['Value2'] - rcd.GroupByColumns.Add(gpc) - - # Update column properties - tom.model.Tables[tName].Columns['Value2'].IsHidden = True - tom.model.Tables[tName].Columns['Value3'].IsHidden = True - tom.model.Tables[tName].Columns['Value3'].DataType = TOM.DataType.Int64 - tom.model.Tables[tName].Columns['Value1'].SortByColumn = tom.model.Tables[tName].Columns['Value3'] - tom.model.Tables[tName].Columns['Value2'].SortByColumn = 
tom.model.Tables[tName].Columns['Value3'] - tom.model.Tables[tName].Columns['Value2'].ExtendedProperties.Add(ep) - tom.model.Tables[tName].Columns['Value1'].RelatedColumnDetails = rcd - - dfC_filt1 = dfC[(dfC['Table Name'] == tName) & (dfC['Source'] == '[Value1]')] - col1 = dfC_filt1['Column Name'].iloc[0] - dfC_filt2 = dfC[(dfC['Table Name'] == tName) & (dfC['Source'] == '[Value2]')] - col2 = dfC_filt2['Column Name'].iloc[0] - dfC_filt3 = dfC[(dfC['Table Name'] == tName) & (dfC['Source'] == '[Value3]')] - col3 = dfC_filt3['Column Name'].iloc[0] - - tom.model.Tables[tName].Columns['Value1'].Name = col1 - tom.model.Tables[tName].Columns['Value2'].Name = col2 - tom.model.Tables[tName].Columns['Value3'].Name = col3 - - print(f"{green_dot} The '{tName}' table has been added as a field parameter to the '{new_dataset}' semantic model in the '{new_dataset_workspace}' workspace.") - except: - print(f"{red_dot} The '{tName}' table has not been added as a field parameter.") - except Exception as e: - if datetime.datetime.now() - start_time > timeout: - break - time.sleep(1) \ No newline at end of file diff --git a/sempy_labs/MigrateCalcTablesToSemanticModel.py b/sempy_labs/MigrateCalcTablesToSemanticModel.py deleted file mode 100644 index 6b7e04cb..00000000 --- a/sempy_labs/MigrateCalcTablesToSemanticModel.py +++ /dev/null @@ -1,123 +0,0 @@ -import sempy -import sempy.fabric as fabric -import re, datetime, time -from .GetLakehouseTables import get_lakehouse_tables -from .HelperFunctions import resolve_lakehouse_name -from .TOM import connect_semantic_model -from typing import List, Optional, Union -from sempy._utils._log import log - -green_dot = '\U0001F7E2' -yellow_dot = '\U0001F7E1' -red_dot = '\U0001F534' -in_progress = '⌛' - -@log -def migrate_calc_tables_to_semantic_model(dataset: str, new_dataset: str, workspace: Optional[str] = None, new_dataset_workspace: Optional[str] = None, lakehouse: Optional[str] = None, lakehouse_workspace: Optional[str] = None ): - - """ - Creates new tables in the Direct Lake semantic model based on the lakehouse tables created using the 'migrate_calc_tables_to_lakehouse' function. - - Parameters - ---------- - dataset : str - Name of the import/DirectQuery semantic model. - new_dataset : str - Name of the Direct Lake semantic model. - workspace : str, default=None - The Fabric workspace name in which the import/DirectQuery semantic model exists. - Defaults to None which resolves to the workspace of the attached lakehouse - or if no lakehouse attached, resolves to the workspace of the notebook. - new_dataset_workspace : str - The Fabric workspace name in which the Direct Lake semantic model will be created. - Defaults to None which resolves to the workspace of the attached lakehouse - or if no lakehouse attached, resolves to the workspace of the notebook. - lakehouse : str, default=None - The Fabric lakehouse used by the Direct Lake semantic model. - Defaults to None which resolves to the lakehouse attached to the notebook. - lakehouse_workspace : str, default=None - The Fabric workspace used by the lakehouse. - Defaults to None which resolves to the workspace of the attached lakehouse - or if no lakehouse attached, resolves to the workspace of the notebook. 
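When migrate_calc_tables_to_lakehouse above materializes a calculated table by evaluating its DAX, each result column is coerced to a pandas dtype according to the Tabular DataType of the source column before the frame is written to delta. That mapping as a standalone helper (the demo column names and values are invented):

import pandas as pd

def coerce_column(df: pd.DataFrame, column: str, tabular_type: str) -> None:
    # Coerce one column in place based on its Tabular DataType.
    if tabular_type == "Int64":
        df[column] = df[column].astype(int)
    elif tabular_type in ("Decimal", "Double"):
        df[column] = df[column].astype(float)
    elif tabular_type == "Boolean":
        df[column] = df[column].astype(bool)
    elif tabular_type == "DateTime":
        df[column] = pd.to_datetime(df[column])
    # Strings and other types are left unchanged.

demo = pd.DataFrame({"Qty": ["1", "2"], "OrderDate": ["2024-01-01", "2024-02-01"]})
coerce_column(demo, "Qty", "Int64")
coerce_column(demo, "OrderDate", "DateTime")
print(demo.dtypes)
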
- - Returns - ------- - - """ - - if workspace == None: - workspace_id = fabric.get_workspace_id() - workspace = fabric.resolve_workspace_name(workspace_id) - else: - workspace_id = fabric.resolve_workspace_id(workspace) - - if new_dataset_workspace == None: - new_dataset_workspace = workspace - - if lakehouse_workspace == None: - lakehouse_workspace = new_dataset_workspace - if lakehouse == None: - lakehouse_id = fabric.get_lakehouse_id() - lakehouse = resolve_lakehouse_name(lakehouse_id, lakehouse_workspace) - - # Get calc tables but not field parameters - dfP = fabric.list_partitions(dataset = dataset, workspace = workspace) - dfP_filt = dfP[(dfP['Source Type'] == 'Calculated')] - dfP_filt = dfP_filt[~dfP_filt['Query'].str.contains('NAMEOF')] - - dfC = fabric.list_columns(dataset = dataset, workspace = workspace) - lc = get_lakehouse_tables(lakehouse=lakehouse, workspace=lakehouse_workspace) - # Get all calc table columns of calc tables not including field parameters - dfC_filt = dfC[(dfC['Table Name'].isin(dfP_filt['Table Name']))]# & (dfC['Type'] == 'CalculatedTableColumn')] - #dfA = list_annotations(new_dataset, new_dataset_workspace) - #dfA_filt = dfA[(dfA['Object Type'] == 'Model') & ~ (dfA['Annotation Value'].str.contains('NAMEOF'))] - - if len(dfP_filt) == 0: - print(f"{green_dot} The '{dataset}' semantic model has no calculated tables.") - return - - start_time = datetime.datetime.now() - timeout = datetime.timedelta(minutes=1) - success = False - - while not success: - try: - with connect_semantic_model(dataset=new_dataset, readonly=False, workspace=new_dataset_workspace) as tom: - success = True - for tName in dfC_filt['Table Name'].unique(): - if tName.lower() in lc['Table Name'].values: - - try: - tom.model.Tables[tName] - except: - tom.add_table(name = tName) - tom.add_entity_partition(table_name=tName, entity_name=tName.replace(' ','_').lower()) - - columns_in_table = dfC_filt.loc[dfC_filt['Table Name'] == tName, 'Column Name'].unique() - - for cName in columns_in_table: - scName = dfC.loc[(dfC['Table Name'] == tName) & (dfC['Column Name'] == cName), 'Source'].iloc[0] - cDataType = dfC.loc[(dfC['Table Name'] == tName) & (dfC['Column Name'] == cName), 'Data Type'].iloc[0] - cType = dfC.loc[(dfC['Table Name'] == tName) & (dfC['Column Name'] == cName), 'Type'].iloc[0] - - #av = tom.get_annotation_value(object = tom.model, name = tName) - - #if cType == 'CalculatedTableColumn': - #lakeColumn = scName.replace(' ','_') - #elif cType == 'Calculated': - pattern = r'\[([^]]+)\]' - - matches = re.findall(pattern, scName) - lakeColumn = matches[0].replace(' ','') - try: - tom.model.Tables[tName].Columns[cName] - except: - tom.add_data_column(table_name = tName, column_name=cName, source_column=lakeColumn, data_type=cDataType) - print(f"{green_dot} The '{tName}'[{cName}] column has been added.") - - print(f"\n{green_dot} All viable calculated tables have been added to the model.") - - except Exception as e: - if datetime.datetime.now() - start_time > timeout: - break - time.sleep(1) \ No newline at end of file diff --git a/sempy_labs/MigrateModelObjectsToSemanticModel.py b/sempy_labs/MigrateModelObjectsToSemanticModel.py deleted file mode 100644 index aa984255..00000000 --- a/sempy_labs/MigrateModelObjectsToSemanticModel.py +++ /dev/null @@ -1,324 +0,0 @@ -import sempy -import sempy.fabric as fabric -import re, datetime, time -from .ListFunctions import list_tables -from .HelperFunctions import create_relationship_name -from .TOM import connect_semantic_model -from typing import List, 
Optional, Union -from sempy._utils._log import log - -green_dot = '\U0001F7E2' -yellow_dot = '\U0001F7E1' -red_dot = '\U0001F534' -in_progress = '⌛' - -@log -def migrate_model_objects_to_semantic_model(dataset: str, new_dataset: str, workspace: Optional[str] = None, new_dataset_workspace: Optional[str] = None): - - """ - Adds the rest of the model objects (besides tables/columns) and their properties to a Direct Lake semantic model based on an import/DirectQuery semantic model. - - Parameters - ---------- - dataset : str - Name of the import/DirectQuery semantic model. - new_dataset : str - Name of the Direct Lake semantic model. - workspace : str, default=None - The Fabric workspace name in which the import/DirectQuery semantic model exists. - Defaults to None which resolves to the workspace of the attached lakehouse - or if no lakehouse attached, resolves to the workspace of the notebook. - new_dataset_workspace : str - The Fabric workspace name in which the Direct Lake semantic model will be created. - Defaults to None which resolves to the workspace of the attached lakehouse - or if no lakehouse attached, resolves to the workspace of the notebook. - - Returns - ------- - - """ - - sempy.fabric._client._utils._init_analysis_services() - import Microsoft.AnalysisServices.Tabular as TOM - import System - - if workspace == None: - workspace_id = fabric.get_workspace_id() - workspace = fabric.resolve_workspace_name(workspace_id) - else: - workspaceId = fabric.resolve_workspace_id(workspace) - - if new_dataset_workspace == None: - new_dataset_workspace = workspace - - dfT = list_tables(dataset, workspace) - dfC = fabric.list_columns(dataset = dataset, workspace = workspace) - dfM = fabric.list_measures(dataset = dataset, workspace = workspace) - dfR = fabric.list_relationships(dataset = dataset, workspace = workspace) - dfRole = fabric.get_roles(dataset = dataset, workspace = workspace) - dfRLS = fabric.get_row_level_security_permissions(dataset = dataset, workspace = workspace) - dfCI = fabric.list_calculation_items(dataset = dataset, workspace = workspace) - dfP = fabric.list_perspectives(dataset = dataset, workspace = workspace) - dfTranslation = fabric.list_translations(dataset = dataset, workspace = workspace) - dfH = fabric.list_hierarchies(dataset = dataset, workspace = workspace) - dfPar = fabric.list_partitions(dataset = dataset, workspace = workspace) - - dfP_cc = dfPar[(dfPar['Source Type'] == 'Calculated')] - dfP_fp = dfP_cc[dfP_cc['Query'].str.contains('NAMEOF')] - dfC_fp = dfC[dfC['Table Name'].isin(dfP_fp['Table Name'].values)] - - print(f"{in_progress} Updating '{new_dataset}' based on '{dataset}'...") - start_time = datetime.datetime.now() - timeout = datetime.timedelta(minutes=1) - success = False - - while not success: - try: - with connect_semantic_model(dataset=new_dataset, readonly=False, workspace=new_dataset_workspace) as tom: - success = True - - isDirectLake = any(str(p.Mode) == 'DirectLake' for t in tom.model.Tables for p in t.Partitions) - - print(f"\n{in_progress} Updating table properties...") - for t in tom.model.Tables: - t.IsHidden = bool(dfT.loc[dfT['Name'] == t.Name, 'Hidden'].iloc[0]) - t.Description = dfT.loc[dfT['Name'] == t.Name, 'Description'].iloc[0] - t.DataCategory = dfT.loc[dfT['Name'] == t.Name, 'Data Category'].iloc[0] - - print(f"{green_dot} The '{t.Name}' table's properties have been updated.") - - print(f"\n{in_progress} Updating column properties...") - for t in tom.model.Tables: - if t.Name not in dfP_fp['Table Name'].values: # do not 
include field parameters - dfT_filtered = dfT[dfT['Name'] == t.Name] - tType = dfT_filtered['Type'].iloc[0] - for c in t.Columns: - if not c.Name.startswith('RowNumber-'): - dfC_filt = dfC[(dfC['Table Name'] == t.Name) & (dfC['Column Name'] == c.Name)] - cName = dfC_filt['Column Name'].iloc[0] - c.Name = cName - if tType == 'Table': - c.SourceColumn = cName.replace(' ', '_') - c.IsHidden = bool(dfC_filt['Hidden'].iloc[0]) - c.DataType = System.Enum.Parse(TOM.DataType, dfC_filt['Data Type'].iloc[0]) - c.DisplayFolder = dfC_filt['Display Folder'].iloc[0] - c.FormatString = dfC_filt['Format String'].iloc[0] - c.SummarizeBy = System.Enum.Parse(TOM.AggregateFunction, dfC_filt['Summarize By'].iloc[0]) - c.DataCategory = dfC_filt['Data Category'].iloc[0] - c.IsKey = bool(dfC_filt['Key'].iloc[0]) - sbc = dfC_filt['Sort By Column'].iloc[0] - - if sbc != None: - try: - c.SortByColumn = tom.model.Tables[t.Name].Columns[sbc] - except: - print(f"{red_dot} Failed to create '{sbc}' as a Sort By Column for the '{c.Name}' in the '{t.Name}' table.") - print(f"{green_dot} The '{t.Name}'[{c.Name}] column's properties have been updated.") - - print(f"\n{in_progress} Creating hierarchies...") - dfH_grouped = dfH.groupby(['Table Name', 'Hierarchy Name', 'Hierarchy Hidden', 'Hierarchy Description']).agg({'Level Name': list, 'Column Name': list}).reset_index() - - for i, r in dfH_grouped.iterrows(): - tName = r['Table Name'] - hName = r['Hierarchy Name'] - hDesc = r['Hierarchy Description'] - hHid = bool(r['Hierarchy Hidden']) - cols = r['Column Name'] - lvls = r['Level Name'] - - try: - tom.model.Tables[tName].Hierarchies[hName] - except: - tom.add_hierarchy(table_name = tName, hierarchy_name=hName, hierarchy_description=hDesc, hierarchy_hidden=hHid, columns=cols, levels=lvls) - print(f"{green_dot} The '{hName}' hierarchy has been added.") - - print(f"\n{in_progress} Creating measures...") - for i, r in dfM.iterrows(): - tName = r['Table Name'] - mName = r['Measure Name'] - mExpr = r['Measure Expression'] - mHidden = bool(r['Measure Hidden']) - mDF = r['Measure Display Folder'] - mDesc = r['Measure Description'] - mFS = r['Format String'] - - try: - tom.model.Tables[tName].Measures[mName] - except: - tom.add_measure(table_name = tName, measure_name=mName, expression=mExpr, hidden=mHidden, display_folder=mDF, description=mDesc, format_string=mFS) - print(f"{green_dot} The '{mName}' measure has been added.") - - for cgName in dfCI['Calculation Group Name'].unique(): - - isHidden = bool(dfCI.loc[(dfCI['Calculation Group Name'] == cgName), 'Hidden'].iloc[0]) - prec = int(dfCI.loc[(dfCI['Calculation Group Name'] == cgName), 'Precedence'].iloc[0]) - desc = dfCI.loc[(dfCI['Calculation Group Name'] == cgName), 'Description'].iloc[0] - - try: - tom.model.Tables[cgName] - except: - tom.add_calculation_group(name = cgName, description = desc, precedence=prec, hidden=isHidden) - print(f"{green_dot} The '{cgName}' calculation group has been added.") - tom.model.DiscourageImplicitMeasures = True - - print(f"\n{in_progress} Updating calculation group column name...") - dfC_filt = dfC[(dfC['Table Name'] == cgName) & (dfC['Hidden'] == False)] - colName = dfC_filt['Column Name'].iloc[0] - tom.model.Tables[cgName].Columns['Name'].Name = colName - - calcItems = dfCI.loc[dfCI['Calculation Group Name'] == cgName, 'Calculation Item Name'].unique() - - print(f"\n{in_progress} Creating calculation items...") - for calcItem in calcItems: - ordinal = int(dfCI.loc[(dfCI['Calculation Group Name'] == cgName) & (dfCI['Calculation Item Name'] 
== calcItem), 'Ordinal'].iloc[0]) - expr = dfCI.loc[(dfCI['Calculation Group Name'] == cgName) & (dfCI['Calculation Item Name'] == calcItem), 'Expression'].iloc[0] - fse = dfCI.loc[(dfCI['Calculation Group Name'] == cgName) & (dfCI['Calculation Item Name'] == calcItem), 'Format String Expression'].iloc[0] - try: - tom.model.Tables[cgName].CalculationGroup.CalculationItems[calcItem] - except: - tom.add_calculation_item(table_name = cgName, calculation_item_name=calcItem, expression=expr, format_string_expression=fse, ordinal=ordinal) - print(f"{green_dot} The '{calcItem}' has been added to the '{cgName}' calculation group.") - - print(f"\n{in_progress} Creating relationships...") - for index, row in dfR.iterrows(): - fromTable = row['From Table'] - fromColumn = row['From Column'] - toTable = row['To Table'] - toColumn = row['To Column'] - isActive = row['Active'] - cfb = row['Cross Filtering Behavior'] - sfb = row['Security Filtering Behavior'] - rori = row['Rely On Referential Integrity'] - mult = row['Multiplicity'] - - card_mapping = {'m': 'Many', '1': 'One', '0': 'None'} - - fromCard = card_mapping.get(mult[0]) - toCard = card_mapping.get(mult[-1]) - - relName = create_relationship_name(fromTable,fromColumn,toTable,toColumn) - - if any(r.FromTable.Name == fromTable and r.FromColumn.Name == fromColumn and r.ToTable.Name == toTable and r.ToColumn.Name == toColumn for r in tom.model.Relationships): - print(f"{yellow_dot} {relName} already exists as a relationship in the semantic model.") - elif isDirectLake and any(r.FromTable.Name == fromTable and r.FromColumn.Name == fromColumn and r.ToTable.Name == toTable and r.ToColumn.Name == toColumn and (r.FromColumn.DataType == 'DateTime' or r.ToColumn.DataType == 'DateTime') for r in tom.model.Relationships): - print(f"{yellow_dot} {relName} was not created since relationships based on DateTime columns are not supported.") - elif isDirectLake and any(r.FromTable.Name == fromTable and r.FromColumn.Name == fromColumn and r.ToTable.Name == toTable and r.ToColumn.Name == toColumn and (r.FromColumn.DataType != r.ToColumn.DataType) for r in tom.model.Relationships): - print(f"{yellow_dot} {relName} was not created since columns used in a relationship must have the same data type.") - else: - try: - tom.add_relationship( - from_table = fromTable, from_column=fromColumn, - to_table=toTable, to_column=toColumn, - from_cardinality=fromCard,to_cardinality=toCard, - cross_filtering_behavior=cfb, - security_filtering_behavior=sfb, - rely_on_referential_integrity=rori, - is_active=isActive) - - print(f"{green_dot} The {relName} relationship has been added.") - except: - print(f"{red_dot} The {relName} relationship was not added.") - - print(f"\n{in_progress} Creating roles...") - for index, row in dfRole.iterrows(): - roleName = row['Role'] - roleDesc = row['Description'] - modPerm = row['Model Permission'] - - try: - tom.model.Roles[roleName] - except: - tom.add_role(role_name=roleName, model_permission=modPerm, description=roleDesc) - print(f"{green_dot} The '{roleName}' role has been added.") - - print(f"\n{in_progress} Creating row level security...") - for index, row in dfRLS.iterrows(): - roleName = row['Role'] - tName = row['Table'] - expr = row['Filter Expression'] - - try: - tom.set_rls(role_name=roleName, table_name=tName, filter_expression=expr) - print(f"{green_dot} Row level security for the '{tName}' table within the '{roleName}' role has been set.") - except: - print(f"{red_dot} Row level security for the '{tName}' table within the '{roleName}' 
role was not set.") - - print(f"\n{in_progress} Creating perspectives...") - for pName in dfP['Perspective Name'].unique(): - - try: - tom.model.Perspectives[pName] - except: - tom.add_perspective(perspective_name=pName) - print(f"{green_dot} The '{pName}' perspective has been added.") - - print(f"\n{in_progress} Adding objects to perspectives...") - for index, row in dfP.iterrows(): - pName = row['Perspective Name'] - tName = row['Table Name'] - oName = row['Object Name'] - oType = row['Object Type'] - tType = dfT.loc[(dfT['Name'] == tName), 'Type'].iloc[0] - - try: - if oType == 'Table': - tom.add_to_perspective(object = tom.model.Tables[tName], perspective_name=pName) - elif oType == 'Column': - tom.add_to_perspective(object = tom.model.Tables[tName].Columns[oName], perspective_name=pName) - elif oType == 'Measure': - tom.add_to_perspective(object = tom.model.Tables[tName].Measures[oName], perspective_name=pName) - elif oType == 'Hierarchy': - tom.add_to_perspective(object = tom.model.Tables[tName].Hierarchies[oName], perspective_name=pName) - except: - pass - - print(f"\n{in_progress} Creating translation languages...") - for trName in dfTranslation['Culture Name'].unique(): - try: - tom.model.Cultures[trName] - except: - tom.add_translation(trName) - print(f"{green_dot} The '{trName}' translation language has been added.") - - print(f"\n{in_progress} Creating translation values...") - for index, row in dfTranslation.iterrows(): - trName = row['Culture Name'] - tName = row['Table Name'] - oName = row['Object Name'] - oType = row['Object Type'] - translation = row['Translation'] - prop = row['Property'] - - if prop == 'Caption': - prop = 'Name' - elif prop == 'DisplayFolder': - prop = 'Display Folder' - - try: - if oType == 'Table': - tom.set_translation(object = tom.model.Tables[tName], language=trName, property = prop, value = translation) - elif oType == 'Column': - tom.set_translation(object = tom.model.Tables[tName].Columns[oName], language=trName, property = prop, value = translation) - elif oType == 'Measure': - tom.set_translation(object = tom.model.Tables[tName].Measures[oName], language=trName, property = prop, value = translation) - elif oType == 'Hierarchy': - tom.set_translation(object = tom.model.Tables[tName].Hierarchies[oName], language=trName, property = prop, value = translation) - elif oType == 'Level': - - pattern = r'\[([^]]+)\]' - matches = re.findall(pattern, oName) - lName = matches[0] - - pattern = r"'([^']+)'" - matches = re.findall(pattern, oName) - hName = matches[0] - tom.set_translation(object = tom.model.Tables[tName].Hierarchies[hName].Levels[lName], language=trName, property = prop, value = translation) - except: - pass - - print(f"\n{green_dot} Migration of objects from '{dataset}' -> '{new_dataset}' is complete.") - - except Exception as e: - if datetime.datetime.now() - start_time > timeout: - break - time.sleep(1) \ No newline at end of file diff --git a/sempy_labs/MigrateTablesColumnsToSemanticModel.py b/sempy_labs/MigrateTablesColumnsToSemanticModel.py deleted file mode 100644 index 47f5d054..00000000 --- a/sempy_labs/MigrateTablesColumnsToSemanticModel.py +++ /dev/null @@ -1,135 +0,0 @@ -import sempy -import sempy.fabric as fabric -import pandas as pd -import datetime, time -from .ListFunctions import list_tables -from .GetSharedExpression import get_shared_expression -from .HelperFunctions import resolve_lakehouse_name -from .Lakehouse import lakehouse_attached -from .TOM import connect_semantic_model -from typing import List, Optional, Union 
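The level-translation branch above pulls the level and hierarchy names out of an object reference written as 'Hierarchy Name'[Level Name] using two regular expressions. A minimal standalone sketch of that parsing, with an illustrative object name:

import re

oName = "'Calendar Hierarchy'[Month]"         # illustrative level reference
lName = re.findall(r'\[([^]]+)\]', oName)[0]  # level name      -> 'Month'
hName = re.findall(r"'([^']+)'", oName)[0]    # hierarchy name  -> 'Calendar Hierarchy'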
-from sempy._utils._log import log - -green_dot = '\U0001F7E2' -yellow_dot = '\U0001F7E1' -red_dot = '\U0001F534' -in_progress = '⌛' - -@log -def migrate_tables_columns_to_semantic_model(dataset: str, new_dataset: str, workspace: Optional[str] = None, new_dataset_workspace: Optional[str] = None, lakehouse: Optional[str] = None, lakehouse_workspace: Optional[str] = None): - - """ - Adds tables/columns to the new Direct Lake semantic model based on an import/DirectQuery semantic model. - - Parameters - ---------- - dataset : str - Name of the import/DirectQuery semantic model. - new_dataset : str - Name of the Direct Lake semantic model. - workspace : str, default=None - The Fabric workspace name in which the import/DirectQuery semantic model exists. - Defaults to None which resolves to the workspace of the attached lakehouse - or if no lakehouse attached, resolves to the workspace of the notebook. - new_dataset_workspace : str - The Fabric workspace name in which the Direct Lake semantic model will be created. - Defaults to None which resolves to the workspace of the attached lakehouse - or if no lakehouse attached, resolves to the workspace of the notebook. - lakehouse : str, default=None - The Fabric lakehouse used by the Direct Lake semantic model. - Defaults to None which resolves to the lakehouse attached to the notebook. - lakehouse_workspace : str, default=None - The Fabric workspace used by the lakehouse. - Defaults to None which resolves to the workspace of the attached lakehouse - or if no lakehouse attached, resolves to the workspace of the notebook. - - Returns - ------- - - """ - - if workspace == None: - workspace_id = fabric.get_workspace_id() - workspace = fabric.resolve_workspace_name(workspace_id) - else: - workspace_id = fabric.resolve_workspace_id(workspace) - - if new_dataset_workspace == None: - new_dataset_workspace = workspace - - if lakehouse_workspace == None: - lakehouse_workspace = new_dataset_workspace - - if lakehouse == None: - lakehouse_id = fabric.get_lakehouse_id() - lakehouse = resolve_lakehouse_name(lakehouse_id, lakehouse_workspace) - - # Check that lakehouse is attached to the notebook - lakeAttach = lakehouse_attached() - - # Run if lakehouse is attached to the notebook or a lakehouse & lakehouse workspace are specified - if lakeAttach or (lakehouse is not None and lakehouse_workspace is not None): - shEx = get_shared_expression(lakehouse, lakehouse_workspace) - - dfC = fabric.list_columns(dataset = dataset, workspace = workspace) - dfT = list_tables(dataset, workspace) - dfT.rename(columns={'Type': 'Table Type'}, inplace=True) - dfC = pd.merge(dfC, dfT[['Name', 'Table Type']], left_on = 'Table Name', right_on = 'Name', how='left') - dfT_filt = dfT[dfT['Table Type'] == 'Table'] - dfC_filt = dfC[(dfC['Table Type'] == 'Table') & ~(dfC['Column Name'].str.startswith('RowNumber-')) & (dfC['Type'] != 'Calculated')] - - print(f"{in_progress} Updating '{new_dataset}' based on '{dataset}'...") - start_time = datetime.datetime.now() - timeout = datetime.timedelta(minutes=1) - success = False - - while not success: - try: - with connect_semantic_model(dataset=new_dataset, readonly=False, workspace=new_dataset_workspace) as tom: - success = True - try: - tom.model.Expressions['DatabaseQuery'] - except: - tom.add_expression('DatabaseQuery', expression = shEx) - print(f"{green_dot} The 'DatabaseQuery' expression has been added.") - - for i, r in dfT_filt.iterrows(): - tName = r['Name'] - tDC = r['Data Category'] - tHid = bool(r['Hidden']) - tDesc = r['Description'] 
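The loop opened above follows the retry-until-timeout pattern shared by these migration helpers: keep retrying the TOM write for up to a minute, sleeping one second between attempts. A stripped-down sketch of the pattern, with do_work standing in for the real connect_semantic_model block:

import datetime, time

def do_work():
    # placeholder for the TOM operations performed inside the loop
    pass

start_time = datetime.datetime.now()
timeout = datetime.timedelta(minutes=1)
success = False

while not success:
    try:
        do_work()
        success = True
    except Exception:
        if datetime.datetime.now() - start_time > timeout:
            break        # give up after one minute
        time.sleep(1)    # wait a second and retry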
- - try: - tom.model.Tables[tName] - except: - tom.add_table(name = tName, description=tDesc, data_category=tDC, hidden=tHid) - tom.add_entity_partition(table_name = tName, entity_name = tName.replace(' ','_')) - print(f"{green_dot} The '{tName}' table has been added.") - - for i, r in dfC_filt.iterrows(): - tName = r['Table Name'] - cName = r['Column Name'] - scName = r['Source'].replace(' ','_') - cHid = bool(r['Hidden']) - cDataType = r['Data Type'] - - try: - tom.model.Tables[tName].Columns[cName] - except: - tom.add_data_column(table_name=tName, column_name=cName, source_column=scName, hidden=cHid, data_type=cDataType) - print(f"{green_dot} The '{tName}'[{cName}] column has been added.") - - print(f"\n{green_dot} All regular tables and columns have been added to the '{new_dataset}' semantic model.") - except Exception as e: - if datetime.datetime.now() - start_time > timeout: - break - time.sleep(1) - else: - print(f"{red_dot} Lakehouse not attached to notebook and lakehouse/lakehouse_workspace are not specified. Please add your lakehouse to this notebook or specify the lakehouse/lakehouse_workspace parameters.") - print(f"To attach a lakehouse to a notebook, go to the the 'Explorer' window to the left, click 'Lakehouses' to add your lakehouse to this notebook") - print(f"\nLearn more here: https://learn.microsoft.com/fabric/data-engineering/lakehouse-notebook-explore#add-or-remove-a-lakehouse") - - - - - \ No newline at end of file diff --git a/sempy_labs/MigrationValidation.py b/sempy_labs/MigrationValidation.py deleted file mode 100644 index 150f7f78..00000000 --- a/sempy_labs/MigrationValidation.py +++ /dev/null @@ -1,133 +0,0 @@ -import sempy -import sempy.fabric as fabric -import pandas as pd -from .HelperFunctions import create_relationship_name -from .TOM import connect_semantic_model -from typing import List, Optional, Union -from sempy._utils._log import log - -def list_semantic_model_objects(dataset: str, workspace: Optional[str] = None): - - """ - Shows a list of semantic model objects. - - Parameters - ---------- - dataset : str - Name of the semantic model. - workspace : str, default=None - The Fabric workspace name. - Defaults to None which resolves to the workspace of the attached lakehouse - or if no lakehouse attached, resolves to the workspace of the notebook. 
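A hypothetical invocation of migrate_tables_columns_to_semantic_model as defined above, assuming a default lakehouse is attached to the notebook and that the relocated sempy_labs.migration package re-exports the function (model names are illustrative):

from sempy_labs.migration import migrate_tables_columns_to_semantic_model

migrate_tables_columns_to_semantic_model(
    dataset="AdvWorks",         # illustrative import/DirectQuery source model
    new_dataset="AdvWorks DL",  # illustrative Direct Lake target model
)   # workspace/lakehouse parameters default to the attached lakehouse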
- - - Returns - ------- - pandas.DataFrame - A pandas dataframe showing a list of objects in the semantic model - """ - - if workspace is None: - workspace = fabric.resolve_workspace_name() - - df = pd.DataFrame(columns=['Parent Name', 'Object Name', 'Object Type']) - with connect_semantic_model(dataset=dataset, workspace = workspace, readonly=True) as tom: - for t in tom.model.Tables: - if t.CalculationGroup is not None: - new_data = {'Parent Name': t.Parent.Name, 'Object Name': t.Name, 'Object Type': 'Calculation Group'} - df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) - for ci in t.CalculationGroup.CalculationItems: - new_data = {'Parent Name': t.Name, 'Object Name': ci.Name, 'Object Type': str(ci.ObjectType)} - df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) - elif any(str(p.SourceType) == 'Calculated' for p in t.Partitions): - new_data = {'Parent Name': t.Parent.Name, 'Object Name': t.Name, 'Object Type': 'Calculated Table'} - df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) - else: - new_data = {'Parent Name': t.Parent.Name, 'Object Name': t.Name, 'Object Type': str(t.ObjectType)} - df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) - for c in t.Columns: - if str(c.Type) != 'RowNumber': - if str(c.Type) == 'Calculated': - new_data = {'Parent Name': c.Parent.Name, 'Object Name': c.Name, 'Object Type': 'Calculated Column'} - df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) - else: - new_data = {'Parent Name': c.Parent.Name, 'Object Name': c.Name, 'Object Type': str(c.ObjectType)} - df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) - for m in t.Measures: - new_data = {'Parent Name': m.Parent.Name, 'Object Name': m.Name, 'Object Type': str(m.ObjectType)} - df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) - for h in t.Hierarchies: - new_data = {'Parent Name': h.Parent.Name, 'Object Name': h.Name, 'Object Type': str(h.ObjectType)} - df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) - for l in h.Levels: - new_data = {'Parent Name': l.Parent.Name, 'Object Name': l.Name, 'Object Type': str(l.ObjectType)} - df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) - for p in t.Partitions: - new_data = {'Parent Name': p.Parent.Name, 'Object Name': p.Name, 'Object Type': str(p.ObjectType)} - df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) - for r in tom.model.Relationships: - rName = create_relationship_name(r.FromTable.Name, r.FromColumn.Name, r.ToTable.Name, r.ToColumn.Name) - new_data = {'Parent Name': r.Parent.Name, 'Object Name': rName, 'Object Type': str(r.ObjectType)} - df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) - for role in tom.model.Roles: - new_data = {'Parent Name': role.Parent.Name, 'Object Name': role.Name, 'Object Type': str(role.ObjectType)} - df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) - for rls in role.TablePermissions: - new_data = {'Parent Name': role.Name, 'Object Name': rls.Name, 'Object Type': str(rls.ObjectType)} - df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) - for tr in tom.model.Cultures: - new_data = {'Parent Name': tr.Parent.Name, 'Object Name': tr.Name, 'Object Type': str(tr.ObjectType)} - df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) - for per in tom.model.Perspectives: - new_data = {'Parent Name': 
per.Parent.Name, 'Object Name': per.Name, 'Object Type': str(per.ObjectType)} - df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) - - return df - -@log -def migration_validation(dataset: str, new_dataset: str, workspace: Optional[str] = None, new_dataset_workspace: Optional[str] = None): - - """ - Shows the objects in the original semantic model and whether then were migrated successfully or not. - - Parameters - ---------- - dataset : str - Name of the import/DirectQuery semantic model. - new_dataset : str - Name of the Direct Lake semantic model. - workspace : str, default=None - The Fabric workspace name in which the import/DirectQuery semantic model exists. - Defaults to None which resolves to the workspace of the attached lakehouse - or if no lakehouse attached, resolves to the workspace of the notebook. - new_dataset_workspace : str - The Fabric workspace name in which the Direct Lake semantic model will be created. - Defaults to None which resolves to the workspace of the attached lakehouse - or if no lakehouse attached, resolves to the workspace of the notebook. - - Returns - ------- - pandas.DataFrame - A pandas dataframe showing a list of objects and whether they were successfully migrated. Also shows the % of objects which were migrated successfully. - """ - - dfA = list_semantic_model_objects(dataset = dataset, workspace = workspace) - dfB = list_semantic_model_objects(dataset = new_dataset, workspace = new_dataset_workspace) - - def is_migrated(row): - if row['Object Type'] == 'Calculated Table': - return ((dfB['Parent Name'] == row['Parent Name']) & - (dfB['Object Name'] == row['Object Name']) & - (dfB['Object Type'].isin(['Calculated Table', 'Table']))).any() - else: - return ((dfB['Parent Name'] == row['Parent Name']) & - (dfB['Object Name'] == row['Object Name']) & - (dfB['Object Type'] == row['Object Type'])).any() - - dfA['Migrated'] = dfA.apply(is_migrated, axis=1) - - denom = len(dfA) - num = len(dfA[dfA['Migrated']]) - print(f"{100 * round(num / denom,2)}% migrated") - - return dfA \ No newline at end of file diff --git a/sempy_labs/ModelAutoBuild.py b/sempy_labs/ModelAutoBuild.py index 6497061c..befa151a 100644 --- a/sempy_labs/ModelAutoBuild.py +++ b/sempy_labs/ModelAutoBuild.py @@ -2,14 +2,20 @@ import sempy.fabric as fabric import pandas as pd from .TOM import connect_semantic_model -from .CreateBlankSemanticModel import create_blank_semantic_model -from .GetSharedExpression import get_shared_expression +from ._create_blank_semantic_model import create_blank_semantic_model +from .directlake.GetSharedExpression import get_shared_expression from typing import List, Optional, Union from sempy._utils._log import log -@log -def model_auto_build(dataset: str, file_path: str, workspace: Optional[str] = None, lakehouse: Optional[str] = None, lakehouse_workspace: Optional[str] = None): +@log +def model_auto_build( + dataset: str, + file_path: str, + workspace: Optional[str] = None, + lakehouse: Optional[str] = None, + lakehouse_workspace: Optional[str] = None, +): """ Dynamically generates a semantic model based on an Excel file template. @@ -29,10 +35,10 @@ def model_auto_build(dataset: str, file_path: str, workspace: Optional[str] = No The Fabric workspace used by the lakehouse. Defaults to None which resolves to the workspace of the attached lakehouse or if no lakehouse attached, resolves to the workspace of the notebook. 
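One detail of the is_migrated check above that is easy to miss: a 'Calculated Table' in the source model counts as migrated when the target contains an object of the same name typed as either 'Calculated Table' or plain 'Table', presumably because calculated tables are recreated as regular tables during migration. A toy illustration with made-up frames:

import pandas as pd

dfA = pd.DataFrame([{'Parent Name': 'Model', 'Object Name': 'Sales Agg', 'Object Type': 'Calculated Table'}])
dfB = pd.DataFrame([{'Parent Name': 'Model', 'Object Name': 'Sales Agg', 'Object Type': 'Table'}])

row = dfA.iloc[0]
migrated = ((dfB['Parent Name'] == row['Parent Name'])
            & (dfB['Object Name'] == row['Object Name'])
            & (dfB['Object Type'].isin(['Calculated Table', 'Table']))).any()
print(migrated)   # True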
- + Returns ------- - + """ if workspace is None: @@ -42,76 +48,96 @@ def model_auto_build(dataset: str, file_path: str, workspace: Optional[str] = No if lakehouse_workspace is None: lakehouse_workspace = workspace - sheets = ['Model', 'Tables', 'Measures', 'Columns', 'Roles', 'Hierarchies', 'Relationships'] + sheets = [ + "Model", + "Tables", + "Measures", + "Columns", + "Roles", + "Hierarchies", + "Relationships", + ] - create_blank_semantic_model(dataset=dataset, workspace = workspace) + create_blank_semantic_model(dataset=dataset, workspace=workspace) - with connect_semantic_model(dataset = dataset, workspace = workspace) as tom: + with connect_semantic_model(dataset=dataset, workspace=workspace) as tom: - #DL Only + # DL Only expr = get_shared_expression(lakehouse=lakehouse, workspace=lakehouse_workspace) - tom.add_expression(name = 'DatbaseQuery', expression = expr) + tom.add_expression(name="DatbaseQuery", expression=expr) for sheet in sheets: - df = pd.read_excel(file_path, sheet_name= sheet) + df = pd.read_excel(file_path, sheet_name=sheet) - if sheet == 'Tables': + if sheet == "Tables": for i, r in df.iterrows(): - tName = r['Table Name'] - desc = r['Description'] - dc = r['Data Category'] - mode = r['Mode'] - hidden = bool(r['Hidden']) - - tom.add_table(name = tName, description = desc, data_category=dc, hidden = hidden) - if mode == 'DirectLake': - tom.add_entity_partition(table_name = tName, entity_name=tName) - elif sheet == 'Columns': + tName = r["Table Name"] + desc = r["Description"] + dc = r["Data Category"] + mode = r["Mode"] + hidden = bool(r["Hidden"]) + + tom.add_table( + name=tName, description=desc, data_category=dc, hidden=hidden + ) + if mode == "DirectLake": + tom.add_entity_partition(table_name=tName, entity_name=tName) + elif sheet == "Columns": for i, r in df.iterrows(): - tName = r['Table Name'] - cName = r['Column Name'] - scName = r['Source Column'] - dataType = r['Data Type'] - hidden = bool(r['Hidden']) - key = bool(r['Key']) - if dataType == 'Integer': - dataType = 'Int64' - desc = r['Description'] + tName = r["Table Name"] + cName = r["Column Name"] + scName = r["Source Column"] + dataType = r["Data Type"] + hidden = bool(r["Hidden"]) + key = bool(r["Key"]) + if dataType == "Integer": + dataType = "Int64" + desc = r["Description"] tom.add_data_column( - table_name = tName, column_name=cName, source_column=scName, - data_type=dataType, description = desc, hidden=hidden, key=key) - elif sheet == 'Measures': + table_name=tName, + column_name=cName, + source_column=scName, + data_type=dataType, + description=desc, + hidden=hidden, + key=key, + ) + elif sheet == "Measures": for i, r in df.iterrows(): - tName = r['Table Name'] - mName = r['Measure Name'] - expr = r['Expression'] - desc = r['Description'] - format = r['Format String'] - hidden = bool(r['Hidden']) + tName = r["Table Name"] + mName = r["Measure Name"] + expr = r["Expression"] + desc = r["Description"] + format = r["Format String"] + hidden = bool(r["Hidden"]) tom.add_measure( - table_name = tName, measure_name=mName, - expression=expr, format_string=format, description=desc, hidden=hidden) - elif sheet == 'Relationships': + table_name=tName, + measure_name=mName, + expression=expr, + format_string=format, + description=desc, + hidden=hidden, + ) + elif sheet == "Relationships": for i, r in df.iterrows(): - fromTable = r['From Table'] - fromColumn = r['From Column'] - toTable = r['To Table'] - toColumn = r['To Column'] - fromCard = r['From Cardinality'] - toCard = r['To Cardinality'] + fromTable 
= r["From Table"] + fromColumn = r["From Column"] + toTable = r["To Table"] + toColumn = r["To Column"] + fromCard = r["From Cardinality"] + toCard = r["To Cardinality"] tom.add_relationship( - from_table=fromTable, from_column= fromColumn, - to_table=toTable, to_column = toColumn, - from_cardinality=fromCard, to_cardinality=toCard) - elif sheet == 'Roles': - print('hi') - elif sheet == 'Hierarchies': - print('hi') - - - - - \ No newline at end of file + from_table=fromTable, + from_column=fromColumn, + to_table=toTable, + to_column=toColumn, + from_cardinality=fromCard, + to_cardinality=toCard, + ) + elif sheet == "Roles": + print("hi") + elif sheet == "Hierarchies": + print("hi") diff --git a/sempy_labs/ModelBPA.py b/sempy_labs/ModelBPA.py index 89f2ff97..9059c07f 100644 --- a/sempy_labs/ModelBPA.py +++ b/sempy_labs/ModelBPA.py @@ -6,309 +6,709 @@ from IPython.display import display, HTML from pyspark.sql import SparkSession from .GetMeasureDependencies import get_measure_dependencies -from .HelperFunctions import format_dax_object_name, resolve_lakehouse_name -from .Lakehouse import lakehouse_attached -from .GetLakehouseTables import get_lakehouse_tables +from ._helper_functions import format_dax_object_name, resolve_lakehouse_name +from .lakehouse.Lakehouse import lakehouse_attached +from .lakehouse.GetLakehouseTables import get_lakehouse_tables from typing import List, Optional, Union from sempy._utils._log import log -def model_bpa_rules(): - """ +def model_bpa_rules(): + """ Shows the default rules for the semantic model BPA used by the run_model_bpa function. Parameters ---------- - + Returns ------- pandas.DataFrame A pandas dataframe containing the default rules for the run_model_bpa function. """ - df_rules = pd.DataFrame([ - ('Performance', 'Column', 'Warning', 'Do not use floating point data types', - lambda df: df['Data Type'] == 'Double', - 'The "Double" floating point data type should be avoided, as it can result in unpredictable roundoff errors and decreased performance in certain scenarios. Use "Int64" or "Decimal" where appropriate (but note that "Decimal" is limited to 4 digits after the decimal sign).', - ), - ('Performance', 'Column', 'Warning', 'Avoid using calculated columns', - lambda df: df['Type'] == 'Calculated', - 'Calculated columns do not compress as well as data columns so they take up more memory. They also slow down processing times for both the table as well as process recalc. Offload calculated column logic to your data warehouse and turn these calculated columns into data columns.', - 'https://www.elegantbi.com/post/top10bestpractices', - ), - ('Performance', 'Relationship', 'Warning', 'Check if bi-directional and many-to-many relationships are valid', - lambda df: (df['Multiplicity'] == 'm:m') | (df['Cross Filtering Behavior'] == 'BothDirections'), - 'Bi-directional and many-to-many relationships may cause performance degradation or even have unintended consequences. Make sure to check these specific relationships to ensure they are working as designed and are actually necessary.', - 'https://www.sqlbi.com/articles/bidirectional-relationships-and-ambiguity-in-dax' - ), - ('Performance', 'Row Level Security', 'Info', 'Check if dynamic row level security (RLS) is necessary', - lambda df: df['Is Dynamic'], - 'Usage of dynamic row level security (RLS) can add memory and performance overhead. 
Please research the pros/cons of using it.', - 'https://docs.microsoft.com/power-bi/admin/service-admin-rls', - ), - ('Performance', 'Table', 'Warning', 'Avoid using many-to-many relationships on tables used for dynamic row level security', - lambda df: (df['Used in M2M Relationship'] == True) & (df['Used in Dynamic RLS'] == True), - "Using many-to-many relationships on tables which use dynamic row level security can cause serious query performance degradation. This pattern's performance problems compound when snowflaking multiple many-to-many relationships against a table which contains row level security. Instead, use one of the patterns shown in the article below where a single dimension table relates many-to-one to a security table.", - 'https://www.elegantbi.com/post/dynamicrlspatterns', - ), - ('Performance', 'Relationship', 'Warning', 'Many-to-many relationships should be single-direction', - lambda df: (df['Multiplicity'] == 'm:m') & (df['Cross Filtering Behavior'] == 'BothDirections'), - ), - ('Performance', 'Column', 'Warning', 'Set IsAvailableInMdx to false on non-attribute columns', - lambda df: (df['Is Direct Lake'] == False) & (df['Is Available in MDX'] == True) & ((df['Hidden'] == True) | (df['Parent Is Hidden'] == True)) & (df['Used in Sort By'] == False) & (df['Used in Hierarchy'] == False) & (df['Sort By Column'] == None), - 'To speed up processing time and conserve memory after processing, attribute hierarchies should not be built for columns that are never used for slicing by MDX clients. In other words, all hidden columns that are not used as a Sort By Column or referenced in user hierarchies should have their IsAvailableInMdx property set to false. The IsAvailableInMdx property is not relevant for Direct Lake models.', - 'https://blog.crossjoin.co.uk/2018/07/02/isavailableinmdx-ssas-tabular', - ), - #('Performance', 'Partition', 'Warning', "Set 'Data Coverage Definition' property on the DirectQuery partition of a hybrid table", - # lambda df: (df['Data Coverage Definition Expression'].isnull()) & (df['Mode'] == 'DirectQuery') & (df['Import Partitions'] > 0) & (df['Has Date Table']), - # "Setting the 'Data Coverage Definition' property may lead to better performance because the engine knows when it can only query the import-portion of the table and when it needs to query the DirectQuery portion of the table.", - # "https://learn.microsoft.com/analysis-services/tom/table-partitions?view=asallproducts-allversions", - #), - ('Performance', 'Table', 'Warning', "Set dimensions tables to dual mode instead of import when using DirectQuery on fact tables", - lambda df: (df['Import Partitions'] == 1) & (df['Model Has DQ']) & (df['Used in Relationship x:1']), - "https://learn.microsoft.com/power-bi/transform-model/desktop-storage-mode#propagation-of-the-dual-setting", - - ), - ('Performance', 'Partition', 'Warning', 'Minimize Power Query transformations', - lambda df: (df['Source Type'] == 'M') & (('Table.Combine(\"' in df['Query']) | ('Table.Join(\"' in df['Query']) | ('Table.NestedJoin(\"' in df['Query']) | ('Table.AddColumn(\"' in df['Query']) | ('Table.Group(\"' in df['Query']) | ('Table.Sort(\"' in df['Query']) | ('Table.Sort(\"' in df['Query']) | ('Table.Pivot(\"' in df['Query']) | ('Table.Unpivot(\"' in df['Query']) | ('Table.UnpivotOtherColumns(\"' in df['Query']) | ('Table.Distinct(\"' in df['Query']) | ('[Query=(\"\"SELECT' in df['Query']) | ('Value.NativeQuery' in df['Query']) | ('OleDb.Query' in df['Query']) | ('Odbc.Query' in df['Query']) ), - 'Minimize Power Query 
transformations in order to improve model processing performance. It is a best practice to offload these transformations to the data warehouse if possible. Also, please check whether query folding is occurring within your model. Please reference the article below for more information on query folding.', - 'https://docs.microsoft.com/power-query/power-query-folding', - ), - ('Performance', 'Table', 'Warning', 'Consider a star-schema instead of a snowflake architecture', - lambda df: (df['Type'] != 'Calculation Group') & df['Used in Relationship Both Sides'], - 'Generally speaking, a star-schema is the optimal architecture for tabular models. That being the case, there are valid cases to use a snowflake approach. Please check your model and consider moving to a star-schema architecture.', - 'https://docs.microsoft.com/power-bi/guidance/star-schema', - ), - ('Performance', 'Table', 'Warning', 'Reduce usage of calculated tables', - lambda df: df['Type'] == 'Calculated Table', - 'Migrate calculated table logic to your data warehouse. Reliance on calculated tables will lead to technical debt and potential misalignments if you have multiple models on your platform.', - ), - ('Performance', 'Column', 'Warning', 'Reduce usage of calculated columns that use the RELATED function', - lambda df: (df['Type'] == 'Calculated') & (df['Source'].str.contains(r'related\s*\(', case=False)), - 'Calculated columns do not compress as well as data columns and may cause longer processing times. As such, calculated columns should be avoided if possible. One scenario where they may be easier to avoid is if they use the RELATED function.', - 'https://www.sqlbi.com/articles/storage-differences-between-calculated-columns-and-calculated-tables', - ), - ('Performance', 'Model', 'Warning', 'Avoid excessive bi-directional or many-to-many relationships', - lambda df: (df['M2M or BiDi Relationship Count'] / df['Relationship Count']) > 0.3, - 'Limit use of b-di and many-to-many relationships. This rule flags the model if more than 30% of relationships are bi-di or many-to-many.', - 'https://www.sqlbi.com/articles/bidirectional-relationships-and-ambiguity-in-dax', - ), - ('Performance', 'Column', 'Warning', 'Avoid bi-directional or many-to-many relationships against high-cardinality columns', - lambda df: df['Used in M2M/BiDi Relationship'] & df['Column Cardinality'] > 100000, - 'For best performance, it is recommended to avoid using bi-directional relationships against high-cardinality columns', - ), - ('Performance', 'Table', 'Warning', 'Remove auto-date table', - lambda df: (df['Type'] == 'Calculated Table') & ( (df['Name'].str.startswith('DateTableTemplate_')) | (df['Name'].str.startswith('LocalDateTable_')) ), - 'Avoid using auto-date tables. Make sure to turn off auto-date table in the settings in Power BI Desktop. 
This will save memory resources.', - 'https://www.youtube.com/watch?v=xu3uDEHtCrg', - ), - ('Performance', 'Table', 'Warning', 'Date/calendar tables should be marked as a date table', - lambda df: ( (df['Name'].str.contains(r'date', case=False)) | (df['Name'].str.contains(r'calendar', case=False)) ) & (df['Data Category'] != 'Time'), - "This rule looks for tables that contain the words 'date' or 'calendar' as they should likely be marked as a date table.", - 'https://docs.microsoft.com/power-bi/transform-model/desktop-date-tables', - ), - ('Performance', 'Table', 'Warning', 'Large tables should be partitioned', - lambda df: (df['Is Direct Lake'] == False) & (df['Partition Count'] == 1) & (df['Row Count'] > 25000000), - 'Large tables should be partitioned in order to optimize processing. This is not relevant for semantic models in Direct Lake mode as they can only have one partition per table.', - ), - ('Performance', 'Row Level Security', 'Warning', 'Limit row level security (RLS) logic', - lambda df: df['Filter Expression'].str.contains('|'.join(['right', 'left', 'filter', 'upper', 'lower', 'find' ]), case=False), - 'Try to simplify the DAX used for row level security. Usage of the functions within this rule can likely be offloaded to the upstream systems (data warehouse).', - ), - ('Performance', 'Model', 'Warning', 'Model should have a date table', - lambda df: df['Has Date Table'], - 'Generally speaking, models should generally have a date table. Models that do not have a date table generally are not taking advantage of features such as time intelligence or may not have a properly structured architecture.', - ), - ('Performance', 'Measure', 'Warning', 'Measures using time intelligence and model is using Direct Query', - lambda df: df['DQ Date Function Used'], - 'At present, time intelligence functions are known to not perform as well when using Direct Query. If you are having performance issues, you may want to try alternative solutions such as adding columns in the fact table that show previous year or previous month data.', - ), - ('Error Prevention', 'Calculation Item', 'Error', 'Calculation items must have an expression', - lambda df: df['Expression'].str.len() == 0, - 'Calculation items must have an expression. Without an expression, they will not show any values.', - ), - ('Error Prevention', ['Table', 'Column', 'Measure', 'Hierarchy', 'Partition'], 'Error', 'Avoid invalid characters in names', - lambda df: df['Name'].apply(lambda x: any(unicodedata.category(char) == 'Cc' and not char.isspace() for char in x)), - 'This rule identifies if a name for a given object in your model (i.e. table/column/measure) which contains an invalid character. Invalid characters will cause an error when deploying the model (and failure to deploy). This rule has a fix expression which converts the invalid character into a space, resolving the issue.', - ), - ('Error Prevention', ['Table', 'Column', 'Measure', 'Hierarchy'], 'Error', 'Avoid invalid characters in descriptions', - lambda df: df['Description'].apply(lambda x: any(unicodedata.category(char) == 'Cc' and not char.isspace() for char in x)), - 'This rule identifies if a description for a given object in your model (i.e. table/column/measure) which contains an invalid character. Invalid characters will cause an error when deploying the model (and failure to deploy). 
This rule has a fix expression which converts the invalid character into a space, resolving the issue.', - ), - ('Error Prevention', 'Relationship', 'Warning', 'Relationship columns should be of the same data type', - lambda df: df['From Column Data Type'] != df['To Column Data Type'], - "Columns used in a relationship should be of the same data type. Ideally, they will be of integer data type (see the related rule '[Formatting] Relationship columns should be of integer data type'). Having columns within a relationship which are of different data types may lead to various issues.", - ), - ('Error Prevention', 'Column', 'Error', 'Data columns must have a source column', - lambda df: (df['Type'] == 'Data') & (df['Source'].str.len() == 0), - 'Data columns must have a source column. A data column without a source column will cause an error when processing the model.', - ), - ('Error Prevention', 'Column', 'Warning', 'Set IsAvailableInMdx to true on necessary columns', - lambda df: (df['Is Direct Lake'] == False) & (df['Is Available in MDX'] == False) & ((df['Used in Sort By'] == True) | (df['Used in Hierarchy'] == True) | (df['Sort By Column'] != None)), - 'In order to avoid errors, ensure that attribute hierarchies are enabled if a column is used for sorting another column, used in a hierarchy, used in variations, or is sorted by another column. The IsAvailableInMdx property is not relevant for Direct Lake models.', - ), - ('Error Prevention', 'Table', 'Error', 'Avoid the USERELATIONSHIP function and RLS against the same table', - lambda df: (df['USERELATIONSHIP Used'] == True) & (df['Used in RLS'] == True), - "The USERELATIONSHIP function may not be used against a table which also leverages row-level security (RLS). This will generate an error when using the particular measure in a visual. This rule will highlight the table which is used in a measure's USERELATIONSHIP function as well as RLS.", - 'https://blog.crossjoin.co.uk/2013/05/10/userelationship-and-tabular-row-security', - ), - ('DAX Expressions', 'Measure', 'Warning', 'Avoid using the IFERROR function', - lambda df: df['Measure Expression'].str.contains(r'irerror\s*\(', case=False), - 'Avoid using the IFERROR function as it may cause performance degradation. 
If you are concerned about a divide-by-zero error, use the DIVIDE function as it naturally resolves such errors as blank (or you can customize what should be shown in case of such an error).', - 'https://www.elegantbi.com/post/top10bestpractices', - ), - ('DAX Expressions', 'Measure', 'Warning', 'Use the TREATAS function instead of INTERSECT for virtual relationships', - lambda df: df['Measure Expression'].str.contains(r'intersect\s*\(', case=False), - 'The TREATAS function is more efficient and provides better performance than the INTERSECT function when used in virutal relationships.', - 'https://www.sqlbi.com/articles/propagate-filters-using-treatas-in-dax', - ), - ('DAX Expressions', 'Measure', 'Warning', 'The EVALUATEANDLOG function should not be used in production models', - lambda df: df['Measure Expression'].str.contains(r'evaluateandlog\s*\(', case=False), - 'The EVALUATEANDLOG function is meant to be used only in development/test environments and should not be used in production models.', - 'https://pbidax.wordpress.com/2022/08/16/introduce-the-dax-evaluateandlog-function', - ), - ('DAX Expressions', 'Measure', 'Warning', 'Measures should not be direct references of other measures', - lambda df: df['Measure Expression'].str.strip().isin(df['Measure Object']), - "This rule identifies measures which are simply a reference to another measure. As an example, consider a model with two measures: [MeasureA] and [MeasureB]. This rule would be triggered for MeasureB if MeasureB's DAX was MeasureB:=[MeasureA]. Such duplicative measures should be removed.", - ), - ('DAX Expressions', 'Measure', 'Warning', 'No two measures should have the same definition', - lambda df: df['Measure Expression'].apply(lambda x: re.sub(r'\s+', '', x)).duplicated(keep=False), - 'Two measures with different names and defined by the same DAX expression should be avoided to reduce redundancy.', - ), - ('DAX Expressions', 'Measure', 'Warning', 'Avoid addition or subtraction of constant values to results of divisions', - lambda df: df["Measure Expression"].str.contains("(?i)DIVIDE\\s*\\((\\s*.*?)\\)\\s*[+-]\\s*1" or "\\/\\s*.*(?=[-+]\\s*1)", regex=True), - ), - ('DAX Expressions', 'Measure', 'Warning', "Avoid using '1-(x/y)' syntax", - lambda df: df['Measure Expression'].str.contains("[0-9]+\\s*[-+]\\s*[\\(]*\\s*(?i)SUM\\s*\\(\\s*'*[A-Za-z0-9 _]+'*\\s*\\[[A-Za-z0-9 _]+\\]\\s*\\)\\s*\\/" or '[0-9]+\\s*[-+]\\s*(?i)DIVIDE\\s*\\(', regex=True), - "Instead of using the '1-(x/y)' or '1+(x/y)' syntax to achieve a percentage calculation, use the basic DAX functions (as shown below). Using the improved syntax will generally improve the performance. The '1+/-...' syntax always returns a value whereas the solution without the '1+/-...' does not (as the value may be 'blank'). Therefore the '1+/-...' syntax may return more rows/columns which may result in a slower query speed. 
Let's clarify with an example: Avoid this: 1 - SUM ( 'Sales'[CostAmount] ) / SUM( 'Sales'[SalesAmount] ) Better: DIVIDE ( SUM ( 'Sales'[SalesAmount] ) - SUM ( 'Sales'[CostAmount] ), SUM ( 'Sales'[SalesAmount] ) ) Best: VAR x = SUM ( 'Sales'[SalesAmount] ) RETURN DIVIDE ( x - SUM ( 'Sales'[CostAmount] ), x )", - ), - ('DAX Expressions', 'Measure', 'Warning', 'Filter measure values by columns, not tables', - lambda df: df['Measure Expression'].str.contains("(?i)CALCULATE\\s*\\(\\s*[^,]+,\\s*(?i)FILTER\\s*\\(\\s*'*[A-Za-z0-9 _]+'*\\s*,\\s*\\[[^\\]]+\\]" or "(?i)CALCULATETABLE\\s*\\([^,]*,\\s*(?i)FILTER\\s*\\(\\s*'*[A-Za-z0-9 _]+'*\\s*,\\s*\\[", regex=True), - "Instead of using this pattern FILTER('Table',[Measure]>Value) for the filter parameters of a CALCULATE or CALCULATETABLE function, use one of the options below (if possible). Filtering on a specific column will produce a smaller table for the engine to process, thereby enabling faster performance. Using the VALUES function or the ALL function depends on the desired measure result.\nOption 1: FILTER(VALUES('Table'[Column]),[Measure] > Value)\nOption 2: FILTER(ALL('Table'[Column]),[Measure] > Value)", - 'https://docs.microsoft.com/power-bi/guidance/dax-avoid-avoid-filter-as-filter-argument', - ), - ('DAX Expressions', 'Measure', 'Warning', 'Filter column values with proper syntax', - lambda df: df['Measure Expression'].str.contains("(?i)CALCULATE\\s*\\(\\s*[^,]+,\\s*(?i)FILTER\\s*\\(\\s*'*[A-Za-z0-9 _]+'*\\s*,\\s*'*[A-Za-z0-9 _]+'*\\[[A-Za-z0-9 _]+\\]" or "(?i)CALCULATETABLE\\s*\\([^,]*,\\s*(?i)FILTER\\s*\\(\\s*'*[A-Za-z0-9 _]+'*\\s*,\\s*'*[A-Za-z0-9 _]+'*\\[[A-Za-z0-9 _]+\\]", regex=True), - "Instead of using this pattern FILTER('Table','Table'[Column]=\"Value\") for the filter parameters of a CALCULATE or CALCULATETABLE function, use one of the options below. As far as whether to use the KEEPFILTERS function, see the second reference link below.\nOption 1: KEEPFILTERS('Table'[Column]=\"Value\")\nOption 2: 'Table'[Column]=\"Value\"", - 'https://docs.microsoft.com/power-bi/guidance/dax-avoid-avoid-filter-as-filter-argument Reference: https://www.sqlbi.com/articles/using-keepfilters-in-dax', - ), - ('DAX Expressions', 'Measure', 'Warning', 'Use the DIVIDE function for division', - lambda df: df['Measure Expression'].str.contains("\\]\\s*\\/(?!\\/)(?!\\*)\" or \"\\)\\s*\\/(?!\\/)(?!\\*)",regex=True), - 'Use the DIVIDE function instead of using "/". The DIVIDE function resolves divide-by-zero cases. As such, it is recommended to use to avoid errors.', - 'https://docs.microsoft.com/power-bi/guidance/dax-divide-function-operator', - ), - ('DAX Expressions', 'Measure', 'Error', 'Column references should be fully qualified', - lambda df: df['Has Unqualified Column Reference'], - 'Using fully qualified column references makes it easier to distinguish between column and measure references, and also helps avoid certain errors. When referencing a column in DAX, first specify the table name, then specify the column name in square brackets.', - 'https://www.elegantbi.com/post/top10bestpractices', - ), - ('DAX Expressions', 'Measure', 'Error', 'Measure references should be unqualified', - lambda df: df['Has Fully Qualified Measure Reference'], - 'Using unqualified measure references makes it easier to distinguish between column and measure references, and also helps avoid certain errors. When referencing a measure using DAX, do not specify the table name. 
Use only the measure name in square brackets.', - 'https://www.elegantbi.com/post/top10bestpractices', - ), - ('DAX Expressions', 'Relationship', 'Warning', 'Inactive relationships that are never activated', - lambda df: df['Inactive without USERELATIONSHIP'], - 'Inactive relationships are activated using the USERELATIONSHIP function. If an inactive relationship is not referenced in any measure via this function, the relationship will not be used. It should be determined whether the relationship is not necessary or to activate the relationship via this method.', - 'https://dax.guide/userelationship', - ), - ('Maintenance', 'Column', 'Warning', 'Remove unnecessary columns', - lambda df: (df['Hidden'] | df['Parent Is Hidden']) & ~ df['Used in Relationship'] & ~ df['Used in Sort By'] & ~ df['Used in Hierarchy'] & (df['Referenced By'] == 0) & ~ (df['Used in RLS']), # usedInOLS - 'Hidden columns that are not referenced by any DAX expressions, relationships, hierarchy levels or Sort By-properties should be removed.', - ), - ('Maintenance', 'Measure', 'Warning', 'Remove unnecessary measures', - lambda df: df['Measure Hidden'] & (df['Referenced By'] == 0), - 'Hidden measures that are not referenced by any DAX expressions should be removed for maintainability.', - ), - #('Maintenance', 'Role', 'Warning', 'Remove roles with no members', - # lambda df: df['Member Count'] == 0, - #), - ('Maintenance', 'Table', 'Warning', 'Ensure tables have relationships', - lambda df: (df['Used in Relationship'] == False) & (df['Type'] != 'Calculation Group'), - 'This rule highlights tables which are not connected to any other table in the model with a relationship.', - ), - ('Maintenance', 'Table', 'Warning', 'Calculation groups with no calculation items', - lambda df: (df['Type'] == 'Calculation Group') & (df['Has Calculation Items']), - ), - ('Maintenance', 'Column', 'Info', 'Visible objects with no description', - lambda df: (df['Hidden'] == False) & (df['Description'].str.len() == 0), - 'Calculation groups have no function unless they have calculation items.', - ), - ('Formatting', 'Column', 'Warning', "Provide format string for 'Date' columns", - lambda df: (df['Column Name'].str.contains(r'date', case=False)) & (df['Data Type'] == 'DateTime') & (df['Format String'] != 'mm/dd/yyyy'), - 'Columns of type "DateTime" that have "Month" in their names should be formatted as "mm/dd/yyyy".', - ), - ('Formatting', 'Column', 'Warning', 'Do not summarize numeric columns', - lambda df: ((df['Data Type'] == 'Int64') | (df['Data Type'] == 'Decimal') | (df['Data Type'] == 'Double')) & (df['Summarize By'] != 'None') & ~ ((df['Hidden']) | (df['Parent Is Hidden']) ), - 'Numeric columns (integer, decimal, double) should have their SummarizeBy property set to "None" to avoid accidental summation in Power BI (create measures instead).', - ), - ('Formatting', 'Measure', 'Info', 'Provide format string for measures', - lambda df: ~ ((df['Measure Hidden']) | (df['Parent Is Hidden'])) & (df['Format String'].str.len() == 0), - 'Visible measures should have their format string property assigned.', - ), - ('Formatting', 'Column', 'Info', 'Add data category for columns', - lambda df: (df['Data Category'] == '') & ((((df['Column Name'].str.contains(r'country', case=False)) | (df['Column Name'].str.contains(r'city', case=False)) | (df['Column Name'].str.contains(r'continent', case=False))) & (df['Data Type'] == 'String')) | (((df['Column Name'].str.contains(r'latitude', case=False)) | (df['Column Name'].str.contains(r'longitude', 
case=False))) & (df['Data Type'] == 'String')) ), - 'Add Data Category property for appropriate columns.', - 'https://docs.microsoft.com/power-bi/transform-model/desktop-data-categorization', - ), - ('Formatting', 'Measure', 'Warning', 'Percentages should be formatted with thousands separators and 1 decimal', - lambda df: (df['Format String'].str.contains('%')) & (df['Format String'] != '#,0.0%;-#,0.0%;#,0.0%'), - ), - ('Formatting', 'Measure', 'Warning', 'Whole numbers should be formatted with thousands separators and no decimals', - lambda df: (~ df['Format String'].str.contains('$')) & ~ (df['Format String'].str.contains('%')) & ~ ((df['Format String'] == '#,0') | (df['Format String'] == '#,0.0')), - ), - ('Formatting', 'Column', 'Info', 'Hide foreign keys', - lambda df: (df['Foreign Key']) & (df['Hidden'] == False), - 'Foreign keys should always be hidden.', - ), - ('Formatting', 'Column', 'Info', 'Mark primary keys', - lambda df: (df['Primary Key']) & (df['Key'] == False), - "Set the 'Key' property to 'True' for primary key columns within the column properties.", - ), - ('Formatting', 'Column', 'Info', 'Month (as a string) must be sorted', - lambda df: (df['Column Name'].str.contains(r'month', case=False)) & ~ (df['Column Name'].str.contains(r'months', case=False)) & (df['Data Type'] == 'String') & (df['Sort By Column'] == ''), - 'This rule highlights month columns which are strings and are not sorted. If left unsorted, they will sort alphabetically (i.e. April, August...). Make sure to sort such columns so that they sort properly (January, February, March...).', - ), - ('Formatting', 'Relationship', 'Warning', 'Relationship columns should be of integer data type', - lambda df: (df['From Column Data Type'] != 'Int64') | (df['To Column Data Type'] != 'Int64'), - 'It is a best practice for relationship columns to be of integer data type. This applies not only to data warehousing but data modeling as well.', - ), - ('Formatting', 'Column', 'Warning', 'Provide format string for "Month" columns', - lambda df: (df['Column Name'].str.contains(r'month', case=False)) & (df['Data Type'] == 'DateTime') & (df['Format String'] != 'MMMM yyyy'), - 'Columns of type "DateTime" that have "Month" in their names should be formatted as "MMMM yyyy".', - ), - ('Formatting', 'Column', 'Info', 'Format flag columns as Yes/No value strings', - lambda df: ( df['Column Name'].str.startswith("Is") & (df['Data Type'] == "Int64") & ~ (df['Hidden'] | df['Parent Is Hidden']) ) | ( df['Column Name'].str.endswith(" Flag") & (df['Data Type'] != "String") & ~ (df['Hidden'] | df['Parent Is Hidden']) ), - 'Flags must be properly formatted as Yes/No as this is easier to read than using 0/1 integer values.', - ), - #('Formatting', ['Table', 'Column', 'Measure', 'Partition', 'Hierarchy'], 'Error', 'Objects should not start or end with a space', - # lambda df: (df['Name'].str[0] == ' ') | (df['Name'].str[-1] == ' '), - # 'Objects should not start or end with a space. 
This usually happens by accident and is difficult to find.', - #), - ('Formatting', ['Table', 'Column', 'Measure', 'Partition', 'Hierarchy'], 'Info', 'First letter of objects must be capitalized', - lambda df: df['Name'].str[0].str.upper() != df['Name'].str[0], - 'The first letter of object names should be capitalized to maintain professional quality.', - ), - ('Naming Conventions', ['Table', 'Column', 'Measure', 'Partition', 'Hierarchy'], 'Warning', 'Object names must not contain special characters', - lambda df: df['Name'].str.contains(r'[\t\r\n]'), - 'Object names should not include tabs, line breaks, etc.', - )#, - #('Error Prevention', ['Table'], 'Error', 'Avoid invalid characters in names', - # lambda df: df['Name'].str.char.iscontrol() & ~ df['Name'].str.char.isspace(), - #)#, - - ], columns=['Category', 'Scope', 'Severity', 'Rule Name', 'Expression', 'Description', 'URL']) - - df_rules['Severity'] = df_rules['Severity'].replace('Warning', '⚠️').replace('Error', '\u274C').replace('Info', 'ℹ️') - - pd.set_option('display.max_colwidth', 1000) - - return df_rules + df_rules = pd.DataFrame( + [ + ( + "Performance", + "Column", + "Warning", + "Do not use floating point data types", + lambda df: df["Data Type"] == "Double", + 'The "Double" floating point data type should be avoided, as it can result in unpredictable roundoff errors and decreased performance in certain scenarios. Use "Int64" or "Decimal" where appropriate (but note that "Decimal" is limited to 4 digits after the decimal sign).', + ), + ( + "Performance", + "Column", + "Warning", + "Avoid using calculated columns", + lambda df: df["Type"] == "Calculated", + "Calculated columns do not compress as well as data columns so they take up more memory. They also slow down processing times for both the table as well as process recalc. Offload calculated column logic to your data warehouse and turn these calculated columns into data columns.", + "https://www.elegantbi.com/post/top10bestpractices", + ), + ( + "Performance", + "Relationship", + "Warning", + "Check if bi-directional and many-to-many relationships are valid", + lambda df: (df["Multiplicity"] == "m:m") + | (df["Cross Filtering Behavior"] == "BothDirections"), + "Bi-directional and many-to-many relationships may cause performance degradation or even have unintended consequences. Make sure to check these specific relationships to ensure they are working as designed and are actually necessary.", + "https://www.sqlbi.com/articles/bidirectional-relationships-and-ambiguity-in-dax", + ), + ( + "Performance", + "Row Level Security", + "Info", + "Check if dynamic row level security (RLS) is necessary", + lambda df: df["Is Dynamic"], + "Usage of dynamic row level security (RLS) can add memory and performance overhead. Please research the pros/cons of using it.", + "https://docs.microsoft.com/power-bi/admin/service-admin-rls", + ), + ( + "Performance", + "Table", + "Warning", + "Avoid using many-to-many relationships on tables used for dynamic row level security", + lambda df: (df["Used in M2M Relationship"] == True) + & (df["Used in Dynamic RLS"] == True), + "Using many-to-many relationships on tables which use dynamic row level security can cause serious query performance degradation. This pattern's performance problems compound when snowflaking multiple many-to-many relationships against a table which contains row level security. 
Instead, use one of the patterns shown in the article below where a single dimension table relates many-to-one to a security table.", + "https://www.elegantbi.com/post/dynamicrlspatterns", + ), + ( + "Performance", + "Relationship", + "Warning", + "Many-to-many relationships should be single-direction", + lambda df: (df["Multiplicity"] == "m:m") + & (df["Cross Filtering Behavior"] == "BothDirections"), + ), + ( + "Performance", + "Column", + "Warning", + "Set IsAvailableInMdx to false on non-attribute columns", + lambda df: (df["Is Direct Lake"] == False) + & (df["Is Available in MDX"] == True) + & ((df["Hidden"] == True) | (df["Parent Is Hidden"] == True)) + & (df["Used in Sort By"] == False) + & (df["Used in Hierarchy"] == False) + & (df["Sort By Column"] == None), + "To speed up processing time and conserve memory after processing, attribute hierarchies should not be built for columns that are never used for slicing by MDX clients. In other words, all hidden columns that are not used as a Sort By Column or referenced in user hierarchies should have their IsAvailableInMdx property set to false. The IsAvailableInMdx property is not relevant for Direct Lake models.", + "https://blog.crossjoin.co.uk/2018/07/02/isavailableinmdx-ssas-tabular", + ), + # ('Performance', 'Partition', 'Warning', "Set 'Data Coverage Definition' property on the DirectQuery partition of a hybrid table", + # lambda df: (df['Data Coverage Definition Expression'].isnull()) & (df['Mode'] == 'DirectQuery') & (df['Import Partitions'] > 0) & (df['Has Date Table']), + # "Setting the 'Data Coverage Definition' property may lead to better performance because the engine knows when it can only query the import-portion of the table and when it needs to query the DirectQuery portion of the table.", + # "https://learn.microsoft.com/analysis-services/tom/table-partitions?view=asallproducts-allversions", + # ), + ( + "Performance", + "Table", + "Warning", + "Set dimensions tables to dual mode instead of import when using DirectQuery on fact tables", + lambda df: (df["Import Partitions"] == 1) + & (df["Model Has DQ"]) + & (df["Used in Relationship x:1"]), + "https://learn.microsoft.com/power-bi/transform-model/desktop-storage-mode#propagation-of-the-dual-setting", + ), + ( + "Performance", + "Partition", + "Warning", + "Minimize Power Query transformations", + lambda df: (df["Source Type"] == "M") + & ( + ('Table.Combine("' in df["Query"]) + | ('Table.Join("' in df["Query"]) + | ('Table.NestedJoin("' in df["Query"]) + | ('Table.AddColumn("' in df["Query"]) + | ('Table.Group("' in df["Query"]) + | ('Table.Sort("' in df["Query"]) + | ('Table.Sort("' in df["Query"]) + | ('Table.Pivot("' in df["Query"]) + | ('Table.Unpivot("' in df["Query"]) + | ('Table.UnpivotOtherColumns("' in df["Query"]) + | ('Table.Distinct("' in df["Query"]) + | ('[Query=(""SELECT' in df["Query"]) + | ("Value.NativeQuery" in df["Query"]) + | ("OleDb.Query" in df["Query"]) + | ("Odbc.Query" in df["Query"]) + ), + "Minimize Power Query transformations in order to improve model processing performance. It is a best practice to offload these transformations to the data warehouse if possible. Also, please check whether query folding is occurring within your model. 
Please reference the article below for more information on query folding.", + "https://docs.microsoft.com/power-query/power-query-folding", + ), + ( + "Performance", + "Table", + "Warning", + "Consider a star-schema instead of a snowflake architecture", + lambda df: (df["Type"] != "Calculation Group") + & df["Used in Relationship Both Sides"], + "Generally speaking, a star-schema is the optimal architecture for tabular models. That being the case, there are valid cases to use a snowflake approach. Please check your model and consider moving to a star-schema architecture.", + "https://docs.microsoft.com/power-bi/guidance/star-schema", + ), + ( + "Performance", + "Table", + "Warning", + "Reduce usage of calculated tables", + lambda df: df["Type"] == "Calculated Table", + "Migrate calculated table logic to your data warehouse. Reliance on calculated tables will lead to technical debt and potential misalignments if you have multiple models on your platform.", + ), + ( + "Performance", + "Column", + "Warning", + "Reduce usage of calculated columns that use the RELATED function", + lambda df: (df["Type"] == "Calculated") + & (df["Source"].str.contains(r"related\s*\(", case=False)), + "Calculated columns do not compress as well as data columns and may cause longer processing times. As such, calculated columns should be avoided if possible. One scenario where they may be easier to avoid is if they use the RELATED function.", + "https://www.sqlbi.com/articles/storage-differences-between-calculated-columns-and-calculated-tables", + ), + ( + "Performance", + "Model", + "Warning", + "Avoid excessive bi-directional or many-to-many relationships", + lambda df: ( + df["M2M or BiDi Relationship Count"] / df["Relationship Count"] + ) + > 0.3, + "Limit use of b-di and many-to-many relationships. This rule flags the model if more than 30% of relationships are bi-di or many-to-many.", + "https://www.sqlbi.com/articles/bidirectional-relationships-and-ambiguity-in-dax", + ), + ( + "Performance", + "Column", + "Warning", + "Avoid bi-directional or many-to-many relationships against high-cardinality columns", + lambda df: df["Used in M2M/BiDi Relationship"] + & df["Column Cardinality"] + > 100000, + "For best performance, it is recommended to avoid using bi-directional relationships against high-cardinality columns", + ), + ( + "Performance", + "Table", + "Warning", + "Remove auto-date table", + lambda df: (df["Type"] == "Calculated Table") + & ( + (df["Name"].str.startswith("DateTableTemplate_")) + | (df["Name"].str.startswith("LocalDateTable_")) + ), + "Avoid using auto-date tables. Make sure to turn off auto-date table in the settings in Power BI Desktop. This will save memory resources.", + "https://www.youtube.com/watch?v=xu3uDEHtCrg", + ), + ( + "Performance", + "Table", + "Warning", + "Date/calendar tables should be marked as a date table", + lambda df: ( + (df["Name"].str.contains(r"date", case=False)) + | (df["Name"].str.contains(r"calendar", case=False)) + ) + & (df["Data Category"] != "Time"), + "This rule looks for tables that contain the words 'date' or 'calendar' as they should likely be marked as a date table.", + "https://docs.microsoft.com/power-bi/transform-model/desktop-date-tables", + ), + ( + "Performance", + "Table", + "Warning", + "Large tables should be partitioned", + lambda df: (df["Is Direct Lake"] == False) + & (df["Partition Count"] == 1) + & (df["Row Count"] > 25000000), + "Large tables should be partitioned in order to optimize processing. 
This is not relevant for semantic models in Direct Lake mode as they can only have one partition per table.",
+    ),
+    (
+        "Performance",
+        "Row Level Security",
+        "Warning",
+        "Limit row level security (RLS) logic",
+        lambda df: df["Filter Expression"].str.contains(
+            "|".join(["right", "left", "filter", "upper", "lower", "find"]),
+            case=False,
+        ),
+        "Try to simplify the DAX used for row level security. Usage of the functions within this rule can likely be offloaded to the upstream systems (data warehouse).",
+    ),
+    (
+        "Performance",
+        "Model",
+        "Warning",
+        "Model should have a date table",
+        lambda df: df["Has Date Table"] == False,
+        "Generally speaking, models should have a date table. Models without a date table are typically not taking advantage of features such as time intelligence, or may not have a properly structured architecture.",
+    ),
+    (
+        "Performance",
+        "Measure",
+        "Warning",
+        "Measures using time intelligence and model is using Direct Query",
+        lambda df: df["DQ Date Function Used"],
+        "At present, time intelligence functions are known not to perform as well when using Direct Query. If you are having performance issues, you may want to try alternative solutions such as adding columns in the fact table that show previous year or previous month data.",
+    ),
+    (
+        "Error Prevention",
+        "Calculation Item",
+        "Error",
+        "Calculation items must have an expression",
+        lambda df: df["Expression"].str.len() == 0,
+        "Calculation items must have an expression. Without an expression, they will not show any values.",
+    ),
+    (
+        "Error Prevention",
+        ["Table", "Column", "Measure", "Hierarchy", "Partition"],
+        "Error",
+        "Avoid invalid characters in names",
+        lambda df: df["Name"].apply(
+            lambda x: any(
+                unicodedata.category(char) == "Cc" and not char.isspace()
+                for char in x
+            )
+        ),
+        "This rule identifies any object name in your model (i.e. table/column/measure) which contains an invalid character. Invalid characters will cause an error when deploying the model (and failure to deploy). This rule has a fix expression which converts the invalid character into a space, resolving the issue.",
+    ),
+    (
+        "Error Prevention",
+        ["Table", "Column", "Measure", "Hierarchy"],
+        "Error",
+        "Avoid invalid characters in descriptions",
+        lambda df: df["Description"].apply(
+            lambda x: any(
+                unicodedata.category(char) == "Cc" and not char.isspace()
+                for char in x
+            )
+        ),
+        "This rule identifies any object description in your model (i.e. table/column/measure) which contains an invalid character. Invalid characters will cause an error when deploying the model (and failure to deploy). This rule has a fix expression which converts the invalid character into a space, resolving the issue.",
+    ),
+    (
+        "Error Prevention",
+        "Relationship",
+        "Warning",
+        "Relationship columns should be of the same data type",
+        lambda df: df["From Column Data Type"] != df["To Column Data Type"],
+        "Columns used in a relationship should be of the same data type. Ideally, they will be of integer data type (see the related rule '[Formatting] Relationship columns should be of integer data type'). Having columns within a relationship which are of different data types may lead to various issues.",
+    ),
+    (
+        "Error Prevention",
+        "Column",
+        "Error",
+        "Data columns must have a source column",
+        lambda df: (df["Type"] == "Data") & (df["Source"].str.len() == 0),
+        "Data columns must have a source column. 
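# Illustrative sketch only (hypothetical object name): the invalid-character rules above
# flag any character in Unicode category "Cc" (control characters) that is not whitespace.
import unicodedata

def has_invalid_char(name: str) -> bool:
    return any(unicodedata.category(ch) == "Cc" and not ch.isspace() for ch in name)

print(has_invalid_char("Sales Amount"))     # False
print(has_invalid_char("Sales\x00Amount"))  # True -> rule violation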
A data column without a source column will cause an error when processing the model.",
+    ),
+    (
+        "Error Prevention",
+        "Column",
+        "Warning",
+        "Set IsAvailableInMdx to true on necessary columns",
+        lambda df: (df["Is Direct Lake"] == False)
+        & (df["Is Available in MDX"] == False)
+        & (
+            (df["Used in Sort By"] == True)
+            | (df["Used in Hierarchy"] == True)
+            | (df["Sort By Column"] != None)
+        ),
+        "In order to avoid errors, ensure that attribute hierarchies are enabled if a column is used for sorting another column, used in a hierarchy, used in variations, or is sorted by another column. The IsAvailableInMdx property is not relevant for Direct Lake models.",
+    ),
+    (
+        "Error Prevention",
+        "Table",
+        "Error",
+        "Avoid the USERELATIONSHIP function and RLS against the same table",
+        lambda df: (df["USERELATIONSHIP Used"] == True)
+        & (df["Used in RLS"] == True),
+        "The USERELATIONSHIP function may not be used against a table which also leverages row-level security (RLS). This will generate an error when using the particular measure in a visual. This rule will highlight the table which is used in a measure's USERELATIONSHIP function as well as RLS.",
+        "https://blog.crossjoin.co.uk/2013/05/10/userelationship-and-tabular-row-security",
+    ),
+    (
+        "DAX Expressions",
+        "Measure",
+        "Warning",
+        "Avoid using the IFERROR function",
+        lambda df: df["Measure Expression"].str.contains(
+            r"iferror\s*\(", case=False
+        ),
+        "Avoid using the IFERROR function as it may cause performance degradation. If you are concerned about a divide-by-zero error, use the DIVIDE function as it naturally resolves such errors as blank (or you can customize what should be shown in case of such an error).",
+        "https://www.elegantbi.com/post/top10bestpractices",
+    ),
+    (
+        "DAX Expressions",
+        "Measure",
+        "Warning",
+        "Use the TREATAS function instead of INTERSECT for virtual relationships",
+        lambda df: df["Measure Expression"].str.contains(
+            r"intersect\s*\(", case=False
+        ),
+        "The TREATAS function is more efficient and provides better performance than the INTERSECT function when used in virtual relationships.",
+        "https://www.sqlbi.com/articles/propagate-filters-using-treatas-in-dax",
+    ),
+    (
+        "DAX Expressions",
+        "Measure",
+        "Warning",
+        "The EVALUATEANDLOG function should not be used in production models",
+        lambda df: df["Measure Expression"].str.contains(
+            r"evaluateandlog\s*\(", case=False
+        ),
+        "The EVALUATEANDLOG function is meant to be used only in development/test environments and should not be used in production models.",
+        "https://pbidax.wordpress.com/2022/08/16/introduce-the-dax-evaluateandlog-function",
+    ),
+    (
+        "DAX Expressions",
+        "Measure",
+        "Warning",
+        "Measures should not be direct references of other measures",
+        lambda df: df["Measure Expression"]
+        .str.strip()
+        .isin(df["Measure Object"]),
+        "This rule identifies measures which are simply a reference to another measure. As an example, consider a model with two measures: [MeasureA] and [MeasureB]. This rule would be triggered for MeasureB if MeasureB's DAX was MeasureB:=[MeasureA]. 
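# Illustrative sketch only (hypothetical measures): the DAX-pattern rules above rely on a
# case-insensitive regex with pandas str.contains; any measure whose expression calls
# IFERROR is returned as a violation.
import pandas as pd

dfM_example = pd.DataFrame(
    {
        "Measure Name": ["Margin %", "Safe Margin %"],
        "Measure Expression": [
            "DIVIDE ( [Profit], [Sales] )",
            "IFERROR ( [Profit] / [Sales], BLANK () )",
        ],
    }
)
iferror_rule = lambda df: df["Measure Expression"].str.contains(r"iferror\s*\(", case=False)
print(dfM_example[iferror_rule(dfM_example)]["Measure Name"].tolist())  # ['Safe Margin %']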
Such duplicative measures should be removed.", + ), + ( + "DAX Expressions", + "Measure", + "Warning", + "No two measures should have the same definition", + lambda df: df["Measure Expression"] + .apply(lambda x: re.sub(r"\s+", "", x)) + .duplicated(keep=False), + "Two measures with different names and defined by the same DAX expression should be avoided to reduce redundancy.", + ), + ( + "DAX Expressions", + "Measure", + "Warning", + "Avoid addition or subtraction of constant values to results of divisions", + lambda df: df["Measure Expression"].str.contains( + "(?i)DIVIDE\\s*\\((\\s*.*?)\\)\\s*[+-]\\s*1" + or "\\/\\s*.*(?=[-+]\\s*1)", + regex=True, + ), + ), + ( + "DAX Expressions", + "Measure", + "Warning", + "Avoid using '1-(x/y)' syntax", + lambda df: df["Measure Expression"].str.contains( + "[0-9]+\\s*[-+]\\s*[\\(]*\\s*(?i)SUM\\s*\\(\\s*'*[A-Za-z0-9 _]+'*\\s*\\[[A-Za-z0-9 _]+\\]\\s*\\)\\s*\\/" + or "[0-9]+\\s*[-+]\\s*(?i)DIVIDE\\s*\\(", + regex=True, + ), + "Instead of using the '1-(x/y)' or '1+(x/y)' syntax to achieve a percentage calculation, use the basic DAX functions (as shown below). Using the improved syntax will generally improve the performance. The '1+/-...' syntax always returns a value whereas the solution without the '1+/-...' does not (as the value may be 'blank'). Therefore the '1+/-...' syntax may return more rows/columns which may result in a slower query speed. Let's clarify with an example: Avoid this: 1 - SUM ( 'Sales'[CostAmount] ) / SUM( 'Sales'[SalesAmount] ) Better: DIVIDE ( SUM ( 'Sales'[SalesAmount] ) - SUM ( 'Sales'[CostAmount] ), SUM ( 'Sales'[SalesAmount] ) ) Best: VAR x = SUM ( 'Sales'[SalesAmount] ) RETURN DIVIDE ( x - SUM ( 'Sales'[CostAmount] ), x )", + ), + ( + "DAX Expressions", + "Measure", + "Warning", + "Filter measure values by columns, not tables", + lambda df: df["Measure Expression"].str.contains( + "(?i)CALCULATE\\s*\\(\\s*[^,]+,\\s*(?i)FILTER\\s*\\(\\s*'*[A-Za-z0-9 _]+'*\\s*,\\s*\\[[^\\]]+\\]" + or "(?i)CALCULATETABLE\\s*\\([^,]*,\\s*(?i)FILTER\\s*\\(\\s*'*[A-Za-z0-9 _]+'*\\s*,\\s*\\[", + regex=True, + ), + "Instead of using this pattern FILTER('Table',[Measure]>Value) for the filter parameters of a CALCULATE or CALCULATETABLE function, use one of the options below (if possible). Filtering on a specific column will produce a smaller table for the engine to process, thereby enabling faster performance. Using the VALUES function or the ALL function depends on the desired measure result.\nOption 1: FILTER(VALUES('Table'[Column]),[Measure] > Value)\nOption 2: FILTER(ALL('Table'[Column]),[Measure] > Value)", + "https://docs.microsoft.com/power-bi/guidance/dax-avoid-avoid-filter-as-filter-argument", + ), + ( + "DAX Expressions", + "Measure", + "Warning", + "Filter column values with proper syntax", + lambda df: df["Measure Expression"].str.contains( + "(?i)CALCULATE\\s*\\(\\s*[^,]+,\\s*(?i)FILTER\\s*\\(\\s*'*[A-Za-z0-9 _]+'*\\s*,\\s*'*[A-Za-z0-9 _]+'*\\[[A-Za-z0-9 _]+\\]" + or "(?i)CALCULATETABLE\\s*\\([^,]*,\\s*(?i)FILTER\\s*\\(\\s*'*[A-Za-z0-9 _]+'*\\s*,\\s*'*[A-Za-z0-9 _]+'*\\[[A-Za-z0-9 _]+\\]", + regex=True, + ), + "Instead of using this pattern FILTER('Table','Table'[Column]=\"Value\") for the filter parameters of a CALCULATE or CALCULATETABLE function, use one of the options below. 
As far as whether to use the KEEPFILTERS function, see the second reference link below.\nOption 1: KEEPFILTERS('Table'[Column]=\"Value\")\nOption 2: 'Table'[Column]=\"Value\"", + "https://docs.microsoft.com/power-bi/guidance/dax-avoid-avoid-filter-as-filter-argument Reference: https://www.sqlbi.com/articles/using-keepfilters-in-dax", + ), + ( + "DAX Expressions", + "Measure", + "Warning", + "Use the DIVIDE function for division", + lambda df: df["Measure Expression"].str.contains( + '\\]\\s*\\/(?!\\/)(?!\\*)" or "\\)\\s*\\/(?!\\/)(?!\\*)', regex=True + ), + 'Use the DIVIDE function instead of using "/". The DIVIDE function resolves divide-by-zero cases. As such, it is recommended to use to avoid errors.', + "https://docs.microsoft.com/power-bi/guidance/dax-divide-function-operator", + ), + ( + "DAX Expressions", + "Measure", + "Error", + "Column references should be fully qualified", + lambda df: df["Has Unqualified Column Reference"], + "Using fully qualified column references makes it easier to distinguish between column and measure references, and also helps avoid certain errors. When referencing a column in DAX, first specify the table name, then specify the column name in square brackets.", + "https://www.elegantbi.com/post/top10bestpractices", + ), + ( + "DAX Expressions", + "Measure", + "Error", + "Measure references should be unqualified", + lambda df: df["Has Fully Qualified Measure Reference"], + "Using unqualified measure references makes it easier to distinguish between column and measure references, and also helps avoid certain errors. When referencing a measure using DAX, do not specify the table name. Use only the measure name in square brackets.", + "https://www.elegantbi.com/post/top10bestpractices", + ), + ( + "DAX Expressions", + "Relationship", + "Warning", + "Inactive relationships that are never activated", + lambda df: df["Inactive without USERELATIONSHIP"], + "Inactive relationships are activated using the USERELATIONSHIP function. If an inactive relationship is not referenced in any measure via this function, the relationship will not be used. 
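# Illustrative sketch only (hypothetical table/column names): run_model_bpa later builds a
# regex per inactive relationship to check whether any measure activates it with
# USERELATIONSHIP, mirroring the rule described above.
import re

from_table, from_column = "Sales", "OrderDateKey"
to_table, to_column = "Date", "DateKey"
pattern = (
    r"(?i)USERELATIONSHIP\s*\(\s*'*" + from_table + r"'*\[" + from_column
    + r"\]\s*,\s*'*" + to_table + r"'*\[" + to_column + r"\]"
)
dax = "CALCULATE ( [Sales Amount], USERELATIONSHIP ( Sales[OrderDateKey], 'Date'[DateKey] ) )"
print(bool(re.search(pattern, dax)))  # True -> the inactive relationship is activated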
Either remove the relationship or ensure it is activated in a measure via the USERELATIONSHIP function.",
+        "https://dax.guide/userelationship",
+    ),
+    (
+        "Maintenance",
+        "Column",
+        "Warning",
+        "Remove unnecessary columns",
+        lambda df: (df["Hidden"] | df["Parent Is Hidden"])
+        & ~df["Used in Relationship"]
+        & ~df["Used in Sort By"]
+        & ~df["Used in Hierarchy"]
+        & (df["Referenced By"] == 0)
+        & ~(df["Used in RLS"]),  # usedInOLS
+        "Hidden columns that are not referenced by any DAX expressions, relationships, hierarchy levels or Sort By properties should be removed.",
+    ),
+    (
+        "Maintenance",
+        "Measure",
+        "Warning",
+        "Remove unnecessary measures",
+        lambda df: df["Measure Hidden"] & (df["Referenced By"] == 0),
+        "Hidden measures that are not referenced by any DAX expressions should be removed for maintainability.",
+    ),
+    # ('Maintenance', 'Role', 'Warning', 'Remove roles with no members',
+    #  lambda df: df['Member Count'] == 0,
+    # ),
+    (
+        "Maintenance",
+        "Table",
+        "Warning",
+        "Ensure tables have relationships",
+        lambda df: (df["Used in Relationship"] == False)
+        & (df["Type"] != "Calculation Group"),
+        "This rule highlights tables which are not connected to any other table in the model via a relationship.",
+    ),
+    (
+        "Maintenance",
+        "Table",
+        "Warning",
+        "Calculation groups with no calculation items",
+        lambda df: (df["Type"] == "Calculation Group")
+        & ~(df["Has Calculation Items"]),
+        "Calculation groups have no function unless they have calculation items.",
+    ),
+    (
+        "Maintenance",
+        "Column",
+        "Info",
+        "Visible objects with no description",
+        lambda df: (df["Hidden"] == False) & (df["Description"].str.len() == 0),
+        "Add descriptions to visible objects. Descriptions appear as tooltips in the field list in Power BI Desktop and help users understand the purpose of each object.",
+    ),
+    (
+        "Formatting",
+        "Column",
+        "Warning",
+        "Provide format string for 'Date' columns",
+        lambda df: (df["Column Name"].str.contains(r"date", case=False))
+        & (df["Data Type"] == "DateTime")
+        & (df["Format String"] != "mm/dd/yyyy"),
+        'Columns of type "DateTime" that have "Date" in their names should be formatted as "mm/dd/yyyy".',
+    ),
+    (
+        "Formatting",
+        "Column",
+        "Warning",
+        "Do not summarize numeric columns",
+        lambda df: (
+            (df["Data Type"] == "Int64")
+            | (df["Data Type"] == "Decimal")
+            | (df["Data Type"] == "Double")
+        )
+        & (df["Summarize By"] != "None")
+        & ~((df["Hidden"]) | (df["Parent Is Hidden"])),
+        'Numeric columns (integer, decimal, double) should have their SummarizeBy property set to "None" to avoid accidental summation in Power BI (create measures instead).',
+    ),
+    (
+        "Formatting",
+        "Measure",
+        "Info",
+        "Provide format string for measures",
+        lambda df: ~((df["Measure Hidden"]) | (df["Parent Is Hidden"]))
+        & (df["Format String"].str.len() == 0),
+        "Visible measures should have their format string property assigned.",
+    ),
+    (
+        "Formatting",
+        "Column",
+        "Info",
+        "Add data category for columns",
+        lambda df: (df["Data Category"] == "")
+        & (
+            (
+                (
+                    (df["Column Name"].str.contains(r"country", case=False))
+                    | (df["Column Name"].str.contains(r"city", case=False))
+                    | (df["Column Name"].str.contains(r"continent", case=False))
+                )
+                & (df["Data Type"] == "String")
+            )
+            | (
+                (
+                    (df["Column Name"].str.contains(r"latitude", case=False))
+                    | (df["Column Name"].str.contains(r"longitude", case=False))
+                )
+                & (df["Data Type"] == "String")
+            )
+        ),
+        "Add Data Category property for appropriate columns.",
+        "https://docs.microsoft.com/power-bi/transform-model/desktop-data-categorization",
+    ),
+    (
+        "Formatting",
+        "Measure",
+        "Warning",
+        "Percentages should be formatted with thousands separators and 1 
decimal", + lambda df: (df["Format String"].str.contains("%")) + & (df["Format String"] != "#,0.0%;-#,0.0%;#,0.0%"), + ), + ( + "Formatting", + "Measure", + "Warning", + "Whole numbers should be formatted with thousands separators and no decimals", + lambda df: (~df["Format String"].str.contains("$")) + & ~(df["Format String"].str.contains("%")) + & ~((df["Format String"] == "#,0") | (df["Format String"] == "#,0.0")), + ), + ( + "Formatting", + "Column", + "Info", + "Hide foreign keys", + lambda df: (df["Foreign Key"]) & (df["Hidden"] == False), + "Foreign keys should always be hidden.", + ), + ( + "Formatting", + "Column", + "Info", + "Mark primary keys", + lambda df: (df["Primary Key"]) & (df["Key"] == False), + "Set the 'Key' property to 'True' for primary key columns within the column properties.", + ), + ( + "Formatting", + "Column", + "Info", + "Month (as a string) must be sorted", + lambda df: (df["Column Name"].str.contains(r"month", case=False)) + & ~(df["Column Name"].str.contains(r"months", case=False)) + & (df["Data Type"] == "String") + & (df["Sort By Column"] == ""), + "This rule highlights month columns which are strings and are not sorted. If left unsorted, they will sort alphabetically (i.e. April, August...). Make sure to sort such columns so that they sort properly (January, February, March...).", + ), + ( + "Formatting", + "Relationship", + "Warning", + "Relationship columns should be of integer data type", + lambda df: (df["From Column Data Type"] != "Int64") + | (df["To Column Data Type"] != "Int64"), + "It is a best practice for relationship columns to be of integer data type. This applies not only to data warehousing but data modeling as well.", + ), + ( + "Formatting", + "Column", + "Warning", + 'Provide format string for "Month" columns', + lambda df: (df["Column Name"].str.contains(r"month", case=False)) + & (df["Data Type"] == "DateTime") + & (df["Format String"] != "MMMM yyyy"), + 'Columns of type "DateTime" that have "Month" in their names should be formatted as "MMMM yyyy".', + ), + ( + "Formatting", + "Column", + "Info", + "Format flag columns as Yes/No value strings", + lambda df: ( + df["Column Name"].str.startswith("Is") + & (df["Data Type"] == "Int64") + & ~(df["Hidden"] | df["Parent Is Hidden"]) + ) + | ( + df["Column Name"].str.endswith(" Flag") + & (df["Data Type"] != "String") + & ~(df["Hidden"] | df["Parent Is Hidden"]) + ), + "Flags must be properly formatted as Yes/No as this is easier to read than using 0/1 integer values.", + ), + # ('Formatting', ['Table', 'Column', 'Measure', 'Partition', 'Hierarchy'], 'Error', 'Objects should not start or end with a space', + # lambda df: (df['Name'].str[0] == ' ') | (df['Name'].str[-1] == ' '), + # 'Objects should not start or end with a space. 
This usually happens by accident and is difficult to find.', + # ), + ( + "Formatting", + ["Table", "Column", "Measure", "Partition", "Hierarchy"], + "Info", + "First letter of objects must be capitalized", + lambda df: df["Name"].str[0].str.upper() != df["Name"].str[0], + "The first letter of object names should be capitalized to maintain professional quality.", + ), + ( + "Naming Conventions", + ["Table", "Column", "Measure", "Partition", "Hierarchy"], + "Warning", + "Object names must not contain special characters", + lambda df: df["Name"].str.contains(r"[\t\r\n]"), + "Object names should not include tabs, line breaks, etc.", + ), # , + # ('Error Prevention', ['Table'], 'Error', 'Avoid invalid characters in names', + # lambda df: df['Name'].str.char.iscontrol() & ~ df['Name'].str.char.isspace(), + # )#, + ], + columns=[ + "Category", + "Scope", + "Severity", + "Rule Name", + "Expression", + "Description", + "URL", + ], + ) -@log -def run_model_bpa(dataset: str, rules_dataframe: Optional[pd.DataFrame] = None, workspace: Optional[str] = None, export: Optional[bool] = False, return_dataframe: Optional[bool] = False, **kwargs): + df_rules["Severity"] = ( + df_rules["Severity"] + .replace("Warning", "⚠️") + .replace("Error", "\u274C") + .replace("Info", "ℹ️") + ) + + pd.set_option("display.max_colwidth", 1000) + return df_rules + + +@log +def run_model_bpa( + dataset: str, + rules_dataframe: Optional[pd.DataFrame] = None, + workspace: Optional[str] = None, + export: Optional[bool] = False, + return_dataframe: Optional[bool] = False, + **kwargs, +): """ Displays an HTML visualization of the results of the Best Practice Analyzer scan for a semantic model. @@ -326,18 +726,23 @@ def run_model_bpa(dataset: str, rules_dataframe: Optional[pd.DataFrame] = None, If True, exports the resulting dataframe to a delta table in the lakehouse attached to the notebook. return_dataframe : bool, default=False If True, returns a pandas dataframe instead of the visualization. - + Returns ------- pandas.DataFrame A pandas dataframe in HTML format showing semantic model objects which violated the best practice analyzer rules. """ - if 'extend' in kwargs: - print("The 'extend' parameter has been deprecated. Please remove this parameter from the function going forward.") - del kwargs['extend'] + if "extend" in kwargs: + print( + "The 'extend' parameter has been deprecated. Please remove this parameter from the function going forward." 
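# Illustrative usage only (hypothetical dataset/workspace names and rule): the built-in
# rules returned by model_bpa_rules() can be extended and passed back to run_model_bpa
# via the rules_dataframe parameter.
import pandas as pd
from sempy_labs.ModelBPA import model_bpa_rules, run_model_bpa

rules = model_bpa_rules()
custom_rule = pd.DataFrame(
    [
        {
            "Category": "Naming Conventions",
            "Scope": "Measure",
            "Severity": "⚠️",
            "Rule Name": "Measure names should not start with a digit",
            "Expression": lambda df: df["Name"].str.match(r"^\d"),
            "Description": "Start measure names with a letter for readability.",
            "URL": None,
        }
    ]
)
rules = pd.concat([rules, custom_rule], ignore_index=True)
run_model_bpa(dataset="Sales Model", workspace="My Workspace", rules_dataframe=rules)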
+ ) + del kwargs["extend"] - warnings.filterwarnings("ignore", message="This pattern is interpreted as a regular expression, and has match groups.") + warnings.filterwarnings( + "ignore", + message="This pattern is interpreted as a regular expression, and has match groups.", + ) if workspace == None: workspace_id = fabric.get_workspace_id() @@ -346,194 +751,340 @@ def run_model_bpa(dataset: str, rules_dataframe: Optional[pd.DataFrame] = None, if rules_dataframe is None: rules_dataframe = model_bpa_rules() - dfT = fabric.list_tables(dataset = dataset, workspace = workspace, extended=True) + dfT = fabric.list_tables(dataset=dataset, workspace=workspace, extended=True) dfT = dfT.drop_duplicates() - dfC = fabric.list_columns(dataset = dataset, workspace = workspace, extended=True, additional_xmla_properties=['Parent.DataCategory', 'Parent.IsHidden']) - dfC = dfC[~dfC['Column Name'].str.startswith('RowNumber-')] - - dfM = fabric.list_measures(dataset = dataset, workspace = workspace, additional_xmla_properties=['Parent.IsHidden']) - dfR = fabric.list_relationships(dataset = dataset, workspace = workspace, additional_xmla_properties=['FromCardinality', 'ToCardinality']) - dfP = fabric.list_partitions(dataset = dataset, workspace = workspace, additional_xmla_properties=['DataCoverageDefinition.Expression']) - dfH = fabric.list_hierarchies(dataset = dataset, workspace = workspace) - dfRole = fabric.get_roles(dataset = dataset, workspace = workspace) - dfRM = fabric.get_roles(dataset = dataset, workspace = workspace, include_members=True) - dfRLS = fabric.get_row_level_security_permissions(dataset = dataset, workspace = workspace) - #dfTr = fabric.list_translations(dataset = datasetName, workspace = workspaceName) - #dfE = fabric.list_expressions(dataset = datasetName, workspace = workspaceName) - dfCI = fabric.list_calculation_items(dataset = dataset, workspace = workspace) - #dfDS = fabric.list_datasources(dataset = datasetName, workspace = workspaceName) - #dfPersp = fabric.list_perspectives(dataset = datasetName, workspace = workspaceName) - dfD = fabric.list_datasets(mode = 'rest', workspace = workspace) - dfD = dfD[dfD['Dataset Name'] == dataset] - #datasetOwner = dfD['Configured By'].iloc[0] + dfC = fabric.list_columns( + dataset=dataset, + workspace=workspace, + extended=True, + additional_xmla_properties=["Parent.DataCategory", "Parent.IsHidden"], + ) + dfC = dfC[~dfC["Column Name"].str.startswith("RowNumber-")] + + dfM = fabric.list_measures( + dataset=dataset, + workspace=workspace, + additional_xmla_properties=["Parent.IsHidden"], + ) + dfR = fabric.list_relationships( + dataset=dataset, + workspace=workspace, + additional_xmla_properties=["FromCardinality", "ToCardinality"], + ) + dfP = fabric.list_partitions( + dataset=dataset, + workspace=workspace, + additional_xmla_properties=["DataCoverageDefinition.Expression"], + ) + dfH = fabric.list_hierarchies(dataset=dataset, workspace=workspace) + dfRole = fabric.get_roles(dataset=dataset, workspace=workspace) + dfRM = fabric.get_roles(dataset=dataset, workspace=workspace, include_members=True) + dfRLS = fabric.get_row_level_security_permissions( + dataset=dataset, workspace=workspace + ) + # dfTr = fabric.list_translations(dataset = datasetName, workspace = workspaceName) + # dfE = fabric.list_expressions(dataset = datasetName, workspace = workspaceName) + dfCI = fabric.list_calculation_items(dataset=dataset, workspace=workspace) + # dfDS = fabric.list_datasources(dataset = datasetName, workspace = workspaceName) + # dfPersp = 
fabric.list_perspectives(dataset = datasetName, workspace = workspaceName) + dfD = fabric.list_datasets(mode="rest", workspace=workspace) + dfD = dfD[dfD["Dataset Name"] == dataset] + # datasetOwner = dfD['Configured By'].iloc[0] md = get_measure_dependencies(dataset, workspace) - isDirectLake = any(r['Mode'] == 'DirectLake' for i, r in dfP.iterrows()) - dfC['Is Direct Lake'] = isDirectLake - dfT['Is Direct Lake'] = isDirectLake + isDirectLake = any(r["Mode"] == "DirectLake" for i, r in dfP.iterrows()) + dfC["Is Direct Lake"] = isDirectLake + dfT["Is Direct Lake"] = isDirectLake - cols = ['From Cardinality', 'To Cardinality'] + cols = ["From Cardinality", "To Cardinality"] for col in cols: if not col in dfR: dfR[col] = None - cols = ['Parent Is Hidden'] + cols = ["Parent Is Hidden"] for col in cols: if not col in dfM: dfM[col] = None - + # Data Coverage Definition rule - dfP_imp = dfP[dfP['Mode'] == 'Import'] - dfTP = dfP_imp.groupby('Table Name')['Partition Name'].count().reset_index() - dfTP.rename(columns={'Partition Name': 'Import Partitions'}, inplace=True) - dfP = pd.merge(dfP, dfTP[['Table Name', 'Import Partitions']], on = 'Table Name', how = 'left') - dfP['Import Partitions'].fillna(0, inplace=True) - dfC_DateKey = dfC[(dfC['Parent Data Category'] == 'Time') & (dfC['Data Type'] == 'DateTime') & (dfC['Key'])] + dfP_imp = dfP[dfP["Mode"] == "Import"] + dfTP = dfP_imp.groupby("Table Name")["Partition Name"].count().reset_index() + dfTP.rename(columns={"Partition Name": "Import Partitions"}, inplace=True) + dfP = pd.merge( + dfP, dfTP[["Table Name", "Import Partitions"]], on="Table Name", how="left" + ) + dfP["Import Partitions"].fillna(0, inplace=True) + dfC_DateKey = dfC[ + (dfC["Parent Data Category"] == "Time") + & (dfC["Data Type"] == "DateTime") + & (dfC["Key"]) + ] hasDateTable = False if len(dfC_DateKey) > 0: hasDateTable = True - dfP['Has Date Table'] = hasDateTable + dfP["Has Date Table"] = hasDateTable # Set dims to dual mode - dfR_one = dfR[dfR['To Cardinality'] == 'One'] - dfTP = dfP_imp.groupby('Table Name')['Partition Name'].count().reset_index() - dfTP.rename(columns={'Partition Name': 'Import Partitions'}, inplace=True) - dfT = pd.merge(dfT, dfTP, left_on = 'Name', right_on = 'Table Name', how='left') - dfT.drop(columns=['Table Name'], inplace=True) - dfT['Import Partitions'].fillna(0, inplace=True) - hasDQ = any(r['Mode'] == 'DirectQuery' for i, r in dfP.iterrows()) - dfT['Model Has DQ'] = hasDQ - dfT['Used in Relationship x:1'] = dfT['Name'].isin(dfR_one['To Table']) + dfR_one = dfR[dfR["To Cardinality"] == "One"] + dfTP = dfP_imp.groupby("Table Name")["Partition Name"].count().reset_index() + dfTP.rename(columns={"Partition Name": "Import Partitions"}, inplace=True) + dfT = pd.merge(dfT, dfTP, left_on="Name", right_on="Table Name", how="left") + dfT.drop(columns=["Table Name"], inplace=True) + dfT["Import Partitions"].fillna(0, inplace=True) + hasDQ = any(r["Mode"] == "DirectQuery" for i, r in dfP.iterrows()) + dfT["Model Has DQ"] = hasDQ + dfT["Used in Relationship x:1"] = dfT["Name"].isin(dfR_one["To Table"]) dfF = fabric.evaluate_dax( - dataset = dataset, workspace = workspace, dax_string = - """ + dataset=dataset, + workspace=workspace, + dax_string=""" SELECT [FUNCTION_NAME] FROM $SYSTEM.MDSCHEMA_FUNCTIONS WHERE [INTERFACE_NAME] = 'DATETIME' - """) + """, + ) - dfC['Name'] = dfC['Column Name'] - dfH['Name'] = dfH['Hierarchy Name'] - dfM['Name'] = dfM['Measure Name'] - dfP['Name'] = dfP['Partition Name'] - dfRole['Name'] = dfRole['Role'] - dfD['Name'] = 
dfD['Dataset Name'] - dfH['Description'] = dfH['Hierarchy Description'] - dfM['Description'] = dfM['Measure Description'] - dfH['Hierarchy Object'] = format_dax_object_name(dfH['Table Name'], dfH['Hierarchy Name']) + dfC["Name"] = dfC["Column Name"] + dfH["Name"] = dfH["Hierarchy Name"] + dfM["Name"] = dfM["Measure Name"] + dfP["Name"] = dfP["Partition Name"] + dfRole["Name"] = dfRole["Role"] + dfD["Name"] = dfD["Dataset Name"] + dfH["Description"] = dfH["Hierarchy Description"] + dfM["Description"] = dfM["Measure Description"] + dfH["Hierarchy Object"] = format_dax_object_name( + dfH["Table Name"], dfH["Hierarchy Name"] + ) - dfCI['Calculation Object'] = format_dax_object_name(dfCI['Calculation Group Name'], dfCI['Calculation Item Name']) + dfCI["Calculation Object"] = format_dax_object_name( + dfCI["Calculation Group Name"], dfCI["Calculation Item Name"] + ) - dfRole['Member Count'] = dfRM['Role'].isin(dfRole['Role']).sum() - dfRLS['Is Dynamic'] = dfRLS['Filter Expression'].str.contains(r'userprincipalname\s*\(', case=False) | dfRLS['Filter Expression'].str.contains(r'username\s*\(', case=False) + dfRole["Member Count"] = dfRM["Role"].isin(dfRole["Role"]).sum() + dfRLS["Is Dynamic"] = dfRLS["Filter Expression"].str.contains( + r"userprincipalname\s*\(", case=False + ) | dfRLS["Filter Expression"].str.contains(r"username\s*\(", case=False) # Partition Count - partition_count = dfP.groupby('Table Name').size().reset_index(name='Partition Count') - dfT = pd.merge(dfT, partition_count, left_on='Name', right_on='Table Name', how='left').drop('Table Name', axis=1) - dfT['Partition Count'] = dfT['Partition Count'].fillna(0).astype(int) - - dfT = dfT.merge(dfP[['Table Name', 'Partition Name']], how='left', left_on='Name', right_on='Table Name') - dfT['First Partition Name'] = dfT.groupby('Name')['Partition Name'].transform('first') - dfT.drop('Table Name', axis=1, inplace=True) - - dfC['Sort By Column Object'] = format_dax_object_name(dfC['Table Name'], dfC['Sort By Column']) - dfC['Column Object'] = format_dax_object_name(dfC['Table Name'], dfC['Column Name']) - dfM['Measure Object'] = "[" + dfM['Measure Name'] + "]" - dfM['Measure Fully Qualified'] = format_dax_object_name(dfM['Table Name'], dfM['Measure Name']) - dfM['Measure Fully Qualified No Spaces'] = dfM['Table Name'] + '[' + dfM['Measure Name'] + ']' - #dfM['Measure Fully Qualified No Spaces'] = dfM.apply(lambda row: row['Table Name'] + '[' + row['Measure Name'] + ']' if ' ' not in row['Table Name'] else '', axis=1) - dfC['Column Unqualified'] = "[" + dfC['Column Name'] + "]" - dfC['Column Object No Spaces'] = dfC.apply(lambda row: row['Table Name'] + '[' + row['Column Name'] + ']' if ' ' not in row['Table Name'] else '', axis=1) - dfC['Used in Sort By'] = dfC['Column Object'].isin(dfC['Sort By Column Object']) - dfH['Column Object'] = format_dax_object_name(dfH['Table Name'], dfH['Column Name']) - dfC['Used in Hierarchy'] = dfC['Column Object'].isin(dfH['Column Object']) - dfR['From Object'] = format_dax_object_name(dfR['From Table'], dfR['From Column']) - dfR['To Object'] = format_dax_object_name(dfR['To Table'], dfR['To Column']) - dfT['Used in Relationship'] = dfT['Name'].isin(dfR['From Table']) | dfT['Name'].isin(dfR['To Table']) - dfT['Used in Relationship Both Sides'] = dfT['Name'].isin(dfR['From Table']) & dfT['Name'].isin(dfR['To Table']) - dfC['Used in Relationship'] = dfC['Column Object'].isin(dfR['From Object']) | dfC['Column Object'].isin(dfR['To Object']) - - dfR_filt = dfR[(dfR['Cross Filtering Behavior'] == 
'BothDirections') | (dfR['Multiplicity'] == 'm:m')] - dfC['Used in M2M/BiDi Relationship'] = dfC['Column Object'].isin(dfR_filt['From Object']) | dfC['Column Object'].isin(dfR_filt['To Object']) - dfC['Foreign Key'] = dfC['Column Object'].isin(dfR[dfR['From Cardinality'] == 'Many']['From Object']) - dfC['Primary Key'] = dfC['Column Object'].isin(dfR[dfR['To Cardinality'] == 'One']['To Object']) - dfT['Used in M2M Relationship'] = dfT['Name'].isin(dfR[dfR['Multiplicity'] == 'm:m'][['From Table']]) | dfT['Name'].isin(dfR[dfR['Multiplicity'] == 'm:m'][['To Table']]) - dfT['Used in Dynamic RLS'] = dfT['Name'].isin(dfRLS[dfRLS['Is Dynamic']]['Table']) - dfT['Used in RLS'] = dfT['Name'].isin(dfRLS.loc[dfRLS['Filter Expression'].str.len() > 0, 'Table']) - dfC['Primary Key'] = dfC['Column Object'].isin(dfR.loc[dfR['To Cardinality'] == 'One', 'To Object']) - dfD['Has Date Table'] = any((r['Parent Data Category'] == 'Time') & (r['Data Type'] == 'DateTime') & (r['Key'] == True) for i, r in dfC.iterrows()) - #dfC['In Date Table'] = dfC['Table Name'].isin(dfT.loc[dfT['Data Category'] == "Time", 'Name']) - dfD['Relationship Count'] = len(dfR) - dfD['M2M or BiDi Relationship Count'] = len(dfR[(dfR['Multiplicity'] == 'm:m') | (dfR['Cross Filtering Behavior'] == 'BothDirections')]) - dfD['Calculation Group Count'] = len(dfT[dfT['Type'] == 'Calculation Group']) - dfT['Has Calculation Items'] = np.where((dfT['Type'] == 'Calculation Group') & dfT['Name'].isin(dfCI['Calculation Group Name']), True, False) - dfP['Partition Object'] = format_dax_object_name(dfP['Table Name'], dfP['Partition Name']) - dfRLS['RLS Object'] = format_dax_object_name(dfRLS['Role'], dfRLS['Table']) - - function_pattern = '|'.join(dfF['FUNCTION_NAME'].map(re.escape)) - - dfM['DQ Date Function Used'] = any(dfP['Mode'] == 'DirectQuery') & dfM['Measure Expression'].str.contains(f'({function_pattern})\\s*\\(', case=False, regex=True) - - md['Reference'] = "'" + md['Referenced Table'] + "'[" + md['Referenced Object'] + ']' - - dfC['Referenced By'] = md[(md['Referenced Object Type'] == 'Column') & (md['Reference'].isin(dfC['Column Object']))].groupby('Reference').size().reset_index(name='Count')['Count'] - dfC['Referenced By'].fillna(0, inplace=True) - dfC['Referenced By'] = dfC['Referenced By'].fillna(0).astype(int) - - dfM['Referenced By'] = md[(md['Referenced Object Type'] == 'Measure') & (md['Referenced Object'].isin(dfM['Measure Name']))].groupby('Referenced Object').size().reset_index(name='Count')['Count'] - dfM['Referenced By'].fillna(0, inplace=True) - dfM['Referenced By'] = dfM['Referenced By'].fillna(0).astype(int) + partition_count = ( + dfP.groupby("Table Name").size().reset_index(name="Partition Count") + ) + dfT = pd.merge( + dfT, partition_count, left_on="Name", right_on="Table Name", how="left" + ).drop("Table Name", axis=1) + dfT["Partition Count"] = dfT["Partition Count"].fillna(0).astype(int) + + dfT = dfT.merge( + dfP[["Table Name", "Partition Name"]], + how="left", + left_on="Name", + right_on="Table Name", + ) + dfT["First Partition Name"] = dfT.groupby("Name")["Partition Name"].transform( + "first" + ) + dfT.drop("Table Name", axis=1, inplace=True) + + dfC["Sort By Column Object"] = format_dax_object_name( + dfC["Table Name"], dfC["Sort By Column"] + ) + dfC["Column Object"] = format_dax_object_name(dfC["Table Name"], dfC["Column Name"]) + dfM["Measure Object"] = "[" + dfM["Measure Name"] + "]" + dfM["Measure Fully Qualified"] = format_dax_object_name( + dfM["Table Name"], dfM["Measure Name"] + ) + dfM["Measure Fully 
Qualified No Spaces"] = ( + dfM["Table Name"] + "[" + dfM["Measure Name"] + "]" + ) + # dfM['Measure Fully Qualified No Spaces'] = dfM.apply(lambda row: row['Table Name'] + '[' + row['Measure Name'] + ']' if ' ' not in row['Table Name'] else '', axis=1) + dfC["Column Unqualified"] = "[" + dfC["Column Name"] + "]" + dfC["Column Object No Spaces"] = dfC.apply( + lambda row: ( + row["Table Name"] + "[" + row["Column Name"] + "]" + if " " not in row["Table Name"] + else "" + ), + axis=1, + ) + dfC["Used in Sort By"] = dfC["Column Object"].isin(dfC["Sort By Column Object"]) + dfH["Column Object"] = format_dax_object_name(dfH["Table Name"], dfH["Column Name"]) + dfC["Used in Hierarchy"] = dfC["Column Object"].isin(dfH["Column Object"]) + dfR["From Object"] = format_dax_object_name(dfR["From Table"], dfR["From Column"]) + dfR["To Object"] = format_dax_object_name(dfR["To Table"], dfR["To Column"]) + dfT["Used in Relationship"] = dfT["Name"].isin(dfR["From Table"]) | dfT[ + "Name" + ].isin(dfR["To Table"]) + dfT["Used in Relationship Both Sides"] = dfT["Name"].isin(dfR["From Table"]) & dfT[ + "Name" + ].isin(dfR["To Table"]) + dfC["Used in Relationship"] = dfC["Column Object"].isin(dfR["From Object"]) | dfC[ + "Column Object" + ].isin(dfR["To Object"]) + + dfR_filt = dfR[ + (dfR["Cross Filtering Behavior"] == "BothDirections") + | (dfR["Multiplicity"] == "m:m") + ] + dfC["Used in M2M/BiDi Relationship"] = dfC["Column Object"].isin( + dfR_filt["From Object"] + ) | dfC["Column Object"].isin(dfR_filt["To Object"]) + dfC["Foreign Key"] = dfC["Column Object"].isin( + dfR[dfR["From Cardinality"] == "Many"]["From Object"] + ) + dfC["Primary Key"] = dfC["Column Object"].isin( + dfR[dfR["To Cardinality"] == "One"]["To Object"] + ) + dfT["Used in M2M Relationship"] = dfT["Name"].isin( + dfR[dfR["Multiplicity"] == "m:m"][["From Table"]] + ) | dfT["Name"].isin(dfR[dfR["Multiplicity"] == "m:m"][["To Table"]]) + dfT["Used in Dynamic RLS"] = dfT["Name"].isin(dfRLS[dfRLS["Is Dynamic"]]["Table"]) + dfT["Used in RLS"] = dfT["Name"].isin( + dfRLS.loc[dfRLS["Filter Expression"].str.len() > 0, "Table"] + ) + dfC["Primary Key"] = dfC["Column Object"].isin( + dfR.loc[dfR["To Cardinality"] == "One", "To Object"] + ) + dfD["Has Date Table"] = any( + (r["Parent Data Category"] == "Time") + & (r["Data Type"] == "DateTime") + & (r["Key"] == True) + for i, r in dfC.iterrows() + ) + # dfC['In Date Table'] = dfC['Table Name'].isin(dfT.loc[dfT['Data Category'] == "Time", 'Name']) + dfD["Relationship Count"] = len(dfR) + dfD["M2M or BiDi Relationship Count"] = len( + dfR[ + (dfR["Multiplicity"] == "m:m") + | (dfR["Cross Filtering Behavior"] == "BothDirections") + ] + ) + dfD["Calculation Group Count"] = len(dfT[dfT["Type"] == "Calculation Group"]) + dfT["Has Calculation Items"] = np.where( + (dfT["Type"] == "Calculation Group") + & dfT["Name"].isin(dfCI["Calculation Group Name"]), + True, + False, + ) + dfP["Partition Object"] = format_dax_object_name( + dfP["Table Name"], dfP["Partition Name"] + ) + dfRLS["RLS Object"] = format_dax_object_name(dfRLS["Role"], dfRLS["Table"]) + + function_pattern = "|".join(dfF["FUNCTION_NAME"].map(re.escape)) + + dfM["DQ Date Function Used"] = any(dfP["Mode"] == "DirectQuery") & dfM[ + "Measure Expression" + ].str.contains(f"({function_pattern})\\s*\\(", case=False, regex=True) + + md["Reference"] = ( + "'" + md["Referenced Table"] + "'[" + md["Referenced Object"] + "]" + ) + + dfC["Referenced By"] = ( + md[ + (md["Referenced Object Type"] == "Column") + & (md["Reference"].isin(dfC["Column 
Object"])) + ] + .groupby("Reference") + .size() + .reset_index(name="Count")["Count"] + ) + dfC["Referenced By"].fillna(0, inplace=True) + dfC["Referenced By"] = dfC["Referenced By"].fillna(0).astype(int) + + dfM["Referenced By"] = ( + md[ + (md["Referenced Object Type"] == "Measure") + & (md["Referenced Object"].isin(dfM["Measure Name"])) + ] + .groupby("Referenced Object") + .size() + .reset_index(name="Count")["Count"] + ) + dfM["Referenced By"].fillna(0, inplace=True) + dfM["Referenced By"] = dfM["Referenced By"].fillna(0).astype(int) pattern = "[^\( ][a-zA-Z0-9_()-]+\[[^\[]+\]|'[^']+'\[[^\[]+\]|\[[^\[]+\]" - dfM['Has Fully Qualified Measure Reference'] = False - dfM['Has Unqualified Column Reference'] = False + dfM["Has Fully Qualified Measure Reference"] = False + dfM["Has Unqualified Column Reference"] = False for i, r in dfM.iterrows(): - tName = r['Table Name'] - mName = r['Measure Name'] - expr = r['Measure Expression'] + tName = r["Table Name"] + mName = r["Measure Name"] + expr = r["Measure Expression"] matches = re.findall(pattern, expr) for m in matches: - if m[0] == '[': - if (m in dfC['Column Unqualified'].values) and (dfC[dfC['Table Name'] == tName]['Column Unqualified'] == m).any(): - dfM.at[i, 'Has Unqualified Column Reference'] = True + if m[0] == "[": + if (m in dfC["Column Unqualified"].values) and ( + dfC[dfC["Table Name"] == tName]["Column Unqualified"] == m + ).any(): + dfM.at[i, "Has Unqualified Column Reference"] = True else: - if (m in dfM['Measure Fully Qualified'].values) | (m in dfM['Measure Fully Qualified No Spaces'].values): - dfM.at[i, 'Has Fully Qualified Measure Reference'] = True - - dfR['Inactive without USERELATIONSHIP'] = False - for i,r in dfR[dfR['Active'] == False].iterrows(): - fromTable = r['From Table'] - fromColumn = r['From Column'] - toTable = r['To Table'] - toColumn = r['To Column'] - - dfM_filt = dfM[dfM['Measure Expression'].str.contains("(?i)USERELATIONSHIP\s*\(\s*\'*" + fromTable + "\'*\[" + fromColumn + "\]\s*,\s*\'*" + toTable + "\'*\[" + toColumn + "\]" , regex=True)] + if (m in dfM["Measure Fully Qualified"].values) | ( + m in dfM["Measure Fully Qualified No Spaces"].values + ): + dfM.at[i, "Has Fully Qualified Measure Reference"] = True + + dfR["Inactive without USERELATIONSHIP"] = False + for i, r in dfR[dfR["Active"] == False].iterrows(): + fromTable = r["From Table"] + fromColumn = r["From Column"] + toTable = r["To Table"] + toColumn = r["To Column"] + + dfM_filt = dfM[ + dfM["Measure Expression"].str.contains( + "(?i)USERELATIONSHIP\s*\(\s*'*" + + fromTable + + "'*\[" + + fromColumn + + "\]\s*,\s*'*" + + toTable + + "'*\[" + + toColumn + + "\]", + regex=True, + ) + ] if len(dfM_filt) == 0: - dfR.at[i, 'Inactive without USERELATIONSHIP'] = True - - dfC['Used in RLS'] = ( - dfC['Column Object No Spaces'].isin(dfRLS['Filter Expression']) | - dfC['Column Object'].isin(dfRLS['Filter Expression']) | - dfC.apply(lambda row: any(row['Column Name'] in expr for expr in dfRLS.loc[dfRLS['Table'] == row['Table Name'], 'Filter Expression']), axis=1) + dfR.at[i, "Inactive without USERELATIONSHIP"] = True + + dfC["Used in RLS"] = ( + dfC["Column Object No Spaces"].isin(dfRLS["Filter Expression"]) + | dfC["Column Object"].isin(dfRLS["Filter Expression"]) + | dfC.apply( + lambda row: any( + row["Column Name"] in expr + for expr in dfRLS.loc[ + dfRLS["Table"] == row["Table Name"], "Filter Expression" + ] + ), + axis=1, + ) ) # Merge dfR and dfC based on 'From Object' and 'Column Object' - merged_from = pd.merge(dfR, dfC, left_on='From 
Object', right_on='Column Object', how='left') - merged_to = pd.merge(dfR, dfC, left_on='To Object', right_on='Column Object', how='left') + merged_from = pd.merge( + dfR, dfC, left_on="From Object", right_on="Column Object", how="left" + ) + merged_to = pd.merge( + dfR, dfC, left_on="To Object", right_on="Column Object", how="left" + ) - dfR['From Column Data Type'] = merged_from['Data Type'] - dfR['To Column Data Type'] = merged_to['Data Type'] + dfR["From Column Data Type"] = merged_from["Data Type"] + dfR["To Column Data Type"] = merged_to["Data Type"] # Check if USERELATIONSHIP objects are used in a given column, table - userelationship_pattern = re.compile(r"USERELATIONSHIP\s*\(\s*(.*?)\s*,\s*(.*?)\s*\)", re.DOTALL | re.IGNORECASE) + userelationship_pattern = re.compile( + r"USERELATIONSHIP\s*\(\s*(.*?)\s*,\s*(.*?)\s*\)", re.DOTALL | re.IGNORECASE + ) # Function to extract objects within USERELATIONSHIP function def extract_objects(measure_expression): @@ -543,28 +1094,47 @@ def extract_objects(measure_expression): else: return [] - dfM['USERELATIONSHIP Objects'] = dfM['Measure Expression'].apply(extract_objects) - flat_object_list = [item for sublist in dfM['USERELATIONSHIP Objects'] for item in sublist] - dfC['USERELATIONSHIP Used'] = dfC['Column Object'].isin(flat_object_list) | dfC['Column Object No Spaces'].isin(flat_object_list) - dfT['USERELATIONSHIP Used'] = dfT['Name'].isin(dfC[dfC['USERELATIONSHIP Used']]['Table Name']) - dfR['Relationship Name'] = format_dax_object_name(dfR['From Table'], dfR['From Column']) + ' -> ' + format_dax_object_name(dfR['To Table'], dfR['To Column']) - dfH = dfH[['Name', 'Description', 'Table Name', 'Hierarchy Name', 'Hierarchy Description', 'Hierarchy Object']].drop_duplicates() + dfM["USERELATIONSHIP Objects"] = dfM["Measure Expression"].apply(extract_objects) + flat_object_list = [ + item for sublist in dfM["USERELATIONSHIP Objects"] for item in sublist + ] + dfC["USERELATIONSHIP Used"] = dfC["Column Object"].isin(flat_object_list) | dfC[ + "Column Object No Spaces" + ].isin(flat_object_list) + dfT["USERELATIONSHIP Used"] = dfT["Name"].isin( + dfC[dfC["USERELATIONSHIP Used"]]["Table Name"] + ) + dfR["Relationship Name"] = ( + format_dax_object_name(dfR["From Table"], dfR["From Column"]) + + " -> " + + format_dax_object_name(dfR["To Table"], dfR["To Column"]) + ) + dfH = dfH[ + [ + "Name", + "Description", + "Table Name", + "Hierarchy Name", + "Hierarchy Description", + "Hierarchy Object", + ] + ].drop_duplicates() scope_to_dataframe = { - 'Table': (dfT, ['Name']), - 'Partition': (dfP, ['Partition Object']), - 'Column': (dfC, ['Column Object']), - 'Hierarchy': (dfH, ['Hierarchy Object']), - 'Measure': (dfM, ['Measure Name']), - 'Calculation Item': (dfCI, ['Calculation Object']), - 'Relationship': (dfR, ['Relationship Name']), - 'Row Level Security': (dfRLS, ['RLS Object']), - 'Role': (dfRole, ['Role']), - 'Model': (dfD, ['Dataset Name']) + "Table": (dfT, ["Name"]), + "Partition": (dfP, ["Partition Object"]), + "Column": (dfC, ["Column Object"]), + "Hierarchy": (dfH, ["Hierarchy Object"]), + "Measure": (dfM, ["Measure Name"]), + "Calculation Item": (dfCI, ["Calculation Object"]), + "Relationship": (dfR, ["Relationship Name"]), + "Row Level Security": (dfRLS, ["RLS Object"]), + "Role": (dfRole, ["Role"]), + "Model": (dfD, ["Dataset Name"]), } def execute_rule(row): - scopes = row['Scope'] + scopes = row["Scope"] # support both str and list as scope type if isinstance(scopes, str): @@ -577,10 +1147,10 @@ def execute_rule(row): # common fields for 
each scope (df, violation_cols_or_func) = scope_to_dataframe[scope] - if scope in ['Hierarchy', 'Measure'] and len(df) == 0: + if scope in ["Hierarchy", "Measure"] and len(df) == 0: continue # execute rule and subset df - df_violations = df[row['Expression'](df)] + df_violations = df[row["Expression"](df)] # subset the right output columns (e.g. Table Name & Column Name) if isinstance(violation_cols_or_func, list): @@ -589,46 +1159,49 @@ def execute_rule(row): violation_func = violation_cols_or_func # build output data frame - df_output = violation_func(df_violations).copy() - - df_output.columns = ['Object Name'] - df_output['Rule Name'] = row['Rule Name'] - df_output['Category'] = row['Category'] - - df_output['Object Type'] = scope - df_output['Severity'] = row['Severity'] - df_output['Description'] = row['Description'] - df_output['URL'] = row['URL'] + df_output = violation_func(df_violations).copy() + + df_output.columns = ["Object Name"] + df_output["Rule Name"] = row["Rule Name"] + df_output["Category"] = row["Category"] + + df_output["Object Type"] = scope + df_output["Severity"] = row["Severity"] + df_output["Description"] = row["Description"] + df_output["URL"] = row["URL"] df_outputs.append(df_output) - + return df_outputs # flatten list of lists flatten_dfs = [ - df - for dfs in rules_dataframe.apply(execute_rule, axis=1).tolist() - for df in dfs] + df for dfs in rules_dataframe.apply(execute_rule, axis=1).tolist() for df in dfs + ] finalDF = pd.concat(flatten_dfs, ignore_index=True) if export: lakeAttach = lakehouse_attached() if lakeAttach == False: - print(f"In order to save the Best Practice Analyzer results, a lakehouse must be attached to the notebook. Please attach a lakehouse to this notebook.") + print( + f"In order to save the Best Practice Analyzer results, a lakehouse must be attached to the notebook. Please attach a lakehouse to this notebook." 
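# Illustrative usage only: when export=True, results are appended to the 'modelbparesults'
# delta table (see the export branch below), so runs can be compared with Spark SQL.
# Qualify the table with the lakehouse name if needed, as the export code does.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
spark.sql(
    """
    SELECT RunId, Severity, COUNT(*) AS Violations
    FROM modelbparesults
    GROUP BY RunId, Severity
    ORDER BY RunId, Severity
    """
).show()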
+ ) return dfExport = finalDF.copy() delta_table_name = "modelbparesults" lakehouse_id = fabric.get_lakehouse_id() - lakehouse = resolve_lakehouse_name(lakehouse_id = lakehouse_id, workspace = workspace) + lakehouse = resolve_lakehouse_name( + lakehouse_id=lakehouse_id, workspace=workspace + ) - lakeT = get_lakehouse_tables(lakehouse = lakehouse, workspace = workspace) - lakeT_filt = lakeT[lakeT['Table Name'] == delta_table_name] + lakeT = get_lakehouse_tables(lakehouse=lakehouse, workspace=workspace) + lakeT_filt = lakeT[lakeT["Table Name"] == delta_table_name] - dfExport['Severity'].replace('⚠️', 'Warning', inplace=True) - dfExport['Severity'].replace('\u274C', 'Error', inplace=True) - dfExport['Severity'].replace('ℹ️', 'Info', inplace=True) + dfExport["Severity"].replace("⚠️", "Warning", inplace=True) + dfExport["Severity"].replace("\u274C", "Error", inplace=True) + dfExport["Severity"].replace("ℹ️", "Info", inplace=True) spark = SparkSession.builder.getOrCreate() query = f"SELECT MAX(RunId) FROM {lakehouse}.{delta_table_name}" @@ -641,36 +1214,50 @@ def execute_rule(row): runId = maxRunId + 1 now = datetime.datetime.now() - dfExport['Workspace Name'] = workspace - dfExport['Dataset Name'] = dataset - dfExport['Timestamp'] = now - dfExport['RunId'] = runId + dfExport["Workspace Name"] = workspace + dfExport["Dataset Name"] = dataset + dfExport["Timestamp"] = now + dfExport["RunId"] = runId - dfExport['RunId'] = dfExport['RunId'].astype('int') + dfExport["RunId"] = dfExport["RunId"].astype("int") - colName = 'Workspace Name' + colName = "Workspace Name" dfExport.insert(0, colName, dfExport.pop(colName)) - colName = 'Dataset Name' + colName = "Dataset Name" dfExport.insert(1, colName, dfExport.pop(colName)) - dfExport.columns = dfExport.columns.str.replace(' ', '_') + dfExport.columns = dfExport.columns.str.replace(" ", "_") spark_df = spark.createDataFrame(dfExport) - spark_df.write.mode('append').format('delta').saveAsTable(delta_table_name) - print(f"\u2022 Model Best Practice Analyzer results for the '{dataset}' semantic model have been appended to the '{delta_table_name}' delta table.") + spark_df.write.mode("append").format("delta").saveAsTable(delta_table_name) + print( + f"\u2022 Model Best Practice Analyzer results for the '{dataset}' semantic model have been appended to the '{delta_table_name}' delta table." + ) if return_dataframe: return finalDF - - pd.set_option('display.max_colwidth', 100) - finalDF = (finalDF[['Category', 'Rule Name', 'Object Type', 'Object Name' , 'Severity', 'Description', 'URL']] - .sort_values(['Category', 'Rule Name', 'Object Type', 'Object Name']) - .set_index(['Category', 'Rule Name'])) - + pd.set_option("display.max_colwidth", 100) + + finalDF = ( + finalDF[ + [ + "Category", + "Rule Name", + "Object Type", + "Object Name", + "Severity", + "Description", + "URL", + ] + ] + .sort_values(["Category", "Rule Name", "Object Type", "Object Name"]) + .set_index(["Category", "Rule Name"]) + ) + bpa2 = finalDF.reset_index() bpa_dict = { - cat: bpa2[bpa2['Category'] == cat].drop("Category", axis=1) - for cat in bpa2['Category'].drop_duplicates().values + cat: bpa2[bpa2["Category"] == cat].drop("Category", axis=1) + for cat in bpa2["Category"].drop_duplicates().values } styles = """ @@ -727,39 +1314,41 @@ def execute_rule(row): # HTML for tabs tab_html = '
' - content_html = '' + content_html = "" for i, (title, df) in enumerate(bpa_dict.items()): if df.shape[0] == 0: continue tab_id = f"tab{i}" - active_class = '' + active_class = "" if i == 0: - active_class = 'active' + active_class = "active" - summary = " + ".join([f'{idx} ({v})' for idx, v in df['Severity'].value_counts().items()]) + summary = " + ".join( + [f"{idx} ({v})" for idx, v in df["Severity"].value_counts().items()] + ) tab_html += f'' content_html += f'
' # Adding tooltip for Rule Name using Description column content_html += '' - content_html += '' + content_html += "" for _, row in df.iterrows(): - content_html += f'' + content_html += f"" if pd.notnull(row["URL"]): - content_html += f'' - elif pd.notnull(row['Description']): - content_html += f'' + content_html += f'' + elif pd.notnull(row["Description"]): + content_html += f'' else: - content_html += f'' + content_html += f'' content_html += f'' content_html += f'' content_html += f'' - content_html += f'' - content_html += '
Rule NameObject TypeObject NameSeverity
Rule NameObject TypeObject NameSeverity
{row["Rule Name"]}{row["Description"]}{row["Rule Name"]}{row["Description"]}{row["Rule Name"]}{row["Description"]}{row["Rule Name"]}{row["Description"]}{row["Rule Name"]}{row["Rule Name"]}{row["Object Type"]}{row["Object Name"]}{row["Severity"]}
' + content_html += f"" + content_html += "" - content_html += '
' - tab_html += '
' + content_html += "" + tab_html += "" # Display the tabs, tab contents, and run the script - return display(HTML(styles + tab_html + content_html + script)) \ No newline at end of file + return display(HTML(styles + tab_html + content_html + script)) diff --git a/sempy_labs/OneLakeIntegration.py b/sempy_labs/OneLakeIntegration.py index cd09c1fc..d73d104d 100644 --- a/sempy_labs/OneLakeIntegration.py +++ b/sempy_labs/OneLakeIntegration.py @@ -4,9 +4,14 @@ from typing import List, Optional, Union from sempy._utils._log import log -@log -def export_model_to_onelake(dataset: str, workspace: Optional[str] = None, destination_lakehouse: Optional[str] = None, destination_workspace: Optional[str] = None): +@log +def export_model_to_onelake( + dataset: str, + workspace: Optional[str] = None, + destination_lakehouse: Optional[str] = None, + destination_workspace: Optional[str] = None, +): """ Exports a semantic model's tables to delta tables in the lakehouse. Creates shortcuts to the tables if a lakehouse is specified. @@ -22,11 +27,11 @@ def export_model_to_onelake(dataset: str, workspace: Optional[str] = None, desti The name of the Fabric lakehouse where shortcuts will be created to access the delta tables created by the export. If the lakehouse specified does not exist, one will be created with that name. If no lakehouse is specified, shortcuts will not be created. destination_workspace : str, default=None The name of the Fabric workspace in which the lakehouse resides. - + Returns ------- - + """ if workspace == None: @@ -41,11 +46,13 @@ def export_model_to_onelake(dataset: str, workspace: Optional[str] = None, desti else: destination_workspace_id = fabric.resolve_workspace_id(destination_workspace) - dfD = fabric.list_datasets(workspace = workspace) - dfD_filt = dfD[dfD['Dataset Name'] == dataset] + dfD = fabric.list_datasets(workspace=workspace) + dfD_filt = dfD[dfD["Dataset Name"] == dataset] if len(dfD_filt) == 0: - print(f"The '{dataset}' semantic model does not exist in the '{workspace}' workspace.") + print( + f"The '{dataset}' semantic model does not exist in the '{workspace}' workspace." + ) return tmsl = f""" @@ -64,63 +71,94 @@ def export_model_to_onelake(dataset: str, workspace: Optional[str] = None, desti # Export model's tables as delta tables try: - fabric.execute_tmsl(script = tmsl, workspace = workspace) - print(f"The '{dataset}' semantic model's tables have been exported as delta tables to the '{workspace}' workspace.\n") + fabric.execute_tmsl(script=tmsl, workspace=workspace) + print( + f"The '{dataset}' semantic model's tables have been exported as delta tables to the '{workspace}' workspace.\n" + ) except: - print(f"ERROR: The '{dataset}' semantic model's tables have not been exported as delta tables to the '{workspace}' workspace.") - print(f"Make sure you enable OneLake integration for the '{dataset}' semantic model. Follow the instructions here: https://learn.microsoft.com/power-bi/enterprise/onelake-integration-overview#enable-onelake-integration") + print( + f"ERROR: The '{dataset}' semantic model's tables have not been exported as delta tables to the '{workspace}' workspace." + ) + print( + f"Make sure you enable OneLake integration for the '{dataset}' semantic model. Follow the instructions here: https://learn.microsoft.com/power-bi/enterprise/onelake-integration-overview#enable-onelake-integration" + ) return - + # Create shortcuts if destination lakehouse is specified if destination_lakehouse is not None: # Destination... 
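# Illustrative usage only (hypothetical names): export a semantic model's tables to OneLake
# and create shortcuts to them in a destination lakehouse. OneLake integration must be
# enabled on the semantic model first (see the link printed in the error path above).
from sempy_labs.OneLakeIntegration import export_model_to_onelake

export_model_to_onelake(
    dataset="Sales Model",
    workspace="My Workspace",
    destination_lakehouse="SalesLakehouse",
    destination_workspace="My Workspace",
)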
- dfI_Dest = fabric.list_items(workspace = destination_workspace, type = 'Lakehouse') - dfI_filt = dfI_Dest[(dfI_Dest['Display Name'] == destination_lakehouse)] + dfI_Dest = fabric.list_items(workspace=destination_workspace, type="Lakehouse") + dfI_filt = dfI_Dest[(dfI_Dest["Display Name"] == destination_lakehouse)] if len(dfI_filt) == 0: - print(f"The '{destination_lakehouse}' lakehouse does not exist within the '{destination_workspace}' workspace.") + print( + f"The '{destination_lakehouse}' lakehouse does not exist within the '{destination_workspace}' workspace." + ) # Create lakehouse - destination_lakehouse_id = fabric.create_lakehouse(display_name = destination_lakehouse, workspace = destination_workspace) - print(f"The '{destination_lakehouse}' lakehouse has been created within the '{destination_workspace}' workspace.\n") + destination_lakehouse_id = fabric.create_lakehouse( + display_name=destination_lakehouse, workspace=destination_workspace + ) + print( + f"The '{destination_lakehouse}' lakehouse has been created within the '{destination_workspace}' workspace.\n" + ) else: - destination_lakehouse_id = dfI_filt['Id'].iloc[0] + destination_lakehouse_id = dfI_filt["Id"].iloc[0] # Source... - dfI_Source = fabric.list_items(workspace = workspace, type = 'SemanticModel') - dfI_filtSource = dfI_Source[(dfI_Source['Display Name'] == dataset)] - sourceLakehouseId = dfI_filtSource['Id'].iloc[0] + dfI_Source = fabric.list_items(workspace=workspace, type="SemanticModel") + dfI_filtSource = dfI_Source[(dfI_Source["Display Name"] == dataset)] + sourceLakehouseId = dfI_filtSource["Id"].iloc[0] # Valid tables - dfP = fabric.list_partitions(dataset = dataset, workspace = workspace, additional_xmla_properties=['Parent.SystemManaged']) - dfP_filt = dfP[(dfP['Mode'] == 'Import') & (dfP['Source Type'] != 'CalculationGroup') & (dfP['Parent System Managed'] == False)] - dfC = fabric.list_columns(dataset = dataset, workspace = workspace) - tmc = pd.DataFrame(dfP.groupby('Table Name')['Mode'].nunique()).reset_index() - oneMode = tmc[tmc['Mode'] == 1] - tableAll = dfP_filt[dfP_filt['Table Name'].isin(dfC['Table Name'].values) & (dfP_filt['Table Name'].isin(oneMode['Table Name'].values))] - tables = tableAll['Table Name'].unique() + dfP = fabric.list_partitions( + dataset=dataset, + workspace=workspace, + additional_xmla_properties=["Parent.SystemManaged"], + ) + dfP_filt = dfP[ + (dfP["Mode"] == "Import") + & (dfP["Source Type"] != "CalculationGroup") + & (dfP["Parent System Managed"] == False) + ] + dfC = fabric.list_columns(dataset=dataset, workspace=workspace) + tmc = pd.DataFrame(dfP.groupby("Table Name")["Mode"].nunique()).reset_index() + oneMode = tmc[tmc["Mode"] == 1] + tableAll = dfP_filt[ + dfP_filt["Table Name"].isin(dfC["Table Name"].values) + & (dfP_filt["Table Name"].isin(oneMode["Table Name"].values)) + ] + tables = tableAll["Table Name"].unique() client = fabric.FabricRestClient() print("Creating shortcuts...\n") - for tableName in tables: - tablePath = 'Tables/' + tableName - shortcutName = tableName.replace(' ','') + for tableName in tables: + tablePath = "Tables/" + tableName + shortcutName = tableName.replace(" ", "") request_body = { - "path": 'Tables', - "name": shortcutName, - "target": { - "oneLake": { - "workspaceId": workspace_id, - "itemId": sourceLakehouseId, - "path": tablePath} - } + "path": "Tables", + "name": shortcutName, + "target": { + "oneLake": { + "workspaceId": workspace_id, + "itemId": sourceLakehouseId, + "path": tablePath, + } + }, } try: - response = 
client.post(f"/v1/workspaces/{destination_workspace_id}/items/{destination_lakehouse_id}/shortcuts",json=request_body) - if response.status_code == 201: - print(f"\u2022 The shortcut '{shortcutName}' was created in the '{destination_lakehouse}' lakehouse within the '{destination_workspace}' workspace. It is based on the '{tableName}' table in the '{dataset}' semantic model within the '{workspace}' workspace.\n") + response = client.post( + f"/v1/workspaces/{destination_workspace_id}/items/{destination_lakehouse_id}/shortcuts", + json=request_body, + ) + if response.status_code == 201: + print( + f"\u2022 The shortcut '{shortcutName}' was created in the '{destination_lakehouse}' lakehouse within the '{destination_workspace}' workspace. It is based on the '{tableName}' table in the '{dataset}' semantic model within the '{workspace}' workspace.\n" + ) else: print(response.status_code) except: - print(f"ERROR: Failed to create a shortcut for the '{tableName}' table.") \ No newline at end of file + print( + f"ERROR: Failed to create a shortcut for the '{tableName}' table." + ) diff --git a/sempy_labs/QSO.py b/sempy_labs/QSO.py index a685e94b..10e74e98 100644 --- a/sempy_labs/QSO.py +++ b/sempy_labs/QSO.py @@ -1,16 +1,12 @@ import sempy import sempy.fabric as fabric import pandas as pd -from .HelperFunctions import resolve_dataset_id +from ._helper_functions import resolve_dataset_id from typing import List, Optional, Union +import sempy_labs._icons as icons -green_dot = '\U0001F7E2' -yellow_dot = '\U0001F7E1' -red_dot = '\U0001F534' -in_progress = '⌛' def qso_sync(dataset: str, workspace: Optional[str] = None): - """ Triggers a query scale-out sync of read-only replicas for the specified dataset from the specified workspace. @@ -22,14 +18,13 @@ def qso_sync(dataset: str, workspace: Optional[str] = None): The Fabric workspace name. Defaults to None which resolves to the workspace of the attached lakehouse or if no lakehouse attached, resolves to the workspace of the notebook. - + Returns ------- - - """ - #https://learn.microsoft.com/en-us/rest/api/power-bi/datasets/trigger-query-scale-out-sync-in-group + """ + # https://learn.microsoft.com/en-us/rest/api/power-bi/datasets/trigger-query-scale-out-sync-in-group if workspace is None: workspace_id = fabric.get_workspace_id() @@ -40,15 +35,21 @@ def qso_sync(dataset: str, workspace: Optional[str] = None): dataset_id = resolve_dataset_id(dataset, workspace) client = fabric.PowerBIRestClient() - response = client.post(f"/v1.0/myorg/groups/{workspace_id}/datasets/{dataset_id}/queryScaleOut/sync") + response = client.post( + f"/v1.0/myorg/groups/{workspace_id}/datasets/{dataset_id}/queryScaleOut/sync" + ) if response.status_code == 200: - print(f"{green_dot} QSO sync initiated for the '{dataset}' semantic model within the '{workspace}' workspace.") + print( + f"{icons.green_dot} QSO sync initiated for the '{dataset}' semantic model within the '{workspace}' workspace." + ) else: - print(f"{red_dot} QSO sync failed for the '{dataset}' semantic model within the '{workspace}' workspace.") + print( + f"{icons.red_dot} QSO sync failed for the '{dataset}' semantic model within the '{workspace}' workspace." + ) -def qso_sync_status(dataset: str, workspace: Optional[str] = None): +def qso_sync_status(dataset: str, workspace: Optional[str] = None): """ Returns the query scale-out sync status for the specified dataset from the specified workspace. 
@@ -63,14 +64,28 @@ def qso_sync_status(dataset: str, workspace: Optional[str] = None): Returns ------- - - """ - - #https://learn.microsoft.com/en-us/rest/api/power-bi/datasets/get-query-scale-out-sync-status-in-group - df = pd.DataFrame(columns=['Scale Out Status', 'Sync Start Time', 'Sync End Time', 'Commit Version', 'Commit Timestamp', 'Target Sync Version', 'Target Sync Timestamp', 'Trigger Reason', 'Min Active Read Version', 'Min Active Read Timestamp']) - dfRep = pd.DataFrame(columns=['Replica ID', 'Replica Type', 'Replica Version', 'Replica Timestamp']) + """ + # https://learn.microsoft.com/en-us/rest/api/power-bi/datasets/get-query-scale-out-sync-status-in-group + + df = pd.DataFrame( + columns=[ + "Scale Out Status", + "Sync Start Time", + "Sync End Time", + "Commit Version", + "Commit Timestamp", + "Target Sync Version", + "Target Sync Timestamp", + "Trigger Reason", + "Min Active Read Version", + "Min Active Read Timestamp", + ] + ) + dfRep = pd.DataFrame( + columns=["Replica ID", "Replica Type", "Replica Version", "Replica Timestamp"] + ) if workspace is None: workspace_id = fabric.get_workspace_id() @@ -81,29 +96,51 @@ def qso_sync_status(dataset: str, workspace: Optional[str] = None): dataset_id = resolve_dataset_id(dataset, workspace) client = fabric.PowerBIRestClient() - response = client.get(f"/v1.0/myorg/groups/{workspace_id}/datasets/{dataset_id}/queryScaleOut/syncStatus") + response = client.get( + f"/v1.0/myorg/groups/{workspace_id}/datasets/{dataset_id}/queryScaleOut/syncStatus" + ) if response.status_code == 200: o = response.json() - sos = o['scaleOutStatus'] - - if sos == 'Enabled': - new_data = {'Scale Out Status': o['scaleOutStatus'], 'Sync Start Time': o['syncStartTime'], 'Sync End Time': o['syncEndTime'], 'Commit Version': o['commitVersion'], 'Commit Timestamp': o['commitTimestamp'], 'Target Sync Version': o['targetSyncVersion'], 'Target Sync Timestamp': o['targetSyncTimestamp'], 'Trigger Reason': o['triggerReason'], 'Min Active Read Version': o['minActiveReadVersion'], 'Min Active Read Timestamp': o['minActiveReadTimestamp']} + sos = o["scaleOutStatus"] + + if sos == "Enabled": + new_data = { + "Scale Out Status": o["scaleOutStatus"], + "Sync Start Time": o["syncStartTime"], + "Sync End Time": o["syncEndTime"], + "Commit Version": o["commitVersion"], + "Commit Timestamp": o["commitTimestamp"], + "Target Sync Version": o["targetSyncVersion"], + "Target Sync Timestamp": o["targetSyncTimestamp"], + "Trigger Reason": o["triggerReason"], + "Min Active Read Version": o["minActiveReadVersion"], + "Min Active Read Timestamp": o["minActiveReadTimestamp"], + } df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) - for r in o['scaleOutReplicas']: - new_data = {'Replica ID': r['replicaId'], 'Replica Type': r['replicaType'], 'Replica Version': str(r['replicaVersion']), 'Replica Timestamp': r['replicaTimestamp']} - dfRep = pd.concat([dfRep, pd.DataFrame(new_data, index=[0])], ignore_index=True) - - df['Sync Start Time'] = pd.to_datetime(df['Sync Start Time']) - df['Sync End Time'] = pd.to_datetime(df['Sync End Time']) - df['Commit Timestamp'] = pd.to_datetime(df['Commit Timestamp']) - df['Target Sync Timestamp'] = pd.to_datetime(df['Target Sync Timestamp']) - df['Min Active Read Timestamp'] = pd.to_datetime(df['Min Active Read Timestamp']) - dfRep['Replica Timestamp'] = pd.to_datetime(dfRep['Replica Timestamp']) - df['Commit Version'] = df['Commit Version'].astype('int') - df['Target Sync Version'] = df['Target Sync Version'].astype('int') - df['Min 
Active Read Version'] = df['Min Active Read Version'].astype('int') + for r in o["scaleOutReplicas"]: + new_data = { + "Replica ID": r["replicaId"], + "Replica Type": r["replicaType"], + "Replica Version": str(r["replicaVersion"]), + "Replica Timestamp": r["replicaTimestamp"], + } + dfRep = pd.concat( + [dfRep, pd.DataFrame(new_data, index=[0])], ignore_index=True + ) + + df["Sync Start Time"] = pd.to_datetime(df["Sync Start Time"]) + df["Sync End Time"] = pd.to_datetime(df["Sync End Time"]) + df["Commit Timestamp"] = pd.to_datetime(df["Commit Timestamp"]) + df["Target Sync Timestamp"] = pd.to_datetime(df["Target Sync Timestamp"]) + df["Min Active Read Timestamp"] = pd.to_datetime( + df["Min Active Read Timestamp"] + ) + dfRep["Replica Timestamp"] = pd.to_datetime(dfRep["Replica Timestamp"]) + df["Commit Version"] = df["Commit Version"].astype("int") + df["Target Sync Version"] = df["Target Sync Version"].astype("int") + df["Min Active Read Version"] = df["Min Active Read Version"].astype("int") return df, dfRep else: @@ -112,8 +149,8 @@ def qso_sync_status(dataset: str, workspace: Optional[str] = None): else: return response.status_code -def disable_qso(dataset: str, workspace: Optional[str] = None): +def disable_qso(dataset: str, workspace: Optional[str] = None): """ Sets the max read-only replicas to 0, disabling query scale out. @@ -128,7 +165,7 @@ def disable_qso(dataset: str, workspace: Optional[str] = None): Returns ------- - + """ if workspace is None: @@ -139,23 +176,28 @@ def disable_qso(dataset: str, workspace: Optional[str] = None): dataset_id = resolve_dataset_id(dataset, workspace) - request_body = { - "queryScaleOutSettings": { - "maxReadOnlyReplicas": '0' - } - } + request_body = {"queryScaleOutSettings": {"maxReadOnlyReplicas": "0"}} client = fabric.PowerBIRestClient() - response = client.patch(f"/v1.0/myorg/groups/{workspace_id}/datasets/{dataset_id}", json = request_body) + response = client.patch( + f"/v1.0/myorg/groups/{workspace_id}/datasets/{dataset_id}", json=request_body + ) if response.status_code == 200: - df = list_qso_settings(dataset = dataset, workspace = workspace) - print(f"{green_dot} Query scale out has been disabled for the '{dataset}' semantic model within the '{workspace}' workspace.") + df = list_qso_settings(dataset=dataset, workspace=workspace) + print( + f"{icons.green_dot} Query scale out has been disabled for the '{dataset}' semantic model within the '{workspace}' workspace." + ) return df else: - return f"{red_dot} {response.status_code}" + return f"{icons.red_dot} {response.status_code}" -def set_qso(dataset: str, auto_sync: Optional[bool] = True, max_read_only_replicas: Optional[int] = -1, workspace: Optional[str] = None): +def set_qso( + dataset: str, + auto_sync: Optional[bool] = True, + max_read_only_replicas: Optional[int] = -1, + workspace: Optional[str] = None, +): """ Sets the query scale out settings for a semantic model. 
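A usage sketch for set_qso as defined above, assuming the sempy_labs.QSO import path; the dataset and workspace names and the replica count are placeholders.

# Hypothetical example: enable query scale out with auto-sync and up to two read-only replicas.
from sempy_labs.QSO import set_qso

set_qso(
    dataset="Sales Model",
    auto_sync=True,
    max_read_only_replicas=2,
    workspace="Sales Workspace",
)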
@@ -174,10 +216,10 @@ def set_qso(dataset: str, auto_sync: Optional[bool] = True, max_read_only_replic Returns ------- - + """ - #https://learn.microsoft.com/en-us/rest/api/power-bi/datasets/update-dataset-in-group + # https://learn.microsoft.com/en-us/rest/api/power-bi/datasets/update-dataset-in-group if workspace is None: workspace_id = fabric.get_workspace_id() @@ -188,34 +230,46 @@ def set_qso(dataset: str, auto_sync: Optional[bool] = True, max_read_only_replic dataset_id = resolve_dataset_id(dataset, workspace) if max_read_only_replicas == 0: - disable_qso(dataset = dataset, workspace = workspace) + disable_qso(dataset=dataset, workspace=workspace) return request_body = { "queryScaleOutSettings": { "autoSyncReadOnlyReplicas": auto_sync, - "maxReadOnlyReplicas": str(max_read_only_replicas) + "maxReadOnlyReplicas": str(max_read_only_replicas), } } - ssm = set_semantic_model_storage_format(dataset = dataset, storage_format='Large', workspace=workspace) + ssm = set_semantic_model_storage_format( + dataset=dataset, storage_format="Large", workspace=workspace + ) if ssm == 200: client = fabric.PowerBIRestClient() - response = client.patch(f"/v1.0/myorg/groups/{workspace_id}/datasets/{dataset_id}", json = request_body) + response = client.patch( + f"/v1.0/myorg/groups/{workspace_id}/datasets/{dataset_id}", + json=request_body, + ) if response.status_code == 200: - df = list_qso_settings(dataset = dataset, workspace = workspace) - print(f"{green_dot} Query scale out has been set on the '{dataset}' semantic model within the '{workspace}' workspace.") + df = list_qso_settings(dataset=dataset, workspace=workspace) + print( + f"{icons.green_dot} Query scale out has been set on the '{dataset}' semantic model within the '{workspace}' workspace." + ) return df else: - return f"{red_dot} {response.status_code}" + return f"{icons.red_dot} {response.status_code}" else: - print(f"{red_dot} Failed to set the '{dataset}' semantic model within the '{workspace}' workspace to large semantic model storage format. This is a prerequisite for enabling Query Scale Out.") - print("https://learn.microsoft.com/power-bi/enterprise/service-premium-scale-out#prerequisites") + print( + f"{icons.red_dot} Failed to set the '{dataset}' semantic model within the '{workspace}' workspace to large semantic model storage format. This is a prerequisite for enabling Query Scale Out." + ) + print( + "https://learn.microsoft.com/power-bi/enterprise/service-premium-scale-out#prerequisites" + ) return - -def set_semantic_model_storage_format(dataset: str, storage_format: str, workspace: Optional[str] = None): +def set_semantic_model_storage_format( + dataset: str, storage_format: str, workspace: Optional[str] = None +): """ Sets the semantic model storage format. 
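A short usage sketch for set_semantic_model_storage_format above, assuming it is importable from sempy_labs.QSO; the dataset and workspace names are placeholders.

# Hypothetical example: 'Large' maps to the PremiumFiles storage mode, 'Small' to Abf.
from sempy_labs.QSO import set_semantic_model_storage_format

set_semantic_model_storage_format(
    dataset="Sales Model", storage_format="Large", workspace="Sales Workspace"
)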
@@ -232,7 +286,7 @@ def set_semantic_model_storage_format(dataset: str, storage_format: str, workspa Returns ------- - + """ if workspace is None: @@ -245,35 +299,37 @@ def set_semantic_model_storage_format(dataset: str, storage_format: str, workspa storage_format = storage_format.capitalize() - if storage_format == 'Abf': - storage_format = 'Small' - elif storage_format.startswith('Premium'): - storage_format = 'Large' + if storage_format == "Abf": + storage_format = "Small" + elif storage_format.startswith("Premium"): + storage_format = "Large" - storageFormats = ['Small', 'Large'] + storageFormats = ["Small", "Large"] - if storage_format == 'Large': - request_body = { - "targetStorageMode": "PremiumFiles" - } - elif storage_format == 'Small': - request_body = { - "targetStorageMode": "Abf" - } + if storage_format == "Large": + request_body = {"targetStorageMode": "PremiumFiles"} + elif storage_format == "Small": + request_body = {"targetStorageMode": "Abf"} else: - print(f"{red_dot} Invalid storage format value. Valid options: {storageFormats}.") + print( + f"{icons.red_dot} Invalid storage format value. Valid options: {storageFormats}." + ) return client = fabric.PowerBIRestClient() - response = client.patch(f"/v1.0/myorg/groups/{workspace_id}/datasets/{dataset_id}", json = request_body) + response = client.patch( + f"/v1.0/myorg/groups/{workspace_id}/datasets/{dataset_id}", json=request_body + ) if response.status_code == 200: - return print(f"{green_dot} Semantic model storage format set to '{storage_format}'.") + return print( + f"{icons.green_dot} Semantic model storage format set to '{storage_format}'." + ) else: - return f"{red_dot} {response.status_code}" + return f"{icons.red_dot} {response.status_code}" -def list_qso_settings(dataset: Optional[str] = None, workspace: Optional[str] = None): +def list_qso_settings(dataset: Optional[str] = None, workspace: Optional[str] = None): """ Shows the query scale out settings for a semantic model (or all semantic models within a workspace). 
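A usage sketch for list_qso_settings above, assuming the sempy_labs.QSO import path; omitting the dataset argument returns the settings for every semantic model in the workspace, and the column names referenced below are the ones built by the function.

# Hypothetical example: list QSO settings for all semantic models in a workspace.
from sempy_labs.QSO import list_qso_settings

df_qso = list_qso_settings(workspace="Sales Workspace")
print(df_qso[["Dataset Name", "Storage Mode", "QSO Max Read Only Replicas"]])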
@@ -302,28 +358,48 @@ def list_qso_settings(dataset: Optional[str] = None, workspace: Optional[str] = dataset_id = resolve_dataset_id(dataset, workspace) workspace_id = fabric.get_workspace_id() - df = pd.DataFrame(columns=['Dataset Id', 'Dataset Name', 'Storage Mode', 'QSO Auto Sync Enabled', 'QSO Max Read Only Replicas']) + df = pd.DataFrame( + columns=[ + "Dataset Id", + "Dataset Name", + "Storage Mode", + "QSO Auto Sync Enabled", + "QSO Max Read Only Replicas", + ] + ) client = fabric.PowerBIRestClient() response = client.get(f"/v1.0/myorg/groups/{workspace_id}/datasets") - for v in response.json()['value']: - tsm = v['targetStorageMode'] - if tsm == 'Abf': - sm = 'Small' + for v in response.json()["value"]: + tsm = v["targetStorageMode"] + if tsm == "Abf": + sm = "Small" else: - sm = 'Large' - new_data = {'Dataset Id': v['id'], 'Dataset Name': v['name'], 'Storage Mode': sm, 'QSO Auto Sync Enabled': v['queryScaleOutSettings']['autoSyncReadOnlyReplicas'], 'QSO Max Read Only Replicas': v['queryScaleOutSettings']['maxReadOnlyReplicas'] } + sm = "Large" + new_data = { + "Dataset Id": v["id"], + "Dataset Name": v["name"], + "Storage Mode": sm, + "QSO Auto Sync Enabled": v["queryScaleOutSettings"][ + "autoSyncReadOnlyReplicas" + ], + "QSO Max Read Only Replicas": v["queryScaleOutSettings"][ + "maxReadOnlyReplicas" + ], + } df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) - df['QSO Auto Sync Enabled'] = df['QSO Auto Sync Enabled'].astype('bool') - df['QSO Max Read Only Replicas'] = df['QSO Max Read Only Replicas'].astype('int') - + df["QSO Auto Sync Enabled"] = df["QSO Auto Sync Enabled"].astype("bool") + df["QSO Max Read Only Replicas"] = df["QSO Max Read Only Replicas"].astype("int") + if dataset is not None: - df = df[df['Dataset Id'] == dataset_id] - + df = df[df["Dataset Id"] == dataset_id] + return df -def set_workspace_default_storage_format(storage_format: str, workspace: Optional[str] = None): +def set_workspace_default_storage_format( + storage_format: str, workspace: Optional[str] = None +): """ Sets the default storage format for semantic models within a workspace. @@ -338,17 +414,19 @@ def set_workspace_default_storage_format(storage_format: str, workspace: Optiona Returns ------- - + """ - #https://learn.microsoft.com/en-us/rest/api/power-bi/groups/update-group#defaultdatasetstorageformat + # https://learn.microsoft.com/en-us/rest/api/power-bi/groups/update-group#defaultdatasetstorageformat - storageFormats = ['Small', 'Large'] + storageFormats = ["Small", "Large"] storage_format = storage_format.capitalize() if storage_format not in storageFormats: - print(f"Invalid storage format. Please choose from these options: {storageFormats}.") + print( + f"Invalid storage format. Please choose from these options: {storageFormats}." 
+ ) if workspace is None: workspace_id = fabric.get_workspace_id() @@ -356,15 +434,14 @@ def set_workspace_default_storage_format(storage_format: str, workspace: Optiona else: workspace_id = fabric.resolve_workspace_id(workspace) - request_body = { - "name": workspace, - "defaultDatasetStorageFormat": storage_format - } + request_body = {"name": workspace, "defaultDatasetStorageFormat": storage_format} client = fabric.PowerBIRestClient() - response = client.patch(f"/v1.0/myorg/groups/{workspace_id}", json = request_body) + response = client.patch(f"/v1.0/myorg/groups/{workspace_id}", json=request_body) if response.status_code == 200: - print(f"{green_dot} The default storage format for the '{workspace}' workspace has been updated to '{storage_format}.") + print( + f"{icons.green_dot} The default storage format for the '{workspace}' workspace has been updated to '{storage_format}." + ) else: - print(f"{red_dot} {response.status_code}") \ No newline at end of file + print(f"{icons.red_dot} {response.status_code}") diff --git a/sempy_labs/RefreshCalcTables.py b/sempy_labs/RefreshCalcTables.py index 010e9a0c..3fe8d733 100644 --- a/sempy_labs/RefreshCalcTables.py +++ b/sempy_labs/RefreshCalcTables.py @@ -6,15 +6,11 @@ from .TOM import connect_semantic_model from typing import List, Optional, Union from sempy._utils._log import log +import sempy_labs._icons as icons -green_dot = '\U0001F7E2' -yellow_dot = '\U0001F7E1' -red_dot = '\U0001F534' -in_progress = '⌛' @log def refresh_calc_tables(dataset: str, workspace: Optional[str] = None): - """ Recreates the delta tables in the lakehouse based on the DAX expressions stored as model annotations in the Direct Lake semantic model. @@ -26,17 +22,12 @@ def refresh_calc_tables(dataset: str, workspace: Optional[str] = None): The Fabric workspace name. Defaults to None which resolves to the workspace of the attached lakehouse or if no lakehouse attached, resolves to the workspace of the notebook. 
- - Returns - ------- - """ if workspace == None: workspace_id = fabric.get_workspace_id() workspace = fabric.resolve_workspace_name(workspace_id) - spark = SparkSession.builder.getOrCreate() start_time = datetime.datetime.now() @@ -45,66 +36,98 @@ def refresh_calc_tables(dataset: str, workspace: Optional[str] = None): while not success: try: - with connect_semantic_model(dataset=dataset, readonly=True, workspace=workspace) as tom: + with connect_semantic_model( + dataset=dataset, readonly=True, workspace=workspace + ) as tom: success = True for a in tom.model.Annotations: if any(a.Name == t.Name for t in tom.model.Tables): tName = a.Name query = a.Value - - if not query.startswith('EVALUATE'): - daxquery = 'EVALUATE \n' + query + + if not query.startswith("EVALUATE"): + daxquery = "EVALUATE \n" + query else: daxquery = query try: - df = fabric.evaluate_dax(dataset = dataset, dax_string = daxquery, workspace = workspace) + df = fabric.evaluate_dax( + dataset=dataset, + dax_string=daxquery, + workspace=workspace, + ) # Update column names for non-field parameters - if query.find('NAMEOF') == -1: + if query.find("NAMEOF") == -1: for old_column_name in df.columns: pattern = r"\[([^\]]+)\]" - - matches = re.findall(pattern, old_column_name) + + matches = re.findall(pattern, old_column_name) new_column_name = matches[0] - new_column_name = new_column_name.replace(' ','') - - df.rename(columns={old_column_name: new_column_name}, inplace=True) + new_column_name = new_column_name.replace(" ", "") + + df.rename( + columns={old_column_name: new_column_name}, + inplace=True, + ) # Update data types for lakehouse columns - dataType = next(str(c.DataType) for c in tom.all_columns() if c.Parent.Name == tName and c.SourceColumn == new_column_name) - #dfC_type = dfC[(dfC['Table Name'] == tName) & (dfC['Source'] == new_column_name)] - #dataType = dfC_type['Data Type'].iloc[0] - - if dataType == 'Int64': - df[new_column_name] = df[new_column_name].astype(int) - elif dataType in ['Decimal', 'Double']: - df[new_column_name] = df[new_column_name].astype(float) - elif dataType == 'Boolean': - df[new_column_name] = df[new_column_name].astype(bool) - elif dataType == 'DateTime': - df[new_column_name] = pd.to_datetime(df[new_column_name]) + dataType = next( + str(c.DataType) + for c in tom.all_columns() + if c.Parent.Name == tName + and c.SourceColumn == new_column_name + ) + # dfC_type = dfC[(dfC['Table Name'] == tName) & (dfC['Source'] == new_column_name)] + # dataType = dfC_type['Data Type'].iloc[0] + + if dataType == "Int64": + df[new_column_name] = df[ + new_column_name + ].astype(int) + elif dataType in ["Decimal", "Double"]: + df[new_column_name] = df[ + new_column_name + ].astype(float) + elif dataType == "Boolean": + df[new_column_name] = df[ + new_column_name + ].astype(bool) + elif dataType == "DateTime": + df[new_column_name] = pd.to_datetime( + df[new_column_name] + ) else: - df[new_column_name] = df[new_column_name].astype(str) - #else: + df[new_column_name] = df[ + new_column_name + ].astype(str) + # else: # second_column_name = df.columns[1] # third_column_name = df.columns[2] # df[third_column_name] = df[third_column_name].astype(int) - # Remove calc columns from field parameters + # Remove calc columns from field parameters # mask = df[second_column_name].isin(dfC_filt['Full Column Name']) # df = df[~mask] - delta_table_name = tName.replace(' ','_') - print(f"{in_progress} Refresh of the '{delta_table_name}' table within the lakehouse is in progress...") + delta_table_name = tName.replace(" ", 
"_") + print( + f"{icons.in_progress} Refresh of the '{delta_table_name}' table within the lakehouse is in progress..." + ) spark_df = spark.createDataFrame(df) - spark_df.write.mode('overwrite').format('delta').saveAsTable(delta_table_name) - print(f"{green_dot} Calculated table '{tName}' has been refreshed as the '{delta_table_name.lower()}' table in the lakehouse.") + spark_df.write.mode("overwrite").format( + "delta" + ).saveAsTable(delta_table_name) + print( + f"{icons.green_dot} Calculated table '{tName}' has been refreshed as the '{delta_table_name.lower()}' table in the lakehouse." + ) except: - print(f"{red_dot} Failed to create calculated table '{tName}' as a delta table in the lakehouse.") + print( + f"{icons.red_dot} Failed to create calculated table '{tName}' as a delta table in the lakehouse." + ) except Exception as e: if datetime.datetime.now() - start_time > timeout: break - time.sleep(1) \ No newline at end of file + time.sleep(1) diff --git a/sempy_labs/RefreshSemanticModel.py b/sempy_labs/RefreshSemanticModel.py index 599bbb7f..747919fa 100644 --- a/sempy_labs/RefreshSemanticModel.py +++ b/sempy_labs/RefreshSemanticModel.py @@ -1,18 +1,22 @@ import sempy import sempy.fabric as fabric import time -from .HelperFunctions import resolve_dataset_id +from ._helper_functions import resolve_dataset_id from typing import List, Optional, Union from sempy._utils._log import log +import sempy_labs._icons as icons -green_dot = '\U0001F7E2' -yellow_dot = '\U0001F7E1' -red_dot = '\U0001F534' -in_progress = '⌛' @log -def refresh_semantic_model(dataset: str, tables: Optional[Union[str, List[str]]] = None, partitions: Optional[Union[str, List[str]]] = None, refresh_type: Optional[str] = None, retry_count: Optional[int] = 0, apply_refresh_policy: Optional[bool] = True, workspace: Optional[str] = None): - +def refresh_semantic_model( + dataset: str, + tables: Optional[Union[str, List[str]]] = None, + partitions: Optional[Union[str, List[str]]] = None, + refresh_type: Optional[str] = None, + retry_count: Optional[int] = 0, + apply_refresh_policy: Optional[bool] = True, + workspace: Optional[str] = None, +): """ Refreshes a semantic model. @@ -37,7 +41,7 @@ def refresh_semantic_model(dataset: str, tables: Optional[Union[str, List[str]]] Returns ------- - + """ if workspace == None: @@ -45,7 +49,7 @@ def refresh_semantic_model(dataset: str, tables: Optional[Union[str, List[str]]] workspace = fabric.resolve_workspace_name(workspace_id) if refresh_type is None: - refresh_type = 'full' + refresh_type = "full" if isinstance(tables, str): tables = [tables] @@ -57,6 +61,7 @@ def refresh_semantic_model(dataset: str, tables: Optional[Union[str, List[str]]] if tables is not None: objects = objects + [{"table": table} for table in tables] if partitions is not None: + def extract_names(partition): parts = partition.split("[") table_name = parts[0].strip("'") @@ -65,43 +70,79 @@ def extract_names(partition): objects = objects + [extract_names(partition) for partition in partitions] - refresh_type = refresh_type.lower().replace('only', 'Only').replace('values', 'Values') + refresh_type = ( + refresh_type.lower().replace("only", "Only").replace("values", "Values") + ) - refreshTypes = ['full', 'automatic', 'dataOnly', 'calculate', 'clearValues', 'defragment'] + refreshTypes = [ + "full", + "automatic", + "dataOnly", + "calculate", + "clearValues", + "defragment", + ] if refresh_type not in refreshTypes: - print(f"{red_dot} Invalid refresh type. 
Refresh type must be one of these values: {refreshTypes}.") + print( + f"{icons.red_dot} Invalid refresh type. Refresh type must be one of these values: {refreshTypes}." + ) return - + if len(objects) == 0: - requestID = fabric.refresh_dataset(dataset = dataset, workspace = workspace, refresh_type = refresh_type, retry_count = retry_count, apply_refresh_policy = apply_refresh_policy) + requestID = fabric.refresh_dataset( + dataset=dataset, + workspace=workspace, + refresh_type=refresh_type, + retry_count=retry_count, + apply_refresh_policy=apply_refresh_policy, + ) else: - requestID = fabric.refresh_dataset(dataset = dataset, workspace = workspace, refresh_type = refresh_type, retry_count = retry_count, apply_refresh_policy = apply_refresh_policy, objects = objects) - print(f"{in_progress} Refresh of the '{dataset}' semantic model within the '{workspace}' workspace is in progress...") + requestID = fabric.refresh_dataset( + dataset=dataset, + workspace=workspace, + refresh_type=refresh_type, + retry_count=retry_count, + apply_refresh_policy=apply_refresh_policy, + objects=objects, + ) + print( + f"{icons.in_progress} Refresh of the '{dataset}' semantic model within the '{workspace}' workspace is in progress..." + ) if len(objects) != 0: print(objects) while True: - requestDetails = fabric.get_refresh_execution_details(dataset = dataset,refresh_request_id = requestID, workspace = workspace) + requestDetails = fabric.get_refresh_execution_details( + dataset=dataset, refresh_request_id=requestID, workspace=workspace + ) status = requestDetails.status # Check if the refresh has completed - if status == 'Completed': + if status == "Completed": break - elif status == 'Failed': - print(f"{red_dot} The refresh of the '{dataset}' semantic model within the '{workspace}' workspace has failed.") + elif status == "Failed": + print( + f"{icons.red_dot} The refresh of the '{dataset}' semantic model within the '{workspace}' workspace has failed." + ) return - elif status == 'Cancelled': - print(f"{yellow_dot} The refresh of the '{dataset}' semantic model within the '{workspace}' workspace has been cancelled.") + elif status == "Cancelled": + print( + f"{icons.yellow_dot} The refresh of the '{dataset}' semantic model within the '{workspace}' workspace has been cancelled." + ) return time.sleep(3) - print(f"{green_dot} Refresh of the '{dataset}' semantic model within the '{workspace}' workspace is complete.") + print( + f"{icons.green_dot} Refresh of the '{dataset}' semantic model within the '{workspace}' workspace is complete." + ) -@log -def cancel_dataset_refresh(dataset: str, request_id: Optional[str] = None, workspace: Optional[str] = None): +@log +def cancel_dataset_refresh( + dataset: str, request_id: Optional[str] = None, workspace: Optional[str] = None +): """ Cancels the refresh of a semantic model which was executed via the [Enhanced Refresh API](https://learn.microsoft.com/power-bi/connect-data/asynchronous-refresh). @@ -110,7 +151,7 @@ def cancel_dataset_refresh(dataset: str, request_id: Optional[str] = None, works dataset : str Name of the semantic model. request_id : str, default=None - The request id of a semantic model refresh. + The request id of a semantic model refresh. Defaults to finding the latest active refresh of the semantic model. workspace : str, default=None The Fabric workspace name. 
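A combined usage sketch for refresh_semantic_model and cancel_dataset_refresh above, assuming both are importable from sempy_labs.RefreshSemanticModel; the dataset, table, and workspace names are placeholders.

# Hypothetical example: run a full refresh of selected tables, then cancel the
# latest active Enhanced Refresh API request if needed.
from sempy_labs.RefreshSemanticModel import refresh_semantic_model, cancel_dataset_refresh

refresh_semantic_model(
    dataset="Sales Model",
    tables=["Sales", "Customer"],
    refresh_type="full",
    workspace="Sales Workspace",
)
cancel_dataset_refresh(dataset="Sales Model", workspace="Sales Workspace")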
@@ -119,31 +160,36 @@ def cancel_dataset_refresh(dataset: str, request_id: Optional[str] = None, works Returns ------- - - """ + + """ if workspace == None: workspace_id = fabric.get_workspace_id() workspace = fabric.resolve_workspace_name(workspace_id) else: workspace_id = fabric.resolve_workspace_id(workspace) - - rr = fabric.list_refresh_requests(dataset = dataset, workspace = workspace) - rr_filt = rr[rr['Status'] == 'Unknown'] + + rr = fabric.list_refresh_requests(dataset=dataset, workspace=workspace) + rr_filt = rr[rr["Status"] == "Unknown"] if request_id == None: if len(rr_filt) == 0: - print(f"{red_dot} There are no active Enhanced API refreshes of the '{dataset}' semantic model within the '{workspace}' workspace.") + print( + f"{icons.red_dot} There are no active Enhanced API refreshes of the '{dataset}' semantic model within the '{workspace}' workspace." + ) return - request_id = rr_filt['Request Id'].iloc[0] - - dataset_id = resolve_dataset_id(dataset = dataset, workspace = workspace) + request_id = rr_filt["Request Id"].iloc[0] + + dataset_id = resolve_dataset_id(dataset=dataset, workspace=workspace) client = fabric.PowerBIRestClient() - response = client.delete(f"/v1.0/myorg/groups/{workspace_id}/datasets/{dataset_id}/refreshes/{request_id}") + response = client.delete( + f"/v1.0/myorg/groups/{workspace_id}/datasets/{dataset_id}/refreshes/{request_id}" + ) if response.status_code == 200: - print(f"{green_dot} The '{request_id}' refresh request for the '{dataset}' semantic model within the '{workspace}' workspace has been cancelled.") + print( + f"{icons.green_dot} The '{request_id}' refresh request for the '{dataset}' semantic model within the '{workspace}' workspace has been cancelled." + ) else: print(response.status_code) - diff --git a/sempy_labs/ReportFunctions.py b/sempy_labs/ReportFunctions.py deleted file mode 100644 index 6c6d3b52..00000000 --- a/sempy_labs/ReportFunctions.py +++ /dev/null @@ -1,742 +0,0 @@ -import sempy -import sempy.fabric as fabric -import pandas as pd -import json, os, time, base64, copy, re -from anytree import Node, RenderTree -from powerbiclient import Report -from synapse.ml.services import Translate -from pyspark.sql.functions import col, flatten -from pyspark.sql import SparkSession -from .GenerateReport import update_report_from_reportjson -from .Translations import language_validate -from .Lakehouse import lakehouse_attached -from .HelperFunctions import generate_embedded_filter, resolve_dataset_name, resolve_report_id, resolve_lakehouse_name -from typing import List, Optional, Union -from sempy._utils._log import log - -green_dot = '\U0001F7E2' -yellow_dot = '\U0001F7E1' -red_dot = '\U0001F534' -in_progress = '⌛' - -def get_report_json(report: str, workspace: Optional[str] = None, save_to_file_name: Optional[str] = None): - - """ - Gets the report.json file content of a Power BI report. - - Parameters - ---------- - report : str - Name of the Power BI report. - workspace : str, default=None - The Fabric workspace name. - Defaults to None which resolves to the workspace of the attached lakehouse - or if no lakehouse attached, resolves to the workspace of the notebook. - save_to_file_name : str, default=None - Specifying this parameter will save the report.json file to the lakehouse attached to the notebook with the file name of this parameter. - - Returns - ------- - str - The report.json file for a given Power BI report. 
- """ - - if workspace == None: - workspace_id = fabric.get_workspace_id() - workspace = fabric.resolve_workspace_name(workspace_id) - else: - workspace_id = fabric.resolve_workspace_id(workspace) - - client = fabric.FabricRestClient() - - dfI = fabric.list_items(workspace = workspace, type = 'Report') - dfI_filt = dfI[(dfI['Display Name'] == report)] - - if len(dfI_filt) == 0: - print(f"{red_dot} The '{report}' report does not exist in the '{workspace}' workspace.") - return - - itemId = dfI_filt['Id'].iloc[0] - response = client.post(f"/v1/workspaces/{workspace_id}/items/{itemId}/getDefinition") - df_items = pd.json_normalize(response.json()['definition']['parts']) - df_items_filt = df_items[df_items['path'] == 'report.json'] - payload = df_items_filt['payload'].iloc[0] - - reportFile = base64.b64decode(payload).decode('utf-8') - reportJson = json.loads(reportFile) - - if save_to_file_name is not None: - lakeAttach = lakehouse_attached() - if lakeAttach == False: - print(f"{red_dot} In order to save the report.json file, a lakehouse must be attached to the notebook. Please attach a lakehouse to this notebook.") - return - - lakehouse_id = fabric.get_lakehouse_id() - lakehouse = resolve_lakehouse_name(lakehouse_id, workspace) - folderPath = '/lakehouse/default/Files' - fileExt = '.json' - if not save_to_file_name.endswith(fileExt): - save_to_file_name = save_to_file_name + fileExt - filePath = os.path.join(folderPath, save_to_file_name) - with open(filePath, "w") as json_file: - json.dump(reportJson, json_file, indent=4) - print(f"{green_dot} The report.json file for the '{report}' report has been saved to the '{lakehouse}' in this location: '{filePath}'.\n\n") - - return reportJson - -def report_dependency_tree(workspace: Optional[str] = None): - - """ - Prints a dependency between reports and semantic models. - - Parameters - ---------- - workspace : str, default=None - The Fabric workspace name. - Defaults to None which resolves to the workspace of the attached lakehouse - or if no lakehouse attached, resolves to the workspace of the notebook. 
- - Returns - ------- - - """ - - if workspace == None: - workspaceId = fabric.get_workspace_id() - workspace = fabric.resolve_workspace_name(workspaceId) - - dfR = fabric.list_reports(workspace = workspace) - dfD = fabric.list_datasets(workspace = workspace) - dfR = pd.merge(dfR, dfD[['Dataset ID', 'Dataset Name']], left_on = 'Dataset Id', right_on = 'Dataset ID', how = 'left') - dfR.rename(columns={'Name': 'Report Name'}, inplace=True) - dfR = dfR[['Report Name', 'Dataset Name']] - - report_icon = '\U0001F4F6' - dataset_icon = '\U0001F9CA' - workspace_icon = '\U0001F465' - - node_dict = {} - rootNode = Node(workspace) - node_dict[workspace] = rootNode - rootNode.custom_property = workspace_icon + ' ' - - for i, r in dfR.iterrows(): - datasetName = r['Dataset Name'] - reportName = r['Report Name'] - parentNode = node_dict.get(datasetName) - if parentNode is None: - parentNode = Node(datasetName, parent = rootNode) - node_dict[datasetName] = parentNode - parentNode.custom_property = dataset_icon + ' ' - - child_node = Node(reportName, parent=parentNode) - child_node.custom_property = report_icon + ' ' - - # Print the tree structure - for pre, _, node in RenderTree(node_dict[workspace]): - print(f"{pre}{node.custom_property}'{node.name}'") - -@log -def export_report(report: str, export_format: str, file_name: Optional[str] = None, bookmark_name: Optional[str] = None, page_name: Optional[str] = None, visual_name: Optional[str] = None, report_filter: Optional[str] = None, workspace: Optional[str] = None): - - """ - Exports a Power BI report to a file in your lakehouse. - - Parameters - ---------- - report : str - Name of the Power BI report. - export_format : str - The format in which to export the report. See this link for valid formats: https://learn.microsoft.com/rest/api/power-bi/reports/export-to-file-in-group#fileformat. For image formats, enter the file extension in this parameter, not 'IMAGE'. - file_name : str, default=None - The name of the file to be saved within the lakehouse. Do not include the file extension. Defaults ot the reportName parameter value. - bookmark_name : str, default=None - The name (GUID) of a bookmark within the report. - page_name : str, default=None - The name (GUID) of the report page. - visual_name : str, default=None - The name (GUID) of a visual. If you specify this parameter you must also specify the page_name parameter. - report_filter : str, default=None - A report filter to be applied when exporting the report. Syntax is user-friendly. See above for examples. - workspace : str, default=None - The Fabric workspace name. - Defaults to None which resolves to the workspace of the attached lakehouse - or if no lakehouse attached, resolves to the workspace of the notebook. - - Returns - ------- - - """ - - #https://learn.microsoft.com/rest/api/power-bi/reports/export-to-file-in-group - - lakeAttach = lakehouse_attached() - - if lakeAttach == False: - print(f"{red_dot} In order to run the 'export_report' function, a lakehouse must be attached to the notebook. 
Please attach a lakehouse to this notebook.") - return - - if workspace == None: - workspace_id = fabric.get_workspace_id() - workspace = fabric.resolve_workspace_name(workspace_id) - else: - workspace_id = fabric.resolve_workspace_id(workspace) - - if isinstance(page_name,str): - page_name = [page_name] - if isinstance(visual_name,str): - visual_name = [visual_name] - - if bookmark_name is not None and (page_name is not None or visual_name is not None): - print(f"{red_dot} If the 'bookmark_name' parameter is set, the 'page_name' and 'visual_name' parameters must not be set.") - return - if visual_name is not None and page_name is None: - print(f"{red_dot} If the 'visual_name' parameter is set, the 'page_name' parameter must be set.") - return - - validFormats = { - 'ACCESSIBLEPDF': '.pdf', - 'CSV': '.csv', - 'DOCX': '.docx', - 'MHTML': '.mhtml', - 'PDF': '.pdf', - 'PNG': '.png', - 'PPTX': '.pptx', - 'XLSX': '.xlsx', - 'XML': '.xml', - 'BMP': '.bmp', - 'EMF': '.emf', - 'GIF': '.gif', - 'JPEG': '.jpeg', - 'TIFF': '.tiff' - } - - export_format = export_format.upper() - if export_format not in validFormats: - print(f"{red_dot} The '{export_format}' format is not a valid format for exporting Power BI reports. Please enter a valid format. Options: {validFormats}") - return - - fileExt = validFormats.get(export_format) - - if file_name == None: - file_name = report + fileExt - else: - file_name = file_name + fileExt - - folderPath = '/lakehouse/default/Files' - filePath = os.path.join(folderPath, file_name) - - dfI = fabric.list_items(workspace = workspace) - dfI_filt = dfI[(dfI['Type'].isin(['Report', 'PaginatedReport'])) & (dfI['Display Name'] == report)] - - if len(dfI_filt) == 0: - print(f"{red_dot} The '{report}' report does not exist in the '{workspace}' workspace.") - return - - reportType = dfI_filt['Type'].iloc[0] - - # Limitations - pbiOnly = ['PNG'] - paginatedOnly = ['ACCESSIBLEPDF','CSV','DOCX', 'BMP', 'EMF', 'GIF', 'JPEG', 'TIFF', 'MHTML', 'XLSX', 'XML'] - - if reportType == 'Report' and export_format in paginatedOnly: - print(f"{red_dot} The '{export_format}' format is only supported for paginated reports.") - return - if reportType == 'PaginatedReport' and export_format in pbiOnly: - print(f"{red_dot} The '{export_format}' format is only supported for Power BI reports.") - return - - if reportType == 'PaginatedReport' and (bookmark_name is not None or page_name is not None or visual_name is not None): - print(f"{red_dot} Export for paginated reports does not support bookmarks/pages/visuals. 
Those parameters must not be set for paginated reports.") - return - - reportId = dfI_filt['Id'].iloc[0] - client = fabric.PowerBIRestClient() - - dfVisual = list_report_visuals(report = report, workspace = workspace) - dfPage = list_report_pages(report = report, workspace = workspace) - - if export_format in ['BMP', 'EMF', 'GIF', 'JPEG', 'TIFF'] and reportType == 'PaginatedReport': - request_body = { - 'format': 'IMAGE', - 'paginatedReportConfiguration': { - 'formatSettings': { - 'OutputFormat': export_format.lower() - } - } - } - elif bookmark_name is None and page_name is None and visual_name is None: - request_body = { - 'format': export_format - } - elif bookmark_name is not None: - if reportType == 'Report': - request_body = { - 'format': export_format, - 'powerBIReportConfiguration': { - 'defaultBookmark': { - 'name': bookmark_name - } - } - } - elif page_name is not None and visual_name is None: - if reportType == 'Report': - request_body = { - 'format': export_format, - 'powerBIReportConfiguration': { - } - } - - request_body['powerBIReportConfiguration']['pages'] = [] - - for page in page_name: - dfPage_filt = dfPage[dfPage['Page ID'] == page] - if len(dfPage_filt) == 0: - print(f"{red_dot} The '{page}' page does not exist in the '{report}' report within the '{workspace}' workspace.") - return - page_dict = {'pageName': page} - request_body['powerBIReportConfiguration']['pages'].append(page_dict) - - elif page_name is not None and visual_name is not None: - if len(page_name) != len(visual_name): - print(f"{red_dot} Each 'visual_name' must map to a single 'page_name'.") - return - if reportType == 'Report': - request_body = { - 'format': export_format, - 'powerBIReportConfiguration': { - } - } - - request_body['powerBIReportConfiguration']['pages'] = [] - a=0 - for page in page_name: - visual = visual_name[a] - dfVisual_filt = dfVisual[(dfVisual['Page ID'] == page) & (dfVisual['Visual ID'] == visual)] - if len(dfVisual_filt) == 0: - print(f"{red_dot} The '{visual}' visual does not exist on the '{page}' in the '{report}' report within the '{workspace}' workspace.") - return - page_dict = {'pageName': page,'visualName': visual} - request_body['powerBIReportConfiguration']['pages'].append(page_dict) - a+=1 - - # Transform and add report filter if it is specified - if report_filter is not None and reportType == 'Report': - reportFilter = generate_embedded_filter(filter = report_filter) - report_level_filter = {'filter': reportFilter} - - if 'powerBIReportConfiguration' not in request_body: - request_body['powerBIReportConfiguration'] = {} - request_body['powerBIReportConfiguration']['reportLevelFilters'] = [report_level_filter] - print(request_body) - response = client.post(f"/v1.0/myorg/groups/{workspace_id}/reports/{reportId}/ExportTo",json=request_body) - if response.status_code == 202: - response_body = json.loads(response.content) - exportId = response_body['id'] - response = client.get(f"/v1.0/myorg/groups/{workspace_id}/reports/{reportId}/exports/{exportId}") - response_body = json.loads(response.content) - while response_body['status'] not in ['Succeeded', 'Failed']: - time.sleep(3) - response = client.get(f"/v1.0/myorg/groups/{workspace_id}/reports/{reportId}/exports/{exportId}") - response_body = json.loads(response.content) - if response_body['status'] == 'Failed': - print(f"{red_dot} The export for the '{report}' report within the '{workspace}' workspace in the '{export_format}' format has failed.") - else: - response = 
client.get(f"/v1.0/myorg/groups/{workspace_id}/reports/{reportId}/exports/{exportId}/file") - print(f"{in_progress} Saving the '{export_format}' export for the '{report}' report within the '{workspace}' workspace to the lakehouse...") - with open(filePath, "wb") as export_file: - export_file.write(response.content) - print(f"{green_dot} The '{export_format}' export for the '{report}' report within the '{workspace}' workspace has been saved to the following location: '{filePath}'.") - - -def clone_report(report: str, cloned_report: str, workspace: Optional[str] = None, target_workspace: Optional[str] = None, target_dataset: Optional[str] = None): - - """ - Clones a Power BI report. - - Parameters - ---------- - report : str - Name of the Power BI report. - cloned_report : str - Name of the new Power BI report. - workspace : str, default=None - The Fabric workspace name. - Defaults to None which resolves to the workspace of the attached lakehouse - or if no lakehouse attached, resolves to the workspace of the notebook. - target_workspace : str, default=None - The name of the Fabric workspace to place the cloned report. - Defaults to None which resolves to the workspace of the attached lakehouse - or if no lakehouse attached, resolves to the workspace of the notebook. - target_dataset : str, default=None - The name of the semantic model to be used by the cloned report. - Defaults to None which resolves to the semantic model used by the initial report. - - Returns - ------- - - """ - - #https://learn.microsoft.com/rest/api/power-bi/reports/clone-report-in-group - - if workspace == None: - workspace_id = fabric.get_workspace_id() - workspace = fabric.resolve_workspace_name(workspace_id) - else: - workspace_id = fabric.resolve_workspace_id(workspace) - - dfI = fabric.list_items(workspace = workspace, type = 'Report') - dfI_filt = dfI[(dfI['Display Name'] == report)] - - if len(dfI_filt) == 0: - print(f"{red_dot} The '{report}' report does not exist within the '{workspace}' workspace.") - return - - reportId = resolve_report_id(report, workspace) - - if target_workspace is None: - target_workspace = workspace - target_workspace_id = workspace_id - else: - dfW = fabric.list_workspaces() - dfW_filt = dfW[dfW['Name'] == target_workspace] - - if len(dfW_filt) == 0: - print(f"{red_dot} The '{workspace}' is not a valid workspace.") - return - target_workspace_id = dfW_filt['Id'].iloc[0] - - if target_dataset == None: - dfR = fabric.list_reports(workspace = target_workspace) - dfR_filt = dfR[dfR['Name'] == report] - target_dataset_id = dfR_filt['Dataset Id'].iloc[0] - target_dataset = resolve_dataset_name(dataset_id = target_dataset_id, workspace = target_workspace) - else: - dfD = fabric.list_datasets(workspace = target_workspace) - dfD_filt = dfD[dfD['Dataset Name'] == target_dataset] - - if len(dfD_filt) == 0: - print(f"{red_dot} The '{target_dataset}' target dataset does not exist in the '{target_workspace}' workspace.") - return - target_dataset_id = dfD_filt['Dataset Id'].iloc[0] - - client = fabric.PowerBIRestClient() - - if target_workspace is None and target_dataset is None: - request_body = { - "name": cloned_report - } - elif target_workspace is not None and target_dataset is None: - request_body = { - "name": cloned_report, - "targetWorkspaceId": target_workspace_id - } - elif target_workspace is not None and target_dataset is not None: - request_body = { - "name": cloned_report, - "targetModelId": target_dataset_id, - "targetWorkspaceId": target_workspace_id - } - elif target_workspace is 
None and target_dataset is not None: - request_body = { - "name": cloned_report, - "targetModelId": target_dataset_id - } - - response = client.post(f"/v1.0/myorg/groups/{workspace_id}/reports/{reportId}/Clone",json=request_body) - - if response.status_code == 200: - print(f"{green_dot} The '{report}' report has been successfully cloned as the '{cloned_report}' report within the '{target_workspace}' workspace using the '{target_dataset}' semantic model.") - else: - print(f"{red_dot} POST request failed with status code: {response.status_code}") - -def launch_report(report: str, workspace: Optional[str] = None): - - """ - Shows a Power BI report within a Fabric notebook. - - Parameters - ---------- - report : str - Name of the Power BI report. - workspace : str, default=None - The Fabric workspace name. - Defaults to None which resolves to the workspace of the attached lakehouse - or if no lakehouse attached, resolves to the workspace of the notebook. - - Returns - ------- - str - An embedded Power BI report within the notebook. - """ - - from .HelperFunctions import resolve_report_id - - if workspace == None: - workspace_id = fabric.get_workspace_id() - workspace = fabric.resolve_workspace_name(workspace_id) - else: - workspace_id = fabric.resolve_workspace_id(workspace) - - reportId = resolve_report_id(report, workspace) - - report = Report(group_id=workspace_id, report_id=reportId) - - return report - -def list_report_pages(report: str, workspace: Optional[str] = None): - - """ - Shows the properties of all pages within a Power BI report. - - Parameters - ---------- - report : str - Name of the Power BI report. - workspace : str, default=None - The Fabric workspace name. - Defaults to None which resolves to the workspace of the attached lakehouse - or if no lakehouse attached, resolves to the workspace of the notebook. - - Returns - ------- - pandas.DataFrame - A pandas dataframe showing the pages within a Power BI report and their properties. - """ - - if workspace == None: - workspace_id = fabric.get_workspace_id() - workspace = fabric.resolve_workspace_name(workspace_id) - - df = pd.DataFrame(columns=['Page ID', 'Page Name', 'Hidden', 'Width', 'Height', 'Visual Count']) - - reportJson = get_report_json(report = report, workspace = workspace) - - for section in reportJson['sections']: - pageID = section['name'] - pageName = section['displayName'] - #pageFilters = section['filters'] - pageWidth = section['width'] - pageHeight = section['height'] - visualCount = len(section['visualContainers']) - pageHidden = False - pageConfig = section['config'] - pageConfigJson = json.loads(pageConfig) - - try: - pageH = pageConfigJson['visibility'] - if pageH == 1: - pageHidden = True - except: - pass - - new_data = {'Page ID': pageID, 'Page Name': pageName, 'Hidden': pageHidden, 'Width': pageWidth, 'Height': pageHeight, 'Visual Count': visualCount} - df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) - - df['Hidden'] = df['Hidden'].astype(bool) - intCol = ['Width', 'Height', 'Visual Count'] - df[intCol] = df[intCol].astype(int) - - return df - -def list_report_visuals(report: str, workspace: Optional[str] = None): - - """ - Shows the properties of all visuals within a Power BI report. - - Parameters - ---------- - report : str - Name of the Power BI report. - workspace : str, default=None - The Fabric workspace name. - Defaults to None which resolves to the workspace of the attached lakehouse - or if no lakehouse attached, resolves to the workspace of the notebook. 
- - Returns - ------- - pandas.DataFrame - A pandas dataframe showing the visuals within a Power BI report and their properties. - """ - - if workspace == None: - workspace_id = fabric.get_workspace_id() - workspace = fabric.resolve_workspace_name(workspace_id) - - reportJson = get_report_json(report = report, workspace = workspace) - - df = pd.DataFrame(columns=['Page Name', 'Page ID', 'Visual ID', 'Title']) - - for section in reportJson['sections']: - pageID = section['name'] - pageName = section['displayName'] - - for visual in section['visualContainers']: - visualConfig = visual['config'] - visualConfigJson = json.loads(visualConfig) - visualID = visualConfigJson['name'] - - try: - title = visualConfigJson["singleVisual"]["vcObjects"]["title"][0]["properties"]["text"]["expr"]["Literal"]["Value"] - title = title[1:-1] - except: - title = '' - - new_data = {'Page Name': pageName, 'Page ID': pageID, 'Visual ID': visualID, 'Title': title} - df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) - - return df - -def list_report_bookmarks(report: str, workspace: Optional[str] = None): - - """ - Shows the properties of all bookmarks within a Power BI report. - - Parameters - ---------- - report : str - Name of the Power BI report. - workspace : str, default=None - The Fabric workspace name. - Defaults to None which resolves to the workspace of the attached lakehouse - or if no lakehouse attached, resolves to the workspace of the notebook. - - Returns - ------- - pandas.DataFrame - A pandas dataframe showing the bookmarks within a Power BI report and their properties. - """ - - if workspace == None: - workspace_id = fabric.get_workspace_id() - workspace = fabric.resolve_workspace_name(workspace_id) - - df = pd.DataFrame(columns=['Bookmark ID', 'Bookmark Name', 'Page ID', 'Visual ID', 'Visual Hidden']) - - reportJson = get_report_json(report = report, workspace = workspace) - reportConfig = reportJson['config'] - reportConfigJson = json.loads(reportConfig) - - try: - for bookmark in reportConfigJson['bookmarks']: - bID = bookmark['name'] - bName = bookmark['displayName'] - rptPageId = bookmark['explorationState']['activeSection'] - - for rptPg in bookmark['explorationState']['sections']: - for vc in bookmark['explorationState']['sections'][rptPg]['visualContainers']: - vHidden = False - try: - hidden = bookmark['explorationState']['sections'][rptPg]['visualContainers'][vc]['singleVisual']['display']['mode'] - if hidden == 'hidden': - vHidden = True - except: - pass - - new_data = {'Bookmark ID': bID, 'Bookmark Name': bName, 'Page ID': rptPageId, 'Visual ID': vc, 'Visual Hidden': vHidden } - df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) - - listPages = list_report_pages(report = report, workspace = workspace) - - df = pd.merge(df, listPages[['Page ID', 'Page Name']], on='Page ID', how='left') - df = df[['Bookmark ID', 'Bookmark Name', 'Page ID', 'Page Name', 'Visual ID', 'Visual Hidden']] - - return df - - except: - print(f"The '{report}' report within the '{workspace}' workspace has no bookmarks.") - -def translate_report_titles(report: str, languages: Union[str,List[str]], workspace: Optional[str] = None): - - """ - Dynamically generates new Power BI reports which have report titles translated into the specified language(s). - - Parameters - ---------- - report : str - Name of the Power BI report. - languages : str, List[str] - The language code(s) in which to translate the report titles. 
- workspace : str, default=None - The Fabric workspace name. - Defaults to None which resolves to the workspace of the attached lakehouse - or if no lakehouse attached, resolves to the workspace of the notebook. - - Returns - ------- - - """ - - if isinstance(languages, str): - languages = [languages] - - for lang in languages: - language_validate(lang) - - reportJson = get_report_json(report = report, workspace = workspace) - dfV = list_report_visuals(report = report, workspace = workspace) - spark = SparkSession.builder.getOrCreate() - df = spark.createDataFrame(dfV) - columnToTranslate = 'Title' - - translate = ( - Translate() - .setTextCol(columnToTranslate) - .setToLanguage(languages) - .setOutputCol("translation") - .setConcurrency(5) - ) - - transDF = (translate - .transform(df) - .withColumn("translation", flatten(col("translation.translations"))) - .withColumn("translation", col("translation.text")) - .select('Visual ID', columnToTranslate, 'translation')) - - df_panda = transDF.toPandas() - - i=0 - for lang in languages: - #Clone report - language = language_validate(lang) - clonedReportName = f"{report}_{language}" - - dfRep = fabric.list_reports(workspace = workspace) - dfRep_filt = dfRep[(dfRep['Name'] == clonedReportName) & (dfRep['Report Type'] == 'PowerBIReport')] - - if len(dfRep_filt) > 0: - print(f"{yellow_dot} The '{clonedReportName}' report already exists in the '{workspace} workspace.") - else: - clone_report(report = report, cloned_report = clonedReportName, workspace = workspace) - print(f"{green_dot} The '{clonedReportName}' report has been created via clone in the '{workspace} workspace.") - - rptJsonTr = copy.deepcopy(reportJson) - - # Update report json file - for section in rptJsonTr['sections']: - for visual in section['visualContainers']: - visualConfig = visual['config'] - visualConfigJson = json.loads(visualConfig) - visualID = visualConfigJson['name'] - - df_filt = df_panda[(df_panda['Visual ID'] == visualID) & (df_panda['Title'] != '')] - - if len(df_filt) == 1: - tr = df_filt['translation'].str[i].iloc[0] - if len(tr) > 0: - prop = visualConfigJson["singleVisual"]["vcObjects"]["title"][0]["properties"]["text"]["expr"]["Literal"] - prop['Value'] = f"'{tr}'" - - visual['config'] = json.dumps(visualConfigJson) - - i+=1 - - # Post updated report json file to cloned report - update_report_from_reportjson(report = clonedReportName, report_json = rptJsonTr, workspace = workspace) - print(f"{green_dot} The visual titles within the '{clonedReportName}' report within the '{workspace}' have been translated into '{language}' accordingly.") - - - - - - \ No newline at end of file diff --git a/sempy_labs/ShowUnsupportedDirectLakeObjects.py b/sempy_labs/ShowUnsupportedDirectLakeObjects.py deleted file mode 100644 index 0f4277a0..00000000 --- a/sempy_labs/ShowUnsupportedDirectLakeObjects.py +++ /dev/null @@ -1,68 +0,0 @@ -import sempy -import sempy.fabric as fabric -import pandas as pd -from .ListFunctions import list_tables -from .HelperFunctions import format_dax_object_name -from typing import List, Optional, Union - -def show_unsupported_direct_lake_objects(dataset: str, workspace: Optional[str] = None): - - """ - Returns a list of a semantic model's objects which are not supported by Direct Lake based on [official documentation](https://learn.microsoft.com/power-bi/enterprise/directlake-overview#known-issues-and-limitations). - - Parameters - ---------- - dataset : str - Name of the semantic model. - workspace : str, default=None - The Fabric workspace name. 
- Defaults to None which resolves to the workspace of the attached lakehouse - or if no lakehouse attached, resolves to the workspace of the notebook. - - Returns - ------- - pandas.DataFrame, pandas.DataFrame, pandas.DataFrame - 3 pandas dataframes showing objects in a semantic model which are not supported by Direct Lake. - """ - - pd.options.mode.chained_assignment = None - - if workspace == None: - workspace_id = fabric.get_workspace_id() - workspace = fabric.resolve_workspace_name(workspace_id) - - dfT = list_tables(dataset, workspace) - dfC = fabric.list_columns(dataset = dataset, workspace = workspace) - dfR = fabric.list_relationships(dataset = dataset, workspace = workspace) - - # Calc tables - dfT_filt = dfT[dfT['Type'] == 'Calculated Table'] - dfT_filt.rename(columns={'Name': 'Table Name'}, inplace=True) - t = dfT_filt[['Table Name', 'Type']] - - # Calc columns - dfC_filt = dfC[(dfC['Type'] == 'Calculated') | (dfC['Data Type'] == 'Binary')] - c = dfC_filt[['Table Name', 'Column Name', 'Type', 'Data Type', 'Source']] - - # Relationships - dfC['Column Object'] = format_dax_object_name(dfC['Table Name'], dfC['Column Name']) - dfR['From Object'] = format_dax_object_name(dfR['From Table'], dfR['From Column']) - dfR['To Object'] = format_dax_object_name(dfR['To Table'], dfR['To Column']) - merged_from = pd.merge(dfR, dfC, left_on='From Object', right_on='Column Object', how='left') - merged_to = pd.merge(dfR, dfC, left_on='To Object', right_on='Column Object', how='left') - - dfR['From Column Data Type'] = merged_from['Data Type'] - dfR['To Column Data Type'] = merged_to['Data Type'] - - dfR_filt = dfR[((dfR['From Column Data Type'] == 'DateTime') | (dfR['To Column Data Type'] == 'DateTime')) | (dfR['From Column Data Type'] != dfR['To Column Data Type'])] - r = dfR_filt[['From Table', 'From Column', 'To Table', 'To Column', 'From Column Data Type', 'To Column Data Type']] - - #print('Calculated Tables are not supported...') - #display(t) - #print("Learn more about Direct Lake limitations here: https://learn.microsoft.com/power-bi/enterprise/directlake-overview#known-issues-and-limitations") - #print('Calculated columns are not supported. 
Columns of binary data type are not supported.') - #display(c) - #print('Columns used for relationship cannot be of data type datetime and they also must be of the same data type.') - #display(r) - - return t, c, r \ No newline at end of file diff --git a/sempy_labs/TOM.py b/sempy_labs/TOM.py index d5a18fff..0237a81c 100644 --- a/sempy_labs/TOM.py +++ b/sempy_labs/TOM.py @@ -3,30 +3,30 @@ import pandas as pd import re from datetime import datetime -from .HelperFunctions import format_dax_object_name -from .ListFunctions import list_relationships +from ._helper_functions import format_dax_object_name +from ._list_functions import list_relationships from .RefreshSemanticModel import refresh_semantic_model -from .Fallback import check_fallback_reason +from ._fallback import check_fallback_reason from contextlib import contextmanager from typing import List, Optional, Union, TYPE_CHECKING from sempy._utils._log import log +import sempy_labs._icons as icons if TYPE_CHECKING: import Microsoft.AnalysisServices.Tabular -green_dot = '\U0001F7E2' -yellow_dot = '\U0001F7E1' -red_dot = '\U0001F534' -in_progress = '⌛' -checked = '\u2611' -unchecked = '\u2610' -start_bold = '\033[1m' -end_bold = '\033[0m' + +checked = "\u2611" +unchecked = "\u2610" +start_bold = "\033[1m" +end_bold = "\033[0m" + @log @contextmanager -def connect_semantic_model(dataset: str, readonly: Optional[bool] = True, workspace: Optional[str] = None): - +def connect_semantic_model( + dataset: str, readonly: Optional[bool] = True, workspace: Optional[str] = None +): """ Connects to the Tabular Object Model (TOM) within a semantic model. @@ -54,18 +54,19 @@ def connect_semantic_model(dataset: str, readonly: Optional[bool] = True, worksp if workspace is None: workspace_id = fabric.get_workspace_id() workspace = fabric.resolve_workspace_name(workspace_id) - + fpAdded = [] class TOMWrapper: def __init__(self, dataset, workspace, readonly): - - tom_server = fabric.create_tom_server(readonly=readonly, workspace=workspace) + + tom_server = fabric.create_tom_server( + readonly=readonly, workspace=workspace + ) self.model = tom_server.Databases.GetByName(dataset).Model def all_columns(self): - """ Outputs a list of all columns within all tables in the semantic model. @@ -84,7 +85,6 @@ def all_columns(self): yield c def all_calculated_columns(self): - """ Outputs a list of all calculated columns within all tables in the semantic model. @@ -103,7 +103,6 @@ def all_calculated_columns(self): yield c def all_calculated_tables(self): - """ Outputs a list of all calculated tables in the semantic model. @@ -121,7 +120,6 @@ def all_calculated_tables(self): yield t def all_calculation_groups(self): - """ Outputs a list of all calculation groups in the semantic model. @@ -139,7 +137,6 @@ def all_calculation_groups(self): yield t def all_measures(self): - """ Outputs a list of all measures in the semantic model. @@ -157,7 +154,6 @@ def all_measures(self): yield m def all_partitions(self): - """ Outputs a list of all partitions in the semantic model. @@ -175,7 +171,6 @@ def all_partitions(self): yield p def all_hierarchies(self): - """ Outputs a list of all hierarchies in the semantic model. @@ -193,7 +188,6 @@ def all_hierarchies(self): yield h def all_levels(self): - """ Outputs a list of all levels in the semantic model. @@ -212,7 +206,6 @@ def all_levels(self): yield l def all_calculation_items(self): - """ Outputs a list of all calculation items in the semantic model. 
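A minimal usage sketch for the connect_semantic_model context manager introduced above, assuming it yields the TOMWrapper instance described in its docstring; the dataset name 'Sales' is a placeholder, and workspace=None resolves as documented:

    from sempy_labs.TOM import connect_semantic_model

    # 'Sales' is a placeholder dataset name, not part of this patch.
    with connect_semantic_model(dataset="Sales", readonly=True, workspace=None) as tom:
        # Iterate every column in every table via the wrapper's generator helpers.
        for c in tom.all_columns():
            print(f"{c.Parent.Name}[{c.Name}]")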
@@ -231,7 +224,6 @@ def all_calculation_items(self): yield ci def all_rls(self): - """ Outputs a list of all row level security expressions in the semantic model. @@ -248,8 +240,16 @@ def all_rls(self): for tp in r.TablePermissions: yield tp - def add_measure(self, table_name: str, measure_name: str, expression: str, format_string: Optional[str] = None, hidden: Optional[bool] = False, description: Optional[str] = None, display_folder: Optional[str] = None): - + def add_measure( + self, + table_name: str, + measure_name: str, + expression: str, + format_string: Optional[str] = None, + hidden: Optional[bool] = False, + description: Optional[str] = None, + display_folder: Optional[str] = None, + ): """ Adds a measure to the semantic model. @@ -276,7 +276,7 @@ def add_measure(self, table_name: str, measure_name: str, expression: str, forma """ obj = TOM.Measure() - obj.Name= measure_name + obj.Name = measure_name obj.Expression = expression obj.IsHidden = hidden if format_string is not None: @@ -288,8 +288,20 @@ def add_measure(self, table_name: str, measure_name: str, expression: str, forma self.model.Tables[table_name].Measures.Add(obj) - def add_calculated_table_column(self, table_name: str, column_name: str, source_column: str, data_type: str, format_string: Optional[str] = None, hidden: Optional[bool] = False, description: Optional[str] = None, display_folder: Optional[str] = None, data_category: Optional[str] = None, key: Optional[bool] = False, summarize_by: Optional[str] = None): - + def add_calculated_table_column( + self, + table_name: str, + column_name: str, + source_column: str, + data_type: str, + format_string: Optional[str] = None, + hidden: Optional[bool] = False, + description: Optional[str] = None, + display_folder: Optional[str] = None, + data_category: Optional[str] = None, + key: Optional[bool] = False, + summarize_by: Optional[str] = None, + ): """ Adds a calculated table column to a calculated table within a semantic model. 
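A sketch of the add_measure signature reformatted above; the dataset, table, measure name, and DAX expression are placeholders for illustration only:

    from sempy_labs.TOM import connect_semantic_model

    # Placeholder dataset/table/measure names and DAX expression.
    with connect_semantic_model(dataset="Sales", readonly=False) as tom:
        tom.add_measure(
            table_name="Sales",
            measure_name="Total Sales",
            expression="SUM(Sales[SalesAmount])",
            format_string="#,0",
            display_folder="KPIs",
        )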
@@ -324,10 +336,18 @@ def add_calculated_table_column(self, table_name: str, column_name: str, source_ """ - data_type = data_type.capitalize().replace('Integer', 'Int64').replace('Datetime', 'DateTime') + data_type = ( + data_type.capitalize() + .replace("Integer", "Int64") + .replace("Datetime", "DateTime") + ) if summarize_by is None: - summarize_by = 'Default' - summarize_by = summarize_by.capitalize().replace('Distinctcount', 'DistinctCount').replace('Avg', 'Average') + summarize_by = "Default" + summarize_by = ( + summarize_by.capitalize() + .replace("Distinctcount", "DistinctCount") + .replace("Avg", "Average") + ) obj = TOM.CalculatedTableColumn() obj.Name = column_name @@ -346,8 +366,20 @@ def add_calculated_table_column(self, table_name: str, column_name: str, source_ obj.DataCategory = data_category self.model.Tables[table_name].Columns.Add(obj) - def add_data_column(self, table_name: str, column_name: str, source_column: str, data_type: str, format_string: Optional[str] = None, hidden: Optional[bool] = False, description: Optional[str] = None, display_folder: Optional[str] = None, data_category: Optional[str] = None, key: Optional[bool] = False, summarize_by: Optional[str] = None): - + def add_data_column( + self, + table_name: str, + column_name: str, + source_column: str, + data_type: str, + format_string: Optional[str] = None, + hidden: Optional[bool] = False, + description: Optional[str] = None, + display_folder: Optional[str] = None, + data_category: Optional[str] = None, + key: Optional[bool] = False, + summarize_by: Optional[str] = None, + ): """ Adds a data column to a table within a semantic model. @@ -382,10 +414,18 @@ def add_data_column(self, table_name: str, column_name: str, source_column: str, """ - data_type = data_type.capitalize().replace('Integer', 'Int64').replace('Datetime', 'DateTime') + data_type = ( + data_type.capitalize() + .replace("Integer", "Int64") + .replace("Datetime", "DateTime") + ) if summarize_by is None: - summarize_by = 'Default' - summarize_by = summarize_by.capitalize().replace('Distinctcount', 'DistinctCount').replace('Avg', 'Average') + summarize_by = "Default" + summarize_by = ( + summarize_by.capitalize() + .replace("Distinctcount", "DistinctCount") + .replace("Avg", "Average") + ) obj = TOM.DataColumn() obj.Name = column_name @@ -404,8 +444,20 @@ def add_data_column(self, table_name: str, column_name: str, source_column: str, obj.DataCategory = data_category self.model.Tables[table_name].Columns.Add(obj) - def add_calculated_column(self, table_name: str, column_name: str, expression: str, data_type: str, format_string: Optional[str] = None, hidden: Optional[bool] = False, description: Optional[str] = None, display_folder: Optional[str] = None, data_category: Optional[str] = None, key: Optional[bool] = False, summarize_by: Optional[str] = None): - + def add_calculated_column( + self, + table_name: str, + column_name: str, + expression: str, + data_type: str, + format_string: Optional[str] = None, + hidden: Optional[bool] = False, + description: Optional[str] = None, + display_folder: Optional[str] = None, + data_category: Optional[str] = None, + key: Optional[bool] = False, + summarize_by: Optional[str] = None, + ): """ Adds a calculated column to a table within a semantic model. 
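A sketch of add_data_column with placeholder names; data_type strings such as 'Integer' or 'Datetime' are normalized to 'Int64'/'DateTime' by the replace() chain shown above:

    from sempy_labs.TOM import connect_semantic_model

    # Placeholder table/column names.
    with connect_semantic_model(dataset="Sales", readonly=False) as tom:
        tom.add_data_column(
            table_name="Sales",
            column_name="Order Date",
            source_column="OrderDate",
            data_type="DateTime",
        )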
@@ -440,10 +492,18 @@ def add_calculated_column(self, table_name: str, column_name: str, expression: s """ - data_type = data_type.capitalize().replace('Integer', 'Int64').replace('Datetime', 'DateTime') + data_type = ( + data_type.capitalize() + .replace("Integer", "Int64") + .replace("Datetime", "DateTime") + ) if summarize_by is None: - summarize_by = 'Default' - summarize_by = summarize_by.capitalize().replace('Distinctcount', 'DistinctCount').replace('Avg', 'Average') + summarize_by = "Default" + summarize_by = ( + summarize_by.capitalize() + .replace("Distinctcount", "DistinctCount") + .replace("Avg", "Average") + ) obj = TOM.CalculatedColumn() obj.Name = column_name @@ -462,8 +522,15 @@ def add_calculated_column(self, table_name: str, column_name: str, expression: s obj.DataCategory = data_category self.model.Tables[table_name].Columns.Add(obj) - def add_calculation_item(self, table_name: str, calculation_item_name: str, expression: str, ordinal: Optional[int] = None, format_string_expression: Optional[str] = None, description: Optional[str] = None): - + def add_calculation_item( + self, + table_name: str, + calculation_item_name: str, + expression: str, + ordinal: Optional[int] = None, + format_string_expression: Optional[str] = None, + description: Optional[str] = None, + ): """ Adds a calculation item to a calculation group within a semantic model. @@ -499,8 +566,12 @@ def add_calculation_item(self, table_name: str, calculation_item_name: str, expr obj.FormatStringDefinition = fsd.Expression = format_string_expression self.model.Tables[table_name].CalculationGroup.CalculationItems.Add(obj) - def add_role(self, role_name: str, model_permission: Optional[str] = None, description: Optional[str] = None): - + def add_role( + self, + role_name: str, + model_permission: Optional[str] = None, + description: Optional[str] = None, + ): """ Adds a role to a semantic model. @@ -520,17 +591,18 @@ def add_role(self, role_name: str, model_permission: Optional[str] = None, descr """ if model_permission is None: - model_permission = 'Read' + model_permission = "Read" obj = TOM.ModelRole() obj.Name = role_name - obj.ModelPermission = System.Enum.Parse(TOM.ModelPermission, model_permission) + obj.ModelPermission = System.Enum.Parse( + TOM.ModelPermission, model_permission + ) if description is not None: obj.Description = description self.model.Roles.Add(obj) def set_rls(self, role_name: str, table_name: str, filter_expression: str): - """ Sets the row level security permissions for a table within a role. @@ -553,12 +625,15 @@ def set_rls(self, role_name: str, table_name: str, filter_expression: str): tp.FilterExpression = filter_expression try: - self.model.Roles[role_name].TablePermissions[table_name].FilterExpression = filter_expression + self.model.Roles[role_name].TablePermissions[ + table_name + ].FilterExpression = filter_expression except: self.model.Roles[role_name].TablePermissions.Add(tp) - def set_ols(self, role_name: str, table_name: str, column_name: str, permission: str): - + def set_ols( + self, role_name: str, table_name: str, column_name: str, permission: str + ): """ Sets the object level security permissions for a column within a role. @@ -580,20 +655,35 @@ def set_ols(self, role_name: str, table_name: str, column_name: str, permission: permission = permission.capitalize() - if permission not in ['Read', 'None', 'Default']: + if permission not in ["Read", "None", "Default"]: print(f"ERROR! 
Invalid 'permission' value.") return cp = TOM.ColumnPermission() cp.Column = self.model.Tables[table_name].Columns[column_name] - cp.MetadataPermission = System.Enum.Parse(TOM.MetadataPermission, permission) + cp.MetadataPermission = System.Enum.Parse( + TOM.MetadataPermission, permission + ) try: - self.model.Roles[role_name].TablePermissions[table_name].ColumnPermissions[column_name].MetadataPermission = System.Enum.Parse(TOM.MetadataPermission, permission) + self.model.Roles[role_name].TablePermissions[ + table_name + ].ColumnPermissions[column_name].MetadataPermission = System.Enum.Parse( + TOM.MetadataPermission, permission + ) except: - self.model.Roles[role_name].TablePermissions[table_name].ColumnPermissions.Add(cp) - - def add_hierarchy(self, table_name: str, hierarchy_name: str, columns: List[str], levels: Optional[List[str]] = None, hierarchy_description: Optional[str] = None, hierarchy_hidden: Optional[bool] = False): - + self.model.Roles[role_name].TablePermissions[ + table_name + ].ColumnPermissions.Add(cp) + + def add_hierarchy( + self, + table_name: str, + hierarchy_name: str, + columns: List[str], + levels: Optional[List[str]] = None, + hierarchy_description: Optional[str] = None, + hierarchy_hidden: Optional[bool] = False, + ): """ Adds a hierarchy to a table within a semantic model. @@ -618,19 +708,25 @@ def add_hierarchy(self, table_name: str, hierarchy_name: str, columns: List[str] """ if isinstance(columns, str): - print(f"The 'levels' parameter must be a list. For example: ['Continent', 'Country', 'City']") + print( + f"The 'levels' parameter must be a list. For example: ['Continent', 'Country', 'City']" + ) return if len(columns) == 1: - print(f"There must be at least 2 levels in order to create a hierarchy.") + print( + f"There must be at least 2 levels in order to create a hierarchy." + ) return - + if levels is None: levels = columns - + if len(columns) != len(levels): - print(f"If specifying level names, you must specify a level for each column.") + print( + f"If specifying level names, you must specify a level for each column." + ) return - + obj = TOM.Hierarchy() obj.Name = hierarchy_name obj.IsHidden = hierarchy_hidden @@ -643,10 +739,23 @@ def add_hierarchy(self, table_name: str, hierarchy_name: str, columns: List[str] lvl.Column = self.model.Tables[table_name].Columns[col] lvl.Name = levels[columns.index(col)] lvl.Ordinal = columns.index(col) - self.model.Tables[table_name].Hierarchies[hierarchy_name].Levels.Add(lvl) - - def add_relationship(self, from_table: str, from_column: str, to_table: str, to_column: str, from_cardinality: str, to_cardinality: str, cross_filtering_behavior: Optional[str] = None, is_active: Optional[bool] = True, security_filtering_behavior: Optional[str] = None, rely_on_referential_integrity: Optional[bool] = False): - + self.model.Tables[table_name].Hierarchies[hierarchy_name].Levels.Add( + lvl + ) + + def add_relationship( + self, + from_table: str, + from_column: str, + to_table: str, + to_column: str, + from_cardinality: str, + to_cardinality: str, + cross_filtering_behavior: Optional[str] = None, + is_active: Optional[bool] = True, + security_filtering_behavior: Optional[str] = None, + rely_on_referential_integrity: Optional[bool] = False, + ): """ Adds a relationship to a semantic model. @@ -670,7 +779,7 @@ def add_relationship(self, from_table: str, from_column: str, to_table: str, to_ is_active : bool, default=True Setting for whether the relationship is active or not. 
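A sketch of add_hierarchy using the same placeholder levels as the validation message above; at least two columns are required, and level names default to the column names:

    from sempy_labs.TOM import connect_semantic_model

    # Placeholder table and column names.
    with connect_semantic_model(dataset="Sales", readonly=False) as tom:
        tom.add_hierarchy(
            table_name="Geography",
            hierarchy_name="Geo Hierarchy",
            columns=["Continent", "Country", "City"],
        )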
security_filtering_behavior : str, default=None - Setting for the security filtering behavior of the relationship. Options: ('None', 'OneDirection', 'BothDirections'). + Setting for the security filtering behavior of the relationship. Options: ('None', 'OneDirection', 'BothDirections'). Defaults to None which resolves to 'OneDirection'. rely_on_referential_integrity : bool, default=False Setting for the rely on referential integrity of the relationship. @@ -681,31 +790,48 @@ def add_relationship(self, from_table: str, from_column: str, to_table: str, to_ """ if cross_filtering_behavior is None: - cross_filtering_behavior = 'Automatic' + cross_filtering_behavior = "Automatic" if security_filtering_behavior is None: - security_filtering_behavior = 'OneDirection' + security_filtering_behavior = "OneDirection" from_cardinality = from_cardinality.capitalize() to_cardinality = to_cardinality.capitalize() cross_filtering_behavior = cross_filtering_behavior.capitalize() security_filtering_behavior = security_filtering_behavior.capitalize() - security_filtering_behavior = security_filtering_behavior.replace('direct', 'Direct') - cross_filtering_behavior = cross_filtering_behavior.replace('direct', 'Direct') + security_filtering_behavior = security_filtering_behavior.replace( + "direct", "Direct" + ) + cross_filtering_behavior = cross_filtering_behavior.replace( + "direct", "Direct" + ) rel = TOM.SingleColumnRelationship() rel.FromColumn = self.model.Tables[from_table].Columns[from_column] - rel.FromCardinality = System.Enum.Parse(TOM.RelationshipEndCardinality, from_cardinality) + rel.FromCardinality = System.Enum.Parse( + TOM.RelationshipEndCardinality, from_cardinality + ) rel.ToColumn = self.model.Tables[to_table].Columns[to_column] - rel.ToCardinality = System.Enum.Parse(TOM.RelationshipEndCardinality, to_cardinality) + rel.ToCardinality = System.Enum.Parse( + TOM.RelationshipEndCardinality, to_cardinality + ) rel.IsActive = is_active - rel.CrossFilteringBehavior = System.Enum.Parse(TOM.CrossFilteringBehavior, cross_filtering_behavior) - rel.SecurityFilteringBehavior = System.Enum.Parse(TOM.SecurityFilteringBehavior, security_filtering_behavior) + rel.CrossFilteringBehavior = System.Enum.Parse( + TOM.CrossFilteringBehavior, cross_filtering_behavior + ) + rel.SecurityFilteringBehavior = System.Enum.Parse( + TOM.SecurityFilteringBehavior, security_filtering_behavior + ) rel.RelyOnReferentialIntegrity = rely_on_referential_integrity self.model.Relationships.Add(rel) - def add_calculation_group(self, name: str, precedence: int, description: Optional[str] = None, hidden: Optional[bool] = False): - + def add_calculation_group( + self, + name: str, + precedence: int, + description: Optional[str] = None, + hidden: Optional[bool] = False, + ): """ Adds a calculation group to a semantic model. @@ -718,7 +844,7 @@ def add_calculation_group(self, name: str, precedence: int, description: Optiona description : str, default=None A description of the calculation group. hidden : bool, default=False - Whether the calculation group is hidden/visible. + Whether the calculation group is hidden/visible. 
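A sketch of add_relationship with placeholder tables and columns; cross_filtering_behavior and security_filtering_behavior fall back to 'Automatic' and 'OneDirection' when left as None, as documented above:

    from sempy_labs.TOM import connect_semantic_model

    # Placeholder model objects; cardinality values follow the documented options.
    with connect_semantic_model(dataset="Sales", readonly=False) as tom:
        tom.add_relationship(
            from_table="Sales",
            from_column="ProductKey",
            to_table="Product",
            to_column="ProductKey",
            from_cardinality="Many",
            to_cardinality="One",
        )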
Returns ------- @@ -738,28 +864,29 @@ def add_calculation_group(self, name: str, precedence: int, description: Optiona part.Source = TOM.CalculationGroupSource() tbl.Partitions.Add(part) - sortCol = 'Ordinal' + sortCol = "Ordinal" col1 = TOM.DataColumn() col1.Name = sortCol col1.SourceColumn = sortCol col1.IsHidden = True - col1.DataType = System.Enum.Parse(TOM.DataType, 'Int64') + col1.DataType = System.Enum.Parse(TOM.DataType, "Int64") tbl.Columns.Add(col1) col2 = TOM.DataColumn() - col2.Name = 'Name' - col2.SourceColumn = 'Name' - col2.DataType = System.Enum.Parse(TOM.DataType, 'String') - #col.SortByColumn = m.Tables[name].Columns[sortCol] + col2.Name = "Name" + col2.SourceColumn = "Name" + col2.DataType = System.Enum.Parse(TOM.DataType, "String") + # col.SortByColumn = m.Tables[name].Columns[sortCol] tbl.Columns.Add(col2) self.model.DiscourageImplicitMeasures = True self.model.Tables.Add(tbl) - def add_expression(self, name: str, expression: str, description: Optional[str] = None): - + def add_expression( + self, name: str, expression: str, description: Optional[str] = None + ): """ Adds an expression to a semantic model. @@ -770,7 +897,7 @@ def add_expression(self, name: str, expression: str, description: Optional[str] expression: str The M expression of the expression. description : str, default=None - A description of the expression. + A description of the expression. Returns ------- @@ -787,7 +914,6 @@ def add_expression(self, name: str, expression: str, description: Optional[str] self.model.Expressions.Add(exp) def add_translation(self, language: str): - """ Adds a translation language (culture) to a semantic model. @@ -810,7 +936,6 @@ def add_translation(self, language: str): pass def add_perspective(self, perspective_name: str): - """ Adds a perspective to a semantic model. @@ -828,8 +953,14 @@ def add_perspective(self, perspective_name: str): persp.Name = perspective_name self.model.Perspectives.Add(persp) - def add_m_partition(self, table_name: str, partition_name: str, expression: str, mode: Optional[str] = None, description: Optional[str] = None): - + def add_m_partition( + self, + table_name: str, + partition_name: str, + expression: str, + mode: Optional[str] = None, + description: Optional[str] = None, + ): """ Adds an M-partition to a table within a semantic model. @@ -846,13 +977,18 @@ def add_m_partition(self, table_name: str, partition_name: str, expression: str, Defaults to None which resolves to 'Import'. description : str, default=None A description for the partition. - + Returns ------- """ - mode = mode.title().replace('query', 'Query').replace(' ','').replace('lake', 'Lake') + mode = ( + mode.title() + .replace("query", "Query") + .replace(" ", "") + .replace("lake", "Lake") + ) mp = TOM.MPartitionSource() mp.Expression = expression @@ -862,13 +998,18 @@ def add_m_partition(self, table_name: str, partition_name: str, expression: str, if description is not None: p.Description = description if mode is None: - mode = 'Default' + mode = "Default" p.Mode = System.Enum.Parse(TOM.ModeType, mode) self.model.Tables[table_name].Partitions.Add(p) - def add_entity_partition(self, table_name: str, entity_name: str, expression: Optional[str] = None, description: Optional[str] = None): - + def add_entity_partition( + self, + table_name: str, + entity_name: str, + expression: Optional[str] = None, + description: Optional[str] = None, + ): """ Adds an entity partition to a table within a semantic model. 
@@ -883,7 +1024,7 @@ def add_entity_partition(self, table_name: str, entity_name: str, expression: Op Defaults to None which resolves to the 'DatabaseQuery' expression. description : str, default=None A description for the partition. - + Returns ------- @@ -893,7 +1034,7 @@ def add_entity_partition(self, table_name: str, entity_name: str, expression: Op ep.Name = table_name ep.EntityName = entity_name if expression is None: - ep.ExpressionSource = self.model.Expressions['DatabaseQuery'] + ep.ExpressionSource = self.model.Expressions["DatabaseQuery"] else: ep.ExpressionSource = expression p = TOM.Partition() @@ -902,11 +1043,17 @@ def add_entity_partition(self, table_name: str, entity_name: str, expression: Op p.Mode = TOM.ModeType.DirectLake if description is not None: p.Description = description - - self.model.Tables[table_name].Partitions.Add(p) - def set_alternate_of(self, table_name: str, column_name: str, summarization_type: str, base_table: str, base_column: Optional[str] = None): + self.model.Tables[table_name].Partitions.Add(p) + def set_alternate_of( + self, + table_name: str, + column_name: str, + summarization_type: str, + base_table: str, + base_column: Optional[str] = None, + ): """ Sets the 'alternate of' property on a column. @@ -922,24 +1069,34 @@ def set_alternate_of(self, table_name: str, column_name: str, summarization_type Name of the base table for aggregation. base_column : str Name of the base column for aggregation - + Returns ------- """ - - if base_column is not None and base_table is None: - print(f"ERROR: If you specify the base table you must also specify the base column") - summarization_type = summarization_type.replace(' ','').capitalize().replace('Groupby', 'GroupBy') + if base_column is not None and base_table is None: + print( + f"ERROR: If you specify the base table you must also specify the base column" + ) + + summarization_type = ( + summarization_type.replace(" ", "") + .capitalize() + .replace("Groupby", "GroupBy") + ) - summarizationTypes = ['Sum', 'GroupBy', 'Count', 'Min', 'Max'] + summarizationTypes = ["Sum", "GroupBy", "Count", "Min", "Max"] if summarization_type not in summarizationTypes: - print(f"The 'summarization_type' parameter must be one of the following valuse: {summarizationTypes}.") + print( + f"The 'summarization_type' parameter must be one of the following valuse: {summarizationTypes}." + ) return ao = TOM.AlternateOf() - ao.Summarization = System.Enum.Parse(TOM.SummarizationType, summarization_type) + ao.Summarization = System.Enum.Parse( + TOM.SummarizationType, summarization_type + ) if base_column is not None: ao.BaseColumn = self.model.Tables[base_table].Columns[base_column] else: @@ -954,7 +1111,6 @@ def set_alternate_of(self, table_name: str, column_name: str, summarization_type c.IsHidden = True def remove_alternate_of(self, table_name: str, column_name: str): - """ Removes the 'alternate of' property on a column. @@ -964,7 +1120,7 @@ def remove_alternate_of(self, table_name: str, column_name: str): Name of the table. column_name : str Name of the column. - + Returns ------- @@ -972,8 +1128,9 @@ def remove_alternate_of(self, table_name: str, column_name: str): self.model.Tables[table_name].Columns[column_name].AlternateOf = None - def get_annotations(self, object) -> 'Microsoft.AnalysisServices.Tabular.Annotation': - + def get_annotations( + self, object + ) -> "Microsoft.AnalysisServices.Tabular.Annotation": """ Shows all annotations for a given object within a semantic model. 
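A sketch of set_alternate_of for wiring an aggregation table to its base table; all object names are placeholders, and summarization_type must be one of the values validated above ('Sum', 'GroupBy', 'Count', 'Min', 'Max'):

    from sempy_labs.TOM import connect_semantic_model

    # Placeholder aggregation table, base table, and column names.
    with connect_semantic_model(dataset="Sales", readonly=False) as tom:
        tom.set_alternate_of(
            table_name="Sales_Agg",
            column_name="SalesAmount",
            summarization_type="Sum",
            base_table="Sales",
            base_column="SalesAmount",
        )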
@@ -981,22 +1138,21 @@ def get_annotations(self, object) -> 'Microsoft.AnalysisServices.Tabular.Annotat ---------- object : TOM Object An object (i.e. table/column/measure) within a semantic model. - + Returns ------- Microsoft.AnalysisServices.Tabular.Annotation TOM objects of all the annotations on a particular object within the semantic model. """ - #df = pd.DataFrame(columns=['Name', 'Value']) + # df = pd.DataFrame(columns=['Name', 'Value']) for a in object.Annotations: - #new_data = {'Name': a.Name, 'Value': a.Value} + # new_data = {'Name': a.Name, 'Value': a.Value} yield a - #df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) - - def set_annotation(self, object, name: str, value: str): + # df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) + def set_annotation(self, object, name: str, value: str): """ Sets an annotation on an object within the semantic model. @@ -1008,7 +1164,7 @@ def set_annotation(self, object, name: str, value: str): Name of the annotation. value : str Value of the annotation. - + Returns ------- @@ -1024,7 +1180,6 @@ def set_annotation(self, object, name: str, value: str): object.Annotations.Add(ann) def get_annotation_value(self, object, name: str): - """ Obtains the annotation value for a given annotation on an object within the semantic model. @@ -1034,7 +1189,7 @@ def get_annotation_value(self, object, name: str): An object (i.e. table/column/measure) within a semantic model. name : str Name of the annotation. - + Returns ------- str @@ -1044,7 +1199,6 @@ def get_annotation_value(self, object, name: str): return object.Annotations[name].Value def remove_annotation(self, object, name: str): - """ Removes an annotation on an object within the semantic model. @@ -1054,7 +1208,7 @@ def remove_annotation(self, object, name: str): An object (i.e. table/column/measure) within a semantic model. name : str Name of the annotation. - + Returns ------- @@ -1063,7 +1217,6 @@ def remove_annotation(self, object, name: str): object.Annotations.Remove(name) def clear_annotations(self, object): - """ Removes all annotations on an object within the semantic model. @@ -1071,7 +1224,7 @@ def clear_annotations(self, object): ---------- object : TOM Object An object (i.e. table/column/measure) within a semantic model. - + Returns ------- @@ -1079,8 +1232,9 @@ def clear_annotations(self, object): object.Annotations.Clear() - def get_extended_properties(self, object) -> 'Microsoft.AnalysisServices.Tabular.ExtendedProperty': - + def get_extended_properties( + self, object + ) -> "Microsoft.AnalysisServices.Tabular.ExtendedProperty": """ Retrieves all extended properties on an object within the semantic model. @@ -1088,24 +1242,25 @@ def get_extended_properties(self, object) -> 'Microsoft.AnalysisServices.Tabular ---------- object : TOM Object An object (i.e. table/column/measure) within a semantic model. - + Returns ------- Microsoft.AnalysisServices.Tabular.ExtendedPropertiesCollection TOM Objects of all the extended properties. 
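A sketch of the annotation helpers above; the table name, annotation name, and value are placeholders, and any TOM object (table, column, measure, etc.) can be passed:

    from sempy_labs.TOM import connect_semantic_model

    # Placeholder dataset, table, and annotation name/value.
    with connect_semantic_model(dataset="Sales", readonly=False) as tom:
        tbl = tom.model.Tables["Sales"]
        tom.set_annotation(object=tbl, name="Owner", value="Finance")
        print(tom.get_annotation_value(object=tbl, name="Owner"))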
""" - #df = pd.DataFrame(columns=['Name', 'Value', 'Type']) + # df = pd.DataFrame(columns=['Name', 'Value', 'Type']) for a in object.ExtendedProperties: yield a - #new_data = {'Name': a.Name, 'Value': a.Value, 'Type': a.Type} - #df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) + # new_data = {'Name': a.Name, 'Value': a.Value, 'Type': a.Type} + # df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) - #return df - - def set_extended_property(self, object, extended_property_type: str, name: str, value: str): + # return df + def set_extended_property( + self, object, extended_property_type: str, name: str, value: str + ): """ Sets an extended property on an object within the semantic model. @@ -1119,7 +1274,7 @@ def set_extended_property(self, object, extended_property_type: str, name: str, Name of the extended property. value : str Value of the extended property. - + Returns ------- @@ -1127,7 +1282,7 @@ def set_extended_property(self, object, extended_property_type: str, name: str, extended_property_type = extended_property_type.title() - if extended_property_type == 'Json': + if extended_property_type == "Json": ep = TOM.JsonExtendedProperty() else: ep = TOM.StringExtendedProperty() @@ -1141,7 +1296,6 @@ def set_extended_property(self, object, extended_property_type: str, name: str, object.ExtendedProperties.Add(ep) def get_extended_property_value(self, object, name: str): - """ Retrieves the value of an extended property for an object within the semantic model. @@ -1151,7 +1305,7 @@ def get_extended_property_value(self, object, name: str): An object (i.e. table/column/measure) within a semantic model. name : str Name of the annotation. - + Returns ------- str @@ -1161,7 +1315,6 @@ def get_extended_property_value(self, object, name: str): return object.ExtendedProperties[name].Value def remove_extended_property(self, object, name: str): - """ Removes an extended property on an object within the semantic model. @@ -1171,7 +1324,7 @@ def remove_extended_property(self, object, name: str): An object (i.e. table/column/measure) within a semantic model. name : str Name of the annotation. - + Returns ------- @@ -1180,7 +1333,6 @@ def remove_extended_property(self, object, name: str): object.ExtendedProperties.Remove(name) def clear_extended_properties(self, object): - """ Removes all extended properties on an object within the semantic model. @@ -1188,16 +1340,19 @@ def clear_extended_properties(self, object): ---------- object : TOM Object An object (i.e. table/column/measure) within a semantic model. - + Returns ------- """ object.ExtendedProperties.Clear() - - def in_perspective(self, object: Union['TOM.Table', 'TOM.Column', 'TOM.Measure', 'TOM.Hierarchy'], perspective_name: str): - + + def in_perspective( + self, + object: Union["TOM.Table", "TOM.Column", "TOM.Measure", "TOM.Hierarchy"], + perspective_name: str, + ): """ Indicates whether an object is contained within a given perspective. @@ -1207,37 +1362,55 @@ def in_perspective(self, object: Union['TOM.Table', 'TOM.Column', 'TOM.Measure', An object (i.e. table/column/measure) within a semantic model. perspecitve_name : str Name of the perspective. - + Returns ------- bool An indication as to whether the object is contained within the given perspective. 
""" - validObjects = [TOM.ObjectType.Table, TOM.ObjectType.Column, TOM.ObjectType.Measure, TOM.ObjectType.Hierarchy] + validObjects = [ + TOM.ObjectType.Table, + TOM.ObjectType.Column, + TOM.ObjectType.Measure, + TOM.ObjectType.Hierarchy, + ] objectType = object.ObjectType if objectType not in validObjects: - print(f"Only the following object types are valid for perspectives: {validObjects}.") + print( + f"Only the following object types are valid for perspectives: {validObjects}." + ) return - + object.Model.Perspectives[perspective_name] - try: + try: if objectType == TOM.ObjectType.Table: - object.Model.Perspectives[perspective_name].PerspectiveTables[object.Name] + object.Model.Perspectives[perspective_name].PerspectiveTables[ + object.Name + ] elif objectType == TOM.ObjectType.Column: - object.Model.Perspectives[perspective_name].PerspectiveTables[object.Parent.Name].PerspectiveColumns[object.Name] + object.Model.Perspectives[perspective_name].PerspectiveTables[ + object.Parent.Name + ].PerspectiveColumns[object.Name] elif objectType == TOM.ObjectType.Measure: - object.Model.Perspectives[perspective_name].PerspectiveTables[object.Parent.Name].PerspectiveMeasures[object.Name] + object.Model.Perspectives[perspective_name].PerspectiveTables[ + object.Parent.Name + ].PerspectiveMeasures[object.Name] elif objectType == TOM.ObjectType.Hierarchy: - object.Model.Perspectives[perspective_name].PerspectiveTables[object.Parent.Name].PerspectiveHierarchies[object.Name] + object.Model.Perspectives[perspective_name].PerspectiveTables[ + object.Parent.Name + ].PerspectiveHierarchies[object.Name] return True except: return False - def add_to_perspective(self, object: Union['TOM.Table', 'TOM.Column', 'TOM.Measure', 'TOM.Hierarchy'], perspective_name: str): - + def add_to_perspective( + self, + object: Union["TOM.Table", "TOM.Column", "TOM.Measure", "TOM.Hierarchy"], + perspective_name: str, + ): """ Adds an object to a perspective. @@ -1247,17 +1420,24 @@ def add_to_perspective(self, object: Union['TOM.Table', 'TOM.Column', 'TOM.Measu An object (i.e. table/column/measure) within a semantic model. perspective_name : str Name of the perspective. - + Returns ------- """ - validObjects = [TOM.ObjectType.Table, TOM.ObjectType.Column, TOM.ObjectType.Measure, TOM.ObjectType.Hierarchy] + validObjects = [ + TOM.ObjectType.Table, + TOM.ObjectType.Column, + TOM.ObjectType.Measure, + TOM.ObjectType.Hierarchy, + ] objectType = object.ObjectType if objectType not in validObjects: - print(f"Only the following object types are valid for perspectives: {validObjects}.") + print( + f"Only the following object types are valid for perspectives: {validObjects}." 
+ ) return try: object.Model.Perspectives[perspective_name] @@ -1265,7 +1445,7 @@ def add_to_perspective(self, object: Union['TOM.Table', 'TOM.Column', 'TOM.Measu print(f"The '{perspective_name}' perspective does not exist.") return - #try: + # try: if objectType == TOM.ObjectType.Table: pt = TOM.PerspectiveTable() pt.Table = object @@ -1273,20 +1453,29 @@ def add_to_perspective(self, object: Union['TOM.Table', 'TOM.Column', 'TOM.Measu elif objectType == TOM.ObjectType.Column: pc = TOM.PerspectiveColumn() pc.Column = object - object.Model.Perspectives[perspective_name].PerspectiveTables[object.Parent.Name].PerspectiveColumns.Add(pc) + object.Model.Perspectives[perspective_name].PerspectiveTables[ + object.Parent.Name + ].PerspectiveColumns.Add(pc) elif objectType == TOM.ObjectType.Measure: pm = TOM.PerspectiveMeasure() pm.Measure = object - object.Model.Perspectives[perspective_name].PerspectiveTables[object.Parent.Name].PerspectiveMeasures.Add(pm) + object.Model.Perspectives[perspective_name].PerspectiveTables[ + object.Parent.Name + ].PerspectiveMeasures.Add(pm) elif objectType == TOM.ObjectType.Hierarchy: ph = TOM.PerspectiveHierarchy() ph.Hierarchy = object - object.Model.Perspectives[perspective_name].PerspectiveTables[object.Parent.Name].PerspectiveHierarchies.Add(ph) - #except: + object.Model.Perspectives[perspective_name].PerspectiveTables[ + object.Parent.Name + ].PerspectiveHierarchies.Add(ph) + # except: # pass - def remove_from_perspective(self, object: Union['TOM.Table', 'TOM.Column', 'TOM.Measure', 'TOM.Hierarchy'], perspective_name: str): - + def remove_from_perspective( + self, + object: Union["TOM.Table", "TOM.Column", "TOM.Measure", "TOM.Hierarchy"], + perspective_name: str, + ): """ Removes an object from a perspective. @@ -1296,17 +1485,24 @@ def remove_from_perspective(self, object: Union['TOM.Table', 'TOM.Column', 'TOM. An object (i.e. table/column/measure) within a semantic model. perspective_name : str Name of the perspective. - + Returns ------- """ - validObjects = [TOM.ObjectType.Table, TOM.ObjectType.Column, TOM.ObjectType.Measure, TOM.ObjectType.Hierarchy] + validObjects = [ + TOM.ObjectType.Table, + TOM.ObjectType.Column, + TOM.ObjectType.Measure, + TOM.ObjectType.Hierarchy, + ] objectType = object.ObjectType if objectType not in validObjects: - print(f"Only the following object types are valid for perspectives: {validObjects}.") + print( + f"Only the following object types are valid for perspectives: {validObjects}." + ) return try: object.Model.Perspectives[perspective_name] @@ -1314,24 +1510,49 @@ def remove_from_perspective(self, object: Union['TOM.Table', 'TOM.Column', 'TOM. 
print(f"The '{perspective_name}' perspective does not exist.") return - #try: + # try: if objectType == TOM.ObjectType.Table: - pt = object.Model.Perspectives[perspective_name].PerspectiveTables[object.Name] + pt = object.Model.Perspectives[perspective_name].PerspectiveTables[ + object.Name + ] object.Model.Perspectives[perspective_name].PerspectiveTables.Remove(pt) elif objectType == TOM.ObjectType.Column: - pc = object.Model.Perspectives[perspective_name].PerspectiveTables[object.Parent.Name].PerspectiveColumns[object.Name] - object.Model.Perspectives[perspective_name].PerspectiveTables[object.Parent.Name].PerspectiveColumns.Remove(pc) + pc = ( + object.Model.Perspectives[perspective_name] + .PerspectiveTables[object.Parent.Name] + .PerspectiveColumns[object.Name] + ) + object.Model.Perspectives[perspective_name].PerspectiveTables[ + object.Parent.Name + ].PerspectiveColumns.Remove(pc) elif objectType == TOM.ObjectType.Measure: - pm = object.Model.Perspectives[perspective_name].PerspectiveTables[object.Parent.Name].PerspectiveMeasures[object.Name] - object.Model.Perspectives[perspective_name].PerspectiveTables[object.Parent.Name].PerspectiveMeasures.Remove(pm) + pm = ( + object.Model.Perspectives[perspective_name] + .PerspectiveTables[object.Parent.Name] + .PerspectiveMeasures[object.Name] + ) + object.Model.Perspectives[perspective_name].PerspectiveTables[ + object.Parent.Name + ].PerspectiveMeasures.Remove(pm) elif objectType == TOM.ObjectType.Hierarchy: - ph = object.Model.Perspectives[perspective_name].PerspectiveTables[object.Parent.Name].PerspectiveHierarchies[object.Name] - object.Model.Perspectives[perspective_name].PerspectiveTables[object.Parent.Name].PerspectiveHierarchies.Remove(ph) - #except: + ph = ( + object.Model.Perspectives[perspective_name] + .PerspectiveTables[object.Parent.Name] + .PerspectiveHierarchies[object.Name] + ) + object.Model.Perspectives[perspective_name].PerspectiveTables[ + object.Parent.Name + ].PerspectiveHierarchies.Remove(ph) + # except: # pass - def set_translation(self, object: Union['TOM.Table', 'TOM.Column', 'TOM.Measure', 'TOM.Hierarchy'], language: str, property: str, value: str): - + def set_translation( + self, + object: Union["TOM.Table", "TOM.Column", "TOM.Measure", "TOM.Hierarchy"], + language: str, + property: str, + value: str, + ): """ Sets a translation value for an object's property. @@ -1345,26 +1566,31 @@ def set_translation(self, object: Union['TOM.Table', 'TOM.Column', 'TOM.Measure' The property to set. Options: 'Name', 'Description', 'Display Folder'. value : str The transation value. 
- + Returns ------- """ - self.add_translation(language = language) + self.add_translation(language=language) property = property.title() - validObjects = [TOM.ObjectType.Table, TOM.ObjectType.Column, TOM.ObjectType.Measure, TOM.ObjectType.Hierarchy] #, 'Level' + validObjects = [ + TOM.ObjectType.Table, + TOM.ObjectType.Column, + TOM.ObjectType.Measure, + TOM.ObjectType.Hierarchy, + ] # , 'Level' if object.ObjectType not in validObjects: print(f"Translations can only be set to {validObjects}.") return mapping = { - 'Name': TOM.TranslatedProperty.Caption, - 'Description': TOM.TranslatedProperty.Description, - 'Display Folder': TOM.TranslatedProperty.DisplayFolder + "Name": TOM.TranslatedProperty.Caption, + "Description": TOM.TranslatedProperty.Description, + "Display Folder": TOM.TranslatedProperty.DisplayFolder, } prop = mapping.get(property) @@ -1372,14 +1598,20 @@ def set_translation(self, object: Union['TOM.Table', 'TOM.Column', 'TOM.Measure' try: object.Model.Cultures[language] except: - print(f"The '{language}' translation language does not exist in the semantic model.") + print( + f"The '{language}' translation language does not exist in the semantic model." + ) return - object.Model.Cultures[language].ObjectTranslations.SetTranslation(object, prop, value) - - - def remove_translation(self, object: Union['TOM.Table', 'TOM.Column', 'TOM.Measure', 'TOM.Hierarchy'], language: str): + object.Model.Cultures[language].ObjectTranslations.SetTranslation( + object, prop, value + ) + def remove_translation( + self, + object: Union["TOM.Table", "TOM.Column", "TOM.Measure", "TOM.Hierarchy"], + language: str, + ): """ Removes an object's translation value. @@ -1389,17 +1621,18 @@ def remove_translation(self, object: Union['TOM.Table', 'TOM.Column', 'TOM.Measu An object (i.e. table/column/measure) within a semantic model. language : str The language code. - + Returns ------- """ - o = object.Model.Cultures[language].ObjectTranslations[object, TOM.TranslatedProperty.Caption] + o = object.Model.Cultures[language].ObjectTranslations[ + object, TOM.TranslatedProperty.Caption + ] object.Model.Cultures[language].ObjectTranslations.Remove(o) def remove_object(self, object): - """ Removes an object from a semantic model. @@ -1407,7 +1640,7 @@ def remove_object(self, object): ---------- object : TOM Object An object (i.e. table/column/measure) within a semantic model. - + Returns ------- @@ -1416,16 +1649,18 @@ def remove_object(self, object): objType = object.ObjectType # Have to remove translations and perspectives on the object before removing it. 
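A sketch of set_translation with a placeholder measure, language code, and caption; the method first calls add_translation for the language, as shown above, and property may be 'Name', 'Description', or 'Display Folder':

    from sempy_labs.TOM import connect_semantic_model

    # Placeholder measure and translated caption.
    with connect_semantic_model(dataset="Sales", readonly=False) as tom:
        m = tom.model.Tables["Sales"].Measures["Total Sales"]
        tom.set_translation(object=m, language="fr-FR", property="Name", value="Ventes totales")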
- if objType in ['Table', 'Column', 'Measure', 'Hierarchy', 'Level']: + if objType in ["Table", "Column", "Measure", "Hierarchy", "Level"]: for lang in object.Model.Cultures: try: - self.remove_translation(object = object, language = lang.Name) + self.remove_translation(object=object, language=lang.Name) except: pass - if objType in ['Table', 'Column', 'Measure', 'Hierarchy']: + if objType in ["Table", "Column", "Measure", "Hierarchy"]: for persp in object.Model.Perspectives: try: - self.remove_from_perspective(object = object, perspective_name = persp.Name) + self.remove_from_perspective( + object=object, perspective_name=persp.Name + ) except: pass @@ -1456,8 +1691,7 @@ def remove_object(self, object): elif objType == TOM.ObjectType.TablePermission: object.Parent.TablePermissions.Remove(object.Name) - def used_in_relationships(self, object: Union['TOM.Table', 'TOM.Column']): - + def used_in_relationships(self, object: Union["TOM.Table", "TOM.Column"]): """ Shows all relationships in which a table/column is used. @@ -1465,7 +1699,7 @@ def used_in_relationships(self, object: Union['TOM.Table', 'TOM.Column']): ---------- object : TOM Object An object (i.e. table/column) within a semantic model. - + Returns ------- Microsoft.AnalysisServices.Tabular.RelationshipCollection @@ -1477,15 +1711,19 @@ def used_in_relationships(self, object: Union['TOM.Table', 'TOM.Column']): if objType == TOM.ObjectType.Table: for r in self.model.Relationships: if r.FromTable.Name == object.Name or r.ToTable.Name == object.Name: - yield r#, 'Table' + yield r # , 'Table' elif objType == TOM.ObjectType.Column: for r in self.model.Relationships: - if (r.FromTable.Name == object.Parent.Name and r.FromColumn.Name == object.Name) or \ - (r.ToTable.Name == object.Parent.Name and r.ToColumn.Name == object.Name): - yield r#, 'Column' - - def used_in_levels(self, column: 'TOM.Column'): + if ( + r.FromTable.Name == object.Parent.Name + and r.FromColumn.Name == object.Name + ) or ( + r.ToTable.Name == object.Parent.Name + and r.ToColumn.Name == object.Name + ): + yield r # , 'Column' + def used_in_levels(self, column: "TOM.Column"): """ Shows all levels in which a column is used. @@ -1493,7 +1731,7 @@ def used_in_levels(self, column: 'TOM.Column'): ---------- object : TOM Object An column object within a semantic model. - + Returns ------- Microsoft.AnalysisServices.Tabular.LevelCollection @@ -1504,11 +1742,13 @@ def used_in_levels(self, column: 'TOM.Column'): if objType == TOM.ObjectType.Column: for l in self.all_levels(): - if l.Parent.Table.Name == column.Parent.Name and l.Column.Name == column.Name: + if ( + l.Parent.Table.Name == column.Parent.Name + and l.Column.Name == column.Name + ): yield l - - def used_in_hierarchies(self, column: 'TOM.Column'): + def used_in_hierarchies(self, column: "TOM.Column"): """ Shows all hierarchies in which a column is used. @@ -1516,7 +1756,7 @@ def used_in_hierarchies(self, column: 'TOM.Column'): ---------- object : TOM Object An column object within a semantic model. 
- + Returns ------- Microsoft.AnalysisServices.Tabular.HierarchyCollection @@ -1527,11 +1767,13 @@ def used_in_hierarchies(self, column: 'TOM.Column'): if objType == TOM.ObjectType.Column: for l in self.all_levels(): - if l.Parent.Table.Name == column.Parent.Name and l.Column.Name == column.Name: + if ( + l.Parent.Table.Name == column.Parent.Name + and l.Column.Name == column.Name + ): yield l.Parent - def used_in_sort_by(self, column: 'TOM.Column'): - + def used_in_sort_by(self, column: "TOM.Column"): """ Shows all columns in which a column is used for sorting. @@ -1539,7 +1781,7 @@ def used_in_sort_by(self, column: 'TOM.Column'): ---------- object : TOM Object An column object within a semantic model. - + Returns ------- Microsoft.AnalysisServices.Tabular.ColumnCollection @@ -1553,8 +1795,11 @@ def used_in_sort_by(self, column: 'TOM.Column'): if c.SortByColumn == column: yield c - def used_in_rls(self, object: Union['TOM.Table', 'TOM.Column', 'TOM.Measure'], dependencies: pd.DataFrame): - + def used_in_rls( + self, + object: Union["TOM.Table", "TOM.Column", "TOM.Measure"], + dependencies: pd.DataFrame, + ): """ Identifies the filter expressions which reference a given object. @@ -1564,38 +1809,52 @@ def used_in_rls(self, object: Union['TOM.Table', 'TOM.Column', 'TOM.Measure'], d An object (i.e. table/column) within a semantic model. dependencies : pandas.DataFrame A pandas dataframe with the output of the 'get_model_calc_dependencies' function. - + Returns ------- Microsoft.AnalysisServices.Tabular.TableCollection, Microsoft.AnalysisServices.Tabular.ColumnCollection, Microsoft.AnalysisServices.Tabular.MeasureCollection - + """ objType = object.ObjectType - - df_filt = dependencies[dependencies['Object Type'] == 'Rows Allowed'] + + df_filt = dependencies[dependencies["Object Type"] == "Rows Allowed"] if objType == TOM.ObjectType.Table: - fil = df_filt[(df_filt['Referenced Object Type'] == 'Table') & (df_filt['Referenced Table'] == object.Name)] - tbls = fil['Table Name'].unique().tolist() + fil = df_filt[ + (df_filt["Referenced Object Type"] == "Table") + & (df_filt["Referenced Table"] == object.Name) + ] + tbls = fil["Table Name"].unique().tolist() for t in self.model.Tables: if t.Name in tbls: yield t elif objType == TOM.ObjectType.Column: - fil = df_filt[(df_filt['Referenced Object Type'] == 'Column') & (df_filt['Referenced Table'] == object.Parent.Name) & (df_filt['Referenced Object'] == object.Name)] - cols = fil['Full Object Name'].unique().tolist() + fil = df_filt[ + (df_filt["Referenced Object Type"] == "Column") + & (df_filt["Referenced Table"] == object.Parent.Name) + & (df_filt["Referenced Object"] == object.Name) + ] + cols = fil["Full Object Name"].unique().tolist() for c in self.all_columns(): if format_dax_object_name(c.Parent.Name, c.Name) in cols: yield c elif objType == TOM.ObjectType.Measure: - fil = df_filt[(df_filt['Referenced Object Type'] == 'Measure') & (df_filt['Referenced Table'] == object.Parent.Name) & (df_filt['Referenced Object'] == object.Name)] - meas = fil['Object Name'].unique().tolist() + fil = df_filt[ + (df_filt["Referenced Object Type"] == "Measure") + & (df_filt["Referenced Table"] == object.Parent.Name) + & (df_filt["Referenced Object"] == object.Name) + ] + meas = fil["Object Name"].unique().tolist() for m in self.all_measures(): if m.Name in meas: yield m - def used_in_data_coverage_definition(self, object: Union['TOM.Table', 'TOM.Column', 'TOM.Measure'], dependencies: pd.DataFrame): - + def used_in_data_coverage_definition( + self, + object: 
Union["TOM.Table", "TOM.Column", "TOM.Measure"], + dependencies: pd.DataFrame, + ): """ Identifies the ... which reference a given object. @@ -1605,38 +1864,54 @@ def used_in_data_coverage_definition(self, object: Union['TOM.Table', 'TOM.Colum An object (i.e. table/column) within a semantic model. dependencies : pandas.DataFrame A pandas dataframe with the output of the 'get_model_calc_dependencies' function. - + Returns ------- Microsoft.AnalysisServices.Tabular.TableCollection, Microsoft.AnalysisServices.Tabular.ColumnCollection, Microsoft.AnalysisServices.Tabular.MeasureCollection - + """ objType = object.ObjectType - - df_filt = dependencies[dependencies['Object Type'] == 'Data Coverage Definition'] + + df_filt = dependencies[ + dependencies["Object Type"] == "Data Coverage Definition" + ] if objType == TOM.ObjectType.Table: - fil = df_filt[(df_filt['Referenced Object Type'] == 'Table') & (df_filt['Referenced Table'] == object.Name)] - tbls = fil['Table Name'].unique().tolist() + fil = df_filt[ + (df_filt["Referenced Object Type"] == "Table") + & (df_filt["Referenced Table"] == object.Name) + ] + tbls = fil["Table Name"].unique().tolist() for t in self.model.Tables: if t.Name in tbls: yield t elif objType == TOM.ObjectType.Column: - fil = df_filt[(df_filt['Referenced Object Type'] == 'Column') & (df_filt['Referenced Table'] == object.Parent.Name) & (df_filt['Referenced Object'] == object.Name)] - cols = fil['Full Object Name'].unique().tolist() + fil = df_filt[ + (df_filt["Referenced Object Type"] == "Column") + & (df_filt["Referenced Table"] == object.Parent.Name) + & (df_filt["Referenced Object"] == object.Name) + ] + cols = fil["Full Object Name"].unique().tolist() for c in self.all_columns(): if format_dax_object_name(c.Parent.Name, c.Name) in cols: yield c elif objType == TOM.ObjectType.Measure: - fil = df_filt[(df_filt['Referenced Object Type'] == 'Measure') & (df_filt['Referenced Table'] == object.Parent.Name) & (df_filt['Referenced Object'] == object.Name)] - meas = fil['Object Name'].unique().tolist() + fil = df_filt[ + (df_filt["Referenced Object Type"] == "Measure") + & (df_filt["Referenced Table"] == object.Parent.Name) + & (df_filt["Referenced Object"] == object.Name) + ] + meas = fil["Object Name"].unique().tolist() for m in self.all_measures(): if m.Name in meas: yield m - - def used_in_calc_item(self, object: Union['TOM.Table', 'TOM.Column', 'TOM.Measure'], dependencies: pd.DataFrame): + def used_in_calc_item( + self, + object: Union["TOM.Table", "TOM.Column", "TOM.Measure"], + dependencies: pd.DataFrame, + ): """ Identifies the ... which reference a given object. @@ -1646,44 +1921,54 @@ def used_in_calc_item(self, object: Union['TOM.Table', 'TOM.Column', 'TOM.Measur An object (i.e. table/column) within a semantic model. dependencies : pandas.DataFrame A pandas dataframe with the output of the 'get_model_calc_dependencies' function. 
- + Returns ------- Microsoft.AnalysisServices.Tabular.TableCollection, Microsoft.AnalysisServices.Tabular.ColumnCollection, Microsoft.AnalysisServices.Tabular.MeasureCollection - + """ objType = object.ObjectType - - df_filt = dependencies[dependencies['Object Type'] == 'Calculation Item'] + + df_filt = dependencies[dependencies["Object Type"] == "Calculation Item"] if objType == TOM.ObjectType.Table: - fil = df_filt[(df_filt['Referenced Object Type'] == 'Table') & (df_filt['Referenced Table'] == object.Name)] - tbls = fil['Table Name'].unique().tolist() + fil = df_filt[ + (df_filt["Referenced Object Type"] == "Table") + & (df_filt["Referenced Table"] == object.Name) + ] + tbls = fil["Table Name"].unique().tolist() for t in self.model.Tables: if t.Name in tbls: yield t elif objType == TOM.ObjectType.Column: - fil = df_filt[(df_filt['Referenced Object Type'] == 'Column') & (df_filt['Referenced Table'] == object.Parent.Name) & (df_filt['Referenced Object'] == object.Name)] - cols = fil['Full Object Name'].unique().tolist() + fil = df_filt[ + (df_filt["Referenced Object Type"] == "Column") + & (df_filt["Referenced Table"] == object.Parent.Name) + & (df_filt["Referenced Object"] == object.Name) + ] + cols = fil["Full Object Name"].unique().tolist() for c in self.all_columns(): if format_dax_object_name(c.Parent.Name, c.Name) in cols: yield c elif objType == TOM.ObjectType.Measure: - fil = df_filt[(df_filt['Referenced Object Type'] == 'Measure') & (df_filt['Referenced Table'] == object.Parent.Name) & (df_filt['Referenced Object'] == object.Name)] - meas = fil['Object Name'].unique().tolist() + fil = df_filt[ + (df_filt["Referenced Object Type"] == "Measure") + & (df_filt["Referenced Table"] == object.Parent.Name) + & (df_filt["Referenced Object"] == object.Name) + ] + meas = fil["Object Name"].unique().tolist() for m in self.all_measures(): if m.Name in meas: yield m def hybrid_tables(self): - """ Outputs the hybrid tables within a semantic model. Parameters ---------- - + Returns ------- Microsoft.AnalysisServices.Tabular.TableCollection @@ -1696,13 +1981,12 @@ def hybrid_tables(self): yield t def date_tables(self): - """ Outputs the tables which are marked as date tables within a semantic model. Parameters ---------- - + Returns ------- Microsoft.AnalysisServices.Tabular.TableCollection @@ -1710,12 +1994,14 @@ def date_tables(self): """ for t in self.model.Tables: - if t.DataCategory == 'Time': - if any(c.IsKey and c.DataType == TOM.DataType.DateTime for c in t.Columns): + if t.DataCategory == "Time": + if any( + c.IsKey and c.DataType == TOM.DataType.DateTime + for c in t.Columns + ): yield t def is_hybrid_table(self, table_name: str): - """ Identifies if a table is a hybrid table. @@ -1723,7 +2009,7 @@ def is_hybrid_table(self, table_name: str): ---------- table_name : str Name of the table. - + Returns ------- bool @@ -1732,14 +2018,19 @@ def is_hybrid_table(self, table_name: str): isHybridTable = False - if any(p.Mode == TOM.ModeType.Import for p in self.model.Tables[table_name].Partitions): - if any(p.Mode == TOM.ModeType.DirectQuery for p in self.model.Tables[table_name].Partitions): + if any( + p.Mode == TOM.ModeType.Import + for p in self.model.Tables[table_name].Partitions + ): + if any( + p.Mode == TOM.ModeType.DirectQuery + for p in self.model.Tables[table_name].Partitions + ): isHybridTable = True return isHybridTable def is_date_table(self, table_name: str): - """ Identifies if a table is marked as a date table. 
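The dependency-driven helpers in this hunk (used_in_rls, used_in_data_coverage_definition, used_in_calc_item) all consume the dataframe produced by the get_model_calc_dependencies function named in their docstrings. A sketch with an assumed import location and placeholder object names:

    from sempy_labs.TOM import connect_semantic_model
    # Import location and signature assumed; the docstrings above only name the function.
    from sempy_labs.GetMeasureDependencies import get_model_calc_dependencies

    dep = get_model_calc_dependencies(dataset="Sales", workspace=None)
    with connect_semantic_model(dataset="Sales", readonly=True) as tom:
        col = tom.model.Tables["Geography"].Columns["Country"]
        for obj in tom.used_in_rls(object=col, dependencies=dep):
            print(obj.Name)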
@@ -1747,7 +2038,7 @@ def is_date_table(self, table_name: str): ---------- table_name : str Name of the table. - + Returns ------- bool @@ -1757,14 +2048,15 @@ def is_date_table(self, table_name: str): isDateTable = False t = self.model.Tables[table_name] - if t.DataCategory == 'Time': - if any(c.IsKey and c.DataType == TOM.DataType.DateTime for c in t.Columns): + if t.DataCategory == "Time": + if any( + c.IsKey and c.DataType == TOM.DataType.DateTime for c in t.Columns + ): isDateTable = True return isDateTable - - def mark_as_date_table(self, table_name: str, column_name: str): + def mark_as_date_table(self, table_name: str, column_name: str): """ Marks a table as a date table. @@ -1774,7 +2066,7 @@ def mark_as_date_table(self, table_name: str, column_name: str): Name of the table. column_name : str Name of the date column in the table. - + Returns ------- @@ -1783,9 +2075,11 @@ def mark_as_date_table(self, table_name: str, column_name: str): t = self.model.Tables[table_name] c = t.Columns[column_name] if c.DataType != TOM.DataType.DateTime: - print(f"{red_dot} The column specified in the 'column_name' parameter in this function must be of DateTime data type.") + print( + f"{icons.red_dot} The column specified in the 'column_name' parameter in this function must be of DateTime data type." + ) return - + daxQuery = f""" define measure '{table_name}'[test] = var mn = MIN('{table_name}'[{column_name}]) @@ -1799,25 +2093,30 @@ def mark_as_date_table(self, table_name: str, column_name: str): "1",[test] ) """ - df = fabric.evaluate_dax(dataset=dataset, workspace=workspace, dax_string = daxQuery) - value = df['1'].iloc[0] - if value != '1': - print(f"{red_dot} The '{column_name}' within the '{table_name}' table does not contain contiguous date values.") + df = fabric.evaluate_dax( + dataset=dataset, workspace=workspace, dax_string=daxQuery + ) + value = df["1"].iloc[0] + if value != "1": + print( + f"{icons.red_dot} The '{column_name}' within the '{table_name}' table does not contain contiguous date values." + ) return - + # Mark as a date table - t.DataCategory = 'Time' - c.Columns[column_name].IsKey = True - print(f"{green_dot} The '{table_name}' table has been marked as a date table using the '{column_name}' column as its primary date key.") - - def has_aggs(self): + t.DataCategory = "Time" + c.Columns[column_name].IsKey = True + print( + f"{icons.green_dot} The '{table_name}' table has been marked as a date table using the '{column_name}' column as its primary date key." + ) + def has_aggs(self): """ Identifies if a semantic model has any aggregations. Parameters ---------- - + Returns ------- bool @@ -1831,9 +2130,8 @@ def has_aggs(self): hasAggs = True return hasAggs - - def is_agg_table(self, table_name: str): + def is_agg_table(self, table_name: str): """ Identifies if a table has aggregations. @@ -1841,7 +2139,7 @@ def is_agg_table(self, table_name: str): ---------- table_name : str Name of the table. - + Returns ------- bool @@ -1853,13 +2151,12 @@ def is_agg_table(self, table_name: str): return any(c.AlternateOf is not None for c in t.Columns) def has_hybrid_table(self): - """ Identifies if a semantic model has a hybrid table. 
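# A minimal sketch of the date/hybrid-table helpers above, assuming a
# connected TOMWrapper `tom` and a hypothetical 'Date' table with a DateTime
# 'Date' column. Note that mark_as_date_table in this hunk sets the key flag
# through c.Columns[column_name] even though c is already the column object;
# t.Columns[column_name].IsKey appears to be the intent.
def ensure_date_table(tom, table_name="Date", column_name="Date"):
    if not tom.is_date_table(table_name):
        tom.mark_as_date_table(table_name=table_name, column_name=column_name)
    # The model-level checks scan every table.
    return tom.has_date_table(), tom.has_hybrid_table()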
Parameters ---------- - + Returns ------- bool @@ -1869,19 +2166,18 @@ def has_hybrid_table(self): hasHybridTable = False for t in self.model.Tables: - if self.is_hybrid_table(table_name = t.Name): + if self.is_hybrid_table(table_name=t.Name): hasHybridTable = True return hasHybridTable def has_date_table(self): - """ Identifies if a semantic model has a table marked as a date table. Parameters ---------- - + Returns ------- bool @@ -1891,29 +2187,31 @@ def has_date_table(self): hasDateTable = False for t in self.model.Tables: - if self.is_date_table(table_name = t.Name): + if self.is_date_table(table_name=t.Name): hasDateTable = True return hasDateTable def is_direct_lake(self): - """ Identifies if a semantic model is in Direct Lake mode. Parameters ---------- - + Returns ------- bool Indicates if the semantic model is in Direct Lake mode. """ - return any(p.Mode == TOM.ModeType.DirectLake for t in self.model.Tables for p in t.Partitions) + return any( + p.Mode == TOM.ModeType.DirectLake + for t in self.model.Tables + for p in t.Partitions + ) def is_field_parameter(self, table_name: str): - """ Identifies if a table is a field parameter. @@ -1921,7 +2219,7 @@ def is_field_parameter(self, table_name: str): ---------- table_name : str Name of the table. - + Returns ------- bool @@ -1930,10 +2228,21 @@ def is_field_parameter(self, table_name: str): t = self.model.Tables[table_name] - return any(p.SourceType == TOM.PartitionSourceType.Calculated and 'NAMEOF(' in p.Source.Expression for p in t.Partitions) and all('[Value' in c.SourceColumn for c in t.Columns if c.Type != TOM.ColumnType.RowNumber) and t.Columns.Count == 4 - - def is_auto_date_table(self, table_name: str): + return ( + any( + p.SourceType == TOM.PartitionSourceType.Calculated + and "NAMEOF(" in p.Source.Expression + for p in t.Partitions + ) + and all( + "[Value" in c.SourceColumn + for c in t.Columns + if c.Type != TOM.ColumnType.RowNumber + ) + and t.Columns.Count == 4 + ) + def is_auto_date_table(self, table_name: str): """ Identifies if a table is an auto-date table. @@ -1941,7 +2250,7 @@ def is_auto_date_table(self, table_name: str): ---------- table_name : str Name of the table. - + Returns ------- bool @@ -1952,14 +2261,28 @@ def is_auto_date_table(self, table_name: str): t = self.model.Tables[table_name] - if t.Name.startswith('LocalDateTable_') or t.Name.startswith('DateTableTemplate_'): - if any(p.SourceType == TOM.PartitionSourceType.Calculated for p in t.Partitions): + if t.Name.startswith("LocalDateTable_") or t.Name.startswith( + "DateTableTemplate_" + ): + if any( + p.SourceType == TOM.PartitionSourceType.Calculated + for p in t.Partitions + ): isAutoDate = True return isAutoDate - def set_kpi(self, measure_name: str, target: Union[int,float,str], lower_bound: float, upper_bound: float, lower_mid_bound: Optional[float] = None, upper_mid_bound: Optional[float] = None, status_type: Optional[str] = None, status_graphic: Optional[str] = None): - + def set_kpi( + self, + measure_name: str, + target: Union[int, float, str], + lower_bound: float, + upper_bound: float, + lower_mid_bound: Optional[float] = None, + upper_mid_bound: Optional[float] = None, + status_type: Optional[str] = None, + status_graphic: Optional[str] = None, + ): """ Sets the properties to add/update a KPI for a measure. @@ -1983,60 +2306,94 @@ def set_kpi(self, measure_name: str, target: Union[int,float,str], lower_bound: status_graphic : str, default=None The status graphic for the KPI. Defaults to 'Three Circles Colored'. 
- + Returns ------- """ - #https://github.com/m-kovalsky/Tabular/blob/master/KPI%20Graphics.md + # https://github.com/m-kovalsky/Tabular/blob/master/KPI%20Graphics.md if measure_name == target: - print(f"The 'target' parameter cannot be the same measure as the 'measure_name' parameter.") + print( + f"The 'target' parameter cannot be the same measure as the 'measure_name' parameter." + ) return if status_graphic is None: - status_graphic = 'Three Circles Colored' + status_graphic = "Three Circles Colored" - statusType = ['Linear', 'LinearReversed', 'Centered', 'CenteredReversed'] - status_type = status_type.title().replace(' ','') + statusType = ["Linear", "LinearReversed", "Centered", "CenteredReversed"] + status_type = status_type.title().replace(" ", "") if status_type is None: - status_type = 'Linear' + status_type = "Linear" if status_type not in statusType: - print(f"'{status_type}' is an invalid status_type. Please choose from these options: {statusType}.") + print( + f"'{status_type}' is an invalid status_type. Please choose from these options: {statusType}." + ) return - if status_type in ['Linear', 'LinearReversed']: + if status_type in ["Linear", "LinearReversed"]: if upper_bound is not None or lower_mid_bound is not None: - print(f"The 'upper_mid_bound' and 'lower_mid_bound' parameters are not used in the 'Linear' and 'LinearReversed' status types. Make sure these parameters are set to None.") + print( + f"The 'upper_mid_bound' and 'lower_mid_bound' parameters are not used in the 'Linear' and 'LinearReversed' status types. Make sure these parameters are set to None." + ) return elif upper_bound <= lower_bound: print(f"The upper_bound must be greater than the lower_bound.") return - - if status_type in ['Centered', 'CenteredReversed']: + + if status_type in ["Centered", "CenteredReversed"]: if upper_mid_bound is None or lower_mid_bound is None: - print(f"The 'upper_mid_bound' and 'lower_mid_bound' parameters are necessary in the 'Centered' and 'CenteredReversed' status types.") + print( + f"The 'upper_mid_bound' and 'lower_mid_bound' parameters are necessary in the 'Centered' and 'CenteredReversed' status types." + ) return elif upper_bound <= upper_mid_bound: print(f"The upper_bound must be greater than the upper_mid_bound.") elif upper_mid_bound <= lower_mid_bound: - print(f"The upper_mid_bound must be greater than the lower_mid_bound.") + print( + f"The upper_mid_bound must be greater than the lower_mid_bound." + ) elif lower_mid_bound <= lower_bound: print(f"The lower_mid_bound must be greater than the lower_bound.") try: - table_name = next(m.Parent.Name for m in self.all_measures() if m.Name == measure_name) + table_name = next( + m.Parent.Name for m in self.all_measures() if m.Name == measure_name + ) except: - print(f"The '{measure_name}' measure does not exist in the '{dataset}' semantic model within the '{workspace}'.") + print( + f"The '{measure_name}' measure does not exist in the '{dataset}' semantic model within the '{workspace}'." 
+ ) return - - graphics = ['Cylinder', 'Five Bars Colored', 'Five Boxes Colored', 'Gauge - Ascending', 'Gauge - Descending', 'Road Signs', 'Shapes', 'Standard Arrow', 'Three Circles Colored', 'Three Flags Colored', 'Three Stars Colored', 'Three Symbols Uncircled Colored', 'Traffic Light', 'Traffic Light - Single', 'Variance Arrow', 'Status Arrow - Ascending', 'Status Arrow - Descending'] + + graphics = [ + "Cylinder", + "Five Bars Colored", + "Five Boxes Colored", + "Gauge - Ascending", + "Gauge - Descending", + "Road Signs", + "Shapes", + "Standard Arrow", + "Three Circles Colored", + "Three Flags Colored", + "Three Stars Colored", + "Three Symbols Uncircled Colored", + "Traffic Light", + "Traffic Light - Single", + "Variance Arrow", + "Status Arrow - Ascending", + "Status Arrow - Descending", + ] if status_graphic not in graphics: - print(f"The '{status_graphic}' status graphic is not valid. Please choose from these options: {graphics}.") + print( + f"The '{status_graphic}' status graphic is not valid. Please choose from these options: {graphics}." + ) return measure_target = True @@ -2047,22 +2404,28 @@ def set_kpi(self, measure_name: str, target: Union[int,float,str], lower_bound: measure_target = False except: try: - tgt = next(format_dax_object_name(m.Parent.Name, m.Name) for m in self.all_measures() if m.Name == target) + tgt = next( + format_dax_object_name(m.Parent.Name, m.Name) + for m in self.all_measures() + if m.Name == target + ) except: - print(f"The '{target}' measure does not exist in the '{dataset}' semantic model within the '{workspace}'.") + print( + f"The '{target}' measure does not exist in the '{dataset}' semantic model within the '{workspace}'." + ) if measure_target: expr = f"var x = [{measure_name}]/[{target}]\nreturn" else: expr = f"var x = [{measure_name}\nreturn" - if status_type == 'Linear': + if status_type == "Linear": expr = f"{expr}\nif(isblank(x),blank(),\n\tif(x<{lower_bound},-1,\n\t\tif(x<{upper_bound},0,1)))" - elif status_type == 'LinearReversed': + elif status_type == "LinearReversed": expr = f"{expr}\nif(isblank(x),blank(),\nif(x<{lower_bound},1,\n\t\tif(x<{upper_bound},0,-1)))" - elif status_type == 'Centered': + elif status_type == "Centered": expr = f"{expr}\nif(isblank(x),blank(),\n\tif(x<{lower_mid_bound},\n\t\tif(x<{lower_bound},-1,0),\n\t\t\tif(x<{upper_mid_bound},1,\n\t\t\t\tif(x<{upper_bound}0,-1))))" - elif status_type == 'CenteredReversed': + elif status_type == "CenteredReversed": expr = f"{expr}\nif(isblank(x),blank(),\n\tif(x<{lower_mid_bound},\n\t\tif(x<{lower_bound},1,0),\n\t\t\tif(x<{upper_mid_bound},-1,\n\t\t\t\tif(x<{upper_bound}0,1))))" kpi = TOM.KPI() @@ -2079,7 +2442,6 @@ def set_kpi(self, measure_name: str, target: Union[int,float,str], lower_bound: ms.KPI = kpi def set_aggregations(self, table_name: str, agg_table_name: str): - """ Sets the aggregations (alternate of) for all the columns in an aggregation table based on a base table. @@ -2089,7 +2451,7 @@ def set_aggregations(self, table_name: str, agg_table_name: str): Name of the base table. agg_table_name : str Name of the aggregation table. 
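# A minimal sketch of set_kpi above, assuming a connected, writeable
# TOMWrapper `tom` and hypothetical 'Sales Amount' / 'Sales Target' measures.
# The 'Centered' status type uses all four bounds; the 'Linear'/'LinearReversed'
# guard above tests upper_bound is not None while its message refers to
# 'upper_mid_bound', so the 'Centered' path used here avoids that branch.
def add_sales_kpi(tom):
    tom.set_kpi(
        measure_name="Sales Amount",
        target="Sales Target",  # may also be a literal number
        lower_bound=0.6,
        lower_mid_bound=0.8,
        upper_mid_bound=1.0,
        upper_bound=1.2,
        status_type="Centered",
        status_graphic="Three Circles Colored",
    )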
- + Returns ------- @@ -2099,15 +2461,26 @@ def set_aggregations(self, table_name: str, agg_table_name: str): dataType = c.DataType - if dataType in [TOM.DataType.String, TOM.DataType.Boolean, TOM.DataType.DateTime]: - sumType = 'GroupBy' + if dataType in [ + TOM.DataType.String, + TOM.DataType.Boolean, + TOM.DataType.DateTime, + ]: + sumType = "GroupBy" else: - sumType = 'Sum' + sumType = "Sum" - self.set_alternate_of(table_name = agg_table_name, column_name = c.Name, base_table = table_name, base_column = c.Name, summarization_type = sumType) - - def set_is_available_in_mdx(self, table_name: str, column_name: str, value: Optional[bool] = False): + self.set_alternate_of( + table_name=agg_table_name, + column_name=c.Name, + base_table=table_name, + base_column=c.Name, + summarization_type=sumType, + ) + def set_is_available_in_mdx( + self, table_name: str, column_name: str, value: Optional[bool] = False + ): """ Sets the IsAvailableInMdx property on a column. @@ -2119,7 +2492,7 @@ def set_is_available_in_mdx(self, table_name: str, column_name: str, value: Opti Name of the column. value : bool, default=False The IsAvailableInMdx property value. - + Returns ------- @@ -2127,8 +2500,9 @@ def set_is_available_in_mdx(self, table_name: str, column_name: str, value: Opti self.model.Tables[table_name].Columns[column_name].IsAvailableInMdx = value - def set_summarize_by(self, table_name: str, column_name: str, value: Optional[str] = None): - + def set_summarize_by( + self, table_name: str, column_name: str, value: Optional[str] = None + ): """ Sets the SummarizeBy property on a column. @@ -2141,27 +2515,43 @@ def set_summarize_by(self, table_name: str, column_name: str, value: Optional[st value : bool, default=None The SummarizeBy property value. Defaults to none which resolves to 'Default'. - + Returns ------- """ - values = ['Default', 'None', 'Sum', 'Min', 'Max', 'Count', 'Average', 'DistinctCount'] - #https://learn.microsoft.com/en-us/dotnet/api/microsoft.analysisservices.tabular.column.summarizeby?view=analysisservices-dotnet#microsoft-analysisservices-tabular-column-summarizeby + values = [ + "Default", + "None", + "Sum", + "Min", + "Max", + "Count", + "Average", + "DistinctCount", + ] + # https://learn.microsoft.com/en-us/dotnet/api/microsoft.analysisservices.tabular.column.summarizeby?view=analysisservices-dotnet#microsoft-analysisservices-tabular-column-summarizeby if value is None: - value = 'Default' - value = value.capitalize().replace('Distinctcount', 'DistinctCount').replace('Avg', 'Average') + value = "Default" + value = ( + value.capitalize() + .replace("Distinctcount", "DistinctCount") + .replace("Avg", "Average") + ) if value not in values: - print(f"'{value}' is not a valid value for the SummarizeBy property. These are the valid values: {values}.") + print( + f"'{value}' is not a valid value for the SummarizeBy property. These are the valid values: {values}." + ) return - self.model.Tables[table_name].Columns[column_name].SummarizeBy = System.Enum.Parse(TOM.AggregateFunction, value) + self.model.Tables[table_name].Columns[column_name].SummarizeBy = ( + System.Enum.Parse(TOM.AggregateFunction, value) + ) def set_direct_lake_behavior(self, direct_lake_behavior: str): - """ Sets the Direct Lake Behavior property for a semantic model. @@ -2169,30 +2559,45 @@ def set_direct_lake_behavior(self, direct_lake_behavior: str): ---------- direct_lake_behavior : str The DirectLakeBehavior property value. 
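# A minimal sketch of the column-property setters above, assuming a connected,
# writeable TOMWrapper `tom`, a hypothetical 'Sales' table with a
# 'SalesOrderNumber' column, and a 'Sales_Agg' aggregation table.
def tune_columns(tom):
    # Hide a degenerate key from MDX clients and stop implicit summarization.
    tom.set_is_available_in_mdx(
        table_name="Sales", column_name="SalesOrderNumber", value=False
    )
    tom.set_summarize_by(
        table_name="Sales", column_name="SalesOrderNumber", value="None"
    )
    # Map every column of the aggregation table back to its base table.
    tom.set_aggregations(table_name="Sales", agg_table_name="Sales_Agg")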
- + Returns ------- """ direct_lake_behavior = direct_lake_behavior.capitalize() - if direct_lake_behavior.startswith('Auto'): - direct_lake_behavior = 'Automatic' - elif direct_lake_behavior.startswith('Directl') or direct_lake_behavior == 'Dl': - direct_lake_behavior = 'DirectLakeOnly' - elif direct_lake_behavior.startswith('Directq') or direct_lake_behavior == 'Dq': - direct_lake_behavior = 'DirectQueryOnly' - - dlValues = ['Automatic', 'DirectLakeOnly', 'DirectQueryOnly'] + if direct_lake_behavior.startswith("Auto"): + direct_lake_behavior = "Automatic" + elif ( + direct_lake_behavior.startswith("Directl") + or direct_lake_behavior == "Dl" + ): + direct_lake_behavior = "DirectLakeOnly" + elif ( + direct_lake_behavior.startswith("Directq") + or direct_lake_behavior == "Dq" + ): + direct_lake_behavior = "DirectQueryOnly" + + dlValues = ["Automatic", "DirectLakeOnly", "DirectQueryOnly"] if direct_lake_behavior not in dlValues: - print(f"The 'direct_lake_behavior' parameter must be one of these values: {dlValues}.") + print( + f"The 'direct_lake_behavior' parameter must be one of these values: {dlValues}." + ) return - self.model.DirectLakeBehavior = System.Enum.Parse(TOM.DirectLakeBehavior, direct_lake_behavior) - - def add_table(self, name: str, description: Optional[str] = None, data_category: Optional[str] = None, hidden: Optional[bool] = False): + self.model.DirectLakeBehavior = System.Enum.Parse( + TOM.DirectLakeBehavior, direct_lake_behavior + ) + def add_table( + self, + name: str, + description: Optional[str] = None, + data_category: Optional[str] = None, + hidden: Optional[bool] = False, + ): """ Adds a table to the semantic model. @@ -2206,7 +2611,7 @@ def add_table(self, name: str, description: Optional[str] = None, data_category: The data category for the table. hidden : bool, default=False Whether the table is hidden or visible. - + Returns ------- @@ -2221,8 +2626,14 @@ def add_table(self, name: str, description: Optional[str] = None, data_category: t.Hidden = hidden self.model.Tables.Add(t) - def add_calculated_table(self, name: str, expression: str, description: Optional[str] = None, data_category: Optional[str] = None, hidden: Optional[bool] = False): - + def add_calculated_table( + self, + name: str, + expression: str, + description: Optional[str] = None, + data_category: Optional[str] = None, + hidden: Optional[bool] = False, + ): """ Adds a calculated table to the semantic model. @@ -2238,7 +2649,7 @@ def add_calculated_table(self, name: str, expression: str, description: Optional The data category for the table. hidden : bool, default=False Whether the table is hidden or visible. - + Returns ------- @@ -2262,7 +2673,6 @@ def add_calculated_table(self, name: str, expression: str, description: Optional self.model.Tables.Add(t) def add_field_parameter(self, table_name: str, objects: List[str]): - """ Adds a table to the semantic model. @@ -2271,10 +2681,10 @@ def add_field_parameter(self, table_name: str, objects: List[str]): table_name : str Name of the table. objects : List[str] - The columns/measures to be included in the field parameter. + The columns/measures to be included in the field parameter. Columns must be specified as such : 'Table Name'[Column Name]. Measures may be formatted as '[Measure Name]' or 'Measure Name'. 
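# A minimal sketch of the table-creation helpers above, assuming a connected,
# writeable TOMWrapper `tom`; the table name and DAX expression are
# illustrative only.
def add_refresh_timestamp_table(tom):
    tom.add_calculated_table(
        name="Last Refresh",
        expression='ROW("Last Refresh", UTCNOW())',
        description="Single-row table recording the last refresh time.",
        hidden=True,
    )
    # Shorthand such as 'dl' or 'auto' is normalized by the setter above.
    tom.set_direct_lake_behavior("DirectLakeOnly")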
- + Returns ------- @@ -2284,44 +2694,88 @@ def add_field_parameter(self, table_name: str, objects: List[str]): print(f"The 'objects' parameter must be a list of columns/measures.") return if len(objects) == 1: - print(f"There must be more than one object (column/measure) within the objects parameter.") + print( + f"There must be more than one object (column/measure) within the objects parameter." + ) return - - expr = '' - i=0 + + expr = "" + i = 0 for obj in objects: success = False for m in self.all_measures(): - if obj == '[' + m.Name + ']' or obj == m.Name: - expr = expr + '\n\t' + '("' + m.Name + '", NAMEOF([' + m.Name + ']), ' + str(i) + '),' + if obj == "[" + m.Name + "]" or obj == m.Name: + expr = ( + expr + + "\n\t" + + '("' + + m.Name + + '", NAMEOF([' + + m.Name + + "]), " + + str(i) + + ")," + ) success = True for c in self.all_columns(): fullObjName = format_dax_object_name(c.Parent.Name, c.Name) - if obj == fullObjName or obj == c.Parent.Name + '[' + c.Name + ']': - expr = expr + '\n\t' + '("' + c.Name + '", NAMEOF(' + fullObjName + '), ' + str(i) + '),' + if obj == fullObjName or obj == c.Parent.Name + "[" + c.Name + "]": + expr = ( + expr + + "\n\t" + + '("' + + c.Name + + '", NAMEOF(' + + fullObjName + + "), " + + str(i) + + ")," + ) success = True if not success: - print(f"The '{obj}' object was not found in the '{dataset}' semantic model.") + print( + f"The '{obj}' object was not found in the '{dataset}' semantic model." + ) return else: - i+=1 + i += 1 - expr = '{' + expr.rstrip(',') + '\n}' + expr = "{" + expr.rstrip(",") + "\n}" - self.add_calculated_table(name = table_name, expression = expr) + self.add_calculated_table(name=table_name, expression=expr) - col2 = table_name + ' Fields' - col3 = table_name + ' Order' + col2 = table_name + " Fields" + col3 = table_name + " Order" - self.add_calculated_table_column(table_name = table_name, column_name = table_name, source_column = '[Value1]', data_type = 'String', hidden = False ) - self.add_calculated_table_column(table_name = table_name, column_name = col2, source_column = '[Value2]', data_type = 'String', hidden = True ) - self.add_calculated_table_column(table_name = table_name, column_name = col3, source_column = '[Value3]', data_type = 'Int64', hidden = True ) + self.add_calculated_table_column( + table_name=table_name, + column_name=table_name, + source_column="[Value1]", + data_type="String", + hidden=False, + ) + self.add_calculated_table_column( + table_name=table_name, + column_name=col2, + source_column="[Value2]", + data_type="String", + hidden=True, + ) + self.add_calculated_table_column( + table_name=table_name, + column_name=col3, + source_column="[Value3]", + data_type="Int64", + hidden=True, + ) - self.set_extended_property(self = self, - object = self.model.Tables[table_name].Columns[col2], - extended_property_type = 'Json', - name = 'ParameterMetadata', - value = '{"version":3,"kind":2}') + self.set_extended_property( + self=self, + object=self.model.Tables[table_name].Columns[col2], + extended_property_type="Json", + name="ParameterMetadata", + value='{"version":3,"kind":2}', + ) rcd = TOM.RelatedColumnDetails() gpc = TOM.GroupByColumn() @@ -2329,19 +2783,20 @@ def add_field_parameter(self, table_name: str, objects: List[str]): rcd.GroupByColumns.Add(gpc) # Update column properties - self.model.Tables[table_name].Columns[col2].SortByColumn = self.model.Tables[table_name].Columns[col3] + self.model.Tables[table_name].Columns[col2].SortByColumn = ( + self.model.Tables[table_name].Columns[col3] + ) 
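# A minimal sketch of add_field_parameter above, assuming a connected,
# writeable TOMWrapper `tom`, hypothetical 'Sales Amount' and 'Order Count'
# measures, and a 'Product'[Color] column, using the formats described in the
# docstring. (The docstring's summary line still reads "Adds a table", and
# the set_extended_property call above passes self=self to a bound method,
# which looks unintended.)
def add_metric_selector(tom):
    tom.add_field_parameter(
        table_name="Metric Selector",
        objects=["[Sales Amount]", "[Order Count]", "'Product'[Color]"],
    )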
self.model.Tables[table_name].Columns[table_name].RelatedColumnDetails = rcd fpAdded.append(table_name) def remove_vertipaq_annotations(self): - """ Removes the annotations set using the [set_vertipaq_annotations] function. Parameters ---------- - + Returns ------- @@ -2349,92 +2804,138 @@ def remove_vertipaq_annotations(self): for t in self.model.Tables: for a in t.Annotations: - if a.Name.startswith('Vertipaq_'): - self.remove_annotation(object = t, name = a.Name) + if a.Name.startswith("Vertipaq_"): + self.remove_annotation(object=t, name=a.Name) for c in t.Columns: for a in c.Annotations: - if a.Name.startswith('Vertipaq_'): - self.remove_annotation(object = c, name = a.Name) + if a.Name.startswith("Vertipaq_"): + self.remove_annotation(object=c, name=a.Name) for h in t.Hierarchies: for a in h.Annotations: - if a.Name.startswith('Vertipaq_'): - self.remove_annotation(object = h, name = a.Name) + if a.Name.startswith("Vertipaq_"): + self.remove_annotation(object=h, name=a.Name) for p in t.Partitions: for a in p.Annotations: - if a.Name.startswith('Vertipaq_'): - self.remove_annotation(object = p, name = a.Name) + if a.Name.startswith("Vertipaq_"): + self.remove_annotation(object=p, name=a.Name) for r in self.model.Relationships: for a in r.Annotations: - if a.Name.startswith('Veripaq_'): - self.remove_annotation(object = r, name = a.Name) + if a.Name.startswith("Veripaq_"): + self.remove_annotation(object=r, name=a.Name) def set_vertipaq_annotations(self): - """ Saves Vertipaq Analyzer statistics as annotations on objects in the semantic model. Parameters ---------- - + Returns ------- """ - dfT = fabric.list_tables(dataset = dataset, workspace = workspace, extended=True) - dfC = fabric.list_columns(dataset = dataset, workspace = workspace, extended=True) - #intList = ['Total Size']#, 'Data Size', 'Dictionary Size', 'Hierarchy Size'] - dfCSum = dfC.groupby(['Table Name'])['Total Size'].sum().reset_index() - dfTable = pd.merge(dfT[['Name', 'Type', 'Row Count']], dfCSum[['Table Name', 'Total Size']], left_on = 'Name', right_on = 'Table Name', how = 'inner') - dfP = fabric.list_partitions(dataset = dataset, workspace = workspace, extended=True) - dfP['Records per Segment'] = round(dfP['Record Count'] / dfP['Segment Count'],2) - dfH = fabric.list_hierarchies(dataset = dataset, workspace = workspace, extended=True) - dfR = list_relationships(dataset = dataset, workspace = workspace, extended=True) + dfT = fabric.list_tables( + dataset=dataset, workspace=workspace, extended=True + ) + dfC = fabric.list_columns( + dataset=dataset, workspace=workspace, extended=True + ) + # intList = ['Total Size']#, 'Data Size', 'Dictionary Size', 'Hierarchy Size'] + dfCSum = dfC.groupby(["Table Name"])["Total Size"].sum().reset_index() + dfTable = pd.merge( + dfT[["Name", "Type", "Row Count"]], + dfCSum[["Table Name", "Total Size"]], + left_on="Name", + right_on="Table Name", + how="inner", + ) + dfP = fabric.list_partitions( + dataset=dataset, workspace=workspace, extended=True + ) + dfP["Records per Segment"] = round( + dfP["Record Count"] / dfP["Segment Count"], 2 + ) + dfH = fabric.list_hierarchies( + dataset=dataset, workspace=workspace, extended=True + ) + dfR = list_relationships( + dataset=dataset, workspace=workspace, extended=True + ) for t in self.model.Tables: - dfT_filt = dfTable[dfTable['Name'] == t.Name] - rowCount = str(dfT_filt['Row Count'].iloc[0]) - totalSize = str(dfT_filt['Total Size'].iloc[0]) - self.set_annotation(object = t, name = 'Vertipaq_RowCount', value = rowCount) - 
self.set_annotation(object = t, name = 'Vertipaq_TableSize', value = totalSize) + dfT_filt = dfTable[dfTable["Name"] == t.Name] + rowCount = str(dfT_filt["Row Count"].iloc[0]) + totalSize = str(dfT_filt["Total Size"].iloc[0]) + self.set_annotation(object=t, name="Vertipaq_RowCount", value=rowCount) + self.set_annotation( + object=t, name="Vertipaq_TableSize", value=totalSize + ) for c in t.Columns: - dfC_filt = dfC[(dfC['Table Name'] == t.Name) & (dfC['Column Name'] == c.Name)] - totalSize = str(dfC_filt['Total Size'].iloc[0]) - dataSize = str(dfC_filt['Data Size'].iloc[0]) - dictSize = str(dfC_filt['Dictionary Size'].iloc[0]) - hierSize = str(dfC_filt['Hierarchy Size'].iloc[0]) - card = str(dfC_filt['Column Cardinality'].iloc[0]) - self.set_annotation(object = c, name = 'Vertipaq_TotalSize', value = totalSize) - self.set_annotation(object = c, name = 'Vertipaq_DataSize', value = dataSize) - self.set_annotation(object = c, name = 'Vertipaq_DictionarySize', value = dictSize) - self.set_annotation(object = c, name = 'Vertipaq_HierarchySize', value = hierSize) - self.set_annotation(object = c, name = 'Vertipaq_Cardinality', value = card) + dfC_filt = dfC[ + (dfC["Table Name"] == t.Name) & (dfC["Column Name"] == c.Name) + ] + totalSize = str(dfC_filt["Total Size"].iloc[0]) + dataSize = str(dfC_filt["Data Size"].iloc[0]) + dictSize = str(dfC_filt["Dictionary Size"].iloc[0]) + hierSize = str(dfC_filt["Hierarchy Size"].iloc[0]) + card = str(dfC_filt["Column Cardinality"].iloc[0]) + self.set_annotation( + object=c, name="Vertipaq_TotalSize", value=totalSize + ) + self.set_annotation( + object=c, name="Vertipaq_DataSize", value=dataSize + ) + self.set_annotation( + object=c, name="Vertipaq_DictionarySize", value=dictSize + ) + self.set_annotation( + object=c, name="Vertipaq_HierarchySize", value=hierSize + ) + self.set_annotation( + object=c, name="Vertipaq_Cardinality", value=card + ) for p in t.Partitions: - dfP_filt = dfP[(dfP['Table Name'] == t.Name) & (dfP['Partition Name'] == p.Name)] - recordCount = str(dfP_filt['Record Count'].iloc[0]) - segmentCount = str(dfP_filt['Segment Count'].iloc[0]) - rpS = str(dfP_filt['Records per Segment'].iloc[0]) - self.set_annotation(object = p, name = 'Vertipaq_RecordCount', value = recordCount) - self.set_annotation(object = p, name = 'Vertipaq_SegmentCount', value = segmentCount) - self.set_annotation(object = p, name = 'Vertipaq_RecordsPerSegment', value = rpS) + dfP_filt = dfP[ + (dfP["Table Name"] == t.Name) + & (dfP["Partition Name"] == p.Name) + ] + recordCount = str(dfP_filt["Record Count"].iloc[0]) + segmentCount = str(dfP_filt["Segment Count"].iloc[0]) + rpS = str(dfP_filt["Records per Segment"].iloc[0]) + self.set_annotation( + object=p, name="Vertipaq_RecordCount", value=recordCount + ) + self.set_annotation( + object=p, name="Vertipaq_SegmentCount", value=segmentCount + ) + self.set_annotation( + object=p, name="Vertipaq_RecordsPerSegment", value=rpS + ) for h in t.Hierarchies: - dfH_filt = dfH[(dfH['Table Name'] == t.Name) & (dfH['Hierarchy Name'] == h.Name)] - usedSize = str(dfH_filt['Used Size'].iloc[0]) - self.set_annotation(object = h, name = 'Vertipaq_UsedSize', value = usedSize) + dfH_filt = dfH[ + (dfH["Table Name"] == t.Name) + & (dfH["Hierarchy Name"] == h.Name) + ] + usedSize = str(dfH_filt["Used Size"].iloc[0]) + self.set_annotation( + object=h, name="Vertipaq_UsedSize", value=usedSize + ) for r in self.model.Relationships: - dfR_filt = dfR[dfR['Relationship Name'] == r.Name] - relSize = str(dfR_filt['Used Size'].iloc[0]) - 
self.set_annotation(object = r, name = 'Vertipaq_UsedSize', value = relSize) + dfR_filt = dfR[dfR["Relationship Name"] == r.Name] + relSize = str(dfR_filt["Used Size"].iloc[0]) + self.set_annotation(object=r, name="Vertipaq_UsedSize", value=relSize) try: - runId = self.get_annotation_value(object = self.model, name = 'Vertipaq_Run') + runId = self.get_annotation_value( + object=self.model, name="Vertipaq_Run" + ) runId = str(int(runId) + 1) except: - runId = '1' - self.set_annotation(object = self.model, name = 'Vertipaq_Run', value = runId) - - def row_count(self, object: Union['TOM.Partition', 'TOM.Table']): + runId = "1" + self.set_annotation(object=self.model, name="Vertipaq_Run", value=runId) + def row_count(self, object: Union["TOM.Partition", "TOM.Table"]): """ Obtains the row count of a table or partition within a semantic model. @@ -2442,24 +2943,27 @@ def row_count(self, object: Union['TOM.Partition', 'TOM.Table']): ---------- object : TOM Object The table/partition object within the semantic model. - + Returns ------- int Number of rows within the TOM object. """ - + objType = object.ObjectType - + if objType == TOM.ObjectType.Table: - result = self.get_annotation_value(object = object, name = 'Vertipaq_RowCount') + result = self.get_annotation_value( + object=object, name="Vertipaq_RowCount" + ) elif objType == TOM.ObjectType.Partition: - result = self.get_annotation_value(object = object, name = 'Vertipaq_RecordCount') + result = self.get_annotation_value( + object=object, name="Vertipaq_RecordCount" + ) return int(result) - - def records_per_segment(self, object: 'TOM.Partition'): + def records_per_segment(self, object: "TOM.Partition"): """ Obtains the records per segment of a partition within a semantic model. @@ -2467,22 +2971,23 @@ def records_per_segment(self, object: 'TOM.Partition'): ---------- object : TOM Object The partition object within the semantic model. - + Returns ------- float Number of records per segment within the partition. """ - + objType = object.ObjectType - + if objType == TOM.ObjectType.Partition: - result = self.get_annotation_value(object = object, name = 'Vertipaq_RecordsPerSegment') + result = self.get_annotation_value( + object=object, name="Vertipaq_RecordsPerSegment" + ) return float(result) - - def used_size(self, object: Union['TOM.Hierarchy', 'TOM.Relationship']): + def used_size(self, object: Union["TOM.Hierarchy", "TOM.Relationship"]): """ Obtains the used size of a hierarchy or relationship within a semantic model. @@ -2490,24 +2995,27 @@ def used_size(self, object: Union['TOM.Hierarchy', 'TOM.Relationship']): ---------- object : TOM Object The hierarhcy/relationship object within the semantic model. - + Returns ------- int Used size of the TOM object. """ - + objType = object.ObjectType - + if objType == TOM.ObjectType.Hierarchy: - result = self.get_annotation_value(object = object, name = 'Vertipaq_UsedSize') + result = self.get_annotation_value( + object=object, name="Vertipaq_UsedSize" + ) elif objType == TOM.ObjectType.Relationship: - result = self.get_annotation_value(object = object, name = 'Vertipaq_UsedSize') + result = self.get_annotation_value( + object=object, name="Vertipaq_UsedSize" + ) return int(result) - def data_size(self, column: 'TOM.Column'): - + def data_size(self, column: "TOM.Column"): """ Obtains the data size of a column within a semantic model. @@ -2515,22 +3023,23 @@ def data_size(self, column: 'TOM.Column'): ---------- column : TOM Object The column object within the semantic model. 
- + Returns ------- int Data size of the TOM column. """ - + objType = column.ObjectType - + if objType == TOM.ObjectType.Column: - result = self.get_annotation_value(object = column, name = 'Vertipaq_DataSize') + result = self.get_annotation_value( + object=column, name="Vertipaq_DataSize" + ) return int(result) - def dictionary_size(self, column: 'TOM.Column'): - + def dictionary_size(self, column: "TOM.Column"): """ Obtains the dictionary size of a column within a semantic model. @@ -2538,7 +3047,7 @@ def dictionary_size(self, column: 'TOM.Column'): ---------- column : TOM Object The column object within the semantic model. - + Returns ------- int @@ -2548,12 +3057,13 @@ def dictionary_size(self, column: 'TOM.Column'): objType = column.ObjectType if objType == TOM.ObjectType.Column: - result = self.get_annotation_value(object = column, name = 'Vertipaq_DictionarySize') + result = self.get_annotation_value( + object=column, name="Vertipaq_DictionarySize" + ) return int(result) - - def total_size(self, object: Union['TOM.Table', 'TOM.Column']): + def total_size(self, object: Union["TOM.Table", "TOM.Column"]): """ Obtains the data size of a table/column within a semantic model. @@ -2561,7 +3071,7 @@ def total_size(self, object: Union['TOM.Table', 'TOM.Column']): ---------- object : TOM Object The table/column object within the semantic model. - + Returns ------- int @@ -2569,16 +3079,19 @@ def total_size(self, object: Union['TOM.Table', 'TOM.Column']): """ objType = object.ObjectType - + if objType == TOM.ObjectType.Column: - result = self.get_annotation_value(object = object, name = 'Vertipaq_TotalSize') + result = self.get_annotation_value( + object=object, name="Vertipaq_TotalSize" + ) elif objType == TOM.ObjectType.Table: - result = self.get_annotation_value(object = object, name = 'Vertipaq_TotalSize') + result = self.get_annotation_value( + object=object, name="Vertipaq_TotalSize" + ) return int(result) - def cardinality(self, column: 'TOM.Column'): - + def cardinality(self, column: "TOM.Column"): """ Obtains the cardinality of a column within a semantic model. @@ -2586,22 +3099,23 @@ def cardinality(self, column: 'TOM.Column'): ---------- column : TOM Object The column object within the semantic model. - + Returns ------- int Cardinality of the TOM column. """ - + objType = column.ObjectType - + if objType == TOM.ObjectType.Column: - result = self.get_annotation_value(object = column, name = 'Vertipaq_Cardinality') + result = self.get_annotation_value( + object=column, name="Vertipaq_Cardinality" + ) - return int(result) - - def depends_on(self, object, dependencies: pd.DataFrame): + return int(result) + def depends_on(self, object, dependencies: pd.DataFrame): """ Obtains the objects on which the specified object depends. @@ -2611,7 +3125,7 @@ def depends_on(self, object, dependencies: pd.DataFrame): The TOM object within the semantic model. dependencies : pandas.DataFrame A pandas dataframe with the output of the 'get_model_calc_dependencies' function. 
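# A minimal sketch of the Vertipaq annotation helpers above, assuming a
# connected TOMWrapper `tom` and a hypothetical 'Sales' table with a
# 'SalesAmount' column. Statistics are persisted as annotations first, then
# read back through the typed accessors.
def sales_table_stats(tom):
    tom.set_vertipaq_annotations()
    table = tom.model.Tables["Sales"]
    return {
        "rows": tom.row_count(table),
        "records_per_segment": [
            tom.records_per_segment(p) for p in table.Partitions
        ],
        "sales_amount_data_size": tom.data_size(table.Columns["SalesAmount"]),
    }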
- + Returns ------- Microsoft.AnalysisServices.Tabular.TableCollection, Microsoft.AnalysisServices.Tabular.ColumnCollection, Microsoft.AnalysisServices.Tabular.MeasureCollection @@ -2625,10 +3139,28 @@ def depends_on(self, object, dependencies: pd.DataFrame): if objType == TOM.ObjectType.Table: objParentName = objName - fil = dependencies[(dependencies['Object Type'] == objType) & (dependencies['Table Name'] == objParentName) & (dependencies['Object Name'] == objName)] - meas = fil[fil['Referenced Object Type'] == 'Measure']['Referenced Object'].unique().tolist() - cols = fil[fil['Referenced Object Type'] == 'Column']['Referenced Full Object Name'].unique().tolist() - tbls = fil[fil['Referenced Object Type'] == 'Table']['Referenced Table'].unique().tolist() + fil = dependencies[ + (dependencies["Object Type"] == objType) + & (dependencies["Table Name"] == objParentName) + & (dependencies["Object Name"] == objName) + ] + meas = ( + fil[fil["Referenced Object Type"] == "Measure"]["Referenced Object"] + .unique() + .tolist() + ) + cols = ( + fil[fil["Referenced Object Type"] == "Column"][ + "Referenced Full Object Name" + ] + .unique() + .tolist() + ) + tbls = ( + fil[fil["Referenced Object Type"] == "Table"]["Referenced Table"] + .unique() + .tolist() + ) for m in self.all_measures(): if m.Name in meas: yield m @@ -2640,7 +3172,6 @@ def depends_on(self, object, dependencies: pd.DataFrame): yield t def referenced_by(self, object, dependencies: pd.DataFrame): - """ Obtains the objects which reference the specified object. @@ -2650,7 +3181,7 @@ def referenced_by(self, object, dependencies: pd.DataFrame): The TOM object within the semantic model. dependencies : pandas.DataFrame A pandas dataframe with the output of the 'get_model_calc_dependencies' function. - + Returns ------- Microsoft.AnalysisServices.Tabular.TableCollection, Microsoft.AnalysisServices.Tabular.ColumnCollection, Microsoft.AnalysisServices.Tabular.MeasureCollection @@ -2664,10 +3195,24 @@ def referenced_by(self, object, dependencies: pd.DataFrame): if objType == TOM.ObjectType.Table: objParentName = objName - fil = dependencies[(dependencies['Referenced Object Type'] == objType) & (dependencies['Referenced Table'] == objParentName) & (dependencies['Referenced Object'] == objName)] - meas = fil[fil['Object Type'] == 'Measure']['Object Name'].unique().tolist() - cols = fil[fil['Object Type'].isin(['Column', 'Calc Column'])]['Full Object Name'].unique().tolist() - tbls = fil[fil['Object Type'].isin(['Table', 'Calc Table'])]['Table Name'].unique().tolist() + fil = dependencies[ + (dependencies["Referenced Object Type"] == objType) + & (dependencies["Referenced Table"] == objParentName) + & (dependencies["Referenced Object"] == objName) + ] + meas = fil[fil["Object Type"] == "Measure"]["Object Name"].unique().tolist() + cols = ( + fil[fil["Object Type"].isin(["Column", "Calc Column"])][ + "Full Object Name" + ] + .unique() + .tolist() + ) + tbls = ( + fil[fil["Object Type"].isin(["Table", "Calc Table"])]["Table Name"] + .unique() + .tolist() + ) for m in self.all_measures(): if m.Name in meas: yield m @@ -2678,8 +3223,9 @@ def referenced_by(self, object, dependencies: pd.DataFrame): if t.Name in tbls: yield t - def fully_qualified_measures(self, object: 'TOM.Measure', dependencies: pd.DataFrame): - + def fully_qualified_measures( + self, object: "TOM.Measure", dependencies: pd.DataFrame + ): """ Obtains all fully qualified measure references for a given object. 
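# A minimal sketch of the dependency walkers above, assuming a connected
# TOMWrapper `tom`, a `dependencies` DataFrame produced by
# get_model_calc_dependencies, and a hypothetical 'Gross Margin' measure.
def measure_lineage(tom, dependencies, measure_name="Gross Margin"):
    measure = next(m for m in tom.all_measures() if m.Name == measure_name)
    upstream = [o.Name for o in tom.depends_on(measure, dependencies)]
    downstream = [o.Name for o in tom.referenced_by(measure, dependencies)]
    return upstream, downstream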
@@ -2689,20 +3235,22 @@ def fully_qualified_measures(self, object: 'TOM.Measure', dependencies: pd.DataF The TOM object within the semantic model. dependencies : pandas.DataFrame A pandas dataframe with the output of the 'get_model_calc_dependencies' function. - + Returns ------- Microsoft.AnalysisServices.Tabular.MeasureCollection All fully qualified measure references for a given object. """ - - for obj in self.depends_on(object = object, dependencies=dependencies): + + for obj in self.depends_on(object=object, dependencies=dependencies): if obj.ObjectType == TOM.ObjectType.Measure: - if (obj.Parent.Name + obj.Name in object.Expression) or (format_dax_object_name(obj.Parent.Name, obj.Name) in object.Expression): + if (obj.Parent.Name + obj.Name in object.Expression) or ( + format_dax_object_name(obj.Parent.Name, obj.Name) + in object.Expression + ): yield obj - def unqualified_columns(self, object: 'TOM.Column', dependencies: pd.DataFrame): - + def unqualified_columns(self, object: "TOM.Column", dependencies: pd.DataFrame): """ Obtains all unqualified column references for a given object. @@ -2712,29 +3260,33 @@ def unqualified_columns(self, object: 'TOM.Column', dependencies: pd.DataFrame): The TOM object within the semantic model. dependencies : pandas.DataFrame A pandas dataframe with the output of the 'get_model_calc_dependencies' function. - + Returns ------- Microsoft.AnalysisServices.Tabular.ColumnCollection All unqualified column references for a given object. """ - + def create_pattern(a, b): - return r'(? 0: usingView = True - + return usingView - - def has_incremental_refresh_policy(self, table_name: str): + def has_incremental_refresh_policy(self, table_name: str): """ Identifies whether a table has an incremental refresh policy. @@ -2761,7 +3312,7 @@ def has_incremental_refresh_policy(self, table_name: str): ---------- table_name : str Name of the table. - + Returns ------- bool @@ -2775,9 +3326,8 @@ def has_incremental_refresh_policy(self, table_name: str): hasRP = True return hasRP - - def show_incremental_refresh_policy(self, table_name: str): + def show_incremental_refresh_policy(self, table_name: str): """ Prints the incremental refresh policy for a table. @@ -2785,7 +3335,7 @@ def show_incremental_refresh_policy(self, table_name: str): ---------- table_name : str Name of the table. - + Returns ------- @@ -2794,40 +3344,64 @@ def show_incremental_refresh_policy(self, table_name: str): rp = self.model.Tables[table_name].RefreshPolicy if rp is None: - print(f"The '{table_name}' table in the '{dataset}' semantic model within the '{workspace}' workspace does not have an incremental refresh policy.") - else: + print( + f"The '{table_name}' table in the '{dataset}' semantic model within the '{workspace}' workspace does not have an incremental refresh policy." + ) + else: print(f"Table Name: {table_name}") rwGran = str(rp.RollingWindowGranularity).lower() icGran = str(rp.IncrementalGranularity).lower() - if rp.RollingWindowPeriods > 1: - print(f"Archive data starting {start_bold}{rp.RollingWindowPeriods} {rwGran}s{end_bold} before refresh date.") + if rp.RollingWindowPeriods > 1: + print( + f"Archive data starting {start_bold}{rp.RollingWindowPeriods} {rwGran}s{end_bold} before refresh date." 
+ ) else: - print(f"Archive data starting {start_bold}{rp.RollingWindowPeriods} {rwGran}{end_bold} before refresh date.") - if rp.IncrementalPeriods > 1: - print(f"Incrementally refresh data {start_bold}{rp.IncrementalPeriods} {icGran}s{end_bold} before refresh date.") + print( + f"Archive data starting {start_bold}{rp.RollingWindowPeriods} {rwGran}{end_bold} before refresh date." + ) + if rp.IncrementalPeriods > 1: + print( + f"Incrementally refresh data {start_bold}{rp.IncrementalPeriods} {icGran}s{end_bold} before refresh date." + ) else: - print(f"Incrementally refresh data {start_bold}{rp.IncrementalPeriods} {icGran}{end_bold} before refresh date.") + print( + f"Incrementally refresh data {start_bold}{rp.IncrementalPeriods} {icGran}{end_bold} before refresh date." + ) if rp.Mode == TOM.RefreshPolicyMode.Hybrid: - print(f"{checked} Get the latest data in real time with DirectQuery (Premium only)") + print( + f"{checked} Get the latest data in real time with DirectQuery (Premium only)" + ) else: - print(f"{unchecked} Get the latest data in real time with DirectQuery (Premium only)") + print( + f"{unchecked} Get the latest data in real time with DirectQuery (Premium only)" + ) if rp.IncrementalPeriodsOffset == -1: print(f"{checked} Only refresh complete days") else: print(f"{unchecked} Only refresh complete days") if len(rp.PollingExpression) > 0: - pattern = r'\[([^\]]+)\]' + pattern = r"\[([^\]]+)\]" match = re.search(pattern, rp.PollingExpression) if match: col = match[0][1:-1] fullCol = format_dax_object_name(table_name, col) - print(f"{checked} Detect data changes: {start_bold}{fullCol}{end_bold}") + print( + f"{checked} Detect data changes: {start_bold}{fullCol}{end_bold}" + ) else: print(f"{unchecked} Detect data changes") - def update_incremental_refresh_policy(self, table_name: str, incremental_granularity: str, incremental_periods: int, rolling_window_granularity: str, rolling_window_periods: int, only_refresh_complete_days: Optional[bool] = False, detect_data_changes_column: Optional[str] = None): - + def update_incremental_refresh_policy( + self, + table_name: str, + incremental_granularity: str, + incremental_periods: int, + rolling_window_granularity: str, + rolling_window_periods: int, + only_refresh_complete_days: Optional[bool] = False, + detect_data_changes_column: Optional[str] = None, + ): """ Updates the incremental refresh policy for a table within a semantic model. @@ -2854,27 +3428,37 @@ def update_incremental_refresh_policy(self, table_name: str, incremental_granula """ - if not self.has_incremental_refresh_policy(table_name = table_name): - print(f"The '{table_name}' table does not have an incremental refresh policy.") + if not self.has_incremental_refresh_policy(table_name=table_name): + print( + f"The '{table_name}' table does not have an incremental refresh policy." + ) return - - incGran = ['Day', 'Month', 'Quarter', 'Year'] + + incGran = ["Day", "Month", "Quarter", "Year"] incremental_granularity = incremental_granularity.capitalize() rolling_window_granularity = rolling_window_granularity.capitalize() if incremental_granularity not in incGran: - print(f"{red_dot} Invalid 'incremental_granularity' value. Please choose from the following options: {incGran}.") + print( + f"{icons.red_dot} Invalid 'incremental_granularity' value. Please choose from the following options: {incGran}." + ) return if rolling_window_granularity not in incGran: - print(f"{red_dot} Invalid 'rolling_window_granularity' value. 
Please choose from the following options: {incGran}.") + print( + f"{icons.red_dot} Invalid 'rolling_window_granularity' value. Please choose from the following options: {incGran}." + ) return - + if rolling_window_periods < 1: - print(f"{red_dot} Invalid 'rolling_window_periods' value. Must be a value greater than 0.") + print( + f"{icons.red_dot} Invalid 'rolling_window_periods' value. Must be a value greater than 0." + ) return if incremental_periods < 1: - print(f"{red_dot} Invalid 'incremental_periods' value. Must be a value greater than 0.") + print( + f"{icons.red_dot} Invalid 'incremental_periods' value. Must be a value greater than 0." + ) return t = self.model.Tables[table_name] @@ -2883,14 +3467,20 @@ def update_incremental_refresh_policy(self, table_name: str, incremental_granula dc = t.Columns[detect_data_changes_column] if dc.DataType != TOM.DataType.DateTime: - print(f"{red_dot} Invalid 'detect_data_changes_column' parameter. This column must be of DateTime data type.") + print( + f"{icons.red_dot} Invalid 'detect_data_changes_column' parameter. This column must be of DateTime data type." + ) return rp = TOM.BasicRefreshPolicy() rp.IncrementalPeriods = incremental_periods - rp.IncrementalGranularity = System.Enum.Parse(TOM.RefreshGranularityType, incremental_granularity) + rp.IncrementalGranularity = System.Enum.Parse( + TOM.RefreshGranularityType, incremental_granularity + ) rp.RollingWindowPeriods = rolling_window_periods - rp.RollingWindowGranularity = System.Enum.Parse(TOM.RefreshGranularityType, rolling_window_granularity) + rp.RollingWindowGranularity = System.Enum.Parse( + TOM.RefreshGranularityType, rolling_window_granularity + ) rp.SourceExpression = t.RefreshPolicy.SourceExpression if only_refresh_complete_days: @@ -2909,8 +3499,19 @@ def update_incremental_refresh_policy(self, table_name: str, incremental_granula self.show_incremental_refresh_policy(table_name=table_name) - def add_incremental_refresh_policy(self, table_name: str, column_name: str, start_date: str, end_date: str, incremental_granularity: str, incremental_periods: int, rolling_window_granularity: str, rolling_window_periods: int, only_refresh_complete_days: Optional[bool] = False, detect_data_changes_column: Optional[str] = None): - + def add_incremental_refresh_policy( + self, + table_name: str, + column_name: str, + start_date: str, + end_date: str, + incremental_granularity: str, + incremental_periods: int, + rolling_window_granularity: str, + rolling_window_periods: int, + only_refresh_complete_days: Optional[bool] = False, + detect_data_changes_column: Optional[str] = None, + ): """ Adds anincremental refresh policy for a table within a semantic model. @@ -2943,28 +3544,36 @@ def add_incremental_refresh_policy(self, table_name: str, column_name: str, star """ - #https://learn.microsoft.com/en-us/power-bi/connect-data/incremental-refresh-configure + # https://learn.microsoft.com/en-us/power-bi/connect-data/incremental-refresh-configure - incGran = ['Day', 'Month', 'Quarter', 'Year'] + incGran = ["Day", "Month", "Quarter", "Year"] incremental_granularity = incremental_granularity.capitalize() rolling_window_granularity = rolling_window_granularity.capitalize() if incremental_granularity not in incGran: - print(f"{red_dot} Invalid 'incremental_granularity' value. Please choose from the following options: {incGran}.") + print( + f"{icons.red_dot} Invalid 'incremental_granularity' value. Please choose from the following options: {incGran}." 
+ ) return if rolling_window_granularity not in incGran: - print(f"{red_dot} Invalid 'rolling_window_granularity' value. Please choose from the following options: {incGran}.") + print( + f"{icons.red_dot} Invalid 'rolling_window_granularity' value. Please choose from the following options: {incGran}." + ) return - + if rolling_window_periods < 1: - print(f"{red_dot} Invalid 'rolling_window_periods' value. Must be a value greater than 0.") + print( + f"{icons.red_dot} Invalid 'rolling_window_periods' value. Must be a value greater than 0." + ) return if incremental_periods < 1: - print(f"{red_dot} Invalid 'incremental_periods' value. Must be a value greater than 0.") + print( + f"{icons.red_dot} Invalid 'incremental_periods' value. Must be a value greater than 0." + ) return - - date_format = '%m/%d/%Y' + + date_format = "%m/%d/%Y" date_obj_start = datetime.strptime(start_date, date_format) start_year = date_obj_start.year @@ -2977,7 +3586,9 @@ def add_incremental_refresh_policy(self, table_name: str, column_name: str, star end_day = date_obj_end.day if date_obj_end <= date_obj_start: - print(f"{red_dot} Invalid 'start_date' or 'end_date'. The 'end_date' must be after the 'start_date'.") + print( + f"{icons.red_dot} Invalid 'start_date' or 'end_date'. The 'end_date' must be after the 'start_date'." + ) return t = self.model.Tables[table_name] @@ -2987,59 +3598,75 @@ def add_incremental_refresh_policy(self, table_name: str, column_name: str, star dType = c.DataType if dType != TOM.DataType.DateTime: - print(f"{red_dot} The {fcName} column is of '{dType}' data type. The column chosen must be of DateTime data type.") + print( + f"{icons.red_dot} The {fcName} column is of '{dType}' data type. The column chosen must be of DateTime data type." + ) return - + if detect_data_changes_column is not None: dc = t.Columns[detect_data_changes_column] dcType = dc.DataType if dcType != TOM.DataType.DateTime: - print(f"{red_dot} Invalid 'detect_data_changes_column' parameter. This column must be of DateTime data type.") + print( + f"{icons.red_dot} Invalid 'detect_data_changes_column' parameter. This column must be of DateTime data type." + ) return # Start changes: # Update partition expression - i=0 + i = 0 for p in t.Partitions: if p.SourceType != TOM.PartitionSourceType.M: - print(f"{red_dot} Invalid partition source type. Incremental refresh can only be set up if the table's partition is an M-partition.") + print( + f"{icons.red_dot} Invalid partition source type. Incremental refresh can only be set up if the table's partition is an M-partition." 
+ ) return - elif i==0: + elif i == 0: text = p.Expression text = text.rstrip() - ind = text.rfind(' ') + 1 + ind = text.rfind(" ") + 1 obj = text[ind:] pattern = r"in\s*[^ ]*" matches = list(re.finditer(pattern, text)) if matches: last_match = matches[-1] - text_before_last_match = text[:last_match.start()] + text_before_last_match = text[: last_match.start()] print(text_before_last_match) else: - print(f"{red_dot} Invalid M-partition expression.") + print(f"{icons.red_dot} Invalid M-partition expression.") return - + endExpr = f'#"Filtered Rows IR" = Table.SelectRows({obj}, each [{column_name}] >= RangeStart and [{column_name}] <= RangeEnd)\n#"Filtered Rows IR"' finalExpr = text_before_last_match + endExpr p.Expression = finalExpr - i+=1 + i += 1 # Add expressions - self.add_expression(name = 'RangeStart', expression = f'datetime({start_year}, {start_month}, {start_day}, 0, 0, 0) meta [IsParameterQuery=true, Type="DateTime", IsParameterQueryRequired=true]') - self.add_expression(name = 'RangeEnd', expression = f'datetime({end_year}, {end_month}, {end_day}, 0, 0, 0) meta [IsParameterQuery=true, Type="DateTime", IsParameterQueryRequired=true]') + self.add_expression( + name="RangeStart", + expression=f'datetime({start_year}, {start_month}, {start_day}, 0, 0, 0) meta [IsParameterQuery=true, Type="DateTime", IsParameterQueryRequired=true]', + ) + self.add_expression( + name="RangeEnd", + expression=f'datetime({end_year}, {end_month}, {end_day}, 0, 0, 0) meta [IsParameterQuery=true, Type="DateTime", IsParameterQueryRequired=true]', + ) # Update properties rp = TOM.BasicRefreshPolicy() rp.IncrementalPeriods = incremental_periods - rp.IncrementalGranularity = System.Enum.Parse(TOM.RefreshGranularityType, incremental_granularity) + rp.IncrementalGranularity = System.Enum.Parse( + TOM.RefreshGranularityType, incremental_granularity + ) rp.RollingWindowPeriods = rolling_window_periods - rp.RollingWindowGranularity = System.Enum.Parse(TOM.RefreshGranularityType, rolling_window_granularity) + rp.RollingWindowGranularity = System.Enum.Parse( + TOM.RefreshGranularityType, rolling_window_granularity + ) if only_refresh_complete_days: rp.IncrementalPeriodsOffset = -1 @@ -3055,8 +3682,13 @@ def add_incremental_refresh_policy(self, table_name: str, column_name: str, star self.show_incremental_refresh_policy(table_name=table_name) - def apply_refresh_policy(self, table_name: str, effective_date: Optional[datetime] = None, refresh: Optional[bool] = True, max_parallelism: Optional[int] = 0): - + def apply_refresh_policy( + self, + table_name: str, + effective_date: Optional[datetime] = None, + refresh: Optional[bool] = True, + max_parallelism: Optional[int] = 0, + ): """ Applies the incremental refresh policy for a table within a semantic model. @@ -3076,10 +3708,15 @@ def apply_refresh_policy(self, table_name: str, effective_date: Optional[datetim """ - self.model.Tables[table_name].ApplyRefreshPolicy(effectiveDate = effective_date, refresh = refresh, maxParallelism = max_parallelism) - - def set_data_coverage_definition(self, table_name: str, partition_name: str, expression: str): + self.model.Tables[table_name].ApplyRefreshPolicy( + effectiveDate=effective_date, + refresh=refresh, + maxParallelism=max_parallelism, + ) + def set_data_coverage_definition( + self, table_name: str, partition_name: str, expression: str + ): """ Sets the data coverage definition for a partition. 
@@ -3097,18 +3734,22 @@ def set_data_coverage_definition(self, table_name: str, partition_name: str, exp """ - doc = 'https://learn.microsoft.com/analysis-services/tom/table-partitions?view=asallproducts-allversions' + doc = "https://learn.microsoft.com/analysis-services/tom/table-partitions?view=asallproducts-allversions" t = self.model.Tables[table_name] p = t.Partitions[partition_name] - ht = self.is_hybrid_table(table_name = table_name) + ht = self.is_hybrid_table(table_name=table_name) if not ht: - print(f"The data coverage definition property is only applicable to hybrid tables. See the documentation: {doc}.") + print( + f"The data coverage definition property is only applicable to hybrid tables. See the documentation: {doc}." + ) return if p.Mode != TOM.ModeType.DirectQuery: - print(f"The data coverage definition property is only applicable to the DirectQuery partition of a hybrid table. See the documentation: {doc}.") + print( + f"The data coverage definition property is only applicable to the DirectQuery partition of a hybrid table. See the documentation: {doc}." + ) return dcd = TOM.DataCoverageDefinition() @@ -3116,7 +3757,6 @@ def set_data_coverage_definition(self, table_name: str, partition_name: str, exp p.DataCoverageDefinition = dcd def set_encoding_hint(self, table_name: str, column_name: str, value: str): - """ Sets the encoding hint for a column. @@ -3134,17 +3774,20 @@ def set_encoding_hint(self, table_name: str, column_name: str, value: str): """ - values = ['Default', 'Hash', 'Value'] + values = ["Default", "Hash", "Value"] value = value.capitalize() if value not in values: - print(f"{red_dot} Invalid encoding hint value. Please choose from these options: {values}.") + print( + f"{icons.red_dot} Invalid encoding hint value. Please choose from these options: {values}." + ) return - self.model.Tables[table_name].Columns[column_name].EncodingHint = System.Enum.Parse(TOM.EncodingHintType, value) + self.model.Tables[table_name].Columns[column_name].EncodingHint = ( + System.Enum.Parse(TOM.EncodingHintType, value) + ) def set_data_type(self, table_name: str, column_name: str, value: str): - """ Sets the data type for a column. @@ -3162,26 +3805,39 @@ def set_data_type(self, table_name: str, column_name: str, value: str): """ - values = ['Binary', 'Boolean', 'DateTime', 'Decimal', 'Double', 'Int64', 'String'] + values = [ + "Binary", + "Boolean", + "DateTime", + "Decimal", + "Double", + "Int64", + "String", + ] + + value = value.replace(" ", "").capitalize() + if value == "Datetime": + value = "DateTime" + elif value.startswith("Int"): + value = "Int64" + elif value.startswith("Bool"): + value = "Boolean" - value = value.replace(' ','').capitalize() - if value == 'Datetime': - value = 'DateTime' - elif value.startswith('Int'): - value = 'Int64' - elif value.startswith('Bool'): - value = 'Boolean' - if value not in values: - print(f"{red_dot} Invalid data type. Please choose from these options: {values}.") + print( + f"{icons.red_dot} Invalid data type. Please choose from these options: {values}." 
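# A minimal sketch of the incremental-refresh helpers above, assuming a
# connected, writeable TOMWrapper `tom` and a hypothetical 'Sales' table whose
# single M partition exposes a DateTime 'OrderDate' column; dates use the
# MM/DD/YYYY format the function parses.
def configure_sales_incremental_refresh(tom):
    tom.add_incremental_refresh_policy(
        table_name="Sales",
        column_name="OrderDate",
        start_date="01/01/2020",
        end_date="12/31/2024",
        incremental_granularity="Day",
        incremental_periods=7,
        rolling_window_granularity="Year",
        rolling_window_periods=5,
        only_refresh_complete_days=True,
    )
    # apply_refresh_policy materializes the policy into partitions.
    tom.apply_refresh_policy(table_name="Sales", refresh=False)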
+ ) return - - self.model.Tables[table_name].Columns[column_name].DataType = System.Enum.Parse(TOM.DataType, value) - def add_time_intelligence(self, measure_name: str, date_table: str, time_intel: Union[str, List[str]]): + self.model.Tables[table_name].Columns[column_name].DataType = ( + System.Enum.Parse(TOM.DataType, value) + ) + def add_time_intelligence( + self, measure_name: str, date_table: str, time_intel: Union[str, List[str]] + ): """ - Adds time intelligence measures + Adds time intelligence measures Parameters ---------- @@ -3198,16 +3854,18 @@ def add_time_intelligence(self, measure_name: str, date_table: str, time_intel: """ table_name = None - time_intel_options = ['MTD', 'QTD', 'YTD'] + time_intel_options = ["MTD", "QTD", "YTD"] if isinstance(time_intel, str): time_intel = [time_intel] - + # Validate time intelligence variations for t in time_intel: t = t.capitalize() if t not in [time_intel_options]: - print(f"The '{t}' time intelligence variation is not supported. Valid options: {time_intel_options}.") + print( + f"The '{t}' time intelligence variation is not supported. Valid options: {time_intel_options}." + ) return # Validate measure and extract table name @@ -3216,14 +3874,18 @@ def add_time_intelligence(self, measure_name: str, date_table: str, time_intel: table_name = m.Parent.Name if table_name is None: - print(f"The '{measure_name}' is not a valid measure in the '{dataset}' semantic model within the '{workspace}' workspace.") + print( + f"The '{measure_name}' is not a valid measure in the '{dataset}' semantic model within the '{workspace}' workspace." + ) return - + # Validate date table if not self.is_date_table(date_table): - print(f"{red_dot} The '{date_table}' table is not a valid date table in the '{dataset}' wemantic model within the '{workspace}' workspace.") + print( + f"{icons.red_dot} The '{date_table}' table is not a valid date table in the '{dataset}' wemantic model within the '{workspace}' workspace." 
+ ) return - + # Extract date key from date table for c in self.all_columns(): if c.Parent.Name == date_table and c.IsKey: @@ -3231,21 +3893,27 @@ def add_time_intelligence(self, measure_name: str, date_table: str, time_intel: # Create the new time intelligence measures for t in time_intel: - if t == 'MTD': + if t == "MTD": expr = f"CALCULATE([{measure_name}],DATES{time_intel}('{date_table}'[{date_key}]))" new_meas_name = f"{measure_name} {t}" - self.add_measure(table_name = table_name, measure_name = new_meas_name, expression = expr) - + self.add_measure( + table_name=table_name, + measure_name=new_meas_name, + expression=expr, + ) + def close(self): if not readonly and self.model is not None: self.model.SaveChanges() if len(fpAdded) > 0: - refresh_semantic_model(dataset = dataset, tables = fpAdded, workspace = workspace) + refresh_semantic_model( + dataset=dataset, tables=fpAdded, workspace=workspace + ) self.model = None - tw = TOMWrapper(dataset = dataset, workspace = workspace, readonly = readonly) - try: - yield tw + tw = TOMWrapper(dataset=dataset, workspace=workspace, readonly=readonly) + try: + yield tw finally: tw.close() diff --git a/sempy_labs/Translations.py b/sempy_labs/Translations.py index 0f389ce3..9dc4ca3b 100644 --- a/sempy_labs/Translations.py +++ b/sempy_labs/Translations.py @@ -1,14 +1,10 @@ import pandas as pd from typing import List, Optional, Union from sempy._utils._log import log +import sempy_labs._icons as icons -green_dot = '\U0001F7E2' -yellow_dot = '\U0001F7E1' -red_dot = '\U0001F534' -in_progress = '⌛' def language_validate(language: str): - """ Validateds that the language specified exists within the supported langauges. @@ -22,29 +18,36 @@ def language_validate(language: str): bool A True/False indication as to whether the language code is supported. """ - - url = 'https://learn.microsoft.com/azure/ai-services/translator/language-support' + + url = "https://learn.microsoft.com/azure/ai-services/translator/language-support" tables = pd.read_html(url) df = tables[0] - df_filt = df[df['Language code'] == language] + df_filt = df[df["Language code"] == language] - df_filt2 = df[df['Language'] == language.capitalize()] + df_filt2 = df[df["Language"] == language.capitalize()] if len(df_filt) == 1: - lang = df_filt['Language'].iloc[0] + lang = df_filt["Language"].iloc[0] elif len(df_filt2) == 1: - lang = df_filt2['Language'].iloc[0] + lang = df_filt2["Language"].iloc[0] else: - print(f"The '{language}' language is not a valid language code. Please refer to this link for a list of valid language codes: {url}.") + print( + f"The '{language}' language is not a valid language code. Please refer to this link for a list of valid language codes: {url}." + ) return return lang -@log -def translate_semantic_model(dataset: str, languages: Union[str, List[str]], exclude_characters: Optional[str] = None, workspace: Optional[str] = None): +@log +def translate_semantic_model( + dataset: str, + languages: Union[str, List[str]], + exclude_characters: Optional[str] = None, + workspace: Optional[str] = None, +): """ Translates names, descriptions, display folders for all objects in a semantic model. 
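A usage sketch for translate_semantic_model as declared above. The import path mirrors this file's location in the patch, and the dataset, workspace, language codes and excluded characters are illustrative placeholders:

    from sempy_labs.Translations import translate_semantic_model  # assumed import path

    # Adds one translation per language code for table, column, measure and
    # hierarchy names, descriptions and display folders.
    translate_semantic_model(
        dataset="Sales Model",
        languages=["it", "de", "zh-Hans"],
        exclude_characters="_-",   # these characters are replaced with spaces before translating
        workspace="Sales Workspace",
    )

A single string is also accepted for languages; the function wraps it in a list before adding one model translation per language code.
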
@@ -63,7 +66,7 @@ def translate_semantic_model(dataset: str, languages: Union[str, List[str]], exc Returns ------- - + """ from synapse.ml.services import Translate @@ -74,67 +77,151 @@ def translate_semantic_model(dataset: str, languages: Union[str, List[str]], exc if isinstance(languages, str): languages = [languages] - dfPrep = pd.DataFrame(columns=['Object Type', 'Name', 'Description', 'Display Folder']) + dfPrep = pd.DataFrame( + columns=["Object Type", "Name", "Description", "Display Folder"] + ) - with connect_semantic_model(dataset=dataset, readonly=False, workspace=workspace) as tom: + with connect_semantic_model( + dataset=dataset, readonly=False, workspace=workspace + ) as tom: if exclude_characters is None: for o in tom.model.Tables: - new_data = {'Object Type': 'Table', 'Name': o.Name, 'TName': o.Name, 'Description': o.Description, 'TDescription': o.Description, 'Display Folder': None, 'TDisplay Folder': None} - dfPrep = pd.concat([dfPrep, pd.DataFrame(new_data, index=[0])], ignore_index=True) + new_data = { + "Object Type": "Table", + "Name": o.Name, + "TName": o.Name, + "Description": o.Description, + "TDescription": o.Description, + "Display Folder": None, + "TDisplay Folder": None, + } + dfPrep = pd.concat( + [dfPrep, pd.DataFrame(new_data, index=[0])], ignore_index=True + ) for o in tom.all_columns(): - new_data = {'Object Type': 'Column', 'Name': o.Name, 'TName': o.Name, 'Description': o.Description, 'TDescription': o.Description, 'Display Folder': o.DisplayFolder, 'TDisplay Folder': o.DisplayFolder} - dfPrep = pd.concat([dfPrep, pd.DataFrame(new_data, index=[0])], ignore_index=True) + new_data = { + "Object Type": "Column", + "Name": o.Name, + "TName": o.Name, + "Description": o.Description, + "TDescription": o.Description, + "Display Folder": o.DisplayFolder, + "TDisplay Folder": o.DisplayFolder, + } + dfPrep = pd.concat( + [dfPrep, pd.DataFrame(new_data, index=[0])], ignore_index=True + ) for o in tom.all_measures(): - new_data = {'Object Type': 'Measure', 'Name': o.Name, 'TName': o.Name, 'Description': o.Description, 'TDescription': o.Description, 'Display Folder': o.DisplayFolder, 'TDisplay Folder': o.DisplayFolder} - dfPrep = pd.concat([dfPrep, pd.DataFrame(new_data, index=[0])], ignore_index=True) + new_data = { + "Object Type": "Measure", + "Name": o.Name, + "TName": o.Name, + "Description": o.Description, + "TDescription": o.Description, + "Display Folder": o.DisplayFolder, + "TDisplay Folder": o.DisplayFolder, + } + dfPrep = pd.concat( + [dfPrep, pd.DataFrame(new_data, index=[0])], ignore_index=True + ) for o in tom.all_hierarchies(): - new_data = {'Object Type': 'Hierarchy', 'Name': o.Name, 'TName': o.Name, 'Description': o.Description, 'TDescription': o.Description, 'Display Folder': o.DisplayFolder, 'TDisplay Folder': o.DisplayFolder} - dfPrep = pd.concat([dfPrep, pd.DataFrame(new_data, index=[0])], ignore_index=True) + new_data = { + "Object Type": "Hierarchy", + "Name": o.Name, + "TName": o.Name, + "Description": o.Description, + "TDescription": o.Description, + "Display Folder": o.DisplayFolder, + "TDisplay Folder": o.DisplayFolder, + } + dfPrep = pd.concat( + [dfPrep, pd.DataFrame(new_data, index=[0])], ignore_index=True + ) else: for o in tom.model.Tables: oName = o.Name oDescription = o.Description for s in exclude_characters: - oName = oName.replace(s, ' ') - oDescription = oDescription.replace(s, ' ') - new_data = {'Object Type': 'Table', 'Name': o.Name, 'TName': oName, 'Description': o.Description, 'TDescription': oDescription, 'Display Folder': 
None, 'TDisplay Folder': None} - dfPrep = pd.concat([dfPrep, pd.DataFrame(new_data, index=[0])], ignore_index=True) + oName = oName.replace(s, " ") + oDescription = oDescription.replace(s, " ") + new_data = { + "Object Type": "Table", + "Name": o.Name, + "TName": oName, + "Description": o.Description, + "TDescription": oDescription, + "Display Folder": None, + "TDisplay Folder": None, + } + dfPrep = pd.concat( + [dfPrep, pd.DataFrame(new_data, index=[0])], ignore_index=True + ) for o in tom.all_columns(): oName = o.Name oDescription = o.Description oDisplayFolder = o.DisplayFolder for s in exclude_characters: - oName = oName.replace(s, ' ') - oDescription = oDescription.replace(s, ' ') - oDisplayFolder = oDisplayFolder.replace(s, ' ') - new_data = {'Object Type': 'Column', 'Name': o.Name, 'TName': oName, 'Description': o.Description, 'TDescription': oDescription, 'Display Folder': o.DisplayFolder, 'TDisplay Folder': oDisplayFolder} - dfPrep = pd.concat([dfPrep, pd.DataFrame(new_data, index=[0])], ignore_index=True) + oName = oName.replace(s, " ") + oDescription = oDescription.replace(s, " ") + oDisplayFolder = oDisplayFolder.replace(s, " ") + new_data = { + "Object Type": "Column", + "Name": o.Name, + "TName": oName, + "Description": o.Description, + "TDescription": oDescription, + "Display Folder": o.DisplayFolder, + "TDisplay Folder": oDisplayFolder, + } + dfPrep = pd.concat( + [dfPrep, pd.DataFrame(new_data, index=[0])], ignore_index=True + ) for o in tom.all_measures(): oName = o.Name oDescription = o.Description oDisplayFolder = o.DisplayFolder for s in exclude_characters: - oName = oName.replace(s, ' ') - oDescription = oDescription.replace(s, ' ') - oDisplayFolder = oDisplayFolder.replace(s, ' ') - new_data = {'Object Type': 'Measure', 'Name': o.Name, 'TName': oName, 'Description': o.Description, 'TDescription': oDescription, 'Display Folder': o.DisplayFolder, 'TDisplay Folder': oDisplayFolder} - dfPrep = pd.concat([dfPrep, pd.DataFrame(new_data, index=[0])], ignore_index=True) + oName = oName.replace(s, " ") + oDescription = oDescription.replace(s, " ") + oDisplayFolder = oDisplayFolder.replace(s, " ") + new_data = { + "Object Type": "Measure", + "Name": o.Name, + "TName": oName, + "Description": o.Description, + "TDescription": oDescription, + "Display Folder": o.DisplayFolder, + "TDisplay Folder": oDisplayFolder, + } + dfPrep = pd.concat( + [dfPrep, pd.DataFrame(new_data, index=[0])], ignore_index=True + ) for o in tom.all_hierarchies(): oName = o.Name oDescription = o.Description oDisplayFolder = o.DisplayFolder for s in exclude_characters: - oName = oName.replace(s, ' ') - oDescription = oDescription.replace(s, ' ') - oDisplayFolder = oDisplayFolder.replace(s, ' ') - new_data = {'Object Type': 'Hierarchy', 'Name': o.Name, 'TName': oName, 'Description': o.Description, 'TDescription': oDescription, 'Display Folder': o.DisplayFolder, 'TDisplay Folder': oDisplayFolder} - dfPrep = pd.concat([dfPrep, pd.DataFrame(new_data, index=[0])], ignore_index=True) + oName = oName.replace(s, " ") + oDescription = oDescription.replace(s, " ") + oDisplayFolder = oDisplayFolder.replace(s, " ") + new_data = { + "Object Type": "Hierarchy", + "Name": o.Name, + "TName": oName, + "Description": o.Description, + "TDescription": oDescription, + "Display Folder": o.DisplayFolder, + "TDisplay Folder": oDisplayFolder, + } + dfPrep = pd.concat( + [dfPrep, pd.DataFrame(new_data, index=[0])], ignore_index=True + ) spark = SparkSession.builder.getOrCreate() df = spark.createDataFrame(dfPrep) - columns = 
['Name', 'Description', 'Display Folder'] + columns = ["Name", "Description", "Display Folder"] for clm in columns: columnToTranslate = f"T{clm}" @@ -146,83 +233,183 @@ def translate_semantic_model(dataset: str, languages: Union[str, List[str]], exc .setConcurrency(5) ) - transDF = (translate - .transform(df) + transDF = ( + translate.transform(df) .withColumn("translation", flatten(col("translation.translations"))) .withColumn("translation", col("translation.text")) - .select('Object Type', clm, columnToTranslate, 'translation')) + .select("Object Type", clm, columnToTranslate, "translation") + ) df_panda = transDF.toPandas() - print(f"{in_progress} Translating {clm}s...") + print(f"{icons.in_progress} Translating {clm}s...") for lang in languages: i = languages.index(lang) - tom.add_translation(language = lang) - print(f"{in_progress} Translating into the '{lang}' language...") + tom.add_translation(language=lang) + print(f"{icons.in_progress} Translating into the '{lang}' language...") for t in tom.model.Tables: if t.IsHidden == False: - if clm == 'Name': - df_filt = df_panda[(df_panda['Object Type'] == 'Table') & (df_panda['Name'] == t.Name)] + if clm == "Name": + df_filt = df_panda[ + (df_panda["Object Type"] == "Table") + & (df_panda["Name"] == t.Name) + ] if len(df_filt) == 1: - tr = df_filt['translation'].str[i].iloc[0] - tom.set_translation(object = t, language = lang, property = 'Name', value = tr) - print(f"{green_dot} Translation '{tr}' set for the '{lang}' language on the '{t.Name}' table.") - elif clm == 'Description' and t.Description is not None: - df_filt = df_panda[(df_panda['Object Type'] == 'Table') & (df_panda['Description'] == t.Description)] + tr = df_filt["translation"].str[i].iloc[0] + tom.set_translation( + object=t, language=lang, property="Name", value=tr + ) + print( + f"{icons.green_dot} Translation '{tr}' set for the '{lang}' language on the '{t.Name}' table." + ) + elif clm == "Description" and t.Description is not None: + df_filt = df_panda[ + (df_panda["Object Type"] == "Table") + & (df_panda["Description"] == t.Description) + ] if len(df_filt) == 1: - tr = df_filt['translation'].str[i].iloc[0] - tom.set_translation(object = t, language = lang, property = 'Description', value = tr) + tr = df_filt["translation"].str[i].iloc[0] + tom.set_translation( + object=t, + language=lang, + property="Description", + value=tr, + ) for c in t.Columns: if c.IsHidden == False: - if clm == 'Name': - df_filt = df_panda[(df_panda['Object Type'] == 'Column') & (df_panda['Name'] == c.Name)] + if clm == "Name": + df_filt = df_panda[ + (df_panda["Object Type"] == "Column") + & (df_panda["Name"] == c.Name) + ] if len(df_filt) == 1: - tr = df_filt['translation'].str[i].iloc[0] - tom.set_translation(object = c, language = lang, property = 'Name', value = tr) - print(f"{green_dot} Translation '{tr}' set on the '{c.Name}' column within the {t.Name}' table.") - elif clm == 'Description' and c.Description is not None: - df_filt = df_panda[(df_panda['Object Type'] == 'Column') & (df_panda['Description'] == c.Description)] + tr = df_filt["translation"].str[i].iloc[0] + tom.set_translation( + object=c, + language=lang, + property="Name", + value=tr, + ) + print( + f"{icons.green_dot} Translation '{tr}' set on the '{c.Name}' column within the {t.Name}' table." 
+ ) + elif clm == "Description" and c.Description is not None: + df_filt = df_panda[ + (df_panda["Object Type"] == "Column") + & (df_panda["Description"] == c.Description) + ] if len(df_filt) == 1: - tr = df_filt['translation'].str[i].iloc[0] - tom.set_translation(object = c, language = lang, property = 'Description', value = tr) - elif clm == 'Display Folder' and c.DisplayFolder is not None: - df_filt = df_panda[(df_panda['Object Type'] == 'Column') & (df_panda['Display Folder'] == c.Description)] + tr = df_filt["translation"].str[i].iloc[0] + tom.set_translation( + object=c, + language=lang, + property="Description", + value=tr, + ) + elif ( + clm == "Display Folder" + and c.DisplayFolder is not None + ): + df_filt = df_panda[ + (df_panda["Object Type"] == "Column") + & (df_panda["Display Folder"] == c.Description) + ] if len(df_filt) == 1: - tr = df_filt['translation'].str[i].iloc[0] - tom.set_translation(object = c, language = lang, property = 'Display Folder', value = tr) + tr = df_filt["translation"].str[i].iloc[0] + tom.set_translation( + object=c, + language=lang, + property="Display Folder", + value=tr, + ) for h in t.Hierarchies: if h.IsHidden == False: - if clm == 'Name': - df_filt = df_panda[(df_panda['Object Type'] == 'Hierarchy') & (df_panda['Name'] == h.Name)] + if clm == "Name": + df_filt = df_panda[ + (df_panda["Object Type"] == "Hierarchy") + & (df_panda["Name"] == h.Name) + ] if len(df_filt) == 1: - tr = df_filt['translation'].str[i].iloc[0] - tom.set_translation(object = h, language = lang, property = 'Name', value = tr) - elif clm == 'Description' and h.Description is not None: - df_filt = df_panda[(df_panda['Object Type'] == 'Hierarchy') & (df_panda['Description'] == h.Description)] + tr = df_filt["translation"].str[i].iloc[0] + tom.set_translation( + object=h, + language=lang, + property="Name", + value=tr, + ) + elif clm == "Description" and h.Description is not None: + df_filt = df_panda[ + (df_panda["Object Type"] == "Hierarchy") + & (df_panda["Description"] == h.Description) + ] if len(df_filt) == 1: - tr = df_filt['translation'].str[i].iloc[0] - tom.set_translation(object = h, language = lang, property = 'Description', value = tr) - elif clm == 'Display Folder' and h.DisplayFolder is not None: - df_filt = df_panda[(df_panda['Object Type'] == 'Hierarchy') & (df_panda['Display Folder'] == h.Description)] + tr = df_filt["translation"].str[i].iloc[0] + tom.set_translation( + object=h, + language=lang, + property="Description", + value=tr, + ) + elif ( + clm == "Display Folder" + and h.DisplayFolder is not None + ): + df_filt = df_panda[ + (df_panda["Object Type"] == "Hierarchy") + & (df_panda["Display Folder"] == h.Description) + ] if len(df_filt) == 1: - tr = df_filt['translation'].str[i].iloc[0] - tom.set_translation(object = h, language = lang, property = 'Display Folder', value = tr) + tr = df_filt["translation"].str[i].iloc[0] + tom.set_translation( + object=h, + language=lang, + property="Display Folder", + value=tr, + ) for ms in t.Measures: if ms.IsHidden == False: - if clm == 'Name': - df_filt = df_panda[(df_panda['Object Type'] == 'Measure') & (df_panda['Name'] == ms.Name)] + if clm == "Name": + df_filt = df_panda[ + (df_panda["Object Type"] == "Measure") + & (df_panda["Name"] == ms.Name) + ] if len(df_filt) == 1: - tr = df_filt['translation'].str[i].iloc[0] - tom.set_translation(object = ms, language = lang, property = 'Name', value = tr) - print(f"{green_dot} Translation '{tr}' set on the '{ms.Name}' column within the {t.Name}' table.") - elif clm == 
'Description' and ms.Description is not None: - df_filt = df_panda[(df_panda['Object Type'] == 'Measure') & (df_panda['Description'] == ms.Description)] - if len(df_filt) == 1: - tr = df_filt['translation'].str[i].iloc[0] - tom.set_translation(object = ms, language = lang, property = 'Description', value = tr) - elif clm == 'Display Folder' and ms.DisplayFolder is not None: - df_filt = df_panda[(df_panda['Object Type'] == 'Measure') & (df_panda['Display Folder'] == ms.Description)] + tr = df_filt["translation"].str[i].iloc[0] + tom.set_translation( + object=ms, + language=lang, + property="Name", + value=tr, + ) + print( + f"{icons.green_dot} Translation '{tr}' set on the '{ms.Name}' column within the {t.Name}' table." + ) + elif clm == "Description" and ms.Description is not None: + df_filt = df_panda[ + (df_panda["Object Type"] == "Measure") + & (df_panda["Description"] == ms.Description) + ] + if len(df_filt) == 1: + tr = df_filt["translation"].str[i].iloc[0] + tom.set_translation( + object=ms, + language=lang, + property="Description", + value=tr, + ) + elif ( + clm == "Display Folder" and ms.DisplayFolder is not None + ): + df_filt = df_panda[ + (df_panda["Object Type"] == "Measure") + & (df_panda["Display Folder"] == ms.Description) + ] if len(df_filt) == 1: - tr = df_filt['translation'].str[i].iloc[0] - tom.set_translation(object = ms, language = lang, property = 'Display Folder', value = tr) + tr = df_filt["translation"].str[i].iloc[0] + tom.set_translation( + object=ms, + language=lang, + property="Display Folder", + value=tr, + ) diff --git a/sempy_labs/Vertipaq.py b/sempy_labs/Vertipaq.py index ca50a740..f2a132ff 100644 --- a/sempy_labs/Vertipaq.py +++ b/sempy_labs/Vertipaq.py @@ -4,16 +4,26 @@ from IPython.display import display, HTML import zipfile, os, shutil, datetime, warnings from pyspark.sql import SparkSession -from .HelperFunctions import format_dax_object_name, get_direct_lake_sql_endpoint, resolve_lakehouse_name -from .ListFunctions import list_relationships -from .GetLakehouseTables import get_lakehouse_tables -from .Lakehouse import lakehouse_attached +from ._helper_functions import ( + format_dax_object_name, + get_direct_lake_sql_endpoint, + resolve_lakehouse_name, +) +from ._list_functions import list_relationships +from .lakehouse.GetLakehouseTables import get_lakehouse_tables +from .lakehouse.Lakehouse import lakehouse_attached from typing import List, Optional, Union from sempy._utils._log import log + @log -def vertipaq_analyzer(dataset: str, workspace: Optional[str] = None, export: Optional[str] = None, lakehouse_workspace: Optional[str] = None, read_stats_from_data: Optional[bool] = False): - +def vertipaq_analyzer( + dataset: str, + workspace: Optional[str] = None, + export: Optional[str] = None, + lakehouse_workspace: Optional[str] = None, + read_stats_from_data: Optional[bool] = False, +): """ Displays an HTML visualization of the Vertipaq Analyzer statistics from a semantic model. @@ -26,8 +36,8 @@ def vertipaq_analyzer(dataset: str, workspace: Optional[str] = None, export: Opt Defaults to None which resolves to the workspace of the attached lakehouse or if no lakehouse attached, resolves to the workspace of the notebook. export : str, default=None - Specifying 'zip' will export the results to a zip file in your lakehouse (which can be imported using the import_vertipaq_analyzer function. - Specifying 'table' will export the results to delta tables (appended) in your lakehouse. 
+ Specifying 'zip' will export the results to a zip file in your lakehouse (which can be imported using the import_vertipaq_analyzer function. + Specifying 'table' will export the results to delta tables (appended) in your lakehouse. Default value: None. lakehouse_workspace : str, default=None The Fabric workspace used by the lakehouse (for Direct Lake semantic models). @@ -42,72 +52,93 @@ def vertipaq_analyzer(dataset: str, workspace: Optional[str] = None, export: Opt """ pd.options.mode.copy_on_write = True - warnings.filterwarnings("ignore", message="createDataFrame attempted Arrow optimization*") + warnings.filterwarnings( + "ignore", message="createDataFrame attempted Arrow optimization*" + ) if workspace == None: workspace_id = fabric.get_workspace_id() - workspace = fabric.resolve_workspace_name(workspace_id) + workspace = fabric.resolve_workspace_name(workspace_id) if lakehouse_workspace == None: lakehouse_workspace = workspace - dfT = fabric.list_tables(dataset = dataset, extended=True, workspace = workspace) - dfT.rename(columns={'Name': 'Table Name'}, inplace=True) - dfC = fabric.list_columns(dataset = dataset, extended=True, workspace = workspace) - dfC['Column Object'] = format_dax_object_name(dfC['Table Name'], dfC['Column Name']) - dfC.rename(columns={'Column Cardinality': 'Cardinality'}, inplace=True) - dfH = fabric.list_hierarchies(dataset = dataset, extended=True, workspace = workspace) - dfR = list_relationships(dataset = dataset, extended=True, workspace = workspace) - dfR['From Object'] = format_dax_object_name(dfR['From Table'], dfR['From Column']) - dfR['To Object'] = format_dax_object_name(dfR['To Table'], dfR['To Column']) - dfP = fabric.list_partitions(dataset = dataset, extended=True, workspace = workspace) - dfD = fabric.list_datasets(workspace = workspace, additional_xmla_properties=['CompatibilityLevel','Model.DefaultMode']) - dfD = dfD[dfD['Dataset Name'] == dataset] - dfD['Compatibility Level'] = dfD['Compatibility Level'].astype(int) - isDirectLake = any(r['Mode'] == 'DirectLake' for i, r in dfP.iterrows()) - dfR['Missing Rows'] = None + dfT = fabric.list_tables(dataset=dataset, extended=True, workspace=workspace) + dfT.rename(columns={"Name": "Table Name"}, inplace=True) + dfC = fabric.list_columns(dataset=dataset, extended=True, workspace=workspace) + dfC["Column Object"] = format_dax_object_name(dfC["Table Name"], dfC["Column Name"]) + dfC.rename(columns={"Column Cardinality": "Cardinality"}, inplace=True) + dfH = fabric.list_hierarchies(dataset=dataset, extended=True, workspace=workspace) + dfR = list_relationships(dataset=dataset, extended=True, workspace=workspace) + dfR["From Object"] = format_dax_object_name(dfR["From Table"], dfR["From Column"]) + dfR["To Object"] = format_dax_object_name(dfR["To Table"], dfR["To Column"]) + dfP = fabric.list_partitions(dataset=dataset, extended=True, workspace=workspace) + dfD = fabric.list_datasets( + workspace=workspace, + additional_xmla_properties=["CompatibilityLevel", "Model.DefaultMode"], + ) + dfD = dfD[dfD["Dataset Name"] == dataset] + dfD["Compatibility Level"] = dfD["Compatibility Level"].astype(int) + isDirectLake = any(r["Mode"] == "DirectLake" for i, r in dfP.iterrows()) + dfR["Missing Rows"] = None # Direct Lake if read_stats_from_data: if isDirectLake: - dfC = pd.merge(dfC, dfP[['Table Name', 'Query', 'Source Type']], on='Table Name', how='left') - dfC_flt = dfC[(dfC['Source Type'] == 'Entity') & (~dfC['Column Name'].str.startswith('RowNumber-'))] + dfC = pd.merge( + dfC, + dfP[["Table Name", 
"Query", "Source Type"]], + on="Table Name", + how="left", + ) + dfC_flt = dfC[ + (dfC["Source Type"] == "Entity") + & (~dfC["Column Name"].str.startswith("RowNumber-")) + ] sqlEndpointId = get_direct_lake_sql_endpoint(dataset, workspace) # Get lakehouse name from SQL Endpoint ID - dfI = fabric.list_items(workspace = lakehouse_workspace, type = 'SQLEndpoint') - dfI_filt = dfI[(dfI['Id'] == sqlEndpointId)] + dfI = fabric.list_items(workspace=lakehouse_workspace, type="SQLEndpoint") + dfI_filt = dfI[(dfI["Id"] == sqlEndpointId)] if len(dfI_filt) == 0: - print(f"The lakehouse (SQL Endpoint) used by the '{dataset}' semantic model does not reside in the '{lakehouse_workspace}' workspace. Please update the lakehouse_workspace parameter.") + print( + f"The lakehouse (SQL Endpoint) used by the '{dataset}' semantic model does not reside in the '{lakehouse_workspace}' workspace. Please update the lakehouse_workspace parameter." + ) else: - lakehouseName = dfI_filt['Display Name'].iloc[0] + lakehouseName = dfI_filt["Display Name"].iloc[0] current_workspace_id = fabric.get_workspace_id() current_workspace = fabric.resolve_workspace_name(current_workspace_id) if current_workspace != lakehouse_workspace: - lakeTables = get_lakehouse_tables(lakehouse = lakehouseName, workspace = lakehouse_workspace) + lakeTables = get_lakehouse_tables( + lakehouse=lakehouseName, workspace=lakehouse_workspace + ) sql_statements = [] spark = SparkSession.builder.getOrCreate() # Loop through tables - for lakeTName in dfC_flt['Query'].unique(): - query = 'SELECT ' - columns_in_table = dfC_flt.loc[dfC_flt['Query'] == lakeTName, 'Source'].unique() - + for lakeTName in dfC_flt["Query"].unique(): + query = "SELECT " + columns_in_table = dfC_flt.loc[ + dfC_flt["Query"] == lakeTName, "Source" + ].unique() + # Loop through columns within those tables for scName in columns_in_table: query = query + f"COUNT(DISTINCT({scName})) AS {scName}, " - + query = query[:-2] if lakehouse_workspace == current_workspace: query = query + f" FROM {lakehouseName}.{lakeTName}" else: - lakeTables_filt = lakeTables[lakeTables['Table Name'] == lakeTName] - tPath = lakeTables_filt['Location'].iloc[0] + lakeTables_filt = lakeTables[ + lakeTables["Table Name"] == lakeTName + ] + tPath = lakeTables_filt["Location"].iloc[0] df = spark.read.format("delta").load(tPath) - tempTableName = 'delta_table_' + lakeTName + tempTableName = "delta_table_" + lakeTName df.createOrReplaceTempView(tempTableName) query = query + f" FROM {tempTableName}" sql_statements.append((lakeTName, query)) @@ -117,364 +148,650 @@ def vertipaq_analyzer(dataset: str, workspace: Optional[str] = None, export: Opt query = o[1] df = spark.sql(query) - + for column in df.columns: x = df.collect()[0][column] for i, r in dfC.iterrows(): - if r['Query'] == tName and r['Source'] == column: - dfC.at[i, 'Cardinality'] = x + if r["Query"] == tName and r["Source"] == column: + dfC.at[i, "Cardinality"] = x # Remove column added temporarily - dfC.drop(columns=['Query', 'Source Type'], inplace=True) + dfC.drop(columns=["Query", "Source Type"], inplace=True) # Direct Lake missing rows - dfR = pd.merge(dfR, dfP[['Table Name', 'Query']], left_on = 'From Table', right_on = 'Table Name', how = 'left') - dfR.rename(columns={'Query': 'From Lake Table'}, inplace=True) - dfR.drop(columns=['Table Name'], inplace=True) - dfR = pd.merge(dfR, dfP[['Table Name', 'Query']], left_on = 'To Table', right_on = 'Table Name', how = 'left') - dfR.rename(columns={'Query': 'To Lake Table'}, inplace=True) - 
dfR.drop(columns=['Table Name'], inplace=True) - dfR = pd.merge(dfR, dfC[['Column Object', 'Source']], left_on = 'From Object', right_on = 'Column Object', how = 'left') - dfR.rename(columns={'Source': 'From Lake Column'}, inplace=True) - dfR.drop(columns=['Column Object'], inplace=True) - dfR = pd.merge(dfR, dfC[['Column Object', 'Source']], left_on = 'To Object', right_on = 'Column Object', how = 'left') - dfR.rename(columns={'Source': 'To Lake Column'}, inplace=True) - dfR.drop(columns=['Column Object'], inplace=True) + dfR = pd.merge( + dfR, + dfP[["Table Name", "Query"]], + left_on="From Table", + right_on="Table Name", + how="left", + ) + dfR.rename(columns={"Query": "From Lake Table"}, inplace=True) + dfR.drop(columns=["Table Name"], inplace=True) + dfR = pd.merge( + dfR, + dfP[["Table Name", "Query"]], + left_on="To Table", + right_on="Table Name", + how="left", + ) + dfR.rename(columns={"Query": "To Lake Table"}, inplace=True) + dfR.drop(columns=["Table Name"], inplace=True) + dfR = pd.merge( + dfR, + dfC[["Column Object", "Source"]], + left_on="From Object", + right_on="Column Object", + how="left", + ) + dfR.rename(columns={"Source": "From Lake Column"}, inplace=True) + dfR.drop(columns=["Column Object"], inplace=True) + dfR = pd.merge( + dfR, + dfC[["Column Object", "Source"]], + left_on="To Object", + right_on="Column Object", + how="left", + ) + dfR.rename(columns={"Source": "To Lake Column"}, inplace=True) + dfR.drop(columns=["Column Object"], inplace=True) spark = SparkSession.builder.getOrCreate() for i, r in dfR.iterrows(): - fromTable = r['From Lake Table'] - fromColumn = r['From Lake Column'] - toTable= r['To Lake Table'] - toColumn = r['To Lake Column'] + fromTable = r["From Lake Table"] + fromColumn = r["From Lake Column"] + toTable = r["To Lake Table"] + toColumn = r["To Lake Column"] if lakehouse_workspace == current_workspace: query = f"select count(f.{fromColumn}) as {fromColumn}\nfrom {fromTable} as f\nleft join {toTable} as c on f.{fromColumn} = c.{toColumn}\nwhere c.{toColumn} is null" else: - tempTableFrom = 'delta_table_' + fromTable - tempTableTo = 'delta_table_' + toTable + tempTableFrom = "delta_table_" + fromTable + tempTableTo = "delta_table_" + toTable query = f"select count(f.{fromColumn}) as {fromColumn}\nfrom {tempTableFrom} as f\nleft join {tempTableTo} as c on f.{fromColumn} = c.{toColumn}\nwhere c.{toColumn} is null" - - #query = f"select count(f.{fromColumn}) as {fromColumn}\nfrom {fromTable} as f\nleft join {toTable} as c on f.{fromColumn} = c.{toColumn}\nwhere c.{toColumn} is null" + + # query = f"select count(f.{fromColumn}) as {fromColumn}\nfrom {fromTable} as f\nleft join {toTable} as c on f.{fromColumn} = c.{toColumn}\nwhere c.{toColumn} is null" df = spark.sql(query) missingRows = df.collect()[0][0] - dfR.at[i, 'Missing Rows'] = missingRows - - dfR['Missing Rows'] = dfR['Missing Rows'].astype(int) + dfR.at[i, "Missing Rows"] = missingRows + + dfR["Missing Rows"] = dfR["Missing Rows"].astype(int) else: # Calculate missing rows using DAX for non-direct lake for i, r in dfR.iterrows(): - fromTable = r['From Table'] - fromColumn = r['From Column'] - toTable= r['To Table'] - toColumn = r['To Column'] - isActive = bool(r['Active']) + fromTable = r["From Table"] + fromColumn = r["From Column"] + toTable = r["To Table"] + toColumn = r["To Column"] + isActive = bool(r["Active"]) fromObject = format_dax_object_name(fromTable, fromColumn) - toObject= format_dax_object_name(toTable, toColumn) + toObject = format_dax_object_name(toTable, toColumn) 
missingRows = 0 query = f"evaluate\nsummarizecolumns(\n\"1\",calculate(countrows('{fromTable}'),isblank({toObject}))\n)" - if isActive == False: # add userelationship + if isActive == False: # add userelationship query = f"evaluate\nsummarizecolumns(\n\"1\",calculate(countrows('{fromTable}'),userelationship({fromObject},{toObject}),isblank({toObject}))\n)" - - result = fabric.evaluate_dax(dataset = dataset, dax_string = query, workspace = workspace) - try: - missingRows = result.iloc[0,0] + result = fabric.evaluate_dax( + dataset=dataset, dax_string=query, workspace=workspace + ) + + try: + missingRows = result.iloc[0, 0] except: pass - - dfR.at[i, 'Missing Rows'] = missingRows - dfR['Missing Rows'] = dfR['Missing Rows'].astype(int) - dfTP = dfP.groupby('Table Name')['Partition Name'].count().reset_index() - dfTP.rename(columns={'Partition Name': 'Partitions'}, inplace=True) - dfTC = dfC.groupby('Table Name')['Column Name'].count().reset_index() - dfTC.rename(columns={'Column Name': 'Columns'}, inplace=True) + dfR.at[i, "Missing Rows"] = missingRows + dfR["Missing Rows"] = dfR["Missing Rows"].astype(int) + + dfTP = dfP.groupby("Table Name")["Partition Name"].count().reset_index() + dfTP.rename(columns={"Partition Name": "Partitions"}, inplace=True) + dfTC = dfC.groupby("Table Name")["Column Name"].count().reset_index() + dfTC.rename(columns={"Column Name": "Columns"}, inplace=True) - total_size = dfC['Total Size'].sum() - table_sizes = dfC.groupby('Table Name')['Total Size'].sum().reset_index() - table_sizes.rename(columns={'Total Size': 'Table Size'}, inplace=True) + total_size = dfC["Total Size"].sum() + table_sizes = dfC.groupby("Table Name")["Total Size"].sum().reset_index() + table_sizes.rename(columns={"Total Size": "Table Size"}, inplace=True) # Columns - dfC_filt = dfC[~dfC['Column Name'].str.startswith('RowNumber-')] - dfC_filt['% DB'] = round((dfC_filt['Total Size'] / total_size) * 100,2) - dfC_filt = pd.merge(dfC_filt, table_sizes, on = 'Table Name', how = 'left') - dfC_filt['% Table'] = round((dfC_filt['Total Size'] / dfC_filt['Table Size']) * 100,2) - columnList = ['Table Name', 'Column Name', 'Type', 'Cardinality', 'Total Size', 'Data Size', 'Dictionary Size', 'Hierarchy Size','% Table', '% DB', 'Data Type', 'Encoding', 'Is Resident', 'Temperature', 'Last Accessed'] - - colSize = dfC_filt[columnList].sort_values(by='Total Size', ascending=False) - temp = dfC_filt[columnList].sort_values(by='Temperature', ascending=False) + dfC_filt = dfC[~dfC["Column Name"].str.startswith("RowNumber-")] + dfC_filt["% DB"] = round((dfC_filt["Total Size"] / total_size) * 100, 2) + dfC_filt = pd.merge(dfC_filt, table_sizes, on="Table Name", how="left") + dfC_filt["% Table"] = round( + (dfC_filt["Total Size"] / dfC_filt["Table Size"]) * 100, 2 + ) + columnList = [ + "Table Name", + "Column Name", + "Type", + "Cardinality", + "Total Size", + "Data Size", + "Dictionary Size", + "Hierarchy Size", + "% Table", + "% DB", + "Data Type", + "Encoding", + "Is Resident", + "Temperature", + "Last Accessed", + ] + + colSize = dfC_filt[columnList].sort_values(by="Total Size", ascending=False) + temp = dfC_filt[columnList].sort_values(by="Temperature", ascending=False) colSize.reset_index(drop=True, inplace=True) temp.reset_index(drop=True, inplace=True) export_Col = colSize.copy() - intList = ['Cardinality', 'Total Size', 'Data Size', 'Dictionary Size', 'Hierarchy Size'] - pctList = ['% Table', '% DB'] - colSize[intList] = colSize[intList].applymap('{:,}'.format) - temp[intList] = 
temp[intList].applymap('{:,}'.format) - colSize[pctList] = colSize[pctList].applymap('{:.2f}%'.format) - temp[pctList] = temp[pctList].applymap('{:.2f}%'.format) + intList = [ + "Cardinality", + "Total Size", + "Data Size", + "Dictionary Size", + "Hierarchy Size", + ] + pctList = ["% Table", "% DB"] + colSize[intList] = colSize[intList].applymap("{:,}".format) + temp[intList] = temp[intList].applymap("{:,}".format) + colSize[pctList] = colSize[pctList].applymap("{:.2f}%".format) + temp[pctList] = temp[pctList].applymap("{:.2f}%".format) # Tables - intList = ['Total Size', 'Data Size', 'Dictionary Size', 'Hierarchy Size'] - dfCSum = dfC.groupby(['Table Name'])[intList].sum().reset_index() - dfCSum['% DB'] = round((dfCSum['Total Size'] / total_size) * 100,2) - - dfTable = pd.merge(dfT[['Table Name', 'Type', 'Row Count']], dfCSum, on = 'Table Name', how = 'inner') - dfTable = pd.merge(dfTable,dfTP, on = 'Table Name', how = 'left') - dfTable = pd.merge(dfTable,dfTC, on = 'Table Name', how = 'left') - dfTable = dfTable.drop_duplicates() #Drop duplicates (temporary) - dfTable = dfTable.sort_values(by='Total Size', ascending=False) + intList = ["Total Size", "Data Size", "Dictionary Size", "Hierarchy Size"] + dfCSum = dfC.groupby(["Table Name"])[intList].sum().reset_index() + dfCSum["% DB"] = round((dfCSum["Total Size"] / total_size) * 100, 2) + + dfTable = pd.merge( + dfT[["Table Name", "Type", "Row Count"]], dfCSum, on="Table Name", how="inner" + ) + dfTable = pd.merge(dfTable, dfTP, on="Table Name", how="left") + dfTable = pd.merge(dfTable, dfTC, on="Table Name", how="left") + dfTable = dfTable.drop_duplicates() # Drop duplicates (temporary) + dfTable = dfTable.sort_values(by="Total Size", ascending=False) dfTable.reset_index(drop=True, inplace=True) export_Table = dfTable.copy() - intList.extend(['Row Count', 'Partitions', 'Columns']) - dfTable[intList] = dfTable[intList].applymap('{:,}'.format) - pctList = ['% DB'] - dfTable[pctList] = dfTable[pctList].applymap('{:.2f}%'.format) - - ## Relationships - #dfR.drop(columns=['Max From Cardinality', 'Max To Cardinality'], inplace=True) - dfR = pd.merge(dfR, dfC[['Column Object', 'Cardinality']], left_on = 'From Object', right_on = 'Column Object', how = 'left') - dfR.rename(columns={'Cardinality': 'Max From Cardinality'}, inplace=True) - dfR = pd.merge(dfR, dfC[['Column Object', 'Cardinality']], left_on = 'To Object', right_on = 'Column Object', how='left') - dfR.rename(columns={'Cardinality': 'Max To Cardinality'}, inplace=True) - dfR = dfR[['From Object', 'To Object', 'Multiplicity', 'Used Size', 'Max From Cardinality', 'Max To Cardinality', 'Missing Rows']].sort_values(by='Used Size', ascending=False) + intList.extend(["Row Count", "Partitions", "Columns"]) + dfTable[intList] = dfTable[intList].applymap("{:,}".format) + pctList = ["% DB"] + dfTable[pctList] = dfTable[pctList].applymap("{:.2f}%".format) + + ## Relationships + # dfR.drop(columns=['Max From Cardinality', 'Max To Cardinality'], inplace=True) + dfR = pd.merge( + dfR, + dfC[["Column Object", "Cardinality"]], + left_on="From Object", + right_on="Column Object", + how="left", + ) + dfR.rename(columns={"Cardinality": "Max From Cardinality"}, inplace=True) + dfR = pd.merge( + dfR, + dfC[["Column Object", "Cardinality"]], + left_on="To Object", + right_on="Column Object", + how="left", + ) + dfR.rename(columns={"Cardinality": "Max To Cardinality"}, inplace=True) + dfR = dfR[ + [ + "From Object", + "To Object", + "Multiplicity", + "Used Size", + "Max From Cardinality", + "Max To 
Cardinality", + "Missing Rows", + ] + ].sort_values(by="Used Size", ascending=False) dfR.reset_index(drop=True, inplace=True) export_Rel = dfR.copy() - intList = ['Used Size', 'Max From Cardinality', 'Max To Cardinality', 'Missing Rows'] + intList = [ + "Used Size", + "Max From Cardinality", + "Max To Cardinality", + "Missing Rows", + ] if read_stats_from_data == False: - intList.remove('Missing Rows') - dfR[intList] = dfR[intList].applymap('{:,}'.format) + intList.remove("Missing Rows") + dfR[intList] = dfR[intList].applymap("{:,}".format) ## Partitions - dfP = dfP[['Table Name', 'Partition Name', 'Mode', 'Record Count', 'Segment Count']].sort_values(by='Record Count', ascending=False) #, 'Records per Segment' - dfP['Records per Segment'] = round(dfP['Record Count'] / dfP['Segment Count'],2) # Remove after records per segment is fixed + dfP = dfP[ + ["Table Name", "Partition Name", "Mode", "Record Count", "Segment Count"] + ].sort_values( + by="Record Count", ascending=False + ) # , 'Records per Segment' + dfP["Records per Segment"] = round( + dfP["Record Count"] / dfP["Segment Count"], 2 + ) # Remove after records per segment is fixed dfP.reset_index(drop=True, inplace=True) export_Part = dfP.copy() - intList = ['Record Count', 'Segment Count', 'Records per Segment'] - dfP[intList] = dfP[intList].applymap('{:,}'.format) + intList = ["Record Count", "Segment Count", "Records per Segment"] + dfP[intList] = dfP[intList].applymap("{:,}".format) ## Hierarchies - dfH_filt = dfH[dfH['Level Ordinal'] == 0] - dfH_filt = dfH_filt[['Table Name', 'Hierarchy Name', 'Used Size']].sort_values(by='Used Size', ascending=False) + dfH_filt = dfH[dfH["Level Ordinal"] == 0] + dfH_filt = dfH_filt[["Table Name", "Hierarchy Name", "Used Size"]].sort_values( + by="Used Size", ascending=False + ) dfH_filt.reset_index(drop=True, inplace=True) export_Hier = dfH_filt.copy() - intList = ['Used Size'] - dfH_filt[intList] = dfH_filt[intList].applymap('{:,}'.format) + intList = ["Used Size"] + dfH_filt[intList] = dfH_filt[intList].applymap("{:,}".format) ## Model if total_size >= 1000000000: - y = total_size / (1024 ** 3) * 1000000000 + y = total_size / (1024**3) * 1000000000 elif total_size >= 1000000: - y = total_size / (1024 ** 2) * 1000000 + y = total_size / (1024**2) * 1000000 elif total_size >= 1000: y = total_size / (1024) * 1000 y = round(y) tblCount = len(dfT) colCount = len(dfC_filt) - compatLevel = dfD['Compatibility Level'].iloc[0] - defMode = dfD['Model Default Mode'].iloc[0] - - dfModel = pd.DataFrame({'Dataset Name': dataset, 'Total Size': y, 'Table Count': tblCount, 'Column Count': colCount, 'Compatibility Level': compatLevel, 'Default Mode': defMode}, index=[0]) + compatLevel = dfD["Compatibility Level"].iloc[0] + defMode = dfD["Model Default Mode"].iloc[0] + + dfModel = pd.DataFrame( + { + "Dataset Name": dataset, + "Total Size": y, + "Table Count": tblCount, + "Column Count": colCount, + "Compatibility Level": compatLevel, + "Default Mode": defMode, + }, + index=[0], + ) dfModel.reset_index(drop=True, inplace=True) export_Model = dfModel.copy() - intList = ['Total Size', 'Table Count', 'Column Count'] - dfModel[intList] = dfModel[intList].applymap('{:,}'.format) + intList = ["Total Size", "Table Count", "Column Count"] + dfModel[intList] = dfModel[intList].applymap("{:,}".format) dataFrames = { - 'dfModel': dfModel, - 'dfTable': dfTable, - 'dfP': dfP, - 'colSize': colSize, - 'temp': temp, - 'dfR': dfR, - 'dfH_filt': dfH_filt + "dfModel": dfModel, + "dfTable": dfTable, + "dfP": dfP, + "colSize": 
colSize, + "temp": temp, + "dfR": dfR, + "dfH_filt": dfH_filt, } dfs = {} for fileName, df in dataFrames.items(): dfs[fileName] = df - + visualize_vertipaq(dfs) ### Export vertipaq to delta tables in lakehouse - if export in ['table','zip']: - lakeAttach = lakehouse_attached() - if lakeAttach == False: - print(f"In order to save the Vertipaq Analyzer results, a lakehouse must be attached to the notebook. Please attach a lakehouse to this notebook.") + if export in ["table", "zip"]: + lakeAttach = lakehouse_attached() + if lakeAttach == False: + print( + f"In order to save the Vertipaq Analyzer results, a lakehouse must be attached to the notebook. Please attach a lakehouse to this notebook." + ) return - - if export == 'table': - spark = SparkSession.builder.getOrCreate() - - lakehouse_id = fabric.get_lakehouse_id() - lakehouse = resolve_lakehouse_name(lakehouse_id = lakehouse_id, workspace = workspace) - lakeTName = 'vertipaq_analyzer_model' - - lakeT = get_lakehouse_tables(lakehouse = lakehouse, workspace = workspace) - lakeT_filt = lakeT[lakeT['Table Name'] == lakeTName] - - query = f"SELECT MAX(RunId) FROM {lakehouse}.{lakeTName}" - - if len(lakeT_filt) == 0: - runId = 1 - else: - dfSpark = spark.sql(query) - maxRunId = dfSpark.collect()[0][0] - runId = maxRunId + 1 - - dfMap = { - 'export_Col': ['Columns', export_Col], - 'export_Table': ['Tables', export_Table], - 'export_Part': ['Partitions', export_Part], - 'export_Rel': ['Relationships', export_Rel], - 'export_Hier': ['Hierarchies', export_Hier], - 'export_Model': ['Model', export_Model] + + if export == "table": + spark = SparkSession.builder.getOrCreate() + + lakehouse_id = fabric.get_lakehouse_id() + lakehouse = resolve_lakehouse_name( + lakehouse_id=lakehouse_id, workspace=workspace + ) + lakeTName = "vertipaq_analyzer_model" + + lakeT = get_lakehouse_tables(lakehouse=lakehouse, workspace=workspace) + lakeT_filt = lakeT[lakeT["Table Name"] == lakeTName] + + query = f"SELECT MAX(RunId) FROM {lakehouse}.{lakeTName}" + + if len(lakeT_filt) == 0: + runId = 1 + else: + dfSpark = spark.sql(query) + maxRunId = dfSpark.collect()[0][0] + runId = maxRunId + 1 + + dfMap = { + "export_Col": ["Columns", export_Col], + "export_Table": ["Tables", export_Table], + "export_Part": ["Partitions", export_Part], + "export_Rel": ["Relationships", export_Rel], + "export_Hier": ["Hierarchies", export_Hier], + "export_Model": ["Model", export_Model], } - - print(f"Saving Vertipaq Analyzer to delta tables in the lakehouse...\n") - now = datetime.datetime.now() - for key, (obj, df) in dfMap.items(): - df['Timestamp'] = now - df['Workspace Name'] = workspace - df['Dataset Name'] = dataset - df['RunId'] = runId - - colName = 'Workspace Name' - df.insert(0, colName, df.pop(colName)) - colName = 'Dataset Name' - df.insert(1, colName, df.pop(colName)) - - df.columns = df.columns.str.replace(' ', '_') - - delta_table_name = f"VertipaqAnalyzer_{obj}".lower() - spark_df = spark.createDataFrame(df) - spark_df.write.mode('append').format('delta').saveAsTable(delta_table_name) - print(f"\u2022 Vertipaq Analyzer results for '{obj}' have been appended to the '{delta_table_name}' delta table.") + + print(f"Saving Vertipaq Analyzer to delta tables in the lakehouse...\n") + now = datetime.datetime.now() + for key, (obj, df) in dfMap.items(): + df["Timestamp"] = now + df["Workspace Name"] = workspace + df["Dataset Name"] = dataset + df["RunId"] = runId + + colName = "Workspace Name" + df.insert(0, colName, df.pop(colName)) + colName = "Dataset Name" + df.insert(1, 
colName, df.pop(colName)) + + df.columns = df.columns.str.replace(" ", "_") + + delta_table_name = f"VertipaqAnalyzer_{obj}".lower() + spark_df = spark.createDataFrame(df) + spark_df.write.mode("append").format("delta").saveAsTable(delta_table_name) + print( + f"\u2022 Vertipaq Analyzer results for '{obj}' have been appended to the '{delta_table_name}' delta table." + ) ### Export vertipaq to zip file within the lakehouse - if export == 'zip': - dataFrames = { - 'dfModel': dfModel, - 'dfTable': dfTable, - 'dfP': dfP, - 'colSize': colSize, - 'temp': temp, - 'dfR': dfR, - 'dfH_filt': dfH_filt - } - - zipFileName = f"{workspace}.{dataset}.zip" - - folderPath = '/lakehouse/default/Files' - subFolderPath = os.path.join(folderPath, 'VertipaqAnalyzer') - ext = '.csv' - if not os.path.exists(subFolderPath): - os.makedirs(subFolderPath, exist_ok=True) - zipFilePath = os.path.join(subFolderPath, zipFileName) - - # Create CSV files based on dataframes - for fileName, df in dataFrames.items(): - filePath = os.path.join(subFolderPath, fileName + ext) - df.to_csv(filePath, index=False) - - # Create a zip file and add CSV files to it - with zipfile.ZipFile(zipFilePath, 'w') as zipf: - for fileName in dataFrames: - filePath = os.path.join(subFolderPath, fileName + ext) - zipf.write(filePath, os.path.basename(filePath)) - - # Clean up: remove the individual CSV files - for fileName, df in dataFrames.items(): - filePath = os.path.join(subFolderPath, fileName) + ext - if os.path.exists(filePath): - os.remove(filePath) - print(f"The Vertipaq Analyzer info for the '{dataset}' semantic model in the '{workspace}' workspace has been saved to the 'Vertipaq Analyzer/{zipFileName}' in the default lakehouse attached to this notebook.") + if export == "zip": + dataFrames = { + "dfModel": dfModel, + "dfTable": dfTable, + "dfP": dfP, + "colSize": colSize, + "temp": temp, + "dfR": dfR, + "dfH_filt": dfH_filt, + } + + zipFileName = f"{workspace}.{dataset}.zip" + + folderPath = "/lakehouse/default/Files" + subFolderPath = os.path.join(folderPath, "VertipaqAnalyzer") + ext = ".csv" + if not os.path.exists(subFolderPath): + os.makedirs(subFolderPath, exist_ok=True) + zipFilePath = os.path.join(subFolderPath, zipFileName) + + # Create CSV files based on dataframes + for fileName, df in dataFrames.items(): + filePath = os.path.join(subFolderPath, fileName + ext) + df.to_csv(filePath, index=False) + + # Create a zip file and add CSV files to it + with zipfile.ZipFile(zipFilePath, "w") as zipf: + for fileName in dataFrames: + filePath = os.path.join(subFolderPath, fileName + ext) + zipf.write(filePath, os.path.basename(filePath)) + + # Clean up: remove the individual CSV files + for fileName, df in dataFrames.items(): + filePath = os.path.join(subFolderPath, fileName) + ext + if os.path.exists(filePath): + os.remove(filePath) + print( + f"The Vertipaq Analyzer info for the '{dataset}' semantic model in the '{workspace}' workspace has been saved to the 'Vertipaq Analyzer/{zipFileName}' in the default lakehouse attached to this notebook." 
+ ) + def visualize_vertipaq(dataframes): - + # Tooltips for columns within the visual data = [ - {'ViewName': 'Model', 'ColumnName': 'Dataset Name', 'Tooltip': 'The name of the semantic model'}, - {'ViewName': 'Model', 'ColumnName': 'Total Size', 'Tooltip': 'The size of the model (in bytes)'}, - {'ViewName': 'Model', 'ColumnName': 'Table Count', 'Tooltip': 'The number of tables in the semantic model'}, - {'ViewName': 'Model', 'ColumnName': 'Column Count', 'Tooltip': 'The number of columns in the semantic model'}, - {'ViewName': 'Model', 'ColumnName': 'Compatibility Level', 'Tooltip': 'The compatibility level of the semantic model'}, - {'ViewName': 'Model', 'ColumnName': 'Default Mode', 'Tooltip': 'The default query mode of the semantic model'}, - {'ViewName': 'Table', 'ColumnName': 'Table Name', 'Tooltip': 'The name of the table'}, - {'ViewName': 'Table', 'ColumnName': 'Type', 'Tooltip': 'The type of table'}, - {'ViewName': 'Table', 'ColumnName': 'Row Count', 'Tooltip': 'The number of rows in the table'}, - {'ViewName': 'Table', 'ColumnName': 'Total Size', 'Tooltip': 'Data Size + Dictionary Size + Hierarchy Size (in bytes)'}, - {'ViewName': 'Table', 'ColumnName': 'Data Size', 'Tooltip': 'The size of the data for all the columns in this table (in bytes)'}, - {'ViewName': 'Table', 'ColumnName': 'Dictionary Size', 'Tooltip': "The size of the column's dictionary for all columns in this table (in bytes)"}, - {'ViewName': 'Table', 'ColumnName': 'Hierarchy Size', 'Tooltip': 'The size of hierarchy structures for all columns in this table (in bytes)'}, - {'ViewName': 'Table', 'ColumnName': '% DB', 'Tooltip': 'The size of the table relative to the size of the semantic model'}, - {'ViewName': 'Table', 'ColumnName': 'Partitions', 'Tooltip': 'The number of partitions in the table'}, - {'ViewName': 'Table', 'ColumnName': 'Columns', 'Tooltip': 'The number of columns in the table'}, - {'ViewName': 'Partition', 'ColumnName': 'Table Name', 'Tooltip': 'The name of the table'}, - {'ViewName': 'Partition', 'ColumnName': 'Partition Name', 'Tooltip': 'The name of the partition within the table'}, - {'ViewName': 'Partition', 'ColumnName': 'Mode', 'Tooltip': 'The query mode of the partition'}, - {'ViewName': 'Partition', 'ColumnName': 'Record Count', 'Tooltip': 'The number of rows in the partition'}, - {'ViewName': 'Partition', 'ColumnName': 'Segment Count', 'Tooltip': 'The number of segments within the partition'}, - {'ViewName': 'Partition', 'ColumnName': 'Records per Segment', 'Tooltip': 'The number of rows per segment'}, - {'ViewName': 'Column', 'ColumnName': 'Table Name', 'Tooltip': 'The name of the table'}, - {'ViewName': 'Column', 'ColumnName': 'Column Name', 'Tooltip': 'The name of the column'}, - {'ViewName': 'Column', 'ColumnName': 'Type', 'Tooltip': 'The type of column'}, - {'ViewName': 'Column', 'ColumnName': 'Cardinality', 'Tooltip': 'The number of unique rows in the column'}, - {'ViewName': 'Column', 'ColumnName': 'Total Size', 'Tooltip': 'Data Size + Dictionary Size + Hierarchy Size (in bytes)'}, - {'ViewName': 'Column', 'ColumnName': 'Data Size', 'Tooltip': 'The size of the data for the column (in bytes)'}, - {'ViewName': 'Column', 'ColumnName': 'Dictionary Size', 'Tooltip': "The size of the column's dictionary (in bytes)"}, - {'ViewName': 'Column', 'ColumnName': 'Hierarchy Size', 'Tooltip': 'The size of hierarchy structures (in bytes)'}, - {'ViewName': 'Column', 'ColumnName': '% Table', 'Tooltip': 'The size of the column relative to the size of the table'}, - {'ViewName': 'Column', 'ColumnName': 
'% DB', 'Tooltip': 'The size of the column relative to the size of the semantic model'}, - {'ViewName': 'Column', 'ColumnName': 'Data Type', 'Tooltip': 'The data type of the column'}, - {'ViewName': 'Column', 'ColumnName': 'Encoding', 'Tooltip': 'The encoding type for the column'}, - {'ViewName': 'Column', 'ColumnName': 'Is Resident', 'Tooltip': 'Indicates whether the column is in memory or not'}, - {'ViewName': 'Column', 'ColumnName': 'Temperature', 'Tooltip': 'A decimal indicating the frequency and recency of queries against the column'}, - {'ViewName': 'Column', 'ColumnName': 'Last Accessed', 'Tooltip': 'The time the column was last queried'}, - {'ViewName': 'Hierarchy', 'ColumnName': 'Table Name', 'Tooltip': 'The name of the table'}, - {'ViewName': 'Hierarchy', 'ColumnName': 'Hierarchy Name', 'Tooltip': 'The name of the hierarchy'}, - {'ViewName': 'Hierarchy', 'ColumnName': 'Used Size', 'Tooltip': 'The size of user hierarchy structures (in bytes)'}, - {'ViewName': 'Relationship', 'ColumnName': 'From Object', 'Tooltip': 'The from table/column in the relationship'}, - {'ViewName': 'Relationship', 'ColumnName': 'To Object', 'Tooltip': 'The to table/column in the relationship'}, - {'ViewName': 'Relationship', 'ColumnName': 'Multiplicity', 'Tooltip': 'The cardinality on each side of the relationship'}, - {'ViewName': 'Relationship', 'ColumnName': 'Used Size', 'Tooltip': 'The size of the relationship (in bytes)'}, - {'ViewName': 'Relationship', 'ColumnName': 'Max From Cardinality', 'Tooltip': 'The number of unique values in the column used in the from side of the relationship'}, - {'ViewName': 'Relationship', 'ColumnName': 'Max To Cardinality', 'Tooltip': 'The number of unique values in the column used in the to side of the relationship'}, - {'ViewName': 'Relationship', 'ColumnName': 'Missing Rows', 'Tooltip': "The number of rows in the 'from' table which do not map to the key column in the 'to' table"} + { + "ViewName": "Model", + "ColumnName": "Dataset Name", + "Tooltip": "The name of the semantic model", + }, + { + "ViewName": "Model", + "ColumnName": "Total Size", + "Tooltip": "The size of the model (in bytes)", + }, + { + "ViewName": "Model", + "ColumnName": "Table Count", + "Tooltip": "The number of tables in the semantic model", + }, + { + "ViewName": "Model", + "ColumnName": "Column Count", + "Tooltip": "The number of columns in the semantic model", + }, + { + "ViewName": "Model", + "ColumnName": "Compatibility Level", + "Tooltip": "The compatibility level of the semantic model", + }, + { + "ViewName": "Model", + "ColumnName": "Default Mode", + "Tooltip": "The default query mode of the semantic model", + }, + { + "ViewName": "Table", + "ColumnName": "Table Name", + "Tooltip": "The name of the table", + }, + {"ViewName": "Table", "ColumnName": "Type", "Tooltip": "The type of table"}, + { + "ViewName": "Table", + "ColumnName": "Row Count", + "Tooltip": "The number of rows in the table", + }, + { + "ViewName": "Table", + "ColumnName": "Total Size", + "Tooltip": "Data Size + Dictionary Size + Hierarchy Size (in bytes)", + }, + { + "ViewName": "Table", + "ColumnName": "Data Size", + "Tooltip": "The size of the data for all the columns in this table (in bytes)", + }, + { + "ViewName": "Table", + "ColumnName": "Dictionary Size", + "Tooltip": "The size of the column's dictionary for all columns in this table (in bytes)", + }, + { + "ViewName": "Table", + "ColumnName": "Hierarchy Size", + "Tooltip": "The size of hierarchy structures for all columns in this table (in bytes)", + }, + { + 
"ViewName": "Table", + "ColumnName": "% DB", + "Tooltip": "The size of the table relative to the size of the semantic model", + }, + { + "ViewName": "Table", + "ColumnName": "Partitions", + "Tooltip": "The number of partitions in the table", + }, + { + "ViewName": "Table", + "ColumnName": "Columns", + "Tooltip": "The number of columns in the table", + }, + { + "ViewName": "Partition", + "ColumnName": "Table Name", + "Tooltip": "The name of the table", + }, + { + "ViewName": "Partition", + "ColumnName": "Partition Name", + "Tooltip": "The name of the partition within the table", + }, + { + "ViewName": "Partition", + "ColumnName": "Mode", + "Tooltip": "The query mode of the partition", + }, + { + "ViewName": "Partition", + "ColumnName": "Record Count", + "Tooltip": "The number of rows in the partition", + }, + { + "ViewName": "Partition", + "ColumnName": "Segment Count", + "Tooltip": "The number of segments within the partition", + }, + { + "ViewName": "Partition", + "ColumnName": "Records per Segment", + "Tooltip": "The number of rows per segment", + }, + { + "ViewName": "Column", + "ColumnName": "Table Name", + "Tooltip": "The name of the table", + }, + { + "ViewName": "Column", + "ColumnName": "Column Name", + "Tooltip": "The name of the column", + }, + {"ViewName": "Column", "ColumnName": "Type", "Tooltip": "The type of column"}, + { + "ViewName": "Column", + "ColumnName": "Cardinality", + "Tooltip": "The number of unique rows in the column", + }, + { + "ViewName": "Column", + "ColumnName": "Total Size", + "Tooltip": "Data Size + Dictionary Size + Hierarchy Size (in bytes)", + }, + { + "ViewName": "Column", + "ColumnName": "Data Size", + "Tooltip": "The size of the data for the column (in bytes)", + }, + { + "ViewName": "Column", + "ColumnName": "Dictionary Size", + "Tooltip": "The size of the column's dictionary (in bytes)", + }, + { + "ViewName": "Column", + "ColumnName": "Hierarchy Size", + "Tooltip": "The size of hierarchy structures (in bytes)", + }, + { + "ViewName": "Column", + "ColumnName": "% Table", + "Tooltip": "The size of the column relative to the size of the table", + }, + { + "ViewName": "Column", + "ColumnName": "% DB", + "Tooltip": "The size of the column relative to the size of the semantic model", + }, + { + "ViewName": "Column", + "ColumnName": "Data Type", + "Tooltip": "The data type of the column", + }, + { + "ViewName": "Column", + "ColumnName": "Encoding", + "Tooltip": "The encoding type for the column", + }, + { + "ViewName": "Column", + "ColumnName": "Is Resident", + "Tooltip": "Indicates whether the column is in memory or not", + }, + { + "ViewName": "Column", + "ColumnName": "Temperature", + "Tooltip": "A decimal indicating the frequency and recency of queries against the column", + }, + { + "ViewName": "Column", + "ColumnName": "Last Accessed", + "Tooltip": "The time the column was last queried", + }, + { + "ViewName": "Hierarchy", + "ColumnName": "Table Name", + "Tooltip": "The name of the table", + }, + { + "ViewName": "Hierarchy", + "ColumnName": "Hierarchy Name", + "Tooltip": "The name of the hierarchy", + }, + { + "ViewName": "Hierarchy", + "ColumnName": "Used Size", + "Tooltip": "The size of user hierarchy structures (in bytes)", + }, + { + "ViewName": "Relationship", + "ColumnName": "From Object", + "Tooltip": "The from table/column in the relationship", + }, + { + "ViewName": "Relationship", + "ColumnName": "To Object", + "Tooltip": "The to table/column in the relationship", + }, + { + "ViewName": "Relationship", + "ColumnName": "Multiplicity", + 
"Tooltip": "The cardinality on each side of the relationship", + }, + { + "ViewName": "Relationship", + "ColumnName": "Used Size", + "Tooltip": "The size of the relationship (in bytes)", + }, + { + "ViewName": "Relationship", + "ColumnName": "Max From Cardinality", + "Tooltip": "The number of unique values in the column used in the from side of the relationship", + }, + { + "ViewName": "Relationship", + "ColumnName": "Max To Cardinality", + "Tooltip": "The number of unique values in the column used in the to side of the relationship", + }, + { + "ViewName": "Relationship", + "ColumnName": "Missing Rows", + "Tooltip": "The number of rows in the 'from' table which do not map to the key column in the 'to' table", + }, ] # Create DataFrame tooltipDF = pd.DataFrame(data) - #define the dictionary with {"Tab name":df} + # define the dictionary with {"Tab name":df} df_dict = { - "Model Summary":dataframes['dfModel'], - "Tables":dataframes['dfTable'], - "Partitions": dataframes['dfP'], - "Columns (Total Size)": dataframes['colSize'], - "Columns (Temperature)": dataframes['temp'], - "Relationships": dataframes['dfR'], - "Hierarchies": dataframes['dfH_filt'] - } + "Model Summary": dataframes["dfModel"], + "Tables": dataframes["dfTable"], + "Partitions": dataframes["dfP"], + "Columns (Total Size)": dataframes["colSize"], + "Columns (Temperature)": dataframes["temp"], + "Relationships": dataframes["dfR"], + "Hierarchies": dataframes["dfH_filt"], + } mapping = { - 'Model Summary': 'Model', - 'Tables': 'Table', - 'Partitions': 'Partition', - 'Columns (Total Size)': 'Column', - 'Columns (Temperature)': 'Column', - 'Relationships': 'Relationship', - 'Hierarchies': 'Hierarchy' -} + "Model Summary": "Model", + "Tables": "Table", + "Partitions": "Partition", + "Columns (Total Size)": "Column", + "Columns (Temperature)": "Column", + "Relationships": "Relationship", + "Hierarchies": "Hierarchy", + } # Basic styles for the tabs and tab content styles = """ @@ -505,10 +822,9 @@ def visualize_vertipaq(dataframes): """ - # HTML for tabs tab_html = '
' - content_html = '' + content_html = "" for i, (title, df) in enumerate(df_dict.items()): tab_id = f"tab{i}" tab_html += f'' @@ -519,23 +835,29 @@ def visualize_vertipaq(dataframes): for col in df.columns: tt = None try: - tooltipDF_filt = tooltipDF[(tooltipDF['ViewName'] == vw) & (tooltipDF['ColumnName'] == col)] - tt = tooltipDF_filt['Tooltip'].iloc[0] + tooltipDF_filt = tooltipDF[ + (tooltipDF["ViewName"] == vw) & (tooltipDF["ColumnName"] == col) + ] + tt = tooltipDF_filt["Tooltip"].iloc[0] except: pass - df_html = df_html.replace(f'{col}', f'{col}') - content_html += f'

{title}

{df_html}
' - tab_html += '
' + df_html = df_html.replace(f"{col}", f'{col}') + content_html += ( + f'

{title}

{df_html}
' + ) + tab_html += "" # Display the tabs, tab contents, and run the script display(HTML(styles + tab_html + content_html + script)) # Default to open the first tab - display(HTML("")) + display( + HTML("") + ) + @log def import_vertipaq_analyzer(folder_path: str, file_name: str): - - """ + """ Imports and visualizes the vertipaq analyzer info from a saved .zip file in your lakehouse. Parameters @@ -550,22 +872,22 @@ def import_vertipaq_analyzer(folder_path: str, file_name: str): str A visualization of the Vertipaq Analyzer statistics. """ - - pd.options.mode.copy_on_write = True - zipFilePath = os.path.join(folder_path, file_name) - extracted_dir = os.path.join(folder_path, 'extracted_dataframes') + pd.options.mode.copy_on_write = True - with zipfile.ZipFile(zipFilePath, 'r') as zip_ref: - zip_ref.extractall(extracted_dir) + zipFilePath = os.path.join(folder_path, file_name) + extracted_dir = os.path.join(folder_path, "extracted_dataframes") - # Read all CSV files into a dictionary of DataFrames - dfs = {} - for file_name in zip_ref.namelist(): - df = pd.read_csv(extracted_dir + '/' + file_name) - dfs[file_name] = df + with zipfile.ZipFile(zipFilePath, "r") as zip_ref: + zip_ref.extractall(extracted_dir) - visualize_vertipaq(dfs) + # Read all CSV files into a dictionary of DataFrames + dfs = {} + for file_name in zip_ref.namelist(): + df = pd.read_csv(extracted_dir + "/" + file_name) + dfs[file_name] = df + + visualize_vertipaq(dfs) - # Clean up: remove the extracted directory - shutil.rmtree(extracted_dir) \ No newline at end of file + # Clean up: remove the extracted directory + shutil.rmtree(extracted_dir) diff --git a/sempy_labs/WarmCache.py b/sempy_labs/WarmCache.py index b4d340d0..eae67b1b 100644 --- a/sempy_labs/WarmCache.py +++ b/sempy_labs/WarmCache.py @@ -4,20 +4,21 @@ from tqdm.auto import tqdm import numpy as np import time -from .HelperFunctions import format_dax_object_name +from ._helper_functions import format_dax_object_name from .RefreshSemanticModel import refresh_semantic_model from .GetMeasureDependencies import get_measure_dependencies from typing import List, Optional, Union from sempy._utils._log import log +import sempy_labs._icons as icons -green_dot = '\U0001F7E2' -yellow_dot = '\U0001F7E1' -red_dot = '\U0001F534' -in_progress = '⌛' @log -def warm_direct_lake_cache_perspective(dataset: str, perspective: str, add_dependencies: Optional[bool] = False, workspace: Optional[str] = None): - +def warm_direct_lake_cache_perspective( + dataset: str, + perspective: str, + add_dependencies: Optional[bool] = False, + workspace: Optional[str] = None, +): """ Warms the cache of a Direct Lake semantic model by running a simple DAX query against the columns in a perspective. @@ -33,10 +34,10 @@ def warm_direct_lake_cache_perspective(dataset: str, perspective: str, add_depen The Fabric workspace name. Defaults to None which resolves to the workspace of the attached lakehouse or if no lakehouse attached, resolves to the workspace of the notebook. - + Returns ------- - + """ if workspace == None: @@ -45,79 +46,109 @@ def warm_direct_lake_cache_perspective(dataset: str, perspective: str, add_depen else: workspace_id = fabric.resolve_workspace_id(workspace) - dfP = fabric.list_partitions(dataset = dataset, workspace = workspace) - if not any(r['Mode'] == 'DirectLake' for i, r in dfP.iterrows()): - print(f"{red_dot} The '{dataset}' semantic model in the '{workspace}' workspace is not in Direct Lake mode. 
This function is specifically for semantic models in Direct Lake mode.") + dfP = fabric.list_partitions(dataset=dataset, workspace=workspace) + if not any(r["Mode"] == "DirectLake" for i, r in dfP.iterrows()): + print( + f"{icons.red_dot} The '{dataset}' semantic model in the '{workspace}' workspace is not in Direct Lake mode. This function is specifically for semantic models in Direct Lake mode." + ) return - - dfPersp = fabric.list_perspectives(dataset = dataset, workspace = workspace) - dfPersp['DAX Object Name'] = format_dax_object_name(dfPersp['Table Name'], dfPersp['Object Name']) - dfPersp_filt = dfPersp[dfPersp['Perspective Name'] == perspective] + + dfPersp = fabric.list_perspectives(dataset=dataset, workspace=workspace) + dfPersp["DAX Object Name"] = format_dax_object_name( + dfPersp["Table Name"], dfPersp["Object Name"] + ) + dfPersp_filt = dfPersp[dfPersp["Perspective Name"] == perspective] if len(dfPersp_filt) == 0: - print(f"{red_dot} The '{perspective} perspective does not exist or contains no objects within the '{dataset}' semantic model in the '{workspace}' workspace.") + print( + f"{icons.red_dot} The '{perspective} perspective does not exist or contains no objects within the '{dataset}' semantic model in the '{workspace}' workspace." + ) return - dfPersp_c = dfPersp_filt[dfPersp_filt['Object Type'] == 'Column'] + dfPersp_c = dfPersp_filt[dfPersp_filt["Object Type"] == "Column"] - column_values = dfPersp_c['DAX Object Name'].tolist() + column_values = dfPersp_c["DAX Object Name"].tolist() if add_dependencies: # Measure dependencies md = get_measure_dependencies(dataset, workspace) - md['Referenced Full Object'] = format_dax_object_name(md['Referenced Table'], md['Referenced Object']) - dfPersp_m = dfPersp_filt[(dfPersp_filt['Object Type'] == 'Measure')] - md_filt = md[(md['Object Name'].isin(dfPersp_m['Object Name'].values)) & (md['Referenced Object Type'] == 'Column')] - measureDep = md_filt['Referenced Full Object'].unique() + md["Referenced Full Object"] = format_dax_object_name( + md["Referenced Table"], md["Referenced Object"] + ) + dfPersp_m = dfPersp_filt[(dfPersp_filt["Object Type"] == "Measure")] + md_filt = md[ + (md["Object Name"].isin(dfPersp_m["Object Name"].values)) + & (md["Referenced Object Type"] == "Column") + ] + measureDep = md_filt["Referenced Full Object"].unique() # Hierarchy dependencies - dfPersp_h = dfPersp_filt[(dfPersp_filt['Object Type'] == 'Hierarchy')] - dfH = fabric.list_hierarchies(dataset = dataset, workspace = workspace) - dfH['Hierarchy Object'] = format_dax_object_name(dfH['Table Name'], dfH['Hierarchy Name']) - dfH['Column Object'] = format_dax_object_name(dfH['Table Name'], dfH['Column Name']) - dfH_filt = dfH[dfH['Hierarchy Object'].isin(dfPersp_h['DAX Object Name'].values)] - hierarchyDep = dfH_filt['Column Object'].unique() + dfPersp_h = dfPersp_filt[(dfPersp_filt["Object Type"] == "Hierarchy")] + dfH = fabric.list_hierarchies(dataset=dataset, workspace=workspace) + dfH["Hierarchy Object"] = format_dax_object_name( + dfH["Table Name"], dfH["Hierarchy Name"] + ) + dfH["Column Object"] = format_dax_object_name( + dfH["Table Name"], dfH["Column Name"] + ) + dfH_filt = dfH[ + dfH["Hierarchy Object"].isin(dfPersp_h["DAX Object Name"].values) + ] + hierarchyDep = dfH_filt["Column Object"].unique() # Relationship dependencies - unique_table_names = dfPersp_filt['Table Name'].unique() - dfR = fabric.list_relationships(dataset = dataset, workspace = workspace) - dfR['From Object'] = format_dax_object_name(dfR['From Table'], dfR['From 
Column']) - dfR['To Object'] = format_dax_object_name(dfR['To Table'], dfR['To Column']) - filtered_dfR = dfR[dfR['From Table'].isin(unique_table_names) & dfR['To Table'].isin(unique_table_names)] - - fromObjects = filtered_dfR['From Object'].unique() - toObjects = filtered_dfR['To Object'].unique() - - merged_list = np.concatenate([column_values, measureDep, hierarchyDep, fromObjects, toObjects]) + unique_table_names = dfPersp_filt["Table Name"].unique() + dfR = fabric.list_relationships(dataset=dataset, workspace=workspace) + dfR["From Object"] = format_dax_object_name( + dfR["From Table"], dfR["From Column"] + ) + dfR["To Object"] = format_dax_object_name(dfR["To Table"], dfR["To Column"]) + filtered_dfR = dfR[ + dfR["From Table"].isin(unique_table_names) + & dfR["To Table"].isin(unique_table_names) + ] + + fromObjects = filtered_dfR["From Object"].unique() + toObjects = filtered_dfR["To Object"].unique() + + merged_list = np.concatenate( + [column_values, measureDep, hierarchyDep, fromObjects, toObjects] + ) merged_list_unique = list(set(merged_list)) else: merged_list_unique = column_values - df = pd.DataFrame(merged_list_unique, columns=['DAX Object Name']) - df[['Table Name', 'Column Name']] = df['DAX Object Name'].str.split('[', expand=True) - df['Table Name'] = df['Table Name'].str[1:-1] - df['Column Name'] = df['Column Name'].str[0:-1] + df = pd.DataFrame(merged_list_unique, columns=["DAX Object Name"]) + df[["Table Name", "Column Name"]] = df["DAX Object Name"].str.split( + "[", expand=True + ) + df["Table Name"] = df["Table Name"].str[1:-1] + df["Column Name"] = df["Column Name"].str[0:-1] - tbls = list(set(value.split('[')[0] for value in merged_list_unique)) + tbls = list(set(value.split("[")[0] for value in merged_list_unique)) for tableName in (bar := tqdm(tbls)): - filtered_list = [value for value in merged_list_unique if value.startswith(f"{tableName}[")] + filtered_list = [ + value for value in merged_list_unique if value.startswith(f"{tableName}[") + ] bar.set_description(f"Warming the '{tableName}' table...") - css = ','.join(map(str, filtered_list)) - dax = """EVALUATE TOPN(1,SUMMARIZECOLUMNS(""" + css + "))""" - x = fabric.evaluate_dax(dataset = dataset, dax_string = dax, workspace = workspace) - - print(f"{green_dot} The following columns have been put into memory:") + css = ",".join(map(str, filtered_list)) + dax = """EVALUATE TOPN(1,SUMMARIZECOLUMNS(""" + css + "))" "" + x = fabric.evaluate_dax(dataset=dataset, dax_string=dax, workspace=workspace) + + print(f"{icons.green_dot} The following columns have been put into memory:") - new_column_order = ['Table Name', 'Column Name', 'DAX Object Name'] + new_column_order = ["Table Name", "Column Name", "DAX Object Name"] df = df.reindex(columns=new_column_order) - df = df[['Table Name', 'Column Name']].sort_values(by=['Table Name', 'Column Name'], ascending=True) - + df = df[["Table Name", "Column Name"]].sort_values( + by=["Table Name", "Column Name"], ascending=True + ) + return df + @log def warm_direct_lake_cache_isresident(dataset: str, workspace: Optional[str] = None): - """ Performs a refresh on the semantic model and puts the columns which were in memory prior to the refresh back into memory. @@ -129,11 +160,11 @@ def warm_direct_lake_cache_isresident(dataset: str, workspace: Optional[str] = N The Fabric workspace name. Defaults to None which resolves to the workspace of the attached lakehouse or if no lakehouse attached, resolves to the workspace of the notebook. 
- + Returns ------- - - """ + + """ if workspace == None: workspace_id = fabric.get_workspace_id() @@ -141,35 +172,45 @@ def warm_direct_lake_cache_isresident(dataset: str, workspace: Optional[str] = N else: workspace_id = fabric.resolve_workspace_id(workspace) - dfP = fabric.list_partitions(dataset = dataset, workspace = workspace) - if not any(r['Mode'] == 'DirectLake' for i, r in dfP.iterrows()): - print(f"The '{dataset}' semantic model in the '{workspace}' workspace is not in Direct Lake mode. This function is specifically for semantic models in Direct Lake mode.") + dfP = fabric.list_partitions(dataset=dataset, workspace=workspace) + if not any(r["Mode"] == "DirectLake" for i, r in dfP.iterrows()): + print( + f"The '{dataset}' semantic model in the '{workspace}' workspace is not in Direct Lake mode. This function is specifically for semantic models in Direct Lake mode." + ) return - + # Identify columns which are currently in memory (Is Resident = True) - dfC = fabric.list_columns(dataset = dataset, workspace = workspace, extended = True) - dfC['DAX Object Name'] = format_dax_object_name(dfC['Table Name'], dfC['Column Name']) - dfC_filtered = dfC[dfC['Is Resident']] + dfC = fabric.list_columns(dataset=dataset, workspace=workspace, extended=True) + dfC["DAX Object Name"] = format_dax_object_name( + dfC["Table Name"], dfC["Column Name"] + ) + dfC_filtered = dfC[dfC["Is Resident"]] if len(dfC_filtered) == 0: - print(f"{yellow_dot} At present, no columns are in memory in the '{dataset}' semantic model in the '{workspace}' workspace.") + print( + f"{icons.yellow_dot} At present, no columns are in memory in the '{dataset}' semantic model in the '{workspace}' workspace." + ) return # Refresh/frame dataset - refresh_semantic_model(dataset = dataset, refresh_type = 'full', workspace = workspace) + refresh_semantic_model(dataset=dataset, refresh_type="full", workspace=workspace) time.sleep(2) - tbls = dfC_filtered['Table Name'].unique() - column_values = dfC_filtered['DAX Object Name'].tolist() + tbls = dfC_filtered["Table Name"].unique() + column_values = dfC_filtered["DAX Object Name"].tolist() # Run basic query to get columns into memory; completed one table at a time (so as not to overload the capacity) for tableName in (bar := tqdm(tbls)): bar.set_description(f"Warming the '{tableName}' table...") - css = ','.join(map(str, column_values)) - dax = """EVALUATE TOPN(1,SUMMARIZECOLUMNS(""" + css + "))""" - x = fabric.evaluate_dax(dataset = dataset, dax_string = dax, workspace = workspace) + css = ",".join(map(str, column_values)) + dax = """EVALUATE TOPN(1,SUMMARIZECOLUMNS(""" + css + "))" "" + x = fabric.evaluate_dax(dataset=dataset, dax_string=dax, workspace=workspace) - print(f"{green_dot} The following columns have been put into memory. Temperature indicates the column temperature prior to the semantic model refresh.") + print( + f"{icons.green_dot} The following columns have been put into memory. Temperature indicates the column temperature prior to the semantic model refresh." 
+ ) - return dfC_filtered[['Table Name', 'Column Name', 'Is Resident', 'Temperature']].sort_values(by=['Table Name', 'Column Name'], ascending=True) + return dfC_filtered[ + ["Table Name", "Column Name", "Is Resident", "Temperature"] + ].sort_values(by=["Table Name", "Column Name"], ascending=True) diff --git a/sempy_labs/__init__.py b/sempy_labs/__init__.py index bc1d8850..29d98378 100644 --- a/sempy_labs/__init__.py +++ b/sempy_labs/__init__.py @@ -1 +1,27 @@ -from sempy_labs._clear_cache import clear_cache as clear_cache \ No newline at end of file +from sempy_labs._clear_cache import clear_cache as clear_cache +from sempy_labs._create_blank_semantic_model import ( + create_blank_semantic_model as create_blank_semantic_model, +) +from sempy_labs._create_pqt_file import create_pqt_file as create_pqt_file +from sempy_labs._fallback import check_fallback_reason as check_fallback_reason +from sempy_labs._generate_semantic_model import ( + create_semantic_model_from_bim as create_semantic_model_from_bim, + deploy_semantic_model as deploy_semantic_model, +) +from sempy_labs._list_functions import ( + get_object_level_security as get_object_level_security, +) +from sempy_labs._helper_functions import ( + resolve_lakehouse_name as resolve_lakehouse_name, + save_as_delta_table as save_as_delta_table, + generate_embedded_filter as generate_embedded_filter, + get_direct_lake_sql_endpoint as get_direct_lake_sql_endpoint, + resolve_lakehouse_id as resolve_lakehouse_id, + resolve_dataset_name as resolve_dataset_name, + resolve_dataset_id as resolve_dataset_id, + resolve_report_name as resolve_report_name, + resolve_report_id as resolve_report_id, + create_relationship_name as create_relationship_name, + format_dax_object_name as format_dax_object_name, + create_abfss_path as create_abfss_path, +) diff --git a/sempy_labs/_clear_cache.py b/sempy_labs/_clear_cache.py index 1b009444..426f339b 100644 --- a/sempy_labs/_clear_cache.py +++ b/sempy_labs/_clear_cache.py @@ -1,15 +1,11 @@ import sempy import sempy.fabric as fabric -from .HelperFunctions import resolve_dataset_id +from sempy_labs._helper_functions import resolve_dataset_id from typing import List, Optional, Union +import sempy_labs._icons as icons -green_dot = '\U0001F7E2' -yellow_dot = '\U0001F7E1' -red_dot = '\U0001F534' -in_progress = '⌛' def clear_cache(dataset: str, workspace: Optional[str] = None): - """ Clears the cache of a semantic model. @@ -21,17 +17,13 @@ def clear_cache(dataset: str, workspace: Optional[str] = None): The Fabric workspace name. Defaults to None which resolves to the workspace of the attached lakehouse or if no lakehouse attached, resolves to the workspace of the notebook. - - Returns - ------- - """ if workspace == None: workspace_id = fabric.get_workspace_id() workspace = fabric.resolve_workspace_name(workspace_id) - datasetID = resolve_dataset_id(dataset = dataset, workspace = workspace) + datasetID = resolve_dataset_id(dataset=dataset, workspace=workspace) xmla = f""" @@ -40,8 +32,8 @@ def clear_cache(dataset: str, workspace: Optional[str] = None): """ - fabric.execute_xmla(dataset = dataset,xmla_command=xmla, workspace = workspace) + fabric.execute_xmla(dataset=dataset, xmla_command=xmla, workspace=workspace) + + outputtext = f"{icons.green_dot} Cache cleared for the '{dataset}' semantic model within the '{workspace}' workspace." - outputtext = f"{green_dot} Cache cleared for the '{dataset}' semantic model within the '{workspace}' workspace." 
- - return outputtext \ No newline at end of file + return outputtext diff --git a/sempy_labs/CreateBlankSemanticModel.py b/sempy_labs/_create_blank_semantic_model.py similarity index 57% rename from sempy_labs/CreateBlankSemanticModel.py rename to sempy_labs/_create_blank_semantic_model.py index 80ada03f..af2b0bb1 100644 --- a/sempy_labs/CreateBlankSemanticModel.py +++ b/sempy_labs/_create_blank_semantic_model.py @@ -1,15 +1,15 @@ import sempy import sempy.fabric as fabric from typing import List, Optional, Union +import sempy_labs._icons as icons -green_dot = '\U0001F7E2' -yellow_dot = '\U0001F7E1' -red_dot = '\U0001F534' -in_progress = '⌛' -def create_blank_semantic_model(dataset: str, compatibility_level: Optional[int] = 1605, workspace: Optional[str] = None): - - """ +def create_blank_semantic_model( + dataset: str, + compatibility_level: Optional[int] = 1605, + workspace: Optional[str] = None, +): + """ Creates a new blank semantic model (no tables/columns etc.). Parameters @@ -23,21 +23,17 @@ def create_blank_semantic_model(dataset: str, compatibility_level: Optional[int] The Fabric workspace name. Defaults to None which resolves to the workspace of the attached lakehouse or if no lakehouse attached, resolves to the workspace of the notebook. - - Returns - ------- - """ - if workspace == None: - workspace_id = fabric.get_workspace_id() - workspace = fabric.resolve_workspace_name(workspace_id) + if workspace == None: + workspace_id = fabric.get_workspace_id() + workspace = fabric.resolve_workspace_name(workspace_id) - if compatibility_level < 1500: - print(f"{red_dot} Compatiblity level must be at least 1500.") - return + if compatibility_level < 1500: + print(f"{icons.red_dot} Compatiblity level must be at least 1500.") + return - tmsl = f''' + tmsl = f""" {{ "createOrReplace": {{ "object": {{ @@ -53,8 +49,10 @@ def create_blank_semantic_model(dataset: str, compatibility_level: Optional[int] }} }} }} - ''' + """ - fabric.execute_tmsl(script = tmsl, workspace = workspace) + fabric.execute_tmsl(script=tmsl, workspace=workspace) - return print(f"{green_dot} The '{dataset}' semantic model was created within the '{workspace}' workspace.") \ No newline at end of file + return print( + f"{icons.green_dot} The '{dataset}' semantic model was created within the '{workspace}' workspace." + ) diff --git a/sempy_labs/_create_pqt_file.py b/sempy_labs/_create_pqt_file.py new file mode 100644 index 00000000..63034882 --- /dev/null +++ b/sempy_labs/_create_pqt_file.py @@ -0,0 +1,238 @@ +import sempy.fabric as fabric +import json, os, shutil +import xml.etree.ElementTree as ET +from ._list_functions import list_tables +from sempy_labs.lakehouse import lakehouse_attached +from sempy._utils._log import log +from typing import Optional +import sempy_labs._icons as icons + + +@log +def create_pqt_file( + dataset: str, workspace: Optional[str] = None, file_name: Optional[str] = None +): + """ + Dynamically generates a [Power Query Template](https://learn.microsoft.com/power-query/power-query-template) file based on the semantic model. The .pqt file is saved within the Files section of your lakehouse. + + Parameters + ---------- + dataset : str + Name of the semantic model. + workspace : str, default=None + The Fabric workspace name. + Defaults to None which resolves to the workspace of the attached lakehouse + or if no lakehouse attached, resolves to the workspace of the notebook. + file_name : str, default=None + The name of the Power Query Template file to be generated. 
+ Defaults to None which resolves to 'PowerQueryTemplate'. + """ + + if file_name is None: + file_name = "PowerQueryTemplate" + + lakeAttach = lakehouse_attached() + + if lakeAttach == False: + print( + f"{icons.red_dot} In order to run the 'create_pqt_file' function, a lakehouse must be attached to the notebook. Please attach a lakehouse to this notebook." + ) + return + + if workspace == None: + workspace_id = fabric.get_workspace_id() + workspace = fabric.resolve_workspace_name(workspace_id) + + folderPath = "/lakehouse/default/Files" + subFolderPath = os.path.join(folderPath, "pqtnewfolder") + os.makedirs(subFolderPath, exist_ok=True) + + dfP = fabric.list_partitions(dataset=dataset, workspace=workspace) + dfT = list_tables(dataset, workspace) + dfE = fabric.list_expressions(dataset=dataset, workspace=workspace) + + # Check if M-partitions are used + if any(dfP["Source Type"] == "M"): + + class QueryMetadata: + def __init__( + self, + QueryName, + QueryGroupId=None, + LastKnownIsParameter=None, + LastKnownResultTypeName=None, + LoadEnabled=True, + IsHidden=False, + ): + self.QueryName = QueryName + self.QueryGroupId = QueryGroupId + self.LastKnownIsParameter = LastKnownIsParameter + self.LastKnownResultTypeName = LastKnownResultTypeName + self.LoadEnabled = LoadEnabled + self.IsHidden = IsHidden + + class RootObject: + def __init__( + self, DocumentLocale, EngineVersion, QueriesMetadata, QueryGroups=None + ): + if QueryGroups is None: + QueryGroups = [] + self.DocumentLocale = DocumentLocale + self.EngineVersion = EngineVersion + self.QueriesMetadata = QueriesMetadata + self.QueryGroups = QueryGroups + + # STEP 1: Create MashupDocument.pq + mdfileName = "MashupDocument.pq" + mdFilePath = os.path.join(subFolderPath, mdfileName) + sb = "section Section1;" + for table_name in dfP["Table Name"].unique(): + tName = '#"' + table_name + '"' + sourceExpression = dfT.loc[ + (dfT["Name"] == table_name), "Source Expression" + ].iloc[0] + refreshPolicy = dfT.loc[(dfT["Name"] == table_name), "Refresh Policy"].iloc[ + 0 + ] + sourceType = dfP.loc[(dfP["Table Name"] == table_name), "Source Type"].iloc[ + 0 + ] + + if sourceType == "M" or refreshPolicy: + sb = sb + "\n" + "shared " + tName + " = " + + partitions_in_table = dfP.loc[ + dfP["Table Name"] == table_name, "Partition Name" + ].unique() + + i = 1 + for partition_name in partitions_in_table: + pSourceType = dfP.loc[ + (dfP["Table Name"] == table_name) + & (dfP["Partition Name"] == partition_name), + "Source Type", + ].iloc[0] + pQuery = dfP.loc[ + (dfP["Table Name"] == table_name) + & (dfP["Partition Name"] == partition_name), + "Query", + ].iloc[0] + + if pQuery is not None: + pQueryNoSpaces = ( + pQuery.replace(" ", "") + .replace("\n", "") + .replace("\t", "") + .replace("\r", "") + ) + if pQueryNoSpaces.startswith('letSource=""'): + pQuery = 'let\n\tSource = ""\nin\n\tSource' + + if pSourceType == "M" and i == 1: + sb = sb + pQuery + ";" + elif refreshPolicy and i == 1: + sb = sb + sourceExpression + ";" + i += 1 + + for index, row in dfE.iterrows(): + expr = row["Expression"] + eName = row["Name"] + eName = '#"' + eName + '"' + sb = sb + "\n" + "shared " + eName + " = " + expr + ";" + + with open(mdFilePath, "w") as file: + file.write(sb) + + # STEP 2: Create the MashupMetadata.json file + mmfileName = "MashupMetadata.json" + mmFilePath = os.path.join(subFolderPath, mmfileName) + queryMetadata = [] + + for tName in dfP["Table Name"].unique(): + sourceType = dfP.loc[(dfP["Table Name"] == tName), "Source Type"].iloc[0] + refreshPolicy = 
dfT.loc[(dfT["Name"] == tName), "Refresh Policy"].iloc[0] + if sourceType == "M" or refreshPolicy: + queryMetadata.append( + QueryMetadata(tName, None, None, None, True, False) + ) + + for i, r in dfE.iterrows(): + eName = r["Name"] + eKind = r["Kind"] + if eKind == "M": + queryMetadata.append( + QueryMetadata(eName, None, None, None, True, False) + ) + else: + queryMetadata.append( + QueryMetadata(eName, None, None, None, False, False) + ) + + rootObject = RootObject("en-US", "2.126.453.0", queryMetadata) + + def obj_to_dict(obj): + if isinstance(obj, list): + return [obj_to_dict(e) for e in obj] + elif hasattr(obj, "__dict__"): + return {k: obj_to_dict(v) for k, v in obj.__dict__.items()} + else: + return obj + + jsonContent = json.dumps(obj_to_dict(rootObject), indent=4) + + with open(mmFilePath, "w") as json_file: + json_file.write(jsonContent) + + # STEP 3: Create Metadata.json file + mFileName = "Metadata.json" + mFilePath = os.path.join(subFolderPath, mFileName) + metaData = {"Name": "fileName", "Description": "", "Version": "1.0.0.0"} + jsonContent = json.dumps(metaData, indent=4) + + with open(mFilePath, "w") as json_file: + json_file.write(jsonContent) + + # STEP 4: Create [Content_Types].xml file: + ns = "http://schemas.openxmlformats.org/package/2006/content-types" + ET.register_namespace("", ns) + types = ET.Element("{%s}Types" % ns) + default1 = ET.SubElement( + types, + "{%s}Default" % ns, + {"Extension": "json", "ContentType": "application/json"}, + ) + default2 = ET.SubElement( + types, + "{%s}Default" % ns, + {"Extension": "pq", "ContentType": "application/x-ms-m"}, + ) + xmlDocument = ET.ElementTree(types) + xmlFileName = "[Content_Types].xml" + xmlFilePath = os.path.join(subFolderPath, xmlFileName) + xmlDocument.write( + xmlFilePath, xml_declaration=True, encoding="utf-8", method="xml" + ) + + # STEP 5: Zip up the 4 files + zipFileName = file_name + ".zip" + zipFilePath = os.path.join(folderPath, zipFileName) + shutil.make_archive(zipFilePath[:-4], "zip", subFolderPath) + + # STEP 6: Convert the zip file back into a .pqt file + newExt = ".pqt" + directory = os.path.dirname(zipFilePath) + fileNameWithoutExtension = os.path.splitext(os.path.basename(zipFilePath))[0] + newFilePath = os.path.join(directory, fileNameWithoutExtension + newExt) + shutil.move(zipFilePath, newFilePath) + + # STEP 7: Delete subFolder directory which is no longer needed + shutil.rmtree(subFolderPath, ignore_errors=True) + + print( + f"{icons.green_dot} '{file_name}.pqt' has been created based on the '{dataset}' semantic model in the '{workspace}' workspace within the Files section of your lakehouse." + ) + + else: + print( + f"{icons.yellow_dot} The '{dataset}' semantic model in the '{workspace}' workspace does not use Power Query so a Power Query Template file cannot be generated." + ) diff --git a/sempy_labs/Fallback.py b/sempy_labs/_fallback.py similarity index 55% rename from sempy_labs/Fallback.py rename to sempy_labs/_fallback.py index cad5ee80..38886b6a 100644 --- a/sempy_labs/Fallback.py +++ b/sempy_labs/_fallback.py @@ -3,8 +3,8 @@ import numpy as np from typing import List, Optional, Union -def check_fallback_reason(dataset: str, workspace: Optional[str] = None): +def check_fallback_reason(dataset: str, workspace: Optional[str] = None): """ Shows the reason a table in a Direct Lake semantic model would fallback to DirectQuery. 
@@ -27,31 +27,36 @@ def check_fallback_reason(dataset: str, workspace: Optional[str] = None): workspace_id = fabric.get_workspace_id() workspace = fabric.resolve_workspace_name(workspace_id) - dfP = fabric.list_partitions(dataset = dataset, workspace = workspace) - dfP_filt = dfP[dfP['Mode'] == 'DirectLake'] - + dfP = fabric.list_partitions(dataset=dataset, workspace=workspace) + dfP_filt = dfP[dfP["Mode"] == "DirectLake"] + if len(dfP_filt) == 0: - print(f"The '{dataset}' semantic model is not in Direct Lake. This function is only applicable to Direct Lake semantic models.") + print( + f"The '{dataset}' semantic model is not in Direct Lake. This function is only applicable to Direct Lake semantic models." + ) else: - df = fabric.evaluate_dax(dataset = dataset,workspace = workspace, - dax_string = - """ + df = fabric.evaluate_dax( + dataset=dataset, + workspace=workspace, + dax_string=""" SELECT [TableName] AS [Table Name],[FallbackReason] AS [FallbackReasonID] FROM $SYSTEM.TMSCHEMA_DELTA_TABLE_METADATA_STORAGES - """ - ) + """, + ) value_mapping = { - 0: 'No reason for fallback', - 1: 'This table is not framed', - 2: 'This object is a view in the lakehouse', - 3: 'The table does not exist in the lakehouse', - 4: 'Transient error', - 5: 'Using OLS will result in fallback to DQ', - 6: 'Using RLS will result in fallback to DQ' + 0: "No reason for fallback", + 1: "This table is not framed", + 2: "This object is a view in the lakehouse", + 3: "The table does not exist in the lakehouse", + 4: "Transient error", + 5: "Using OLS will result in fallback to DQ", + 6: "Using RLS will result in fallback to DQ", } # Create a new column based on the mapping - df['Fallback Reason Detail'] = np.vectorize(value_mapping.get)(df['FallbackReasonID']) - - return df \ No newline at end of file + df["Fallback Reason Detail"] = np.vectorize(value_mapping.get)( + df["FallbackReasonID"] + ) + + return df diff --git a/sempy_labs/GenerateSemanticModel.py b/sempy_labs/_generate_semantic_model.py similarity index 56% rename from sempy_labs/GenerateSemanticModel.py rename to sempy_labs/_generate_semantic_model.py index 7ed53dae..fd11a822 100644 --- a/sempy_labs/GenerateSemanticModel.py +++ b/sempy_labs/_generate_semantic_model.py @@ -2,10 +2,12 @@ import sempy.fabric as fabric import json, base64, time from .GetSemanticModelBim import get_semantic_model_bim -from typing import List, Optional, Union +from typing import Optional -def create_semantic_model_from_bim(dataset: str, bim_file: str, workspace: Optional[str] = None): +def create_semantic_model_from_bim( + dataset: str, bim_file: str, workspace: Optional[str] = None +): """ Creates a new semantic model based on a Model.bim file. @@ -19,10 +21,6 @@ def create_semantic_model_from_bim(dataset: str, bim_file: str, workspace: Optio The Fabric workspace name. Defaults to None which resolves to the workspace of the attached lakehouse or if no lakehouse attached, resolves to the workspace of the notebook. 
- - Returns - ------- - """ if workspace == None: @@ -31,70 +29,77 @@ def create_semantic_model_from_bim(dataset: str, bim_file: str, workspace: Optio else: workspace_id = fabric.resolve_workspace_id(workspace) - objectType = 'SemanticModel' + objectType = "SemanticModel" - dfI = fabric.list_items(workspace = workspace, type = objectType) - dfI_filt = dfI[(dfI['Display Name'] == dataset)] + dfI = fabric.list_items(workspace=workspace, type=objectType) + dfI_filt = dfI[(dfI["Display Name"] == dataset)] if len(dfI_filt) > 0: - print(f"WARNING: '{dataset}' already exists as a semantic model in the '{workspace}' workspace.") + print( + f"WARNING: '{dataset}' already exists as a semantic model in the '{workspace}' workspace." + ) return client = fabric.FabricRestClient() - defPBIDataset = { - "version": "1.0", - "settings": {} - } + defPBIDataset = {"version": "1.0", "settings": {}} def conv_b64(file): - + loadJson = json.dumps(file) - f = base64.b64encode(loadJson.encode('utf-8')).decode('utf-8') - + f = base64.b64encode(loadJson.encode("utf-8")).decode("utf-8") + return f payloadPBIDefinition = conv_b64(defPBIDataset) payloadBim = conv_b64(bim_file) request_body = { - 'displayName': dataset, - 'type': objectType, - 'definition': { - "parts": [ - { - "path": "model.bim", - "payload": payloadBim, - "payloadType": "InlineBase64" - }, - { - "path": "definition.pbidataset", - "payload": payloadPBIDefinition, - "payloadType": "InlineBase64" - } - ] - - } - } - - response = client.post(f"/v1/workspaces/{workspace_id}/items",json=request_body) + "displayName": dataset, + "type": objectType, + "definition": { + "parts": [ + { + "path": "model.bim", + "payload": payloadBim, + "payloadType": "InlineBase64", + }, + { + "path": "definition.pbidataset", + "payload": payloadPBIDefinition, + "payloadType": "InlineBase64", + }, + ] + }, + } + + response = client.post(f"/v1/workspaces/{workspace_id}/items", json=request_body) if response.status_code == 201: - print(f"The '{dataset}' semantic model has been created within the '{workspace}' workspace.") + print( + f"The '{dataset}' semantic model has been created within the '{workspace}' workspace." + ) print(response.json()) elif response.status_code == 202: - operationId = response.headers['x-ms-operation-id'] + operationId = response.headers["x-ms-operation-id"] response = client.get(f"/v1/operations/{operationId}") - response_body = json.loads(response.content) - while response_body['status'] != 'Succeeded': + response_body = json.loads(response.content) + while response_body["status"] != "Succeeded": time.sleep(3) response = client.get(f"/v1/operations/{operationId}") response_body = json.loads(response.content) response = client.get(f"/v1/operations/{operationId}/result") - print(f"The '{dataset}' semantic model has been created within the '{workspace}' workspace.") + print( + f"The '{dataset}' semantic model has been created within the '{workspace}' workspace." + ) print(response.json()) -def deploy_semantic_model(dataset: str, new_dataset: Optional[str] = None, workspace: Optional[str] = None, new_dataset_workspace: Optional[str] = None): +def deploy_semantic_model( + dataset: str, + new_dataset: Optional[str] = None, + workspace: Optional[str] = None, + new_dataset_workspace: Optional[str] = None, +): """ Deploys a semantic model based on an existing semantic model. 
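As a rough illustration of the signature above (all names are placeholders), deploying an existing model to another workspace might look like this:

from sempy_labs import deploy_semantic_model

# Recreates 'Sales Model' from the 'Dev' workspace as 'Sales Model - Test' in the 'Test' workspace
deploy_semantic_model(
    dataset="Sales Model",
    new_dataset="Sales Model - Test",
    workspace="Dev",
    new_dataset_workspace="Test",
)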
@@ -115,8 +120,8 @@ def deploy_semantic_model(dataset: str, new_dataset: Optional[str] = None, works Returns ------- - - """ + + """ if workspace == None: workspace_id = fabric.get_workspace_id() @@ -129,9 +134,13 @@ def deploy_semantic_model(dataset: str, new_dataset: Optional[str] = None, works new_dataset = dataset if new_dataset == dataset and new_dataset_workspace == workspace: - print(f"The 'dataset' and 'new_dataset' parameters have the same value. And, the 'workspace' and 'new_dataset_workspace' parameters have the same value. At least one of these must be different. Please update the parameters.") + print( + f"The 'dataset' and 'new_dataset' parameters have the same value. And, the 'workspace' and 'new_dataset_workspace' parameters have the same value. At least one of these must be different. Please update the parameters." + ) return - bim = get_semantic_model_bim(dataset = dataset, workspace = workspace) + bim = get_semantic_model_bim(dataset=dataset, workspace=workspace) - create_semantic_model_from_bim(dataset = new_dataset, bim_file = bim, workspace = new_dataset_workspace) \ No newline at end of file + create_semantic_model_from_bim( + dataset=new_dataset, bim_file=bim, workspace=new_dataset_workspace + ) diff --git a/sempy_labs/HelperFunctions.py b/sempy_labs/_helper_functions.py similarity index 64% rename from sempy_labs/HelperFunctions.py rename to sempy_labs/_helper_functions.py index d6a8ebf1..46a36f18 100644 --- a/sempy_labs/HelperFunctions.py +++ b/sempy_labs/_helper_functions.py @@ -1,17 +1,14 @@ -import sempy import sempy.fabric as fabric import re from pyspark.sql import SparkSession -from typing import List, Optional, Union +from typing import Optional from uuid import UUID +import sempy_labs._icons as icons -green_dot = '\U0001F7E2' -yellow_dot = '\U0001F7E1' -red_dot = '\U0001F534' -in_progress = '⌛' - -def create_abfss_path(lakehouse_id: UUID, lakehouse_workspace_id: UUID, delta_table_name: str): +def create_abfss_path( + lakehouse_id: UUID, lakehouse_workspace_id: UUID, delta_table_name: str +): """ Creates an abfss path for a delta table in a Fabric lakehouse. @@ -32,8 +29,8 @@ def create_abfss_path(lakehouse_id: UUID, lakehouse_workspace_id: UUID, delta_ta return f"abfss://{lakehouse_workspace_id}@onelake.dfs.fabric.microsoft.com/{lakehouse_id}/Tables/{delta_table_name}" -def format_dax_object_name(a: str,b: str): +def format_dax_object_name(a: str, b: str): """ Formats a table/column combination to the 'Table Name'[Column Name] format. @@ -49,11 +46,13 @@ def format_dax_object_name(a: str,b: str): str The fully qualified object name. """ - + return "'" + a + "'[" + b + "]" -def create_relationship_name(from_table: str, from_column: str, to_table: str, to_column: str): +def create_relationship_name( + from_table: str, from_column: str, to_table: str, to_column: str +): """ Formats a relationship's table/columns into a fully qualified name. @@ -71,13 +70,17 @@ def create_relationship_name(from_table: str, from_column: str, to_table: str, t Returns ------- str - The fully qualified relationship name. + The fully qualified relationship name. """ - return format_dax_object_name(from_table, from_column) + ' -> ' + format_dax_object_name(to_table, to_column) + return ( + format_dax_object_name(from_table, from_column) + + " -> " + + format_dax_object_name(to_table, to_column) + ) -def resolve_report_id(report: str, workspace: Optional[str] = None): +def resolve_report_id(report: str, workspace: Optional[str] = None): """ Obtains the ID of the Power BI report. 
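For the string helpers above, the expected output follows directly from the formatting logic shown; the table and column names are illustrative only.

from sempy_labs import format_dax_object_name, create_relationship_name

format_dax_object_name("Sales", "Amount")
# -> "'Sales'[Amount]"
create_relationship_name("Sales", "CustomerKey", "Customer", "CustomerKey")
# -> "'Sales'[CustomerKey] -> 'Customer'[CustomerKey]"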
@@ -95,22 +98,22 @@ def resolve_report_id(report: str, workspace: Optional[str] = None): UUID The ID of the Power BI report. """ - + if workspace == None: workspace_id = fabric.get_workspace_id() workspace = fabric.resolve_workspace_name(workspace_id) - obj = fabric.resolve_item_id(item_name = report, type = 'Report', workspace = workspace) + obj = fabric.resolve_item_id(item_name=report, type="Report", workspace=workspace) - #objectType = 'Report' - #dfI = fabric.list_items(workspace = workspace, type = objectType) - #dfI_filt = dfI[(dfI['Display Name'] == report)] - #obj = dfI_filt['Id'].iloc[0] + # objectType = 'Report' + # dfI = fabric.list_items(workspace = workspace, type = objectType) + # dfI_filt = dfI[(dfI['Display Name'] == report)] + # obj = dfI_filt['Id'].iloc[0] return obj -def resolve_report_name(report_id: UUID, workspace: Optional[str] = None): +def resolve_report_name(report_id: UUID, workspace: Optional[str] = None): """ Obtains the name of the Power BI report. @@ -128,23 +131,24 @@ def resolve_report_name(report_id: UUID, workspace: Optional[str] = None): str The name of the Power BI report. """ - - + if workspace == None: workspace_id = fabric.get_workspace_id() workspace = fabric.resolve_workspace_name(workspace_id) - obj = fabric.resolve_item_name(item_id = report_id, type = 'Report', workspace = workspace) + obj = fabric.resolve_item_name( + item_id=report_id, type="Report", workspace=workspace + ) - #objectType = 'Report' - #dfI = fabric.list_items(workspace = workspace, type = objectType) - #dfI_filt = dfI[(dfI['Id'] == report_id)] - #obj = dfI_filt['Display Name'].iloc[0] + # objectType = 'Report' + # dfI = fabric.list_items(workspace = workspace, type = objectType) + # dfI_filt = dfI[(dfI['Id'] == report_id)] + # obj = dfI_filt['Display Name'].iloc[0] return obj -def resolve_dataset_id(dataset: str, workspace: Optional[str] = None): +def resolve_dataset_id(dataset: str, workspace: Optional[str] = None): """ Obtains the ID of the semantic model. @@ -162,22 +166,24 @@ def resolve_dataset_id(dataset: str, workspace: Optional[str] = None): UUID The ID of the semantic model. """ - + if workspace == None: workspace_id = fabric.get_workspace_id() workspace = fabric.resolve_workspace_name(workspace_id) - obj = fabric.resolve_item_id(item_name = dataset, type = 'SemanticModel', workspace = workspace) + obj = fabric.resolve_item_id( + item_name=dataset, type="SemanticModel", workspace=workspace + ) - #objectType = 'SemanticModel' - #dfI = fabric.list_items(workspace = workspace, type = objectType) - #dfI_filt = dfI[(dfI['Display Name'] == dataset)] - #obj = dfI_filt['Id'].iloc[0] + # objectType = 'SemanticModel' + # dfI = fabric.list_items(workspace = workspace, type = objectType) + # dfI_filt = dfI[(dfI['Display Name'] == dataset)] + # obj = dfI_filt['Id'].iloc[0] return obj -def resolve_dataset_name(dataset_id: UUID, workspace: Optional[str] = None): +def resolve_dataset_name(dataset_id: UUID, workspace: Optional[str] = None): """ Obtains the name of the semantic model. @@ -195,22 +201,24 @@ def resolve_dataset_name(dataset_id: UUID, workspace: Optional[str] = None): str The name of the semantic model. 
""" - + if workspace == None: workspace_id = fabric.get_workspace_id() workspace = fabric.resolve_workspace_name(workspace_id) - obj = fabric.resolve_item_name(item_id = dataset_id, type = 'SemanticModel', workspace = workspace) + obj = fabric.resolve_item_name( + item_id=dataset_id, type="SemanticModel", workspace=workspace + ) - #objectType = 'SemanticModel' - #dfI = fabric.list_items(workspace = workspace, type = objectType) - #dfI_filt = dfI[(dfI['Id'] == dataset_id)] - #obj = dfI_filt['Display Name'].iloc[0] + # objectType = 'SemanticModel' + # dfI = fabric.list_items(workspace = workspace, type = objectType) + # dfI_filt = dfI[(dfI['Id'] == dataset_id)] + # obj = dfI_filt['Display Name'].iloc[0] return obj -def resolve_lakehouse_name(lakehouse_id: UUID, workspace: Optional[str] = None): +def resolve_lakehouse_name(lakehouse_id: UUID, workspace: Optional[str] = None): """ Obtains the name of the Fabric lakehouse. @@ -228,27 +236,29 @@ def resolve_lakehouse_name(lakehouse_id: UUID, workspace: Optional[str] = None): str The name of the Fabric lakehouse. """ - + if workspace == None: workspace_id = fabric.get_workspace_id() workspace = fabric.resolve_workspace_name(workspace_id) - obj = fabric.resolve_item_name(item_id = lakehouse_id, type = 'Lakehouse', workspace = workspace) + obj = fabric.resolve_item_name( + item_id=lakehouse_id, type="Lakehouse", workspace=workspace + ) - #objectType = 'Lakehouse' - #dfI = fabric.list_items(workspace = workspace, type = objectType) - #dfI_filt = dfI[(dfI['Id'] == lakehouse_id)] + # objectType = 'Lakehouse' + # dfI = fabric.list_items(workspace = workspace, type = objectType) + # dfI_filt = dfI[(dfI['Id'] == lakehouse_id)] - #if len(dfI_filt) == 0: + # if len(dfI_filt) == 0: # print(f"The '{lakehouse_id}' Lakehouse Id does not exist within the '{workspace}' workspace.") # return - - #obj = dfI_filt['Display Name'].iloc[0] + + # obj = dfI_filt['Display Name'].iloc[0] return obj -def resolve_lakehouse_id(lakehouse: str, workspace: Optional[str] = None): +def resolve_lakehouse_id(lakehouse: str, workspace: Optional[str] = None): """ Obtains the ID of the Fabric lakehouse. @@ -266,27 +276,29 @@ def resolve_lakehouse_id(lakehouse: str, workspace: Optional[str] = None): UUID The ID of the Fabric lakehouse. """ - + if workspace == None: workspace_id = fabric.get_workspace_id() workspace = fabric.resolve_workspace_name(workspace_id) - obj = fabric.resolve_item_id(item_name = lakehouse, type = 'Lakehouse', workspace = workspace) + obj = fabric.resolve_item_id( + item_name=lakehouse, type="Lakehouse", workspace=workspace + ) - #objectType = 'Lakehouse' - #dfI = fabric.list_items(workspace = workspace, type = objectType) - #dfI_filt = dfI[(dfI['Display Name'] == lakehouse)] + # objectType = 'Lakehouse' + # dfI = fabric.list_items(workspace = workspace, type = objectType) + # dfI_filt = dfI[(dfI['Display Name'] == lakehouse)] - #if len(dfI_filt) == 0: + # if len(dfI_filt) == 0: # print(f"The '{lakehouse}' lakehouse does not exist within the '{workspace}' workspace.") # return - - #obj = dfI_filt['Id'].iloc[0] + + # obj = dfI_filt['Id'].iloc[0] return obj -def get_direct_lake_sql_endpoint(dataset: str, workspace: Optional[str] = None): +def get_direct_lake_sql_endpoint(dataset: str, workspace: Optional[str] = None): """ Obtains the SQL Endpoint ID of the semantic model. 
@@ -309,24 +321,26 @@ def get_direct_lake_sql_endpoint(dataset: str, workspace: Optional[str] = None): workspace_id = fabric.get_workspace_id() workspace = fabric.resolve_workspace_name(workspace_id) - dfP = fabric.list_partitions(dataset = dataset, workspace = workspace) - dfP_filt = dfP[dfP['Mode'] == 'DirectLake'] + dfP = fabric.list_partitions(dataset=dataset, workspace=workspace) + dfP_filt = dfP[dfP["Mode"] == "DirectLake"] if len(dfP_filt) == 0: - print(f"The '{dataset}' semantic model in the '{workspace}' workspace is not in Direct Lake mode.") + print( + f"The '{dataset}' semantic model in the '{workspace}' workspace is not in Direct Lake mode." + ) return - - dfE = fabric.list_expressions(dataset = dataset, workspace = workspace) - dfE_filt = dfE[dfE['Name']== 'DatabaseQuery'] - expr = dfE_filt['Expression'].iloc[0] + + dfE = fabric.list_expressions(dataset=dataset, workspace=workspace) + dfE_filt = dfE[dfE["Name"] == "DatabaseQuery"] + expr = dfE_filt["Expression"].iloc[0] matches = re.findall(r'"([^"]*)"', expr) sqlEndpointId = matches[1] - + return sqlEndpointId -def generate_embedded_filter(filter: str): +def generate_embedded_filter(filter: str): """ Converts the filter expression to a filter expression which can be used by a Power BI embedded URL. @@ -344,27 +358,60 @@ def generate_embedded_filter(filter: str): pattern = r"'[^']+'\[[^\[]+\]" matches = re.findall(pattern, filter) for match in matches: - matchReplace = match.replace("'",'').replace('[','/').replace(']','')\ - .replace(' ','_x0020_').replace('@','_00x40_').replace('+','_0x2B_').replace('{','_007B_').replace('}','_007D_') + matchReplace = ( + match.replace("'", "") + .replace("[", "/") + .replace("]", "") + .replace(" ", "_x0020_") + .replace("@", "_00x40_") + .replace("+", "_0x2B_") + .replace("{", "_007B_") + .replace("}", "_007D_") + ) filter = filter.replace(match, matchReplace) - + pattern = r"\[[^\[]+\]" matches = re.findall(pattern, filter) for match in matches: - matchReplace = match.replace("'",'').replace('[','/').replace(']','')\ - .replace(' ','_x0020_').replace('@','_00x40_').replace('+','_0x2B_').replace('{','_007B_').replace('}','_007D_') + matchReplace = ( + match.replace("'", "") + .replace("[", "/") + .replace("]", "") + .replace(" ", "_x0020_") + .replace("@", "_00x40_") + .replace("+", "_0x2B_") + .replace("{", "_007B_") + .replace("}", "_007D_") + ) filter = filter.replace(match, matchReplace) - revised_filter = filter.replace('<=','le').replace('>=','ge').replace('<>','ne').replace('!=','ne')\ - .replace('==','eq').replace('=','eq').replace('<','lt').replace('>','gt')\ - .replace(' && ',' and ').replace(' & ',' and ')\ - .replace(' || ',' or ').replace(' | ',' or ')\ - .replace('{','(').replace('}',')') - + revised_filter = ( + filter.replace("<=", "le") + .replace(">=", "ge") + .replace("<>", "ne") + .replace("!=", "ne") + .replace("==", "eq") + .replace("=", "eq") + .replace("<", "lt") + .replace(">", "gt") + .replace(" && ", " and ") + .replace(" & ", " and ") + .replace(" || ", " or ") + .replace(" | ", " or ") + .replace("{", "(") + .replace("}", ")") + ) + return revised_filter -def save_as_delta_table(dataframe, delta_table_name: str, write_mode: str, lakehouse: Optional[str] = None, workspace: Optional[str] = None): +def save_as_delta_table( + dataframe, + delta_table_name: str, + write_mode: str, + lakehouse: Optional[str] = None, + workspace: Optional[str] = None, +): """ Saves a pandas dataframe as a delta table in a Fabric lakehouse. 
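A minimal sketch of save_as_delta_table, assuming a lakehouse is attached to the notebook; per the validation shown below, write_mode must be 'append' or 'overwrite' and the delta table name cannot contain spaces (the data here is made up).

import pandas as pd
from sempy_labs import save_as_delta_table

df = pd.DataFrame({"Product": ["A", "B"], "Sales Amount": [100, 250]})
# Spaces in dataframe column names are rewritten to underscores by the function itself
save_as_delta_table(
    dataframe=df,
    delta_table_name="product_sales",
    write_mode="overwrite",
)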
@@ -398,26 +445,38 @@ def save_as_delta_table(dataframe, delta_table_name: str, write_mode: str, lakeh if lakehouse is None: lakehouse_id = fabric.get_lakehouse_id() - lakehouse = resolve_lakehouse_name(lakehouse_id=lakehouse_id, workspace=workspace) + lakehouse = resolve_lakehouse_name( + lakehouse_id=lakehouse_id, workspace=workspace + ) else: lakehouse_id = resolve_lakehouse_id(lakehouse, workspace) - writeModes = ['append', 'overwrite'] + writeModes = ["append", "overwrite"] write_mode = write_mode.lower() if write_mode not in writeModes: - print(f"{red_dot} Invalid 'write_type' parameter. Choose from one of the following values: {writeModes}.") + print( + f"{icons.red_dot} Invalid 'write_type' parameter. Choose from one of the following values: {writeModes}." + ) return - if ' ' in delta_table_name: - print(f"{red_dot} Invalid 'delta_table_name'. Delta tables in the lakehouse cannot have spaces in their names.") + if " " in delta_table_name: + print( + f"{icons.red_dot} Invalid 'delta_table_name'. Delta tables in the lakehouse cannot have spaces in their names." + ) return - - dataframe.columns = dataframe.columns.str.replace(' ', '_') + + dataframe.columns = dataframe.columns.str.replace(" ", "_") spark = SparkSession.builder.getOrCreate() spark_df = spark.createDataFrame(dataframe) - filePath = create_abfss_path(lakehouse_id = lakehouse_id, lakehouse_workspace_id = workspace_id, delta_table_name = delta_table_name) - spark_df.write.mode(write_mode).format('delta').save(filePath) - print(f"{green_dot} The dataframe has been saved as the '{delta_table_name}' table in the '{lakehouse}' lakehouse within the '{workspace}' workspace.") \ No newline at end of file + filePath = create_abfss_path( + lakehouse_id=lakehouse_id, + lakehouse_workspace_id=workspace_id, + delta_table_name=delta_table_name, + ) + spark_df.write.mode(write_mode).format("delta").save(filePath) + print( + f"{icons.green_dot} The dataframe has been saved as the '{delta_table_name}' table in the '{lakehouse}' lakehouse within the '{workspace}' workspace." + ) diff --git a/sempy_labs/_icons.py b/sempy_labs/_icons.py new file mode 100644 index 00000000..2547eb5f --- /dev/null +++ b/sempy_labs/_icons.py @@ -0,0 +1,4 @@ +green_dot = "\U0001F7E2" +yellow_dot = "\U0001F7E1" +red_dot = "\U0001F534" +in_progress = "⌛" diff --git a/sempy_labs/ListFunctions.py b/sempy_labs/_list_functions.py similarity index 55% rename from sempy_labs/ListFunctions.py rename to sempy_labs/_list_functions.py index 6e63a69a..f2fe7c2f 100644 --- a/sempy_labs/ListFunctions.py +++ b/sempy_labs/_list_functions.py @@ -1,13 +1,11 @@ -import sempy import sempy.fabric as fabric import pandas as pd import json, time from pyspark.sql import SparkSession -from .GetDirectLakeLakehouse import get_direct_lake_lakehouse from typing import List, Optional, Union -def get_object_level_security(dataset: str, workspace: Optional[str] = None): +def get_object_level_security(dataset: str, workspace: Optional[str] = None): """ Shows the object level security for the semantic model. @@ -19,7 +17,7 @@ def get_object_level_security(dataset: str, workspace: Optional[str] = None): The Fabric workspace name. Defaults to None which resolves to the workspace of the attached lakehouse or if no lakehouse attached, resolves to the workspace of the notebook. 
- + Returns ------- pandas.DataFrame @@ -33,7 +31,7 @@ def get_object_level_security(dataset: str, workspace: Optional[str] = None): tom_server = fabric.create_tom_server(readonly=True, workspace=workspace) m = tom_server.Databases.GetByName(dataset).Model - df = pd.DataFrame(columns=['Role Name', 'Object Type', 'Table Name', 'Object Name']) + df = pd.DataFrame(columns=["Role Name", "Object Type", "Table Name", "Object Name"]) for r in m.Roles: for tp in r.TablePermissions: @@ -41,18 +39,32 @@ def get_object_level_security(dataset: str, workspace: Optional[str] = None): columnCount = len(tp.ColumnPermissions) objectType = "Table" if columnCount == 0: - new_data = {'Role Name': r.Name, 'Object Type': objectType, 'Table Name': tp.Name, 'Object Name': tp.Name} - df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) + new_data = { + "Role Name": r.Name, + "Object Type": objectType, + "Table Name": tp.Name, + "Object Name": tp.Name, + } + df = pd.concat( + [df, pd.DataFrame(new_data, index=[0])], ignore_index=True + ) else: objectType = "Column" for cp in tp.ColumnPermissions: - new_data = {'Role Name': r.Name, 'Object Type': objectType, 'Table Name': tp.Name, 'Object Name': cp.Name} - df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) + new_data = { + "Role Name": r.Name, + "Object Type": objectType, + "Table Name": tp.Name, + "Object Name": cp.Name, + } + df = pd.concat( + [df, pd.DataFrame(new_data, index=[0])], ignore_index=True + ) return df -def list_tables(dataset: str, workspace: Optional[str] = None): +def list_tables(dataset: str, workspace: Optional[str] = None): """ Shows a semantic model's tables and their properties. @@ -64,7 +76,7 @@ def list_tables(dataset: str, workspace: Optional[str] = None): The Fabric workspace name. Defaults to None which resolves to the workspace of the attached lakehouse or if no lakehouse attached, resolves to the workspace of the notebook. - + Returns ------- pandas.DataFrame @@ -78,7 +90,17 @@ def list_tables(dataset: str, workspace: Optional[str] = None): tom_server = fabric.create_tom_server(readonly=True, workspace=workspace) m = tom_server.Databases.GetByName(dataset).Model - df = pd.DataFrame(columns=['Name', 'Type', 'Hidden', 'Data Category', 'Description', 'Refresh Policy', 'Source Expression']) + df = pd.DataFrame( + columns=[ + "Name", + "Type", + "Hidden", + "Data Category", + "Description", + "Refresh Policy", + "Source Expression", + ] + ) for t in m.Tables: tableType = "Table" @@ -94,13 +116,21 @@ def list_tables(dataset: str, workspace: Optional[str] = None): if rPolicy: sourceExpression = t.RefreshPolicy.SourceExpression - new_data = {'Name': t.Name, 'Type': tableType, 'Hidden': t.IsHidden, 'Data Category': t.DataCategory, 'Description': t.Description, 'Refresh Policy': rPolicy, 'Source Expression': sourceExpression} + new_data = { + "Name": t.Name, + "Type": tableType, + "Hidden": t.IsHidden, + "Data Category": t.DataCategory, + "Description": t.Description, + "Refresh Policy": rPolicy, + "Source Expression": sourceExpression, + } df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) return df -def list_annotations(dataset: str, workspace: Optional[str] = None): +def list_annotations(dataset: str, workspace: Optional[str] = None): """ Shows a semantic model's annotations and their properties. @@ -112,7 +142,7 @@ def list_annotations(dataset: str, workspace: Optional[str] = None): The Fabric workspace name. 
Defaults to None which resolves to the workspace of the attached lakehouse or if no lakehouse attached, resolves to the workspace of the notebook. - + Returns ------- pandas.DataFrame @@ -126,108 +156,201 @@ def list_annotations(dataset: str, workspace: Optional[str] = None): tom_server = fabric.create_tom_server(readonly=True, workspace=workspace) m = tom_server.Databases.GetByName(dataset).Model - df = pd.DataFrame(columns=['Object Name', 'Parent Object Name', 'Object Type', 'Annotation Name', 'Annotation Value']) + df = pd.DataFrame( + columns=[ + "Object Name", + "Parent Object Name", + "Object Type", + "Annotation Name", + "Annotation Value", + ] + ) mName = m.Name - for a in m.Annotations: - objectType = 'Model' + for a in m.Annotations: + objectType = "Model" aName = a.Name aValue = a.Value - new_data = {'Object Name': mName, 'Parent Object Name': "N/A", 'Object Type': objectType,'Annotation Name': aName, 'Annotation Value': aValue} + new_data = { + "Object Name": mName, + "Parent Object Name": "N/A", + "Object Type": objectType, + "Annotation Name": aName, + "Annotation Value": aValue, + } df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) for t in m.Tables: - objectType = 'Table' + objectType = "Table" tName = t.Name for ta in t.Annotations: taName = ta.Name taValue = ta.Value - new_data = {'Object Name': tName, 'Parent Object Name': mName, 'Object Type': objectType,'Annotation Name': taName, 'Annotation Value': taValue} + new_data = { + "Object Name": tName, + "Parent Object Name": mName, + "Object Type": objectType, + "Annotation Name": taName, + "Annotation Value": taValue, + } df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) for p in t.Partitions: pName = p.Name - objectType = 'Partition' + objectType = "Partition" for pa in p.Annotations: paName = paName paValue = paValue - new_data = {'Object Name': pName, 'Parent Object Name': tName, 'Object Type': objectType,'Annotation Name': paName, 'Annotation Value': paValue} - df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) + new_data = { + "Object Name": pName, + "Parent Object Name": tName, + "Object Type": objectType, + "Annotation Name": paName, + "Annotation Value": paValue, + } + df = pd.concat( + [df, pd.DataFrame(new_data, index=[0])], ignore_index=True + ) for c in t.Columns: - objectType = 'Column' - cName = c.Name + objectType = "Column" + cName = c.Name for ca in c.Annotations: caName = ca.Name caValue = ca.Value - new_data = {'Object Name': cName, 'Parent Object Name': tName, 'Object Type': objectType,'Annotation Name': caName, 'Annotation Value': caValue} - df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) + new_data = { + "Object Name": cName, + "Parent Object Name": tName, + "Object Type": objectType, + "Annotation Name": caName, + "Annotation Value": caValue, + } + df = pd.concat( + [df, pd.DataFrame(new_data, index=[0])], ignore_index=True + ) for ms in t.Measures: - objectType = 'Measure' + objectType = "Measure" measName = ms.Name for ma in ms.Annotations: maName = ma.Name maValue = ma.Value - new_data = {'Object Name': measName, 'Parent Object Name': tName, 'Object Type': objectType,'Annotation Name': maName, 'Annotation Value': maValue} - df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) + new_data = { + "Object Name": measName, + "Parent Object Name": tName, + "Object Type": objectType, + "Annotation Name": maName, + "Annotation Value": maValue, + } + df = pd.concat( + [df, 
pd.DataFrame(new_data, index=[0])], ignore_index=True + ) for h in t.Hierarchies: - objectType = 'Hierarchy' + objectType = "Hierarchy" hName = h.Name for ha in h.Annotations: haName = ha.Name haValue = ha.Value - new_data = {'Object Name': hName, 'Parent Object Name': tName, 'Object Type': objectType,'Annotation Name': haName, 'Annotation Value': haValue} - df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) + new_data = { + "Object Name": hName, + "Parent Object Name": tName, + "Object Type": objectType, + "Annotation Name": haName, + "Annotation Value": haValue, + } + df = pd.concat( + [df, pd.DataFrame(new_data, index=[0])], ignore_index=True + ) for d in m.DataSources: dName = d.Name - objectType = 'Data Source' + objectType = "Data Source" for da in d.Annotations: daName = da.Name daValue = da.Value - new_data = {'Object Name': dName, 'Parent Object Name': mName, 'Object Type': objectType,'Annotation Name': daName, 'Annotation Value': daValue} + new_data = { + "Object Name": dName, + "Parent Object Name": mName, + "Object Type": objectType, + "Annotation Name": daName, + "Annotation Value": daValue, + } df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) for r in m.Relationships: rName = r.Name - objectType = 'Relationship' + objectType = "Relationship" for ra in r.Annotations: raName = ra.Name raValue = ra.Value - new_data = {'Object Name': rName, 'Parent Object Name': mName, 'Object Type': objectType,'Annotation Name': raName, 'Annotation Value': raValue} + new_data = { + "Object Name": rName, + "Parent Object Name": mName, + "Object Type": objectType, + "Annotation Name": raName, + "Annotation Value": raValue, + } df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) for cul in m.Cultures: culName = cul.Name - objectType = 'Translation' + objectType = "Translation" for cula in cul.Annotations: culaName = cula.Name culaValue = cula.Value - new_data = {'Object Name': culName, 'Parent Object Name': mName, 'Object Type': objectType,'Annotation Name': culaName, 'Annotation Value': culaValue} + new_data = { + "Object Name": culName, + "Parent Object Name": mName, + "Object Type": objectType, + "Annotation Name": culaName, + "Annotation Value": culaValue, + } df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) for e in m.Expressions: eName = e.Name - objectType = 'Expression' + objectType = "Expression" for ea in e.Annotations: eaName = ea.Name eaValue = ea.Value - new_data = {'Object Name': eName, 'Parent Object Name': mName, 'Object Type': objectType,'Annotation Name': eaName, 'Annotation Value': eaValue} + new_data = { + "Object Name": eName, + "Parent Object Name": mName, + "Object Type": objectType, + "Annotation Name": eaName, + "Annotation Value": eaValue, + } df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) for per in m.Perspectives: perName = per.Name - objectType = 'Perspective' + objectType = "Perspective" for pera in per.Annotations: peraName = pera.Name peraValue = pera.Value - new_data = {'Object Name': perName, 'Parent Object Name': mName, 'Object Type': objectType,'Annotation Name': peraName, 'Annotation Value': peraValue} + new_data = { + "Object Name": perName, + "Parent Object Name": mName, + "Object Type": objectType, + "Annotation Name": peraName, + "Annotation Value": peraValue, + } df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) for rol in m.Roles: rolName = rol.Name - objectType = 'Role' + objectType = "Role" for rola in 
rol.Annotations: rolaName = rola.Name rolaValue = rola.Value - new_data = {'Object Name': rolName, 'Parent Object Name': mName, 'Object Type': objectType,'Annotation Name': rolaName, 'Annotation Value': rolaValue} + new_data = { + "Object Name": rolName, + "Parent Object Name": mName, + "Object Type": objectType, + "Annotation Name": rolaName, + "Annotation Value": rolaValue, + } df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) return df -def list_columns(dataset: str, workspace: Optional[str] = None, lakehouse: Optional[str] = None, lakehouse_workspace: Optional[str] = None): +def list_columns( + dataset: str, + workspace: Optional[str] = None, + lakehouse: Optional[str] = None, + lakehouse_workspace: Optional[str] = None, +): """ Shows a semantic model's columns and their properties. @@ -246,42 +369,60 @@ def list_columns(dataset: str, workspace: Optional[str] = None, lakehouse: Optio The Fabric workspace used by the lakehouse. Defaults to None which resolves to the workspace of the attached lakehouse or if no lakehouse attached, resolves to the workspace of the notebook. - + Returns ------- pandas.DataFrame A pandas dataframe showing the semantic model's columns and their properties. """ + from sempy_labs.directlake._get_directlake_lakehouse import ( + get_direct_lake_lakehouse, + ) if workspace == None: workspace_id = fabric.get_workspace_id() workspace = fabric.resolve_workspace_name(workspace_id) - dfP = fabric.list_partitions(dataset = dataset, workspace = workspace) + dfP = fabric.list_partitions(dataset=dataset, workspace=workspace) - isDirectLake = any(r['Mode'] == 'DirectLake' for i, r in dfP.iterrows()) + isDirectLake = any(r["Mode"] == "DirectLake" for i, r in dfP.iterrows()) - dfC = fabric.list_columns(dataset = dataset, workspace = workspace) + dfC = fabric.list_columns(dataset=dataset, workspace=workspace) if isDirectLake: - dfC['Column Cardinality'] = None + dfC["Column Cardinality"] = None sql_statements = [] - lakeID, lakeName = get_direct_lake_lakehouse(dataset = dataset, workspace = workspace, lakehouse = lakehouse, lakehouse_workspace = lakehouse_workspace) - - for table_name in dfC['Table Name'].unique(): + lakeID, lakeName = get_direct_lake_lakehouse( + dataset=dataset, + workspace=workspace, + lakehouse=lakehouse, + lakehouse_workspace=lakehouse_workspace, + ) + + for table_name in dfC["Table Name"].unique(): print(f"Gathering stats for table: '{table_name}'...") - query = 'SELECT ' - - columns_in_table = dfC.loc[dfC['Table Name'] == table_name, 'Column Name'].unique() - + query = "SELECT " + + columns_in_table = dfC.loc[ + dfC["Table Name"] == table_name, "Column Name" + ].unique() + # Loop through columns within those tables for column_name in columns_in_table: - scName = dfC.loc[(dfC['Table Name'] == table_name) & (dfC['Column Name'] == column_name), 'Source'].iloc[0] - lakeTName = dfC.loc[(dfC['Table Name'] == table_name) & (dfC['Column Name'] == column_name), 'Query'].iloc[0] + scName = dfC.loc[ + (dfC["Table Name"] == table_name) + & (dfC["Column Name"] == column_name), + "Source", + ].iloc[0] + lakeTName = dfC.loc[ + (dfC["Table Name"] == table_name) + & (dfC["Column Name"] == column_name), + "Query", + ].iloc[0] # Build the query to be executed dynamically query = query + f"COUNT(DISTINCT({scName})) AS {scName}, " - + query = query[:-2] query = query + f" FROM {lakehouse}.{lakeTName}" sql_statements.append((table_name, query)) @@ -294,20 +435,20 @@ def list_columns(dataset: str, workspace: Optional[str] = None, lakehouse: Optio # 
Run the query df = spark.sql(query) - + for column in df.columns: x = df.collect()[0][column] for i, r in dfC.iterrows(): - if r['Table Name'] == tName and r['Source'] == column: - dfC.at[i, 'Column Cardinality'] = x + if r["Table Name"] == tName and r["Source"] == column: + dfC.at[i, "Column Cardinality"] = x # Remove column added temporarily - dfC.drop(columns=['Query'], inplace=True) + dfC.drop(columns=["Query"], inplace=True) return dfC -def list_dashboards(workspace: Optional[str] = None): +def list_dashboards(workspace: Optional[str] = None): """ Shows a list of the dashboards within a workspace. @@ -317,16 +458,27 @@ def list_dashboards(workspace: Optional[str] = None): The Fabric workspace name. Defaults to None which resolves to the workspace of the attached lakehouse or if no lakehouse attached, resolves to the workspace of the notebook. - + Returns ------- pandas.DataFrame A pandas dataframe showing the dashboards within a workspace. """ - df = pd.DataFrame(columns=['Dashboard ID', 'Dashboard Name', 'Read Only', 'Web URL', 'Embed URL', 'Data Classification', 'Users', 'Subscriptions']) - - if workspace == 'None': + df = pd.DataFrame( + columns=[ + "Dashboard ID", + "Dashboard Name", + "Read Only", + "Web URL", + "Embed URL", + "Data Classification", + "Users", + "Subscriptions", + ] + ) + + if workspace is None: workspace_id = fabric.get_workspace_id() workspace = fabric.resolve_workspace_name(workspace_id) else: @@ -335,25 +487,34 @@ def list_dashboards(workspace: Optional[str] = None): client = fabric.PowerBIRestClient() response = client.get(f"/v1.0/myorg/groups/{workspace_id}/dashboards") - for v in response.json()['value']: - dashboardID = v['id'] - displayName = v['displayName'] - isReadOnly = v['isReadOnly'] - webURL = v['webUrl'] - embedURL = v['embedUrl'] - dataClass = v['dataClassification'] - users = v['users'] - subs = v['subscriptions'] - - new_data = {'Dashboard ID': dashboardID, 'Dashboard Name': displayName, 'Read Only': isReadOnly, 'Web URL': webURL, 'Embed URL': embedURL, 'Data Classification': dataClass, 'Users': [users], 'Subscriptions': [subs]} - df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) + for v in response.json()["value"]: + dashboardID = v["id"] + displayName = v["displayName"] + isReadOnly = v["isReadOnly"] + webURL = v["webUrl"] + embedURL = v["embedUrl"] + dataClass = v["dataClassification"] + users = v["users"] + subs = v["subscriptions"] + + new_data = { + "Dashboard ID": dashboardID, + "Dashboard Name": displayName, + "Read Only": isReadOnly, + "Web URL": webURL, + "Embed URL": embedURL, + "Data Classification": dataClass, + "Users": [users], + "Subscriptions": [subs], + } + df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) - df['Read Only'] = df['Read Only'].astype(bool) + df["Read Only"] = df["Read Only"].astype(bool) return df -def list_lakehouses(workspace: Optional[str] = None): +def list_lakehouses(workspace: Optional[str] = None): """ Shows the lakehouses within a workspace. @@ -363,14 +524,25 @@ def list_lakehouses(workspace: Optional[str] = None): The Fabric workspace name. Defaults to None which resolves to the workspace of the attached lakehouse or if no lakehouse attached, resolves to the workspace of the notebook. - + Returns ------- pandas.DataFrame A pandas dataframe showing the lakehouses within a workspace.
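The Direct Lake branch of list_columns above builds one COUNT(DISTINCT(...)) statement per lakehouse table and runs it through Spark SQL to fill the 'Column Cardinality' column; a sketch of the generated query shape, using hypothetical lakehouse, table and column names.

# Hypothetical names; this mirrors the query-building loop in list_columns above.
lakehouse, lake_table = "MyLakehouse", "FactSales"
source_columns = ["SaleKey", "Amount"]
query = "SELECT " + ", ".join(f"COUNT(DISTINCT({c})) AS {c}" for c in source_columns)
query += f" FROM {lakehouse}.{lake_table}"
# -> SELECT COUNT(DISTINCT(SaleKey)) AS SaleKey, COUNT(DISTINCT(Amount)) AS Amount FROM MyLakehouse.FactSales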
""" - df = pd.DataFrame(columns=['Lakehouse Name', 'Lakehouse ID', 'Description', 'OneLake Tables Path', 'OneLake Files Path', 'SQL Endpoint Connection String', 'SQL Endpoint ID', 'SQL Endpoint Provisioning Status']) + df = pd.DataFrame( + columns=[ + "Lakehouse Name", + "Lakehouse ID", + "Description", + "OneLake Tables Path", + "OneLake Files Path", + "SQL Endpoint Connection String", + "SQL Endpoint ID", + "SQL Endpoint Provisioning Status", + ] + ) if workspace == None: workspace_id = fabric.get_workspace_id() @@ -380,26 +552,35 @@ def list_lakehouses(workspace: Optional[str] = None): client = fabric.FabricRestClient() response = client.get(f"/v1/workspaces/{workspace_id}/lakehouses/") - - for v in response.json()['value']: - lakehouseId = v['id'] - lakehouseName = v['displayName'] - lakehouseDesc = v['description'] - prop = v['properties'] - oneLakeTP = prop['oneLakeTablesPath'] - oneLakeFP = prop['oneLakeFilesPath'] - sqlEPProp = prop['sqlEndpointProperties'] - sqlEPCS = sqlEPProp['connectionString'] - sqlepid = sqlEPProp['id'] - sqlepstatus = sqlEPProp['provisioningStatus'] - - new_data = {'Lakehouse Name': lakehouseName, 'Lakehouse ID': lakehouseId, 'Description': lakehouseDesc, 'OneLake Tables Path': oneLakeTP, 'OneLake Files Path': oneLakeFP, 'SQL Endpoint Connection String': sqlEPCS, 'SQL Endpoint ID': sqlepid, 'SQL Endpoint Provisioning Status': sqlepstatus} + + for v in response.json()["value"]: + lakehouseId = v["id"] + lakehouseName = v["displayName"] + lakehouseDesc = v["description"] + prop = v["properties"] + oneLakeTP = prop["oneLakeTablesPath"] + oneLakeFP = prop["oneLakeFilesPath"] + sqlEPProp = prop["sqlEndpointProperties"] + sqlEPCS = sqlEPProp["connectionString"] + sqlepid = sqlEPProp["id"] + sqlepstatus = sqlEPProp["provisioningStatus"] + + new_data = { + "Lakehouse Name": lakehouseName, + "Lakehouse ID": lakehouseId, + "Description": lakehouseDesc, + "OneLake Tables Path": oneLakeTP, + "OneLake Files Path": oneLakeFP, + "SQL Endpoint Connection String": sqlEPCS, + "SQL Endpoint ID": sqlepid, + "SQL Endpoint Provisioning Status": sqlepstatus, + } df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) return df -def list_warehouses(workspace: Optional[str] = None): +def list_warehouses(workspace: Optional[str] = None): """ Shows the warehouses within a workspace. @@ -409,14 +590,23 @@ def list_warehouses(workspace: Optional[str] = None): The Fabric workspace name. Defaults to None which resolves to the workspace of the attached lakehouse or if no lakehouse attached, resolves to the workspace of the notebook. - + Returns ------- pandas.DataFrame A pandas dataframe showing the warehouses within a workspace. 
""" - df = pd.DataFrame(columns=['Warehouse Name', 'Warehouse ID', 'Description', 'Connection Info', 'Created Date', 'Last Updated Time']) + df = pd.DataFrame( + columns=[ + "Warehouse Name", + "Warehouse ID", + "Description", + "Connection Info", + "Created Date", + "Last Updated Time", + ] + ) if workspace == None: workspace_id = fabric.get_workspace_id() @@ -426,23 +616,30 @@ def list_warehouses(workspace: Optional[str] = None): client = fabric.FabricRestClient() response = client.get(f"/v1/workspaces/{workspace_id}/warehouses/") - - for v in response.json()['value']: - warehouse_id = v['id'] - warehouse_name = v['displayName'] - desc = v['description'] - prop = v['properties'] - connInfo = prop['connectionInfo'] - createdDate = prop['createdDate'] - lastUpdate = prop['lastUpdatedTime'] - - new_data = {'Warehouse Name': warehouse_name, 'Warehouse ID': warehouse_id, 'Description': desc, 'Connection Info': connInfo, 'Created Date': createdDate, 'Last Updated Time': lastUpdate} + + for v in response.json()["value"]: + warehouse_id = v["id"] + warehouse_name = v["displayName"] + desc = v["description"] + prop = v["properties"] + connInfo = prop["connectionInfo"] + createdDate = prop["createdDate"] + lastUpdate = prop["lastUpdatedTime"] + + new_data = { + "Warehouse Name": warehouse_name, + "Warehouse ID": warehouse_id, + "Description": desc, + "Connection Info": connInfo, + "Created Date": createdDate, + "Last Updated Time": lastUpdate, + } df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) return df -def list_sqlendpoints(workspace: Optional[str] = None): +def list_sqlendpoints(workspace: Optional[str] = None): """ Shows the SQL Endpoints within a workspace. @@ -452,14 +649,14 @@ def list_sqlendpoints(workspace: Optional[str] = None): The Fabric workspace name. Defaults to None which resolves to the workspace of the attached lakehouse or if no lakehouse attached, resolves to the workspace of the notebook. - + Returns ------- pandas.DataFrame A pandas dataframe showing the SQL Endpoints within a workspace. """ - df = pd.DataFrame(columns=['SQL Endpoint ID', 'SQL Endpoint Name', 'Description']) + df = pd.DataFrame(columns=["SQL Endpoint ID", "SQL Endpoint Name", "Description"]) if workspace == None: workspace_id = fabric.get_workspace_id() @@ -469,19 +666,23 @@ def list_sqlendpoints(workspace: Optional[str] = None): client = fabric.FabricRestClient() response = client.get(f"/v1/workspaces/{workspace_id}/sqlEndpoints/") - - for v in response.json()['value']: - sql_id = v['id'] - lake_name = v['displayName'] - desc = v['description'] - new_data = {'SQL Endpoint ID': sql_id, 'SQL Endpoint Name': lake_name, 'Description': desc} + for v in response.json()["value"]: + sql_id = v["id"] + lake_name = v["displayName"] + desc = v["description"] + + new_data = { + "SQL Endpoint ID": sql_id, + "SQL Endpoint Name": lake_name, + "Description": desc, + } df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) return df -def list_mirroredwarehouses(workspace: Optional[str] = None): +def list_mirroredwarehouses(workspace: Optional[str] = None): """ Shows the mirrored warehouses within a workspace. @@ -491,14 +692,16 @@ def list_mirroredwarehouses(workspace: Optional[str] = None): The Fabric workspace name. Defaults to None which resolves to the workspace of the attached lakehouse or if no lakehouse attached, resolves to the workspace of the notebook. - + Returns ------- pandas.DataFrame A pandas dataframe showing the mirrored warehouses within a workspace. 
""" - df = pd.DataFrame(columns=['Mirrored Warehouse', 'Mirrored Warehouse ID', 'Description']) + df = pd.DataFrame( + columns=["Mirrored Warehouse", "Mirrored Warehouse ID", "Description"] + ) if workspace == None: workspace_id = fabric.get_workspace_id() @@ -508,19 +711,23 @@ def list_mirroredwarehouses(workspace: Optional[str] = None): client = fabric.FabricRestClient() response = client.get(f"/v1/workspaces/{workspace_id}/mirroredWarehouses/") - - for v in response.json()['value']: - mirr_id = v['id'] - dbname = v['displayName'] - desc = v['description'] - new_data = {'Mirrored Warehouse': dbname, 'Mirrored Warehouse ID': mirr_id, 'Description': desc} + for v in response.json()["value"]: + mirr_id = v["id"] + dbname = v["displayName"] + desc = v["description"] + + new_data = { + "Mirrored Warehouse": dbname, + "Mirrored Warehouse ID": mirr_id, + "Description": desc, + } df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) return df -def list_kqldatabases(workspace: Optional[str] = None): +def list_kqldatabases(workspace: Optional[str] = None): """ Shows the KQL databases within a workspace. @@ -530,14 +737,24 @@ def list_kqldatabases(workspace: Optional[str] = None): The Fabric workspace name. Defaults to None which resolves to the workspace of the attached lakehouse or if no lakehouse attached, resolves to the workspace of the notebook. - + Returns ------- pandas.DataFrame A pandas dataframe showing the KQL Databases within a workspace. """ - df = pd.DataFrame(columns=['KQL Database Name', 'KQL Database ID', 'Description', 'Parent Eventhouse Item ID', 'Query Service URI', 'Ingestion Service URI', 'Kusto Database Type']) + df = pd.DataFrame( + columns=[ + "KQL Database Name", + "KQL Database ID", + "Description", + "Parent Eventhouse Item ID", + "Query Service URI", + "Ingestion Service URI", + "Kusto Database Type", + ] + ) if workspace == None: workspace_id = fabric.get_workspace_id() @@ -547,24 +764,32 @@ def list_kqldatabases(workspace: Optional[str] = None): client = fabric.FabricRestClient() response = client.get(f"/v1/workspaces/{workspace_id}/kqlDatabases/") - - for v in response.json()['value']: - kql_id = v['id'] - kql_name = v['displayName'] - desc = v['description'] - prop = v['properties'] - eventId = prop['parentEventhouseItemId'] - qsURI = prop['queryServiceUri'] - isURI = prop['ingestionServiceUri'] - dbType = prop['kustoDatabaseType'] - - new_data = {'KQL Database Name': kql_name, 'KQL Database ID': kql_id, 'Description': desc, 'Parent Eventhouse Item ID': eventId, 'Query Service URI': qsURI, 'Ingestion Service URI': isURI, 'Kusto Database Type': dbType} + + for v in response.json()["value"]: + kql_id = v["id"] + kql_name = v["displayName"] + desc = v["description"] + prop = v["properties"] + eventId = prop["parentEventhouseItemId"] + qsURI = prop["queryServiceUri"] + isURI = prop["ingestionServiceUri"] + dbType = prop["kustoDatabaseType"] + + new_data = { + "KQL Database Name": kql_name, + "KQL Database ID": kql_id, + "Description": desc, + "Parent Eventhouse Item ID": eventId, + "Query Service URI": qsURI, + "Ingestion Service URI": isURI, + "Kusto Database Type": dbType, + } df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) return df -def list_kqlquerysets(workspace: Optional[str] = None): +def list_kqlquerysets(workspace: Optional[str] = None): """ Shows the KQL Querysets within a workspace. @@ -574,14 +799,14 @@ def list_kqlquerysets(workspace: Optional[str] = None): The Fabric workspace name. 
Defaults to None which resolves to the workspace of the attached lakehouse or if no lakehouse attached, resolves to the workspace of the notebook. - + Returns ------- pandas.DataFrame A pandas dataframe showing the KQL Querysets within a workspace. """ - df = pd.DataFrame(columns=['KQL Queryset Name', 'KQL Queryset ID', 'Description']) + df = pd.DataFrame(columns=["KQL Queryset Name", "KQL Queryset ID", "Description"]) if workspace == None: workspace_id = fabric.get_workspace_id() @@ -591,19 +816,23 @@ def list_kqlquerysets(workspace: Optional[str] = None): client = fabric.FabricRestClient() response = client.get(f"/v1/workspaces/{workspace_id}/kqlQuerysets/") - - for v in response.json()['value']: - kql_id = v['id'] - kql_name = v['displayName'] - desc = v['description'] - new_data = {'KQL Queryset Name': kql_name, 'KQL Queryset ID': kql_id, 'Description': desc} + for v in response.json()["value"]: + kql_id = v["id"] + kql_name = v["displayName"] + desc = v["description"] + + new_data = { + "KQL Queryset Name": kql_name, + "KQL Queryset ID": kql_id, + "Description": desc, + } df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) return df -def list_mlmodels(workspace: Optional[str] = None): +def list_mlmodels(workspace: Optional[str] = None): """ Shows the ML models within a workspace. @@ -613,14 +842,14 @@ def list_mlmodels(workspace: Optional[str] = None): The Fabric workspace name. Defaults to None which resolves to the workspace of the attached lakehouse or if no lakehouse attached, resolves to the workspace of the notebook. - + Returns ------- pandas.DataFrame A pandas dataframe showing the ML models within a workspace. """ - df = pd.DataFrame(columns=['ML Model Name', 'ML Model ID', 'Description']) + df = pd.DataFrame(columns=["ML Model Name", "ML Model ID", "Description"]) if workspace == None: workspace_id = fabric.get_workspace_id() @@ -630,19 +859,23 @@ def list_mlmodels(workspace: Optional[str] = None): client = fabric.FabricRestClient() response = client.get(f"/v1/workspaces/{workspace_id}/mlModels/") - - for v in response.json()['value']: - model_id = v['id'] - modelName = v['displayName'] - desc = v['description'] - new_data = {'ML Model Name': modelName, 'ML Model ID': model_id, 'Description': desc} + for v in response.json()["value"]: + model_id = v["id"] + modelName = v["displayName"] + desc = v["description"] + + new_data = { + "ML Model Name": modelName, + "ML Model ID": model_id, + "Description": desc, + } df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) return df -def list_eventstreams(workspace: Optional[str] = None): +def list_eventstreams(workspace: Optional[str] = None): """ Shows the eventstreams within a workspace. @@ -652,14 +885,14 @@ def list_eventstreams(workspace: Optional[str] = None): The Fabric workspace name. Defaults to None which resolves to the workspace of the attached lakehouse or if no lakehouse attached, resolves to the workspace of the notebook. - + Returns ------- pandas.DataFrame A pandas dataframe showing the eventstreams within a workspace. 
""" - df = pd.DataFrame(columns=['Eventstream Name', 'Eventstream ID', 'Description']) + df = pd.DataFrame(columns=["Eventstream Name", "Eventstream ID", "Description"]) if workspace == None: workspace_id = fabric.get_workspace_id() @@ -669,19 +902,23 @@ def list_eventstreams(workspace: Optional[str] = None): client = fabric.FabricRestClient() response = client.get(f"/v1/workspaces/{workspace_id}/eventstreams/") - - for v in response.json()['value']: - model_id = v['id'] - modelName = v['displayName'] - desc = v['description'] - new_data = {'Eventstream Name': modelName, 'Eventstream ID': model_id, 'Description': desc} + for v in response.json()["value"]: + model_id = v["id"] + modelName = v["displayName"] + desc = v["description"] + + new_data = { + "Eventstream Name": modelName, + "Eventstream ID": model_id, + "Description": desc, + } df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) return df -def list_datapipelines(workspace: Optional[str] = None): +def list_datapipelines(workspace: Optional[str] = None): """ Shows the data pipelines within a workspace. @@ -691,14 +928,14 @@ def list_datapipelines(workspace: Optional[str] = None): The Fabric workspace name. Defaults to None which resolves to the workspace of the attached lakehouse or if no lakehouse attached, resolves to the workspace of the notebook. - + Returns ------- pandas.DataFrame A pandas dataframe showing the data pipelines within a workspace. """ - df = pd.DataFrame(columns=['Data Pipeline Name', 'Data Pipeline ID', 'Description']) + df = pd.DataFrame(columns=["Data Pipeline Name", "Data Pipeline ID", "Description"]) if workspace == None: workspace_id = fabric.get_workspace_id() @@ -708,19 +945,23 @@ def list_datapipelines(workspace: Optional[str] = None): client = fabric.FabricRestClient() response = client.get(f"/v1/workspaces/{workspace_id}/dataPipelines/") - - for v in response.json()['value']: - model_id = v['id'] - modelName = v['displayName'] - desc = v['description'] - new_data = {'Data Pipeline Name': modelName, 'Data Pipeline ID': model_id, 'Description': desc} + for v in response.json()["value"]: + model_id = v["id"] + modelName = v["displayName"] + desc = v["description"] + + new_data = { + "Data Pipeline Name": modelName, + "Data Pipeline ID": model_id, + "Description": desc, + } df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) return df -def list_mlexperiments(workspace: Optional[str] = None): +def list_mlexperiments(workspace: Optional[str] = None): """ Shows the ML experiments within a workspace. @@ -730,14 +971,14 @@ def list_mlexperiments(workspace: Optional[str] = None): The Fabric workspace name. Defaults to None which resolves to the workspace of the attached lakehouse or if no lakehouse attached, resolves to the workspace of the notebook. - + Returns ------- pandas.DataFrame A pandas dataframe showing the ML experiments within a workspace. 
""" - df = pd.DataFrame(columns=['ML Experiment Name', 'ML Experiment ID', 'Description']) + df = pd.DataFrame(columns=["ML Experiment Name", "ML Experiment ID", "Description"]) if workspace == None: workspace_id = fabric.get_workspace_id() @@ -747,19 +988,23 @@ def list_mlexperiments(workspace: Optional[str] = None): client = fabric.FabricRestClient() response = client.get(f"/v1/workspaces/{workspace_id}/mlExperiments/") - - for v in response.json()['value']: - model_id = v['id'] - modelName = v['displayName'] - desc = v['description'] - new_data = {'ML Experiment Name': modelName, 'ML Experiment ID': model_id, 'Description': desc} + for v in response.json()["value"]: + model_id = v["id"] + modelName = v["displayName"] + desc = v["description"] + + new_data = { + "ML Experiment Name": modelName, + "ML Experiment ID": model_id, + "Description": desc, + } df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) return df -def list_datamarts(workspace: Optional[str] = None): +def list_datamarts(workspace: Optional[str] = None): """ Shows the datamarts within a workspace. @@ -769,14 +1014,14 @@ def list_datamarts(workspace: Optional[str] = None): The Fabric workspace name. Defaults to None which resolves to the workspace of the attached lakehouse or if no lakehouse attached, resolves to the workspace of the notebook. - + Returns ------- pandas.DataFrame A pandas dataframe showing the datamarts within a workspace. """ - df = pd.DataFrame(columns=['Datamart Name', 'Datamart ID', 'Description']) + df = pd.DataFrame(columns=["Datamart Name", "Datamart ID", "Description"]) if workspace == None: workspace_id = fabric.get_workspace_id() @@ -786,19 +1031,25 @@ def list_datamarts(workspace: Optional[str] = None): client = fabric.FabricRestClient() response = client.get(f"/v1/workspaces/{workspace_id}/datamarts/") - - for v in response.json()['value']: - model_id = v['id'] - modelName = v['displayName'] - desc = v['description'] - new_data = {'Datamart Name': modelName, 'Datamart ID': model_id, 'Description': desc} + for v in response.json()["value"]: + model_id = v["id"] + modelName = v["displayName"] + desc = v["description"] + + new_data = { + "Datamart Name": modelName, + "Datamart ID": model_id, + "Description": desc, + } df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) return df -def create_warehouse(warehouse: str, description: Optional[str] = None, workspace: Optional[str] = None): +def create_warehouse( + warehouse: str, description: Optional[str] = None, workspace: Optional[str] = None +): """ Creates a Fabric warehouse. @@ -812,48 +1063,57 @@ def create_warehouse(warehouse: str, description: Optional[str] = None, workspac The Fabric workspace name. Defaults to None which resolves to the workspace of the attached lakehouse or if no lakehouse attached, resolves to the workspace of the notebook. 
- + Returns ------- - + """ if workspace == None: workspace_id = fabric.get_workspace_id() workspace = fabric.resolve_workspace_name(workspace_id) else: - workspace_id = fabric.resolve_workspace_id(workspace) + workspace_id = fabric.resolve_workspace_id(workspace) if description == None: - request_body = { - "displayName": warehouse - } + request_body = {"displayName": warehouse} else: - request_body = { - "displayName": warehouse, - "description": description - } + request_body = {"displayName": warehouse, "description": description} client = fabric.FabricRestClient() - response = client.post(f"/v1/workspaces/{workspace_id}/warehouses/", json=request_body) + response = client.post( + f"/v1/workspaces/{workspace_id}/warehouses/", json=request_body + ) if response.status_code == 201: - print(f"The '{warehouse}' warehouse has been created within the '{workspace}' workspace.") + print( + f"The '{warehouse}' warehouse has been created within the '{workspace}' workspace." + ) elif response.status_code == 202: - operationId = response.headers['x-ms-operation-id'] + operationId = response.headers["x-ms-operation-id"] response = client.get(f"/v1/operations/{operationId}") - response_body = json.loads(response.content) - while response_body['status'] != 'Succeeded': + response_body = json.loads(response.content) + while response_body["status"] != "Succeeded": time.sleep(3) response = client.get(f"/v1/operations/{operationId}") response_body = json.loads(response.content) response = client.get(f"/v1/operations/{operationId}/result") - print(f"The '{warehouse}' warehouse has been created within the '{workspace}' workspace.") + print( + f"The '{warehouse}' warehouse has been created within the '{workspace}' workspace." + ) else: - print(f"ERROR: Failed to create the '{warehouse}' warehouse within the '{workspace}' workspace.") - -def update_item(item_type: str, current_name: str, new_name: str, description: Optional[str] = None, workspace:Optional[str] = None): - + print( + f"ERROR: Failed to create the '{warehouse}' warehouse within the '{workspace}' workspace." + ) + + +def update_item( + item_type: str, + current_name: str, + new_name: str, + description: Optional[str] = None, + workspace: Optional[str] = None, +): """ Updates the name/description of a Fabric item. @@ -871,10 +1131,10 @@ def update_item(item_type: str, current_name: str, new_name: str, description: O The Fabric workspace name. Defaults to None which resolves to the workspace of the attached lakehouse or if no lakehouse attached, resolves to the workspace of the notebook. - + Returns ------- - + """ if workspace == None: @@ -884,57 +1144,64 @@ def update_item(item_type: str, current_name: str, new_name: str, description: O workspace_id = fabric.resolve_workspace_id(workspace) itemTypes = { - 'DataPipeline': 'dataPipelines', - 'Eventstream': 'eventstreams', - 'KQLDatabase': 'kqlDatabases', - 'KQLQueryset': 'kqlQuerysets', - 'Lakehouse': 'lakehouses', - 'MLExperiment': 'mlExperiments', - 'MLModel': 'mlModels', - 'Notebook': 'notebooks', - 'Warehouse': 'warehouses', + "DataPipeline": "dataPipelines", + "Eventstream": "eventstreams", + "KQLDatabase": "kqlDatabases", + "KQLQueryset": "kqlQuerysets", + "Lakehouse": "lakehouses", + "MLExperiment": "mlExperiments", + "MLModel": "mlModels", + "Notebook": "notebooks", + "Warehouse": "warehouses", } - item_type = item_type.replace(' ','').capitalize() + item_type = item_type.replace(" ", "").capitalize() if item_type not in itemTypes.keys(): print(f"The '{item_type}' is not a valid item type. 
") return - + itemType = itemTypes[item_type] - dfI = fabric.list_items(workspace = workspace, type = item_type) - dfI_filt = dfI[(dfI['Display Name'] == current_name)] + dfI = fabric.list_items(workspace=workspace, type=item_type) + dfI_filt = dfI[(dfI["Display Name"] == current_name)] if len(dfI_filt) == 0: - print(f"The '{current_name}' {item_type} does not exist within the '{workspace}' workspace.") + print( + f"The '{current_name}' {item_type} does not exist within the '{workspace}' workspace." + ) return - - itemId = dfI_filt['Id'].iloc[0] + + itemId = dfI_filt["Id"].iloc[0] if description == None: - request_body = { - "displayName": new_name - } + request_body = {"displayName": new_name} else: - request_body = { - "displayName": new_name, - "description": description - } + request_body = {"displayName": new_name, "description": description} client = fabric.FabricRestClient() - response = client.patch(f"/v1/workspaces/{workspace_id}/{itemType}/{itemId}", json=request_body) + response = client.patch( + f"/v1/workspaces/{workspace_id}/{itemType}/{itemId}", json=request_body + ) if response.status_code == 200: if description == None: - print(f"The '{current_name}' {item_type} within the '{workspace}' workspace has been updated to be named '{new_name}'") + print( + f"The '{current_name}' {item_type} within the '{workspace}' workspace has been updated to be named '{new_name}'" + ) else: - print(f"The '{current_name}' {item_type} within the '{workspace}' workspace has been updated to be named '{new_name}' and have a description of '{description}'") + print( + f"The '{current_name}' {item_type} within the '{workspace}' workspace has been updated to be named '{new_name}' and have a description of '{description}'" + ) else: - print(f"ERROR: The '{current_name}' {item_type} within the '{workspace}' workspace was not updateds.") + print( + f"ERROR: The '{current_name}' {item_type} within the '{workspace}' workspace was not updateds." + ) -def list_relationships(dataset: str, workspace: Optional[str] = None, extended: Optional[bool] = False): +def list_relationships( + dataset: str, workspace: Optional[str] = None, extended: Optional[bool] = False +): """ Shows a semantic model's relationships and their properties. @@ -948,7 +1215,7 @@ def list_relationships(dataset: str, workspace: Optional[str] = None, extended: or if no lakehouse attached, resolves to the workspace of the notebook. extended : bool, default=False Fetches extended column information. 
- + Returns ------- pandas.DataFrame @@ -958,83 +1225,104 @@ def list_relationships(dataset: str, workspace: Optional[str] = None, extended: if workspace == None: workspace_id = fabric.get_workspace_id() workspace = fabric.resolve_workspace_name(workspace_id) - - dfR = fabric.list_relationships(dataset = dataset, workspace = workspace) + + dfR = fabric.list_relationships(dataset=dataset, workspace=workspace) if extended: # Used to map the Relationship IDs rel = fabric.evaluate_dax( - dataset = dataset, workspace = workspace, dax_string = - """ + dataset=dataset, + workspace=workspace, + dax_string=""" SELECT [ID] AS [RelationshipID] ,[Name] FROM $SYSTEM.TMSCHEMA_RELATIONSHIPS - """) + """, + ) # USED_SIZE shows the Relationship Size where TABLE_ID starts with R$ cs = fabric.evaluate_dax( - dataset = dataset, workspace = workspace, dax_string = - """ + dataset=dataset, + workspace=workspace, + dax_string=""" SELECT [TABLE_ID] ,[USED_SIZE] FROM $SYSTEM.DISCOVER_STORAGE_TABLE_COLUMN_SEGMENTS - """) + """, + ) def parse_value(text): - ind = text.rfind('(') + 1 + ind = text.rfind("(") + 1 output = text[ind:] output = output[:-1] return output - cs['RelationshipID'] = cs['TABLE_ID'].apply(parse_value).astype('uint64') - relcs = pd.merge(cs[['RelationshipID', 'TABLE_ID', 'USED_SIZE']], rel, on='RelationshipID', how='left') + cs["RelationshipID"] = cs["TABLE_ID"].apply(parse_value).astype("uint64") + relcs = pd.merge( + cs[["RelationshipID", "TABLE_ID", "USED_SIZE"]], + rel, + on="RelationshipID", + how="left", + ) - dfR['Used Size'] = None + dfR["Used Size"] = None for i, r in dfR.iterrows(): - relName = r['Relationship Name'] + relName = r["Relationship Name"] + + filtered_cs = relcs[ + (relcs["Name"] == relName) & (relcs["TABLE_ID"].str.startswith("R$")) + ] + sumval = filtered_cs["USED_SIZE"].sum() + dfR.at[i, "Used Size"] = sumval - filtered_cs = relcs[(relcs['Name'] == relName) & (relcs['TABLE_ID'].str.startswith("R$"))] - sumval = filtered_cs['USED_SIZE'].sum() - dfR.at[i, 'Used Size'] = sumval + dfR["Used Size"] = dfR["Used Size"].astype("int") - dfR['Used Size'] = dfR['Used Size'].astype('int') - return dfR -def list_dataflow_storage_accounts(): +def list_dataflow_storage_accounts(): """ Shows the accessible dataflow storage accounts. Parameters ---------- - + Returns ------- pandas.DataFrame A pandas dataframe showing the accessible dataflow storage accounts. 
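When called with extended=True, list_relationships above joins the TMSCHEMA_RELATIONSHIPS and DISCOVER_STORAGE_TABLE_COLUMN_SEGMENTS DMVs to add a 'Used Size' column per relationship; a hedged usage sketch with placeholder names.

from sempy_labs._list_functions import list_relationships  # path assumed from the rename in this patch

dfR = list_relationships(dataset="AdventureWorks", workspace="Sales Analytics", extended=True)
print(dfR[["Relationship Name", "Used Size"]].head())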
""" - df = pd.DataFrame(columns=['Dataflow Storage Account ID', 'Dataflow Storage Account Name', 'Enabled']) + df = pd.DataFrame( + columns=[ + "Dataflow Storage Account ID", + "Dataflow Storage Account Name", + "Enabled", + ] + ) client = fabric.PowerBIRestClient() response = client.get(f"/v1.0/myorg/dataflowStorageAccounts") - - for v in response.json()['value']: - dfsaId = v['id'] - dfsaName = v['name'] - isEnabled = v['isEnabled'] - - new_data = {'Dataflow Storage Account ID': dfsaId, 'Dataflow Storage Account Name': dfsaName, 'Enabled': isEnabled} + + for v in response.json()["value"]: + dfsaId = v["id"] + dfsaName = v["name"] + isEnabled = v["isEnabled"] + + new_data = { + "Dataflow Storage Account ID": dfsaId, + "Dataflow Storage Account Name": dfsaName, + "Enabled": isEnabled, + } df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) - df['Enabled'] = df['Enabled'].astype(bool) + df["Enabled"] = df["Enabled"].astype(bool) return df -def list_kpis(dataset: str, workspace: Optional[str] = None): +def list_kpis(dataset: str, workspace: Optional[str] = None): """ Shows a semantic model's KPIs and their properties. @@ -1055,20 +1343,50 @@ def list_kpis(dataset: str, workspace: Optional[str] = None): from .TOM import connect_semantic_model - with connect_semantic_model(dataset = dataset, workspace = workspace, readonly = True) as tom: - - df = pd.DataFrame(columns=['Table Name', 'Measure Name', 'Target Expression', 'Target Format String', 'Target Description', 'Status Expression', 'Status Graphic', 'Status Description', 'Trend Expression', 'Trend Graphic', 'Trend Description']) + with connect_semantic_model( + dataset=dataset, workspace=workspace, readonly=True + ) as tom: + + df = pd.DataFrame( + columns=[ + "Table Name", + "Measure Name", + "Target Expression", + "Target Format String", + "Target Description", + "Status Expression", + "Status Graphic", + "Status Description", + "Trend Expression", + "Trend Graphic", + "Trend Description", + ] + ) for t in tom.model.Tables: for m in t.Measures: if m.KPI is not None: - new_data = {'Table Name': t.Name, 'Measure Name': m.Name, 'Target Expression': m.KPI.TargetExpression, 'Target Format String': m.KPI.TargetFormatString, 'Target Description': m.KPI.TargetDescription, 'Status Graphic': m.KPI.StatusGraphic, 'Status Expression': m.KPI.StatusExpression, 'Status Description': m.KPI.StatusDescription, 'Trend Expression': m.KPI.TrendExpression, 'Trend Graphic': m.KPI.TrendGraphic, 'Trend Description': m.KPI.TrendDescription} - df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) + new_data = { + "Table Name": t.Name, + "Measure Name": m.Name, + "Target Expression": m.KPI.TargetExpression, + "Target Format String": m.KPI.TargetFormatString, + "Target Description": m.KPI.TargetDescription, + "Status Graphic": m.KPI.StatusGraphic, + "Status Expression": m.KPI.StatusExpression, + "Status Description": m.KPI.StatusDescription, + "Trend Expression": m.KPI.TrendExpression, + "Trend Graphic": m.KPI.TrendGraphic, + "Trend Description": m.KPI.TrendDescription, + } + df = pd.concat( + [df, pd.DataFrame(new_data, index=[0])], ignore_index=True + ) return df - -def list_workspace_role_assignments(workspace: Optional[str] = None): + +def list_workspace_role_assignments(workspace: Optional[str] = None): """ Shows the members of a given workspace. @@ -1078,7 +1396,7 @@ def list_workspace_role_assignments(workspace: Optional[str] = None): The Fabric workspace name. 
Defaults to None which resolves to the workspace of the attached lakehouse or if no lakehouse attached, resolves to the workspace of the notebook. - + Returns ------- pandas.DataFrame @@ -1091,18 +1409,23 @@ def list_workspace_role_assignments(workspace: Optional[str] = None): else: workspace_id = fabric.resolve_workspace_id(workspace) - df = pd.DataFrame(columns=['User Name', 'User Email', 'Role Name', 'Type']) + df = pd.DataFrame(columns=["User Name", "User Email", "Role Name", "Type"]) client = fabric.FabricRestClient() response = client.get(f"/v1/workspaces/{workspace_id}/roleAssignments") - for i in response.json()['value']: - user_name = i['principal']['displayName'] - role_name = i['role'] - user_email = i['principal']['userDetails']['userPrincipalName'] - user_type = i['principal']['type'] - - new_data = {'User Name': user_name, 'Role Name': role_name, 'Type': user_type, 'User Email': user_email} + for i in response.json()["value"]: + user_name = i["principal"]["displayName"] + role_name = i["role"] + user_email = i["principal"]["userDetails"]["userPrincipalName"] + user_type = i["principal"]["type"] + + new_data = { + "User Name": user_name, + "Role Name": role_name, + "Type": user_type, + "User Email": user_email, + } df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) - return df \ No newline at end of file + return df diff --git a/sempy_labs/directlake/__init__.py b/sempy_labs/directlake/__init__.py new file mode 100644 index 00000000..de59562e --- /dev/null +++ b/sempy_labs/directlake/__init__.py @@ -0,0 +1,24 @@ +from sempy_labs.directlake._directlake_schema_compare import ( + direct_lake_schema_compare as direct_lake_schema_compare, +) +from sempy_labs.directlake._directlake_schema_sync import ( + direct_lake_schema_sync as direct_lake_schema_sync, +) +from sempy_labs.directlake._get_directlake_lakehouse import ( + get_directlake_lakehouse as get_directlake_lakehouse, +) +from sempy_labs.directlake._get_directlake_model_calc_tables import ( + list_directlake_model_calc_tables as list_directlake_model_calc_tables, +) +from sempy_labs.directlake._show_unsupported_directlake_objects import ( + show_unsupported_direct_lake_objects as show_unsupported_direct_lake_objects, +) +from sempy_labs.directlake._update_directlake_model_lakehouse_connection import ( + update_direct_lake_model_lakehouse_connection as update_direct_lake_model_lakehouse_connection, +) +from sempy_labs.directlake._update_directlake_partition_entity import ( + update_direct_lake_partition_entity as update_direct_lake_partition_entity, +) +from sempy_labs.directlake._get_shared_expression import ( + get_shared_expression as get_shared_expression, +) diff --git a/sempy_labs/directlake/_directlake_schema_compare.py b/sempy_labs/directlake/_directlake_schema_compare.py new file mode 100644 index 00000000..f5b83aa1 --- /dev/null +++ b/sempy_labs/directlake/_directlake_schema_compare.py @@ -0,0 +1,108 @@ +import sempy.fabric as fabric +import pandas as pd +from sempy_labs._helper_functions import ( + format_dax_object_name, + resolve_lakehouse_name, + get_direct_lake_sql_endpoint, +) +from sempy_labs.lakehouse import get_lakehouse_columns +from sempy_labs._list_functions import list_tables +from typing import Optional + + +def direct_lake_schema_compare( + dataset: str, + workspace: Optional[str] = None, + lakehouse: Optional[str] = None, + lakehouse_workspace: Optional[str] = None, +): + """ + Checks that all the tables in a Direct Lake semantic model map to tables in their corresponding 
lakehouse and that the columns in each table exist. + + Parameters + ---------- + dataset : str + Name of the semantic model. + workspace : str, default=None + The Fabric workspace name. + Defaults to None which resolves to the workspace of the attached lakehouse + or if no lakehouse attached, resolves to the workspace of the notebook. + lakehouse : str, default=None + The Fabric lakehouse used by the Direct Lake semantic model. + Defaults to None which resolves to the lakehouse attached to the notebook. + lakehouse_workspace : str, default=None + The Fabric workspace used by the lakehouse. + Defaults to None which resolves to the workspace of the attached lakehouse + or if no lakehouse attached, resolves to the workspace of the notebook. + """ + + if workspace == None: + workspace_id = fabric.get_workspace_id() + workspace = fabric.resolve_workspace_name(workspace_id) + + if lakehouse_workspace is None: + lakehouse_workspace = workspace + + if lakehouse == None: + lakehouse_id = fabric.get_lakehouse_id() + lakehouse = resolve_lakehouse_name(lakehouse_id, lakehouse_workspace) + + dfP = fabric.list_partitions(dataset=dataset, workspace=workspace) + sqlEndpointId = get_direct_lake_sql_endpoint(dataset, workspace) + dfI = fabric.list_items(workspace=lakehouse_workspace, type="SQLEndpoint") + dfI_filt = dfI[(dfI["Id"] == sqlEndpointId)] + + if len(dfI_filt) == 0: + print( + f"The SQL Endpoint in the '{dataset}' semantic model in the '{workspace} workspace does not point to the '{lakehouse}' lakehouse in the '{lakehouse_workspace}' workspace as specified." + ) + return + + if not any(r["Mode"] == "DirectLake" for i, r in dfP.iterrows()): + print(f"The '{dataset}' semantic model is not in Direct Lake mode.") + return + + dfT = list_tables(dataset, workspace) + dfC = fabric.list_columns(dataset=dataset, workspace=workspace) + lc = get_lakehouse_columns(lakehouse, lakehouse_workspace) + + dfT.rename(columns={"Type": "Table Type"}, inplace=True) + dfP_filt = dfP[dfP["Mode"] == "DirectLake"] + dfC = pd.merge(dfC, dfP[["Table Name", "Query"]], on="Table Name", how="inner") + dfC = pd.merge( + dfC, + dfT[["Name", "Table Type"]], + left_on="Table Name", + right_on="Name", + how="inner", + ) + dfC["Full Column Name"] = format_dax_object_name(dfC["Query"], dfC["Source"]) + dfC_filt = dfC[dfC["Table Type"] == "Table"] + # Schema compare + missingtbls = dfP_filt[~dfP_filt["Query"].isin(lc["Table Name"])] + missingtbls = missingtbls[["Table Name", "Query"]] + missingtbls.rename(columns={"Query": "Source Table"}, inplace=True) + missingcols = dfC_filt[~dfC_filt["Full Column Name"].isin(lc["Full Column Name"])] + missingcols = missingcols[ + ["Table Name", "Column Name", "Type", "Data Type", "Source"] + ] + missingcols.rename(columns={"Source": "Source Column"}, inplace=True) + + if len(missingtbls) == 0: + print( + f"All tables exist in the '{lakehouse}' lakehouse within the '{lakehouse_workspace}' workspace." + ) + else: + print( + f"The following tables exist in the '{dataset}' semantic model within the '{workspace}' workspace but do not exist in the '{lakehouse}' lakehouse within the '{lakehouse_workspace}' workspace." + ) + display(missingtbls) + if len(missingcols) == 0: + print( + f"All columns exist in the '{lakehouse}' lakehouse within the '{lakehouse_workspace}' workspace." + ) + else: + print( + f"The following columns exist in the '{dataset}' semantic model within the '{workspace}' workspace but do not exist in the '{lakehouse}' lakehouse within the '{lakehouse_workspace}' workspace." 
+ ) + display(missingcols) diff --git a/sempy_labs/DirectLakeSchemaSync.py b/sempy_labs/directlake/_directlake_schema_sync.py similarity index 50% rename from sempy_labs/DirectLakeSchemaSync.py rename to sempy_labs/directlake/_directlake_schema_sync.py index 8ea4017e..70c7a1a5 100644 --- a/sempy_labs/DirectLakeSchemaSync.py +++ b/sempy_labs/directlake/_directlake_schema_sync.py @@ -1,14 +1,24 @@ import sempy import sempy.fabric as fabric import pandas as pd -from .GetLakehouseColumns import get_lakehouse_columns -from .HelperFunctions import format_dax_object_name, resolve_lakehouse_name, get_direct_lake_sql_endpoint -from typing import List, Optional, Union +from .lakehouse.GetLakehouseColumns import get_lakehouse_columns +from sempy_labs._helper_functions import ( + format_dax_object_name, + resolve_lakehouse_name, + get_direct_lake_sql_endpoint, +) +from typing import Optional from sempy._utils._log import log -@log -def direct_lake_schema_sync(dataset: str, workspace: Optional[str] = None, add_to_model: Optional[bool] = False, lakehouse: Optional[str] = None, lakehouse_workspace: Optional[str] = None): +@log +def direct_lake_schema_sync( + dataset: str, + workspace: Optional[str] = None, + add_to_model: Optional[bool] = False, + lakehouse: Optional[str] = None, + lakehouse_workspace: Optional[str] = None, +): """ Shows/adds columns which exist in the lakehouse but do not exist in the semantic model (only for tables in the semantic model). @@ -29,17 +39,12 @@ def direct_lake_schema_sync(dataset: str, workspace: Optional[str] = None, add_t The Fabric workspace used by the lakehouse. Defaults to None which resolves to the workspace of the attached lakehouse or if no lakehouse attached, resolves to the workspace of the notebook. - - Returns - ------- - """ sempy.fabric._client._utils._init_analysis_services() import Microsoft.AnalysisServices.Tabular as TOM import System - if workspace == None: workspace_id = fabric.get_workspace_id() workspace = fabric.resolve_workspace_name(workspace_id) @@ -53,46 +58,52 @@ def direct_lake_schema_sync(dataset: str, workspace: Optional[str] = None, add_t sqlEndpointId = get_direct_lake_sql_endpoint(dataset, workspace) - dfI = fabric.list_items(workspace = lakehouse_workspace, type = 'SQLEndpoint') - dfI_filt = dfI[(dfI['Id'] == sqlEndpointId)] + dfI = fabric.list_items(workspace=lakehouse_workspace, type="SQLEndpoint") + dfI_filt = dfI[(dfI["Id"] == sqlEndpointId)] if len(dfI_filt) == 0: - print(f"The SQL Endpoint in the '{dataset}' semantic model in the '{workspace} workspace does not point to the '{lakehouse}' lakehouse in the '{lakehouse_workspace}' workspace as specified.") + print( + f"The SQL Endpoint in the '{dataset}' semantic model in the '{workspace} workspace does not point to the '{lakehouse}' lakehouse in the '{lakehouse_workspace}' workspace as specified." 
+ ) return - dfP = fabric.list_partitions(dataset = dataset, workspace = workspace) - dfP_filt = dfP[dfP['Source Type'] == 'Entity'] - dfC = fabric.list_columns(dataset = dataset, workspace = workspace) - dfC_filt = dfC[dfC['Table Name'].isin(dfP_filt['Table Name'].values)] - dfC_filt = pd.merge(dfC_filt, dfP_filt[['Table Name', 'Query']], on = 'Table Name', how = 'left') - dfC_filt['Column Object'] = format_dax_object_name(dfC_filt['Query'], dfC_filt['Source']) + dfP = fabric.list_partitions(dataset=dataset, workspace=workspace) + dfP_filt = dfP[dfP["Source Type"] == "Entity"] + dfC = fabric.list_columns(dataset=dataset, workspace=workspace) + dfC_filt = dfC[dfC["Table Name"].isin(dfP_filt["Table Name"].values)] + dfC_filt = pd.merge( + dfC_filt, dfP_filt[["Table Name", "Query"]], on="Table Name", how="left" + ) + dfC_filt["Column Object"] = format_dax_object_name( + dfC_filt["Query"], dfC_filt["Source"] + ) lc = get_lakehouse_columns(lakehouse, lakehouse_workspace) - lc_filt = lc[lc['Table Name'].isin(dfP_filt['Query'].values)] + lc_filt = lc[lc["Table Name"].isin(dfP_filt["Query"].values)] mapping = { - 'string': 'String', - 'bigint': 'Int64', - 'int': 'Int64', - 'smallint': 'Int64', - 'boolean': 'Boolean', - 'timestamp': 'DateTime', - 'date': 'DateTime', - 'decimal(38,18)': 'Decimal', - 'double': 'Double' + "string": "String", + "bigint": "Int64", + "int": "Int64", + "smallint": "Int64", + "boolean": "Boolean", + "timestamp": "DateTime", + "date": "DateTime", + "decimal(38,18)": "Decimal", + "double": "Double", } tom_server = fabric.create_tom_server(readonly=False, workspace=workspace) m = tom_server.Databases.GetByName(dataset).Model for i, r in lc_filt.iterrows(): - lakeTName = r['Table Name'] - lakeCName = r['Column Name'] - fullColName = r['Full Column Name'] - dType = r['Data Type'] - - if fullColName not in dfC_filt['Column Object'].values: - dfL = dfP_filt[dfP_filt['Query'] == lakeTName] - tName = dfL['Table Name'].iloc[0] + lakeTName = r["Table Name"] + lakeCName = r["Column Name"] + fullColName = r["Full Column Name"] + dType = r["Data Type"] + + if fullColName not in dfC_filt["Column Object"].values: + dfL = dfP_filt[dfP_filt["Query"] == lakeTName] + tName = dfL["Table Name"].iloc[0] if add_to_model: col = TOM.DataColumn() col.Name = lakeCName @@ -101,11 +112,17 @@ def direct_lake_schema_sync(dataset: str, workspace: Optional[str] = None, add_t try: col.DataType = System.Enum.Parse(TOM.DataType, dt) except: - print(f"ERROR: '{dType}' data type is not mapped properly to the semantic model data types.") + print( + f"ERROR: '{dType}' data type is not mapped properly to the semantic model data types." + ) return m.Tables[tName].Columns.Add(col) - print(f"The '{lakeCName}' column has been added to the '{tName}' table as a '{dt}' data type within the '{dataset}' semantic model within the '{workspace}' workspace.") + print( + f"The '{lakeCName}' column has been added to the '{tName}' table as a '{dt}' data type within the '{dataset}' semantic model within the '{workspace}' workspace." + ) else: - print(f"The {fullColName} column exists in the lakehouse but not in the '{tName}' table in the '{dataset}' semantic model within the '{workspace}' workspace.") + print( + f"The {fullColName} column exists in the lakehouse but not in the '{tName}' table in the '{dataset}' semantic model within the '{workspace}' workspace." 
+ ) m.SaveChanges() diff --git a/sempy_labs/GetDirectLakeLakehouse.py b/sempy_labs/directlake/_get_directlake_lakehouse.py similarity index 69% rename from sempy_labs/GetDirectLakeLakehouse.py rename to sempy_labs/directlake/_get_directlake_lakehouse.py index 8ef8fb46..1fa8f778 100644 --- a/sempy_labs/GetDirectLakeLakehouse.py +++ b/sempy_labs/directlake/_get_directlake_lakehouse.py @@ -1,11 +1,20 @@ import sempy import sempy.fabric as fabric -from .HelperFunctions import resolve_lakehouse_id, resolve_lakehouse_name, get_direct_lake_sql_endpoint +from sempy_labs._helper_functions import ( + resolve_lakehouse_id, + resolve_lakehouse_name, + get_direct_lake_sql_endpoint, +) from typing import List, Optional, Union from uuid import UUID -def get_direct_lake_lakehouse(dataset: str, workspace: Optional[str] = None, lakehouse: Optional[str] = None, lakehouse_workspace: Optional[str] = None): +def get_direct_lake_lakehouse( + dataset: str, + workspace: Optional[str] = None, + lakehouse: Optional[str] = None, + lakehouse_workspace: Optional[str] = None, +): """ Identifies the lakehouse used by a Direct Lake semantic model. @@ -29,7 +38,7 @@ def get_direct_lake_lakehouse(dataset: str, workspace: Optional[str] = None, lak ------- str, UUID The lakehouse name and lakehouse ID. - """ + """ if workspace == None: workspace_id = fabric.get_workspace_id() @@ -39,26 +48,25 @@ def get_direct_lake_lakehouse(dataset: str, workspace: Optional[str] = None, lak if lakehouse_workspace is None: lakehouse_workspace = workspace - + if lakehouse == None: lakehouse_id = fabric.get_lakehouse_id() lakehouse = resolve_lakehouse_name(lakehouse_id, lakehouse_workspace) - dfP = fabric.list_partitions(dataset = dataset, workspace = workspace) - dfP_filt = dfP[dfP['Mode'] == 'DirectLake'] + dfP = fabric.list_partitions(dataset=dataset, workspace=workspace) + dfP_filt = dfP[dfP["Mode"] == "DirectLake"] if len(dfP_filt) == 0: - print(f"ERROR: The '{dataset}' semantic model within the '{workspace}' workspace is not in Direct Lake mode.") + print( + f"ERROR: The '{dataset}' semantic model within the '{workspace}' workspace is not in Direct Lake mode." 
+ ) else: sqlEndpointId = get_direct_lake_sql_endpoint(dataset, workspace) - dfI = fabric.list_items(workspace = lakehouse_workspace, type = 'SQLEndpoint') - dfI_filt = dfI[dfI['Id'] == sqlEndpointId] - lakehouseName = dfI_filt['Display Name'].iloc[0] + dfI = fabric.list_items(workspace=lakehouse_workspace, type="SQLEndpoint") + dfI_filt = dfI[dfI["Id"] == sqlEndpointId] + lakehouseName = dfI_filt["Display Name"].iloc[0] lakehouseId = resolve_lakehouse_id(lakehouseName, lakehouse_workspace) return lakehouseName, lakehouseId - - - diff --git a/sempy_labs/GetSharedExpression.py b/sempy_labs/directlake/_get_shared_expression.py similarity index 54% rename from sempy_labs/GetSharedExpression.py rename to sempy_labs/directlake/_get_shared_expression.py index 032cf288..6b366273 100644 --- a/sempy_labs/GetSharedExpression.py +++ b/sempy_labs/directlake/_get_shared_expression.py @@ -1,11 +1,12 @@ -import sempy import sempy.fabric as fabric -from .HelperFunctions import resolve_lakehouse_name -from .ListFunctions import list_lakehouses -from typing import List, Optional, Union +from sempy_labs._helper_functions import resolve_lakehouse_name +from sempy_labs._list_functions import list_lakehouses +from typing import Optional -def get_shared_expression(lakehouse: Optional[str] = None, workspace: Optional[str] = None): +def get_shared_expression( + lakehouse: Optional[str] = None, workspace: Optional[str] = None +): """ Dynamically generates the M expression used by a Direct Lake model for a given lakehouse. @@ -34,17 +35,25 @@ def get_shared_expression(lakehouse: Optional[str] = None, workspace: Optional[s lakehouse_id = fabric.get_lakehouse_id() lakehouse = resolve_lakehouse_name(lakehouse_id) - dfL = list_lakehouses(workspace = workspace) - lakeDetail = dfL[dfL['Lakehouse Name'] == lakehouse] + dfL = list_lakehouses(workspace=workspace) + lakeDetail = dfL[dfL["Lakehouse Name"] == lakehouse] - sqlEPCS = lakeDetail['SQL Endpoint Connection String'].iloc[0] - sqlepid = lakeDetail['SQL Endpoint ID'].iloc[0] - provStatus = lakeDetail['SQL Endpoint Provisioning Status'].iloc[0] + sqlEPCS = lakeDetail["SQL Endpoint Connection String"].iloc[0] + sqlepid = lakeDetail["SQL Endpoint ID"].iloc[0] + provStatus = lakeDetail["SQL Endpoint Provisioning Status"].iloc[0] - if provStatus == 'InProgress': - print(f"The SQL Endpoint for the '{lakehouse}' lakehouse within the '{workspace}' workspace has not yet been provisioned. Please wait until it has been provisioned.") + if provStatus == "InProgress": + print( + f"The SQL Endpoint for the '{lakehouse}' lakehouse within the '{workspace}' workspace has not yet been provisioned. Please wait until it has been provisioned." 
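+            # The M expression below can only be generated once the lakehouse's SQL endpoint has finished provisioning.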
+ ) return - - sh = 'let\n\tdatabase = Sql.Database("' + sqlEPCS + '", "' + sqlepid + '")\nin\n\tdatabase' - return sh \ No newline at end of file + sh = ( + 'let\n\tdatabase = Sql.Database("' + + sqlEPCS + + '", "' + + sqlepid + + '")\nin\n\tdatabase' + ) + + return sh diff --git a/sempy_labs/ListDirectLakeModelCalcTables.py b/sempy_labs/directlake/_list_directlake_model_calc_tables.py similarity index 68% rename from sempy_labs/ListDirectLakeModelCalcTables.py rename to sempy_labs/directlake/_list_directlake_model_calc_tables.py index 3eac1966..77a0463e 100644 --- a/sempy_labs/ListDirectLakeModelCalcTables.py +++ b/sempy_labs/directlake/_list_directlake_model_calc_tables.py @@ -1,13 +1,13 @@ import sempy import sempy.fabric as fabric import pandas as pd -from .ListFunctions import list_tables, list_annotations -from typing import List, Optional, Union +from sempy_labs._list_functions import list_tables, list_annotations +from typing import Optional from sempy._utils._log import log + @log def list_direct_lake_model_calc_tables(dataset: str, workspace: Optional[str] = None): - """ Shows the calculated tables and their respective DAX expression for a Direct Lake model (which has been migrated from import/DirectQuery. @@ -30,23 +30,25 @@ def list_direct_lake_model_calc_tables(dataset: str, workspace: Optional[str] = workspace_id = fabric.get_workspace_id() workspace = fabric.resolve_workspace_name(workspace_id) - df = pd.DataFrame(columns=['Table Name', 'Source Expression']) + df = pd.DataFrame(columns=["Table Name", "Source Expression"]) - dfP = fabric.list_partitions(dataset = dataset, workspace = workspace) - dfP_filt = dfP[dfP['Mode'] == 'DirectLake'] + dfP = fabric.list_partitions(dataset=dataset, workspace=workspace) + dfP_filt = dfP[dfP["Mode"] == "DirectLake"] if len(dfP_filt) == 0: print(f"The '{dataset}' semantic model is not in Direct Lake mode.") else: dfA = list_annotations(dataset, workspace) dfT = list_tables(dataset, workspace) - dfA_filt = dfA[(dfA['Object Type'] == 'Model') & (dfA['Annotation Name'].isin(dfT['Name']))] + dfA_filt = dfA[ + (dfA["Object Type"] == "Model") & (dfA["Annotation Name"].isin(dfT["Name"])) + ] - for i,r in dfA_filt.iterrows(): - tName = r['Annotation Name'] - se = r['Annotation Value'] + for i, r in dfA_filt.iterrows(): + tName = r["Annotation Name"] + se = r["Annotation Value"] - new_data = {'Table Name': tName, 'Source Expression': se} + new_data = {"Table Name": tName, "Source Expression": se} df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) - return df \ No newline at end of file + return df diff --git a/sempy_labs/directlake/_show_unsupported_directlake_objects.py b/sempy_labs/directlake/_show_unsupported_directlake_objects.py new file mode 100644 index 00000000..6b335751 --- /dev/null +++ b/sempy_labs/directlake/_show_unsupported_directlake_objects.py @@ -0,0 +1,88 @@ +import sempy.fabric as fabric +import pandas as pd +from sempy_labs._list_functions import list_tables +from sempy_labs._helper_functions import format_dax_object_name +from typing import Optional, Tuple + + +def show_unsupported_direct_lake_objects( + dataset: str, workspace: Optional[str] = None +) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: + """ + Returns a list of a semantic model's objects which are not supported by Direct Lake based on [official documentation](https://learn.microsoft.com/power-bi/enterprise/directlake-overview#known-issues-and-limitations). + + Parameters + ---------- + dataset : str + Name of the semantic model. 
+ workspace : str, default=None + The Fabric workspace name. + Defaults to None which resolves to the workspace of the attached lakehouse + or if no lakehouse attached, resolves to the workspace of the notebook. + + Returns + ------- + pandas.DataFrame, pandas.DataFrame, pandas.DataFrame + 3 pandas dataframes showing objects in a semantic model which are not supported by Direct Lake. + """ + + pd.options.mode.chained_assignment = None + + if workspace == None: + workspace_id = fabric.get_workspace_id() + workspace = fabric.resolve_workspace_name(workspace_id) + + dfT = list_tables(dataset, workspace) + dfC = fabric.list_columns(dataset=dataset, workspace=workspace) + dfR = fabric.list_relationships(dataset=dataset, workspace=workspace) + + # Calc tables + dfT_filt = dfT[dfT["Type"] == "Calculated Table"] + dfT_filt.rename(columns={"Name": "Table Name"}, inplace=True) + t = dfT_filt[["Table Name", "Type"]] + + # Calc columns + dfC_filt = dfC[(dfC["Type"] == "Calculated") | (dfC["Data Type"] == "Binary")] + c = dfC_filt[["Table Name", "Column Name", "Type", "Data Type", "Source"]] + + # Relationships + dfC["Column Object"] = format_dax_object_name(dfC["Table Name"], dfC["Column Name"]) + dfR["From Object"] = format_dax_object_name(dfR["From Table"], dfR["From Column"]) + dfR["To Object"] = format_dax_object_name(dfR["To Table"], dfR["To Column"]) + merged_from = pd.merge( + dfR, dfC, left_on="From Object", right_on="Column Object", how="left" + ) + merged_to = pd.merge( + dfR, dfC, left_on="To Object", right_on="Column Object", how="left" + ) + + dfR["From Column Data Type"] = merged_from["Data Type"] + dfR["To Column Data Type"] = merged_to["Data Type"] + + dfR_filt = dfR[ + ( + (dfR["From Column Data Type"] == "DateTime") + | (dfR["To Column Data Type"] == "DateTime") + ) + | (dfR["From Column Data Type"] != dfR["To Column Data Type"]) + ] + r = dfR_filt[ + [ + "From Table", + "From Column", + "To Table", + "To Column", + "From Column Data Type", + "To Column Data Type", + ] + ] + + # print('Calculated Tables are not supported...') + # display(t) + # print("Learn more about Direct Lake limitations here: https://learn.microsoft.com/power-bi/enterprise/directlake-overview#known-issues-and-limitations") + # print('Calculated columns are not supported. 
Columns of binary data type are not supported.') + # display(c) + # print('Columns used for relationship cannot be of data type datetime and they also must be of the same data type.') + # display(r) + + return t, c, r diff --git a/sempy_labs/UpdateDirectLakeModelLakehouseConnection.py b/sempy_labs/directlake/_update_directlake_model_lakehouse_connection.py similarity index 51% rename from sempy_labs/UpdateDirectLakeModelLakehouseConnection.py rename to sempy_labs/directlake/_update_directlake_model_lakehouse_connection.py index 4f45c309..0ff42fe6 100644 --- a/sempy_labs/UpdateDirectLakeModelLakehouseConnection.py +++ b/sempy_labs/directlake/_update_directlake_model_lakehouse_connection.py @@ -1,12 +1,17 @@ import sempy import sempy.fabric as fabric from .GetSharedExpression import get_shared_expression -from .HelperFunctions import resolve_lakehouse_name -from .TOM import connect_semantic_model +from sempy_labs._helper_functions import resolve_lakehouse_name +from ..TOM import connect_semantic_model from typing import List, Optional, Union -def update_direct_lake_model_lakehouse_connection(dataset: str, workspace: Optional[str] = None, lakehouse: Optional[str] = None, lakehouse_workspace: Optional[str] = None): +def update_direct_lake_model_lakehouse_connection( + dataset: str, + workspace: Optional[str] = None, + lakehouse: Optional[str] = None, + lakehouse_workspace: Optional[str] = None, +): """ Remaps a Direct Lake semantic model's SQL Endpoint connection to a new lakehouse. @@ -29,7 +34,7 @@ def update_direct_lake_model_lakehouse_connection(dataset: str, workspace: Optio Returns ------- - """ + """ if workspace == None: workspace_id = fabric.get_workspace_id() @@ -45,27 +50,33 @@ def update_direct_lake_model_lakehouse_connection(dataset: str, workspace: Optio lakehouse = resolve_lakehouse_name(lakehouse_id, lakehouse_workspace) # Check if lakehouse is valid - dfI = fabric.list_items(workspace = lakehouse_workspace, type = 'Lakehouse') - dfI_filt = dfI[(dfI['Display Name'] == lakehouse)] + dfI = fabric.list_items(workspace=lakehouse_workspace, type="Lakehouse") + dfI_filt = dfI[(dfI["Display Name"] == lakehouse)] if len(dfI_filt) == 0: - print(f"The '{lakehouse}' lakehouse does not exist within the '{lakehouse_workspace}' workspace. Therefore it cannot be used to support the '{dataset}' semantic model within the '{workspace}' workspace.") + print( + f"The '{lakehouse}' lakehouse does not exist within the '{lakehouse_workspace}' workspace. Therefore it cannot be used to support the '{dataset}' semantic model within the '{workspace}' workspace." + ) + + dfP = fabric.list_partitions(dataset=dataset, workspace=workspace) + dfP_filt = dfP[dfP["Mode"] == "DirectLake"] - dfP = fabric.list_partitions(dataset = dataset, workspace = workspace) - dfP_filt = dfP[dfP['Mode'] == 'DirectLake'] - if len(dfP_filt) == 0: - print(f"The '{dataset}' semantic model is not in Direct Lake. This function is only applicable to Direct Lake semantic models.") + print( + f"The '{dataset}' semantic model is not in Direct Lake. This function is only applicable to Direct Lake semantic models." 
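+            # Only Direct Lake models expose the 'DatabaseQuery' expression that is remapped in the else branch below.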
+ ) else: - with connect_semantic_model(dataset=dataset, readonly=False, workspace=workspace) as tom: - - shEx = get_shared_expression(lakehouse,lakehouse_workspace) + with connect_semantic_model( + dataset=dataset, readonly=False, workspace=workspace + ) as tom: + + shEx = get_shared_expression(lakehouse, lakehouse_workspace) try: - tom.model.Expressions['DatabaseQuery'].Expression = shEx - print(f"The expression in the '{dataset}' semantic model has been updated to point to the '{lakehouse}' lakehouse in the '{lakehouse_workspace}' workspace.") + tom.model.Expressions["DatabaseQuery"].Expression = shEx + print( + f"The expression in the '{dataset}' semantic model has been updated to point to the '{lakehouse}' lakehouse in the '{lakehouse_workspace}' workspace." + ) except: - print(f"ERROR: The expression in the '{dataset}' semantic model was not updated.") - - - - + print( + f"ERROR: The expression in the '{dataset}' semantic model was not updated." + ) diff --git a/sempy_labs/UpdateDirectLakePartitionEntity.py b/sempy_labs/directlake/_update_directlake_partition_entity.py similarity index 58% rename from sempy_labs/UpdateDirectLakePartitionEntity.py rename to sempy_labs/directlake/_update_directlake_partition_entity.py index 35561abc..b1484e93 100644 --- a/sempy_labs/UpdateDirectLakePartitionEntity.py +++ b/sempy_labs/directlake/_update_directlake_partition_entity.py @@ -1,10 +1,14 @@ -import sempy import sempy.fabric as fabric -from .TOM import connect_semantic_model +from sempy_labs.TOM import connect_semantic_model from typing import List, Optional, Union -def update_direct_lake_partition_entity(dataset: str, table_name: Union[str, List[str]], entity_name: Union[str, List[str]], workspace: Optional[str] = None): +def update_direct_lake_partition_entity( + dataset: str, + table_name: Union[str, List[str]], + entity_name: Union[str, List[str]], + workspace: Optional[str] = None, +): """ Remaps a table (or tables) in a Direct Lake semantic model to a table in a lakehouse. @@ -20,11 +24,7 @@ def update_direct_lake_partition_entity(dataset: str, table_name: Union[str, Lis The Fabric workspace name in which the semantic model exists. Defaults to None which resolves to the workspace of the attached lakehouse or if no lakehouse attached, resolves to the workspace of the notebook. - - Returns - ------- - - """ + """ if workspace == None: workspace_id = fabric.get_workspace_id() @@ -37,26 +37,32 @@ def update_direct_lake_partition_entity(dataset: str, table_name: Union[str, Lis table_name = [table_name] if isinstance(entity_name, str): entity_name = [entity_name] - + if len(table_name) != len(entity_name): - print(f"ERROR: The 'table_name' and 'entity_name' arrays must be of equal length.") + print( + f"ERROR: The 'table_name' and 'entity_name' arrays must be of equal length." + ) return - - with connect_semantic_model(dataset=dataset, readonly=False, workspace=workspace) as tom: + + with connect_semantic_model( + dataset=dataset, readonly=False, workspace=workspace + ) as tom: if not tom.is_direct_lake(): - print(f"The '{dataset}' semantic model within the '{workspace}' workspace is not in Direct Lake mode.") + print( + f"The '{dataset}' semantic model within the '{workspace}' workspace is not in Direct Lake mode." 
+            )
             return

         for tName in table_name:
             i = table_name.index(tName)
             eName = entity_name[i]
             try:
-                tom.model.Tables[tName].Partitions[0].EntityName = eName
-                print(f"The '{tName}' table in the '{dataset}' semantic model has been updated to point to the '{eName}' table in the '{lakehouse}' lakehouse within the '{lakehouse_workspace}' workspace.")
+                tom.model.Tables[tName].Partitions[0].EntityName = eName
+                print(
+                    f"The '{tName}' table in the '{dataset}' semantic model has been updated to point to the '{eName}' table in the lakehouse."
+                )
             except:
-                print(f"ERROR: The '{tName}' table in the '{dataset}' semantic model has not been updated.")
-
-
-
-
+                print(
+                    f"ERROR: The '{tName}' table in the '{dataset}' semantic model has not been updated."
+                )
diff --git a/sempy_labs/lakehouse/__init__.py b/sempy_labs/lakehouse/__init__.py
new file mode 100644
index 00000000..af2664d9
--- /dev/null
+++ b/sempy_labs/lakehouse/__init__.py
@@ -0,0 +1,10 @@
+from sempy_labs.lakehouse._get_lakehouse_columns import (
+    get_lakehouse_columns as get_lakehouse_columns,
+)
+from sempy_labs.lakehouse._get_lakehouse_tables import (
+    get_lakehouse_tables as get_lakehouse_tables,
+)
+from sempy_labs.lakehouse._lakehouse import (
+    lakehouse_attached as lakehouse_attached,
+    optimize_lakehouse_tables as optimize_lakehouse_tables,
+)
diff --git a/sempy_labs/GetLakehouseColumns.py b/sempy_labs/lakehouse/_get_lakehouse_columns.py
similarity index 58%
rename from sempy_labs/GetLakehouseColumns.py
rename to sempy_labs/lakehouse/_get_lakehouse_columns.py
index 56807281..ebc27b30 100644
--- a/sempy_labs/GetLakehouseColumns.py
+++ b/sempy_labs/lakehouse/_get_lakehouse_columns.py
@@ -1,14 +1,18 @@
-import sempy
 import sempy.fabric as fabric
 import pandas as pd
 from pyspark.sql import SparkSession
-from delta import DeltaTable
-from .HelperFunctions import resolve_lakehouse_name, format_dax_object_name, resolve_lakehouse_id
-from .GetLakehouseTables import get_lakehouse_tables
-from typing import List, Optional, Union
+from delta import DeltaTable
+from sempy_labs._helper_functions import (
+    resolve_lakehouse_name,
+    format_dax_object_name,
+    resolve_lakehouse_id,
+)
+from typing import Optional
+from sempy_labs.lakehouse._get_lakehouse_tables import get_lakehouse_tables


-def get_lakehouse_columns(lakehouse: Optional[str] = None, workspace: Optional[str] = None):
+def get_lakehouse_columns(
+    lakehouse: Optional[str] = None, workspace: Optional[str] = None
+):
     """
     Shows the tables and columns of a lakehouse and their respective properties.

@@ -28,7 +32,16 @@ def get_lakehouse_columns(lakehouse: Optional[str] = None, workspace: Optional[s
         Shows the tables/columns within a lakehouse and their properties.
""" - df = pd.DataFrame(columns=['Workspace Name', 'Lakehouse Name', 'Table Name', 'Column Name', 'Full Column Name', 'Data Type']) + df = pd.DataFrame( + columns=[ + "Workspace Name", + "Lakehouse Name", + "Table Name", + "Column Name", + "Full Column Name", + "Data Type", + ] + ) if workspace == None: workspace_id = fabric.get_workspace_id() @@ -44,18 +57,27 @@ def get_lakehouse_columns(lakehouse: Optional[str] = None, workspace: Optional[s spark = SparkSession.builder.getOrCreate() - tables = get_lakehouse_tables(lakehouse = lakehouse, workspace = workspace, extended = False, count_rows = False) - tables_filt = tables[tables['Format'] == 'delta'] + tables = get_lakehouse_tables( + lakehouse=lakehouse, workspace=workspace, extended=False, count_rows=False + ) + tables_filt = tables[tables["Format"] == "delta"] for i, r in tables_filt.iterrows(): - tName = r['Table Name'] - tPath = r['Location'] + tName = r["Table Name"] + tPath = r["Location"] delta_table = DeltaTable.forPath(spark, tPath) sparkdf = delta_table.toDF() for cName, data_type in sparkdf.dtypes: tc = format_dax_object_name(tName, cName) - new_data = {'Workspace Name': workspace, 'Lakehouse Name': lakehouse, 'Table Name': tName, 'Column Name': cName, 'Full Column Name': tc, 'Data Type': data_type} + new_data = { + "Workspace Name": workspace, + "Lakehouse Name": lakehouse, + "Table Name": tName, + "Column Name": cName, + "Full Column Name": tc, + "Data Type": data_type, + } df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) - return df \ No newline at end of file + return df diff --git a/sempy_labs/lakehouse/_get_lakehouse_tables.py b/sempy_labs/lakehouse/_get_lakehouse_tables.py new file mode 100644 index 00000000..a38c7d6d --- /dev/null +++ b/sempy_labs/lakehouse/_get_lakehouse_tables.py @@ -0,0 +1,248 @@ +import sempy +import sempy.fabric as fabric +import pandas as pd +from pyspark.sql import SparkSession +import pyarrow.parquet as pq +import datetime +from sempy_labs._helper_functions import resolve_lakehouse_id, resolve_lakehouse_name +from ..Guardrails import get_sku_size, get_directlake_guardrails_for_sku +from sempy_labs.lakehouse._lakehouse import lakehouse_attached +from typing import Optional + + +def get_lakehouse_tables( + lakehouse: Optional[str] = None, + workspace: Optional[str] = None, + extended: Optional[bool] = False, + count_rows: Optional[bool] = False, + export: Optional[bool] = False, +): + """ + Shows the tables of a lakehouse and their respective properties. Option to include additional properties relevant to Direct Lake guardrails. + + Parameters + ---------- + lakehouse : str, default=None + The Fabric lakehouse. + Defaults to None which resolves to the lakehouse attached to the notebook. + lakehouse_workspace : str, default=None + The Fabric workspace used by the lakehouse. + Defaults to None which resolves to the workspace of the attached lakehouse + or if no lakehouse attached, resolves to the workspace of the notebook. + extended : bool, default=False + Obtains additional columns relevant to the size of each table. + count_rows : bool, default=False + Obtains a row count for each lakehouse table. + export : bool, default=False + Exports the resulting dataframe to a delta table in the lakehouse. + + Returns + ------- + pandas.DataFrame + Shows the tables/columns within a lakehouse and their properties. 
+ """ + + df = pd.DataFrame( + columns=[ + "Workspace Name", + "Lakehouse Name", + "Table Name", + "Format", + "Type", + "Location", + ] + ) + + if workspace == None: + workspace_id = fabric.get_workspace_id() + workspace = fabric.resolve_workspace_name(workspace_id) + else: + workspace_id = fabric.resolve_workspace_id(workspace) + + if lakehouse == None: + lakehouse_id = fabric.get_lakehouse_id() + lakehouse = resolve_lakehouse_name(lakehouse_id, workspace) + else: + lakehouse_id = resolve_lakehouse_id(lakehouse, workspace) + + if count_rows: # Setting countrows defaults to extended=True + extended = True + + client = fabric.FabricRestClient() + response = client.get( + f"/v1/workspaces/{workspace_id}/lakehouses/{lakehouse_id}/tables" + ) + + for i in response.json()["data"]: + tName = i["name"] + tType = i["type"] + tFormat = i["format"] + tLocation = i["location"] + if extended == False: + new_data = { + "Workspace Name": workspace, + "Lakehouse Name": lakehouse, + "Table Name": tName, + "Format": tFormat, + "Type": tType, + "Location": tLocation, + } + df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) + else: + sku_value = get_sku_size(workspace) + guardrail = get_directlake_guardrails_for_sku(sku_value) + + spark = SparkSession.builder.getOrCreate() + + intColumns = ["Files", "Row Groups", "Table Size"] + if tType == "Managed" and tFormat == "delta": + detail_df = spark.sql(f"DESCRIBE DETAIL `{tName}`").collect()[0] + num_files = detail_df.numFiles + size_in_bytes = detail_df.sizeInBytes + + delta_table_path = f"Tables/{tName}" + latest_files = ( + spark.read.format("delta").load(delta_table_path).inputFiles() + ) + file_paths = [f.split("/")[-1] for f in latest_files] + + # Handle FileNotFoundError + num_rowgroups = 0 + for filename in file_paths: + try: + num_rowgroups += pq.ParquetFile( + f"/lakehouse/default/{delta_table_path}/{filename}" + ).num_row_groups + except FileNotFoundError: + continue + + if count_rows: + num_rows = spark.table(tName).count() + intColumns.append("Row Count") + new_data = { + "Workspace Name": workspace, + "Lakehouse Name": lakehouse, + "Table Name": tName, + "Format": tFormat, + "Type": tType, + "Location": tLocation, + "Files": num_files, + "Row Groups": num_rowgroups, + "Row Count": num_rows, + "Table Size": size_in_bytes, + } + else: + new_data = { + "Workspace Name": workspace, + "Lakehouse Name": lakehouse, + "Table Name": tName, + "Format": tFormat, + "Type": tType, + "Location": tLocation, + "Files": num_files, + "Row Groups": num_rowgroups, + "Table Size": size_in_bytes, + } + + df = pd.concat( + [df, pd.DataFrame(new_data, index=[0])], ignore_index=True + ) + df[intColumns] = df[intColumns].astype(int) + + df["SKU"] = guardrail["Fabric SKUs"].iloc[0] + df["Parquet File Guardrail"] = guardrail["Parquet files per table"].iloc[0] + df["Row Group Guardrail"] = guardrail["Row groups per table"].iloc[0] + df["Row Count Guardrail"] = ( + guardrail["Rows per table (millions)"].iloc[0] * 1000000 + ) + + df["Parquet File Guardrail Hit"] = ( + df["Files"] > df["Parquet File Guardrail"] + ) + df["Row Group Guardrail Hit"] = df["Row Groups"] > df["Row Group Guardrail"] + + if count_rows: + df["Row Count Guardrail Hit"] = ( + df["Row Count"] > df["Row Count Guardrail"] + ) + + if export: + lakeAttach = lakehouse_attached() + if lakeAttach == False: + print( + f"In order to save the report.json file, a lakehouse must be attached to the notebook. Please attach a lakehouse to this notebook." 
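+                # Export appends the results to the 'lakehouse_table_details' delta table, so a lakehouse must be attached to this notebook.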
+ ) + return + spark = SparkSession.builder.getOrCreate() + + lakehouse_id = fabric.get_lakehouse_id() + lakehouse = resolve_lakehouse_name( + lakehouse_id=lakehouse_id, workspace=workspace + ) + lakeTName = "lakehouse_table_details" + lakeT_filt = df[df["Table Name"] == lakeTName] + + query = f"SELECT MAX(RunId) FROM {lakehouse}.{lakeTName}" + + if len(lakeT_filt) == 0: + runId = 1 + else: + dfSpark = spark.sql(query) + maxRunId = dfSpark.collect()[0][0] + runId = maxRunId + 1 + + export_df = df.copy() + + cols = [ + "Files", + "Row Groups", + "Row Count", + "Table Size", + "SKU", + "Parquet File Guardrail", + "Row Group Guardrail", + "Row Count Guardrail", + "Parquet File Guardrail Hit", + "Row Group Guardrail Hit", + "Row Count Guardrail Hit", + ] + + for c in cols: + if c not in export_df: + if c in [ + "Files", + "Row Groups", + "Row Count", + "Table Size", + "Parquet File Guardrail", + "Row Group Guardrail", + "Row Count Guardrail", + ]: + export_df[c] = 0 + export_df[c] = export_df[c].astype(int) + elif c in ["SKU"]: + export_df[c] = None + export_df[c] = export_df[c].astype(str) + elif c in [ + "Parquet File Guardrail Hit", + "Row Group Guardrail Hit", + "Row Count Guardrail Hit", + ]: + export_df[c] = False + export_df[c] = export_df[c].astype(bool) + + print( + f"Saving Lakehouse table properties to the '{lakeTName}' table in the lakehouse...\n" + ) + now = datetime.datetime.now() + export_df["Timestamp"] = now + export_df["RunId"] = runId + + export_df.columns = export_df.columns.str.replace(" ", "_") + spark_df = spark.createDataFrame(export_df) + spark_df.write.mode("append").format("delta").saveAsTable(lakeTName) + print( + f"\u2022 Lakehouse table properties have been saved to the '{lakeTName}' delta table." + ) + + return df diff --git a/sempy_labs/Lakehouse.py b/sempy_labs/lakehouse/_lakehouse.py similarity index 67% rename from sempy_labs/Lakehouse.py rename to sempy_labs/lakehouse/_lakehouse.py index eb65e010..eebc5f3d 100644 --- a/sempy_labs/Lakehouse.py +++ b/sempy_labs/lakehouse/_lakehouse.py @@ -1,35 +1,34 @@ -import sempy import sempy.fabric as fabric from tqdm.auto import tqdm from pyspark.sql import SparkSession -from delta import DeltaTable -from .HelperFunctions import resolve_lakehouse_name +from sempy_labs._helper_functions import resolve_lakehouse_name from typing import List, Optional, Union -def lakehouse_attached() -> bool: +def lakehouse_attached() -> bool: """ Identifies if a lakehouse is attached to the notebook. - Parameters - ---------- - Returns ------- bool Returns True if a lakehouse is attached to the notebook. - """ + """ spark = SparkSession.builder.getOrCreate() - lakeId = spark.conf.get('trident.lakehouse.id') - + lakeId = spark.conf.get("trident.lakehouse.id") + if len(lakeId) > 0: return True else: return False -def optimize_lakehouse_tables(tables: Optional[Union[str, List[str]]] = None, lakehouse: Optional[str] = None, workspace: Optional[str] = None): +def optimize_lakehouse_tables( + tables: Optional[Union[str, List[str]]] = None, + lakehouse: Optional[str] = None, + workspace: Optional[str] = None, +): """ Runs the [OPTIMIZE](https://docs.delta.io/latest/optimizations-oss.html) function over the specified lakehouse tables. @@ -44,30 +43,26 @@ def optimize_lakehouse_tables(tables: Optional[Union[str, List[str]]] = None, la The Fabric workspace used by the lakehouse. Defaults to None which resolves to the workspace of the attached lakehouse or if no lakehouse attached, resolves to the workspace of the notebook. 
-
-    Returns
-    -------
-
     """
-    from .GetLakehouseTables import get_lakehouse_tables
+    from sempy_labs.lakehouse._get_lakehouse_tables import get_lakehouse_tables
+    from delta import DeltaTable

     if workspace == None:
         workspace_id = fabric.get_workspace_id()
         workspace = fabric.resolve_workspace_name(workspace_id)
-
+
     if lakehouse == None:
         lakehouse_id = fabric.get_lakehouse_id()
         lakehouse = resolve_lakehouse_name(lakehouse_id, workspace)

-    lakeTables = get_lakehouse_tables(lakehouse = lakehouse, workspace = workspace)
-    lakeTablesDelta = lakeTables[lakeTables['Format'] == 'delta']
+    lakeTables = get_lakehouse_tables(lakehouse=lakehouse, workspace=workspace)
+    lakeTablesDelta = lakeTables[lakeTables["Format"] == "delta"]

     if isinstance(tables, str):
         tables = [tables]

     if tables is not None:
-        tables_filt = lakeTablesDelta[lakeTablesDelta['Table Name'].isin(tables)]
+        tables_filt = lakeTablesDelta[lakeTablesDelta["Table Name"].isin(tables)]
     else:
         tables_filt = lakeTablesDelta.copy()

@@ -75,12 +70,14 @@ def optimize_lakehouse_tables(tables: Optional[Union[str, List[str]]] = None, la

     spark = SparkSession.builder.getOrCreate()

-    i=1
-    for index, r in (bar := tqdm(tables_filt.iterrows())):
-        tableName = r['Table Name']
-        tablePath = r['Location']
+    i = 1
+    for _, r in (bar := tqdm(tables_filt.iterrows())):
+        tableName = r["Table Name"]
+        tablePath = r["Location"]
         bar.set_description(f"Optimizing the '{tableName}' table...")
         deltaTable = DeltaTable.forPath(spark, tablePath)
         deltaTable.optimize().executeCompaction()
-        print(f"The '{tableName}' table has been optimized. ({str(i)}/{str(tableCount)})")
-        i+=1
+        print(
+            f"The '{tableName}' table has been optimized. ({str(i)}/{str(tableCount)})"
+        )
+        i += 1
diff --git a/sempy_labs/migration/__init__.py b/sempy_labs/migration/__init__.py
new file mode 100644
index 00000000..d90bec0b
--- /dev/null
+++ b/sempy_labs/migration/__init__.py
@@ -0,0 +1,16 @@
+from sempy_labs.migration._migrate_calctables_to_lakehouse import (
+    migrate_calc_tables_to_lakehouse as migrate_calc_tables_to_lakehouse,
+    migrate_field_parameters as migrate_field_parameters,
+)
+from sempy_labs.migration._migrate_calctables_to_semantic_model import (
+    migrate_calc_tables_to_semantic_model as migrate_calc_tables_to_semantic_model,
+)
+from sempy_labs.migration._migrate_model_objects_to_semantic_model import (
+    migrate_model_objects_to_semantic_model as migrate_model_objects_to_semantic_model,
+)
+from sempy_labs.migration._migrate_tables_columns_to_semantic_model import (
+    migrate_tables_columns_to_semantic_model as migrate_tables_columns_to_semantic_model,
+)
+from sempy_labs.migration._migration_validation import (
+    migration_validation as migration_validation,
+)
diff --git a/sempy_labs/migration/_migrate_calctables_to_lakehouse.py b/sempy_labs/migration/_migrate_calctables_to_lakehouse.py
new file mode 100644
index 00000000..27a0f49f
--- /dev/null
+++ b/sempy_labs/migration/_migrate_calctables_to_lakehouse.py
@@ -0,0 +1,433 @@
+import sempy
+import sempy.fabric as fabric
+import pandas as pd
+import re, datetime, time
+from sempy_labs.lakehouse._get_lakehouse_tables import get_lakehouse_tables
+from sempy_labs._helper_functions import (
+    resolve_lakehouse_name,
+    resolve_lakehouse_id,
+    create_abfss_path,
+)
+from sempy_labs.TOM import connect_semantic_model
+from pyspark.sql import SparkSession
+from typing import List, Optional, Union
+from sempy._utils._log import log
+import sempy_labs._icons as icons
+
+
+@log
+def migrate_calc_tables_to_lakehouse(
+    dataset: str,
+    new_dataset: str,
+    workspace: Optional[str] = None,
+    new_dataset_workspace:
Optional[str] = None, + lakehouse: Optional[str] = None, + lakehouse_workspace: Optional[str] = None, +): + """ + Creates delta tables in your lakehouse based on the DAX expression of a calculated table in an import/DirectQuery semantic model. The DAX expression encapsulating the calculated table logic is stored in the new Direct Lake semantic model as model annotations. + + Parameters + ---------- + dataset : str + Name of the import/DirectQuery semantic model. + new_dataset : str + Name of the Direct Lake semantic model. + workspace : str, default=None + The Fabric workspace name in which the import/DirectQuery semantic model exists. + Defaults to None which resolves to the workspace of the attached lakehouse + or if no lakehouse attached, resolves to the workspace of the notebook. + new_dataset_workspace : str + The Fabric workspace name in which the Direct Lake semantic model will be created. + Defaults to None which resolves to the workspace of the attached lakehouse + or if no lakehouse attached, resolves to the workspace of the notebook. + lakehouse : str, default=None + The Fabric lakehouse used by the Direct Lake semantic model. + Defaults to None which resolves to the lakehouse attached to the notebook. + lakehouse_workspace : str, default=None + The Fabric workspace used by the lakehouse. + Defaults to None which resolves to the workspace of the attached lakehouse + or if no lakehouse attached, resolves to the workspace of the notebook. + """ + + if workspace == None: + workspace_id = fabric.get_workspace_id() + workspace = fabric.resolve_workspace_name(workspace_id) + else: + workspace_id = fabric.resolve_workspace_id(workspace) + + if new_dataset_workspace == None: + new_dataset_workspace = workspace + + if lakehouse_workspace == None: + lakehouse_workspace = new_dataset_workspace + lakehouse_workspace_id = fabric.resolve_workspace_id(lakehouse_workspace) + else: + lakehouse_workspace_id = fabric.resolve_workspace_id(lakehouse_workspace) + + if lakehouse == None: + lakehouse_id = fabric.get_lakehouse_id() + lakehouse = resolve_lakehouse_name(lakehouse_id, lakehouse_workspace) + else: + lakehouse_id = resolve_lakehouse_id(lakehouse, lakehouse_workspace) + + dfC = fabric.list_columns(dataset=dataset, workspace=workspace) + # dfC['Column Object'] = "'" + dfC['Table Name'] + "'[" + dfC['Column Name'] + "]" + dfP = fabric.list_partitions(dataset=dataset, workspace=workspace) + dfP_filt = dfP[(dfP["Source Type"] == "Calculated")] + dfP_filt = dfP_filt[ + ~dfP_filt["Query"].str.contains("NAMEOF") + ] # Remove field parameters + # dfC_CalcColumn = dfC[dfC['Type'] == 'Calculated'] + lakeTables = get_lakehouse_tables(lakehouse, lakehouse_workspace) + + # Do not execute the function if lakehouse tables already exist with the same name + killFunction = False + for i, r in dfP_filt.iterrows(): + tName = r["Table Name"] + dtName = tName.replace(" ", "_") + + if dtName in lakeTables["Table Name"].values: + print( + f"{icons.red_dot} The '{tName}' table already exists as '{dtName}' in the '{lakehouse}' lakehouse in the '{workspace}' workspace." + ) + killFunction = True + + if killFunction: + return + + spark = SparkSession.builder.getOrCreate() + + if len(dfP_filt) == 0: + print( + f"{icons.yellow_dot} The '{dataset}' semantic model in the '{workspace}' workspace has no calculated tables." 
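+            # Field parameters are migrated separately (see migrate_field_parameters), so only true calculated tables are counted here.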
+ ) + return + + start_time = datetime.datetime.now() + timeout = datetime.timedelta(minutes=1) + success = False + + while not success: + try: + with connect_semantic_model( + dataset=dataset, workspace=workspace, readonly=True + ) as tom: + success = True + for t in tom.model.Tables: + if tom.is_auto_date_table(table_name=t.Name): + print( + f"{icons.yellow_dot} The '{t.Name}' table is an auto-datetime table and is not supported in the Direct Lake migration process. Please create a proper Date/Calendar table in your lakehoues and use it in your Direct Lake model." + ) + else: + for p in t.Partitions: + if str(p.SourceType) == "Calculated": + query = p.Source.Expression + if "NAMEOF" not in query: # exclude field parameters + daxQuery = "" + if query.lower().startswith("calendar") and any( + str(c.Type) == "Calculated" for c in t.Columns + ): + daxQuery = f"ADDCOLUMNS(\n{query}," + for c in t.Columns: + if str(c.Type) == "Calculated": + expr = c.Expression + expr = expr.replace( + f"'{t.Name}'", "" + ).replace(f"{t.Name}[Date]", "[Date]") + expr = expr.replace( + "[MonthNo]", "MONTH([Date])" + ).replace( + "[QuarterNo]", + "INT((MONTH([Date]) + 2) / 3)", + ) + daxQuery = ( + f'{daxQuery}\n"{c.Name}",{expr},' + ) + daxQuery = ( + "EVALUATE\n" + daxQuery.rstrip(",") + "\n)" + ) + else: + daxQuery = f"EVALUATE\n{query}" + daxQueryTopN = ( + daxQuery.replace( + "EVALUATE\n", "EVALUATE\nTOPN(1," + ) + + ")" + ) + + try: + df = fabric.evaluate_dax( + dataset=dataset, + dax_string=daxQueryTopN, + workspace=workspace, + ) + + for col in df.columns: + pattern = r"\[([^\]]+)\]" + + matches = re.findall(pattern, col) + new_column_name = matches[0].replace( + " ", "" + ) + + df.rename( + columns={col: new_column_name}, + inplace=True, + ) + + try: + dataType = next( + str(c.DataType) + for c in tom.model.Tables[ + t.Name + ].Columns + if str(c.Type) + == "CalculatedTableColumn" + and c.SourceColumn == col + ) + except: + dataType = next( + str(c.DataType) + for c in tom.model.Tables[ + t.Name + ].Columns + if str(c.Type) == "Calculated" + and c.Name == new_column_name + ) + + if dataType == "Int64": + df[new_column_name] = df[ + new_column_name + ].astype(int) + elif dataType in ["Decimal", "Double"]: + df[new_column_name] = df[ + new_column_name + ].astype(float) + elif dataType == "Boolean": + df[new_column_name] = df[ + new_column_name + ].astype(bool) + elif dataType == "DateTime": + df[new_column_name] = pd.to_datetime( + df[new_column_name] + ) + + delta_table_name = t.Name.replace( + " ", "_" + ).lower() + + spark_df = spark.createDataFrame(df) + filePath = create_abfss_path( + lakehouse_id=lakehouse_id, + lakehouse_workspace_id=lakehouse_workspace_id, + delta_table_name=delta_table_name, + ) + spark_df.write.mode("overwrite").format( + "delta" + ).save(filePath) + + start_time2 = datetime.datetime.now() + timeout2 = datetime.timedelta(minutes=1) + success2 = False + + while not success2: + try: + with connect_semantic_model( + dataset=new_dataset, + readonly=False, + workspace=new_dataset_workspace, + ) as tom2: + success2 = True + tom2.set_annotation( + object=tom2.model, + name=t.Name, + value=daxQuery, + ) + except Exception as e: + if ( + datetime.datetime.now() + - start_time2 + > timeout2 + ): + break + time.sleep(1) + + print( + f"{icons.green_dot} Calculated table '{t.Name}' has been created as delta table '{delta_table_name.lower()}' in the '{lakehouse}' lakehouse within the '{lakehouse_workspace}' workspace." 
+ ) + except: + print( + f"{icons.red_dot} Failed to create calculated table '{t.Name}' as a delta table in the lakehouse." + ) + except Exception as e: + if datetime.datetime.now() - start_time > timeout: + break + time.sleep(1) + + +@log +def migrate_field_parameters( + dataset: str, + new_dataset: str, + workspace: Optional[str] = None, + new_dataset_workspace: Optional[str] = None, +): + """ + Migrates field parameters from one semantic model to another. + + Parameters + ---------- + dataset : str + Name of the import/DirectQuery semantic model. + new_dataset : str + Name of the Direct Lake semantic model. + workspace : str, default=None + The Fabric workspace name in which the import/DirectQuery semantic model exists. + Defaults to None which resolves to the workspace of the attached lakehouse + or if no lakehouse attached, resolves to the workspace of the notebook. + new_dataset_workspace : str + The Fabric workspace name in which the Direct Lake semantic model will be created. + Defaults to None which resolves to the workspace of the attached lakehouse + or if no lakehouse attached, resolves to the workspace of the notebook. + """ + + from .HelperFunctions import format_dax_object_name + + sempy.fabric._client._utils._init_analysis_services() + import Microsoft.AnalysisServices.Tabular as TOM + + if workspace == None: + workspace_id = fabric.get_workspace_id() + workspace = fabric.resolve_workspace_name(workspace_id) + + if new_dataset_workspace == None: + new_dataset_workspace = workspace + + dfC = fabric.list_columns(dataset=dataset, workspace=workspace) + dfC["Column Object"] = format_dax_object_name(dfC["Table Name"], dfC["Column Name"]) + dfP = fabric.list_partitions(dataset=dataset, workspace=workspace) + dfP_filt = dfP[(dfP["Source Type"] == "Calculated")] + dfP_filt = dfP_filt[ + dfP_filt["Query"].str.contains("NAMEOF") + ] # Only field parameters + dfC_CalcColumn = dfC[dfC["Type"] == "Calculated"] + + if len(dfP_filt) == 0: + print( + f"{icons.green_dot} The '{dataset}' semantic model in the '{workspace}' workspace has no field parameters." 
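+            # Field parameters are detected as calculated partitions whose DAX query contains NAMEOF.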
+ ) + return + + start_time = datetime.datetime.now() + timeout = datetime.timedelta(minutes=1) + success = False + + while not success: + try: + with connect_semantic_model( + dataset=new_dataset, workspace=new_dataset_workspace, readonly=False + ) as tom: + success = True + + for i, r in dfP_filt.iterrows(): + tName = r["Table Name"] + query = r["Query"] + + # For field parameters, remove calc columns from the query + rows = query.strip().split("\n") + filtered_rows = [ + row + for row in rows + if not any( + value in row + for value in dfC_CalcColumn["Column Object"].values + ) + ] + updated_query_string = "\n".join(filtered_rows) + + # Remove extra comma + lines = updated_query_string.strip().split("\n") + lines[-2] = lines[-2].rstrip(",") + expr = "\n".join(lines) + + try: + par = TOM.Partition() + par.Name = tName + + parSource = TOM.CalculatedPartitionSource() + par.Source = parSource + parSource.Expression = expr + + tbl = TOM.Table() + tbl.Name = tName + tbl.Partitions.Add(par) + + columns = ["Value1", "Value2", "Value3"] + + for colName in columns: + col = TOM.CalculatedTableColumn() + col.Name = colName + col.SourceColumn = "[" + colName + "]" + col.DataType = TOM.DataType.String + + tbl.Columns.Add(col) + + tom.model.Tables.Add(tbl) + + ep = TOM.JsonExtendedProperty() + ep.Name = "ParameterMetadata" + ep.Value = '{"version":3,"kind":2}' + + rcd = TOM.RelatedColumnDetails() + gpc = TOM.GroupByColumn() + gpc.GroupingColumn = tom.model.Tables[tName].Columns["Value2"] + rcd.GroupByColumns.Add(gpc) + + # Update column properties + tom.model.Tables[tName].Columns["Value2"].IsHidden = True + tom.model.Tables[tName].Columns["Value3"].IsHidden = True + tom.model.Tables[tName].Columns[ + "Value3" + ].DataType = TOM.DataType.Int64 + tom.model.Tables[tName].Columns["Value1"].SortByColumn = ( + tom.model.Tables[tName].Columns["Value3"] + ) + tom.model.Tables[tName].Columns["Value2"].SortByColumn = ( + tom.model.Tables[tName].Columns["Value3"] + ) + tom.model.Tables[tName].Columns[ + "Value2" + ].ExtendedProperties.Add(ep) + tom.model.Tables[tName].Columns[ + "Value1" + ].RelatedColumnDetails = rcd + + dfC_filt1 = dfC[ + (dfC["Table Name"] == tName) & (dfC["Source"] == "[Value1]") + ] + col1 = dfC_filt1["Column Name"].iloc[0] + dfC_filt2 = dfC[ + (dfC["Table Name"] == tName) & (dfC["Source"] == "[Value2]") + ] + col2 = dfC_filt2["Column Name"].iloc[0] + dfC_filt3 = dfC[ + (dfC["Table Name"] == tName) & (dfC["Source"] == "[Value3]") + ] + col3 = dfC_filt3["Column Name"].iloc[0] + + tom.model.Tables[tName].Columns["Value1"].Name = col1 + tom.model.Tables[tName].Columns["Value2"].Name = col2 + tom.model.Tables[tName].Columns["Value3"].Name = col3 + + print( + f"{icons.green_dot} The '{tName}' table has been added as a field parameter to the '{new_dataset}' semantic model in the '{new_dataset_workspace}' workspace." + ) + except: + print( + f"{icons.red_dot} The '{tName}' table has not been added as a field parameter." 
+ ) + except Exception as e: + if datetime.datetime.now() - start_time > timeout: + break + time.sleep(1) diff --git a/sempy_labs/migration/_migrate_calctables_to_semantic_model.py b/sempy_labs/migration/_migrate_calctables_to_semantic_model.py new file mode 100644 index 00000000..470c3942 --- /dev/null +++ b/sempy_labs/migration/_migrate_calctables_to_semantic_model.py @@ -0,0 +1,153 @@ +import sempy.fabric as fabric +import re, datetime, time +from .lakehouse.GetLakehouseTables import get_lakehouse_tables +from .HelperFunctions import resolve_lakehouse_name +from .TOM import connect_semantic_model +from typing import Optional +from sempy._utils._log import log +import sempy_labs._icons as icons + + +@log +def migrate_calc_tables_to_semantic_model( + dataset: str, + new_dataset: str, + workspace: Optional[str] = None, + new_dataset_workspace: Optional[str] = None, + lakehouse: Optional[str] = None, + lakehouse_workspace: Optional[str] = None, +): + """ + Creates new tables in the Direct Lake semantic model based on the lakehouse tables created using the 'migrate_calc_tables_to_lakehouse' function. + + Parameters + ---------- + dataset : str + Name of the import/DirectQuery semantic model. + new_dataset : str + Name of the Direct Lake semantic model. + workspace : str, default=None + The Fabric workspace name in which the import/DirectQuery semantic model exists. + Defaults to None which resolves to the workspace of the attached lakehouse + or if no lakehouse attached, resolves to the workspace of the notebook. + new_dataset_workspace : str + The Fabric workspace name in which the Direct Lake semantic model will be created. + Defaults to None which resolves to the workspace of the attached lakehouse + or if no lakehouse attached, resolves to the workspace of the notebook. + lakehouse : str, default=None + The Fabric lakehouse used by the Direct Lake semantic model. + Defaults to None which resolves to the lakehouse attached to the notebook. + lakehouse_workspace : str, default=None + The Fabric workspace used by the lakehouse. + Defaults to None which resolves to the workspace of the attached lakehouse + or if no lakehouse attached, resolves to the workspace of the notebook. 
+ """ + + if workspace == None: + workspace_id = fabric.get_workspace_id() + workspace = fabric.resolve_workspace_name(workspace_id) + else: + workspace_id = fabric.resolve_workspace_id(workspace) + + if new_dataset_workspace == None: + new_dataset_workspace = workspace + + if lakehouse_workspace == None: + lakehouse_workspace = new_dataset_workspace + if lakehouse == None: + lakehouse_id = fabric.get_lakehouse_id() + lakehouse = resolve_lakehouse_name(lakehouse_id, lakehouse_workspace) + + # Get calc tables but not field parameters + dfP = fabric.list_partitions(dataset=dataset, workspace=workspace) + dfP_filt = dfP[(dfP["Source Type"] == "Calculated")] + dfP_filt = dfP_filt[~dfP_filt["Query"].str.contains("NAMEOF")] + + dfC = fabric.list_columns(dataset=dataset, workspace=workspace) + lc = get_lakehouse_tables(lakehouse=lakehouse, workspace=lakehouse_workspace) + # Get all calc table columns of calc tables not including field parameters + dfC_filt = dfC[ + (dfC["Table Name"].isin(dfP_filt["Table Name"])) + ] # & (dfC['Type'] == 'CalculatedTableColumn')] + # dfA = list_annotations(new_dataset, new_dataset_workspace) + # dfA_filt = dfA[(dfA['Object Type'] == 'Model') & ~ (dfA['Annotation Value'].str.contains('NAMEOF'))] + + if len(dfP_filt) == 0: + print( + f"{icons.green_dot} The '{dataset}' semantic model has no calculated tables." + ) + return + + start_time = datetime.datetime.now() + timeout = datetime.timedelta(minutes=1) + success = False + + while not success: + try: + with connect_semantic_model( + dataset=new_dataset, readonly=False, workspace=new_dataset_workspace + ) as tom: + success = True + for tName in dfC_filt["Table Name"].unique(): + if tName.lower() in lc["Table Name"].values: + + try: + tom.model.Tables[tName] + except: + tom.add_table(name=tName) + tom.add_entity_partition( + table_name=tName, + entity_name=tName.replace(" ", "_").lower(), + ) + + columns_in_table = dfC_filt.loc[ + dfC_filt["Table Name"] == tName, "Column Name" + ].unique() + + for cName in columns_in_table: + scName = dfC.loc[ + (dfC["Table Name"] == tName) + & (dfC["Column Name"] == cName), + "Source", + ].iloc[0] + cDataType = dfC.loc[ + (dfC["Table Name"] == tName) + & (dfC["Column Name"] == cName), + "Data Type", + ].iloc[0] + cType = dfC.loc[ + (dfC["Table Name"] == tName) + & (dfC["Column Name"] == cName), + "Type", + ].iloc[0] + + # av = tom.get_annotation_value(object = tom.model, name = tName) + + # if cType == 'CalculatedTableColumn': + # lakeColumn = scName.replace(' ','_') + # elif cType == 'Calculated': + pattern = r"\[([^]]+)\]" + + matches = re.findall(pattern, scName) + lakeColumn = matches[0].replace(" ", "") + try: + tom.model.Tables[tName].Columns[cName] + except: + tom.add_data_column( + table_name=tName, + column_name=cName, + source_column=lakeColumn, + data_type=cDataType, + ) + print( + f"{icons.green_dot} The '{tName}'[{cName}] column has been added." + ) + + print( + f"\n{icons.green_dot} All viable calculated tables have been added to the model." 
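+                    # Only calculated tables that already exist as delta tables in the lakehouse are added by the loop above.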
+ ) + + except Exception as e: + if datetime.datetime.now() - start_time > timeout: + break + time.sleep(1) diff --git a/sempy_labs/migration/_migrate_model_objects_to_semantic_model.py b/sempy_labs/migration/_migrate_model_objects_to_semantic_model.py new file mode 100644 index 00000000..72381854 --- /dev/null +++ b/sempy_labs/migration/_migrate_model_objects_to_semantic_model.py @@ -0,0 +1,524 @@ +import sempy +import sempy.fabric as fabric +import re, datetime, time +from ._list_functions import list_tables +from .HelperFunctions import create_relationship_name +from .TOM import connect_semantic_model +from typing import Optional +from sempy._utils._log import log +import sempy_labs._icons as icons + + +@log +def migrate_model_objects_to_semantic_model( + dataset: str, + new_dataset: str, + workspace: Optional[str] = None, + new_dataset_workspace: Optional[str] = None, +): + """ + Adds the rest of the model objects (besides tables/columns) and their properties to a Direct Lake semantic model based on an import/DirectQuery semantic model. + + Parameters + ---------- + dataset : str + Name of the import/DirectQuery semantic model. + new_dataset : str + Name of the Direct Lake semantic model. + workspace : str, default=None + The Fabric workspace name in which the import/DirectQuery semantic model exists. + Defaults to None which resolves to the workspace of the attached lakehouse + or if no lakehouse attached, resolves to the workspace of the notebook. + new_dataset_workspace : str + The Fabric workspace name in which the Direct Lake semantic model will be created. + Defaults to None which resolves to the workspace of the attached lakehouse + or if no lakehouse attached, resolves to the workspace of the notebook. + """ + + sempy.fabric._client._utils._init_analysis_services() + import Microsoft.AnalysisServices.Tabular as TOM + import System + + if workspace == None: + workspace_id = fabric.get_workspace_id() + workspace = fabric.resolve_workspace_name(workspace_id) + else: + workspaceId = fabric.resolve_workspace_id(workspace) + + if new_dataset_workspace == None: + new_dataset_workspace = workspace + + dfT = list_tables(dataset, workspace) + dfC = fabric.list_columns(dataset=dataset, workspace=workspace) + dfM = fabric.list_measures(dataset=dataset, workspace=workspace) + dfR = fabric.list_relationships(dataset=dataset, workspace=workspace) + dfRole = fabric.get_roles(dataset=dataset, workspace=workspace) + dfRLS = fabric.get_row_level_security_permissions( + dataset=dataset, workspace=workspace + ) + dfCI = fabric.list_calculation_items(dataset=dataset, workspace=workspace) + dfP = fabric.list_perspectives(dataset=dataset, workspace=workspace) + dfTranslation = fabric.list_translations(dataset=dataset, workspace=workspace) + dfH = fabric.list_hierarchies(dataset=dataset, workspace=workspace) + dfPar = fabric.list_partitions(dataset=dataset, workspace=workspace) + + dfP_cc = dfPar[(dfPar["Source Type"] == "Calculated")] + dfP_fp = dfP_cc[dfP_cc["Query"].str.contains("NAMEOF")] + dfC_fp = dfC[dfC["Table Name"].isin(dfP_fp["Table Name"].values)] + + print(f"{icons.in_progress} Updating '{new_dataset}' based on '{dataset}'...") + start_time = datetime.datetime.now() + timeout = datetime.timedelta(minutes=1) + success = False + + while not success: + try: + with connect_semantic_model( + dataset=new_dataset, readonly=False, workspace=new_dataset_workspace + ) as tom: + success = True + + isDirectLake = any( + str(p.Mode) == "DirectLake" + for t in tom.model.Tables + for p in t.Partitions + 
) + + print(f"\n{icons.in_progress} Updating table properties...") + for t in tom.model.Tables: + t.IsHidden = bool(dfT.loc[dfT["Name"] == t.Name, "Hidden"].iloc[0]) + t.Description = dfT.loc[dfT["Name"] == t.Name, "Description"].iloc[ + 0 + ] + t.DataCategory = dfT.loc[ + dfT["Name"] == t.Name, "Data Category" + ].iloc[0] + + print( + f"{icons.green_dot} The '{t.Name}' table's properties have been updated." + ) + + print(f"\n{icons.in_progress} Updating column properties...") + for t in tom.model.Tables: + if ( + t.Name not in dfP_fp["Table Name"].values + ): # do not include field parameters + dfT_filtered = dfT[dfT["Name"] == t.Name] + tType = dfT_filtered["Type"].iloc[0] + for c in t.Columns: + if not c.Name.startswith("RowNumber-"): + dfC_filt = dfC[ + (dfC["Table Name"] == t.Name) + & (dfC["Column Name"] == c.Name) + ] + cName = dfC_filt["Column Name"].iloc[0] + c.Name = cName + if tType == "Table": + c.SourceColumn = cName.replace(" ", "_") + c.IsHidden = bool(dfC_filt["Hidden"].iloc[0]) + c.DataType = System.Enum.Parse( + TOM.DataType, dfC_filt["Data Type"].iloc[0] + ) + c.DisplayFolder = dfC_filt["Display Folder"].iloc[0] + c.FormatString = dfC_filt["Format String"].iloc[0] + c.SummarizeBy = System.Enum.Parse( + TOM.AggregateFunction, + dfC_filt["Summarize By"].iloc[0], + ) + c.DataCategory = dfC_filt["Data Category"].iloc[0] + c.IsKey = bool(dfC_filt["Key"].iloc[0]) + sbc = dfC_filt["Sort By Column"].iloc[0] + + if sbc != None: + try: + c.SortByColumn = tom.model.Tables[ + t.Name + ].Columns[sbc] + except: + print( + f"{icons.red_dot} Failed to create '{sbc}' as a Sort By Column for the '{c.Name}' in the '{t.Name}' table." + ) + print( + f"{icons.green_dot} The '{t.Name}'[{c.Name}] column's properties have been updated." + ) + + print(f"\n{icons.in_progress} Creating hierarchies...") + dfH_grouped = ( + dfH.groupby( + [ + "Table Name", + "Hierarchy Name", + "Hierarchy Hidden", + "Hierarchy Description", + ] + ) + .agg({"Level Name": list, "Column Name": list}) + .reset_index() + ) + + for i, r in dfH_grouped.iterrows(): + tName = r["Table Name"] + hName = r["Hierarchy Name"] + hDesc = r["Hierarchy Description"] + hHid = bool(r["Hierarchy Hidden"]) + cols = r["Column Name"] + lvls = r["Level Name"] + + try: + tom.model.Tables[tName].Hierarchies[hName] + except: + tom.add_hierarchy( + table_name=tName, + hierarchy_name=hName, + hierarchy_description=hDesc, + hierarchy_hidden=hHid, + columns=cols, + levels=lvls, + ) + print( + f"{icons.green_dot} The '{hName}' hierarchy has been added." + ) + + print(f"\n{icons.in_progress} Creating measures...") + for i, r in dfM.iterrows(): + tName = r["Table Name"] + mName = r["Measure Name"] + mExpr = r["Measure Expression"] + mHidden = bool(r["Measure Hidden"]) + mDF = r["Measure Display Folder"] + mDesc = r["Measure Description"] + mFS = r["Format String"] + + try: + tom.model.Tables[tName].Measures[mName] + except: + tom.add_measure( + table_name=tName, + measure_name=mName, + expression=mExpr, + hidden=mHidden, + display_folder=mDF, + description=mDesc, + format_string=mFS, + ) + print( + f"{icons.green_dot} The '{mName}' measure has been added." 
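+                        # the try/except above doubles as an existence check: indexing a missing
+                        # measure raises, and only then is the measure added to the new model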
+ ) + + for cgName in dfCI["Calculation Group Name"].unique(): + + isHidden = bool( + dfCI.loc[ + (dfCI["Calculation Group Name"] == cgName), "Hidden" + ].iloc[0] + ) + prec = int( + dfCI.loc[ + (dfCI["Calculation Group Name"] == cgName), "Precedence" + ].iloc[0] + ) + desc = dfCI.loc[ + (dfCI["Calculation Group Name"] == cgName), "Description" + ].iloc[0] + + try: + tom.model.Tables[cgName] + except: + tom.add_calculation_group( + name=cgName, + description=desc, + precedence=prec, + hidden=isHidden, + ) + print( + f"{icons.green_dot} The '{cgName}' calculation group has been added." + ) + tom.model.DiscourageImplicitMeasures = True + + print( + f"\n{icons.in_progress} Updating calculation group column name..." + ) + dfC_filt = dfC[ + (dfC["Table Name"] == cgName) & (dfC["Hidden"] == False) + ] + colName = dfC_filt["Column Name"].iloc[0] + tom.model.Tables[cgName].Columns["Name"].Name = colName + + calcItems = dfCI.loc[ + dfCI["Calculation Group Name"] == cgName, + "Calculation Item Name", + ].unique() + + print(f"\n{icons.in_progress} Creating calculation items...") + for calcItem in calcItems: + ordinal = int( + dfCI.loc[ + (dfCI["Calculation Group Name"] == cgName) + & (dfCI["Calculation Item Name"] == calcItem), + "Ordinal", + ].iloc[0] + ) + expr = dfCI.loc[ + (dfCI["Calculation Group Name"] == cgName) + & (dfCI["Calculation Item Name"] == calcItem), + "Expression", + ].iloc[0] + fse = dfCI.loc[ + (dfCI["Calculation Group Name"] == cgName) + & (dfCI["Calculation Item Name"] == calcItem), + "Format String Expression", + ].iloc[0] + try: + tom.model.Tables[cgName].CalculationGroup.CalculationItems[ + calcItem + ] + except: + tom.add_calculation_item( + table_name=cgName, + calculation_item_name=calcItem, + expression=expr, + format_string_expression=fse, + ordinal=ordinal, + ) + print( + f"{icons.green_dot} The '{calcItem}' has been added to the '{cgName}' calculation group." + ) + + print(f"\n{icons.in_progress} Creating relationships...") + for index, row in dfR.iterrows(): + fromTable = row["From Table"] + fromColumn = row["From Column"] + toTable = row["To Table"] + toColumn = row["To Column"] + isActive = row["Active"] + cfb = row["Cross Filtering Behavior"] + sfb = row["Security Filtering Behavior"] + rori = row["Rely On Referential Integrity"] + mult = row["Multiplicity"] + + card_mapping = {"m": "Many", "1": "One", "0": "None"} + + fromCard = card_mapping.get(mult[0]) + toCard = card_mapping.get(mult[-1]) + + relName = create_relationship_name( + fromTable, fromColumn, toTable, toColumn + ) + + if any( + r.FromTable.Name == fromTable + and r.FromColumn.Name == fromColumn + and r.ToTable.Name == toTable + and r.ToColumn.Name == toColumn + for r in tom.model.Relationships + ): + print( + f"{icons.yellow_dot} {relName} already exists as a relationship in the semantic model." + ) + elif isDirectLake and any( + r.FromTable.Name == fromTable + and r.FromColumn.Name == fromColumn + and r.ToTable.Name == toTable + and r.ToColumn.Name == toColumn + and ( + r.FromColumn.DataType == "DateTime" + or r.ToColumn.DataType == "DateTime" + ) + for r in tom.model.Relationships + ): + print( + f"{icons.yellow_dot} {relName} was not created since relationships based on DateTime columns are not supported." 
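+                            # relationships touching a DateTime column are skipped for Direct Lake
+                            # models because they are not supported there, as the message above explains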
+ ) + elif isDirectLake and any( + r.FromTable.Name == fromTable + and r.FromColumn.Name == fromColumn + and r.ToTable.Name == toTable + and r.ToColumn.Name == toColumn + and (r.FromColumn.DataType != r.ToColumn.DataType) + for r in tom.model.Relationships + ): + print( + f"{icons.yellow_dot} {relName} was not created since columns used in a relationship must have the same data type." + ) + else: + try: + tom.add_relationship( + from_table=fromTable, + from_column=fromColumn, + to_table=toTable, + to_column=toColumn, + from_cardinality=fromCard, + to_cardinality=toCard, + cross_filtering_behavior=cfb, + security_filtering_behavior=sfb, + rely_on_referential_integrity=rori, + is_active=isActive, + ) + + print( + f"{icons.green_dot} The {relName} relationship has been added." + ) + except: + print( + f"{icons.red_dot} The {relName} relationship was not added." + ) + + print(f"\n{icons.in_progress} Creating roles...") + for index, row in dfRole.iterrows(): + roleName = row["Role"] + roleDesc = row["Description"] + modPerm = row["Model Permission"] + + try: + tom.model.Roles[roleName] + except: + tom.add_role( + role_name=roleName, + model_permission=modPerm, + description=roleDesc, + ) + print( + f"{icons.green_dot} The '{roleName}' role has been added." + ) + + print(f"\n{icons.in_progress} Creating row level security...") + for index, row in dfRLS.iterrows(): + roleName = row["Role"] + tName = row["Table"] + expr = row["Filter Expression"] + + try: + tom.set_rls( + role_name=roleName, table_name=tName, filter_expression=expr + ) + print( + f"{icons.green_dot} Row level security for the '{tName}' table within the '{roleName}' role has been set." + ) + except: + print( + f"{icons.red_dot} Row level security for the '{tName}' table within the '{roleName}' role was not set." + ) + + print(f"\n{icons.in_progress} Creating perspectives...") + for pName in dfP["Perspective Name"].unique(): + + try: + tom.model.Perspectives[pName] + except: + tom.add_perspective(perspective_name=pName) + print( + f"{icons.green_dot} The '{pName}' perspective has been added." + ) + + print(f"\n{icons.in_progress} Adding objects to perspectives...") + for index, row in dfP.iterrows(): + pName = row["Perspective Name"] + tName = row["Table Name"] + oName = row["Object Name"] + oType = row["Object Type"] + tType = dfT.loc[(dfT["Name"] == tName), "Type"].iloc[0] + + try: + if oType == "Table": + tom.add_to_perspective( + object=tom.model.Tables[tName], perspective_name=pName + ) + elif oType == "Column": + tom.add_to_perspective( + object=tom.model.Tables[tName].Columns[oName], + perspective_name=pName, + ) + elif oType == "Measure": + tom.add_to_perspective( + object=tom.model.Tables[tName].Measures[oName], + perspective_name=pName, + ) + elif oType == "Hierarchy": + tom.add_to_perspective( + object=tom.model.Tables[tName].Hierarchies[oName], + perspective_name=pName, + ) + except: + pass + + print(f"\n{icons.in_progress} Creating translation languages...") + for trName in dfTranslation["Culture Name"].unique(): + try: + tom.model.Cultures[trName] + except: + tom.add_translation(trName) + print( + f"{icons.green_dot} The '{trName}' translation language has been added." 
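+                        # a culture is added once per language here; the translated captions
+                        # themselves are applied object by object in the next section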
+ ) + + print(f"\n{icons.in_progress} Creating translation values...") + for index, row in dfTranslation.iterrows(): + trName = row["Culture Name"] + tName = row["Table Name"] + oName = row["Object Name"] + oType = row["Object Type"] + translation = row["Translation"] + prop = row["Property"] + + if prop == "Caption": + prop = "Name" + elif prop == "DisplayFolder": + prop = "Display Folder" + + try: + if oType == "Table": + tom.set_translation( + object=tom.model.Tables[tName], + language=trName, + property=prop, + value=translation, + ) + elif oType == "Column": + tom.set_translation( + object=tom.model.Tables[tName].Columns[oName], + language=trName, + property=prop, + value=translation, + ) + elif oType == "Measure": + tom.set_translation( + object=tom.model.Tables[tName].Measures[oName], + language=trName, + property=prop, + value=translation, + ) + elif oType == "Hierarchy": + tom.set_translation( + object=tom.model.Tables[tName].Hierarchies[oName], + language=trName, + property=prop, + value=translation, + ) + elif oType == "Level": + + pattern = r"\[([^]]+)\]" + matches = re.findall(pattern, oName) + lName = matches[0] + + pattern = r"'([^']+)'" + matches = re.findall(pattern, oName) + hName = matches[0] + tom.set_translation( + object=tom.model.Tables[tName] + .Hierarchies[hName] + .Levels[lName], + language=trName, + property=prop, + value=translation, + ) + except: + pass + + print( + f"\n{icons.green_dot} Migration of objects from '{dataset}' -> '{new_dataset}' is complete." + ) + + except Exception as e: + if datetime.datetime.now() - start_time > timeout: + break + time.sleep(1) diff --git a/sempy_labs/migration/_migrate_tables_columns_to_semantic_model.py b/sempy_labs/migration/_migrate_tables_columns_to_semantic_model.py new file mode 100644 index 00000000..6461f107 --- /dev/null +++ b/sempy_labs/migration/_migrate_tables_columns_to_semantic_model.py @@ -0,0 +1,169 @@ +import sempy +import sempy.fabric as fabric +import pandas as pd +import datetime, time +from ._list_functions import list_tables +from .GetSharedExpression import get_shared_expression +from .HelperFunctions import resolve_lakehouse_name +from .lakehouse.Lakehouse import lakehouse_attached +from .TOM import connect_semantic_model +from typing import List, Optional, Union +from sempy._utils._log import log +import sempy_labs._icons as icons + + +@log +def migrate_tables_columns_to_semantic_model( + dataset: str, + new_dataset: str, + workspace: Optional[str] = None, + new_dataset_workspace: Optional[str] = None, + lakehouse: Optional[str] = None, + lakehouse_workspace: Optional[str] = None, +): + """ + Adds tables/columns to the new Direct Lake semantic model based on an import/DirectQuery semantic model. + + Parameters + ---------- + dataset : str + Name of the import/DirectQuery semantic model. + new_dataset : str + Name of the Direct Lake semantic model. + workspace : str, default=None + The Fabric workspace name in which the import/DirectQuery semantic model exists. + Defaults to None which resolves to the workspace of the attached lakehouse + or if no lakehouse attached, resolves to the workspace of the notebook. + new_dataset_workspace : str + The Fabric workspace name in which the Direct Lake semantic model will be created. + Defaults to None which resolves to the workspace of the attached lakehouse + or if no lakehouse attached, resolves to the workspace of the notebook. + lakehouse : str, default=None + The Fabric lakehouse used by the Direct Lake semantic model. 
+ Defaults to None which resolves to the lakehouse attached to the notebook. + lakehouse_workspace : str, default=None + The Fabric workspace used by the lakehouse. + Defaults to None which resolves to the workspace of the attached lakehouse + or if no lakehouse attached, resolves to the workspace of the notebook. + + """ + + if workspace == None: + workspace_id = fabric.get_workspace_id() + workspace = fabric.resolve_workspace_name(workspace_id) + else: + workspace_id = fabric.resolve_workspace_id(workspace) + + if new_dataset_workspace == None: + new_dataset_workspace = workspace + + if lakehouse_workspace == None: + lakehouse_workspace = new_dataset_workspace + + if lakehouse == None: + lakehouse_id = fabric.get_lakehouse_id() + lakehouse = resolve_lakehouse_name(lakehouse_id, lakehouse_workspace) + + # Check that lakehouse is attached to the notebook + lakeAttach = lakehouse_attached() + + # Run if lakehouse is attached to the notebook or a lakehouse & lakehouse workspace are specified + if lakeAttach or (lakehouse is not None and lakehouse_workspace is not None): + shEx = get_shared_expression(lakehouse, lakehouse_workspace) + + dfC = fabric.list_columns(dataset=dataset, workspace=workspace) + dfT = list_tables(dataset, workspace) + dfT.rename(columns={"Type": "Table Type"}, inplace=True) + dfC = pd.merge( + dfC, + dfT[["Name", "Table Type"]], + left_on="Table Name", + right_on="Name", + how="left", + ) + dfT_filt = dfT[dfT["Table Type"] == "Table"] + dfC_filt = dfC[ + (dfC["Table Type"] == "Table") + & ~(dfC["Column Name"].str.startswith("RowNumber-")) + & (dfC["Type"] != "Calculated") + ] + + print(f"{icons.in_progress} Updating '{new_dataset}' based on '{dataset}'...") + start_time = datetime.datetime.now() + timeout = datetime.timedelta(minutes=1) + success = False + + while not success: + try: + with connect_semantic_model( + dataset=new_dataset, readonly=False, workspace=new_dataset_workspace + ) as tom: + success = True + try: + tom.model.Expressions["DatabaseQuery"] + except: + tom.add_expression("DatabaseQuery", expression=shEx) + print( + f"{icons.green_dot} The 'DatabaseQuery' expression has been added." + ) + + for i, r in dfT_filt.iterrows(): + tName = r["Name"] + tDC = r["Data Category"] + tHid = bool(r["Hidden"]) + tDesc = r["Description"] + + try: + tom.model.Tables[tName] + except: + tom.add_table( + name=tName, + description=tDesc, + data_category=tDC, + hidden=tHid, + ) + tom.add_entity_partition( + table_name=tName, entity_name=tName.replace(" ", "_") + ) + print( + f"{icons.green_dot} The '{tName}' table has been added." + ) + + for i, r in dfC_filt.iterrows(): + tName = r["Table Name"] + cName = r["Column Name"] + scName = r["Source"].replace(" ", "_") + cHid = bool(r["Hidden"]) + cDataType = r["Data Type"] + + try: + tom.model.Tables[tName].Columns[cName] + except: + tom.add_data_column( + table_name=tName, + column_name=cName, + source_column=scName, + hidden=cHid, + data_type=cDataType, + ) + print( + f"{icons.green_dot} The '{tName}'[{cName}] column has been added." + ) + + print( + f"\n{icons.green_dot} All regular tables and columns have been added to the '{new_dataset}' semantic model." + ) + except Exception as e: + if datetime.datetime.now() - start_time > timeout: + break + time.sleep(1) + else: + print( + f"{icons.red_dot} Lakehouse not attached to notebook and lakehouse/lakehouse_workspace are not specified. Please add your lakehouse to this notebook or specify the lakehouse/lakehouse_workspace parameters." 
+ ) + print( + f"To attach a lakehouse to a notebook, go to the the 'Explorer' window to the left, click 'Lakehouses' to add your lakehouse to this notebook" + ) + print( + f"\nLearn more here: https://learn.microsoft.com/fabric/data-engineering/lakehouse-notebook-explore#add-or-remove-a-lakehouse" + ) diff --git a/sempy_labs/migration/_migration_validation.py b/sempy_labs/migration/_migration_validation.py new file mode 100644 index 00000000..4e0c9c16 --- /dev/null +++ b/sempy_labs/migration/_migration_validation.py @@ -0,0 +1,230 @@ +import sempy +import sempy.fabric as fabric +import pandas as pd +from .HelperFunctions import create_relationship_name +from .TOM import connect_semantic_model +from typing import List, Optional, Union +from sempy._utils._log import log + + +def list_semantic_model_objects(dataset: str, workspace: Optional[str] = None): + """ + Shows a list of semantic model objects. + + Parameters + ---------- + dataset : str + Name of the semantic model. + workspace : str, default=None + The Fabric workspace name. + Defaults to None which resolves to the workspace of the attached lakehouse + or if no lakehouse attached, resolves to the workspace of the notebook. + + + Returns + ------- + pandas.DataFrame + A pandas dataframe showing a list of objects in the semantic model + """ + + if workspace is None: + workspace = fabric.resolve_workspace_name() + + df = pd.DataFrame(columns=["Parent Name", "Object Name", "Object Type"]) + with connect_semantic_model( + dataset=dataset, workspace=workspace, readonly=True + ) as tom: + for t in tom.model.Tables: + if t.CalculationGroup is not None: + new_data = { + "Parent Name": t.Parent.Name, + "Object Name": t.Name, + "Object Type": "Calculation Group", + } + df = pd.concat( + [df, pd.DataFrame(new_data, index=[0])], ignore_index=True + ) + for ci in t.CalculationGroup.CalculationItems: + new_data = { + "Parent Name": t.Name, + "Object Name": ci.Name, + "Object Type": str(ci.ObjectType), + } + df = pd.concat( + [df, pd.DataFrame(new_data, index=[0])], ignore_index=True + ) + elif any(str(p.SourceType) == "Calculated" for p in t.Partitions): + new_data = { + "Parent Name": t.Parent.Name, + "Object Name": t.Name, + "Object Type": "Calculated Table", + } + df = pd.concat( + [df, pd.DataFrame(new_data, index=[0])], ignore_index=True + ) + else: + new_data = { + "Parent Name": t.Parent.Name, + "Object Name": t.Name, + "Object Type": str(t.ObjectType), + } + df = pd.concat( + [df, pd.DataFrame(new_data, index=[0])], ignore_index=True + ) + for c in t.Columns: + if str(c.Type) != "RowNumber": + if str(c.Type) == "Calculated": + new_data = { + "Parent Name": c.Parent.Name, + "Object Name": c.Name, + "Object Type": "Calculated Column", + } + df = pd.concat( + [df, pd.DataFrame(new_data, index=[0])], ignore_index=True + ) + else: + new_data = { + "Parent Name": c.Parent.Name, + "Object Name": c.Name, + "Object Type": str(c.ObjectType), + } + df = pd.concat( + [df, pd.DataFrame(new_data, index=[0])], ignore_index=True + ) + for m in t.Measures: + new_data = { + "Parent Name": m.Parent.Name, + "Object Name": m.Name, + "Object Type": str(m.ObjectType), + } + df = pd.concat( + [df, pd.DataFrame(new_data, index=[0])], ignore_index=True + ) + for h in t.Hierarchies: + new_data = { + "Parent Name": h.Parent.Name, + "Object Name": h.Name, + "Object Type": str(h.ObjectType), + } + df = pd.concat( + [df, pd.DataFrame(new_data, index=[0])], ignore_index=True + ) + for l in h.Levels: + new_data = { + "Parent Name": l.Parent.Name, + "Object Name": l.Name, 
+ "Object Type": str(l.ObjectType), + } + df = pd.concat( + [df, pd.DataFrame(new_data, index=[0])], ignore_index=True + ) + for p in t.Partitions: + new_data = { + "Parent Name": p.Parent.Name, + "Object Name": p.Name, + "Object Type": str(p.ObjectType), + } + df = pd.concat( + [df, pd.DataFrame(new_data, index=[0])], ignore_index=True + ) + for r in tom.model.Relationships: + rName = create_relationship_name( + r.FromTable.Name, r.FromColumn.Name, r.ToTable.Name, r.ToColumn.Name + ) + new_data = { + "Parent Name": r.Parent.Name, + "Object Name": rName, + "Object Type": str(r.ObjectType), + } + df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) + for role in tom.model.Roles: + new_data = { + "Parent Name": role.Parent.Name, + "Object Name": role.Name, + "Object Type": str(role.ObjectType), + } + df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) + for rls in role.TablePermissions: + new_data = { + "Parent Name": role.Name, + "Object Name": rls.Name, + "Object Type": str(rls.ObjectType), + } + df = pd.concat( + [df, pd.DataFrame(new_data, index=[0])], ignore_index=True + ) + for tr in tom.model.Cultures: + new_data = { + "Parent Name": tr.Parent.Name, + "Object Name": tr.Name, + "Object Type": str(tr.ObjectType), + } + df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) + for per in tom.model.Perspectives: + new_data = { + "Parent Name": per.Parent.Name, + "Object Name": per.Name, + "Object Type": str(per.ObjectType), + } + df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) + + return df + + +@log +def migration_validation( + dataset: str, + new_dataset: str, + workspace: Optional[str] = None, + new_dataset_workspace: Optional[str] = None, +): + """ + Shows the objects in the original semantic model and whether then were migrated successfully or not. + + Parameters + ---------- + dataset : str + Name of the import/DirectQuery semantic model. + new_dataset : str + Name of the Direct Lake semantic model. + workspace : str, default=None + The Fabric workspace name in which the import/DirectQuery semantic model exists. + Defaults to None which resolves to the workspace of the attached lakehouse + or if no lakehouse attached, resolves to the workspace of the notebook. + new_dataset_workspace : str + The Fabric workspace name in which the Direct Lake semantic model will be created. + Defaults to None which resolves to the workspace of the attached lakehouse + or if no lakehouse attached, resolves to the workspace of the notebook. + + Returns + ------- + pandas.DataFrame + A pandas dataframe showing a list of objects and whether they were successfully migrated. Also shows the % of objects which were migrated successfully. 
+ """ + + dfA = list_semantic_model_objects(dataset=dataset, workspace=workspace) + dfB = list_semantic_model_objects( + dataset=new_dataset, workspace=new_dataset_workspace + ) + + def is_migrated(row): + if row["Object Type"] == "Calculated Table": + return ( + (dfB["Parent Name"] == row["Parent Name"]) + & (dfB["Object Name"] == row["Object Name"]) + & (dfB["Object Type"].isin(["Calculated Table", "Table"])) + ).any() + else: + return ( + (dfB["Parent Name"] == row["Parent Name"]) + & (dfB["Object Name"] == row["Object Name"]) + & (dfB["Object Type"] == row["Object Type"]) + ).any() + + dfA["Migrated"] = dfA.apply(is_migrated, axis=1) + + denom = len(dfA) + num = len(dfA[dfA["Migrated"]]) + print(f"{100 * round(num / denom,2)}% migrated") + + return dfA diff --git a/sempy_labs/report/__init__.py b/sempy_labs/report/__init__.py new file mode 100644 index 00000000..f908ea90 --- /dev/null +++ b/sempy_labs/report/__init__.py @@ -0,0 +1,15 @@ +from sempy_labs.report._generate_report import ( + create_report_from_reportjson as create_report_from_reportjson, + update_report_from_reportjson as update_report_from_reportjson, +) +from sempy_labs.report._report_functions import ( + get_report_json as get_report_json, + report_dependency_tree as report_dependency_tree, + export_report as export_report, + clone_report as clone_report, + launch_report as launch_report, + list_report_pages as list_report_pages, + list_report_visuals as list_report_visuals, + list_report_bookmarks as list_report_bookmarks, + translate_report_titles as translate_report_titles, +) diff --git a/sempy_labs/report/_generate_report.py b/sempy_labs/report/_generate_report.py new file mode 100644 index 00000000..94139157 --- /dev/null +++ b/sempy_labs/report/_generate_report.py @@ -0,0 +1,260 @@ +import sempy +import sempy.fabric as fabric +import pandas as pd +import json, base64, time +from typing import List, Optional, Union + + +def create_report_from_reportjson( + report: str, + dataset: str, + report_json: str, + theme_json: Optional[str] = None, + workspace: Optional[str] = None, +): + """ + Creates a report based on a report.json file (and an optional themes.json file). + + Parameters + ---------- + report : str + Name of the report. + dataset : str + Name of the semantic model to connect to the report. + report_json : str + The report.json file to be used to create the report. + theme_json : str, default=None + The theme.json file to be used for the theme of the report. + workspace : str, default=None + The Fabric workspace name. + Defaults to None which resolves to the workspace of the attached lakehouse + or if no lakehouse attached, resolves to the workspace of the notebook. + """ + + if workspace == None: + workspace_id = fabric.get_workspace_id() + workspace = fabric.resolve_workspace_name(workspace_id) + else: + workspace_id = fabric.resolve_workspace_id(workspace) + + objectType = "Report" + + dfI_m = fabric.list_items(workspace=workspace, type="SemanticModel") + dfI_model = dfI_m[(dfI_m["Display Name"] == dataset)] + + if len(dfI_model) == 0: + print( + f"ERROR: The '{dataset}' semantic model does not exist in the '{workspace}' workspace." + ) + return + + datasetId = dfI_model["Id"].iloc[0] + + dfI_r = fabric.list_items(workspace=workspace, type="Report") + dfI_rpt = dfI_r[(dfI_r["Display Name"] == report)] + + if len(dfI_rpt) > 0: + print( + f"WARNING: '{report}' already exists as a report in the '{workspace}' workspace." 
+ ) + return + + client = fabric.FabricRestClient() + defPBIR = { + "version": "1.0", + "datasetReference": { + "byPath": None, + "byConnection": { + "connectionString": None, + "pbiServiceModelId": None, + "pbiModelVirtualServerName": "sobe_wowvirtualserver", + "pbiModelDatabaseName": datasetId, + "name": "EntityDataSource", + "connectionType": "pbiServiceXmlaStyleLive", + }, + }, + } + + def conv_b64(file): + + loadJson = json.dumps(file) + f = base64.b64encode(loadJson.encode("utf-8")).decode("utf-8") + + return f + + definitionPBIR = conv_b64(defPBIR) + payloadReportJson = conv_b64(report_json) + + if theme_json == None: + request_body = { + "displayName": report, + "type": objectType, + "definition": { + "parts": [ + { + "path": "report.json", + "payload": payloadReportJson, + "payloadType": "InlineBase64", + }, + { + "path": "definition.pbir", + "payload": definitionPBIR, + "payloadType": "InlineBase64", + }, + ] + }, + } + else: + payloadThemeJson = conv_b64(theme_json) + themeID = theme_json["payload"]["blob"]["displayName"] + themePath = "StaticResources/SharedResources/BaseThemes/" + themeID + ".json" + request_body = { + "displayName": report, + "type": objectType, + "definition": { + "parts": [ + { + "path": "report.json", + "payload": payloadReportJson, + "payloadType": "InlineBase64", + }, + { + "path": themePath, + "payload": payloadThemeJson, + "payloadType": "InlineBase64", + }, + { + "path": "definition.pbir", + "payload": definitionPBIR, + "payloadType": "InlineBase64", + }, + ] + }, + } + + response = client.post(f"/v1/workspaces/{workspace_id}/items", json=request_body) + + if response.status_code == 201: + print("Report creation succeeded") + print(response.json()) + elif response.status_code == 202: + operationId = response.headers["x-ms-operation-id"] + response = client.get(f"/v1/operations/{operationId}") + response_body = json.loads(response.content) + while response_body["status"] != "Succeeded": + time.sleep(3) + response = client.get(f"/v1/operations/{operationId}") + response_body = json.loads(response.content) + response = client.get(f"/v1/operations/{operationId}/result") + print("Report creation succeeded") + print(response.json()) + + +def update_report_from_reportjson( + report: str, report_json: str, workspace: Optional[str] = None +): + """ + Updates a report based on a report.json file. + + Parameters + ---------- + report : str + Name of the report. + report_json : str + The report.json file to be used to update the report. + workspace : str, default=None + The Fabric workspace name in which the report resides. + Defaults to None which resolves to the workspace of the attached lakehouse + or if no lakehouse attached, resolves to the workspace of the notebook. 
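+
+    Examples
+    --------
+    A sketch of the intended round trip; the report and workspace names are placeholders:
+
+    >>> from sempy_labs.report import get_report_json, update_report_from_reportjson
+    >>> rpt_json = get_report_json(report="Sales Report", workspace="Sales")
+    >>> update_report_from_reportjson(report="Sales Report", report_json=rpt_json, workspace="Sales")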
+ """ + + if workspace == None: + workspace_id = fabric.get_workspace_id() + workspace = fabric.resolve_workspace_name(workspace_id) + else: + workspace_id = fabric.resolve_workspace_id(workspace) + + objectType = "Report" + + dfR = fabric.list_reports(workspace=workspace) + dfR_filt = dfR[(dfR["Name"] == report) & (dfR["Report Type"] == "PowerBIReport")] + + if len(dfR_filt) == 0: + print(f"The '{report}' report does not exist in the '{workspace}' workspace.") + return + + reportId = dfR_filt["Id"].iloc[0] + client = fabric.FabricRestClient() + + response = client.post( + f"/v1/workspaces/{workspace_id}/items/{reportId}/getDefinition" + ) + df_items = pd.json_normalize(response.json()["definition"]["parts"]) + df_items_filt = df_items[df_items["path"] == "definition.pbir"] + rptDefFile = df_items_filt["payload"].iloc[0] + # datasetId = dfR_filt['Dataset Id'].iloc[0] + # datasetWorkspaceId = dfR_filt['Dataset Workspace Id'].iloc[0] + + # defPBIR = { + # "version": "1.0", + # "datasetReference": { + # "byPath": None, + # "byConnection": { + # "connectionString": None, + # "pbiServiceModelId": None, + # "pbiModelVirtualServerName": "sobe_wowvirtualserver", + # "pbiModelDatabaseName": datasetId, + # "name": "EntityDataSource", + # "connectionType": "pbiServiceXmlaStyleLive" + # } + # } + # } + + def conv_b64(file): + + loadJson = json.dumps(file) + f = base64.b64encode(loadJson.encode("utf-8")).decode("utf-8") + + return f + + # definitionPBIR = conv_b64(defPBIR) + payloadReportJson = conv_b64(report_json) + + request_body = { + "displayName": report, + "type": objectType, + "definition": { + "parts": [ + { + "path": "report.json", + "payload": payloadReportJson, + "payloadType": "InlineBase64", + }, + { + "path": "definition.pbir", + "payload": rptDefFile, + "payloadType": "InlineBase64", + }, + ] + }, + } + + response = client.post( + f"/v1/workspaces/{workspace_id}/reports/{reportId}/updateDefinition", + json=request_body, + ) + + if response.status_code == 201: + print(f"The '{report}' report has been successfully updated.") + # print(response.json()) + elif response.status_code == 202: + operationId = response.headers["x-ms-operation-id"] + response = client.get(f"/v1/operations/{operationId}") + response_body = json.loads(response.content) + while response_body["status"] != "Succeeded": + time.sleep(3) + response = client.get(f"/v1/operations/{operationId}") + response_body = json.loads(response.content) + response = client.get(f"/v1/operations/{operationId}/result") + print(f"The '{report}' report has been successfully updated.") + # print(response.json()) diff --git a/sempy_labs/report/_report_functions.py b/sempy_labs/report/_report_functions.py new file mode 100644 index 00000000..a293c557 --- /dev/null +++ b/sempy_labs/report/_report_functions.py @@ -0,0 +1,869 @@ +import sempy +import sempy.fabric as fabric +import pandas as pd +import json, os, time, base64, copy, re +from anytree import Node, RenderTree +from powerbiclient import Report +from synapse.ml.services import Translate +from pyspark.sql.functions import col, flatten +from pyspark.sql import SparkSession +from .report._generate_report import update_report_from_reportjson +from .Translations import language_validate +from .lakehouse.Lakehouse import lakehouse_attached +from .HelperFunctions import ( + generate_embedded_filter, + resolve_dataset_name, + resolve_report_id, + resolve_lakehouse_name, +) +from typing import List, Optional, Union +from sempy._utils._log import log +import sempy_labs._icons as icons + + +def 
get_report_json( + report: str, + workspace: Optional[str] = None, + save_to_file_name: Optional[str] = None, +): + """ + Gets the report.json file content of a Power BI report. + + Parameters + ---------- + report : str + Name of the Power BI report. + workspace : str, default=None + The Fabric workspace name. + Defaults to None which resolves to the workspace of the attached lakehouse + or if no lakehouse attached, resolves to the workspace of the notebook. + save_to_file_name : str, default=None + Specifying this parameter will save the report.json file to the lakehouse attached to the notebook with the file name of this parameter. + + Returns + ------- + str + The report.json file for a given Power BI report. + """ + + if workspace == None: + workspace_id = fabric.get_workspace_id() + workspace = fabric.resolve_workspace_name(workspace_id) + else: + workspace_id = fabric.resolve_workspace_id(workspace) + + client = fabric.FabricRestClient() + + dfI = fabric.list_items(workspace=workspace, type="Report") + dfI_filt = dfI[(dfI["Display Name"] == report)] + + if len(dfI_filt) == 0: + print( + f"{icons.red_dot} The '{report}' report does not exist in the '{workspace}' workspace." + ) + return + + itemId = dfI_filt["Id"].iloc[0] + response = client.post( + f"/v1/workspaces/{workspace_id}/items/{itemId}/getDefinition" + ) + df_items = pd.json_normalize(response.json()["definition"]["parts"]) + df_items_filt = df_items[df_items["path"] == "report.json"] + payload = df_items_filt["payload"].iloc[0] + + reportFile = base64.b64decode(payload).decode("utf-8") + reportJson = json.loads(reportFile) + + if save_to_file_name is not None: + lakeAttach = lakehouse_attached() + if lakeAttach == False: + print( + f"{icons.red_dot} In order to save the report.json file, a lakehouse must be attached to the notebook. Please attach a lakehouse to this notebook." + ) + return + + lakehouse_id = fabric.get_lakehouse_id() + lakehouse = resolve_lakehouse_name(lakehouse_id, workspace) + folderPath = "/lakehouse/default/Files" + fileExt = ".json" + if not save_to_file_name.endswith(fileExt): + save_to_file_name = save_to_file_name + fileExt + filePath = os.path.join(folderPath, save_to_file_name) + with open(filePath, "w") as json_file: + json.dump(reportJson, json_file, indent=4) + print( + f"{icons.green_dot} The report.json file for the '{report}' report has been saved to the '{lakehouse}' in this location: '{filePath}'.\n\n" + ) + + return reportJson + + +def report_dependency_tree(workspace: Optional[str] = None): + """ + Prints a dependency between reports and semantic models. + + Parameters + ---------- + workspace : str, default=None + The Fabric workspace name. + Defaults to None which resolves to the workspace of the attached lakehouse + or if no lakehouse attached, resolves to the workspace of the notebook. 
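+
+    Examples
+    --------
+    Prints an indented tree of semantic models and their dependent reports; the
+    workspace name below is a placeholder:
+
+    >>> from sempy_labs.report import report_dependency_tree
+    >>> report_dependency_tree(workspace="Sales")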
+ """ + + if workspace == None: + workspaceId = fabric.get_workspace_id() + workspace = fabric.resolve_workspace_name(workspaceId) + + dfR = fabric.list_reports(workspace=workspace) + dfD = fabric.list_datasets(workspace=workspace) + dfR = pd.merge( + dfR, + dfD[["Dataset ID", "Dataset Name"]], + left_on="Dataset Id", + right_on="Dataset ID", + how="left", + ) + dfR.rename(columns={"Name": "Report Name"}, inplace=True) + dfR = dfR[["Report Name", "Dataset Name"]] + + report_icon = "\U0001F4F6" + dataset_icon = "\U0001F9CA" + workspace_icon = "\U0001F465" + + node_dict = {} + rootNode = Node(workspace) + node_dict[workspace] = rootNode + rootNode.custom_property = workspace_icon + " " + + for i, r in dfR.iterrows(): + datasetName = r["Dataset Name"] + reportName = r["Report Name"] + parentNode = node_dict.get(datasetName) + if parentNode is None: + parentNode = Node(datasetName, parent=rootNode) + node_dict[datasetName] = parentNode + parentNode.custom_property = dataset_icon + " " + + child_node = Node(reportName, parent=parentNode) + child_node.custom_property = report_icon + " " + + # Print the tree structure + for pre, _, node in RenderTree(node_dict[workspace]): + print(f"{pre}{node.custom_property}'{node.name}'") + + +@log +def export_report( + report: str, + export_format: str, + file_name: Optional[str] = None, + bookmark_name: Optional[str] = None, + page_name: Optional[str] = None, + visual_name: Optional[str] = None, + report_filter: Optional[str] = None, + workspace: Optional[str] = None, +): + """ + Exports a Power BI report to a file in your lakehouse. + + Parameters + ---------- + report : str + Name of the Power BI report. + export_format : str + The format in which to export the report. See this link for valid formats: https://learn.microsoft.com/rest/api/power-bi/reports/export-to-file-in-group#fileformat. For image formats, enter the file extension in this parameter, not 'IMAGE'. + file_name : str, default=None + The name of the file to be saved within the lakehouse. Do not include the file extension. Defaults ot the reportName parameter value. + bookmark_name : str, default=None + The name (GUID) of a bookmark within the report. + page_name : str, default=None + The name (GUID) of the report page. + visual_name : str, default=None + The name (GUID) of a visual. If you specify this parameter you must also specify the page_name parameter. + report_filter : str, default=None + A report filter to be applied when exporting the report. Syntax is user-friendly. See above for examples. + workspace : str, default=None + The Fabric workspace name. + Defaults to None which resolves to the workspace of the attached lakehouse + or if no lakehouse attached, resolves to the workspace of the notebook. + """ + + # https://learn.microsoft.com/rest/api/power-bi/reports/export-to-file-in-group + + lakeAttach = lakehouse_attached() + + if lakeAttach == False: + print( + f"{icons.red_dot} In order to run the 'export_report' function, a lakehouse must be attached to the notebook. Please attach a lakehouse to this notebook." 
+ ) + return + + if workspace == None: + workspace_id = fabric.get_workspace_id() + workspace = fabric.resolve_workspace_name(workspace_id) + else: + workspace_id = fabric.resolve_workspace_id(workspace) + + if isinstance(page_name, str): + page_name = [page_name] + if isinstance(visual_name, str): + visual_name = [visual_name] + + if bookmark_name is not None and (page_name is not None or visual_name is not None): + print( + f"{icons.red_dot} If the 'bookmark_name' parameter is set, the 'page_name' and 'visual_name' parameters must not be set." + ) + return + if visual_name is not None and page_name is None: + print( + f"{icons.red_dot} If the 'visual_name' parameter is set, the 'page_name' parameter must be set." + ) + return + + validFormats = { + "ACCESSIBLEPDF": ".pdf", + "CSV": ".csv", + "DOCX": ".docx", + "MHTML": ".mhtml", + "PDF": ".pdf", + "PNG": ".png", + "PPTX": ".pptx", + "XLSX": ".xlsx", + "XML": ".xml", + "BMP": ".bmp", + "EMF": ".emf", + "GIF": ".gif", + "JPEG": ".jpeg", + "TIFF": ".tiff", + } + + export_format = export_format.upper() + if export_format not in validFormats: + print( + f"{icons.red_dot} The '{export_format}' format is not a valid format for exporting Power BI reports. Please enter a valid format. Options: {validFormats}" + ) + return + + fileExt = validFormats.get(export_format) + + if file_name == None: + file_name = report + fileExt + else: + file_name = file_name + fileExt + + folderPath = "/lakehouse/default/Files" + filePath = os.path.join(folderPath, file_name) + + dfI = fabric.list_items(workspace=workspace) + dfI_filt = dfI[ + (dfI["Type"].isin(["Report", "PaginatedReport"])) + & (dfI["Display Name"] == report) + ] + + if len(dfI_filt) == 0: + print( + f"{icons.red_dot} The '{report}' report does not exist in the '{workspace}' workspace." + ) + return + + reportType = dfI_filt["Type"].iloc[0] + + # Limitations + pbiOnly = ["PNG"] + paginatedOnly = [ + "ACCESSIBLEPDF", + "CSV", + "DOCX", + "BMP", + "EMF", + "GIF", + "JPEG", + "TIFF", + "MHTML", + "XLSX", + "XML", + ] + + if reportType == "Report" and export_format in paginatedOnly: + print( + f"{icons.red_dot} The '{export_format}' format is only supported for paginated reports." + ) + return + if reportType == "PaginatedReport" and export_format in pbiOnly: + print( + f"{icons.red_dot} The '{export_format}' format is only supported for Power BI reports." + ) + return + + if reportType == "PaginatedReport" and ( + bookmark_name is not None or page_name is not None or visual_name is not None + ): + print( + f"{icons.red_dot} Export for paginated reports does not support bookmarks/pages/visuals. Those parameters must not be set for paginated reports." 
+ ) + return + + reportId = dfI_filt["Id"].iloc[0] + client = fabric.PowerBIRestClient() + + dfVisual = list_report_visuals(report=report, workspace=workspace) + dfPage = list_report_pages(report=report, workspace=workspace) + + if ( + export_format in ["BMP", "EMF", "GIF", "JPEG", "TIFF"] + and reportType == "PaginatedReport" + ): + request_body = { + "format": "IMAGE", + "paginatedReportConfiguration": { + "formatSettings": {"OutputFormat": export_format.lower()} + }, + } + elif bookmark_name is None and page_name is None and visual_name is None: + request_body = {"format": export_format} + elif bookmark_name is not None: + if reportType == "Report": + request_body = { + "format": export_format, + "powerBIReportConfiguration": { + "defaultBookmark": {"name": bookmark_name} + }, + } + elif page_name is not None and visual_name is None: + if reportType == "Report": + request_body = {"format": export_format, "powerBIReportConfiguration": {}} + + request_body["powerBIReportConfiguration"]["pages"] = [] + + for page in page_name: + dfPage_filt = dfPage[dfPage["Page ID"] == page] + if len(dfPage_filt) == 0: + print( + f"{icons.red_dot} The '{page}' page does not exist in the '{report}' report within the '{workspace}' workspace." + ) + return + page_dict = {"pageName": page} + request_body["powerBIReportConfiguration"]["pages"].append(page_dict) + + elif page_name is not None and visual_name is not None: + if len(page_name) != len(visual_name): + print( + f"{icons.red_dot} Each 'visual_name' must map to a single 'page_name'." + ) + return + if reportType == "Report": + request_body = {"format": export_format, "powerBIReportConfiguration": {}} + + request_body["powerBIReportConfiguration"]["pages"] = [] + a = 0 + for page in page_name: + visual = visual_name[a] + dfVisual_filt = dfVisual[ + (dfVisual["Page ID"] == page) & (dfVisual["Visual ID"] == visual) + ] + if len(dfVisual_filt) == 0: + print( + f"{icons.red_dot} The '{visual}' visual does not exist on the '{page}' in the '{report}' report within the '{workspace}' workspace." + ) + return + page_dict = {"pageName": page, "visualName": visual} + request_body["powerBIReportConfiguration"]["pages"].append(page_dict) + a += 1 + + # Transform and add report filter if it is specified + if report_filter is not None and reportType == "Report": + reportFilter = generate_embedded_filter(filter=report_filter) + report_level_filter = {"filter": reportFilter} + + if "powerBIReportConfiguration" not in request_body: + request_body["powerBIReportConfiguration"] = {} + request_body["powerBIReportConfiguration"]["reportLevelFilters"] = [ + report_level_filter + ] + print(request_body) + response = client.post( + f"/v1.0/myorg/groups/{workspace_id}/reports/{reportId}/ExportTo", + json=request_body, + ) + if response.status_code == 202: + response_body = json.loads(response.content) + exportId = response_body["id"] + response = client.get( + f"/v1.0/myorg/groups/{workspace_id}/reports/{reportId}/exports/{exportId}" + ) + response_body = json.loads(response.content) + while response_body["status"] not in ["Succeeded", "Failed"]: + time.sleep(3) + response = client.get( + f"/v1.0/myorg/groups/{workspace_id}/reports/{reportId}/exports/{exportId}" + ) + response_body = json.loads(response.content) + if response_body["status"] == "Failed": + print( + f"{icons.red_dot} The export for the '{report}' report within the '{workspace}' workspace in the '{export_format}' format has failed." 
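+                # the loop above polls the export status every few seconds until the service
+                # reports Succeeded or Failed; only the Failed case reaches this message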
+ ) + else: + response = client.get( + f"/v1.0/myorg/groups/{workspace_id}/reports/{reportId}/exports/{exportId}/file" + ) + print( + f"{icons.in_progress} Saving the '{export_format}' export for the '{report}' report within the '{workspace}' workspace to the lakehouse..." + ) + with open(filePath, "wb") as export_file: + export_file.write(response.content) + print( + f"{icons.green_dot} The '{export_format}' export for the '{report}' report within the '{workspace}' workspace has been saved to the following location: '{filePath}'." + ) + + +def clone_report( + report: str, + cloned_report: str, + workspace: Optional[str] = None, + target_workspace: Optional[str] = None, + target_dataset: Optional[str] = None, +): + """ + Clones a Power BI report. + + Parameters + ---------- + report : str + Name of the Power BI report. + cloned_report : str + Name of the new Power BI report. + workspace : str, default=None + The Fabric workspace name. + Defaults to None which resolves to the workspace of the attached lakehouse + or if no lakehouse attached, resolves to the workspace of the notebook. + target_workspace : str, default=None + The name of the Fabric workspace to place the cloned report. + Defaults to None which resolves to the workspace of the attached lakehouse + or if no lakehouse attached, resolves to the workspace of the notebook. + target_dataset : str, default=None + The name of the semantic model to be used by the cloned report. + Defaults to None which resolves to the semantic model used by the initial report. + """ + + # https://learn.microsoft.com/rest/api/power-bi/reports/clone-report-in-group + + if workspace == None: + workspace_id = fabric.get_workspace_id() + workspace = fabric.resolve_workspace_name(workspace_id) + else: + workspace_id = fabric.resolve_workspace_id(workspace) + + dfI = fabric.list_items(workspace=workspace, type="Report") + dfI_filt = dfI[(dfI["Display Name"] == report)] + + if len(dfI_filt) == 0: + print( + f"{icons.red_dot} The '{report}' report does not exist within the '{workspace}' workspace." + ) + return + + reportId = resolve_report_id(report, workspace) + + if target_workspace is None: + target_workspace = workspace + target_workspace_id = workspace_id + else: + dfW = fabric.list_workspaces() + dfW_filt = dfW[dfW["Name"] == target_workspace] + + if len(dfW_filt) == 0: + print(f"{icons.red_dot} The '{workspace}' is not a valid workspace.") + return + target_workspace_id = dfW_filt["Id"].iloc[0] + + if target_dataset == None: + dfR = fabric.list_reports(workspace=target_workspace) + dfR_filt = dfR[dfR["Name"] == report] + target_dataset_id = dfR_filt["Dataset Id"].iloc[0] + target_dataset = resolve_dataset_name( + dataset_id=target_dataset_id, workspace=target_workspace + ) + else: + dfD = fabric.list_datasets(workspace=target_workspace) + dfD_filt = dfD[dfD["Dataset Name"] == target_dataset] + + if len(dfD_filt) == 0: + print( + f"{icons.red_dot} The '{target_dataset}' target dataset does not exist in the '{target_workspace}' workspace." 
+ ) + return + target_dataset_id = dfD_filt["Dataset Id"].iloc[0] + + client = fabric.PowerBIRestClient() + + if target_workspace is None and target_dataset is None: + request_body = {"name": cloned_report} + elif target_workspace is not None and target_dataset is None: + request_body = {"name": cloned_report, "targetWorkspaceId": target_workspace_id} + elif target_workspace is not None and target_dataset is not None: + request_body = { + "name": cloned_report, + "targetModelId": target_dataset_id, + "targetWorkspaceId": target_workspace_id, + } + elif target_workspace is None and target_dataset is not None: + request_body = {"name": cloned_report, "targetModelId": target_dataset_id} + + response = client.post( + f"/v1.0/myorg/groups/{workspace_id}/reports/{reportId}/Clone", json=request_body + ) + + if response.status_code == 200: + print( + f"{icons.green_dot} The '{report}' report has been successfully cloned as the '{cloned_report}' report within the '{target_workspace}' workspace using the '{target_dataset}' semantic model." + ) + else: + print( + f"{icons.red_dot} POST request failed with status code: {response.status_code}" + ) + + +def launch_report(report: str, workspace: Optional[str] = None): + """ + Shows a Power BI report within a Fabric notebook. + + Parameters + ---------- + report : str + Name of the Power BI report. + workspace : str, default=None + The Fabric workspace name. + Defaults to None which resolves to the workspace of the attached lakehouse + or if no lakehouse attached, resolves to the workspace of the notebook. + + Returns + ------- + str + An embedded Power BI report within the notebook. + """ + + from .HelperFunctions import resolve_report_id + + if workspace == None: + workspace_id = fabric.get_workspace_id() + workspace = fabric.resolve_workspace_name(workspace_id) + else: + workspace_id = fabric.resolve_workspace_id(workspace) + + reportId = resolve_report_id(report, workspace) + + report = Report(group_id=workspace_id, report_id=reportId) + + return report + + +def list_report_pages(report: str, workspace: Optional[str] = None): + """ + Shows the properties of all pages within a Power BI report. + + Parameters + ---------- + report : str + Name of the Power BI report. + workspace : str, default=None + The Fabric workspace name. + Defaults to None which resolves to the workspace of the attached lakehouse + or if no lakehouse attached, resolves to the workspace of the notebook. + + Returns + ------- + pandas.DataFrame + A pandas dataframe showing the pages within a Power BI report and their properties. 
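+
+    Examples
+    --------
+    Illustrative call with placeholder names; the returned dataframe includes the
+    'Page Name' and 'Visual Count' columns among others:
+
+    >>> from sempy_labs.report import list_report_pages
+    >>> list_report_pages(report="Sales Report", workspace="Sales")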
+ """ + + if workspace == None: + workspace_id = fabric.get_workspace_id() + workspace = fabric.resolve_workspace_name(workspace_id) + + df = pd.DataFrame( + columns=["Page ID", "Page Name", "Hidden", "Width", "Height", "Visual Count"] + ) + + reportJson = get_report_json(report=report, workspace=workspace) + + for section in reportJson["sections"]: + pageID = section["name"] + pageName = section["displayName"] + # pageFilters = section['filters'] + pageWidth = section["width"] + pageHeight = section["height"] + visualCount = len(section["visualContainers"]) + pageHidden = False + pageConfig = section["config"] + pageConfigJson = json.loads(pageConfig) + + try: + pageH = pageConfigJson["visibility"] + if pageH == 1: + pageHidden = True + except: + pass + + new_data = { + "Page ID": pageID, + "Page Name": pageName, + "Hidden": pageHidden, + "Width": pageWidth, + "Height": pageHeight, + "Visual Count": visualCount, + } + df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) + + df["Hidden"] = df["Hidden"].astype(bool) + intCol = ["Width", "Height", "Visual Count"] + df[intCol] = df[intCol].astype(int) + + return df + + +def list_report_visuals(report: str, workspace: Optional[str] = None): + """ + Shows the properties of all visuals within a Power BI report. + + Parameters + ---------- + report : str + Name of the Power BI report. + workspace : str, default=None + The Fabric workspace name. + Defaults to None which resolves to the workspace of the attached lakehouse + or if no lakehouse attached, resolves to the workspace of the notebook. + + Returns + ------- + pandas.DataFrame + A pandas dataframe showing the visuals within a Power BI report and their properties. + """ + + if workspace == None: + workspace_id = fabric.get_workspace_id() + workspace = fabric.resolve_workspace_name(workspace_id) + + reportJson = get_report_json(report=report, workspace=workspace) + + df = pd.DataFrame(columns=["Page Name", "Page ID", "Visual ID", "Title"]) + + for section in reportJson["sections"]: + pageID = section["name"] + pageName = section["displayName"] + + for visual in section["visualContainers"]: + visualConfig = visual["config"] + visualConfigJson = json.loads(visualConfig) + visualID = visualConfigJson["name"] + + try: + title = visualConfigJson["singleVisual"]["vcObjects"]["title"][0][ + "properties" + ]["text"]["expr"]["Literal"]["Value"] + title = title[1:-1] + except: + title = "" + + new_data = { + "Page Name": pageName, + "Page ID": pageID, + "Visual ID": visualID, + "Title": title, + } + df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) + + return df + + +def list_report_bookmarks(report: str, workspace: Optional[str] = None): + """ + Shows the properties of all bookmarks within a Power BI report. + + Parameters + ---------- + report : str + Name of the Power BI report. + workspace : str, default=None + The Fabric workspace name. + Defaults to None which resolves to the workspace of the attached lakehouse + or if no lakehouse attached, resolves to the workspace of the notebook. + + Returns + ------- + pandas.DataFrame + A pandas dataframe showing the bookmarks within a Power BI report and their properties. 
+ """ + + if workspace == None: + workspace_id = fabric.get_workspace_id() + workspace = fabric.resolve_workspace_name(workspace_id) + + df = pd.DataFrame( + columns=[ + "Bookmark ID", + "Bookmark Name", + "Page ID", + "Visual ID", + "Visual Hidden", + ] + ) + + reportJson = get_report_json(report=report, workspace=workspace) + reportConfig = reportJson["config"] + reportConfigJson = json.loads(reportConfig) + + try: + for bookmark in reportConfigJson["bookmarks"]: + bID = bookmark["name"] + bName = bookmark["displayName"] + rptPageId = bookmark["explorationState"]["activeSection"] + + for rptPg in bookmark["explorationState"]["sections"]: + for vc in bookmark["explorationState"]["sections"][rptPg][ + "visualContainers" + ]: + vHidden = False + try: + hidden = bookmark["explorationState"]["sections"][rptPg][ + "visualContainers" + ][vc]["singleVisual"]["display"]["mode"] + if hidden == "hidden": + vHidden = True + except: + pass + + new_data = { + "Bookmark ID": bID, + "Bookmark Name": bName, + "Page ID": rptPageId, + "Visual ID": vc, + "Visual Hidden": vHidden, + } + df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) + + listPages = list_report_pages(report=report, workspace=workspace) + + df = pd.merge(df, listPages[["Page ID", "Page Name"]], on="Page ID", how="left") + df = df[ + [ + "Bookmark ID", + "Bookmark Name", + "Page ID", + "Page Name", + "Visual ID", + "Visual Hidden", + ] + ] + + return df + + except: + print( + f"The '{report}' report within the '{workspace}' workspace has no bookmarks." + ) + + +def translate_report_titles( + report: str, languages: Union[str, List[str]], workspace: Optional[str] = None +): + """ + Dynamically generates new Power BI reports which have report titles translated into the specified language(s). + + Parameters + ---------- + report : str + Name of the Power BI report. + languages : str, List[str] + The language code(s) in which to translate the report titles. + workspace : str, default=None + The Fabric workspace name. + Defaults to None which resolves to the workspace of the attached lakehouse + or if no lakehouse attached, resolves to the workspace of the notebook. + """ + + if isinstance(languages, str): + languages = [languages] + + for lang in languages: + language_validate(lang) + + reportJson = get_report_json(report=report, workspace=workspace) + dfV = list_report_visuals(report=report, workspace=workspace) + spark = SparkSession.builder.getOrCreate() + df = spark.createDataFrame(dfV) + columnToTranslate = "Title" + + translate = ( + Translate() + .setTextCol(columnToTranslate) + .setToLanguage(languages) + .setOutputCol("translation") + .setConcurrency(5) + ) + + transDF = ( + translate.transform(df) + .withColumn("translation", flatten(col("translation.translations"))) + .withColumn("translation", col("translation.text")) + .select("Visual ID", columnToTranslate, "translation") + ) + + df_panda = transDF.toPandas() + + i = 0 + for lang in languages: + # Clone report + language = language_validate(lang) + clonedReportName = f"{report}_{language}" + + dfRep = fabric.list_reports(workspace=workspace) + dfRep_filt = dfRep[ + (dfRep["Name"] == clonedReportName) + & (dfRep["Report Type"] == "PowerBIReport") + ] + + if len(dfRep_filt) > 0: + print( + f"{icons.yellow_dot} The '{clonedReportName}' report already exists in the '{workspace} workspace." 
+            )
+        else:
+            clone_report(
+                report=report, cloned_report=clonedReportName, workspace=workspace
+            )
+            print(
+                f"{icons.green_dot} The '{clonedReportName}' report has been created via clone in the '{workspace}' workspace."
+            )
+
+        rptJsonTr = copy.deepcopy(reportJson)
+
+        # Update report json file
+        for section in rptJsonTr["sections"]:
+            for visual in section["visualContainers"]:
+                visualConfig = visual["config"]
+                visualConfigJson = json.loads(visualConfig)
+                visualID = visualConfigJson["name"]
+
+                df_filt = df_panda[
+                    (df_panda["Visual ID"] == visualID) & (df_panda["Title"] != "")
+                ]
+
+                if len(df_filt) == 1:
+                    tr = df_filt["translation"].str[i].iloc[0]
+                    if len(tr) > 0:
+                        prop = visualConfigJson["singleVisual"]["vcObjects"]["title"][
+                            0
+                        ]["properties"]["text"]["expr"]["Literal"]
+                        prop["Value"] = f"'{tr}'"
+
+                visual["config"] = json.dumps(visualConfigJson)
+
+        i += 1
+
+        # Post updated report json file to cloned report
+        update_report_from_reportjson(
+            report=clonedReportName, report_json=rptJsonTr, workspace=workspace
+        )
+        print(
+            f"{icons.green_dot} The visual titles within the '{clonedReportName}' report within the '{workspace}' workspace have been translated into '{language}' accordingly."
+        )
diff --git a/sempy_labs/ReportRebind.py b/sempy_labs/report/_report_rebind.py
similarity index 67%
rename from sempy_labs/ReportRebind.py
rename to sempy_labs/report/_report_rebind.py
index 844cc0b1..c86fecfc 100644
--- a/sempy_labs/ReportRebind.py
+++ b/sempy_labs/report/_report_rebind.py
@@ -3,15 +3,16 @@
 from .HelperFunctions import resolve_dataset_id, resolve_report_id
 from typing import List, Optional, Union
 from sempy._utils._log import log
+import sempy_labs._icons as icons
 
-green_dot = '\U0001F7E2'
-yellow_dot = '\U0001F7E1'
-red_dot = '\U0001F534'
-in_progress = '⌛'
 
 @log
-def report_rebind(report: str, dataset: str, report_workspace: Optional[str] = None, dataset_workspace: Optional[str] = None):
-
+def report_rebind(
+    report: str,
+    dataset: str,
+    report_workspace: Optional[str] = None,
+    dataset_workspace: Optional[str] = None,
+):
     """
     Rebinds a report to a semantic model.
 
@@ -32,7 +33,7 @@ def report_rebind(report: str, dataset: str, report_workspace: Optional[str] = N
 
     Returns
     -------
-
+
    """
 
     if report_workspace == None:
@@ -41,28 +42,39 @@ def report_rebind(report: str, dataset: str, report_workspace: Optional[str] = N
     else:
         report_workspace_id = fabric.resolve_workspace_id(report_workspace)
     if dataset_workspace == None:
-        dataset_workspace = report_workspace
+        dataset_workspace = report_workspace
 
     client = fabric.PowerBIRestClient()
 
-    reportId = resolve_report_id(report = report, workspace = report_workspace)
-    datasetId = resolve_dataset_id(dataset = dataset, workspace = dataset_workspace)
+    reportId = resolve_report_id(report=report, workspace=report_workspace)
+    datasetId = resolve_dataset_id(dataset=dataset, workspace=dataset_workspace)
 
     # Prepare API
-    request_body = {
-        'datasetId': datasetId
-    }
+    request_body = {"datasetId": datasetId}
 
-    response = client.post(f"/v1.0/myorg/groups/{report_workspace_id}/reports/{reportId}/Rebind",json=request_body)
+    response = client.post(
+        f"/v1.0/myorg/groups/{report_workspace_id}/reports/{reportId}/Rebind",
+        json=request_body,
+    )
 
     if response.status_code == 200:
-        print(f"{green_dot} The '{report}' report has been successfully rebinded to the '{dataset}' semantic model.")
+        print(
+            f"{icons.green_dot} The '{report}' report has been successfully rebound to the '{dataset}' semantic model."
+ ) else: - print(f"{red_dot} The '{report}' report within the '{report_workspace}' workspace failed to rebind to the '{dataset}' semantic model within the '{dataset_workspace}' workspace.") + print( + f"{icons.red_dot} The '{report}' report within the '{report_workspace}' workspace failed to rebind to the '{dataset}' semantic model within the '{dataset_workspace}' workspace." + ) -@log -def report_rebind_all(dataset: str, new_dataset: str, dataset_workspace: Optional[str] = None, new_dataset_workpace: Optional[str] = None, report_workspace: Optional[str] = None): +@log +def report_rebind_all( + dataset: str, + new_dataset: str, + dataset_workspace: Optional[str] = None, + new_dataset_workpace: Optional[str] = None, + report_workspace: Optional[str] = None, +): """ Rebinds all reports in a workspace which are bound to a specific semantic model to a new semantic model. @@ -86,29 +98,34 @@ def report_rebind_all(dataset: str, new_dataset: str, dataset_workspace: Optiona The name of the Fabric workspace in which the report resides. Defaults to None which resolves to the workspace of the attached lakehouse or if no lakehouse attached, resolves to the workspace of the notebook. - + Returns ------- - + """ if dataset_workspace == None: dataset_workspace_id = fabric.get_workspace_id() dataset_workspace = fabric.resolve_workspace_name(dataset_workspace_id) else: - dataset_workspace_id = fabric.resolve_workspace_id(dataset_workspace) + dataset_workspace_id = fabric.resolve_workspace_id(dataset_workspace) if new_dataset_workpace == None: new_dataset_workpace = dataset_workspace if report_workspace == None: report_workspace = dataset_workspace - + datasetId = resolve_dataset_id(dataset, dataset_workspace) - dfRep = fabric.list_reports(workspace = report_workspace) - dfRep_filt = dfRep[dfRep['Dataset Id'] == datasetId] + dfRep = fabric.list_reports(workspace=report_workspace) + dfRep_filt = dfRep[dfRep["Dataset Id"] == datasetId] for i, r in dfRep_filt.iterrows(): - rptName = r['Name'] - report_rebind(report = rptName, dataset = new_dataset, report_workspace = report_workspace, dataset_workspace = new_dataset_workpace) \ No newline at end of file + rptName = r["Name"] + report_rebind( + report=rptName, + dataset=new_dataset, + report_workspace=report_workspace, + dataset_workspace=new_dataset_workpace, + ) diff --git a/sempy_labs/shortcuts.py b/sempy_labs/shortcuts.py index 8a246bff..9be99197 100644 --- a/sempy_labs/shortcuts.py +++ b/sempy_labs/shortcuts.py @@ -1,16 +1,19 @@ import sempy_labs import sempy.fabric as fabric import pandas as pd -from .HelperFunctions import resolve_lakehouse_name, resolve_lakehouse_id +from ._helper_functions import resolve_lakehouse_name, resolve_lakehouse_id from typing import List, Optional, Union +import sempy_labs._icons as icons -green_dot = '\U0001F7E2' -yellow_dot = '\U0001F7E1' -red_dot = '\U0001F534' -in_progress = '⌛' - -def create_shortcut_onelake(table_name: str, source_lakehouse: str, source_workspace: str, destination_lakehouse: str, destination_workspace: Optional[str] = None, shortcut_name: Optional[str] = None): +def create_shortcut_onelake( + table_name: str, + source_lakehouse: str, + source_workspace: str, + destination_lakehouse: str, + destination_workspace: Optional[str] = None, + shortcut_name: Optional[str] = None, +): """ Creates a [shortcut](https://learn.microsoft.com/fabric/onelake/onelake-shortcuts) to a delta table in OneLake. 
@@ -28,12 +31,12 @@ def create_shortcut_onelake(table_name: str, source_lakehouse: str, source_works The name of the Fabric workspace in which the shortcut will be created. Defaults to None which resolves to the workspace of the attached lakehouse or if no lakehouse attached, resolves to the workspace of the notebook. - shortcut_name : str, default=None - The name of the shortcut 'table' to be created. This defaults to the 'table_name' parameter value. - + shortcut_name : str, default=None + The name of the shortcut 'table' to be created. This defaults to the 'table_name' parameter value. + Returns ------- - + """ sourceWorkspaceId = fabric.resolve_workspace_id(source_workspace) @@ -41,38 +44,56 @@ def create_shortcut_onelake(table_name: str, source_lakehouse: str, source_works if destination_workspace == None: destination_workspace = source_workspace - + destinationWorkspaceId = fabric.resolve_workspace_id(destination_workspace) - destinationLakehouseId = resolve_lakehouse_id(destination_lakehouse, destination_workspace) + destinationLakehouseId = resolve_lakehouse_id( + destination_lakehouse, destination_workspace + ) if shortcut_name == None: shortcut_name = table_name - + client = fabric.FabricRestClient() - tablePath = 'Tables/' + table_name + tablePath = "Tables/" + table_name request_body = { - "path": 'Tables', - "name": shortcut_name.replace(' ',''), - "target": { - "oneLake": { - "workspaceId": sourceWorkspaceId, - "itemId": sourceLakehouseId, - "path": tablePath} - } + "path": "Tables", + "name": shortcut_name.replace(" ", ""), + "target": { + "oneLake": { + "workspaceId": sourceWorkspaceId, + "itemId": sourceLakehouseId, + "path": tablePath, + } + }, } try: - response = client.post(f"/v1/workspaces/{destinationWorkspaceId}/items/{destinationLakehouseId}/shortcuts",json=request_body) + response = client.post( + f"/v1/workspaces/{destinationWorkspaceId}/items/{destinationLakehouseId}/shortcuts", + json=request_body, + ) if response.status_code == 201: - print(f"{green_dot} The shortcut '{shortcut_name}' was created in the '{destination_lakehouse}' lakehouse within the '{destination_workspace} workspace. It is based on the '{table_name}' table in the '{source_lakehouse}' lakehouse within the '{source_workspace}' workspace.") + print( + f"{icons.green_dot} The shortcut '{shortcut_name}' was created in the '{destination_lakehouse}' lakehouse within the '{destination_workspace} workspace. It is based on the '{table_name}' table in the '{source_lakehouse}' lakehouse within the '{source_workspace}' workspace." + ) else: print(response.status_code) except Exception as e: - print(f"{red_dot} Failed to create a shortcut for the '{table_name}' table: {e}") - -def create_shortcut(shortcut_name: str, location: str, subpath: str, source: str, connection_id: str, lakehouse: Optional[str] = None, workspace: Optional[str] = None): - + print( + f"{icons.red_dot} Failed to create a shortcut for the '{table_name}' table: {e}" + ) + + +def create_shortcut( + shortcut_name: str, + location: str, + subpath: str, + source: str, + connection_id: str, + lakehouse: Optional[str] = None, + workspace: Optional[str] = None, +): """ Creates a [shortcut](https://learn.microsoft.com/fabric/onelake/onelake-shortcuts) to an ADLS Gen2 or Amazon S3 source. @@ -88,22 +109,21 @@ def create_shortcut(shortcut_name: str, location: str, subpath: str, source: str workspace : str, default=None The name of the Fabric workspace in which the shortcut will be created. 
Defaults to None which resolves to the workspace of the attached lakehouse - or if no lakehouse attached, resolves to the workspace of the notebook. - + or if no lakehouse attached, resolves to the workspace of the notebook. + Returns ------- - + """ - source_titles = { - 'adlsGen2': 'ADLS Gen2', - 'amazonS3': 'Amazon S3' - } + source_titles = {"adlsGen2": "ADLS Gen2", "amazonS3": "Amazon S3"} sourceValues = list(source_titles.keys()) if source not in sourceValues: - print(f"{red_dot} The 'source' parameter must be one of these values: {sourceValues}.") + print( + f"{icons.red_dot} The 'source' parameter must be one of these values: {sourceValues}." + ) return sourceTitle = source_titles[source] @@ -118,32 +138,40 @@ def create_shortcut(shortcut_name: str, location: str, subpath: str, source: str lakehouse_id = fabric.get_lakehouse_id() else: lakehouse_id = resolve_lakehouse_id(lakehouse, workspace) - + client = fabric.FabricRestClient() - shortcutActualName = shortcut_name.replace(' ','') + shortcutActualName = shortcut_name.replace(" ", "") request_body = { - "path": 'Tables', - "name": shortcutActualName, - "target": { - source: { - "location": location, - "subpath": subpath, - "connectionId": connection_id} - } + "path": "Tables", + "name": shortcutActualName, + "target": { + source: { + "location": location, + "subpath": subpath, + "connectionId": connection_id, + } + }, } try: - response = client.post(f"/v1/workspaces/{workspace_id}/items/{lakehouse_id}/shortcuts",json=request_body) + response = client.post( + f"/v1/workspaces/{workspace_id}/items/{lakehouse_id}/shortcuts", + json=request_body, + ) if response.status_code == 201: - print(f"{green_dot} The shortcut '{shortcutActualName}' was created in the '{lakehouse}' lakehouse within the '{workspace} workspace. It is based on the '{subpath}' table in '{sourceTitle}'.") + print( + f"{icons.green_dot} The shortcut '{shortcutActualName}' was created in the '{lakehouse}' lakehouse within the '{workspace} workspace. It is based on the '{subpath}' table in '{sourceTitle}'." + ) else: print(response.status_code) except: - print(f"{red_dot} Failed to create a shortcut for the '{shortcut_name}' table.") + print( + f"{icons.red_dot} Failed to create a shortcut for the '{shortcut_name}' table." + ) -def list_shortcuts(lakehouse: Optional[str] = None, workspace: Optional[str] = None): +def list_shortcuts(lakehouse: Optional[str] = None, workspace: Optional[str] = None): """ Shows all shortcuts which exist in a Fabric lakehouse. @@ -155,8 +183,8 @@ def list_shortcuts(lakehouse: Optional[str] = None, workspace: Optional[str] = N workspace : str, default=None The name of the Fabric workspace in which lakehouse resides. Defaults to None which resolves to the workspace of the attached lakehouse - or if no lakehouse attached, resolves to the workspace of the notebook. - + or if no lakehouse attached, resolves to the workspace of the notebook. 
+ Returns ------- pandas.DataFrame @@ -168,42 +196,79 @@ def list_shortcuts(lakehouse: Optional[str] = None, workspace: Optional[str] = N workspace = fabric.resolve_workspace_name(workspace_id) else: workspace_id = fabric.resolve_workspace_id(workspace) - + if lakehouse == None: lakehouse_id = fabric.get_lakehouse_id() lakehouse = resolve_lakehouse_name(lakehouse_id, workspace) else: lakehouse_id = resolve_lakehouse_id(lakehouse, workspace) - df = pd.DataFrame(columns=['Shortcut Name', 'Shortcut Path', 'Source', 'Source Lakehouse Name', 'Source Workspace Name', 'Source Path', 'Source Connection ID', 'Source Location', 'Source SubPath']) + df = pd.DataFrame( + columns=[ + "Shortcut Name", + "Shortcut Path", + "Source", + "Source Lakehouse Name", + "Source Workspace Name", + "Source Path", + "Source Connection ID", + "Source Location", + "Source SubPath", + ] + ) client = fabric.FabricRestClient() - response = client.get(f"/v1/workspaces/{workspace_id}/items/{lakehouse_id}/shortcuts") + response = client.get( + f"/v1/workspaces/{workspace_id}/items/{lakehouse_id}/shortcuts" + ) if response.status_code == 200: - for s in response.json()['value']: - shortcutName = s['name'] - shortcutPath = s['path'] - source = list(s['target'].keys())[0] - sourceLakehouseName, sourceWorkspaceName, sourcePath, connectionId, location, subpath = None, None, None, None, None, None - if source == 'oneLake': - sourceLakehouseId = s['target'][source]['itemId'] - sourcePath = s['target'][source]['path'] - sourceWorkspaceId = s['target'][source]['workspaceId'] + for s in response.json()["value"]: + shortcutName = s["name"] + shortcutPath = s["path"] + source = list(s["target"].keys())[0] + ( + sourceLakehouseName, + sourceWorkspaceName, + sourcePath, + connectionId, + location, + subpath, + ) = (None, None, None, None, None, None) + if source == "oneLake": + sourceLakehouseId = s["target"][source]["itemId"] + sourcePath = s["target"][source]["path"] + sourceWorkspaceId = s["target"][source]["workspaceId"] sourceWorkspaceName = fabric.resolve_workspace_name(sourceWorkspaceId) - sourceLakehouseName = resolve_lakehouse_name(sourceLakehouseId, sourceWorkspaceName) + sourceLakehouseName = resolve_lakehouse_name( + sourceLakehouseId, sourceWorkspaceName + ) else: - connectionId = s['target'][source]['connectionId'] - location = s['target'][source]['location'] - subpath = s['target'][source]['subpath'] - - new_data = {'Shortcut Name': shortcutName, 'Shortcut Path': shortcutPath, 'Source': source, 'Source Lakehouse Name': sourceLakehouseName, 'Source Workspace Name': sourceWorkspaceName, 'Source Path': sourcePath, 'Source Connection ID': connectionId, 'Source Location': location, 'Source SubPath': subpath} - df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) - - print(f"This function relies on an API which is not yet official as of May 21, 2024. 
Once the API becomes official this function will work as expected.") + connectionId = s["target"][source]["connectionId"] + location = s["target"][source]["location"] + subpath = s["target"][source]["subpath"] + + new_data = { + "Shortcut Name": shortcutName, + "Shortcut Path": shortcutPath, + "Source": source, + "Source Lakehouse Name": sourceLakehouseName, + "Source Workspace Name": sourceWorkspaceName, + "Source Path": sourcePath, + "Source Connection ID": connectionId, + "Source Location": location, + "Source SubPath": subpath, + } + df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True) + + print( + f"This function relies on an API which is not yet official as of May 21, 2024. Once the API becomes official this function will work as expected." + ) return df -def delete_shortcut(shortcut_name: str, lakehouse: Optional[str] = None, workspace: Optional[str] = None): +def delete_shortcut( + shortcut_name: str, lakehouse: Optional[str] = None, workspace: Optional[str] = None +): """ Deletes a shortcut. @@ -217,19 +282,19 @@ def delete_shortcut(shortcut_name: str, lakehouse: Optional[str] = None, workspa workspace : str, default=None The name of the Fabric workspace in which lakehouse resides. Defaults to None which resolves to the workspace of the attached lakehouse - or if no lakehouse attached, resolves to the workspace of the notebook. - + or if no lakehouse attached, resolves to the workspace of the notebook. + Returns ------- - - """ + + """ if workspace == None: workspace_id = fabric.get_workspace_id() workspace = fabric.resolve_workspace_name(workspace_id) else: workspace_id = fabric.resolve_workspace_id(workspace) - + if lakehouse == None: lakehouse_id = fabric.get_lakehouse_id() lakehouse = resolve_lakehouse_name(lakehouse_id, workspace) @@ -237,9 +302,13 @@ def delete_shortcut(shortcut_name: str, lakehouse: Optional[str] = None, workspa lakehouse_id = resolve_lakehouse_id(lakehouse, workspace) client = fabric.FabricRestClient() - response = client.delete(f"/v1/workspaces/{workspace_id}/items/{lakehouse_id}/shortcuts/Tables/{shortcut_name}") - + response = client.delete( + f"/v1/workspaces/{workspace_id}/items/{lakehouse_id}/shortcuts/Tables/{shortcut_name}" + ) + if response.status_code == 200: - print(f"{green_dot} The '{shortcut_name}' shortcut in the '{lakehouse}' within the '{workspace}' workspace has been deleted.") + print( + f"{icons.green_dot} The '{shortcut_name}' shortcut in the '{lakehouse}' within the '{workspace}' workspace has been deleted." + ) else: - print(f"{red_dot} The '{shortcut_name}' has not been deleted.") \ No newline at end of file + print(f"{icons.red_dot} The '{shortcut_name}' has not been deleted.")
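
Example usage (a sketch, not part of the patch itself): the calls below exercise the rebinding and shortcut helpers defined in the hunks above. All workspace, report, semantic model, lakehouse and table names are hypothetical placeholders, and the report imports assume the private module path introduced by this patch, since the contents of the new report/__init__.py are not shown here.

# Usage sketch for the helpers changed above; names are hypothetical placeholders.
from sempy_labs.report._report_rebind import report_rebind, report_rebind_all
from sempy_labs.shortcuts import (
    create_shortcut_onelake,
    delete_shortcut,
    list_shortcuts,
)

# Rebind a single report to a different semantic model in the same workspace.
report_rebind(
    report="Sales Report",            # hypothetical report name
    dataset="Sales Model v2",         # hypothetical semantic model name
    report_workspace="Analytics",     # hypothetical workspace name
    dataset_workspace="Analytics",
)

# Rebind every report currently bound to the old model to the new model.
report_rebind_all(
    dataset="Sales Model",
    new_dataset="Sales Model v2",
    dataset_workspace="Analytics",
    new_dataset_workpace="Analytics",  # parameter name as spelled in the code above
    report_workspace="Analytics",
)

# Create a OneLake shortcut to a delta table that lives in another lakehouse.
create_shortcut_onelake(
    table_name="DimDate",
    source_lakehouse="SourceLakehouse",
    source_workspace="Analytics",
    destination_lakehouse="TargetLakehouse",
    destination_workspace="Analytics",
    shortcut_name="DimDate",
)

# Inspect the shortcuts in the destination lakehouse (note the preview-API caveat
# printed by list_shortcuts), then remove the shortcut that was just created.
shortcuts_df = list_shortcuts(lakehouse="TargetLakehouse", workspace="Analytics")
print(shortcuts_df[["Shortcut Name", "Source", "Source Path"]])

delete_shortcut(
    shortcut_name="DimDate",
    lakehouse="TargetLakehouse",
    workspace="Analytics",
)

Because delete_shortcut targets the Tables/ path of the lakehouse, the cleanup call refers to the shortcut by the same name used when it was created.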