diff --git a/helper/_resources/config/redshift/dt_mappings.json b/helper/_resources/config/redshift/dt_mappings.json
new file mode 100644
index 0000000..f76a6ef
--- /dev/null
+++ b/helper/_resources/config/redshift/dt_mappings.json
@@ -0,0 +1,27 @@
+{
+  "VARCHAR": {"source_name": "VARCHAR", "target_name": "STRING"},
+  "CHAR": {"source_name": "CHAR", "target_name": "STRING"},
+  "CHARACTER": {"source_name": "CHARACTER", "target_name": "STRING"},
+  "TEXT": {"source_name": "TEXT", "target_name": "STRING"},
+  "BPCHAR": {"source_name": "BPCHAR", "target_name": "STRING"},
+  "NCHAR": {"source_name": "NCHAR", "target_name": "STRING"},
+  "NVARCHAR": {"source_name": "NVARCHAR", "target_name": "STRING"},
+  "VARBYTE": {"source_name": "VARBYTE", "target_name": "BINARY"},
+  "BINARY VARYING": {"source_name": "BINARY VARYING", "target_name": "BINARY"},
+  "VARBINARY": {"source_name": "VARBINARY", "target_name": "BINARY"},
+  "INT2": {"source_name": "INT2", "target_name": "SMALLINT"},
+  "INT4": {"source_name": "INT4", "target_name": "INT"},
+  "INT8": {"source_name": "INT8", "target_name": "BIGINT"},
+  "NUMBER": {"source_name": "NUMBER", "target_name": "NUMERIC"},
+  "BYTEINT": {"source_name": "BYTEINT", "target_name": "BYTE"},
+  "FLOAT4": {"source_name": "FLOAT4", "target_name": "FLOAT"},
+  "FLOAT8": {"source_name": "FLOAT8", "target_name": "DOUBLE"},
+  "DOUBLE PRECISION": {"source_name": "DOUBLE PRECISION", "target_name": "DOUBLE"},
+  "TIME": {"source_name": "TIME", "target_name": "TIMESTAMP"},
+  "TIME WITHOUT TIME ZONE": {"source_name": "TIME WITHOUT TIME ZONE", "target_name": "TIMESTAMP"},
+  "TIME WITH TIME ZONE": {"source_name": "TIME WITH TIME ZONE", "target_name": "TIMESTAMP"},
+  "TIMETZ": {"source_name": "TIMETZ", "target_name": "TIMESTAMP"},
+  "TIMESTAMP WITHOUT TIME ZONE": {"source_name": "TIMESTAMP WITHOUT TIME ZONE", "target_name": "TIMESTAMP"},
+  "TIMESTAMPTZ": {"source_name": "TIMESTAMPTZ", "target_name": "TIMESTAMP"},
+  "TIMESTAMP WITH TIME ZONE": {"source_name": "TIMESTAMP WITH TIME ZONE", "target_name": "TIMESTAMP"}
+}
\ No newline at end of file
diff --git a/helper/_resources/config/snowflake/dt_mappings.json b/helper/_resources/config/snowflake/dt_mappings.json
new file mode 100644
index 0000000..3cd8edd
--- /dev/null
+++ b/helper/_resources/config/snowflake/dt_mappings.json
@@ -0,0 +1,20 @@
+{
+  "VARCHAR": {"source_name": "VARCHAR", "target_name": "STRING"},
+  "CHAR": {"source_name": "CHAR", "target_name": "STRING"},
+  "CHARACTER": {"source_name": "CHARACTER", "target_name": "STRING"},
+  "TEXT": {"source_name": "TEXT", "target_name": "STRING"},
+  "VARBINARY": {"source_name": "VARBINARY", "target_name": "BINARY"},
+  "NUMBER": {"source_name": "NUMBER", "target_name": "NUMERIC"},
+  "BYTEINT": {"source_name": "BYTEINT", "target_name": "BYTE"},
+  "FLOAT": {"source_name": "FLOAT", "target_name": "DOUBLE"},
+  "FLOAT4": {"source_name": "FLOAT4", "target_name": "DOUBLE"},
+  "FLOAT8": {"source_name": "FLOAT8", "target_name": "DOUBLE"},
+  "DOUBLE PRECISION": {"source_name": "DOUBLE PRECISION", "target_name": "DOUBLE"},
+  "REAL": {"source_name": "REAL", "target_name": "DOUBLE"},
+  "TIME": {"source_name": "TIME", "target_name": "TIMESTAMP"},
+  "DATETIME": {"source_name": "DATETIME", "target_name": "TIMESTAMP"},
+  "TIMESTAMP_LTZ": {"source_name": "TIMESTAMP_LTZ", "target_name": "TIMESTAMP"},
+  "TIMESTAMP_NTZ": {"source_name": "TIMESTAMP_NTZ", "target_name": "TIMESTAMP"},
+  "TIMESTAMP_TZ": {"source_name": "TIMESTAMP_TZ", "target_name": "TIMESTAMP"},
+  "OBJECT": {"source_name": "OBJECT", "target_name": "STRUCT"}
+}
\ No newline at end of file
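Note on the two mapping files above: each entry's top-level key duplicates its `source_name`, so the two values can silently drift if only one side is edited later. A minimal sanity-check sketch, assuming it runs from the repo root (the paths are the two files added in this diff):

```python
import json
from pathlib import Path

# Hypothetical consistency check for the new mapping files: every top-level
# key should equal its entry's "source_name", and "target_name" must be set.
for db in ("redshift", "snowflake"):
    path = Path("helper/_resources/config") / db / "dt_mappings.json"
    for key, entry in json.loads(path.read_text()).items():
        assert key == entry["source_name"], f"{db}: key {key!r} != {entry['source_name']!r}"
        assert entry["target_name"], f"{db}: {key!r} is missing a target_name"
```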
"STRUCT"} +} \ No newline at end of file diff --git a/helper/convert_to_databricks.py b/helper/convert_to_databricks.py index 430a754..b7f455d 100644 --- a/helper/convert_to_databricks.py +++ b/helper/convert_to_databricks.py @@ -715,7 +715,31 @@ def syntax_chunk(syntax_map, content, results_dict = {}): else: print(f"No syntax values to parse: {syntax_map}. Skipping. ") - return content, results_dict + return content, results_dict + + def dtchunkuno(content, dtdict): + for key, value in dtdict.items(): + content = dtconvertuno(content, value.get("source_name"), value.get("target_name")) + return content + + + def dtchunkdos(content, dtdict): + for key, value in dtdict.items(): + content = dtconvertdos(content, value.get("source_name"), value.get("target_name")) + return content + + def dtconvertuno(content, dtinput, dtoutput): + cast_pattern = r'(CAST\(\s*[^)]+\s+AS\s+){}(\s*\))'.format(dtinput) + replacement_pattern = r'\1{}\2'.format(dtoutput) + updated_content = re.sub(cast_pattern, replacement_pattern, content, flags= re.DOTALL | re.IGNORECASE) + return updated_content + + def dtconvertdos(content, dtinput, dtoutput): + cast_pattern = r'::{}'.format(dtinput) + replacement_pattern = r'::{}'.format(dtoutput) + updated_content = re.sub(cast_pattern, replacement_pattern, content, flags= re.DOTALL | re.IGNORECASE) + return updated_content + with open(full_path, 'r+') as file: content = file.read() @@ -741,8 +765,14 @@ def syntax_chunk(syntax_map, content, results_dict = {}): content, converted_functions = functions_chunk(function_map=function_map, content=content, results_dict=converted_functions) else: raise(NotImplementedError(f"Incorrect Parse First Parameter Provided {parse_first}. Shoudl be syntax or functions")) - - + + current_script = Path(__file__).resolve() + parent_directory = current_script.parent + file_path = parent_directory / '_resources' / 'config' / sourcedb / 'dt_mappings.json' + with open(file_path, 'r') as file: + dtmap = json.load(file) + contentuno = dtchunkuno(content, dtmap) + contentdos = dtchunkdos(contentuno, dtmap) ## Instead of writing data in place, lets write it to a new model subfolder ./databricks/ # Write content to the new file @@ -766,7 +796,7 @@ def syntax_chunk(syntax_map, content, results_dict = {}): # Define the new file path new_file_path = new_dir / original_path.name with open(new_file_path, 'w') as file: - file.write(content) + file.write(contentdos) return (full_path, converted_functions, converted_syntax, parsed_discovery) ## Return list of functions that converted diff --git a/helper/pierunner.py.py b/helper/pierunner.py.py index fdaf4ed..5524d6a 100644 --- a/helper/pierunner.py.py +++ b/helper/pierunner.py.py @@ -25,7 +25,17 @@ # COMMAND ---------- # MAGIC %sh -# MAGIC python3 ./convert_to_databricks.py --sourcedb "redshift" --parse_mode 'syntax' --customdp "true" +# MAGIC python3 ./convert_to_databricks.py --sourcedb "redshift" --dir_path "redshift/" --parse_mode 'syntax' --customdp "true" + +# COMMAND ---------- + +# MAGIC %sh +# MAGIC python3 ./convert_to_databricks.py --sourcedb "snowflake" --dir_path "snowflake/" --parse_mode 'syntax' --customdp "true" + +# COMMAND ---------- + +# MAGIC %sh +# MAGIC python3 ./convert_to_databricks.py --sourcedb "redshift" --dir_path "redshift/" --parse_mode 'syntax' --customdp "true" # COMMAND ---------- diff --git a/models/redshift/customerrs.sql b/models/redshift/customerrs.sql index 651d47d..ddb31e7 100644 --- a/models/redshift/customerrs.sql +++ b/models/redshift/customerrs.sql @@ -33,6 +33,15 @@ 
diff --git a/models/redshift/customerrs.sql b/models/redshift/customerrs.sql
index 651d47d..ddb31e7 100644
--- a/models/redshift/customerrs.sql
+++ b/models/redshift/customerrs.sql
@@ -33,6 +33,15 @@ select
 GETDATE AS get_date_caps_test,
 sysdate() AS sys_date_col_test,
 SYSDATE() AS sys_date_caps_col_test,
+Name,
+Age,
+CAST(ID AS TEXT) as id_text,
+created_date::TIME,
+CAST(file_dump AS NUMERIC) as file_dump_numeric,
+COALESCE(col1::FLOAT,col2::FLOAT8,col3::INT2) AS xyz
+FROM catalog.schema.table1
+WHERE colA = colB::CHAR
+AND somethingelse = 1
 ISNULL(test, test_is_null) AS null_test_col_caps,
 ISNULL(test, test_is_null) AS null_test_col_caps,
 isnull(test, 'test_is_null') AS null_test_col,
diff --git a/models/snowflake/customer.sql b/models/snowflake/customer.sql
index cdede64..b0e7e1d 100644
--- a/models/snowflake/customer.sql
+++ b/models/snowflake/customer.sql
@@ -20,10 +20,16 @@ select
 convert_timezone(test),
 to_varchar(test),
 parse_json(testuno),
-parse_json(testdos)
-
-
-
+parse_json(testdos),
+Name,
+Age,
+CAST(ID AS VARCHAR) as id_text,
+created_date::TIME,
+CAST(file_dump AS NUMBER) as file_dump_numeric,
+COALESCE(col1::FLOAT,col2::FLOAT8,col3::REAL) AS xyz
+FROM catalog.schema.table1
+WHERE colA = colB::TEXT
+AND somethingelse = 1
 from snowflake_sample_data.tpch_sf1.customer
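The new fixture lines give the converter concrete `CAST(... AS <type>)` and `::<type>` casts to rewrite. A quick way to eyeball the expected output for the redshift fixture, assuming a run from the repo root, the CAST regex from `convert_to_databricks.py` above, and the word-boundary tweak suggested earlier for the `::` form:

```python
import json
import re
from pathlib import Path

line = "CAST(ID AS TEXT) as id_text, created_date::TIME,"  # taken from customerrs.sql

dtmap = json.loads(Path("helper/_resources/config/redshift/dt_mappings.json").read_text())
for entry in dtmap.values():
    src, tgt = entry["source_name"], entry["target_name"]
    # Rewrite CAST(... AS <src>) first, then the ::<src> shorthand.
    line = re.sub(r'(CAST\(\s*[^)]+\s+AS\s+){}(\s*\))'.format(src),
                  r'\1{}\2'.format(tgt), line, flags=re.IGNORECASE)
    line = re.sub(r'::{}\b'.format(re.escape(src)), '::' + tgt, line, flags=re.IGNORECASE)

print(line)  # CAST(ID AS STRING) as id_text, created_date::TIMESTAMP,
```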