
Commit

Merge pull request #65 from rlsalcido24/dtmappings
Dtmappings
techvaquero authored Aug 2, 2024
2 parents de9e6b8 + ecf5140 commit f079333
Showing 6 changed files with 111 additions and 9 deletions.
27 changes: 27 additions & 0 deletions helper/_resources/config/redshift/dt_mappings.json
@@ -0,0 +1,27 @@
{
"VARCHAR": {"source_name": "VARCHAR", "target_name": "STRING"},
"CHAR": {"source_name": "CHAR", "target_name": "STRING"},
"CHARACTER": {"source_name": "CHARACTER", "target_name": "STRING"},
"TEXT": {"source_name": "TEXT", "target_name": "STRING"},
"BPCHAR": {"source_name": "BPCHAR", "target_name": "STRING"},
"NCHAR": {"source_name": "NCHAR", "target_name": "STRING"},
"NVARCHAR": {"source_name": "NVARCHAR", "target_name": "STRING"},
"VARBYTE": {"source_name": "VARBYTE", "target_name": "BINARY"},
"BINARY VARYING": {"source_name": "BINARY VARYING", "target_name": "BINARY"},
"VARBINARY": {"source_name": "VARBINARY", "target_name": "BINARY"},
"INT2": {"source_name": "INT2", "target_name": "SMALLINT"},
"INT4": {"source_name": "INT4", "target_name": "INT"},
"INT8": {"source_name": "INT8", "target_name": "BIGINT"},
"NUMBER": {"source_name": "NUMBER", "target_name": "NUMERIC"},
"BYTEINT": {"source_name": "BYTEINT", "target_name": "BYTE"},
"FLOAT4": {"source_name": "FLOAT4", "target_name": "FLOAT"},
"FLOAT8": {"source_name": "FLOAT8", "target_name": "DOUBLE"},
"DOUBLE PRECISION": {"source_name": "DOUBLE PRECISION", "target_name": "DOUBLE"},
"TIME": {"source_name": "TIME", "target_name": "TIMESTAMP"},
"TIME WITHOUT TIME ZONE": {"source_name": "TIME WITHOUT TIME ZONE", "target_name": "TIMESTAMP"},
"TIME WITH TIME ZONE": {"source_name": "TIME WITH TIME ZONE", "target_name": "TIMESTAMP"},
"TIMETZ": {"source_name": "TIMETZ", "target_name": "TIMESTAMP"},
"TIMESTAMP WITHOUT TIME ZONE": {"source_name": "TIMESTAMP WITHOUT TIME ZONE", "target_name": "TIMESTAMP"},
"TIMESTAMPTZ": {"source_name": "TIMESTAMPTZ", "target_name": "TIMESTAMP"},
"TIMESTAMP WITH TIME ZONE": {"source_name": "TIMESTAMP WITH TIME ZONE", "target_name": "TIMESTAMP"}
}
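
Each entry maps one source type to its Databricks target; the converter reads target_name, while source_name mirrors the key for traceability. A minimal lookup sketch, assuming the repo-relative path shown in the diff header:

import json

# Load the Redshift type map (path assumed relative to the repo root).
with open("helper/_resources/config/redshift/dt_mappings.json") as f:
    dt_map = json.load(f)

# CAST(col AS TEXT) should become CAST(col AS STRING), and INT2 -> SMALLINT.
print(dt_map["TEXT"]["target_name"])   # STRING
print(dt_map["INT2"]["target_name"])   # SMALLINT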
20 changes: 20 additions & 0 deletions helper/_resources/config/snowflake/dt_mappings.json
@@ -0,0 +1,20 @@
{
"VARCHAR": {"source_name": "VARCHAR", "target_name": "STRING"},
"CHAR": {"source_name": "CHAR", "target_name": "STRING"},
"CHARACTER": {"source_name": "CHARACTER", "target_name": "STRING"},
"TEXT": {"source_name": "TEXT", "target_name": "STRING"},
"VARBINARY": {"source_name": "VARBINARY", "target_name": "BINARY"},
"NUMBER": {"source_name": "NUMBER", "target_name": "NUMERIC"},
"BYTEINT": {"source_name": "BYTEINT", "target_name": "BYTE"},
"FLOAT": {"source_name": "FLOAT", "target_name": "DOUBLE"},
"FLOAT4": {"source_name": "FLOAT4", "target_name": "DOUBLE"},
"FLOAT8": {"source_name": "FLOAT8", "target_name": "DOUBLE"},
"DOUBLE PRECISION": {"source_name": "DOUBLE PRECISION", "target_name": "DOUBLE"},
"REAL": {"source_name": "REAL", "target_name": "DOUBLE"},
"TIME": {"source_name": "TIME", "target_name": "TIMESTAMP"},
"DATETIME": {"source_name": "DATETIME", "target_name": "TIMESTAMP"},
"TIMESTAMP_LTZ": {"source_name": "TIMESTAMP_LTZ", "target_name": "TIMESTAMP"},
"TIMESTAMP_NTZ": {"source_name": "TIMESTAMP_NTZ", "target_name": "TIMESTAMP"},
"TIMESTAMP_TZ": {"source_name": "TIMESTAMP_TZ", "target_name": "TIMESTAMP"},
"OBJECT": {"source_name": "OBJECT", "target_name": "STRUCT"}
}
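
The Snowflake map uses the same entry shape, so one loader serves both dialects. A small hedged sanity check over both configs (paths assumed as above):

import json

# Every dt_mappings.json entry should carry a source_name matching its key
# plus a target_name; fail fast if a config drifts from that shape.
for src in ("redshift", "snowflake"):
    with open(f"helper/_resources/config/{src}/dt_mappings.json") as f:
        for key, entry in json.load(f).items():
            assert entry["source_name"] == key, f"{src}: bad source_name for {key}"
            assert entry.get("target_name"), f"{src}: missing target_name for {key}"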
38 changes: 34 additions & 4 deletions helper/convert_to_databricks.py
@@ -715,7 +715,31 @@ def syntax_chunk(syntax_map, content, results_dict = {}):
    else:
        print(f"No syntax values to parse: {syntax_map}. Skipping. ")

    return content, results_dict

def dtchunkuno(content, dtdict):
    # Apply every CAST(... AS type) rewrite in the data-type map.
    for key, value in dtdict.items():
        content = dtconvertuno(content, value.get("source_name"), value.get("target_name"))
    return content


def dtchunkdos(content, dtdict):
    # Apply every ::type shorthand-cast rewrite in the data-type map.
    for key, value in dtdict.items():
        content = dtconvertdos(content, value.get("source_name"), value.get("target_name"))
    return content


def dtconvertuno(content, dtinput, dtoutput):
    # Match CAST(<expr> AS <dtinput>) and swap in the target type; the two
    # capture groups preserve the text around the type name.
    cast_pattern = r'(CAST\(\s*[^)]+\s+AS\s+){}(\s*\))'.format(re.escape(dtinput))
    replacement_pattern = r'\1{}\2'.format(dtoutput)
    updated_content = re.sub(cast_pattern, replacement_pattern, content, flags=re.DOTALL | re.IGNORECASE)
    return updated_content


def dtconvertdos(content, dtinput, dtoutput):
    # Match ::<dtinput> shorthand casts. The trailing \b prevents prefix hits,
    # e.g. ::TIME must not rewrite the front of ::TIMESTAMPTZ.
    cast_pattern = r'::{}\b'.format(re.escape(dtinput))
    replacement_pattern = r'::{}'.format(dtoutput)
    updated_content = re.sub(cast_pattern, replacement_pattern, content, flags=re.DOTALL | re.IGNORECASE)
    return updated_content

    with open(full_path, 'r+') as file:
        content = file.read()

@@ -741,8 +765,14 @@ def syntax_chunk(syntax_map, content, results_dict = {}):
        content, converted_functions = functions_chunk(function_map=function_map, content=content, results_dict=converted_functions)
    else:
        raise NotImplementedError(f"Incorrect Parse First Parameter Provided {parse_first}. Should be syntax or functions")

    current_script = Path(__file__).resolve()
    parent_directory = current_script.parent
    file_path = parent_directory / '_resources' / 'config' / sourcedb / 'dt_mappings.json'
    with open(file_path, 'r') as file:
        dtmap = json.load(file)
    contentuno = dtchunkuno(content, dtmap)
    contentdos = dtchunkdos(contentuno, dtmap)
    ## Instead of writing data in place, let's write it to a new model subfolder ./databricks/
    # Write content to the new file

@@ -766,7 +796,7 @@ def syntax_chunk(syntax_map, content, results_dict = {}):
    # Define the new file path
    new_file_path = new_dir / original_path.name
    with open(new_file_path, 'w') as file:
-       file.write(content)
+       file.write(contentdos)

    return (full_path, converted_functions, converted_syntax, parsed_discovery)  ## Return the list of functions that were converted

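Taken together, dtchunkuno handles the ANSI CAST(... AS type) spelling and dtchunkdos handles the ::type shorthand. A short hedged demo on an inline string, assuming the four functions above are in scope:

import re  # required by dtconvertuno/dtconvertdos

# Subset of the Redshift config above, inlined for the demo.
dtmap = {
    "TEXT": {"source_name": "TEXT", "target_name": "STRING"},
    "INT2": {"source_name": "INT2", "target_name": "SMALLINT"},
    "TIME": {"source_name": "TIME", "target_name": "TIMESTAMP"},
}

sql = "SELECT CAST(id AS TEXT), created_date::TIME, col3::INT2 FROM t"
sql = dtchunkuno(sql, dtmap)  # rewrites CAST(id AS TEXT)
sql = dtchunkdos(sql, dtmap)  # rewrites ::TIME and ::INT2
print(sql)
# SELECT CAST(id AS STRING), created_date::TIMESTAMP, col3::SMALLINT FROM t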
12 changes: 11 additions & 1 deletion helper/pierunner.py.py
@@ -25,7 +25,17 @@
# COMMAND ----------

# MAGIC %sh
-# MAGIC python3 ./convert_to_databricks.py --sourcedb "redshift" --parse_mode 'syntax' --customdp "true"
+# MAGIC python3 ./convert_to_databricks.py --sourcedb "redshift" --dir_path "redshift/" --parse_mode 'syntax' --customdp "true"

# COMMAND ----------

# MAGIC %sh
# MAGIC python3 ./convert_to_databricks.py --sourcedb "snowflake" --dir_path "snowflake/" --parse_mode 'syntax' --customdp "true"

# COMMAND ----------

# MAGIC %sh
# MAGIC python3 ./convert_to_databricks.py --sourcedb "redshift" --dir_path "redshift/" --parse_mode 'syntax' --customdp "true"

# COMMAND ----------

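Each cell now passes --dir_path alongside --sourcedb. The flag parsing inside convert_to_databricks.py is not part of this diff; a minimal argparse sketch of how these flags might be wired, with only the flag names taken from the commands above and everything else assumed:

import argparse

# Hypothetical parser mirroring the CLI flags used in pierunner.py.py.
parser = argparse.ArgumentParser(description="Convert warehouse SQL models to Databricks SQL")
parser.add_argument("--sourcedb", required=True, choices=["redshift", "snowflake"])
parser.add_argument("--dir_path", help="models/ subfolder to convert, e.g. 'redshift/'")
parser.add_argument("--parse_mode", default="syntax", choices=["syntax", "functions"])
parser.add_argument("--customdp", default="false", help="string flag consumed by convert_to_databricks.py")
args = parser.parse_args()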
9 changes: 9 additions & 0 deletions models/redshift/customerrs.sql
@@ -33,6 +33,15 @@ select
GETDATE AS get_date_caps_test,
sysdate() AS sys_date_col_test,
SYSDATE() AS sys_date_caps_col_test,
Name,
Age,
CAST(ID AS TEXT) as id_text,
created_date::TIME,
CAST(file_dump AS NUMERIC) as file_dump_numeric,
COALESCE(col1::FLOAT,col2::FLOAT8,col3::INT2) AS xyz
FROM catalog.schema.table1
WHERE colA = colB::CHAR
AND somethingelse = 1
ISNULL(test, test_is_null) AS null_test_col_caps,
isnull(test, 'test_is_null') AS null_test_col,
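These new lines exercise the Redshift type map end to end. Expected rewrites, derived by hand from dt_mappings.json rather than taken from the diff:

# Hypothetical before/after pairs for the casts added above.
expected = {
    "CAST(ID AS TEXT)": "CAST(ID AS STRING)",
    "created_date::TIME": "created_date::TIMESTAMP",
    "CAST(file_dump AS NUMERIC)": "CAST(file_dump AS NUMERIC)",  # NUMERIC is not a mapped source type
    "col1::FLOAT": "col1::FLOAT",    # plain FLOAT is unmapped for Redshift (only FLOAT4/FLOAT8 are)
    "col2::FLOAT8": "col2::DOUBLE",
    "col3::INT2": "col3::SMALLINT",
    "colB::CHAR": "colB::STRING",
}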
14 changes: 10 additions & 4 deletions models/snowflake/customer.sql
@@ -20,10 +20,16 @@ select
convert_timezone(test),
to_varchar(test),
parse_json(testuno),
-parse_json(testdos)
+parse_json(testdos),
Name,
Age,
CAST(ID AS VARCHAR) as id_text,
created_date::TIME,
CAST(file_dump AS NUMBER) as file_dump_numeric,
COALESCE(col1::FLOAT,col2::FLOAT8,col3::REAL) AS xyz
FROM catalog.schema.table1
WHERE colA = colB::TEXT
AND somethingelse = 1

from
snowflake_sample_data.tpch_sf1.customer
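The Snowflake fixture mirrors the Redshift one, but its map differs: FLOAT, FLOAT8, and REAL all land on DOUBLE, and NUMBER becomes NUMERIC. Expected rewrites, again derived by hand from the config rather than taken from the diff:

# Hypothetical before/after pairs for the casts added above.
expected = {
    "CAST(ID AS VARCHAR)": "CAST(ID AS STRING)",
    "created_date::TIME": "created_date::TIMESTAMP",
    "CAST(file_dump AS NUMBER)": "CAST(file_dump AS NUMERIC)",
    "col1::FLOAT": "col1::DOUBLE",
    "col2::FLOAT8": "col2::DOUBLE",
    "col3::REAL": "col3::DOUBLE",
    "colB::TEXT": "colB::STRING",
}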
