Added datatype conversions #28

Open
wants to merge 2 commits into base: main
42 changes: 37 additions & 5 deletions helper/_resources/00-setup.py
@@ -48,7 +48,7 @@ def function_to_macro(content, function_name):
except:
number_of_matches = 0

updated_content = re.sub(pattern, replacement_doubleQuotes, content)
updated_content = re.sub(pattern, replacement_doubleQuotes, content, flags=re.IGNORECASE)

#print(updated_content)

@@ -88,6 +88,26 @@ def function_to_macro(content, function_name):

# COMMAND ----------

def convert_datatypes(content, datatype_input, datatype_output):

    cast_pattern = r'(CAST\(\s*[^)]+\s+AS\s+){}(\s*\))'.format(datatype_input) #Match casts of the form CAST(<expr> AS <datatype>)
    replacement_pattern = r'\1{}\2'.format(datatype_output) #Substitute in the new datatype

    cast_pattern_2 = r'::{}'.format(datatype_input) #Match shorthand casts of the form <expr>::<datatype>
    replacement_pattern_2 = r'::{}'.format(datatype_output) #Substitute in the new datatype

try:
        number_of_matches = len(re.findall(cast_pattern, content, flags=re.IGNORECASE)) + len(re.findall(cast_pattern_2, content, flags=re.IGNORECASE))
except:
number_of_matches = 0

updated_content = re.sub(cast_pattern, replacement_pattern, content, flags=re.IGNORECASE) #Replace CAST() instances
updated_content = re.sub(cast_pattern_2, replacement_pattern_2, updated_content, flags=re.IGNORECASE) #Replace :: instances

return (updated_content, number_of_matches)

# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC ## Define function for all .sql files in the dbt repo
@@ -98,9 +118,10 @@ def function_to_macro(content, function_name):

## Function to asynchronously kick off: open each file, loop through every function, write results

def process_file(full_path, functions_list):
def process_file(full_path, functions_list, datatypes_tuples):

converted_functions = dict()
converted_datatypes = dict()
with open(full_path, 'r+') as file:
content = file.read()
for function_name in functions_list:
@@ -109,11 +130,17 @@ def process_file(full_path, functions_list):
if no_matches > 0:
converted_functions[function_name] = no_matches

for datatype_input, datatype_output in datatypes_tuples:
content, no_datatype_matches = convert_datatypes(content, datatype_input, datatype_output)

if no_datatype_matches > 0:
converted_datatypes[datatype_input] = no_datatype_matches

file.seek(0)
file.write(content)
file.truncate()

return (full_path, converted_functions) ## Return list of functions that converted
    return (full_path, converted_functions, converted_datatypes) ## Return the converted functions and datatypes with their match counts

def dbt_project_functions_to_macros(repo_path):
# Verify we are running in a dbt project
@@ -138,11 +165,11 @@ def dbt_project_functions_to_macros(repo_path):
sql_files = [i[5:] for i in paths if '.sql' in i]

with ThreadPoolExecutor() as executor:
futures_sql = {executor.submit(process_file, p, input_functions): p for p in sql_files}
futures_sql = {executor.submit(process_file, p, input_functions, input_datatypes): p for p in sql_files}
for future in as_completed(futures_sql):
data = future.result()
if data:
print(f"Processed: {data[0]} Converted Functions: {data[1]}")
print(f"Processed: {data[0]} Converted Functions: {data[1]} Converted Datatypes: {data[2]}")

else:
print(f"Nothing to change: {data}")
@@ -178,10 +205,15 @@ def dbt_project_functions_to_macros(repo_path):

if targetdb == 'snowflake':
input_functionsql = sql('select * from {}.{}.functionlist'.format(catalog, schema))
input_datatypesql = sql('select * from {}.{}.datatypeslist'.format(catalog, schema))
elif targetdb == 'redshift':
input_functionsql = sql('select * from {}.{}.functionlistrs'.format(catalog, schema))
input_datatypesql = sql('select * from {}.{}.datatypeslistrs'.format(catalog, schema))
else:
    input_functionsql = sql('select 1')
    input_datatypesql = sql('select 1') # keep input_datatypesql defined, mirroring the input_functionsql fallback

input_functionspd = input_functionsql.toPandas()
input_functions = input_functionspd["function_name"]

input_datatypespd = input_datatypesql.toPandas()
input_datatypes = list(zip(input_datatypespd.datatype_input, input_datatypespd.datatype_output))
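
For reviewers, a minimal standalone sketch of what convert_datatypes is expected to do on a sample query. The sample SQL, the two example type mappings, and the printed output are illustrative assumptions, not part of this PR; the function body is a compact copy of the definition added in 00-setup.py above.

import re

def convert_datatypes(content, datatype_input, datatype_output):
    # Rewrite CAST(<expr> AS <old_type>) and <expr>::<old_type> to the new type.
    cast_pattern = r'(CAST\(\s*[^)]+\s+AS\s+){}(\s*\))'.format(datatype_input)
    replacement_pattern = r'\1{}\2'.format(datatype_output)
    cast_pattern_2 = r'::{}'.format(datatype_input)
    replacement_pattern_2 = r'::{}'.format(datatype_output)
    number_of_matches = (len(re.findall(cast_pattern, content, flags=re.IGNORECASE))
                         + len(re.findall(cast_pattern_2, content, flags=re.IGNORECASE)))
    updated_content = re.sub(cast_pattern, replacement_pattern, content, flags=re.IGNORECASE)
    updated_content = re.sub(cast_pattern_2, replacement_pattern_2, updated_content, flags=re.IGNORECASE)
    return (updated_content, number_of_matches)

sample_sql = "SELECT CAST(order_id AS VARCHAR) AS id, amount::NUMBER FROM orders"
for datatype_input, datatype_output in [("VARCHAR", "STRING"), ("NUMBER", "NUMERIC")]:
    sample_sql, n = convert_datatypes(sample_sql, datatype_input, datatype_output)
    print(f"{datatype_input} -> {datatype_output}: {n} match(es)")
print(sample_sql)
# Final sample_sql: SELECT CAST(order_id AS STRING) AS id, amount::NUMERIC FROM orders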
25 changes: 25 additions & 0 deletions seeds/snowflake/datatypeslist.csv
@@ -0,0 +1,25 @@
datatype_input,datatype_output
VARCHAR,STRING
CHAR,STRING
CHARACTER,STRING
STRING,STRING
TEXT,STRING
BINARY,BINARY
VARBINARY,BINARY
NUMBER,NUMERIC
NUMERIC,NUMERIC
INT,INT
INTEGER,INTEGER
BIGINT,BIGINT
SMALLINT,SMALLINT
TINYINT,TINYINT
BYTEINT,BYTE
FLOAT,DOUBLE
FLOAT4,DOUBLE
FLOAT8,DOUBLE
DOUBLE,DOUBLE
DOUBLE PRECISION,DOUBLE
REAL,DOUBLE
TIME,TIMESTAMP
VARIANT,STRING
OBJECT,STRUCT
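
As a quick local sanity check outside Databricks, the seed file can be read with the standard csv module to produce the same (datatype_input, datatype_output) tuple list that process_file consumes; the notebook itself builds input_datatypes from the dbt seed table via Spark, as shown in 00-setup.py above. The relative path below is an assumption for illustration.

import csv

with open("seeds/snowflake/datatypeslist.csv", newline="") as f:
    # Each row becomes one (datatype_input, datatype_output) mapping.
    input_datatypes = [(row["datatype_input"], row["datatype_output"]) for row in csv.DictReader(f)]

print(input_datatypes[:3])
# [('VARCHAR', 'STRING'), ('CHAR', 'STRING'), ('CHARACTER', 'STRING')]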