Merge pull request #474 from basedosdados/add_upload_data_script
add: upload_data_and_create_dbt_files
laura-l-amaral authored Mar 26, 2024
2 parents 1350664 + 9fa9587 commit 7b7bdfc
Showing 2 changed files with 39 additions and 7 deletions.
14 changes: 7 additions & 7 deletions gists/create_yaml_file.py
@@ -5,7 +5,7 @@
from typing import List


-def create_yaml_file(arq_url,
+def create_yaml_file(arch_url,
                      table_id,
                      dataset_id,
                      at_least: float = 0.05,
@@ -16,14 +16,14 @@ def create_yaml_file(arq_url,
    Creates dbt models and schema.yaml files based on the architecture table, automatically including data quality tests.
    Args:
-       arq_url (str or list): The URL(s) or file path(s) of the input file(s) containing the data.
+       arch_url (str or list): The URL(s) or file path(s) of the input file(s) containing the data.
        table_id (str or list): The table ID(s) or name(s) to use as the YAML model name(s).
        dataset_id (str): The ID or name of the dataset to be used in the dbt models.
        at_least (float): The proportion of non-null values accepted in the columns.
        unique_keys (list, optional): A list of column names for which the 'dbt_utils.unique_combination_of_columns' test should be applied.
            Defaults to ["insert unique keys here"].
        mkdir (bool, optional): If True, creates a directory for the new model(s). Defaults to True.
-       preprocessed_staging_column_names (bool, optional): If False, renames staging column names using the architecture. Defaults to True.
+       preprocessed_staging_column_names (bool, optional): If True, builds the SQL file renaming columns from 'original_name' to 'name' using the architecture file. Defaults to True.
    Raises:
        TypeError: If the table_id is not a string or a list.
@@ -35,7 +35,7 @@
    Example:
        ```python
-       create_yaml_file(arq_url='input_data.csv', table_id='example_table', dataset_id='example_dataset')
+       create_yaml_file(arch_url='input_data.csv', table_id='example_table', dataset_id='example_dataset')
        ```
    """
@@ -68,13 +68,13 @@ def create_yaml_file(arq_url,

    if isinstance(table_id, str):
        table_id = [table_id]
-       arq_url = [arq_url]
+       arch_url = [arch_url]

    # If table_id is a list, assume multiple input files
-   if not isinstance(arq_url, list) or len(arq_url) != len(table_id):
+   if not isinstance(arch_url, list) or len(arch_url) != len(table_id):
        raise ValueError("The number of URLs or file paths must match the number of table IDs.")

-   for url, id in zip(arq_url, table_id):
+   for url, id in zip(arch_url, table_id):

        unique_keys_copy = unique_keys.copy()
        architecture_df = sheet_to_df(url)
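The hunk above is where the single- and multi-table call paths converge: a string table_id is wrapped into one-element lists, and a list arch_url must match table_id in length. A minimal usage sketch, with hypothetical file names:

```python
from create_yaml_file import create_yaml_file

# Single table: the string arguments are wrapped into one-element lists internally
create_yaml_file(arch_url='table_a_architecture.csv',
                 table_id='table_a',
                 dataset_id='example_dataset')

# Several tables: one architecture file per table_id, in matching order;
# mismatched lengths raise ValueError
create_yaml_file(arch_url=['table_a_architecture.csv', 'table_b_architecture.csv'],
                 table_id=['table_a', 'table_b'],
                 dataset_id='example_dataset')
```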
32 changes: 32 additions & 0 deletions gists/upload_data_and_create_dbt_files.py
@@ -0,0 +1,32 @@
from create_yaml_file import create_yaml_file
import basedosdados as bd

if __name__ == '__main__':
    # Defining variables for the dataset, table, architecture spreadsheet URL, and path to the data
    dataset_id = 'br_bd_metadados'
    table_id = 'storage_blobs'
    architecture_url = 'https://docs.google.com/spreadsheets/d/1mWNTeUVpLAufhxdnXLqcbKasv9MA3xbZ/edit#gid=1518247806'
    path_to_data = f"/path_to_datasets/{dataset_id}/{table_id}"  # Standardized path used to exchange data with BD

    # Creating a Table object
    tb = bd.Table(
        dataset_id=dataset_id,
        table_id=table_id
    )

    # Uploading data to BD Storage and creating a BigQuery table that accesses this data directly from Storage
    # The parameters below are the ones most commonly used; see the documentation for other options suited to your use case
    tb.create(
        path=path_to_data,
        if_storage_data_exists='raise',
        if_table_exists='replace',
        source_format='csv'
    )
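    # Note: on a re-run, if_storage_data_exists='raise' will fail because the
    # blobs are already in Storage; assuming the flag accepts the same values
    # as if_table_exists, 'replace' would overwrite the staged data instead:
    #
    # tb.create(path=path_to_data, if_storage_data_exists='replace',
    #           if_table_exists='replace', source_format='csv')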

    # Creating the standard files required to run a dbt model
    # Manual adjustments will still be needed, but this code significantly reduces the workload
    create_yaml_file(
        arch_url=architecture_url,
        table_id=table_id,
        dataset_id=dataset_id,
        preprocessed_staging_column_names=False)  # If you've already renamed 'original_name' to 'name' in the architecture table, change this to True
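
Since create_yaml_file also accepts lists, the same flow extends to a dataset with several tables. A minimal sketch, assuming the same path convention; the second table name and both spreadsheet IDs are hypothetical:

```python
from create_yaml_file import create_yaml_file
import basedosdados as bd

dataset_id = 'br_bd_metadados'
# Hypothetical table -> architecture-spreadsheet mapping
architectures = {
    'storage_blobs': 'https://docs.google.com/spreadsheets/d/<sheet_id_1>/edit',
    'example_table': 'https://docs.google.com/spreadsheets/d/<sheet_id_2>/edit',
}

# Upload each table's data to Storage and register it in BigQuery
for table_id in architectures:
    bd.Table(dataset_id=dataset_id, table_id=table_id).create(
        path=f"/path_to_datasets/{dataset_id}/{table_id}",
        if_storage_data_exists='raise',
        if_table_exists='replace',
        source_format='csv',
    )

# arch_url and table_id must be equal-length lists in matching order
create_yaml_file(
    arch_url=list(architectures.values()),
    table_id=list(architectures.keys()),
    dataset_id=dataset_id,
    preprocessed_staging_column_names=False,
)
```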
