Merge pull request #474 from basedosdados/add_upload_data_script
add: upload_data_and_create_dbt_files
laura-l-amaral authored Mar 26, 2024
2 parents 1350664 + 9fa9587 commit 7b7bdfc
Showing 2 changed files with 39 additions and 7 deletions.
14 changes: 7 additions & 7 deletions gists/create_yaml_file.py
@@ -5,7 +5,7 @@
from typing import List


-def create_yaml_file(arq_url,
+def create_yaml_file(arch_url,
                      table_id,
                      dataset_id,
                      at_least: float = 0.05,
@@ -16,14 +16,14 @@ def create_yaml_file(arq_url,
    Creates dbt models and schema.yaml files based on the architecture table, automatically including data quality tests.
    Args:
-       arq_url (str or list): The URL(s) or file path(s) of the input file(s) containing the data.
+       arch_url (str or list): The URL(s) or file path(s) of the input file(s) containing the data.
        table_id (str or list): The table ID(s) or name(s) to use as the YAML model name(s).
        dataset_id (str): The ID or name of the dataset to be used in the dbt models.
        at_least (float): The proportion of non-null values accepted in the columns.
        unique_keys (list, optional): A list of column names for which the 'dbt_utils.unique_combination_of_columns' test should be applied.
            Defaults to ["insert unique keys here"].
        mkdir (bool, optional): If True, creates a directory for the new model(s). Defaults to True.
-       preprocessed_staging_column_names (bool, optional): If False, renames staging column names using the architecture. Defaults to True.
+       preprocessed_staging_column_names (bool, optional): If True, builds the SQL file renaming columns from 'original_name' to 'name' using the architecture file. Defaults to True.
    Raises:
        TypeError: If the table_id is not a string or a list.
@@ -35,7 +35,7 @@
    Example:
        ```python
-       create_yaml_file(arq_url='input_data.csv', table_id='example_table', dataset_id='example_dataset')
+       create_yaml_file(arch_url='input_data.csv', table_id='example_table', dataset_id='example_dataset')
        ```
    """
@@ -68,13 +68,13 @@ def create_yaml_file(arq_url,

    if isinstance(table_id, str):
        table_id = [table_id]
-       arq_url = [arq_url]
+       arch_url = [arch_url]

    # If table_id is a list, assume multiple input files
-   if not isinstance(arq_url, list) or len(arq_url) != len(table_id):
+   if not isinstance(arch_url, list) or len(arch_url) != len(table_id):
        raise ValueError("The number of URLs or file paths must match the number of table IDs.")

-   for url, id in zip(arq_url, table_id):
+   for url, id in zip(arch_url, table_id):

        unique_keys_copy = unique_keys.copy()
        architecture_df = sheet_to_df(url)
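The hunk above is where the single- and multi-table call paths converge: a string table_id is wrapped into one-element lists, and a list arch_url must match table_id in length. A minimal usage sketch, with hypothetical file names:

```python
from create_yaml_file import create_yaml_file

# Single table: the string arguments are wrapped into one-element lists internally
create_yaml_file(arch_url='table_a_architecture.csv',
                 table_id='table_a',
                 dataset_id='example_dataset')

# Several tables: one architecture file per table_id, in matching order;
# mismatched lengths raise ValueError
create_yaml_file(arch_url=['table_a_architecture.csv', 'table_b_architecture.csv'],
                 table_id=['table_a', 'table_b'],
                 dataset_id='example_dataset')
```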
32 changes: 32 additions & 0 deletions gists/upload_data_and_create_dbt_files.py
@@ -0,0 +1,32 @@
from create_yaml_file import create_yaml_file
import basedosdados as bd

if __name__ == '__main__':
    # Defining variables for the dataset, table, architecture spreadsheet URL, and path to the data
    dataset_id = 'br_bd_metadados'
    table_id = 'storage_blobs'
    architecture_url = 'https://docs.google.com/spreadsheets/d/1mWNTeUVpLAufhxdnXLqcbKasv9MA3xbZ/edit#gid=1518247806'
    path_to_data = f"/path_to_datasets/{dataset_id}/{table_id}"  # Standardized path used to exchange data with BD

    # Creating a Table object
    tb = bd.Table(
        dataset_id=dataset_id,
        table_id=table_id
    )

    # Uploading data to BD Storage and creating a BigQuery table that accesses this data directly from Storage
    # The parameters below are the ones most commonly used; see the documentation for other options suited to your use case
    tb.create(
        path=path_to_data,
        if_storage_data_exists='raise',
        if_table_exists='replace',
        source_format='csv'
    )
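    # Note: on a re-run, if_storage_data_exists='raise' will fail because the
    # blobs are already in Storage; assuming the flag accepts the same values
    # as if_table_exists, 'replace' would overwrite the staged data instead:
    #
    # tb.create(path=path_to_data, if_storage_data_exists='replace',
    #           if_table_exists='replace', source_format='csv')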

    # Creating the standard files required to run a dbt model
    # Manual adjustments will still be needed, but this code significantly reduces the workload
    create_yaml_file(
        arch_url=architecture_url,
        table_id=table_id,
        dataset_id=dataset_id,
        preprocessed_staging_column_names=False)  # If you've already renamed 'original_name' to 'name' in the architecture table, change this to True
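
Since create_yaml_file also accepts lists, the same flow extends to a dataset with several tables. A minimal sketch, assuming the same path convention; the second table name and both spreadsheet IDs are hypothetical:

```python
from create_yaml_file import create_yaml_file
import basedosdados as bd

dataset_id = 'br_bd_metadados'
# Hypothetical table -> architecture-spreadsheet mapping
architectures = {
    'storage_blobs': 'https://docs.google.com/spreadsheets/d/<sheet_id_1>/edit',
    'example_table': 'https://docs.google.com/spreadsheets/d/<sheet_id_2>/edit',
}

# Upload each table's data to Storage and register it in BigQuery
for table_id in architectures:
    bd.Table(dataset_id=dataset_id, table_id=table_id).create(
        path=f"/path_to_datasets/{dataset_id}/{table_id}",
        if_storage_data_exists='raise',
        if_table_exists='replace',
        source_format='csv',
    )

# arch_url and table_id must be equal-length lists in matching order
create_yaml_file(
    arch_url=list(architectures.values()),
    table_id=list(architectures.keys()),
    dataset_id=dataset_id,
    preprocessed_staging_column_names=False,
)
```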
