diff --git a/README.md b/README.md index ba2a7f8f..99a0245b 100644 --- a/README.md +++ b/README.md @@ -6,9 +6,12 @@ ```bash # iterate through all source nodes, run drop + create + refresh (if partitioned) dbt run-operation stage_external_sources -# maybe someday: dbt source create-external ? + +# maybe someday: dbt source stage-external ``` +![sample docs](etc/sample_docs.png) + The macros assume that you have already created an external stage (Snowflake) or external schema (Spectrum), and that you have permissions to select from it and create tables in it. @@ -21,14 +24,14 @@ source: tables: - name: event - # NEW: "external" property of source node + # NEW: "external" property of source node external: - location: # S3 file path or stage (Snowflake) - file_format: # Hive or Snowflake - row_format: # Hive - tbl_properties: # Hive + location: # S3 file path or Snowflake stage + file_format: # Hive specification or Snowflake named format / specification + row_format: # Hive specification + tbl_properties: # Hive specification - # Specify a list of file-path partitions. + # Specify a list of file-path partitions. # ------ SNOWFLAKE ------ partitions: @@ -37,43 +40,47 @@ source: expression: to_date(substr(metadata$filename, 8, 10), 'YYYY/MM/DD') # ------ REDSHIFT ------- - partitions: + partitions: - name: appId data_type: varchar(255) - vals: # array of values + vals: # list of values - dev - prod - # macro to convert partition value to file path specification - # takes keyword arguments 'name' + 'value' - path_macro: test_external_sources_redshift.year_month_day + path_macro: dbt_external_tables.key_value + # Macro to convert partition value to file path specification. 
+ # This "helper" macro is defined in the package, but you can use + # any custom macro that takes keyword arguments 'name' + 'value' + # and returns the path as a string. + # If there are multiple partitions, their order matters when compiling the S3 path. - name: collector_date data_type: date - vals: # macro w/ args to generate array of values - macro: dbt.dates_in_range + vals: # macro w/ keyword args to generate list of values + macro: dbt.dates_in_range args: start_date_str: '2019-08-01' end_date_str: '{{modules.datetime.date.today().strftime("%Y-%m-%d")}}' in_fmt: "%Y-%m-%d" out_fmt: "%Y-%m-%d" - path_macro: test_external_sources_redshift.year_month_day + path_macro: dbt_external_tables.year_month_day - # Specify ALL column names + datatypes + # Specify ALL column names + datatypes. Column order matters for CSVs. + # Other file formats require column names to match exactly. + columns: - name: app_id data_type: varchar(255) description: "Application ID" + - name: platform + data_type: varchar(255) + description: "Platform" ... ``` -See `sample_sources` for full, valid YML config that establishes Snowplow events +See [`sample_sources`](sample_sources) for a full, valid YML config that establishes Snowplow events as a dbt source and stage-ready external table in Snowflake and Spectrum. -### Current dependencies - -* dbt@0.15.0 in [`dev/louisa-may-alcott`](https://github.com/fishtown-analytics/dbt/tree/dev/louisa-may-alcott) - ### Supported databases * Redshift (Spectrum) diff --git a/etc/sample_docs.png b/etc/sample_docs.png new file mode 100644 index 00000000..3e370f52 Binary files /dev/null and b/etc/sample_docs.png differ