Document all macros

omnata-labs · Jan 11, 2021 · 8bba677 · 8bba677
1 parent bddc1d5
commit 8bba677
Show file tree

Hide file tree

Showing 12 changed files with 266 additions and 4 deletions.
diff --git a/README.md b/README.md
@@ -25,8 +25,16 @@ To use this in your dbt project, create or modify packages.yml to include:
 ```
 packages:
   - git: "https://github.com/omnata-pty-ltd/dbt-ml-preprocessing.git"
-    revision: 0.1.0
+    revision: 0.2.0
 ```
 _(replace the revision number with the latest)_
 
+Then run:
+```dbt deps``` to import the package.
+
 ## Usage
+The macros are all designed to build an entire model, not just part of it. It would be too complex, and probably impossible, to design them as single-column macros.
+
+To read their documentation and see examples, simply [generate your docs](https://docs.getdbt.com/reference/commands/cmd-docs/), and you'll see macro documentation in the Projects tree under ```dbt_ml_preprocessing```.
+
+
diff --git a/macros/k_bins_discretizer.yml b/macros/k_bins_discretizer.yml
@@ -5,7 +5,7 @@ macros:
     description: |
       Bin continuous data into intervals. See scikit-learn's [KBinsDiscretizer](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.KBinsDiscretizer.html#sklearn.preprocessing.KBinsDiscretizer) for full documentation.
 
-      Will append a new column with the name <source column>_binned
+      Will append a new column with the name &lt;source column&gt;_binned
 
       Example usage:
       #### **`models\customer_features.yml:`**
@@ -15,7 +15,7 @@ macros:
       {{ '{{' }} dbt_ml_preprocessing.k_bins_discretizer( ref('customer') ,'age') {{ '}}' }}
 
       ```
-      Will produce a model named customer_features, with a new column named ```age_binned``` containing the binned values
+      Will produce a model named customer_features, with a new column named ```age_binned``` containing the binned values.
     arguments:
       - name: source_table
         type: string

diff --git a/macros/label_encoder.yml b/macros/label_encoder.yml
@@ -0,0 +1,28 @@
+version: 2
+
+macros:
+  - name: label_encoder
+    description: |
+      Encode target labels with value between 0 and n_classes-1. See scikit-learn's [LabelEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html#sklearn.preprocessing.LabelEncoder) for full documentation.
+
+      Will append a new column with the name &lt;source column&gt;_encoded
+
+      Example usage:
+      #### **`models\customer_features.sql:`**
+      ```
+      {{ '{{' }} config(materialized='view') {{ '}}' }}
+
+      {{ '{{' }} dbt_ml_preprocessing.label_encoder( ref('customer') ,'city') {{ '}}' }}
+
+      ```
+      Will produce a model named customer_features, with a new column named ```city_encoded``` containing the encoded values.
+    arguments:
+      - name: source_table
+        type: string
+        description: Pass in a ref to the table containing the data you want to transform
+      - name: source_column
+        type: string
+        description: The column containing the data you want to transform
+      - name: include_columns
+        type: string
+        description: Other columns from the source table to be included in the model (defaults to '*' and brings all columns across)
diff --git a/macros/max_abs_scaler.yml b/macros/max_abs_scaler.yml
@@ -0,0 +1,28 @@
+version: 2
+
+macros:
+  - name: max_abs_scaler
+    description: |
+      Scale each feature by its maximum absolute value. See scikit-learn's [MaxAbsScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MaxAbsScaler.html#sklearn.preprocessing.MaxAbsScaler) for full documentation.
+
+      Will append a new column with the name &lt;source column&gt;_scaled
+
+      Example usage:
+      #### **`models\customer_features.sql:`**
+      ```
+      {{ '{{' }} config(materialized='view') {{ '}}' }}
+
+      {{ '{{' }} dbt_ml_preprocessing.max_abs_scaler( ref('customer') ,'age') {{ '}}' }}
+
+      ```
+      Will produce a model named customer_features, with a new column named ```age_scaled``` containing the scaled values.
+    arguments:
+      - name: source_table
+        type: string
+        description: Pass in a ref to the table containing the data you want to transform
+      - name: source_column
+        type: string
+        description: The column containing the data you want to transform
+      - name: include_columns
+        type: string
+        description: Other columns from the source table to be included in the model (defaults to '*' and brings all columns across)
diff --git a/macros/min_max_scaler.yml b/macros/min_max_scaler.yml
@@ -0,0 +1,28 @@
+version: 2
+
+macros:
+  - name: min_max_scaler
+    description: |
+      Transform features by scaling each feature to a given range. See scikit-learn's [MinMaxScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html#sklearn.preprocessing.MinMaxScaler) for full documentation.
+
+      Will append a new column with the name &lt;source column&gt;_scaled
+
+      Example usage:
+      #### **`models\customer_features.sql:`**
+      ```
+      {{ '{{' }} config(materialized='view') {{ '}}' }}
+
+      {{ '{{' }} dbt_ml_preprocessing.min_max_scaler( ref('customer') ,'age') {{ '}}' }}
+
+      ```
+      Will produce a model named customer_features, with a new column named ```age_scaled``` containing the scaled values.
+    arguments:
+      - name: source_table
+        type: string
+        description: Pass in a ref to the table containing the data you want to transform
+      - name: source_column
+        type: string
+        description: The column containing the data you want to transform
+      - name: include_columns
+        type: string
+        description: Other columns from the source table to be included in the model (defaults to '*' and brings all columns across)
diff --git a/macros/normalizer.yml b/macros/normalizer.yml
@@ -0,0 +1,28 @@
+version: 2
+
+macros:
+  - name: normalizer
+    description: |
+      Normalize samples individually to unit norm. See scikit-learn's [Normalizer](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Normalizer.html#sklearn.preprocessing.Normalizer) for full documentation.
+
+      Will append a new column with the name &lt;source column&gt;_normalized
+
+      Example usage:
+      #### **`models\customer_features.sql:`**
+      ```
+      {{ '{{' }} config(materialized='view') {{ '}}' }}
+
+      {{ '{{' }} dbt_ml_preprocessing.normalizer( ref('customer') ,'age') {{ '}}' }}
+
+      ```
+      Will produce a model named customer_features, with a new column named ```age_normalized``` containing the normalized values.
+    arguments:
+      - name: source_table
+        type: string
+        description: Pass in a ref to the table containing the data you want to transform
+      - name: source_column
+        type: string
+        description: The column containing the data you want to transform
+      - name: include_columns
+        type: string
+        description: Other columns from the source table to be included in the model (defaults to '*' and brings all columns across)
diff --git a/macros/one_hot_encoder.sql b/macros/one_hot_encoder.sql
@@ -12,6 +12,12 @@
         {% set results_list = [] %}
     {% endif %}
 {%- endif -%}
+{#- Only 'ignore' is implemented for handle_unknown; fail compilation on any other value. -#}
+{%- if handle_unknown!='ignore' -%}
+    {% set error_message %}
+The `one_hot_encoder` macro only supports an 'handle_unknown' value of 'ignore' at this time.
+    {% endset %}
+    {%- do exceptions.raise_compiler_error(error_message) -%}
+{%- endif -%}
 
 select 
 {% for column in include_columns %}

diff --git a/macros/one_hot_encoder.yml b/macros/one_hot_encoder.yml
@@ -0,0 +1,34 @@
+version: 2
+
+macros:
+  - name: one_hot_encoder
+    description: |
+      Encode categorical features as a one-hot numeric array. See scikit-learn's [OneHotEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html#sklearn.preprocessing.OneHotEncoder) for full documentation.
+
+      Will append a new boolean column for every category present in the data with the name &lt;source column&gt;_&lt;category value&gt;.
+
+      Example usage:
+      #### **`models\customer_features.sql:`**
+      ```
+      {{ '{{' }} config(materialized='view') {{ '}}' }}
+
+      {{ '{{' }} dbt_ml_preprocessing.one_hot_encoder( ref('customer') ,'gender') {{ '}}' }}
+
+      ```
+      Will produce a model named customer_features, with one new boolean column for every distinct value of ```gender```, each named gender_&lt;category value&gt;.
+    arguments:
+      - name: source_table
+        type: string
+        description: Pass in a ref to the table containing the data you want to transform
+      - name: source_column
+        type: string
+        description: The column containing the data you want to transform
+      - name: include_columns
+        type: string
+        description: Other columns from the source table to be included in the model (defaults to '*' and brings all columns across)
+      - name: categories
+        type: string
+        description: The categories of each feature determined during fitting. Defaults to 'auto', which will encode all values.
+      - name: handle_unknown
+        type: string
+        description: Whether to raise an error or ignore if an unknown categorical feature is present during transform. Only supports the default value of 'ignore' at this time.
diff --git a/macros/quantile_transformer.sql b/macros/quantile_transformer.sql
@@ -1,4 +1,4 @@
-{% macro quantile_transformer(source_table,source_column,n_quantiles=10,output_distribution='uniform',ignore_implicit_zeros=False,subsample=1000,include_columns='*') %}
+{% macro quantile_transformer(source_table,source_column,n_quantiles=10,output_distribution='uniform',subsample=1000,include_columns='*') %}
 {%- if include_columns=='*' -%}
 {%- set all_source_columns = adapter.get_columns_in_relation(source_table) | map(attribute='quoted') -%}
 {% set include_columns = all_source_columns | join(', ') %}

diff --git a/macros/quantile_transformer.yml b/macros/quantile_transformer.yml
@@ -0,0 +1,37 @@
+version: 2
+
+macros:
+  - name: quantile_transformer
+    description: |
+      Transform features using quantiles information. See scikit-learn's [QuantileTransformer](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.QuantileTransformer.html#sklearn.preprocessing.QuantileTransformer) for full documentation.
+
+      Will append a new column with the name &lt;source column&gt;_transformed.
+
+      Example usage:
+      #### **`models\customer_features.sql:`**
+      ```
+      {{ '{{' }} config(materialized='view') {{ '}}' }}
+
+      {{ '{{' }} dbt_ml_preprocessing.quantile_transformer( ref('customer') ,'age') {{ '}}' }}
+
+      ```
+      Will produce a model named customer_features, with a new column named ```age_transformed``` containing the transformed values.
+    arguments:
+      - name: source_table
+        type: string
+        description: Pass in a ref to the table containing the data you want to transform
+      - name: source_column
+        type: string
+        description: The column containing the data you want to transform
+      - name: include_columns
+        type: string
+        description: Other columns from the source table to be included in the model (defaults to '*' and brings all columns across)
+      - name: n_quantiles
+        type: integer
+        description: Number of quantiles to be computed, defaults to 10.
+      - name: output_distribution
+        type: string
+        description: Marginal distribution for the transformed data. Only supports the default value of 'uniform' at this time.
+      - name: subsample
+        type: integer
+        description: Maximum number of samples used to estimate the quantiles for computational efficiency, defaults to 1000.
diff --git a/macros/robust_scaler.yml b/macros/robust_scaler.yml
@@ -0,0 +1,34 @@
+version: 2
+
+macros:
+  - name: robust_scaler
+    description: |
+      Scale features using statistics that are robust to outliers. See scikit-learn's [RobustScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html#sklearn.preprocessing.RobustScaler) for full documentation.
+
+      Will append a new column with the name &lt;source column&gt;_scaled.
+
+      Example usage:
+      #### **`models\customer_features.sql:`**
+      ```
+      {{ '{{' }} config(materialized='view') {{ '}}' }}
+
+      {{ '{{' }} dbt_ml_preprocessing.robust_scaler( ref('customer') ,'age') {{ '}}' }}
+
+      ```
+      Will produce a model named customer_features, with a new column named ```age_scaled``` containing the scaled values.
+    arguments:
+      - name: source_table
+        type: string
+        description: Pass in a ref to the table containing the data you want to transform
+      - name: source_column
+        type: string
+        description: The column containing the data you want to transform
+      - name: include_columns
+        type: string
+        description: Other columns from the source table to be included in the model (defaults to '*' and brings all columns across)
+      - name: with_centering
+        type: boolean
+        description: If True, center the data before scaling. Only supports the default value of 'False' at this time.
+      - name: quantile_range
+        type: string
+        description: Quantile range, must be a two-item array containing the first quartile threshold and the third quartile threshold. Defaults to Interquartile Range, which is [25,75]
diff --git a/macros/standard_scaler.yml b/macros/standard_scaler.yml
@@ -0,0 +1,31 @@
+version: 2
+
+macros:
+  - name: standard_scaler
+    description: |
+      Standardize features by removing the mean and scaling to unit variance. See scikit-learn's [StandardScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html#sklearn.preprocessing.StandardScaler) for full documentation.
+
+      Will append a new column with the name &lt;source column&gt;_scaled.
+
+      Example usage:
+      #### **`models\customer_features.sql:`**
+      ```
+      {{ '{{' }} config(materialized='view') {{ '}}' }}
+
+      {{ '{{' }} dbt_ml_preprocessing.standard_scaler( ref('customer') ,'age') {{ '}}' }}
+
+      ```
+      Will produce a model named customer_features, with a new column named ```age_scaled``` containing the scaled values.
+    arguments:
+      - name: source_table
+        type: string
+        description: Pass in a ref to the table containing the data you want to transform
+      - name: source_column
+        type: string
+        description: The column containing the data you want to transform
+      - name: include_columns
+        type: string
+        description: Other columns from the source table to be included in the model (defaults to '*' and brings all columns across)
+      - name: with_mean
+        type: boolean
+        description: If True, center the data before scaling. Only supports the default value of 'True' at this time.