treasure-data · myui · Jun 9, 2022 · Jun 13, 2022 · Jun 13, 2022 · Jun 13, 2022
@@ -0,0 +1 @@
+2.6.3
@@ -0,0 +1,15 @@
+## How to use
+
+Example workflows for the AutoML operators.
+
+Note: this feature is still in beta and is available only to a limited set of customers.
+
+
+```sh
+# Push project
+$ td -c ~/.td/td.conf wf push <project_name> --project .
+
+# The AutoML operators require the td.apikey secret to be set.
+
+$ td -c ~/.td/td.conf wf secrets --project <project_name> --set td.apikey
+```
@@ -0,0 +1,25 @@
+# CLTV example: load the online_retail sample data, then run the CLTV notebook on it.
+# config/params.yaml is expected to define input_database and output_database.
+_export:
+  !include : config/params.yaml
+  td:
+    engine: presto
+    database: ${output_database}
+
+# Create the output database if it does not exist.
++create_db_tbl_if_not_exists:
+  td_ddl>:
+  create_databases: ["${output_database}"]
+
+# Run the ml_datasets notebook for the online_retail dataset, writing into ${input_database}.
++load_datasets:
+  ipynb>:
+    notebook: ml_datasets
+    output_database: ${input_database}
+    datasets: online_retail
+
+# Run the CLTV notebook on ${input_database}.online_retail_txn and write the
+# result to ${output_database}.online_retail_cltv_result.
++run_cltv:
+  ipynb>:
+    notebook: CLTV
+    input_table: ${input_database}.online_retail_txn
+    output_table: ${output_database}.online_retail_cltv_result
+    user_column: customerid
+    tstamp_column: invoicedate
+    amount_column: purchaseamount
+    audience_name: online_retail_cltv
@@ -0,0 +1,23 @@
+# Clustering example: load the dermatology sample data, then run the clustering notebook on it.
+# config/params.yaml is expected to define output_database.
+_export:
+  !include : config/params.yaml
+  td:
+    engine: presto
+    database: ${output_database}
+
+# Create the output database if it does not exist.
++create_db_tbl_if_not_exists:
+  td_ddl>:
+  create_databases: ["${output_database}"]
+
+# Run the ml_datasets notebook for the dermatology dataset, writing into the ml_datasets database.
++load_datasets:
+  ipynb>:
+    notebook: ml_datasets
+    output_database: ml_datasets
+    datasets: dermatology
+
+# Run the clustering notebook on ml_datasets.dermatology.
+# The output table and both exports are suffixed with ${session_id}, so each session writes to its own tables.
++clustering_gluon_new_model:
+  ipynb>:
+    notebook: clustering
+    input_table: ml_datasets.dermatology
+    output_table: ${output_database}.dermatology_clusters_${session_id}
+    export_feature_importance: ${output_database}.feature_importance_${session_id}
+    export_shap_values: ${output_database}.shap_values_${session_id}
@@ -0,0 +1,10 @@
+# Parameters referenced by the workflows as ${name} variables.
+input_database: ml_datasets
+output_database: automl_test
+
+# Training table, target column and test table.
+train_data_table: gluon_train
+target_column: class
+test_data_table: gluon_test
+
+fit_time_limit: 60 * 3   # Fit timeout in seconds. 3 min is just for demo. Default: 60 * 60 (1 hr).
+
+# AUC threshold for the model-performance drift check.
+drift_auc_threshold: 0.93
@@ -0,0 +1,27 @@
+# EDA example: load the sample datasets, then run the EDA notebook on each training table.
+timezone: Asia/Tokyo
+#timezone: PST
+
+# Run the ml_datasets notebook with datasets set to all, writing into the ml_datasets database.
++load_datasets:
+  ipynb>:
+    notebook: ml_datasets
+    output_database: ml_datasets
+    datasets: all
+#   datasets: gluon, bank_marketing, vehicle_coupon, online_retail, telco_churn, boston_house
+
+# Run +run_eda once per entry of the table list, with the parallelism limit set to 3.
++datasets:
+  for_each>:
+    table: [gluon_train, bank_marketing_train, vehicle_coupon_train, online_retail_ltv_train, telco_churn_train, boston_house_train]
+  _parallel:
+    limit: 3
+  _do:
+    +run_eda:
+      ipynb>:
+        docker:
+          task_mem: 128g
+        notebook: EDA
+        input_table: ml_datasets.${table}
+        # The remaining parameters are optional.
+        eda: all
+        # eda: pandas-profiling, sweetviz
+        # target_column: label
+        sampling_threshold: 1000000
@@ -0,0 +1,13 @@
+# Dataset loader: runs only the ml_datasets notebook.
+timezone: Asia/Tokyo
+#timezone: PST
+
+_export:
+  td:
+    engine: presto
+
+# Run the ml_datasets notebook with datasets set to all, writing into the ml_datasets database.
+# The commented-out line below shows how to list specific datasets instead.
++load_datasets:
+  ipynb>:
+    notebook: ml_datasets
+    output_database: ml_datasets
+    datasets: all
+#   datasets: gluon, bank_marketing
@@ -0,0 +1,71 @@
+# Train / predict / evaluate example with experiment tracking.
+# config/params.yaml is expected to define the ${...} parameters used below
+# (input_database, output_database, train_data_table, test_data_table, target_column, fit_time_limit).
+_export:
+  !include : config/params.yaml
+  td:
+    engine: presto
+    database: ${output_database}
+
+# Create the output database and the two tables that +track_experiment and
+# +record_evaluation insert into, if they do not exist.
++create_db_tbl_if_not_exists:
+  td_ddl>:
+  create_databases: ["${output_database}"]
+  create_tables: ["automl_experiments", "automl_eval_results"]
+
+# Train a model with the gluon_train notebook. The model name includes ${session_id}.
++train:
+  ml_train>:
+    docker:
+      task_mem: 128g # 64g/128g/256g/384g/512g
+    notebook: gluon_train
+    model_name: gluon_model_${session_id}
+    input_table: ${input_database}.${train_data_table}
+    target_column: ${target_column}
+    time_limit: ${fit_time_limit}
+    share_model: true
+    export_leaderboard: ${output_database}.leaderboard_${train_data_table}
+    export_feature_importance: ${output_database}.feature_importance_${train_data_table}
+
+# Record the training run in automl_experiments via queries/track_experiment.sql.
+# NOTE(review): the ${automl.*} variables are presumably set by the preceding ml_train task; confirm.
++track_experiment:
+  td>: queries/track_experiment.sql
+  insert_into: ${output_database}.automl_experiments
+  last_executed_notebook: ${automl.last_executed_notebook}
+  user_id: ${automl.last_executed_user_id}
+  user_email: ${automl.last_executed_user_email}
+  model_name: gluon_model_${session_id}
+  shared_model: ${automl.shared_model}
+  task_attempt_id: ${attempt_id}
+  session_time: ${session_local_time}
+  engine: presto
+
+# Note: if input_table contains target labels, ml_predict shows evaluation results.
++predict:
+  ml_predict>:
+    docker:
+      task_mem: 64g # 64g/128g/256g/384g/512g
+    notebook: gluon_predict
+    model_name: gluon_model_${session_id}
+    input_table: ${input_database}.${test_data_table}
+    output_table: ${output_database}.predicted_${test_data_table}_${session_id}
+
+# Run queries/auc.sql against the prediction table. store_last_results keeps the
+# query result available as ${td.last_results}, which the later tasks read as ${td.last_results.auc}.
+# Unlike the other queries in this workflow, this one sets engine: hive.
++evaluation:
+  td>: queries/auc.sql
+  table: ${output_database}.predicted_${test_data_table}_${session_id}
+  target_column: ${target_column}
+  positive_class: ' >50K'
+  store_last_results: true
+  engine: hive
+
+# Send an alert mail when the evaluated AUC is below drift_auc_threshold.
++alert_if_drift_detected:
+  if>: ${td.last_results.auc < drift_auc_threshold}
+  _do:
+    mail>:
+      data: Detect drift in model performance. AUC was ${td.last_results.auc}.
+    subject: Drift detected
+    # Placeholder recipient: replace with a real address before running.
+    to: [me@example.com]
+    # bcc: [foo@example.com,bar@example.com]
+
+# Insert the evaluation result (model name, test table, session time and AUC)
+# into automl_eval_results via queries/record_evaluation.sql.
++record_evaluation:
+  td>: queries/record_evaluation.sql
+  insert_into: ${output_database}.automl_eval_results
+  engine: presto
+  model_name: gluon_model_${session_id}
+  test_table: ${input_database}.${test_data_table}
+  session_time: ${session_local_time}
+  auc: ${td.last_results.auc}
@@ -0,0 +1,67 @@
+timezone: Asia/Tokyo
+#timezone: PST
+
+# config/params.yaml is expected to define input_database and output_database.
+_export:
+  !include : config/params.yaml
+  td:
+    engine: presto
+    database: ${output_database}
+
+# Create the output database and the experiment-tracking table if they do not exist.
++create_db_tbl_if_not_exists:
+  td_ddl>:
+  create_databases: ["${output_database}"]
+  # Must name the table that +track_experiment inserts into.
+  create_tables: ["automl_experiments"]
+
+# Run the ml_datasets notebook for the gluon dataset, writing into ${input_database}.
++load_datasets:
+  ipynb>:
+    notebook: ml_datasets
+    output_database: ${input_database}
+    # NOTE(review): input_table points at a table named "dummy"; presumably a placeholder. Confirm it is needed.
+    input_table: ${input_database}.dummy
+#   datasets: gluon, bank_marketing
+    datasets: gluon
+
+# Train a model with the gluon_train notebook. The model name includes ${session_id}.
++gluon_train:
+  ml_train>:
+    notebook: gluon_train
+    model_name: gluon_model_${session_id}
+    input_table: ${input_database}.gluon_train # expects database_name.table_name
+    target_column: class
+    # The remaining parameters are optional.
+    #problem_type: binary                # 'binary', 'multiclass', 'regression', or 'quantile'. AutoGluon detects the problem type automatically if not specified.
+    #eval_metric: roc_auc                # AutoGluon selects a suitable eval_metric for the given setting if not specified.
+    ignore_columns: time,rowid           # Note: the time column is ignored by default.
+    time_limit: 60 * 3                   # Fit timeout. 3 min is just for demo. Default: 60 * 60 (1 hr). 1 hr or more is recommended for production (24 hours at most). This is a soft limit, not a hard limit.
+    # timeout: 60 * 3                    # Hard timeout for notebook execution, applied per cell. No timeout if not specified.
+    export_leaderboard: ${output_database}.leaderboard_gluon_train
+    export_feature_importance: ${output_database}.feature_importance_gluon_train
+    # hide_table_contents: true
+
+# Print the name of the notebook that was executed last.
++print_train_result:
+  echo>: "executed ${automl.last_executed_notebook}.ipynb"
+
+# Record the training run in automl_experiments via queries/track_experiment.sql.
++track_experiment:
+  td>: queries/track_experiment.sql
+  insert_into: automl_experiments
+  last_executed_notebook: ${automl.last_executed_notebook}
+  user_id: ${automl.last_executed_user_id}
+  user_email: ${automl.last_executed_user_email}
+  model_name: gluon_model_${session_id}
+  task_attempt_id: ${attempt_id}
+  session_time: ${session_local_time}
+  engine: presto
+
+# Predict with the model trained above, using the gluon_predict notebook.
++gluon_predict:
+  ml_predict>:
+    notebook: gluon_predict
+    model_name: gluon_model_${session_id}
+    input_table: ${input_database}.gluon_test # expects database_name.table_name
+    output_table: ${output_database}.gluon_predicted  # expects database_name.table_name. The database is created if it does not exist; the table is overwritten.
+    # The remaining parameters are optional.
+    #rowid_column: rowid                # Note: when rowid_column is specified, the output table contains only the rowid column and the prediction result columns.
+    #ignore_columns: time               # The target column should not be in the test data.
+    export_leaderboard: ${output_database}.leaderboard_gluon_predict
+    export_feature_importance: ${output_database}.feature_importance_gluon_predict
+    # hide_table_contents: true
+
+# Print the name of the notebook that was executed last.
++print_predict_result:
+  echo>: "executed ${automl.last_executed_notebook}.ipynb"
@@ -0,0 +1,41 @@
+# MTA example: load the mta sample data, then run the MTA notebook on it.
+#timezone: Asia/Tokyo
+#timezone: PST
+
+_export:
+  !include : config/params.yaml
+  td:
+    engine: presto
+    database: sample_datasets # dummy to avoid an error on create_databases
+  # Database that receives the exported MTA tables.
+  output_db: ml_test
+
+# Create the ml_datasets database and the output database if they do not exist.
++create_db_tbl_if_not_exists:
+  td_ddl>:
+  create_databases: ["ml_datasets", "${output_db}"]
+
+# Run the ml_datasets notebook for the mta dataset, writing into the ml_datasets database.
++load_datasets:
+  ipynb>:
+    docker:
+      task_mem: 64g
+    notebook: ml_datasets
+    output_database: ml_datasets
+    datasets: mta
+
+# Run the MTA notebook on ml_datasets.mta and export three result tables to ${output_db}.
++run_mta:
+  ipynb>:
+    docker:
+      task_mem: 128g # 64g/128g/256g/384g/512g
+    notebook: MTA
+    # required param
+    input_table: ml_datasets.mta
+    # optional params
+    tstamp_column: tstamp
+    user_column: user
+    channel_column: channel
+    conversion_column: conversion
+    # optional params (usually not needed)
+    analyze_topk_channels: 50
+    ignore_channels: Facebook
+    overwrite_channel: Direct
+    export_channel_interactions: ${output_db}.channel_interactions
+    export_shapley_attributions: ${output_db}.shapley_attributions
+    export_attributed_conversions: ${output_db}.attributed_conversions
@@ -0,0 +1,51 @@
+# NBA example: load the nba sample data, then run the NBA notebook in two configurations.
+# config/params.yaml is expected to define output_database.
+_export:
+  !include : config/params.yaml
+  td:
+    engine: presto
+    database: ${output_database}
+
+# Create the output database if it does not exist.
++create_db_tbl_if_not_exists:
+  td_ddl>:
+  create_databases: ["${output_database}"]
+
+# Run the ml_datasets notebook for the nba dataset, writing into the ml_datasets database.
++load_datasets:
+  ipynb>:
+    notebook: ml_datasets
+    output_database: ml_datasets
+    datasets: nba
+
+# Run the NBA notebook with a training table only, exporting the Q-table and the state-action table.
++nba_only_qtable:
+  ipynb>:
+    notebook: NBA
+    train_table: ml_datasets.nba_train
+    # optional
+    export_q_table: ${output_database}.rl_qtable_${session_id}
+    export_state_action: ${output_database}.rl_state_action_${session_id}
+
+# Run the NBA notebook with both a training and a test table, plus budget and per-action cost settings.
++nba_with_eval:
+  ipynb>:
+    notebook: NBA
+    train_table: ml_datasets.nba_train
+    test_table: ml_datasets.nba_test
+    budget: 10000
+    value_per_cv: 100
+    # optional
+    # export_q_table: ${output_database}.rl_qtable_${session_id}
+    export_channel_ratio: ${output_database}.rl_channel_ratio_${session_id}
+    export_predictions: ${output_database}.rl_predictions_${session_id}
+    export_model_performance: ${output_database}.rl_model_performance_${session_id}
+    ignore_actions: client_domain_organic_visit, organic_search
+    # action_cost is a JSON object passed as a literal block string.
+    # NOTE(review): the "emai" key below may be a typo for "email"; confirm against the action names in the training data.
+    action_cost: |
+     {
+       "display": 2,
+       "social-social": 1.4,
+       "social": 2,
+       "social-paid": 5,
+       "organic_search": 1,
+       "emai": 3.2,
+       "cpc": 3,
+       "referral": 2,
+       "linkedin": 3,
+       "search-paid": 2,
+       "twitter": 1
+     }