Commit fb724df (parent 5dec884): 5 changed files with 98 additions and 28 deletions.
---
base_model: microsoft/Phi-3-mini-4k-instruct
library_name: peft
license: mit
tags:
- generated_from_trainer
model-index:
- name: phi-3-mini-lora-text2sql
  results: []
---

<!-- This model card has been generated automatically according to the information the Trainer had access to. You
should probably proofread and complete it, then remove this comment. -->

[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="200" height="32"/>](https://wandb.ai/truskovskiyk/ml-in-production-practice/runs/7932655z)

# phi-3-mini-lora-text2sql

This model is a fine-tuned version of [microsoft/Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct) on an unspecified dataset.
It achieves the following results on the evaluation set:
- Loss: 0.8630
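For orientation, here is a minimal usage sketch for attaching a LoRA adapter like this one to the base model with `transformers` and `peft`. The adapter repo id and the example prompt are placeholders, not details stated in the card.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_id = "microsoft/Phi-3-mini-4k-instruct"
adapter_id = "your-username/phi-3-mini-lora-text2sql"  # placeholder: actual Hub id or local adapter path

# Load the frozen base model, then attach the trained LoRA adapter on top of it.
tokenizer = AutoTokenizer.from_pretrained(base_id)
base_model = AutoModelForCausalLM.from_pretrained(
    base_id, torch_dtype=torch.float16, device_map="auto"
)
model = PeftModel.from_pretrained(base_model, adapter_id)

# Example text-to-SQL style prompt (illustrative only).
prompt = "Translate to SQL: list all customers who placed an order in 2023."
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    output = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```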
## Model description

More information needed

## Intended uses & limitations

More information needed

## Training and evaluation data

More information needed
## Training procedure

### Training hyperparameters

The following hyperparameters were used during training (mirrored in the configuration sketch after this list):
- learning_rate: 0.0001
- train_batch_size: 8
- eval_batch_size: 8
- seed: 42
- gradient_accumulation_steps: 4
- total_train_batch_size: 32
- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
- lr_scheduler_type: linear
- lr_scheduler_warmup_ratio: 0.1
- num_epochs: 3
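As a rough illustration only (the training script is not part of this card), the hyperparameters above correspond to a `transformers.TrainingArguments` configuration along these lines; the output directory is an assumption.

```python
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="phi-3-mini-lora-text2sql",  # assumed output directory
    learning_rate=1e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,          # 8 per device * 4 steps = effective batch size 32
    seed=42,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
    num_train_epochs=3,
    # Trainer's default AdamW uses betas=(0.9, 0.999) and epsilon=1e-08,
    # matching the optimizer settings reported above.
)
```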
### Training results

| Training Loss | Epoch  | Step | Validation Loss |
|:-------------:|:------:|:----:|:---------------:|
| No log        | 0      | 0    | 2.8867          |
| 1.0694        | 2.1436 | 500  | 0.8630          |
### Framework versions

- PEFT 0.11.1
- Transformers 4.42.3
- Pytorch 2.1.0+cu118
- Datasets 2.15.0
- Tokenizers 0.19.1