
Commit 171bc46

update

truskovskiyk committed Apr 21, 2024 · 1 parent f4838ef
Showing 7 changed files with 208 additions and 736 deletions.
41 changes: 41 additions & 0 deletions .github/workflows/ci.yaml
@@ -0,0 +1,41 @@
name: Publish Docker image

on:
  push:
    branches:
      - main
      - migrate-to-github-registry-for-docker-images

jobs:

  container:
    runs-on: ubuntu-latest

    permissions:
      contents: read
      packages: write

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Log in to the Container registry
        uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Extract metadata (tags, labels) for Docker
        id: meta
        uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7
        with:
          images: ghcr.io/kyryl-opens-ml/fine-tune-llm-in-2024

      - name: Build and push Docker image
        uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4
        with:
          context: .
          push: true
          tags: ${{ steps.meta.outputs.tags }},latest
          labels: ${{ steps.meta.outputs.labels }}

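For context, a minimal sketch of how the image published by this workflow could be pulled and smoke-tested locally. It assumes the GHCR package is public, that docker/metadata-action tagged a push from main as ":main", and that a local Docker daemon is available; none of this is part of the commit itself.

import subprocess

# Image name comes from the workflow above; the ":main" tag is an assumption
# based on docker/metadata-action's default branch-derived tagging.
image = "ghcr.io/kyryl-opens-ml/fine-tune-llm-in-2024:main"

# Pull the published image.
subprocess.run(["docker", "pull", image], check=True)
# Quick smoke test: run the container and print its Python version
# (assumes the image ships a Python environment).
subprocess.run(["docker", "run", "--rm", image, "python", "--version"], check=True)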
1 change: 1 addition & 0 deletions test.txt
@@ -0,0 +1 @@
/app/test.txt
142 changes: 105 additions & 37 deletions text2sql_training/code.py
@@ -1,37 +1,105 @@
import transformers
import torch

model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device="cuda",
)

messages = [
    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
    {"role": "user", "content": "Who are you?"},
]

prompt = pipeline.tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)

terminators = [
    pipeline.tokenizer.eos_token_id,
    pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

outputs = pipeline(
    prompt,
    max_new_tokens=256,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
)
print(outputs[0]["generated_text"][len(prompt):])
# import transformers
# import torch

# model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

# pipeline = transformers.pipeline(
# "text-generation",
# model=model_id,
# model_kwargs={"torch_dtype": torch.bfloat16},
# device="cuda",
# )

# messages = [
# {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
# {"role": "user", "content": "Who are you?"},
# ]

# prompt = pipeline.tokenizer.apply_chat_template(
# messages,
# tokenize=False,
# add_generation_prompt=True
# )

# terminators = [
# pipeline.tokenizer.eos_token_id,
# pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
# ]

# outputs = pipeline(
# prompt,
# max_new_tokens=256,
# eos_token_id=terminators,
# do_sample=True,
# temperature=0.6,
# top_p=0.9,
# )
# print(outputs[0]["generated_text"][len(prompt):])

def end2end_test():
    # create_text_to_sql_dataset = create_text_to_sql_dataset(config=DataConfig())
    create_text_to_sql_dataset = {'train_path': 'train_dataset-sql.json', 'test_path': 'test_dataset-sql.json'}
    train_data = load_dataset(
        "json", data_files=create_text_to_sql_dataset["train_path"], split="train"
    )
    test_data = load_dataset(
        "json", data_files=create_text_to_sql_dataset["test_path"], split="train"
    )

    # train_data = train_data.select(range(100))
    # test_data = train_data
    config = ModelTrainingConfig(peft_model_id='duckdb-text2sql-llama-3-8B-sql-full-lora')

    # trained_model = trained_model()


import os
import zipfile
import io

def create_and_zip_folder():
    # Define the folder and files to create
    folder_path = 'example_folder'
    file_names = ['file1.txt', 'file2.txt', 'file3.txt']

    # Create the folder
    os.makedirs(folder_path, exist_ok=True)

    # Create some example files in the folder
    for file_name in file_names:
        with open(os.path.join(folder_path, file_name), 'w') as f:
            f.write(f"Contents of {file_name}")

    # Create a zip file in memory
    zip_buffer = io.BytesIO()
    with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
        for file_name in file_names:
            zip_file.write(os.path.join(folder_path, file_name), arcname=file_name)

    # Clean up the folder after zipping (optional)
    for file_name in file_names:
        os.remove(os.path.join(folder_path, file_name))
    os.rmdir(folder_path)

    # Return the bytes of the zip file
    zip_buffer.seek(0)
    return zip_buffer.getvalue()

def main_function():
    # Get the zip bytes
    zip_bytes = create_and_zip_folder()

    # Read the zip from bytes
    zip_buffer = io.BytesIO(zip_bytes)

    # Define the directory where to unzip
    output_dir = 'unzipped_content'
    os.makedirs(output_dir, exist_ok=True)

    # Unzip the content
    with zipfile.ZipFile(zip_buffer, 'r') as zip_file:
        zip_file.extractall(path=output_dir)

# Calling the main function to execute
if __name__ == '__main__':
    main_function()
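The end2end_test snippet above calls load_dataset and ModelTrainingConfig without importing them. A minimal sketch of the surrounding context it appears to assume: the Hugging Face datasets library installed, the JSON splits present locally, and the config class importable from llm_stf.py (the import path below is a hypothetical stand-in, not taken from the commit).

from datasets import load_dataset

# Hypothetical import path; in this repo the config class lives in
# text2sql_training/llm_stf.py, so adjust to how the module is actually packaged.
from llm_stf import ModelTrainingConfig

# File names are the ones used in end2end_test; their presence on disk is assumed.
train_data = load_dataset("json", data_files="train_dataset-sql.json", split="train")
test_data = load_dataset("json", data_files="test_dataset-sql.json", split="train")
config = ModelTrainingConfig(peft_model_id='duckdb-text2sql-llama-3-8B-sql-full-lora')
print(len(train_data), len(test_data), config.peft_model_id)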
111 changes: 43 additions & 68 deletions text2sql_training/llm_stf.py
@@ -23,18 +23,19 @@


class DataConfig(Config):
    dataset_name: str = "motherduckdb/duckdb-text2sql-25k"
    # dataset_name: str = "motherduckdb/duckdb-text2sql-25k"
    dataset_name: str = "b-mc2/sql-create-context"

    train_data_path: str = "train_dataset-dagster.json"
    test_data_path: str = "test_dataset-dagster.json"
    train_data_path: str = "train_dataset-sql.json"
    test_data_path: str = "test_dataset-sql.json"

    test_size: float = 0.1
    sample_training: int = 5000


class ModelTrainingConfig(Config):
    pretrained_model_id: str = "meta-llama/Meta-Llama-3-8B-Instruct"
    peft_model_id: str = "duckdb-text2sql-llama-3-8B-dagster"
    peft_model_id: str = "duckdb-text2sql-llama-3-8B-sql"


def create_conversation(sample):
@@ -47,10 +48,10 @@ def create_conversation(sample):
"messages": [
{
"role": "system",
"content": system_message.format(schema=sample["schema"]),
"content": system_message.format(schema=sample["context"]),
},
{"role": "user", "content": sample["prompt"]},
{"role": "assistant", "content": sample["query"]},
{"role": "user", "content": sample["question"]},
{"role": "assistant", "content": sample["answer"]},
]
}

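This hunk switches the prompt fields from the duckdb-text2sql-25k schema (schema/prompt/query) to the b-mc2/sql-create-context fields (context/question/answer). A minimal sketch of what a converted record looks like; the sample values and the system_message template below are illustrative stand-ins, not taken from the commit.

# Stand-in for the system_message template defined earlier in llm_stf.py.
system_message = "You are a text-to-SQL assistant. Database schema:\n{schema}"

# Illustrative record in the b-mc2/sql-create-context field layout.
sample = {
    "context": "CREATE TABLE head (age INTEGER)",
    "question": "How many heads of the departments are older than 56?",
    "answer": "SELECT COUNT(*) FROM head WHERE age > 56",
}

conversation = {
    "messages": [
        {"role": "system", "content": system_message.format(schema=sample["context"])},
        {"role": "user", "content": sample["question"]},
        {"role": "assistant", "content": sample["answer"]},
    ]
}
print(conversation["messages"][0]["content"])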
@@ -115,10 +116,10 @@ def test_data(context: AssetExecutionContext, create_text_to_sql_dataset):
    return dataset



@asset(group_name="model")
def trained_model(
    context: AssetExecutionContext, config: ModelTrainingConfig, train_data
    context: AssetExecutionContext, config: ModelTrainingConfig, train_data, test_data
):
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
@@ -162,7 +163,7 @@ def trained_model(

    args = TrainingArguments(
        output_dir=config.peft_model_id, # directory to save and repository id
        num_train_epochs=0.1, # number of training epochs
        num_train_epochs=1, # number of training epochs
        per_device_train_batch_size=2, # batch size per device during training
        gradient_accumulation_steps=2, # number of steps before performing a backward/update pass
        gradient_checkpointing=True, # use gradient checkpointing to save memory
@@ -182,11 +183,19 @@
    max_seq_length = 3072 # max sequence length for model and packing of the dataset
    # max_seq_length = 2048 # max sequence length for model and packing of the dataset

    def compute_metrics(eval_preds):
        preds, labels = eval_preds
        print(preds)
        print(labels)
        return {'accuracy': 1}

    trainer = SFTTrainer(
        model=model,
        args=args,
        train_dataset=train_data,
        eval_dataset=test_data,
        peft_config=peft_config,
        compute_metrics=compute_metrics,
        max_seq_length=max_seq_length,
        tokenizer=tokenizer,
        packing=True,
@@ -233,85 +242,51 @@ def trained_model(
    torch.cuda.empty_cache()


def end2end_test():
    create_text_to_sql_dataset = create_text_to_sql_dataset(config=DataConfig())
    train_data = load_dataset(
        "json", data_files=create_text_to_sql_dataset["train_path"], split="train"
    )
    test_data = load_dataset(
        "json", data_files=create_text_to_sql_dataset["test_path"], split="train"
    )

    train_data = train_data.select(range(100))
    test_data = train_data
    config = ModelTrainingConfig()

    # trained_model = trained_model()



@asset(group_name="model")
def test_results(test_data, trained_model, config: ModelTrainingConfig):
def test_results(context: AssetExecutionContext, test_data, trained_model, config: ModelTrainingConfig):
    tokenizer = AutoTokenizer.from_pretrained(config.peft_model_id)
    model = AutoPeftModelForCausalLM.from_pretrained(
        config.peft_model_id,
        device_map="cuda",
        device_map="auto",
        torch_dtype=torch.float16
    )

    merged_model = model.merge_and_unload()
    pipe = pipeline("text-generation", model=merged_model, tokenizer=tokenizer, torch_dtype=torch.float16)

    rand_idx = randint(0, len(test_data))
    messages = test_data[rand_idx]["messages"][:2]

    prompt = pipe.tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    terminators = [
        pipe.tokenizer.eos_token_id,
        pipe.tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]

    outputs = pipe(
        prompt,
        max_new_tokens=256,
        eos_token_id=terminators,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
    )
    print(outputs[0]["generated_text"][len(prompt):])

    print(f"Query:\n{test_data[rand_idx]['messages'][1]['content']}")
    print(f"Original Answer:\n{test_data[rand_idx]['messages'][2]['content']}")
    print(f"Generated Answer:\n{outputs[0]['generated_text'][len(prompt):].strip()}")

    # prompt = pipe.tokenizer.apply_chat_template(
    # test_data[rand_idx]["messages"][:2],
    # tokenize=False,
    # add_generation_prompt=True,
    # )
    # outputs = pipe(
    # prompt,
    # max_new_tokens=256,
    # do_sample=False,
    # temperature=0.1,
    # top_k=50,
    # top_p=0.1,
    # eos_token_id=pipe.tokenizer.eos_token_id,
    # pad_token_id=pipe.tokenizer.pad_token_id,
    # )



    # sample = test_data[randint(0, len(test_data))]

    inference_samples = []
    for _ in range(10):

        rand_idx = randint(0, len(test_data))
        messages = test_data[rand_idx]["messages"][:2]

        # prompt = sample["prompt"]
        # response = pipe(prompt)
        prompt = pipe.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        outputs = pipe(prompt, max_new_tokens=256, do_sample=False, temperature=0.1, top_k=50, top_p=0.1, eos_token_id=terminators, pad_token_id=pipe.tokenizer.pad_token_id)
        inference_samples.append({
            'query': test_data[rand_idx]['messages'][1]['content'],
            'original_sql': test_data[rand_idx]['messages'][2]['content'],
            'generated_sql': outputs[0]['generated_text'][len(prompt):].strip().lower()
        })

    context.add_output_metadata(
        {
            "inference_samples": MetadataValue.json(inference_samples),
        }
    )

    # return response

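The test_results asset logs inference_samples as Dagster output metadata, while the placeholder compute_metrics above always reports accuracy 1. A minimal sketch (not part of the commit) of how those samples could be scored offline with a naive exact-match check; the key names follow the dict built in the loop above, and the example inputs are hypothetical.

def exact_match_accuracy(inference_samples):
    # Naive metric: normalized string equality between reference and generated SQL.
    if not inference_samples:
        return 0.0
    matches = sum(
        s["original_sql"].strip().lower() == s["generated_sql"].strip().lower()
        for s in inference_samples
    )
    return matches / len(inference_samples)

# Example usage with hypothetical samples:
samples = [
    {"original_sql": "SELECT 1", "generated_sql": "select 1"},
    {"original_sql": "SELECT 2", "generated_sql": "select 3"},
]
print(exact_match_accuracy(samples))  # 0.5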