
Commit 171bc46

update

truskovskiyk committed Apr 21, 2024 · 1 parent f4838ef
Showing 7 changed files with 208 additions and 736 deletions.
41 changes: 41 additions & 0 deletions .github/workflows/ci.yaml
@@ -0,0 +1,41 @@
name: Publish Docker image

on:
  push:
    branches:
      - main
      - migrate-to-github-registry-for-docker-images

jobs:

  container:
    runs-on: ubuntu-latest

    permissions:
      contents: read
      packages: write

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Log in to the Container registry
        uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Extract metadata (tags, labels) for Docker
        id: meta
        uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7
        with:
          images: ghcr.io/kyryl-opens-ml/fine-tune-llm-in-2024

      - name: Build and push Docker image
        uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4
        with:
          context: .
          push: true
          tags: ${{ steps.meta.outputs.tags }},latest
          labels: ${{ steps.meta.outputs.labels }}

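For context, a minimal sketch of how the image published by this workflow could be pulled and smoke-tested locally. It assumes the GHCR package is public, that docker/metadata-action tagged a push from main as ":main", and that a local Docker daemon is available; none of this is part of the commit itself.

import subprocess

# Image name comes from the workflow above; the ":main" tag is an assumption
# based on docker/metadata-action's default branch-derived tagging.
image = "ghcr.io/kyryl-opens-ml/fine-tune-llm-in-2024:main"

# Pull the published image.
subprocess.run(["docker", "pull", image], check=True)
# Quick smoke test: run the container and print its Python version
# (assumes the image ships a Python environment).
subprocess.run(["docker", "run", "--rm", image, "python", "--version"], check=True)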
1 change: 1 addition & 0 deletions test.txt
@@ -0,0 +1 @@
/app/test.txt
142 changes: 105 additions & 37 deletions text2sql_training/code.py
@@ -1,37 +1,105 @@
import transformers
import torch

model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device="cuda",
)

messages = [
    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
    {"role": "user", "content": "Who are you?"},
]

prompt = pipeline.tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)

terminators = [
    pipeline.tokenizer.eos_token_id,
    pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

outputs = pipeline(
    prompt,
    max_new_tokens=256,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
)
print(outputs[0]["generated_text"][len(prompt):])
# import transformers
# import torch

# model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

# pipeline = transformers.pipeline(
# "text-generation",
# model=model_id,
# model_kwargs={"torch_dtype": torch.bfloat16},
# device="cuda",
# )

# messages = [
# {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
# {"role": "user", "content": "Who are you?"},
# ]

# prompt = pipeline.tokenizer.apply_chat_template(
# messages,
# tokenize=False,
# add_generation_prompt=True
# )

# terminators = [
# pipeline.tokenizer.eos_token_id,
# pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
# ]

# outputs = pipeline(
# prompt,
# max_new_tokens=256,
# eos_token_id=terminators,
# do_sample=True,
# temperature=0.6,
# top_p=0.9,
# )
# print(outputs[0]["generated_text"][len(prompt):])

def end2end_test():
    # create_text_to_sql_dataset = create_text_to_sql_dataset(config=DataConfig())
    create_text_to_sql_dataset = {'train_path': 'train_dataset-sql.json', 'test_path': 'test_dataset-sql.json'}
    train_data = load_dataset(
        "json", data_files=create_text_to_sql_dataset["train_path"], split="train"
    )
    test_data = load_dataset(
        "json", data_files=create_text_to_sql_dataset["test_path"], split="train"
    )

    # train_data = train_data.select(range(100))
    # test_data = train_data
    config = ModelTrainingConfig(peft_model_id='duckdb-text2sql-llama-3-8B-sql-full-lora')

    # trained_model = trained_model()


import os
import zipfile
import io

def create_and_zip_folder():
    # Define the folder and files to create
    folder_path = 'example_folder'
    file_names = ['file1.txt', 'file2.txt', 'file3.txt']

    # Create the folder
    os.makedirs(folder_path, exist_ok=True)

    # Create some example files in the folder
    for file_name in file_names:
        with open(os.path.join(folder_path, file_name), 'w') as f:
            f.write(f"Contents of {file_name}")

    # Create a zip file in memory
    zip_buffer = io.BytesIO()
    with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
        for file_name in file_names:
            zip_file.write(os.path.join(folder_path, file_name), arcname=file_name)

    # Clean up the folder after zipping (optional)
    for file_name in file_names:
        os.remove(os.path.join(folder_path, file_name))
    os.rmdir(folder_path)

    # Return the bytes of the zip file
    zip_buffer.seek(0)
    return zip_buffer.getvalue()

def main_function():
    # Get the zip bytes
    zip_bytes = create_and_zip_folder()

    # Read the zip from bytes
    zip_buffer = io.BytesIO(zip_bytes)

    # Define the directory where to unzip
    output_dir = 'unzipped_content'
    os.makedirs(output_dir, exist_ok=True)

    # Unzip the content
    with zipfile.ZipFile(zip_buffer, 'r') as zip_file:
        zip_file.extractall(path=output_dir)

# Calling the main function to execute
if __name__ == '__main__':
    main_function()
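The end2end_test snippet above calls load_dataset and ModelTrainingConfig without importing them. A minimal sketch of the surrounding context it appears to assume: the Hugging Face datasets library installed, the JSON splits present locally, and the config class importable from llm_stf.py (the import path below is a hypothetical stand-in, not taken from the commit).

from datasets import load_dataset

# Hypothetical import path; in this repo the config class lives in
# text2sql_training/llm_stf.py, so adjust to how the module is actually packaged.
from llm_stf import ModelTrainingConfig

# File names are the ones used in end2end_test; their presence on disk is assumed.
train_data = load_dataset("json", data_files="train_dataset-sql.json", split="train")
test_data = load_dataset("json", data_files="test_dataset-sql.json", split="train")
config = ModelTrainingConfig(peft_model_id='duckdb-text2sql-llama-3-8B-sql-full-lora')
print(len(train_data), len(test_data), config.peft_model_id)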
111 changes: 43 additions & 68 deletions text2sql_training/llm_stf.py
@@ -23,18 +23,19 @@


class DataConfig(Config):
    dataset_name: str = "motherduckdb/duckdb-text2sql-25k"
    # dataset_name: str = "motherduckdb/duckdb-text2sql-25k"
    dataset_name: str = "b-mc2/sql-create-context"

    train_data_path: str = "train_dataset-dagster.json"
    test_data_path: str = "test_dataset-dagster.json"
    train_data_path: str = "train_dataset-sql.json"
    test_data_path: str = "test_dataset-sql.json"

    test_size: float = 0.1
    sample_training: int = 5000


class ModelTrainingConfig(Config):
    pretrained_model_id: str = "meta-llama/Meta-Llama-3-8B-Instruct"
    peft_model_id: str = "duckdb-text2sql-llama-3-8B-dagster"
    peft_model_id: str = "duckdb-text2sql-llama-3-8B-sql"


def create_conversation(sample):
@@ -47,10 +48,10 @@ def create_conversation(sample):
"messages": [
{
"role": "system",
"content": system_message.format(schema=sample["schema"]),
"content": system_message.format(schema=sample["context"]),
},
{"role": "user", "content": sample["prompt"]},
{"role": "assistant", "content": sample["query"]},
{"role": "user", "content": sample["question"]},
{"role": "assistant", "content": sample["answer"]},
]
}

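This hunk switches the prompt fields from the duckdb-text2sql-25k schema (schema/prompt/query) to the b-mc2/sql-create-context fields (context/question/answer). A minimal sketch of what a converted record looks like; the sample values and the system_message template below are illustrative stand-ins, not taken from the commit.

# Stand-in for the system_message template defined earlier in llm_stf.py.
system_message = "You are a text-to-SQL assistant. Database schema:\n{schema}"

# Illustrative record in the b-mc2/sql-create-context field layout.
sample = {
    "context": "CREATE TABLE head (age INTEGER)",
    "question": "How many heads of the departments are older than 56?",
    "answer": "SELECT COUNT(*) FROM head WHERE age > 56",
}

conversation = {
    "messages": [
        {"role": "system", "content": system_message.format(schema=sample["context"])},
        {"role": "user", "content": sample["question"]},
        {"role": "assistant", "content": sample["answer"]},
    ]
}
print(conversation["messages"][0]["content"])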
@@ -115,10 +116,10 @@ def test_data(context: AssetExecutionContext, create_text_to_sql_dataset):
    return dataset



@asset(group_name="model")
def trained_model(
    context: AssetExecutionContext, config: ModelTrainingConfig, train_data
    context: AssetExecutionContext, config: ModelTrainingConfig, train_data, test_data
):
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
@@ -162,7 +163,7 @@ def trained_model(

    args = TrainingArguments(
        output_dir=config.peft_model_id, # directory to save and repository id
        num_train_epochs=0.1, # number of training epochs
        num_train_epochs=1, # number of training epochs
        per_device_train_batch_size=2, # batch size per device during training
        gradient_accumulation_steps=2, # number of steps before performing a backward/update pass
        gradient_checkpointing=True, # use gradient checkpointing to save memory
@@ -182,11 +183,19 @@
    max_seq_length = 3072 # max sequence length for model and packing of the dataset
    # max_seq_length = 2048 # max sequence length for model and packing of the dataset

    def compute_metrics(eval_preds):
        preds, labels = eval_preds
        print(preds)
        print(labels)
        return {'accuracy': 1}

    trainer = SFTTrainer(
        model=model,
        args=args,
        train_dataset=train_data,
        eval_dataset=test_data,
        peft_config=peft_config,
        compute_metrics=compute_metrics,
        max_seq_length=max_seq_length,
        tokenizer=tokenizer,
        packing=True,
@@ -233,85 +242,51 @@ def trained_model(
    torch.cuda.empty_cache()


def end2end_test():
    create_text_to_sql_dataset = create_text_to_sql_dataset(config=DataConfig())
    train_data = load_dataset(
        "json", data_files=create_text_to_sql_dataset["train_path"], split="train"
    )
    test_data = load_dataset(
        "json", data_files=create_text_to_sql_dataset["test_path"], split="train"
    )

    train_data = train_data.select(range(100))
    test_data = train_data
    config = ModelTrainingConfig()

    # trained_model = trained_model()



@asset(group_name="model")
def test_results(test_data, trained_model, config: ModelTrainingConfig):
def test_results(context: AssetExecutionContext, test_data, trained_model, config: ModelTrainingConfig):
    tokenizer = AutoTokenizer.from_pretrained(config.peft_model_id)
    model = AutoPeftModelForCausalLM.from_pretrained(
        config.peft_model_id,
        device_map="cuda",
        device_map="auto",
        torch_dtype=torch.float16
    )

    merged_model = model.merge_and_unload()
    pipe = pipeline("text-generation", model=merged_model, tokenizer=tokenizer, torch_dtype=torch.float16)

    rand_idx = randint(0, len(test_data))
    messages = test_data[rand_idx]["messages"][:2]

    prompt = pipe.tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    terminators = [
        pipe.tokenizer.eos_token_id,
        pipe.tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]

    outputs = pipe(
        prompt,
        max_new_tokens=256,
        eos_token_id=terminators,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
    )
    print(outputs[0]["generated_text"][len(prompt):])

    print(f"Query:\n{test_data[rand_idx]['messages'][1]['content']}")
    print(f"Original Answer:\n{test_data[rand_idx]['messages'][2]['content']}")
    print(f"Generated Answer:\n{outputs[0]['generated_text'][len(prompt):].strip()}")

    # prompt = pipe.tokenizer.apply_chat_template(
    # test_data[rand_idx]["messages"][:2],
    # tokenize=False,
    # add_generation_prompt=True,
    # )
    # outputs = pipe(
    # prompt,
    # max_new_tokens=256,
    # do_sample=False,
    # temperature=0.1,
    # top_k=50,
    # top_p=0.1,
    # eos_token_id=pipe.tokenizer.eos_token_id,
    # pad_token_id=pipe.tokenizer.pad_token_id,
    # )



    # sample = test_data[randint(0, len(test_data))]

    inference_samples = []
    for _ in range(10):

        rand_idx = randint(0, len(test_data))
        messages = test_data[rand_idx]["messages"][:2]

        # prompt = sample["prompt"]
        # response = pipe(prompt)
        prompt = pipe.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        outputs = pipe(prompt, max_new_tokens=256, do_sample=False, temperature=0.1, top_k=50, top_p=0.1, eos_token_id=terminators, pad_token_id=pipe.tokenizer.pad_token_id)
        inference_samples.append({
            'query': test_data[rand_idx]['messages'][1]['content'],
            'original_sql': test_data[rand_idx]['messages'][2]['content'],
            'generated_sql': outputs[0]['generated_text'][len(prompt):].strip().lower()
        })

    context.add_output_metadata(
        {
            "inference_samples": MetadataValue.json(inference_samples),
        }
    )

    # return response

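The test_results asset logs inference_samples as Dagster output metadata, while the placeholder compute_metrics above always reports accuracy 1. A minimal sketch (not part of the commit) of how those samples could be scored offline with a naive exact-match check; the key names follow the dict built in the loop above, and the example inputs are hypothetical.

def exact_match_accuracy(inference_samples):
    # Naive metric: normalized string equality between reference and generated SQL.
    if not inference_samples:
        return 0.0
    matches = sum(
        s["original_sql"].strip().lower() == s["generated_sql"].strip().lower()
        for s in inference_samples
    )
    return matches / len(inference_samples)

# Example usage with hypothetical samples:
samples = [
    {"original_sql": "SELECT 1", "generated_sql": "select 1"},
    {"original_sql": "SELECT 2", "generated_sql": "select 3"},
]
print(exact_match_accuracy(samples))  # 0.5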