Commit cc34fac: update
truskovskiyk committed Apr 21, 2024
1 parent 878fbc0 commit cc34fac

Showing 6 changed files with 70 additions and 83 deletions.
7 changes: 2 additions & 5 deletions Dockerfile
@@ -14,12 +14,9 @@ RUN pip install ninja packaging
 RUN MAX_JOBS=4 pip install flash-attn --no-build-isolation
 
 ENV PYTHONPATH /app
 
-
-ENV WANDB_DIR /tmp/wandb
-ENV WANDB_PROJECT dist-training
-ENV WANDB_API_KEY cb86168a2e8db7edb905da69307450f5e7867d66
-ENV HF_TOKEN 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
+ENV HF_TOKEN_READ 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
+ENV HF_TOKEN_WRITE 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
 
 RUN ln -s /usr/bin/python3 /usr/bin/python
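The two new tokens are consumed in `llm_stf.py` below via `os.getenv`. A minimal sketch of that pattern; the fail-fast guard is an assumption, not part of this commit:

```python
import os

# Names match the ENV lines in the Dockerfile above.
HF_TOKEN_READ = os.getenv("HF_TOKEN_READ")
HF_TOKEN_WRITE = os.getenv("HF_TOKEN_WRITE")

# Hypothetical guard (not in the commit): fail at startup rather than
# deep inside a Hugging Face Hub call when a token is missing.
if not (HF_TOKEN_READ and HF_TOKEN_WRITE):
    raise RuntimeError("HF_TOKEN_READ and HF_TOKEN_WRITE must be set")
```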
11 changes: 10 additions & 1 deletion README.md
@@ -1,9 +1,18 @@
 # fine-tune-llms-in-2024-with-trl
 
 ## Docker setup
+export CUDA_VISIBLE_DEVICES=1
 
 ```
 docker build -t fine-tune-llms-in-2024-with-trl:laster .
 docker run -it --gpus '"device=0"' --ipc=host --net=host -v $PWD:/app fine-tune-llms-in-2024-with-trl:laster /bin/bash
+docker run -it --gpus '"device=1"' --ipc=host --net=host -v $PWD:/app fine-tune-llms-in-2024-with-trl:laster /bin/bash
 ```
-```
+
+
+modal token set --token-id ak-r6mjZ61XGQtNoCDZGHrFLP --token-secret as-2m1UyDMKKwTo2uApJVGovn
+
+export MODAL_TOKEN_ID=ak-r6mjZ61XGQtNoCDZGHrFLP
+export MODAL_TOKEN_SECRET=as-2m1UyDMKKwTo2uApJVGovn
+
+modal run
2 changes: 0 additions & 2 deletions requirements.txt
@@ -1,13 +1,11 @@
 torch
-tensorboard
 transformers==4.38.2
 datasets==2.16.1
 accelerate==0.26.1
 evaluate==0.4.1
 bitsandbytes==0.42.0
 trl==0.7.11
 peft==0.8.2
-wandb
 dagster==1.7.1
 dagster-webserver==1.7.1
 ipython
Empty file added text2sql_training/__init__.py
93 changes: 45 additions & 48 deletions text2sql_training/llm_stf.py
@@ -6,6 +6,7 @@
     pipeline,
     TrainingArguments,
 )
+import os
 import evaluate
 from peft import AutoPeftModelForCausalLM
 import torch
@@ -21,6 +22,11 @@
 from transformers import AutoTokenizer, pipeline
 from datasets import load_dataset
 from random import randint
+from huggingface_hub import hf_hub_download
+
+
+HF_TOKEN_READ = os.getenv("HF_TOKEN_READ")
+HF_TOKEN_WRITE = os.getenv("HF_TOKEN_WRITE")
 
 
 class DataConfig(Config):
@@ -36,7 +42,7 @@ class DataConfig(Config):
 
 class ModelTrainingConfig(Config):
     pretrained_model_id: str = "meta-llama/Meta-Llama-3-8B-Instruct"
-    peft_model_id: str = "duckdb-text2sql-llama-3-8B-sql"
+    peft_model_id: str = "text2sql-llama-3-8B"
 
 
 def create_conversation(sample):
@@ -57,7 +63,7 @@ def create_conversation(sample):
         }
 
 
-@asset(group_name="data")
+@asset(group_name="data", compute_kind="python")
 def create_text_to_sql_dataset(config: DataConfig):
     if Path(config.train_data_path).exists() and Path(config.test_data_path).exists():
         return {
@@ -66,15 +72,10 @@ def create_text_to_sql_dataset(config: DataConfig):
         }
     else:
         dataset = load_dataset(config.dataset_name, split="train")
-        dataset = dataset.map(
-            create_conversation, remove_columns=dataset.features, batched=False
-        )
+        dataset = dataset.map(create_conversation, remove_columns=dataset.features, batched=False)
         dataset = dataset.train_test_split(test_size=config.test_size)
 
-        dataset["train"] = (
-            dataset["train"].shuffle().select(range(config.sample_training))
-        )
-
+        dataset["train"] = dataset["train"].shuffle().select(range(config.sample_training))
         # Save datasets to disk
         dataset["train"].to_json(config.train_data_path, orient="records")
         dataset["test"].to_json(config.test_data_path, orient="records")
@@ -85,11 +86,9 @@ def create_text_to_sql_dataset(config: DataConfig):
         }
 
 
-@asset(group_name="data")
+@asset(group_name="data", compute_kind="python")
 def train_data(context: AssetExecutionContext, create_text_to_sql_dataset):
-    dataset = load_dataset(
-        "json", data_files=create_text_to_sql_dataset["train_path"], split="train"
-    )
+    dataset = load_dataset("json", data_files=create_text_to_sql_dataset["train_path"], split="train")
 
     context.add_output_metadata(
         {
@@ -101,11 +100,9 @@ def train_data(context: AssetExecutionContext, create_text_to_sql_dataset):
     return dataset
 
 
-@asset(group_name="data")
+@asset(group_name="data", compute_kind="python")
 def test_data(context: AssetExecutionContext, create_text_to_sql_dataset):
-    dataset = load_dataset(
-        "json", data_files=create_text_to_sql_dataset["test_path"], split="train"
-    )
+    dataset = load_dataset("json", data_files=create_text_to_sql_dataset["test_path"], split="train")
 
     context.add_output_metadata(
         {
@@ -116,12 +113,7 @@ def test_data(context: AssetExecutionContext, create_text_to_sql_dataset):
 
     return dataset
 
-
-
-@asset(group_name="model")
-def trained_model(
-    context: AssetExecutionContext, config: ModelTrainingConfig, train_data
-):
+def run_training(pretrained_model_id: str, peft_model_id: str, train_data):
     bnb_config = BitsAndBytesConfig(
         load_in_4bit=True,
         bnb_4bit_use_double_quant=True,
@@ -130,14 +122,15 @@ def trained_model(
     )
 
     model = AutoModelForCausalLM.from_pretrained(
-        config.pretrained_model_id,
+        pretrained_model_id,
         device_map="auto",
         attn_implementation="flash_attention_2",
         torch_dtype=torch.bfloat16,
         quantization_config=bnb_config,
+        token=HF_TOKEN_READ,
     )
 
-    tokenizer = AutoTokenizer.from_pretrained(config.pretrained_model_id)
+    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_id, token=HF_TOKEN_READ)
     tokenizer.padding_side = "right"
 
     # # set chat template to OAI chatML, remove if you start from a fine-tuned model
@@ -153,6 +146,7 @@ def trained_model(
     #     task_type="CAUSAL_LM",
     # )
 
+    # We are going to train only q and v layer to speedup.
     peft_config = LoraConfig(
         lora_alpha=16,
         lora_dropout=0.05,
@@ -163,7 +157,7 @@ def trained_model(
     )
 
     args = TrainingArguments(
-        output_dir=config.peft_model_id,  # directory to save and repository id
+        output_dir=peft_model_id,  # directory to save and repository id
         num_train_epochs=1,  # number of training epochs
         per_device_train_batch_size=2,  # batch size per device during training
         gradient_accumulation_steps=2,  # number of steps before performing a backward/update pass
@@ -177,12 +171,12 @@ def trained_model(
         max_grad_norm=0.3,  # max gradient norm based on QLoRA paper
         warmup_ratio=0.03,  # warmup ratio based on QLoRA paper
         lr_scheduler_type="constant",  # use constant learning rate scheduler
-        push_to_hub=False,  # push model to hub
-        report_to="wandb",  # report metrics to tensorboard
+        push_to_hub=True,  # push model to hub
+        report_to="none",  # report metrics to tensorboard
+        hub_token=HF_TOKEN_WRITE,
     )
 
-    max_seq_length = 3072  # max sequence length for model and packing of the dataset
+    # max_seq_length = 2048  # max sequence length for model and packing of the dataset
 
     trainer = SFTTrainer(
         model=model,
@@ -208,46 +202,49 @@ def trained_model(
 
 
     kwargs = {
-        "finetuned_from": config.pretrained_model_id,
+        "finetuned_from": pretrained_model_id,
         "tasks": "text2sql",
         "language": "en",
         # "trainable_params": trainable_params,
         # "all_params": all_params,
     }
     # kwargs.update(train_metrics)
     trainer.create_model_card(**kwargs)
 
+    hub_model_id = trainer.hub_model_id
+    del trainer
+    del model
+    torch.cuda.empty_cache()
+
+    return hub_model_id
+
+
+@asset(group_name="model", compute_kind="modal")
+def trained_model(
+    context: AssetExecutionContext, config: ModelTrainingConfig, train_data
+):
+
+    hub_model_id = run_training(pretrained_model_id=config.pretrained_model_id, peft_model_id=config.peft_model_id, train_data=train_data)
+
     context.add_output_metadata(
         {
-            "model_card": MetadataValue.md(
-                open(Path(config.peft_model_id) / "README.md", "r").read()
-            ),
-            "model_url": MetadataValue.url(
-                f"https://huggingface.co/{trainer.hub_model_id}"
-            ),
-            "train_metrics": MetadataValue.json(train_metrics),
-            "trainable_params": MetadataValue.int(trainable_params),
-            "all_params": MetadataValue.int(all_params),
+            "model_url": MetadataValue.url(f"https://huggingface.co/{hub_model_id}"),
         }
     )
 
-    del trainer
-    del model
-    torch.cuda.empty_cache()
-
-
 
 
 @asset(group_name="model")
-def model_card(context: AssetExecutionContext, config: ModelTrainingConfig, trained_model):
-    content = open(Path(config.peft_model_id) / "README.md", "r").read()
+def model_card(context: AssetExecutionContext, trained_model):
+    model_card_path = hf_hub_download(repo_id=trained_model, filename="README.md")
+    with open(model_card_path, "r") as f:
+        content = f.read()
     context.add_output_metadata(
         {
             "content": MetadataValue.md(content),
         })
     return content
 
 
-@asset(group_name="model")
+@asset(group_name="model", compute_kind="modal")
 def test_results(context: AssetExecutionContext, test_data, trained_model, config: ModelTrainingConfig):
     tokenizer = AutoTokenizer.from_pretrained(config.peft_model_id)
     model = AutoPeftModelForCausalLM.from_pretrained(
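The hunks above show only the edges of the quantization and LoRA configs; the unchanged middle lines are collapsed in this view. A sketch of plausible complete versions, consistent with the visible lines and the "train only q and v layer" comment — `r`, `bnb_4bit_quant_type`, and `target_modules` are assumptions, not visible in this commit:

```python
import torch
from transformers import BitsAndBytesConfig
from peft import LoraConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",               # assumed: the usual QLoRA quant type
    bnb_4bit_compute_dtype=torch.bfloat16,   # assumed: matches torch_dtype above
)

# Adapters only on the attention query/value projections, per the comment
# in the diff; module names assume a Llama-style architecture.
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.05,
    r=16,                                    # assumed rank, not shown in the hunk
    bias="none",
    target_modules=["q_proj", "v_proj"],
    task_type="CAUSAL_LM",
)
```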
40 changes: 13 additions & 27 deletions text2sql_training/modal_training.py
@@ -1,36 +1,22 @@
-import json
-import sys
-
 import modal
-import os
 
-import os
-import zipfile
-import io
 from modal import Image
 
 app = modal.App("example-hello-world")
+custom_image = Image.from_registry("ghcr.io/kyryl-opens-ml/fine-tune-llm-in-2024:main")
 
 
-@app.function(mounts=[modal.Mount.from_local_dir(".", remote_path="/app")])
-def foo(a):
-    print(a)
-    with open('/app/test.txt', 'w') as f:
-        f.write('TEST')
-    print("DONE")
-    return '/app/test.txt'
+@app.function(image=custom_image, gpu="A100")
+def foo():
+    from datasets import load_dataset
+    import text2sql_training
+    from text2sql_training.llm_stf import run_training
+
+    train_data = load_dataset("json", data_files='train_dataset-sql.json', split="train")
+    model_url = run_training(pretrained_model_id='meta-llama/Meta-Llama-3-8B-Instruct', peft_model_id='modal-test', train_data=train_data)
+    return model_url
 
 
 @app.local_entrypoint()
 def main():
-    data_structure = open("requirements.txt", "r").read()
-    result = foo.remote(data_structure)
-    with open('test.txt', 'w') as f:
-        f.write(result)
-
-    # Define the directory where to unzip
-    output_dir = 'unzipped_content'
-    os.makedirs(output_dir, exist_ok=True)
-
-    # Unzip the content
-    with zipfile.ZipFile(zip_buffer, 'r') as zip_file:
-        zip_file.extractall(path=output_dir)
+    result = foo.remote()
+    print(result)
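Once `foo` returns, the adapter lives on the Hub under the id `run_training` produced. A minimal sketch of pulling it back down, mirroring what `test_results` does with `AutoPeftModelForCausalLM`; the repo id is a placeholder for the value `foo.remote()` returns:

```python
import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer

hub_model_id = "your-username/modal-test"  # placeholder: use the id foo.remote() returns

model = AutoPeftModelForCausalLM.from_pretrained(
    hub_model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(hub_model_id)
```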
