
Commit

update
truskovskiyk committed Apr 22, 2024
1 parent 118e3ec commit 1e74686
Showing 7 changed files with 40 additions and 165 deletions.
5 changes: 2 additions & 3 deletions .github/workflows/ci.yaml
@@ -7,17 +7,16 @@ on:
      - migrate-to-github-registry-for-docker-images

jobs:
  container:
    runs-on: ubuntu-latest
    permissions:
      contents: read
      packages: write
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Log in to the Container registry
        uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1
        with:
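For context: the login step above presumably authenticates against GitHub's container registry (the branch name suggests ghcr.io). A rough shell equivalent of what docker/login-action performs, using the standard Actions variables, is:

```
# Illustrative only: roughly what docker/login-action does in CI
echo "$GITHUB_TOKEN" | docker login ghcr.io -u "$GITHUB_ACTOR" --password-stdin
```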
18 changes: 3 additions & 15 deletions Dockerfile
@@ -2,27 +2,15 @@ FROM huggingface/transformers-pytorch-gpu:4.35.2

 WORKDIR /app

 RUN apt-get update && apt-get install -y curl vim

 ENV LC_ALL=C.UTF-8
 ENV LANG=C.UTF-8

 COPY requirements.txt requirements.txt
 RUN pip3 install --no-cache-dir -r requirements.txt
+RUN MAX_JOBS=4 pip3 install flash-attn==2.5.7 --no-build-isolation
-RUN pip install ninja packaging
-RUN MAX_JOBS=4 pip install flash-attn --no-build-isolation
+ENV DAGSTER_HOME /app/dagster_data
+RUN mkdir -p $DAGSTER_HOME

 ENV PYTHONPATH /app
-ENV HF_TOKEN_READ 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
-ENV HF_TOKEN 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
-ENV HF_TOKEN_WRITE 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

 RUN ln -s /usr/bin/python3 /usr/bin/python

 COPY text2sql_training text2sql_training

-ENV DAGSTER_HOME /app/dagster_data
-RUN mkdir -p $DAGSTER_HOME

 CMD dagster dev -f text2sql_training/llm_stf.py -p 3000 -h 0.0.0.0
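Since the Hugging Face tokens are no longer baked into the image, they have to be supplied when the container starts. A minimal sketch, assuming a local `.env` file holds the tokens; the image tag is illustrative:

```
# Build the image (tag is illustrative)
docker build -t fine-tune-llm-in-2024 .

# Supply secrets at runtime instead of baking them into a layer
docker run -it --gpus all --env-file .env -p 3000:3000 fine-tune-llm-in-2024
```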
7 changes: 2 additions & 5 deletions README.md
@@ -10,11 +10,8 @@ docker run -it --gpus '"device=1"' --ipc=host --net=host -v $PWD:/app fine-tune-
```


-modal token set --token-id ak-r6mjZ61XGQtNoCDZGHrFLP --token-secret as-2m1UyDMKKwTo2uApJVGovn

 https://modal.com/docs/reference/modal.config

-export MODAL_TOKEN_ID=ak-r6mjZ61XGQtNoCDZGHrFLP
-export MODAL_TOKEN_SECRET=as-2m1UyDMKKwTo2uApJVGovn
-modal run text2sql_training/modal_training.py

+modal run text2sql_training/modal_training.py
+export $(cat .env | xargs)
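For reference, a hypothetical `.env` that `export $(cat .env | xargs)` would consume; the variable names follow the code in this commit, the values are placeholders:

```
# Placeholders only; never commit real tokens
HF_TOKEN=hf_xxxxxxxx
HF_TOKEN_WRITE=hf_xxxxxxxx
MODAL_TOKEN_ID=ak-xxxxxxxx
MODAL_TOKEN_SECRET=as-xxxxxxxx
```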
8 changes: 5 additions & 3 deletions requirements.txt
@@ -1,4 +1,4 @@
-torch
+torch==2.1.0
 transformers==4.38.2
 datasets==2.16.1
 accelerate==0.26.1
@@ -8,5 +8,7 @@ trl==0.7.11
 peft==0.8.2
 dagster==1.7.1
 dagster-webserver==1.7.1
-ipython
-modal==0.62.97
+ipython==8.12.3
+modal==0.62.97
+packaging==23.2
+ninja==1.11.1.1
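A note on the new pins: flash-attn is installed with --no-build-isolation in the Dockerfile, so its build dependencies (packaging, ninja) must already be present in the environment, which is presumably why they are pinned here. The install order from the Dockerfile, reproduced as a sketch:

```
# flash-attn builds against the current environment, so packaging/ninja
# (and torch) must be installed first
pip3 install --no-cache-dir -r requirements.txt
MAX_JOBS=4 pip3 install flash-attn==2.5.7 --no-build-isolation
```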
105 changes: 0 additions & 105 deletions text2sql_training/code.py

This file was deleted.

42 changes: 18 additions & 24 deletions text2sql_training/llm_stf.py
@@ -24,8 +24,9 @@
 from random import randint
 from huggingface_hub import hf_hub_download
 from collections import defaultdict
+from tqdm import tqdm
+import modal

-HF_TOKEN_READ = os.getenv("HF_TOKEN_READ")
 HF_TOKEN_WRITE = os.getenv("HF_TOKEN_WRITE")


@@ -40,7 +41,7 @@ class DataConfig(Config):
 class ModelTrainingConfig(Config):
     pretrained_model_id: str = "meta-llama/Meta-Llama-3-8B-Instruct"
     peft_model_id: str = "text2sql-llama-3-8B"
-
+    mode: str = 'modal' # or local

 def create_conversation(sample):
     # Convert dataset to OAI messages
@@ -124,10 +125,9 @@ def run_training(pretrained_model_id: str, peft_model_id: str, train_data) -> st
         attn_implementation="flash_attention_2",
         torch_dtype=torch.bfloat16,
         quantization_config=bnb_config,
-        token=HF_TOKEN_READ,
     )

-    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_id, token=HF_TOKEN_READ)
+    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_id)
     tokenizer.padding_side = "right"

     # # set chat template to OAI chatML, remove if you start from a fine-tuned model
@@ -189,7 +189,6 @@ def run_training(pretrained_model_id: str, peft_model_id: str, train_data) -> st
         },
     )
     trainer.model.print_trainable_parameters()
-    trainable_params, all_params = trainer.model.get_nb_trainable_parameters()

     train_result = trainer.train()
     train_metrics = train_result.metrics
@@ -214,7 +213,11 @@ def run_training(pretrained_model_id: str, peft_model_id: str, train_data) -> st

 @asset(group_name="model", compute_kind="modal-labs")
 def trained_model(context: AssetExecutionContext, config: ModelTrainingConfig, train_data):
-    hub_model_id = run_training(pretrained_model_id=config.pretrained_model_id, peft_model_id=config.peft_model_id, train_data=train_data)
+    if config.mode == 'modal':
+        run_training_modal_function = modal.Function.lookup("fine-tune-llms-in-2024-with-trl", "run_training_modal")
+        hub_model_id = run_training_modal_function.remote(train_data_pandas=train_data.to_pandas(), pretrained_model_id=config.pretrained_model_id, peft_model_id=config.peft_model_id)
+    elif config.mode == 'local':
+        hub_model_id = run_training(pretrained_model_id=config.pretrained_model_id, peft_model_id=config.peft_model_id, train_data=train_data)
     context.add_output_metadata({"model_url": MetadataValue.url(f"https://huggingface.co/{hub_model_id}")})
     return hub_model_id
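One operational detail: modal.Function.lookup resolves a function from an app that has already been deployed, so the 'modal' mode requires deploying the app once before the Dagster asset can call it:

```
# Deploy the Modal app so Function.lookup("fine-tune-llms-in-2024-with-trl", ...) resolves
modal deploy text2sql_training/modal_training.py
```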

@@ -224,9 +227,7 @@ def trained_model(context: AssetExecutionContext, config: ModelTrainingConfig, t

 @asset(group_name="model", compute_kind="python")
 def model_card(context: AssetExecutionContext, trained_model):
-
     model_card_path = hf_hub_download(repo_id=trained_model, filename="README.md")
-
     with open(model_card_path, "r") as f:
         content = f.read()

@@ -236,33 +237,26 @@ def model_card(context: AssetExecutionContext, trained_model):

 @asset(group_name="model", compute_kind="python")
 def test_results(context: AssetExecutionContext, test_data, trained_model, config: ModelTrainingConfig):
-    tokenizer = AutoTokenizer.from_pretrained(config.peft_model_id)
+    tokenizer = AutoTokenizer.from_pretrained(trained_model)
     model = AutoPeftModelForCausalLM.from_pretrained(
-        config.peft_model_id,
+        trained_model,
         device_map="auto",
         torch_dtype=torch.float16
     )

     merged_model = model.merge_and_unload()
     pipe = pipeline("text-generation", model=merged_model, tokenizer=tokenizer, torch_dtype=torch.float16)

     terminators = [
         pipe.tokenizer.eos_token_id,
         pipe.tokenizer.convert_tokens_to_ids("<|eot_id|>")
     ]

     results = defaultdict(list)
-    number_of_eval_samples = 100
-    for s in test_data.select(range(number_of_eval_samples)):
+    number_of_eval_samples = 10
+    for s in tqdm(test_data.select(range(number_of_eval_samples))):
         query = s['messages'][1]['content']
-        prompt = pipe.tokenizer.apply_chat_template(
-            s['messages'][:2],
-            tokenize=False,
-            add_generation_prompt=True
-        )
+        prompt = pipe.tokenizer.apply_chat_template(s['messages'][:2], tokenize=False, add_generation_prompt=True)
         outputs = pipe(prompt, max_new_tokens=256, do_sample=False, temperature=0.1, top_k=50, top_p=0.1, eos_token_id=terminators, pad_token_id=pipe.tokenizer.pad_token_id)
         original_sql = s['messages'][2]['content'].lower()
         generated_sql = outputs[0]['generated_text'][len(prompt):].strip().lower()
@@ -274,13 +268,13 @@ def test_results(context: AssetExecutionContext, test_data, trained_model, confi

     rouge = evaluate.load('rouge')
-    rouge.compute(predictions=results['generated_sql'], references=results['original_sql'])
+    rouge_metrics = rouge.compute(predictions=results['generated_sql'], references=results['original_sql'])
     inference_samples = [{'original_sql': original_sql, 'generated_sql': generated_sql, 'hard_match': hard_match} for (original_sql, generated_sql, hard_match) in zip(results['original_sql'], results['generated_sql'], results['hard_match'])]
     context.add_output_metadata(
         {
             "inference_samples": MetadataValue.json(inference_samples),
             "accuracy": MetadataValue.float(accuracy),
+            "rouge_metrics": MetadataValue.json({x:float(rouge_metrics[x]) for x in rouge_metrics}),
         }
     )

20 changes: 10 additions & 10 deletions text2sql_training/modal_training.py
@@ -1,27 +1,27 @@
 import modal
 from modal import Image
 from datasets import load_dataset
 from datasets import disable_caching
 import pandas as pd
 import os

-app = modal.App("example-hello-world")
-custom_image = Image.from_registry("ghcr.io/kyryl-opens-ml/fine-tune-llm-in-2024:main").env({"HF_TOKEN": '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', "HF_TOKEN_WRITE": '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'})
+app = modal.App("fine-tune-llms-in-2024-with-trl")
+env = {"HF_TOKEN": os.getenv('HF_TOKEN'), "HF_TOKEN_WRITE": os.getenv('HF_TOKEN_WRITE')}
+custom_image = Image.from_registry("ghcr.io/kyryl-opens-ml/fine-tune-llm-in-2024:main").env(env)

-# ENV HF_TOKEN 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
-# ENV HF_TOKEN_WRITE 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

 @app.function(image=custom_image, gpu="A100", mounts=[modal.Mount.from_local_python_packages("text2sql_training", "text2sql_training")], timeout=15 * 60)
-def run_training_modal(train_data_pandas: pd.DataFrame):
+def run_training_modal(train_data_pandas: pd.DataFrame, pretrained_model_id: str, peft_model_id: str):
     from datasets import Dataset
     from text2sql_training.llm_stf import run_training

-    model_url = run_training(pretrained_model_id='meta-llama/Meta-Llama-3-8B-Instruct', peft_model_id='modal-test', train_data=Dataset.from_pandas(train_data_pandas))
+    model_url = run_training(pretrained_model_id=pretrained_model_id, peft_model_id=peft_model_id, train_data=Dataset.from_pandas(train_data_pandas))
     return model_url


 @app.local_entrypoint()
 def main():
     train_data = load_dataset("json", data_files='train_dataset-sql.json', split="train")
-    result = run_training_modal.remote(train_data_pandas=train_data.to_pandas())
+    pretrained_model_id = 'meta-llama/Meta-Llama-3-8B-Instruct'
+    peft_model_id = 'modal-test'
+
+    result = run_training_modal.remote(train_data_pandas=train_data.to_pandas(), pretrained_model_id=pretrained_model_id, peft_model_id=peft_model_id)
     print(result)
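For an ad-hoc run outside Dagster, the local_entrypoint above can be driven directly; a sketch assuming the tokens live in `.env` and `train_dataset-sql.json` exists locally:

```
export $(cat .env | xargs)
modal run text2sql_training/modal_training.py
```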
