diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index f65e55f..864ba48 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -7,17 +7,16 @@ on:
       - migrate-to-github-registry-for-docker-images
 
 jobs:
-
   container:
     runs-on: ubuntu-latest
-
     permissions:
       contents: read
       packages: write
-
     steps:
+
       - name: Checkout repository
         uses: actions/checkout@v4
+
       - name: Log in to the Container registry
         uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1
         with:
diff --git a/Dockerfile b/Dockerfile
index 5f57cde..c48d982 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -2,27 +2,15 @@ FROM huggingface/transformers-pytorch-gpu:4.35.2
 
 WORKDIR /app
 
-RUN apt-get update && apt-get install -y curl vim
-
-ENV LC_ALL=C.UTF-8
-ENV LANG=C.UTF-8
-
 COPY requirements.txt requirements.txt
 RUN pip3 install --no-cache-dir -r requirements.txt
+RUN MAX_JOBS=4 pip3 install flash-attn==2.5.7 --no-build-isolation
 
-RUN pip install ninja packaging
-RUN MAX_JOBS=4 pip install flash-attn --no-build-isolation
+ENV DAGSTER_HOME /app/dagster_data
+RUN mkdir -p $DAGSTER_HOME
 
 ENV PYTHONPATH /app
-ENV HF_TOKEN_READ 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
-ENV HF_TOKEN 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
-ENV HF_TOKEN_WRITE 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
-
 RUN ln -s /usr/bin/python3 /usr/bin/python
 
 COPY text2sql_training text2sql_training
-
-ENV DAGSTER_HOME /app/dagster_data
-RUN mkdir -p $DAGSTER_HOME
-
 CMD dagster dev -f text2sql_training/llm_stf.py -p 3000 -h 0.0.0.0
diff --git a/README.md b/README.md
index 599bb1d..32840ec 100644
--- a/README.md
+++ b/README.md
@@ -10,11 +10,8 @@ docker run -it --gpus '"device=1"' --ipc=host --net=host -v $PWD:/app fine-tune-
 ```
 
-modal token set --token-id ak-r6mjZ61XGQtNoCDZGHrFLP --token-secret as-2m1UyDMKKwTo2uApJVGovn
-
 https://modal.com/docs/reference/modal.config
 
-export MODAL_TOKEN_ID=ak-r6mjZ61XGQtNoCDZGHrFLP
-export MODAL_TOKEN_SECRET=as-2m1UyDMKKwTo2uApJVGovn
+modal run text2sql_training/modal_training.py
 
 
-modal run text2sql_training/modal_training.py
\ No newline at end of file
+export $(cat .env | xargs)
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 49b47ed..3b94a6b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-torch
+torch==2.1.0
 transformers==4.38.2
 datasets==2.16.1
 accelerate==0.26.1
@@ -8,5 +8,7 @@ trl==0.7.11
 peft==0.8.2
 dagster==1.7.1
 dagster-webserver==1.7.1
-ipython
-modal==0.62.97
\ No newline at end of file
+ipython==8.12.3
+modal==0.62.97
+packaging==23.2
+ninja==1.11.1.1
\ No newline at end of file
diff --git a/text2sql_training/code.py b/text2sql_training/code.py
deleted file mode 100644
index 447bb33..0000000
--- a/text2sql_training/code.py
+++ /dev/null
@@ -1,105 +0,0 @@
-# import transformers
-# import torch
-
-# model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
-
-# pipeline = transformers.pipeline(
-#     "text-generation",
-#     model=model_id,
-#     model_kwargs={"torch_dtype": torch.bfloat16},
-#     device="cuda",
-# )
-
-# messages = [
-#     {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
-#     {"role": "user", "content": "Who are you?"},
-# ]
-
-# prompt = pipeline.tokenizer.apply_chat_template(
-#     messages,
-#     tokenize=False,
-#     add_generation_prompt=True
-# )
-
-# terminators = [
-#     pipeline.tokenizer.eos_token_id,
-#     pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
-# ]
-
-# outputs = pipeline(
-#     prompt,
-#     max_new_tokens=256,
-#     eos_token_id=terminators,
-#     do_sample=True,
-#     temperature=0.6,
-#     top_p=0.9,
-# )
-# print(outputs[0]["generated_text"][len(prompt):])
-
-def end2end_test():
-    # create_text_to_sql_dataset = create_text_to_sql_dataset(config=DataConfig())
-    create_text_to_sql_dataset = {'train_path': 'train_dataset-sql.json', 'test_path': 'test_dataset-sql.json'}
-    train_data = load_dataset(
-        "json", data_files=create_text_to_sql_dataset["train_path"], split="train"
-    )
-    test_data = load_dataset(
-        "json", data_files=create_text_to_sql_dataset["test_path"], split="train"
-    )
-
-    # train_data = train_data.select(range(100))
-    # test_data = train_data
-    config = ModelTrainingConfig(peft_model_id='duckdb-text2sql-llama-3-8B-sql-full-lora')
-
-    # trained_model = trained_model()
-
-
-import os
-import zipfile
-import io
-
-def create_and_zip_folder():
-    # Define the folder and files to create
-    folder_path = 'example_folder'
-    file_names = ['file1.txt', 'file2.txt', 'file3.txt']
-
-    # Create the folder
-    os.makedirs(folder_path, exist_ok=True)
-
-    # Create some example files in the folder
-    for file_name in file_names:
-        with open(os.path.join(folder_path, file_name), 'w') as f:
-            f.write(f"Contents of {file_name}")
-
-    # Create a zip file in memory
-    zip_buffer = io.BytesIO()
-    with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
-        for file_name in file_names:
-            zip_file.write(os.path.join(folder_path, file_name), arcname=file_name)
-
-    # Clean up the folder after zipping (optional)
-    for file_name in file_names:
-        os.remove(os.path.join(folder_path, file_name))
-    os.rmdir(folder_path)
-
-    # Return the bytes of the zip file
-    zip_buffer.seek(0)
-    return zip_buffer.getvalue()
-
-def main_function():
-    # Get the zip bytes
-    zip_bytes = create_and_zip_folder()
-
-    # Read the zip from bytes
-    zip_buffer = io.BytesIO(zip_bytes)
-
-    # Define the directory where to unzip
-    output_dir = 'unzipped_content'
-    os.makedirs(output_dir, exist_ok=True)
-
-    # Unzip the content
-    with zipfile.ZipFile(zip_buffer, 'r') as zip_file:
-        zip_file.extractall(path=output_dir)
-
-# Calling the main function to execute
-if __name__ == '__main__':
-    main_function()
diff --git a/text2sql_training/llm_stf.py b/text2sql_training/llm_stf.py
index cd3bc21..a90fbaf 100644
--- a/text2sql_training/llm_stf.py
+++ b/text2sql_training/llm_stf.py
@@ -24,8 +24,9 @@ from random import randint
 
 from huggingface_hub import hf_hub_download
 from collections import defaultdict
+from tqdm import tqdm
+import modal
 
-HF_TOKEN_READ = os.getenv("HF_TOKEN_READ")
 HF_TOKEN_WRITE = os.getenv("HF_TOKEN_WRITE")
 
@@ -40,7 +41,7 @@ class DataConfig(Config):
 class ModelTrainingConfig(Config):
     pretrained_model_id: str = "meta-llama/Meta-Llama-3-8B-Instruct"
     peft_model_id: str = "text2sql-llama-3-8B"
-
+    mode: str = 'modal' # or local
 
 def create_conversation(sample):
     # Convert dataset to OAI messages
@@ -124,10 +125,9 @@ def run_training(pretrained_model_id: str, peft_model_id: str, train_data) -> st
         attn_implementation="flash_attention_2",
         torch_dtype=torch.bfloat16,
         quantization_config=bnb_config,
-        token=HF_TOKEN_READ,
     )
 
-    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_id, token=HF_TOKEN_READ)
+    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_id)
     tokenizer.padding_side = "right"
 
     # # set chat template to OAI chatML, remove if you start from a fine-tuned model
@@ -189,7 +189,6 @@ def run_training(pretrained_model_id: str, peft_model_id: str, train_data) -> st
         },
     )
     trainer.model.print_trainable_parameters()
-    trainable_params, all_params = trainer.model.get_nb_trainable_parameters()
 
     train_result = trainer.train()
     train_metrics = train_result.metrics
@@ -214,7 +213,11 @@
 
 @asset(group_name="model", compute_kind="modal-labs")
 def trained_model(context: AssetExecutionContext, config: ModelTrainingConfig, train_data):
-    hub_model_id = run_training(pretrained_model_id=config.pretrained_model_id, peft_model_id=config.peft_model_id, train_data=train_data)
+    if config.mode == 'modal':
+        run_training_modal_function = modal.Function.lookup("fine-tune-llms-in-2024-with-trl", "run_training_modal")
+        hub_model_id = run_training_modal_function.remote(train_data_pandas=train_data.to_pandas(), pretrained_model_id=config.pretrained_model_id, peft_model_id=config.peft_model_id)
+    elif config.mode == 'local':
+        hub_model_id = run_training(pretrained_model_id=config.pretrained_model_id, peft_model_id=config.peft_model_id, train_data=train_data)
 
     context.add_output_metadata({"model_url": MetadataValue.url(f"https://huggingface.co/{hub_model_id}")})
     return hub_model_id
@@ -224,9 +227,7 @@ def trained_model(context: AssetExecutionContext, config: ModelTrainingConfig, t
 
 
 @asset(group_name="model", compute_kind="python")
 def model_card(context: AssetExecutionContext, trained_model):
-
     model_card_path = hf_hub_download(repo_id=trained_model, filename="README.md")
-
     with open(model_card_path, "r") as f:
         content = f.read()
@@ -236,9 +237,9 @@ def model_card(context: AssetExecutionContext, trained_model):
 
 @asset(group_name="model", compute_kind="python")
 def test_results(context: AssetExecutionContext, test_data, trained_model, config: ModelTrainingConfig):
-    tokenizer = AutoTokenizer.from_pretrained(config.peft_model_id)
+    tokenizer = AutoTokenizer.from_pretrained(trained_model)
     model = AutoPeftModelForCausalLM.from_pretrained(
-        config.peft_model_id,
+        trained_model,
         device_map="auto",
         torch_dtype=torch.float16
     )
@@ -246,23 +247,16 @@ def test_results(context: AssetExecutionContext, test_data, trained_model, confi
     merged_model = model.merge_and_unload()
 
     pipe = pipeline("text-generation", model=merged_model, tokenizer=tokenizer, torch_dtype=torch.float16)
-
     terminators = [
         pipe.tokenizer.eos_token_id,
         pipe.tokenizer.convert_tokens_to_ids("<|eot_id|>")
     ]
 
     results = defaultdict(list)
-    number_of_eval_samples = 100
-    for s in test_data.select(range(number_of_eval_samples)):
+    number_of_eval_samples = 10
+    for s in tqdm(test_data.select(range(number_of_eval_samples))):
         query = s['messages'][1]['content']
-
-
-        prompt = pipe.tokenizer.apply_chat_template(
-            s['messages'][:2],
-            tokenize=False,
-            add_generation_prompt=True
-        )
+        prompt = pipe.tokenizer.apply_chat_template(s['messages'][:2], tokenize=False, add_generation_prompt=True)
         outputs = pipe(prompt, max_new_tokens=256, do_sample=False, temperature=0.1, top_k=50, top_p=0.1, eos_token_id=terminators, pad_token_id=pipe.tokenizer.pad_token_id)
         original_sql = s['messages'][2]['content'].lower()
         generated_sql = outputs[0]['generated_text'][len(prompt):].strip().lower()
@@ -274,13 +268,13 @@ def test_results(context: AssetExecutionContext, test_data, trained_model, confi
 
     rouge = evaluate.load('rouge')
 
-    rouge.compute(predictions=results['generated_sql'], references=results['original_sql'])
-
-
+    rouge_metrics = rouge.compute(predictions=results['generated_sql'], references=results['original_sql'])
+    inference_samples = [{'original_sql': original_sql, 'generated_sql': generated_sql, 'hard_match': hard_match} for (original_sql, generated_sql, hard_match) in zip(results['original_sql'], results['generated_sql'], results['hard_match'])]
+
     context.add_output_metadata(
         {
             "inference_samples": MetadataValue.json(inference_samples),
-            "accuracy": MetadataValue.float(accuracy),
+            "rouge_metrics": MetadataValue.json({x:float(rouge_metrics[x]) for x in rouge_metrics}),
         }
     )
 
diff --git a/text2sql_training/modal_training.py b/text2sql_training/modal_training.py
index 7972aae..3de22d7 100644
--- a/text2sql_training/modal_training.py
+++ b/text2sql_training/modal_training.py
@@ -1,27 +1,27 @@
 import modal
 from modal import Image
 from datasets import load_dataset
-from datasets import disable_caching
 import pandas as pd
-import os
+import os
 
-app = modal.App("example-hello-world")
-custom_image = Image.from_registry("ghcr.io/kyryl-opens-ml/fine-tune-llm-in-2024:main").env({"HF_TOKEN": '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', "HF_TOKEN_WRITE": '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'})
+app = modal.App("fine-tune-llms-in-2024-with-trl")
+env = {"HF_TOKEN": os.getenv('HF_TOKEN'), "HF_TOKEN_WRITE": os.getenv('HF_TOKEN_WRITE')}
+custom_image = Image.from_registry("ghcr.io/kyryl-opens-ml/fine-tune-llm-in-2024:main").env(env)
 
-# ENV HF_TOKEN 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
-# ENV HF_TOKEN_WRITE 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
 
 @app.function(image=custom_image, gpu="A100", mounts=[modal.Mount.from_local_python_packages("text2sql_training", "text2sql_training")], timeout=15 * 60)
-def run_training_modal(train_data_pandas: pd.DataFrame):
+def run_training_modal(train_data_pandas: pd.DataFrame, pretrained_model_id: str, peft_model_id: str):
     from datasets import Dataset
     from text2sql_training.llm_stf import run_training
-
-    model_url = run_training(pretrained_model_id='meta-llama/Meta-Llama-3-8B-Instruct', peft_model_id='modal-test', train_data=Dataset.from_pandas(train_data_pandas))
+    model_url = run_training(pretrained_model_id=pretrained_model_id, peft_model_id=peft_model_id, train_data=Dataset.from_pandas(train_data_pandas))
     return model_url
 
 
 @app.local_entrypoint()
 def main():
     train_data = load_dataset("json", data_files='train_dataset-sql.json', split="train")
-    result = run_training_modal.remote(train_data_pandas=train_data.to_pandas())
+    pretrained_model_id = 'meta-llama/Meta-Llama-3-8B-Instruct'
+    peft_model_id = 'modal-test'
+
+    result = run_training_modal.remote(train_data_pandas=train_data.to_pandas(), pretrained_model_id=pretrained_model_id, peft_model_id=peft_model_id)
     print(result)
\ No newline at end of file
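
With the hardcoded Hugging Face and Modal credentials removed above, running the pipeline assumes the tokens are supplied through the environment, e.g. via the `.env` file the README now loads. A minimal sketch of that local setup, assuming a hypothetical `.env` whose variable names come from this diff plus Modal's standard token variables (all values are placeholders):

    # .env -- placeholder values, never commit real tokens
    HF_TOKEN=hf_xxx
    HF_TOKEN_WRITE=hf_yyy
    MODAL_TOKEN_ID=ak-xxx
    MODAL_TOKEN_SECRET=as-yyy

    # load the variables, then launch remote fine-tuning on Modal (see README.md above)
    export $(cat .env | xargs)
    modal run text2sql_training/modal_training.py

    # or start the Dagster UI locally (the Dockerfile's CMD)
    dagster dev -f text2sql_training/llm_stf.py -p 3000 -h 0.0.0.0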