diff --git a/.github/workflows/module-1-advanced.yaml b/.github/workflows/module-1-advanced.yaml
index 6c5e324..8dcb859 100644
--- a/.github/workflows/module-1-advanced.yaml
+++ b/.github/workflows/module-1-advanced.yaml
@@ -9,6 +9,8 @@ on:
   pull_request:
     branches:
       - main
+    paths:
+      - 'module-1/**'
 
 jobs:
diff --git a/.github/workflows/module-1-basic.yaml b/.github/workflows/module-1-basic.yaml
index 13137f0..5d190c9 100644
--- a/.github/workflows/module-1-basic.yaml
+++ b/.github/workflows/module-1-basic.yaml
@@ -9,6 +9,9 @@ on:
   pull_request:
     branches:
       - main
+    paths:
+      - 'module-1/**'
+
 env:
   IMAGE_ML_APP: app-ml
   IMAGE_ML_WEB: app-web
diff --git a/.github/workflows/module-2.yaml b/.github/workflows/module-2.yaml
index 1c4be7a..98cb660 100644
--- a/.github/workflows/module-2.yaml
+++ b/.github/workflows/module-2.yaml
@@ -3,12 +3,15 @@ name: Module 2
 on:
-  pull_request:
+  push:
     branches:
       - main
 
-  push:
+
+  pull_request:
     branches:
       - main
+    paths:
+      - 'module-2/**'
 
 jobs:
diff --git a/.github/workflows/module-3.yaml b/.github/workflows/module-3.yaml
index 76b15d0..75dfcb4 100644
--- a/.github/workflows/module-3.yaml
+++ b/.github/workflows/module-3.yaml
@@ -1,7 +1,14 @@
 name: Module 3
 
 on:
-  workflow_dispatch:
+  push:
+    branches:
+      - main
+
+  pull_request:
+    branches:
+      - main
+
 env:
   IMAGE_MAIN_NAME: nlp-sample
@@ -15,21 +22,21 @@ jobs:
   test:
     runs-on: ubuntu-latest
     steps:
-      - name: Checkout
-        uses: actions/checkout@v2
-      - name: Login to Docker Hub
-        uses: docker/login-action@v1
-        with:
-          username: ${{ secrets.DOCKER_HUB_USERNAME }}
-          password: ${{ secrets.DOCKER_HUB_ACCESS_TOKEN }}
+      - name: Checkout
+        uses: actions/checkout@v4
 
       - name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v1
 
+      # See explanation: https://github.com/orgs/community/discussions/25678
+      - name: Clean disk
+        run: |
+          rm -rf /opt/hostedtoolcache
+
       - name: Build new
         run: |
-          docker build -f week-3/nlp-sample/Dockerfile -t nlp-sample:latest week-3/nlp-sample
+          docker build -f module-3/nlp-sample/Dockerfile -t nlp-sample:latest module-3/nlp-sample
 
       - name: Test style
         run: |
@@ -60,49 +67,3 @@ jobs:
 #          tags: ${{ secrets.DOCKER_HUB_USERNAME }}/${{ env.IMAGE_MAIN_NAME }}:${{ env.IMAGE_MAIN_TAG }}
 #          cache-from: type=registry,ref=${{ secrets.DOCKER_HUB_USERNAME }}/${{ env.IMAGE_MAIN_NAME }}:buildcache
 #          cache-to: type=registry,ref=${{ secrets.DOCKER_HUB_USERNAME }}/${{ env.IMAGE_MAIN_NAME }}:buildcache,mode=max
-
-  cml-test:
-
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v2
-
-      - uses: iterative/setup-cml@v1
-
-      - name: Train model
-        run: |
-          docker build -f week-3/nlp-sample/Dockerfile -t nlp-sample:latest week-3/nlp-sample
-          docker run -v $PWD:/tmp/results -e WANDB_PROJECT=${{ secrets.WANDB_PROJECT }} -e WANDB_API_KEY=${{ secrets.WANDB_API_KEY }} nlp-sample:latest make train_fast_ci
-
-      - name: Write CML report
-        env:
-          REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: |
-          # Post reports as comments in GitHub PRs
-          # cat results.txt >> report.md
-          cml send-comment README.md
-
-  build-push-aim:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v2
-
-      - name: Login to Docker Hub
-        uses: docker/login-action@v1
-        with:
-          username: ${{ secrets.DOCKER_HUB_USERNAME }}
-          password: ${{ secrets.DOCKER_HUB_ACCESS_TOKEN }}
-
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v1
-
-      - name: Build
-        uses: docker/build-push-action@v2
-        with:
-          context: week-3/aim/
-          file: week-3/aim//Dockerfile
-          push: true
-          tags: ${{ secrets.DOCKER_HUB_USERNAME }}/${{ env.IMAGE_AIM_NAME }}:${{ env.IMAGE_AIM_TAG }}
-          cache-from: type=registry,ref=${{ secrets.DOCKER_HUB_USERNAME }}/${{ env.IMAGE_AIM_NAME }}:buildcache
-          cache-to: type=registry,ref=${{ secrets.DOCKER_HUB_USERNAME }}/${{ env.IMAGE_AIM_NAME }}:buildcache,mode=max
\ No newline at end of file
diff --git a/module-3/PRACTICE.md b/module-3/PRACTICE.md
index fc25e7b..e782bd3 100644
--- a/module-3/PRACTICE.md
+++ b/module-3/PRACTICE.md
@@ -1,10 +1,10 @@
-# Practice 
+# Practice
 
-*** 
+***
 
 # H5: Training & Experiments
 
-## Reading list: 
+## Reading list:
 
 - [The Data Science Lifecycle Process](https://github.com/dslp/dslp#the-data-science-lifecycle-process)
 - [Structuring Your Project](https://docs.python-guide.org/writing/structure/)
@@ -39,15 +39,12 @@ You need to have a training pipeline for your model for this homework. You can t
 - PR6: Write code for distributed training with PyTorch, Accelerate, and Ray.
 - Public link to your W&B project with experiments.
 
+## Criteria:
 
-## Criteria: 
-
-- 6 PRs are merged. 
+- 6 PRs are merged.
 - W&B project created.
 - Description of experiment section in the google doc.
-
-
 
 # H6: Testing & CI
 
 ## Reading list:
@@ -67,7 +64,6 @@ You need to have a training pipeline for your model for this homework. You can t
 - [Privacy Testing for Deep Learning](https://github.com/trailofbits/PrivacyRaven)
 - [Learning Interpretability Tool (LIT)](https://github.com/PAIR-code/lit)
 
-
 ## Task:
 
 You need to have a training pipeline for your model for this homework. You can take it from your test task for this course, bring your own or use this [code](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-classification) as an example.
@@ -82,5 +78,5 @@ You need to have a training pipeline for your model for this homework. You can t
 ## Criteria:
 
-- 6 PRs merged
+- 6 PRs merged.
 - Testing plan in the google doc.
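For the H6 testing homework above, a minimal sketch of the kind of data test the criteria ask for, written against the same great_expectations 0.15-style `PandasDataset` API that the nlp-sample tests later in this diff already use; the toy DataFrame, column names, and label set are illustrative assumptions, not part of the change itself.

```python
# Hypothetical data test in the spirit of the H6 homework; mirrors the pattern
# used in module-3/nlp-sample/tests/test_data.py (great-expectations 0.15.x API).
import great_expectations as ge
import pandas as pd


def test_columns_and_labels():
    # Toy frame standing in for the real train/val/test splits.
    df = pd.DataFrame(
        {"sentence": ["a good movie", "a bad movie"], "label": [1, 0], "idx": [0, 1]}
    )
    dataset = ge.dataset.PandasDataset(df)

    # Schema check: columns must appear in the expected order.
    assert dataset.expect_table_columns_to_match_ordered_list(
        column_list=["sentence", "label", "idx"]
    )["success"]

    # Content check: labels must stay inside the known label set.
    assert dataset.expect_column_values_to_be_in_set("label", [0, 1])["success"]
```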
\ No newline at end of file diff --git a/module-3/README.md b/module-3/README.md index 5f3a6d5..4e2ea36 100644 --- a/module-3/README.md +++ b/module-3/README.md @@ -2,112 +2,70 @@ ![alt text](./../docs/experiments.jpg) -# Practice +# Practice [Practice task](./PRACTICE.md) -*** +*** # Reference implementation -*** +*** - -# Project stucture +## Project stucture - [Python project](https://github.com/navdeep-G/samplemod.git) - [ML project](https://github.com/ashleve/lightning-hydra-template.git) - [Advanced features](https://github.com/Lightning-AI/lightning) -# Configuration +## Styling -[hydra](https://hydra.cc/docs/intro/) +[ruff](https://github.com/astral-sh/ruff) -# Example ML model with testing +## Configuration -[nlp-sample](./nlp-sample) +[hydra](https://hydra.cc/docs/intro/) -# Experiments +## Experiments management https://neptune.ai/blog/best-ml-experiment-tracking-tools -## AIM - -https://github.com/aimhubio/aim - - -``` -kubectl create -f aim/deployment-aim-web.yaml -kubectl port-forward svc/my-aim-service 8080:80 --namespace default -``` - - -# Model card - -- https://github.com/ivylee/model-cards-and-datasheets -- https://arxiv.org/abs/1810.03993 - - -# LLMs for everything - - -## LoRA & Peft +## Model card -- https://www.anyscale.com/blog/fine-tuning-llms-lora-or-full-parameter-an-in-depth-analysis-with-llama-2 -- https://github.com/huggingface/peft +- [Model Cards for Model Reporting](https://arxiv.org/abs/1810.03993) +- [A collection of machine learning model cards and datasheets.](https://github.com/ivylee/model-cards-and-datasheets) +- [GPT-4o](https://openai.com/index/hello-gpt-4o/) +- [GPT-4 System Card](https://cdn.openai.com/papers/gpt-4-system-card.pdf) -## Experiments +## Classic example: BERT-based training -- https://github.com/georgian-io/LLM-Finetuning-Hub -- https://medium.com/georgian-impact-blog/the-practical-guide-to-llms-llama-2-cdf21d540ce3 - -## Run example - -``` -python lora_training/mistral_classification.py training-llm --pretrained-ckpt mistralai/Mistral-7B-v0.1 --epochs 1 --train-sample-fraction 0.3 -python lora_training/mistral_classification.py training-llm --pretrained-ckpt facebook/opt-350m --epochs 1 --train-sample-fraction 0.3 - -python lora_training/mistral_classification.py inference-llm -``` - - -https://github.com/brevdev/notebooks/blob/main/mistral-finetune-own-data.ipynb - -## Run example RLHF - - -``` -docker build -t rlhf:latest . 
-docker run --net=host --gpus all -it -v ${PWD}:/main rlhf:latest /bin/bash - -accelerate config -python sft_llama2.py - -``` +[nlp-sample](./nlp-sample) -https://github.com/huggingface/trl/tree/main/examples/research_projects/stack_llama_2/scripts +https://huggingface.co/models?sort=downloads +## Modern example: GenAI-based training -## Eval: +TODO -- https://github.com/explodinggradients/ragas -- https://github.com/NVIDIA/NeMo-Guardrails -- https://github.com/guardrail-ml/guardrail -- https://github.com/promptfoo/promptfoo -- https://github.com/confident-ai/deepeval +- https://huggingface.co/microsoft/Phi-3-mini-128k-instruct/tree/main +- https://github.com/microsoft/Phi-3CookBook +- https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard +## LLM API testing -``` -pip install nemoguardrails -pip install openai -export OPENAI_API_KEY=********** -``` +- [deepeval](https://github.com/confident-ai/deepeval) +- [promptfoo](https://github.com/promptfoo/promptfoo) +- [LLM Testing in 2024: Top Methods and Strategies](https://www.confident-ai.com/blog/llm-testing-in-2024-top-methods-and-strategies) +- [uptrain](https://github.com/uptrain-ai/uptrain) +- [ragas](https://github.com/explodinggradients/ragas) +- [NeMo Guardrails](https://github.com/NVIDIA/NeMo-Guardrails) +- [Automated Unit Test Improvement using Large Language Models at Meta](https://arxiv.org/abs/2402.09171) -# Distributed training +# Distributed training - https://www.anyscale.com/blog/what-is-distributed-training - https://www.anyscale.com/blog/training-175b-parameter-language-models-at-1000-gpu-scale-with-alpa-and-ray @@ -119,8 +77,3 @@ export OPENAI_API_KEY=********** - https://github.com/microsoft/nni - https://github.com/autogluon/autogluon - - -# Declarative ML - -https://predibase.com/blog/how-to-fine-tune-llama-2-on-your-data-with-scalable-llm-infrastructure \ No newline at end of file diff --git a/module-3/aim/Dockerfile b/module-3/aim/Dockerfile deleted file mode 100644 index 57022ba..0000000 --- a/module-3/aim/Dockerfile +++ /dev/null @@ -1,17 +0,0 @@ -# python3.7 should be sufficient to run Aim -FROM python:3.7 - -# install the `aim` package on the latest version -RUN pip install --upgrade aim - -# make a directory where the Aim repo will be initialized, `/aim` -RUN mkdir /aim - -ENTRYPOINT ["/bin/sh", "-c"] - -# have to run `aim init` in the directory that stores aim data for -# otherwise `aim up` will prompt for confirmation to create the directory itself. -# We run aim listening on 0.0.0.0 to expose all ports. Also, we run -# using `--dev` to print verbose logs. Port 43800 is the default port of -# `aim up` but explicit is better than implicit. 
-CMD ["echo \"N\" | aim init --repo /aim && aim up --host 0.0.0.0 --port 43800 --workers 2 --repo /aim"] diff --git a/module-3/aim/deployment-aim-web.yaml b/module-3/aim/deployment-aim-web.yaml deleted file mode 100644 index 0b1d33f..0000000 --- a/module-3/aim/deployment-aim-web.yaml +++ /dev/null @@ -1,40 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - labels: - app: my-aim-deployment - name: my-aim-deployment - namespace: default -spec: - selector: - matchLabels: - app: my-aim-deployment - template: - metadata: - labels: - app: my-aim-deployment - spec: - containers: - - image: kyrylprojector/aim:latest - name: my-aim-deployment - ports: - - containerPort: 43800 - protocol: TCP - volumeMounts: - - mountPath: /aim - name: aim-runs - volumes: - - name: aim-runs - emptyDir: {} ---- -apiVersion: v1 -kind: Service -metadata: - name: my-aim-service -spec: - selector: - app: my-aim-deployment - ports: - - protocol: TCP - port: 80 - targetPort: 43800 \ No newline at end of file diff --git a/module-3/llm-training/Dockerfile b/module-3/llm-training/Dockerfile deleted file mode 100644 index 114ce71..0000000 --- a/module-3/llm-training/Dockerfile +++ /dev/null @@ -1,16 +0,0 @@ -FROM huggingface/transformers-pytorch-gpu:4.35.2 - -WORKDIR /app - -ENV LC_ALL=C.UTF-8 -ENV LANG=C.UTF-8 - -COPY requirements.txt requirements.txt -RUN pip install -r requirements.txt - -RUN ln -s /usr/bin/python3 /usr/bin/python - -ENV PYTHONPATH /app -COPY . . - -CMD [ "bash" ] \ No newline at end of file diff --git a/module-3/llm-training/README.md b/module-3/llm-training/README.md deleted file mode 100644 index f178840..0000000 --- a/module-3/llm-training/README.md +++ /dev/null @@ -1,49 +0,0 @@ -## Run example RLHF - - -``` -docker build -t llm-training:latest . -docker run --net=host --gpus all -it -v ${PWD}:/main llm-training:latest /bin/bash - -accelerate config -python sft_llama2.py - -``` - - - -``` -from transformers import AutoModelForCausalLM, AutoTokenizer -from datasets import load_dataset -from trl import SFTTrainer, DataCollatorForCompletionOnlyLM - -dataset = load_dataset("lucasmccabe-lmi/CodeAlpaca-20k", split="train") - -model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m") -tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m") - -def formatting_prompts_func(example): - output_texts = [] - for i in range(len(example['instruction'])): - text = f"### Question: {example['instruction'][i]}\n ### Answer: {example['output'][i]}" - output_texts.append(text) - return output_texts - -response_template = " ### Answer:" -collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer) - -trainer = SFTTrainer( - model, - train_dataset=dataset, - formatting_func=formatting_prompts_func, - data_collator=collator, -) - -trainer.train() - -``` -https://github.com/huggingface/trl/tree/main/examples/research_projects/stack_llama_2/scripts - - - -s \ No newline at end of file diff --git a/module-3/llm-training/requirements.txt b/module-3/llm-training/requirements.txt deleted file mode 100644 index 4715675..0000000 --- a/module-3/llm-training/requirements.txt +++ /dev/null @@ -1,8 +0,0 @@ -transformers -trl -peft -accelerate -datasets -bitsandbytes -wandb -ipython \ No newline at end of file diff --git a/module-3/llm-training/test_copy.py b/module-3/llm-training/test_copy.py deleted file mode 100644 index 4743c31..0000000 --- a/module-3/llm-training/test_copy.py +++ /dev/null @@ -1,110 +0,0 @@ -from transformers import AutoModelForCausalLM, AutoTokenizer -from datasets import 
load_dataset -from trl import SFTTrainer, DataCollatorForCompletionOnlyLM -from peft import LoraConfig -from transformers import AutoModelForCausalLM, BitsAndBytesConfig, HfArgumentParser, TrainingArguments -from trl import SFTTrainer, is_xpu_available -from accelerate import Accelerator -import torch - -dataset = load_dataset("imdb", split="train") - -quantization_config = BitsAndBytesConfig(load_in_8bit=False, load_in_4bit=True) -device_map = ({"": f"xpu:{Accelerator().local_process_index}"} if is_xpu_available() else {"": Accelerator().local_process_index}) -torch_dtype = torch.bfloat16 - -tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m") - - -model = AutoModelForCausalLM.from_pretrained( - "facebook/opt-350m", - quantization_config=quantization_config, - device_map=device_map, - torch_dtype=torch_dtype, -) - - - -# def formatting_prompts_func(example): -# output_texts = [] -# for i in range(len(example['instruction'])): -# text = f"### Question: {example['instruction'][i]}\n ### Answer: {example['output'][i]}" -# output_texts.append(text) -# return output_texts - -def formatting_func(example): - text = f"### Review: {example['text']}\n ### Answer: {'Positive' if example['label'] == 1 else 'Negative'}" - return text - -response_template = " ### Answer:" -# collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer) - -peft_config = LoraConfig( - r=64, - lora_alpha=16, - bias="none", - task_type="CAUSAL_LM", -) -output_dir = 'training' - -training_args = TrainingArguments( - output_dir=output_dir, - per_device_train_batch_size=4, - gradient_accumulation_steps=4, - # learning_rate=script_args.learning_rate, - # logging_steps=script_args.logging_steps, - # num_train_epochs=script_args.num_train_epochs, - # max_steps=script_args.max_steps, - # report_to=script_args.log_with, - # save_steps=script_args.save_steps, - # save_total_limit=script_args.save_total_limit, - # push_to_hub=script_args.push_to_hub, - # hub_model_id=script_args.hub_model_id, - # gradient_checkpointing=script_args.gradient_checkpointing, - # TODO: uncomment that on the next release - # gradient_checkpointing_kwargs=script_args.gradient_checkpointing_kwargs, -) - -trainer = SFTTrainer( - model, - args=training_args, - train_dataset=dataset, - packing=True, - formatting_func=formatting_func, - peft_config=peft_config -) - -trainer.train() - -trainer.save_model(output_dir) - - - - -from transformers import AutoModelForCausalLM, AutoTokenizer -from datasets import load_dataset -from trl import SFTTrainer, DataCollatorForCompletionOnlyLM -from peft import LoraConfig -from transformers import AutoModelForCausalLM, BitsAndBytesConfig, HfArgumentParser, TrainingArguments -from trl import SFTTrainer, is_xpu_available -from accelerate import Accelerator -import torch - - -tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m") -# model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m") -model = AutoModelForCausalLM.from_pretrained("training/checkpoint-1000/") -dataset = load_dataset("imdb", split="train") -example = dataset[0] -inference(dataset[15000]) - - -def inference(example): - - text = f"### Review: {example['text']}\n ### Answer:" - - input_ids = tokenizer(text, return_tensors="pt", truncation=True).input_ids - - outputs = model.generate(input_ids=input_ids, max_new_tokens=512, do_sample=True, top_p=0.95, temperature=1e-3,) - result = tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0] - print(result) diff --git 
a/module-3/llm-training/test_dpo.py b/module-3/llm-training/test_dpo.py deleted file mode 100644 index 8440f89..0000000 --- a/module-3/llm-training/test_dpo.py +++ /dev/null @@ -1,178 +0,0 @@ -from dataclasses import dataclass, field -from typing import Dict, Optional - -import torch -from datasets import Dataset, load_dataset -from transformers import AutoModelForCausalLM, AutoTokenizer, HfArgumentParser, TrainingArguments - -from trl import DPOTrainer - - -# Define and parse arguments. -@dataclass -class ScriptArguments: - """ - The arguments for the DPO training script. - """ - - # data parameters - beta: Optional[float] = field(default=0.1, metadata={"help": "the beta parameter for DPO loss"}) - - # training parameters - model_name_or_path: Optional[str] = field(default="gpt2", metadata={"help": "the model name"}) - learning_rate: Optional[float] = field(default=1e-3, metadata={"help": "optimizer learning rate"}) - per_device_train_batch_size: Optional[int] = field(default=4, metadata={"help": "batch size per device"}) - gradient_accumulation_steps: Optional[int] = field(default=1, metadata={"help": "the number of gradient accumulation steps"}) - max_length: Optional[int] = field(default=512, metadata={"help": "max length of each sample"}) - max_prompt_length: Optional[int] = field(default=128, metadata={"help": "max length of each sample's prompt"}) - max_target_length: Optional[int] = field(default=128, metadata={"help": "Only used for encoder decoder model. Max target of each sample's prompt"}) - label_pad_token_id: Optional[int] = field(default=-100, metadata={"help": "label for non response tokens"}) - max_steps: Optional[int] = field(default=1000, metadata={"help": "max number of training steps"}) - # instrumentation - sanity_check: Optional[bool] = field(default=True, metadata={"help": "only train on 1000 samples"}) - report_to: Optional[str] = field( - default=None, - metadata={ - "help": 'The list of integrations to report the results and logs to. Supported platforms are `"azure_ml"`,' - '`"comet_ml"`, `"mlflow"`, `"neptune"`, `"tensorboard"`,`"clearml"` and `"wandb"`. ' - 'Use `"all"` to report to all integrations installed, `"none"` for no integrations.' - }, - ) - # debug argument for distributed training - ignore_bias_buffers: Optional[bool] = field( - default=False, - metadata={ - "help": "fix for DDP issues with LM bias/mask buffers - invalid scalar type,`inplace operation. See" - "https://github.com/huggingface/transformers/issues/22482#issuecomment-1595790992" - }, - ) - gradient_checkpointing: Optional[bool] = field( - default=False, metadata={"help": "Whether to use gradient checkpointing or no"} - ) - gradient_checkpointing_kwargs: Optional[dict] = field( - default=None, - metadata={ - "help": "key word arguments to be passed along `torch.utils.checkpoint.checkpoint` method - e.g. `use_reentrant=False`" - }, - ) - - -def extract_anthropic_prompt(prompt_and_response): - """Extract the anthropic prompt from a prompt and response pair.""" - search_term = "\n\nAssistant:" - search_term_idx = prompt_and_response.rfind(search_term) - assert search_term_idx != -1, f"Prompt and response does not contain '{search_term}'" - return prompt_and_response[: search_term_idx + len(search_term)] - - -def get_hh(split: str, sanity_check: bool = False, silent: bool = False, cache_dir: str = None) -> Dataset: - """Load the Anthropic Helpful-Harmless dataset from Hugging Face and convert it to the necessary format. 
- - The dataset is converted to a dictionary with the following structure: - { - 'prompt': List[str], - 'chosen': List[str], - 'rejected': List[str], - } - - Prompts should be structured as follows: - \n\nHuman: \n\nAssistant: - Multiple turns are allowed, but the prompt should always start with \n\nHuman: and end with \n\nAssistant:. - """ - dataset = load_dataset("Anthropic/hh-rlhf", split=split, cache_dir=cache_dir) - if sanity_check: - dataset = dataset.select(range(min(len(dataset), 1000))) - - def split_prompt_and_responses(sample) -> Dict[str, str]: - prompt = extract_anthropic_prompt(sample["chosen"]) - return { - "prompt": prompt, - "chosen": sample["chosen"][len(prompt) :], - "rejected": sample["rejected"][len(prompt) :], - } - - return dataset.map(split_prompt_and_responses) - - -if __name__ == "__main__": - parser = HfArgumentParser(ScriptArguments) - script_args = parser.parse_args_into_dataclasses()[0] - - # 1. load a pretrained model - model = AutoModelForCausalLM.from_pretrained(script_args.model_name_or_path) - - if script_args.ignore_bias_buffers: - # torch distributed hack - model._ddp_params_and_buffers_to_ignore = [ - name for name, buffer in model.named_buffers() if buffer.dtype == torch.bool - ] - - model_ref = AutoModelForCausalLM.from_pretrained(script_args.model_name_or_path) - - tokenizer = AutoTokenizer.from_pretrained(script_args.model_name_or_path) - if tokenizer.pad_token is None: - tokenizer.pad_token = tokenizer.eos_token - - # 2. Load the Anthropic Helpful-Harmless dataset - train_dataset = get_hh("train", sanity_check=script_args.sanity_check) - - # 3. Load evaluation dataset - eval_dataset = get_hh("test", sanity_check=script_args.sanity_check) - - # 4. initialize training arguments: - training_args = TrainingArguments( - per_device_train_batch_size=script_args.per_device_train_batch_size, - max_steps=script_args.max_steps, - remove_unused_columns=False, - gradient_accumulation_steps=script_args.gradient_accumulation_steps, - learning_rate=script_args.learning_rate, - evaluation_strategy="steps", - logging_first_step=True, - logging_steps=10, # match results in blog post - eval_steps=500, - output_dir="./test", - optim="rmsprop", - warmup_steps=150, - report_to=script_args.report_to, - bf16=True, - gradient_checkpointing=script_args.gradient_checkpointing, - # TODO: uncomment that on the next transformers release - # gradient_checkpointing_kwargs=script_args.gradient_checkpointing_kwargs, - ) - - # 5. initialize the DPO trainer - dpo_trainer = DPOTrainer( - model, - model_ref, - args=training_args, - beta=script_args.beta, - train_dataset=train_dataset, - eval_dataset=eval_dataset, - tokenizer=tokenizer, - max_length=script_args.max_length, - max_target_length=script_args.max_target_length, - max_prompt_length=script_args.max_prompt_length, - generate_during_eval=True, - ) - - # 6. train - output_dir = "output-training" - dpo_trainer.train() - dpo_trainer.save_model(output_dir) - - # 7. inference - tokenizer = AutoTokenizer.from_pretrained(output_dir) - model = AutoModelForCausalLM.from_pretrained(output_dir) - # dataset = load_dataset("Anthropic/hh-rlhf", split="train") - - q = "\n\nHuman: Let's say someone wanted to rob a bank. What is some advice you could give to help them get away with it? 
\n\nAssistant:" - - def inference(q): - - input_ids = tokenizer(q, return_tensors="pt", truncation=True).input_ids - - outputs = model.generate(input_ids=input_ids, max_new_tokens=128, do_sample=True, top_p=0.95, temperature=1e-3,) - result = tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0] - print(result) - - inference(q) \ No newline at end of file diff --git a/module-3/llm-training/test_openassistant.py b/module-3/llm-training/test_openassistant.py deleted file mode 100644 index 22d8eb4..0000000 --- a/module-3/llm-training/test_openassistant.py +++ /dev/null @@ -1,71 +0,0 @@ -from transformers import AutoModelForCausalLM, AutoTokenizer -from datasets import load_dataset -from trl import SFTTrainer, DataCollatorForCompletionOnlyLM -from peft import LoraConfig -from transformers import AutoModelForCausalLM, BitsAndBytesConfig, HfArgumentParser, TrainingArguments -from trl import SFTTrainer, is_xpu_available -from accelerate import Accelerator -import torch - -dataset = load_dataset("timdettmers/openassistant-guanaco", split="train") - - -quantization_config = BitsAndBytesConfig(load_in_8bit=False, load_in_4bit=True) -device_map = ({"": f"xpu:{Accelerator().local_process_index}"} if is_xpu_available() else {"": Accelerator().local_process_index}) -torch_dtype = torch.bfloat16 - -tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m") - - -model = AutoModelForCausalLM.from_pretrained( - "facebook/opt-350m", - quantization_config=quantization_config, - device_map=device_map, - torch_dtype=torch_dtype, -) - - - -instruction_template = "### Human:" -response_template = "### Assistant:" -collator = DataCollatorForCompletionOnlyLM(instruction_template=instruction_template, response_template=response_template, tokenizer=tokenizer, mlm=False) - - -peft_config = LoraConfig( - r=64, - lora_alpha=16, - bias="none", - task_type="CAUSAL_LM", -) -output_dir = 'training' - -training_args = TrainingArguments( - output_dir=output_dir, - per_device_train_batch_size=4, - gradient_accumulation_steps=4, - # learning_rate=script_args.learning_rate, - # logging_steps=script_args.logging_steps, - # num_train_epochs=script_args.num_train_epochs, - # max_steps=script_args.max_steps, - # report_to=script_args.log_with, - # save_steps=script_args.save_steps, - # save_total_limit=script_args.save_total_limit, - # push_to_hub=script_args.push_to_hub, - # hub_model_id=script_args.hub_model_id, - # gradient_checkpointing=script_args.gradient_checkpointing, - # TODO: uncomment that on the next release - # gradient_checkpointing_kwargs=script_args.gradient_checkpointing_kwargs, -) - -trainer = SFTTrainer( - model, - args=training_args, - train_dataset=dataset, - dataset_text_field="text", - data_collator=collator, - peft_config=peft_config -) - -trainer.train() - -trainer.save_model(output_dir) diff --git a/module-3/lora_training/__init__.py b/module-3/lora_training/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/module-3/lora_training/datasets_prep.py b/module-3/lora_training/datasets_prep.py deleted file mode 100644 index 6e7a0d1..0000000 --- a/module-3/lora_training/datasets_prep.py +++ /dev/null @@ -1,85 +0,0 @@ -import pandas as pd -import datasets -from datasets import load_dataset -from sklearn.model_selection import train_test_split - -TRAINING_CLASSIFIER_PROMPT_v2 = """### Q:{sentence} ### Math:{label}""" -INFERENCE_CLASSIFIER_PROMPT_v2 = """### Sentence:{sentence} ### Class:""" - -def clean_newsgroup_data(texts, labels): - label2data = {} - 
clean_data, clean_labels = [], [] - for data, label in zip(texts, labels): - if isinstance(data, str) and isinstance(label, str): - clean_data.append(data) - clean_labels.append(label) - - if label not in label2data: - label2data[label] = data - - return label2data, clean_data, clean_labels - -def get_newsgroup_instruction_data(mode, texts, labels): - if mode == "train": - prompt = TRAINING_CLASSIFIER_PROMPT_v2 - elif mode == "inference": - prompt = INFERENCE_CLASSIFIER_PROMPT_v2 - - instructions = [] - - for text, label in zip(texts, labels): - if mode == "train": - example = prompt.format( - sentence=text, - label=label, - ) - elif mode == "inference": - example = prompt.format( - sentence=text, - ) - instructions.append(example) - - return instructions - -def get_newsgroup_data_for_ft(mode="train", train_sample_fraction=0.99): - newsgroup_dataset = load_dataset("rungalileo/20_Newsgroups_Fixed") - train_data = newsgroup_dataset["train"]["text"] - train_labels = newsgroup_dataset["train"]["label"] - label2data, train_data, train_labels = clean_newsgroup_data(train_data, train_labels) - - test_data = newsgroup_dataset["test"]["text"] - test_labels = newsgroup_dataset["test"]["label"] - _, test_data, test_labels = clean_newsgroup_data(test_data, test_labels) - - # sample n points from training data - train_df = pd.DataFrame(data={"text": train_data, "label": train_labels}) - train_df, _ = train_test_split( - train_df, - train_size=train_sample_fraction, - stratify=train_df["label"], - random_state=42, - ) - train_data = train_df["text"] - train_labels = train_df["label"] - - train_instructions = get_newsgroup_instruction_data(mode, train_data, train_labels) - test_instructions = get_newsgroup_instruction_data(mode, test_data, test_labels) - - train_dataset = datasets.Dataset.from_pandas( - pd.DataFrame( - data={ - "instructions": train_instructions, - "labels": train_labels, - } - ) - ) - test_dataset = datasets.Dataset.from_pandas( - pd.DataFrame( - data={ - "instructions": test_instructions, - "labels": test_labels, - } - ) - ) - - return train_dataset, test_dataset diff --git a/module-3/lora_training/mistral_classification.py b/module-3/lora_training/mistral_classification.py deleted file mode 100644 index 1bba3ed..0000000 --- a/module-3/lora_training/mistral_classification.py +++ /dev/null @@ -1,225 +0,0 @@ -import torch -import typer - -from peft import ( - LoraConfig, - prepare_model_for_kbit_training, - get_peft_model, -) -from transformers import ( - AutoTokenizer, - AutoModelForCausalLM, - BitsAndBytesConfig, - TrainingArguments, -) -from trl import SFTTrainer -import argparse -import torch -import os -import pandas as pd -import pickle -import warnings -from tqdm import tqdm - -from peft import AutoPeftModelForCausalLM -from transformers import AutoTokenizer -from sklearn.metrics import ( - accuracy_score, - f1_score, - precision_score, - recall_score, -) - -# from lora_training.datasets_prep import get_newsgroup_data_for_ft - -warnings.filterwarnings("ignore") - - - - - - -def training_llm( - # pretrained_ckpt: str = "mistralai/Mistral-7B-v0.1", - - # pretrained_ckpt: str = "microsoft/phi-1_5", - # pretrained_ckpt: str = "facebook/opt-350m", - - lora_r: int = 8, - epochs: int = 5, - dropout: float = 0.1, - train_sample_fraction: float = 0.25 - ): - - - train_dataset, test_dataset = get_newsgroup_data_for_ft(mode="train", train_sample_fraction=train_sample_fraction) - print(f"Sample fraction:{train_sample_fraction}") - print(f"Training samples:{train_dataset.shape}") - 
print(f"Training sample idx = 0\n:{train_dataset['instructions'][0]}") - - # BitsAndBytesConfig int-4 config - bnb_config = BitsAndBytesConfig( - load_in_4bit=True, - bnb_4bit_use_double_quant=True, - bnb_4bit_quant_type="nf4", - bnb_4bit_compute_dtype=torch.bfloat16, - ) - - # Load model and tokenizer - model = AutoModelForCausalLM.from_pretrained( - pretrained_ckpt, - quantization_config=bnb_config, - use_cache=False, - device_map="auto", - ) - model.config.pretraining_tp = 1 - - tokenizer = AutoTokenizer.from_pretrained(pretrained_ckpt) - tokenizer.pad_token = tokenizer.eos_token - tokenizer.padding_side = "right" - - # LoRA config based on QLoRA paper - peft_config = LoraConfig( - lora_alpha=16, - lora_dropout=dropout, - r=lora_r, - bias="none", - task_type="CAUSAL_LM", - ) - - # prepare model for training - model = prepare_model_for_kbit_training(model) - model = get_peft_model(model, peft_config) - model.print_trainable_parameters() - - results_dir = f"experiments/classification" - - training_args = TrainingArguments( - output_dir=results_dir, - logging_dir=f"{results_dir}/logs", - num_train_epochs=epochs, - per_device_train_batch_size=8, - gradient_accumulation_steps=2, - gradient_checkpointing=True, - optim="paged_adamw_32bit", - logging_steps=100, - learning_rate=2e-4, - bf16=True, - tf32=True, - max_grad_norm=0.3, - warmup_ratio=0.03, - lr_scheduler_type="constant", - report_to="none", - ) - max_seq_length = 512 # max sequence length for model and packing of the dataset - trainer = SFTTrainer( - model=model, - train_dataset=train_dataset, - peft_config=peft_config, - max_seq_length=max_seq_length, - tokenizer=tokenizer, - packing=True, - args=training_args, - dataset_text_field="instructions", - ) - - trainer_stats = trainer.train() - train_loss = trainer_stats.training_loss - print(f"Training loss:{train_loss}") - - peft_model_id = f"{results_dir}/assets" - trainer.model.save_pretrained(peft_model_id) - tokenizer.save_pretrained(peft_model_id) - print("Experiment over") - -def inference_llm(experiment_dir: str = 'experiments/classification'): - _, test_dataset = get_newsgroup_data_for_ft(mode="inference") - - experiment = experiment_dir - peft_model_id = f"{experiment}/assets" - - # load base LLM model and tokenizer - model = AutoPeftModelForCausalLM.from_pretrained( - peft_model_id, - low_cpu_mem_usage=True, - torch_dtype=torch.float16, - load_in_4bit=True, - ) - - model.eval() - - tokenizer = AutoTokenizer.from_pretrained(peft_model_id) - - results = [] - oom_examples = [] - instructions, labels = test_dataset["instructions"], test_dataset["labels"] - - for instruct, label in tqdm(zip(instructions, labels)): - input_ids = tokenizer( - instruct, return_tensors="pt", truncation=True - ).input_ids.cuda() - - with torch.inference_mode(): - try: - outputs = model.generate( - input_ids=input_ids, - max_new_tokens=20, - do_sample=True, - top_p=0.95, - temperature=1e-3, - ) - result = tokenizer.batch_decode( - outputs.detach().cpu().numpy(), skip_special_tokens=True - )[0] - - result = result[len(instruct) :] - print(result) - except: - result = "" - oom_examples.append(input_ids.shape[-1]) - - results.append(result) - - metrics = { - "micro_f1": f1_score(labels, results, average="micro"), - "macro_f1": f1_score(labels, results, average="macro"), - "precision": precision_score(labels, results, average="micro"), - "recall": recall_score(labels, results, average="micro"), - "accuracy": accuracy_score(labels, results), - "oom_examples": oom_examples, - } - print(metrics) - 
print(f"Completed experiment {peft_model_id}") - print("----------------------------------------") - - -def cli(): - app = typer.Typer() - app.command()(training_llm) - app.command()(inference_llm) - app() - - -# if __name__ == "__main__": -# parser = argparse.ArgumentParser() -# parser.add_argument( -# "--experiment_dir", -# default="experiments/classification-sampleFraction-0.1_epochs-5_rank-8_dropout-0.1", -# ) - -# args = parser.parse_args() -# main(args) - -# if __name__ == "__main__": -# parser = argparse.ArgumentParser() -# parser.add_argument("--pretrained_ckpt", default="mistralai/Mistral-7B-v0.1") -# parser.add_argument("--lora_r", default=8, type=int) -# parser.add_argument("--epochs", default=5, type=int) -# parser.add_argument("--dropout", default=0.1, type=float) -# parser.add_argument("--train_sample_fraction", default=0.99, type=float) - -# args = parser.parse_args() -# main(args) - -if __name__ == '__main__': - cli() \ No newline at end of file diff --git a/module-3/lora_training_rlhf/Dockerfile b/module-3/lora_training_rlhf/Dockerfile deleted file mode 100644 index 114ce71..0000000 --- a/module-3/lora_training_rlhf/Dockerfile +++ /dev/null @@ -1,16 +0,0 @@ -FROM huggingface/transformers-pytorch-gpu:4.35.2 - -WORKDIR /app - -ENV LC_ALL=C.UTF-8 -ENV LANG=C.UTF-8 - -COPY requirements.txt requirements.txt -RUN pip install -r requirements.txt - -RUN ln -s /usr/bin/python3 /usr/bin/python - -ENV PYTHONPATH /app -COPY . . - -CMD [ "bash" ] \ No newline at end of file diff --git a/module-3/lora_training_rlhf/__init__.py b/module-3/lora_training_rlhf/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/module-3/lora_training_rlhf/dpo_llama2.py b/module-3/lora_training_rlhf/dpo_llama2.py deleted file mode 100644 index 60373d1..0000000 --- a/module-3/lora_training_rlhf/dpo_llama2.py +++ /dev/null @@ -1,216 +0,0 @@ -# 0. imports -import os -from dataclasses import dataclass, field -from typing import Dict, Optional - -import torch -from datasets import Dataset, load_dataset -from peft import LoraConfig -from transformers import AutoModelForCausalLM, AutoTokenizer, HfArgumentParser, TrainingArguments - -from trl import DPOTrainer - - -# Define and parse arguments. -@dataclass -class ScriptArguments: - """ - The arguments for the DPO training script. 
- """ - - # data parameters - beta: Optional[float] = field(default=0.1, metadata={"help": "the beta parameter for DPO loss"}) - - # training parameters - model_name_or_path: Optional[str] = field(default="./sft/results/final_checkpoint",metadata={"help": "the location of the SFT model name or path"}) - learning_rate: Optional[float] = field(default=5e-4, metadata={"help": "optimizer learning rate"}) - lr_scheduler_type: Optional[str] = field(default="cosine", metadata={"help": "the lr scheduler type"}) - warmup_steps: Optional[int] = field(default=100, metadata={"help": "the number of warmup steps"}) - weight_decay: Optional[float] = field(default=0.05, metadata={"help": "the weight decay"}) - optimizer_type: Optional[str] = field(default="paged_adamw_32bit", metadata={"help": "the optimizer type"}) - - per_device_train_batch_size: Optional[int] = field(default=4, metadata={"help": "train batch size per device"}) - per_device_eval_batch_size: Optional[int] = field(default=1, metadata={"help": "eval batch size per device"}) - gradient_accumulation_steps: Optional[int] = field(default=4, metadata={"help": "the number of gradient accumulation steps"}) - gradient_checkpointing: Optional[bool] = field(default=True, metadata={"help": "whether to use gradient checkpointing"}) - - lora_alpha: Optional[float] = field(default=16, metadata={"help": "the lora alpha parameter"}) - lora_dropout: Optional[float] = field(default=0.05, metadata={"help": "the lora dropout parameter"}) - lora_r: Optional[int] = field(default=8, metadata={"help": "the lora r parameter"}) - - max_prompt_length: Optional[int] = field(default=512, metadata={"help": "the maximum prompt length"}) - max_length: Optional[int] = field(default=1024, metadata={"help": "the maximum sequence length"}) - max_steps: Optional[int] = field(default=1000, metadata={"help": "max number of training steps"}) - logging_steps: Optional[int] = field(default=10, metadata={"help": "the logging frequency"}) - save_steps: Optional[int] = field(default=100, metadata={"help": "the saving frequency"}) - eval_steps: Optional[int] = field(default=100, metadata={"help": "the evaluation frequency"}) - - output_dir: Optional[str] = field(default="./results", metadata={"help": "the output directory"}) - log_freq: Optional[int] = field(default=1, metadata={"help": "the logging frequency"}) - - # instrumentation - sanity_check: Optional[bool] = field(default=False, metadata={"help": "only train on 1000 samples"}) - report_to: Optional[str] = field( - default="wandb", - metadata={ - "help": 'The list of integrations to report the results and logs to. Supported platforms are `"azure_ml"`,' - '`"comet_ml"`, `"mlflow"`, `"neptune"`, `"tensorboard"`,`"clearml"` and `"wandb"`. ' - 'Use `"all"` to report to all integrations installed, `"none"` for no integrations.' - }, - ) - # debug argument for distributed training - ignore_bias_buffers: Optional[bool] = field( - default=False, - metadata={ - "help": "fix for DDP issues with LM bias/mask buffers - invalid scalar type,`inplace operation. See" - "https://github.com/huggingface/transformers/issues/22482#issuecomment-1595790992" - }, - ) - - -def get_stack_exchange_paired( - data_dir: str = "data/rl", - sanity_check: bool = False, - cache_dir: str = None, - num_proc=24, -) -> Dataset: - """Load the stack-exchange-paired dataset from Hugging Face and convert it to the necessary format. 
- - The dataset is converted to a dictionary with the following structure: - { - 'prompt': List[str], - 'chosen': List[str], - 'rejected': List[str], - } - - Prompts are structured as follows: - "Question: " + + "\n\nAnswer: " - """ - dataset = load_dataset( - "lvwerra/stack-exchange-paired", - split="train", - cache_dir=cache_dir, - data_dir=data_dir, - ) - original_columns = dataset.column_names - - if sanity_check: - dataset = dataset.select(range(min(len(dataset), 1000))) - - def return_prompt_and_responses(samples) -> Dict[str, str]: - return { - "prompt": ["Question: " + question + "\n\nAnswer: " for question in samples["question"]], - "chosen": samples["response_j"], - "rejected": samples["response_k"], - } - - return dataset.map( - return_prompt_and_responses, - batched=True, - num_proc=num_proc, - remove_columns=original_columns, - ) - - -if __name__ == "__main__": - parser = HfArgumentParser(ScriptArguments) - script_args = parser.parse_args_into_dataclasses()[0] - - # 1. load a pretrained model - model = AutoModelForCausalLM.from_pretrained( - script_args.model_name_or_path, - low_cpu_mem_usage=True, - # torch_dtype=torch.float16, - load_in_4bit=True, - ) - model.config.use_cache = False - - if script_args.ignore_bias_buffers: - # torch distributed hack - model._ddp_params_and_buffers_to_ignore = [ - name for name, buffer in model.named_buffers() if buffer.dtype == torch.bool - ] - - model_ref = AutoModelForCausalLM.from_pretrained( - script_args.model_name_or_path, - low_cpu_mem_usage=True, - # torch_dtype=torch.float16, - load_in_4bit=True, - ) - tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m") - tokenizer.pad_token = tokenizer.eos_token - - # 2. Load the Stack-exchange paired dataset - train_dataset = get_stack_exchange_paired(data_dir="data/rl", sanity_check=script_args.sanity_check) - train_dataset = train_dataset.filter( - lambda x: len(x["prompt"]) + len(x["chosen"]) <= script_args.max_length - and len(x["prompt"]) + len(x["rejected"]) <= script_args.max_length - ) - - # 3. Load evaluation dataset - eval_dataset = get_stack_exchange_paired(data_dir="data/evaluation", sanity_check=True) - eval_dataset = eval_dataset.filter( - lambda x: len(x["prompt"]) + len(x["chosen"]) <= script_args.max_length - and len(x["prompt"]) + len(x["rejected"]) <= script_args.max_length - ) - - # 4. initialize training arguments: - training_args = TrainingArguments( - per_device_train_batch_size=script_args.per_device_train_batch_size, - per_device_eval_batch_size=script_args.per_device_eval_batch_size, - max_steps=script_args.max_steps, - logging_steps=script_args.logging_steps, - save_steps=script_args.save_steps, - gradient_accumulation_steps=script_args.gradient_accumulation_steps, - gradient_checkpointing=script_args.gradient_checkpointing, - learning_rate=script_args.learning_rate, - evaluation_strategy="steps", - eval_steps=script_args.eval_steps, - output_dir=script_args.output_dir, - report_to=script_args.report_to, - lr_scheduler_type=script_args.lr_scheduler_type, - warmup_steps=script_args.warmup_steps, - optim=script_args.optimizer_type, - bf16=True, - remove_unused_columns=False, - run_name="dpo_llama2", - ) - - peft_config = LoraConfig( - r=script_args.lora_r, - lora_alpha=script_args.lora_alpha, - lora_dropout=script_args.lora_dropout, - target_modules=[ - "q_proj", - "v_proj", - "k_proj", - "out_proj", - "fc_in", - "fc_out", - "wte", - ], - bias="none", - task_type="CAUSAL_LM", - ) - - # 5. 
initialize the DPO trainer - dpo_trainer = DPOTrainer( - model, - model_ref, - args=training_args, - beta=script_args.beta, - train_dataset=train_dataset, - eval_dataset=eval_dataset, - tokenizer=tokenizer, - peft_config=peft_config, - max_prompt_length=script_args.max_prompt_length, - max_length=script_args.max_length, - ) - - # 6. train - dpo_trainer.train() - dpo_trainer.save_model(script_args.output_dir) - - # 7. save - output_dir = os.path.join(script_args.output_dir, "final_checkpoint") - dpo_trainer.model.save_pretrained(output_dir) \ No newline at end of file diff --git a/module-3/lora_training_rlhf/requirements.txt b/module-3/lora_training_rlhf/requirements.txt deleted file mode 100644 index ca124e5..0000000 --- a/module-3/lora_training_rlhf/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -transformers -trl -peft -accelerate -datasets -bitsandbytes -wandb diff --git a/module-3/lora_training_rlhf/sft_llama2.py b/module-3/lora_training_rlhf/sft_llama2.py deleted file mode 100644 index f2dbb93..0000000 --- a/module-3/lora_training_rlhf/sft_llama2.py +++ /dev/null @@ -1,208 +0,0 @@ -# Fine-Tune Llama2-7b on SE paired dataset -import os -from dataclasses import dataclass, field -from typing import Optional - -import torch -import tyro -from accelerate import Accelerator -from datasets import load_dataset -from peft import AutoPeftModelForCausalLM, LoraConfig -from tqdm import tqdm -from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments - -from trl import SFTTrainer -from trl.import_utils import is_xpu_available -from trl.trainer import ConstantLengthDataset - - -@dataclass -class ScriptArguments: - model_name: Optional[str] = field(default="facebook/opt-350m", metadata={"help": "the model name"}) - dataset_name: Optional[str] = field(default="lvwerra/stack-exchange-paired", metadata={"help": "the dataset name"}) - subset: Optional[str] = field(default="data/finetune", metadata={"help": "the subset to use"}) - split: Optional[str] = field(default="train", metadata={"help": "the split to use"}) - size_valid_set: Optional[int] = field(default=4000, metadata={"help": "the size of the validation set"}) - streaming: Optional[bool] = field(default=True, metadata={"help": "whether to stream the dataset"}) - shuffle_buffer: Optional[int] = field(default=5000, metadata={"help": "the shuffle buffer size"}) - seq_length: Optional[int] = field(default=1024, metadata={"help": "the sequence length"}) - num_workers: Optional[int] = field(default=4, metadata={"help": "the number of workers"}) - - training_args: TrainingArguments = field( - default_factory=lambda: TrainingArguments( - output_dir="./results", - max_steps=500, - logging_steps=10, - save_steps=10, - per_device_train_batch_size=2, - per_device_eval_batch_size=1, - gradient_accumulation_steps=2, - gradient_checkpointing=False, - group_by_length=False, - learning_rate=1e-4, - lr_scheduler_type="cosine", - warmup_steps=100, - weight_decay=0.05, - optim="paged_adamw_32bit", - bf16=True, - remove_unused_columns=False, - run_name="sft_llama2", - report_to="wandb", - ) - ) - - packing: Optional[bool] = field(default=True, metadata={"help": "whether to use packing for SFTTrainer"}) - - peft_config: LoraConfig = field( - default_factory=lambda: LoraConfig( - r=8, - lora_alpha=16, - lora_dropout=0.05, - target_modules=["q_proj", "v_proj"], - bias="none", - task_type="CAUSAL_LM", - ) - ) - - -script_args = ScriptArguments() - -if script_args.training_args.group_by_length and script_args.packing: - raise 
ValueError("Cannot use both packing and group by length") - -# `gradient_checkpointing` was True by default until `1f3314`, but it's actually not used. -# `gradient_checkpointing=True` will cause `Variable._execution_engine.run_backward`. -if script_args.training_args.gradient_checkpointing: - raise ValueError("gradient_checkpointing not supported") - - -def chars_token_ratio(dataset, tokenizer, nb_examples=400): - """ - Estimate the average number of characters per token in the dataset. - """ - total_characters, total_tokens = 0, 0 - for _, example in tqdm(zip(range(nb_examples), iter(dataset)), total=nb_examples): - text = prepare_sample_text(example) - total_characters += len(text) - if tokenizer.is_fast: - total_tokens += len(tokenizer(text).tokens()) - else: - total_tokens += len(tokenizer.tokenize(text)) - - return total_characters / total_tokens - - -def print_trainable_parameters(model): - """ - Prints the number of trainable parameters in the model. - """ - trainable_params = 0 - all_param = 0 - for _, param in model.named_parameters(): - all_param += param.numel() - if param.requires_grad: - trainable_params += param.numel() - print( - f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}" - ) - - -def prepare_sample_text(example): - """Prepare the text from a sample of the dataset.""" - text = f"Question: {example['question']}\n\nAnswer: {example['response_j']}" - return text - - -def create_datasets(tokenizer, args): - dataset = load_dataset( - args.dataset_name, - data_dir=args.subset, - split=args.split, - num_proc=args.num_workers if not args.streaming else None, - streaming=args.streaming, - ) - if args.streaming: - print("Loading the dataset in streaming mode") - valid_data = dataset.take(args.size_valid_set) - train_data = dataset.skip(args.size_valid_set) - train_data = train_data.shuffle(buffer_size=args.shuffle_buffer, seed=None) - else: - dataset = dataset.train_test_split(test_size=0.005, seed=None) - train_data = dataset["train"] - valid_data = dataset["test"] - print(f"Size of the train set: {len(train_data)}. 
Size of the validation set: {len(valid_data)}") - - chars_per_token = chars_token_ratio(train_data, tokenizer) - print(f"The character to token ratio of the dataset is: {chars_per_token:.2f}") - - train_dataset = ConstantLengthDataset( - tokenizer, - train_data, - formatting_func=prepare_sample_text, - infinite=True, - seq_length=args.seq_length, - chars_per_token=chars_per_token, - ) - valid_dataset = ConstantLengthDataset( - tokenizer, - valid_data, - formatting_func=prepare_sample_text, - infinite=False, - seq_length=args.seq_length, - chars_per_token=chars_per_token, - ) - return train_dataset, valid_dataset - - -bnb_config = BitsAndBytesConfig( - load_in_4bit=True, - bnb_4bit_quant_type="nf4", - bnb_4bit_compute_dtype=torch.bfloat16, -) - -base_model = AutoModelForCausalLM.from_pretrained( - script_args.model_name, - quantization_config=bnb_config, - device_map={"": Accelerator().local_process_index}, - trust_remote_code=True, -) -base_model.config.use_cache = False - -peft_config = script_args.peft_config - -tokenizer = AutoTokenizer.from_pretrained(script_args.model_name, trust_remote_code=True) -tokenizer.pad_token = tokenizer.eos_token -tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training - -training_args = script_args.training_args - -train_dataset, eval_dataset = create_datasets(tokenizer, script_args) - -trainer = SFTTrainer( - model=base_model, - train_dataset=train_dataset, - eval_dataset=eval_dataset, - peft_config=peft_config, - packing=script_args.packing, - max_seq_length=None, - tokenizer=tokenizer, - args=training_args, -) -trainer.train() -trainer.save_model(script_args.training_args.output_dir) - -output_dir = os.path.join(script_args.training_args.output_dir, "final_checkpoint") -trainer.model.save_pretrained(output_dir) - -# Free memory for merging weights -del base_model -if is_xpu_available(): - torch.xpu.empty_cache() -else: - torch.cuda.empty_cache() - -model = AutoPeftModelForCausalLM.from_pretrained(output_dir, device_map="auto", torch_dtype=torch.bfloat16) -model = model.merge_and_unload() - -output_merged_dir = os.path.join(script_args.training_args.output_dir, "final_merged_checkpoint") -model.save_pretrained(output_merged_dir, safe_serialization=True) \ No newline at end of file diff --git a/module-3/nlp-sample/.gitignore b/module-3/nlp-sample/.gitignore index 37fc9d4..f22fa0c 100644 --- a/module-3/nlp-sample/.gitignore +++ b/module-3/nlp-sample/.gitignore @@ -88,3 +88,5 @@ ENV/ # Rope project settings .ropeproject +data/ +wandb/ \ No newline at end of file diff --git a/module-3/nlp-sample/Makefile b/module-3/nlp-sample/Makefile index bb35eaa..39700e2 100644 --- a/module-3/nlp-sample/Makefile +++ b/module-3/nlp-sample/Makefile @@ -8,11 +8,10 @@ run_dev_gpu: build docker run --net=host --gpus all -it -v ${PWD}:/main nlp-sample:latest /bin/bash format: - black --line-length 120 nlp_sample tests - isort -rc nlp_sample tests + ruff format nlp_sample/ tests/ lint: - flake8 --max-line-length 120 nlp_sample tests + ruff check nlp_sample/ tests/ test: pytest --disable-warnings ./tests/ diff --git a/module-3/nlp-sample/README.md b/module-3/nlp-sample/README.md index f9ca2c1..1ea6d74 100644 --- a/module-3/nlp-sample/README.md +++ b/module-3/nlp-sample/README.md @@ -1,24 +1,24 @@ -# NLP sample +# NLP sample -## Setup +## Setup -``` +```bash make build ``` ## Develop -``` +```bash make run_dev cd /main export PYTHONPATH=. 
-export WANDB_PROJECT=course-27-10-2023-week-3 +export WANDB_PROJECT=ml-in-production-practice export WANDB_API_KEY=*********************** ``` ## Test -``` +```bash make test ``` @@ -26,6 +26,6 @@ reference: https://madewithml.com/courses/mlops/testing/ ## Reports +```bash +open https://wandb.ai/truskovskiyk/ml-in-production-practice ``` -open https://wandb.ai/truskovskiyk/nlp-sample/ -``` \ No newline at end of file diff --git a/module-3/nlp-sample/conf/example.json b/module-3/nlp-sample/conf/example.json index bd9be30..2090ed7 100644 --- a/module-3/nlp-sample/conf/example.json +++ b/module-3/nlp-sample/conf/example.json @@ -11,7 +11,7 @@ "use_fast_tokenizer": true, - "evaluation_strategy": "steps", + "eval_strategy": "steps", "per_device_train_batch_size": 32, "per_device_eval_batch_size": 32, "gradient_accumulation_steps": 1, @@ -27,20 +27,18 @@ "lr_scheduler_type": "linear", "logging_strategy": "steps", "logging_first_step": true, - "logging_steps": 50, + "logging_steps": 250, "save_strategy": "steps", - "save_steps": 500, + "save_steps": 250, "save_total_limit": 5, "no_cuda": false, "seed": 42, - "eval_steps": 50, + "eval_steps": 250, "run_name": "results", "disable_tqdm": false, "remove_unused_columns": true, "label_names": null, "load_best_model_at_end": true, - - "metric_for_best_model": "eval_f1", "greater_is_better": true, "report_to": ["wandb"] diff --git a/module-3/nlp-sample/conf/fast.json b/module-3/nlp-sample/conf/fast.json index 0779283..5e1bcc9 100644 --- a/module-3/nlp-sample/conf/fast.json +++ b/module-3/nlp-sample/conf/fast.json @@ -5,7 +5,7 @@ "validation_file": "./data/val.csv", "max_seq_length": 128, "output_dir": "/tmp/results", - "evaluation_strategy": "steps", + "eval_strategy": "steps", "per_device_train_batch_size": 32, "per_device_eval_batch_size": 32, "gradient_accumulation_steps": 1, @@ -16,18 +16,18 @@ "adam_beta2": 0.999, "adam_epsilon": 1e-08, "max_grad_norm": 1, - "num_train_epochs": 10, + "num_train_epochs": 1, "max_steps": -1, "lr_scheduler_type": "linear", "logging_strategy": "steps", "logging_first_step": true, - "logging_steps": 50, + "logging_steps": 250, "save_strategy": "steps", - "save_steps": 500, + "save_steps": 250, "save_total_limit": 5, "no_cuda": false, "seed": 42, - "eval_steps": 50, + "eval_steps": 250, "run_name": "results", "disable_tqdm": false, "remove_unused_columns": true, @@ -35,5 +35,5 @@ "load_best_model_at_end": true, "metric_for_best_model": "eval_f1", "greater_is_better": true, - "report_to": [] + "report_to": ["wandb"] } \ No newline at end of file diff --git a/module-3/nlp-sample/nlp_sample/data.py b/module-3/nlp-sample/nlp_sample/data.py index a228c7e..0109860 100644 --- a/module-3/nlp-sample/nlp_sample/data.py +++ b/module-3/nlp-sample/nlp_sample/data.py @@ -5,30 +5,38 @@ from sklearn.model_selection import train_test_split -def load_cola_data(path_to_save: Path, random_state: int = 42): - path_to_save.mkdir(parents=True, exist_ok=True) - - dataset = load_dataset("glue", "cola") - df_all = ArrowReader.read_table(dataset.cache_files["train"][0]["filename"]).to_pandas() - df_test = ArrowReader.read_table(dataset.cache_files["test"][0]["filename"]).to_pandas() +def _get_cola_data(random_state: int = 42): + dataset = load_dataset("glue", "sst2") + df_all = ArrowReader.read_table( + dataset.cache_files["train"][0]["filename"] + ).to_pandas() + df_test = ArrowReader.read_table( + dataset.cache_files["test"][0]["filename"] + ).to_pandas() df_train, df_val = train_test_split(df_all, random_state=random_state) + return df_train, 
df_val, df_test + + +def load_cola_data(path_to_save: Path): + path_to_save.mkdir(parents=True, exist_ok=True) + + df_train, df_val, df_test = _get_cola_data() + df_train.to_csv(path_to_save / "train.csv", index=False) df_val.to_csv(path_to_save / "val.csv", index=False) df_test.to_csv(path_to_save / "test.csv", index=False) -def load_cola_data_file_input(path_to_train: Path, path_to_val: Path, path_to_test: Path, random_state: int = 42): +def load_cola_data_file_input( + path_to_train: Path, path_to_val: Path, path_to_test: Path +): path_to_train.parent.mkdir(parents=True, exist_ok=True) path_to_val.parent.mkdir(parents=True, exist_ok=True) path_to_test.parent.mkdir(parents=True, exist_ok=True) - dataset = load_dataset("glue", "cola") - df_all = ArrowReader.read_table(dataset.cache_files["train"][0]["filename"]).to_pandas() - df_test = ArrowReader.read_table(dataset.cache_files["test"][0]["filename"]).to_pandas() - - df_train, df_val = train_test_split(df_all, random_state=random_state) + df_train, df_val, df_test = _get_cola_data() df_train.to_csv(path_to_train, index=False) df_val.to_csv(path_to_val, index=False) diff --git a/module-3/nlp-sample/nlp_sample/predictor.py b/module-3/nlp-sample/nlp_sample/predictor.py index 271a4fa..d2b8d10 100644 --- a/module-3/nlp-sample/nlp_sample/predictor.py +++ b/module-3/nlp-sample/nlp_sample/predictor.py @@ -18,7 +18,9 @@ def __init__(self, model_load_path: str): @torch.no_grad() def predict(self, text: str): - text_encoded = self.tokenizer.batch_encode_plus(list(text), return_tensors="pt", padding=True) + text_encoded = self.tokenizer.batch_encode_plus( + list(text), return_tensors="pt", padding=True + ) bert_outputs = self.model(**text_encoded).logits return softmax(bert_outputs).numpy() diff --git a/module-3/nlp-sample/nlp_sample/train.py b/module-3/nlp-sample/nlp_sample/train.py index 21651fa..3a178e9 100644 --- a/module-3/nlp-sample/nlp_sample/train.py +++ b/module-3/nlp-sample/nlp_sample/train.py @@ -24,16 +24,23 @@ def get_args(): - parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) + parser = HfArgumentParser( + (ModelArguments, DataTrainingArguments, TrainingArguments) + ) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): - model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) + model_args, data_args, training_args = parser.parse_json_file( + json_file=os.path.abspath(sys.argv[1]) + ) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() return model_args, data_args, training_args def read_dataset(data_args: DataTrainingArguments, cache_dir: str): - data_files = {"train": data_args.train_file, "validation": data_args.validation_file} + data_files = { + "train": data_args.train_file, + "validation": data_args.validation_file, + } raw_datasets = load_dataset("csv", data_files=data_files, cache_dir=cache_dir) label_list = raw_datasets["train"].unique("label") @@ -44,12 +51,16 @@ def read_dataset(data_args: DataTrainingArguments, cache_dir: str): def get_models(model_args, num_labels): config = AutoConfig.from_pretrained( - model_args.config_name if model_args.config_name else model_args.model_name_or_path, + model_args.config_name + if model_args.config_name + else model_args.model_name_or_path, num_labels=num_labels, cache_dir=model_args.cache_dir, ) tokenizer = AutoTokenizer.from_pretrained( - model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, + model_args.tokenizer_name + if 
diff --git a/module-3/nlp-sample/nlp_sample/predictor.py b/module-3/nlp-sample/nlp_sample/predictor.py
index 271a4fa..d2b8d10 100644
--- a/module-3/nlp-sample/nlp_sample/predictor.py
+++ b/module-3/nlp-sample/nlp_sample/predictor.py
@@ -18,7 +18,9 @@ def __init__(self, model_load_path: str):

     @torch.no_grad()
     def predict(self, text: str):
-        text_encoded = self.tokenizer.batch_encode_plus(list(text), return_tensors="pt", padding=True)
+        text_encoded = self.tokenizer.batch_encode_plus(
+            list(text), return_tensors="pt", padding=True
+        )
         bert_outputs = self.model(**text_encoded).logits
         return softmax(bert_outputs).numpy()
diff --git a/module-3/nlp-sample/nlp_sample/train.py b/module-3/nlp-sample/nlp_sample/train.py
index 21651fa..3a178e9 100644
--- a/module-3/nlp-sample/nlp_sample/train.py
+++ b/module-3/nlp-sample/nlp_sample/train.py
@@ -24,16 +24,23 @@ def get_args():
-    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
+    parser = HfArgumentParser(
+        (ModelArguments, DataTrainingArguments, TrainingArguments)
+    )
     if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
-        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
+        model_args, data_args, training_args = parser.parse_json_file(
+            json_file=os.path.abspath(sys.argv[1])
+        )
     else:
         model_args, data_args, training_args = parser.parse_args_into_dataclasses()
     return model_args, data_args, training_args


 def read_dataset(data_args: DataTrainingArguments, cache_dir: str):
-    data_files = {"train": data_args.train_file, "validation": data_args.validation_file}
+    data_files = {
+        "train": data_args.train_file,
+        "validation": data_args.validation_file,
+    }
     raw_datasets = load_dataset("csv", data_files=data_files, cache_dir=cache_dir)

     label_list = raw_datasets["train"].unique("label")
@@ -44,12 +51,16 @@ def read_dataset(data_args: DataTrainingArguments, cache_dir: str):
 def get_models(model_args, num_labels):
     config = AutoConfig.from_pretrained(
-        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
+        model_args.config_name
+        if model_args.config_name
+        else model_args.model_name_or_path,
         num_labels=num_labels,
         cache_dir=model_args.cache_dir,
     )
     tokenizer = AutoTokenizer.from_pretrained(
-        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
+        model_args.tokenizer_name
+        if model_args.tokenizer_name
+        else model_args.model_name_or_path,
         cache_dir=model_args.cache_dir,
         use_fast=model_args.use_fast_tokenizer,
     )
@@ -62,7 +73,9 @@ def get_models(model_args, num_labels):
     return config, tokenizer, model


-def process_dataset(data_args, label_list, model, config, tokenizer, training_args, raw_datasets):
+def process_dataset(
+    data_args, label_list, model, config, tokenizer, training_args, raw_datasets
+):
     padding = "max_length" if data_args.pad_to_max_length else False

     label_to_id = {v: i for i, v in enumerate(label_list)}
@@ -104,7 +117,9 @@ def process_dataset(data_args, label_list, model, config, tokenizer, training_ar
     return train_dataset, eval_dataset


-def get_trainer(model, train_dataset, data_args, training_args, eval_dataset, tokenizer) -> Trainer:
+def get_trainer(
+    model, train_dataset, data_args, training_args, eval_dataset, tokenizer
+) -> Trainer:
     # Log a few random samples from the training set:
     for index in random.sample(range(len(train_dataset)), 3):
         logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
@@ -127,7 +142,9 @@ def get_trainer(model, train_dataset, data_args, training_args, eval_dataset, to
 def get_config(config_path: Path):
-    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
+    parser = HfArgumentParser(
+        (ModelArguments, DataTrainingArguments, TrainingArguments)
+    )
     model_args, data_args, training_args = parser.parse_json_file(config_path)
     return model_args, data_args, training_args

@@ -143,9 +160,11 @@ def train(config_path: Path):
     set_seed(training_args.seed)

-    raw_datasets, num_labels, label_list = read_dataset(data_args=data_args, cache_dir=model_args.cache_dir)
+    raw_datasets, num_labels, label_list = read_dataset(
+        data_args=data_args, cache_dir=model_args.cache_dir
+    )
     config, tokenizer, model = get_models(model_args=model_args, num_labels=num_labels)
-    
+
     train_dataset, eval_dataset = process_dataset(
         data_args=data_args,
         label_list=label_list,
diff --git a/module-3/nlp-sample/nlp_sample/utils.py b/module-3/nlp-sample/nlp_sample/utils.py
index 050e4ae..d272f38 100644
--- a/module-3/nlp-sample/nlp_sample/utils.py
+++ b/module-3/nlp-sample/nlp_sample/utils.py
@@ -20,13 +20,19 @@ def compute_metrics(p: EvalPrediction) -> Dict[str, float]:
     }


-def preprocess_function_examples(examples, tokenizer, padding, max_seq_length, label_to_id):
+def preprocess_function_examples(
+    examples, tokenizer, padding, max_seq_length, label_to_id
+):
     sentence1_key = "sentence"
     args = (examples[sentence1_key],)
-    result = tokenizer(*args, padding=padding, max_length=max_seq_length, truncation=True)
+    result = tokenizer(
+        *args, padding=padding, max_length=max_seq_length, truncation=True
+    )
     # Map labels to IDs (not necessary for GLUE tasks)
     if label_to_id is not None and "label" in examples:
-        result["label"] = [(label_to_id[label] if label != -1 else -1) for label in examples["label"]]
+        result["label"] = [
+            (label_to_id[label] if label != -1 else -1) for label in examples["label"]
+        ]
     return result
diff --git a/module-3/nlp-sample/requirements.txt b/module-3/nlp-sample/requirements.txt
index 2a365d9..1e2a7ec 100644
--- a/module-3/nlp-sample/requirements.txt
+++ b/module-3/nlp-sample/requirements.txt
@@ -1,10 +1,8 @@
 transformers==4.42.3
 datasets==2.15.0
-accelerate==0.24.1
-# peft==0.6.2
+accelerate==0.32.1
 typer==0.6.1
-wandb==0.13.3
+wandb==0.17.4
 ruff==0.5.0
 great-expectations==0.15.25
-pytest-cov==3.0.0
-# bitsandbytes==0.41.2
\ No newline at end of file
+pytest-cov==3.0.0
\ No newline at end of file
diff --git a/module-3/nlp-sample/tests/conftest.py b/module-3/nlp-sample/tests/conftest.py
index b89b303..4348b2a 100644
--- a/module-3/nlp-sample/tests/conftest.py
+++ b/module-3/nlp-sample/tests/conftest.py
@@ -25,4 +25,8 @@ def data(data_path: Path) -> Tuple[PandasDataset, PandasDataset]:
     df_val = pd.read_csv(data_path / "val.csv")
     df_test = pd.read_csv(data_path / "test.csv")

-    return ge.dataset.PandasDataset(df_train), ge.dataset.PandasDataset(df_val), ge.dataset.PandasDataset(df_test)
+    return (
+        ge.dataset.PandasDataset(df_train),
+        ge.dataset.PandasDataset(df_val),
+        ge.dataset.PandasDataset(df_test),
+    )
diff --git a/module-3/nlp-sample/tests/data/test_config.json b/module-3/nlp-sample/tests/data/test_config.json
index ef9a026..503f644 100644
--- a/module-3/nlp-sample/tests/data/test_config.json
+++ b/module-3/nlp-sample/tests/data/test_config.json
@@ -5,7 +5,7 @@
     "validation_file": "/tmp/data/val.csv",
     "max_seq_length": 128,
     "output_dir": "/tmp/results",
-    "evaluation_strategy": "steps",
+    "eval_strategy": "steps",
     "per_device_train_batch_size": 2,
     "per_device_eval_batch_size": 2,
     "gradient_accumulation_steps": 1,
diff --git a/module-3/nlp-sample/tests/test_cli.py b/module-3/nlp-sample/tests/test_cli.py
index 3467910..b6ccb63 100644
--- a/module-3/nlp-sample/tests/test_cli.py
+++ b/module-3/nlp-sample/tests/test_cli.py
@@ -7,14 +7,15 @@ def test_app():
     result = runner.invoke(app, ["load-cola-data", "/tmp/data"])
-    assert result.exit_code == 0
+    assert result.exit_code == 0, result.exception
     assert Path("/tmp/data/train.csv").exists()
     assert Path("/tmp/data/val.csv").exists()
     assert Path("/tmp/data/test.csv").exists()

     result = runner.invoke(app, ["train", "tests/data/test_config.json"])
-    assert result.exit_code == 0
+    assert result.exit_code == 0, result.exception
     assert Path("/tmp/results").exists()

     result = runner.invoke(app, ["upload-to-registry", "cli-test", "/tmp/results"])
-    assert result.exit_code == 0
+    assert result.exit_code == 0, result.exception
+
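The `result.exception` additions in test_cli.py make failing CLI tests easier to debug: Typer's `CliRunner` catches exceptions raised inside a command instead of letting them propagate, so attaching the captured exception as the assert message surfaces the real error. A minimal sketch of the pattern; the `nlp_sample.cli` import path is an assumption for illustration only:

```python
# Sketch of the assertion pattern used in test_cli.py.
from typer.testing import CliRunner

from nlp_sample.cli import app  # hypothetical module exposing the Typer app

runner = CliRunner()


def test_load_cola_data_cli():
    result = runner.invoke(app, ["load-cola-data", "/tmp/data"])
    # CliRunner stores any raised exception on `result.exception`; using it as
    # the assert message prints the underlying error when the command fails.
    assert result.exit_code == 0, result.exception
```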
diff --git a/module-3/nlp-sample/tests/test_data.py b/module-3/nlp-sample/tests/test_data.py
index 51a2911..5f0cb5d 100644
--- a/module-3/nlp-sample/tests/test_data.py
+++ b/module-3/nlp-sample/tests/test_data.py
@@ -5,16 +5,22 @@
 def test_data_shape(data: Tuple[PandasDataset, PandasDataset, PandasDataset]):
     df_train, df_val, df_test = data
-    assert df_train.shape[0] + df_val.shape[0] == 8551
+    assert df_train.shape[0] + df_val.shape[0] == 67349
     assert df_train.shape[1] == df_val.shape[1] == 3
-    assert df_test.shape == (1063, 3)
+    assert df_test.shape == (1821, 3)


 def test_data_order(data: Tuple[PandasDataset, PandasDataset, PandasDataset]):
     df_train, df_val, df_test = data
-    assert df_train.expect_table_columns_to_match_ordered_list(column_list=["sentence", "label", "idx"])["success"]
-    assert df_val.expect_table_columns_to_match_ordered_list(column_list=["sentence", "label", "idx"])["success"]
-    assert df_test.expect_table_columns_to_match_ordered_list(column_list=["sentence", "label", "idx"])["success"]
+    assert df_train.expect_table_columns_to_match_ordered_list(
+        column_list=["sentence", "label", "idx"]
+    )["success"]
+    assert df_val.expect_table_columns_to_match_ordered_list(
+        column_list=["sentence", "label", "idx"]
+    )["success"]
+    assert df_test.expect_table_columns_to_match_ordered_list(
+        column_list=["sentence", "label", "idx"]
+    )["success"]


 def test_data_content(data: Tuple[PandasDataset, PandasDataset, PandasDataset]):
diff --git a/module-3/nlp-sample/tests/test_model.py b/module-3/nlp-sample/tests/test_model.py
index f7e170b..ddde5a5 100644
--- a/module-3/nlp-sample/tests/test_model.py
+++ b/module-3/nlp-sample/tests/test_model.py
@@ -4,7 +4,13 @@
 from transformers import Trainer, TrainingArguments

 from nlp_sample.config import DataTrainingArguments, ModelArguments
-from nlp_sample.train import get_models, get_trainer, process_dataset, read_dataset, train
+from nlp_sample.train import (
+    get_models,
+    get_trainer,
+    process_dataset,
+    read_dataset,
+    train,
+)


 @pytest.fixture()
@@ -24,14 +30,20 @@ def data_args(data_path: Path) -> DataTrainingArguments:
 @pytest.fixture()
 def training_args() -> TrainingArguments:
-    return TrainingArguments(output_dir="/tmp/test", num_train_epochs=1000, report_to=[], learning_rate=5e-04)
+    return TrainingArguments(
+        output_dir="/tmp/test", num_train_epochs=1000, report_to=[], learning_rate=5e-04
+    )


 @pytest.fixture()
 def trainer_with_one_batch(
-    model_args: ModelArguments, data_args: DataTrainingArguments, training_args: TrainingArguments
+    model_args: ModelArguments,
+    data_args: DataTrainingArguments,
+    training_args: TrainingArguments,
 ) -> Trainer:
-    raw_datasets, num_labels, label_list = read_dataset(data_args=data_args, cache_dir=model_args.cache_dir)
+    raw_datasets, num_labels, label_list = read_dataset(
+        data_args=data_args, cache_dir=model_args.cache_dir
+    )
     config, tokenizer, model = get_models(model_args=model_args, num_labels=num_labels)
     train_dataset, eval_dataset = process_dataset(
         data_args=data_args,
diff --git a/module-4/dist_training/README.md b/module-4/dist_training/README.md
deleted file mode 100644
index ffca340..0000000
--- a/module-4/dist_training/README.md
+++ /dev/null
@@ -1,13 +0,0 @@
-```
-docker run -it -v $PWD:/app --net=host --gpus all --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 huggingface/transformers-pytorch-gpu:4.28.1 /bin/bash
-
-
-torchrun --nnodes=2 --nproc_per_node=1 --node-rank=0 --master-addr=172.31.3.162 --master-port=8888 torch_native_dist.py 1000 100
-torchrun --nnodes=2 --nproc_per_node=1 --node-rank=1 --master-addr=172.31.3.162 --master-port=8888 torch_native_dist.py 1000 100
-
-
-accelerate launch --multi_gpu --num_machines 2 --num_processes 2 --machine_rank 0 --main_process_ip 172.31.3.162 --main_process_port 8888 accelerate_run.py
-accelerate launch --multi_gpu --num_machines 2 --num_processes 2 --machine_rank 1 --main_process_ip 172.31.3.162 --main_process_port 8888 accelerate_run.py
-
-accelerate launch accelerate_run.py
-```
\ No newline at end of file
diff --git a/module-4/dist_training/accelerate_run.py b/module-4/dist_training/accelerate_run.py
deleted file mode 100644
index bf5871c..0000000
--- a/module-4/dist_training/accelerate_run.py
+++ /dev/null
@@ -1,64 +0,0 @@
-from transformers import Trainer, TrainingArguments
-import torch
-import torch.nn.functional as F
-from torch.utils.data import Dataset
-
-
-import torch
-from torch.utils.data import Dataset
-
-
-class MyTrainDataset(Dataset):
-    def __init__(self, size):
-        self.size = size
-        self.data = [(torch.rand(20), torch.rand(1).long()) for _ in range(size)]
-
-    def __len__(self):
-        return self.size
-
-    def __getitem__(self, index):
-        return self.data[index]
-
-
-def load_train_objs():
-    train_set = MyTrainDataset(2048)  # load your dataset
-    model = torch.nn.Linear(20, 1)  # load your model
-    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
-    return train_set, model, optimizer
-
-
-def collate_fn(examples):
-    pixel_values = torch.stack([example[0] for example in examples])
-    labels = torch.tensor([example[1] for example in examples])
-    return {"x": pixel_values, "labels": labels}
-
-
-class MyTrainer(Trainer):
-    def compute_loss(self, model, inputs, return_outputs=False):
-        outputs = model(inputs["x"])
-        target = inputs["labels"]
-        loss = F.cross_entropy(outputs, target)
-        return (loss, outputs) if return_outputs else loss
-
-
-def train():
-    train_set, model, optimizer = load_train_objs()
-
-    training_args = TrainingArguments(
-        "basic-trainer", per_device_train_batch_size=64, num_train_epochs=1000, remove_unused_columns=False
-    )
-
-    trainer = MyTrainer(
-        model,
-        training_args,
-        train_dataset=train_set,
-        data_collator=collate_fn,
-    )
-
-    trainer.train()
-
-
-if __name__ == "__main__":
-    train()
-
-# accelerate launch accelerate.py
diff --git a/module-4/dist_training/torch_native.py b/module-4/dist_training/torch_native.py
deleted file mode 100644
index ba593f8..0000000
--- a/module-4/dist_training/torch_native.py
+++ /dev/null
@@ -1,95 +0,0 @@
-import torch
-import torch.nn.functional as F
-from torch.utils.data import Dataset, DataLoader
-
-import torch
-from torch.utils.data import Dataset
-
-
-class MyTrainDataset(Dataset):
-    def __init__(self, size):
-        self.size = size
-        self.data = [(torch.rand(20), torch.rand(1)) for _ in range(size)]
-
-    def __len__(self):
-        return self.size
-
-    def __getitem__(self, index):
-        return self.data[index]
-
-
-class Trainer:
-    def __init__(
-        self,
-        model: torch.nn.Module,
-        train_data: DataLoader,
-        optimizer: torch.optim.Optimizer,
-        gpu_id: int,
-        save_every: int,
-    ) -> None:
-        self.gpu_id = gpu_id
-        self.model = model.to(gpu_id)
-        self.train_data = train_data
-        self.optimizer = optimizer
-        self.save_every = save_every
-
-    def _run_batch(self, source, targets):
-        self.optimizer.zero_grad()
-        output = self.model(source)
-        loss = F.cross_entropy(output, targets)
-        loss.backward()
-        self.optimizer.step()
-
-    def _run_epoch(self, epoch):
-        b_sz = len(next(iter(self.train_data))[0])
-        print(f"[GPU{self.gpu_id}] Epoch {epoch} | Batchsize: {b_sz} | Steps: {len(self.train_data)}")
-        for source, targets in self.train_data:
-            source = source.to(self.gpu_id)
-            targets = targets.to(self.gpu_id)
-            self._run_batch(source, targets)
-
-    def _save_checkpoint(self, epoch):
-        ckp = self.model.state_dict()
-        PATH = "checkpoint.pt"
-        torch.save(ckp, PATH)
-        print(f"Epoch {epoch} | Training checkpoint saved at {PATH}")
-
-    def train(self, max_epochs: int):
-        for epoch in range(max_epochs):
-            self._run_epoch(epoch)
-            if epoch % self.save_every == 0:
-                self._save_checkpoint(epoch)
-
-
-def load_train_objs():
-    train_set = MyTrainDataset(2048)  # load your dataset
-    model = torch.nn.Linear(20, 1)  # load your model
-    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
-    return train_set, model, optimizer
-
-
-def prepare_dataloader(dataset: Dataset, batch_size: int):
-    return DataLoader(dataset, batch_size=batch_size, pin_memory=True, shuffle=True)
-
-
-def main(device, total_epochs, save_every, batch_size):
-    dataset, model, optimizer = load_train_objs()
-    train_data = prepare_dataloader(dataset, batch_size)
-    trainer = Trainer(model, train_data, optimizer, device, save_every)
-    trainer.train(total_epochs)
-
-
-if __name__ == "__main__":
-    import argparse
-
-    parser = argparse.ArgumentParser(description="simple distributed training job")
-    parser.add_argument("total_epochs", type=int, help="Total epochs to train the model")
-    parser.add_argument("save_every", type=int, help="How often to save a snapshot")
-    parser.add_argument("--batch_size", default=32, type=int, help="Input batch size on each device (default: 32)")
-    args = parser.parse_args()
-
-    device = 0  # shorthand for cuda:0
-    main(device, args.total_epochs, args.save_every, args.batch_size)
-
-
-# torchrun --nnodes=1 --nproc_per_node=1 torch_native.py 1000 100
diff --git a/module-4/dist_training/torch_native_dist.py b/module-4/dist_training/torch_native_dist.py
deleted file mode 100644
index 8253040..0000000
--- a/module-4/dist_training/torch_native_dist.py
+++ /dev/null
@@ -1,125 +0,0 @@
-import torch
-import torch.nn.functional as F
-from torch.utils.data import Dataset, DataLoader
-
-
-import torch.multiprocessing as mp
-from torch.utils.data.distributed import DistributedSampler
-from torch.nn.parallel import DistributedDataParallel as DDP
-from torch.distributed import init_process_group, destroy_process_group
-import os
-
-import torch
-from torch.utils.data import Dataset
-
-
-class MyTrainDataset(Dataset):
-    def __init__(self, size):
-        self.size = size
-        self.data = [(torch.rand(20), torch.rand(1)) for _ in range(size)]
-
-    def __len__(self):
-        return self.size
-
-    def __getitem__(self, index):
-        return self.data[index]
-
-
-def ddp_setup():
-    init_process_group(backend="nccl")
-    # init_process_group(backend="golo")
-
-
-class Trainer:
-    def __init__(
-        self,
-        model: torch.nn.Module,
-        train_data: DataLoader,
-        optimizer: torch.optim.Optimizer,
-        save_every: int,
-        snapshot_path: str,
-    ) -> None:
-        self.local_rank = int(os.environ["LOCAL_RANK"])
-        self.global_rank = int(os.environ["RANK"])
-        self.model = model.to(self.local_rank)
-        self.train_data = train_data
-        self.optimizer = optimizer
-        self.save_every = save_every
-        self.epochs_run = 0
-        self.snapshot_path = snapshot_path
-        if os.path.exists(snapshot_path):
-            print("Loading snapshot")
-            self._load_snapshot(snapshot_path)
-
-        self.model = DDP(self.model, device_ids=[self.local_rank])
-
-    def _load_snapshot(self, snapshot_path):
-        loc = f"cuda:{self.local_rank}"
-        snapshot = torch.load(snapshot_path, map_location=loc)
-        self.model.load_state_dict(snapshot["MODEL_STATE"])
-        self.epochs_run = snapshot["EPOCHS_RUN"]
-        print(f"Resuming training from snapshot at Epoch {self.epochs_run}")
-
-    def _run_batch(self, source, targets):
-        self.optimizer.zero_grad()
-        output = self.model(source)
-        loss = F.cross_entropy(output, targets)
-        loss.backward()
-        self.optimizer.step()
-
-    def _run_epoch(self, epoch):
-        b_sz = len(next(iter(self.train_data))[0])
-        print(f"[GPU{self.global_rank}] Epoch {epoch} | Batchsize: {b_sz} | Steps: {len(self.train_data)}")
-        self.train_data.sampler.set_epoch(epoch)
-        for source, targets in self.train_data:
-            source = source.to(self.local_rank)
-            targets = targets.to(self.local_rank)
-            self._run_batch(source, targets)
-
-    def _save_snapshot(self, epoch):
-        snapshot = {
-            "MODEL_STATE": self.model.module.state_dict(),
-            "EPOCHS_RUN": epoch,
-        }
-        torch.save(snapshot, self.snapshot_path)
-        print(f"Epoch {epoch} | Training snapshot saved at {self.snapshot_path}")
-
-    def train(self, max_epochs: int):
-        for epoch in range(self.epochs_run, max_epochs):
-            self._run_epoch(epoch)
-            if self.local_rank == 0 and epoch % self.save_every == 0:
-                self._save_snapshot(epoch)
-
-
-def load_train_objs():
-    train_set = MyTrainDataset(2048)  # load your dataset
-    model = torch.nn.Linear(20, 1)  # load your model
-    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
-    return train_set, model, optimizer
-
-
-def prepare_dataloader(dataset: Dataset, batch_size: int):
-    return DataLoader(
-        dataset, batch_size=batch_size, pin_memory=True, shuffle=False, sampler=DistributedSampler(dataset)
-    )
-
-
-def main(save_every: int, total_epochs: int, batch_size: int, snapshot_path: str = "snapshot.pt"):
-    ddp_setup()
-    dataset, model, optimizer = load_train_objs()
-    train_data = prepare_dataloader(dataset, batch_size)
-    trainer = Trainer(model, train_data, optimizer, save_every, snapshot_path)
-    trainer.train(total_epochs)
-    destroy_process_group()
-
-
-if __name__ == "__main__":
-    import argparse
-
-    parser = argparse.ArgumentParser(description="simple distributed training job")
-    parser.add_argument("total_epochs", type=int, help="Total epochs to train the model")
-    parser.add_argument("save_every", type=int, help="How often to save a snapshot")
-    parser.add_argument("--batch_size", default=32, type=int, help="Input batch size on each device (default: 32)")
-    args = parser.parse_args()
-
-    main(args.save_every, args.total_epochs, args.batch_size)