Showing 3 changed files with 239 additions and 0 deletions.
@@ -0,0 +1,95 @@
apiVersion: batch/v1
kind: Job
# This is used for naming the job and pod, and letting other cluster/namespace users know I created it
metadata:
  generateName: bking2--hf-libraries-demo-
  labels:
    user: bking2
    k8s-app: bking2-hf-libraries-demo
spec:
  template:
    spec:
      # Here we additionally specify that we need our pod (created by the job) to attach to a node with an A100
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: nvidia.com/gpu.product
                    operator: In
                    values:
                      - NVIDIA-A100-SXM4-80GB
      # Here is where we define the core parts of the job. We need 1) the Docker image, 2) its environment
      # requirements (CPU/memory/GPU), and 3) the command that gets run
      containers:
        - name: bking2-hf-libraries-demo
          image: kingb12/hf_libraries_demo:latest
          # Here I've added a secret for my Weights & Biases API key, so the job can create logs, and my
          # Hugging Face API key, so I can download weights
          envFrom:
            - secretRef:
                name: bking2-wandb-api-key-71a5
            - secretRef:
                name: bking2-hf-api-token
          resources:
            limits:
              memory: 64Gi
              cpu: 32
              nvidia.com/gpu: "1"
            requests:
              memory: 32Gi
              cpu: 16
              nvidia.com/gpu: "1"
          command: [ "/bin/sh" ]
          # This includes further setup to 1) cache transformers and datasets on my volume so weights don't need to
          # be re-downloaded on each run and 2) log in to Hugging Face, since StarCoder is agreement-protected.
          # Everything after 'job ready to start' is the script we want to run. Using
          # conda run --no-capture-output -p ./venv runs things with the correct conda environment.
          # Note: rather than clone this job over different arguments to batch size, I just modified them here as I
          # created things.
          args:
            - -c
            - >-
              cd /home/bking2/hf_libraries_demo &&
              export TRANSFORMERS_CACHE=/data/users/bking2/.cache/huggingface &&
              export HF_HOME=/data/users/bking2/.cache/huggingface &&
              pip install huggingface_hub &&
              python -c "from huggingface_hub.hf_api import HfFolder; HfFolder.save_token('${HF_API_TOKEN}')" &&
              echo "job ready to start" &&
              echo "import hf_libraries_demo.package_demo.addition_module as mymod\nprint(f'4 + 5 is {mymod.add_five_to_x(4)}')" > demo.py &&
              conda run --no-capture-output -p ./venv python src/hf_libraries_demo/experiments/peft/karpathy_speedups_example.py --batch_size 1 &&
              echo "job complete!"
          # some arguments needed by Kubernetes, plus some useful defaults
          volumeMounts:
            - mountPath: /data/users/bking2
              name: bking2-data-volume
      restartPolicy: Never
      schedulerName: default-scheduler
      securityContext: {}
      serviceAccount: default
      serviceAccountName: default
      terminationGracePeriodSeconds: 30
      # Tolerations are used to define what to do if the cluster isn't ready, can't be reached, etc. Other
      # tolerations can be used to define what to do when resources are inadequate for our requests/limits.
      tolerations:
        - effect: NoExecute
          key: node.kubernetes.io/not-ready
          operator: Exists
          tolerationSeconds: 300
        - effect: NoExecute
          key: node.kubernetes.io/unreachable
          operator: Exists
          tolerationSeconds: 300
        # We add a toleration telling k8s not to schedule our job if no A100s are available yet
        - effect: PreferNoSchedule
          key: nvidia.com/gpu
          operator: Exists
      # Here we specify the data volume as well. So far, I just use this for caching transformer/dataset weights.
      # See https://ucsd-prp.gitlab.io/userdocs/tutorial/storage/ for info on creating a data volume to mount like
      # this (a prerequisite to mounting as in this job, not shown in the repo).
      volumes:
        - name: bking2-data-volume
          persistentVolumeClaim:
            claimName: bking2-data-volume
  backoffLimit: 0
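
As a usage note, here is a minimal sketch of how a Job like this can be submitted and monitored. The local file name job.yaml and the WANDB_API_KEY secret key are assumptions for illustration; only HF_API_TOKEN is actually referenced by the command above. Because the spec uses generateName, the job is submitted with kubectl create rather than kubectl apply.

# One-time: create the secrets the job references (secret keys become environment variable names via envFrom).
kubectl create secret generic bking2-wandb-api-key-71a5 --from-literal=WANDB_API_KEY=<your-wandb-key>
kubectl create secret generic bking2-hf-api-token --from-literal=HF_API_TOKEN=<your-hf-token>

# Submit with create (not apply), since generateName assigns a fresh job name on each submission.
kubectl create -f job.yaml

# Find the generated job name and follow its pod's logs.
kubectl get jobs
kubectl logs -f job/<generated-job-name>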
src/hf_libraries_demo/experiments/peft/karpathy_speedups_example.py (121 additions, 0 deletions)
@@ -0,0 +1,121 @@
""" | ||
Adding @karpathys speed-ups, and taking batch size as an argument for fine-tuning StarCoder with the Peft Library | ||
""" | ||
import argparse | ||
import os | ||
|
||
import wandb | ||
from datasets import load_dataset, Dataset, DatasetDict | ||
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training | ||
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer | ||
from transformers import TrainingArguments | ||
|
||
from hf_libraries_demo.experiments.peft.flops_counter import TFLOPSCallback | ||
from hf_libraries_demo.experiments.peft.utils import SavePeftModelCallback, LoadBestPeftModelCallback, \ | ||
print_trainable_parameters | ||
|
||
if __name__ == "__main__": | ||
# parse arguments | ||
parser = argparse.ArgumentParser(description="Training arguments parser") | ||
parser.add_argument('--batch_size', type=int, default=1, help='Batch size for training (default: 1)') | ||
parser.add_argument('--num_workers', type=int, default=8, help='Number of workers for data loading (default: 8)') | ||
parser.add_argument('--pin_memory', action='store_true', default=True, | ||
help='Use pinned (page-locked) memory. If not set, defaults to True.') | ||
args = parser.parse_args() | ||
|
||
# Load the and process dataset. Added more training data points to get a more complete test. | ||
full_dataset: Dataset = load_dataset("HuggingFaceH4/CodeAlpaca_20K", split=f"train[0:{128*10}]", use_auth_token=True) | ||
split_dataset: DatasetDict = full_dataset.train_test_split(test_size=0.1) | ||
|
||
# take each prompt and completion and form a single text with a 'Question' and 'Answer', drop existing columns | ||
split_dataset = split_dataset.map( | ||
lambda item: {'text': f"Question: {item['prompt']}\n\nAnswer: {item['completion']}"}, | ||
remove_columns=split_dataset['train'].column_names | ||
) | ||
|
||
# setup the tokenizer and tokenizer, ignore padding/truncation for now since we're using batch size 1 | ||
tokenizer = AutoTokenizer.from_pretrained("bigcode/starcoder", use_auth_token=True) | ||
tokenized_dataset = split_dataset.map(lambda batch: tokenizer(batch['text']), batched=True) | ||
|
||
# set the labels to the inputs. In this case, the MODEL will know to do appropriate shifting for Causal LM | ||
tokenized_dataset = tokenized_dataset.map(lambda batch: {'labels': batch['input_ids']}, batched=True) | ||
|
||
model = AutoModelForCausalLM.from_pretrained( | ||
"bigcode/starcoder", | ||
use_auth_token=True, | ||
use_cache=True, | ||
# note this argument for loading the in 8-bit mode | ||
load_in_8bit=True, | ||
device_map="auto", | ||
) | ||
|
||
# some model preparation work done by `peft` | ||
model = prepare_model_for_kbit_training(model) | ||
|
||
# For our parameter efficient tuning method, we'll use LoRA | ||
lora_config = LoraConfig( | ||
r=16, | ||
lora_alpha=32, | ||
lora_dropout=.05, | ||
bias="none", | ||
task_type="CAUSAL_LM", | ||
target_modules=["c_proj", "c_attn", "q_attn"] | ||
) | ||
|
||
# get a peft model based on our config and base model | ||
model = get_peft_model(model, lora_config) | ||
|
||
# for information, we'll log the total number of parameters and those that are trainable (requires_grad=True) | ||
print_trainable_parameters(model) | ||
|
||
# wandb init for logging (log as this file name, no hyperparameters) | ||
run = wandb.init(project="hf_libraries_demo_peft_example", name=os.path.basename(__file__)) | ||
|
||
wandb.log(vars(args)) | ||
|
||
# Finally, set up a Trainer and train as in typical fine-tuning. Taking very few steps again | ||
output_dir: str = "./outputs" | ||
os.makedirs(output_dir, exist_ok=True) | ||
training_args = TrainingArguments( | ||
output_dir=output_dir, | ||
evaluation_strategy="steps", | ||
save_strategy="steps", | ||
load_best_model_at_end=True, | ||
max_steps=32, | ||
eval_steps=16, | ||
save_steps=16, | ||
logging_steps=1, | ||
# We're optimizing training speed but in a real setup you can increase eval batch size beyond train batch size | ||
per_device_train_batch_size=args.batch_size, | ||
per_device_eval_batch_size=args.batch_size, | ||
learning_rate=5e-6, | ||
lr_scheduler_type="cosine", | ||
warmup_steps=100, | ||
gradient_accumulation_steps=4, # our effective batch size will be 4 as a result | ||
fp16=True, | ||
weight_decay=0.05, | ||
report_to="wandb", | ||
# implementing @karpathy's simple speed-ups for the dataloader. If using k8s, make sure cpu requests > this val | ||
dataloader_num_workers=args.num_workers, | ||
dataloader_pin_memory=args.pin_memory | ||
) | ||
|
||
|
||
# Create a TFLOPs Callback which logs to wandb | ||
tflops_callback: TFLOPSCallback = TFLOPSCallback(logging_callback=wandb.log) | ||
|
||
# setup the trainer and initiate training | ||
trainer = Trainer( | ||
model=model, | ||
args=training_args, | ||
train_dataset=tokenized_dataset['train'], | ||
eval_dataset=tokenized_dataset['test'], | ||
# these are defined in utils.py, and are convenience methods for saving and loading peft models without | ||
# saving/loading the large model over again | ||
callbacks=[ | ||
SavePeftModelCallback(checkpoint_dir=output_dir), | ||
LoadBestPeftModelCallback(), | ||
tflops_callback | ||
] | ||
) | ||
trainer.train() |
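
For reference, running this script by hand mirrors the command embedded in the job spec above. This is a sketch, assuming the repo's conda environment lives at ./venv and a Hugging Face token has already been saved; the batch size of 4 is just an example value.

# Reuse the same cache location as the job so StarCoder weights and datasets are not re-downloaded.
export HF_HOME=/data/users/bking2/.cache/huggingface
export TRANSFORMERS_CACHE=/data/users/bking2/.cache/huggingface

# Run with the repo's conda environment; the flags correspond to the argparse arguments defined above.
conda run --no-capture-output -p ./venv \
    python src/hf_libraries_demo/experiments/peft/karpathy_speedups_example.py \
    --batch_size 4 --num_workers 8 --pin_memory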