Refactor out train function #2

Open · wants to merge 20 commits into base: main
43 changes: 43 additions & 0 deletions docker/Dockerfile
@@ -0,0 +1,43 @@
ARG BASE_TAG=main-base
FROM winglian/axolotl-base:$BASE_TAG

ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
ARG AXOLOTL_EXTRAS=""
ARG CUDA="118"
ENV BNB_CUDA_VERSION=$CUDA

RUN apt-get update && \
apt-get install -y vim curl

WORKDIR /workspace

# RUN git clone --depth=1 https://github.com/OpenAccess-AI-Collective/axolotl.git
# clone AblateIt axolotl fork
RUN git clone --depth=1 https://github.com/AblateIt/axolotl.git

# If AXOLOTL_EXTRAS is set, append it in brackets
RUN cd axolotl && \
    if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
        pip install -e .[$AXOLOTL_EXTRAS]; \
    else \
        pip install -e .; \
    fi

RUN pip3 install --force-reinstall "peft @ git+https://github.com/huggingface/peft.git@main"

# RUN pip3 install flash-attn==2.0.7

RUN git clone --depth=1 https://github.com/morganmcg1/finetune-study.git

# fix so that git fetch/pull from remote works
RUN cd axolotl && \
git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
git config --get remote.origin.fetch

# credential helper for huggingface-cli login
RUN git config --global credential.helper store

# RUN pip3 install nvidia-ml-py
# RUN pip install nvidia-ml-py

# ENTRYPOINT [ "python", "finetune-study/test.py", "--default_training_args", "finetune-study/configs/default_training_configs/default_lora.yaml"]
104 changes: 104 additions & 0 deletions docker/Dockerfile-base
@@ -0,0 +1,104 @@
ARG CUDA_VERSION="11.8.0"
ARG CUDNN_VERSION="8"
ARG UBUNTU_VERSION="22.04"
ARG MAX_JOBS=4

FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION as base-builder

ENV PATH="/root/miniconda3/bin:${PATH}"

ARG PYTHON_VERSION="3.9"
ARG PYTORCH_VERSION="2.0.1"
ARG CUDA="118"

ENV PYTHON_VERSION=$PYTHON_VERSION

RUN apt-get update
RUN apt-get install -y wget git build-essential ninja-build git-lfs libaio-dev && rm -rf /var/lib/apt/lists/*

RUN wget \
https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
&& mkdir /root/.conda \
&& bash Miniconda3-latest-Linux-x86_64.sh -b \
&& rm -f Miniconda3-latest-Linux-x86_64.sh

RUN conda create -n "py${PYTHON_VERSION}" python="${PYTHON_VERSION}"

ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"

WORKDIR /workspace

RUN python3 -m pip install --upgrade pip && pip3 install packaging && \
python3 -m pip install --no-cache-dir -U torch==${PYTORCH_VERSION}+cu${CUDA} --extra-index-url https://download.pytorch.org/whl/cu$CUDA


FROM base-builder AS flash-attn-builder

WORKDIR /workspace

ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"

RUN git clone https://github.com/Dao-AILab/flash-attention.git && \
cd flash-attention && \
git checkout v2.0.1 && \
python3 setup.py bdist_wheel && \
cd csrc/fused_dense_lib && \
python3 setup.py bdist_wheel && \
cd ../xentropy && \
python3 setup.py bdist_wheel && \
cd ../rotary && \
python3 setup.py bdist_wheel && \
cd ../layer_norm && \
python3 setup.py bdist_wheel

FROM base-builder AS deepspeed-builder

ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"

WORKDIR /workspace

RUN git clone https://github.com/microsoft/DeepSpeed.git && \
cd DeepSpeed && \
MAX_CONCURRENCY=8 DS_BUILD_SPARSE_ATTN=0 DS_BUILD_OPS=1 python3 setup.py bdist_wheel

FROM base-builder AS bnb-builder

WORKDIR /workspace
ARG CUDA="118"
ENV CUDA=$CUDA

RUN git clone https://github.com/TimDettmers/bitsandbytes.git && \
cd bitsandbytes && \
CUDA_VERSION=$CUDA make cuda11x && \
python setup.py bdist_wheel

FROM base-builder

ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"
ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST

# recompile apex
RUN python3 -m pip uninstall -y apex
RUN git clone https://github.com/NVIDIA/apex
# `MAX_JOBS=1` disables parallel building to avoid cpu memory OOM when building image on GitHub Action (standard) runners
RUN cd apex && MAX_JOBS=1 python3 -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./

RUN mkdir -p /workspace/builds
COPY --from=bnb-builder /workspace/bitsandbytes /workspace/builds/bitsandbytes

RUN mkdir -p /workspace/wheels/bitsandbytes
COPY --from=deepspeed-builder /workspace/DeepSpeed/dist/deepspeed-*.whl wheels
COPY --from=bnb-builder /workspace/bitsandbytes/dist/bitsandbytes-*.whl wheels
COPY --from=bnb-builder /workspace/bitsandbytes/bitsandbytes/libbitsandbytes*.so wheels/bitsandbytes
COPY --from=flash-attn-builder /workspace/flash-attention/dist/flash_attn-*.whl wheels
COPY --from=flash-attn-builder /workspace/flash-attention/csrc/fused_dense_lib/dist/fused_dense_lib-*.whl wheels
COPY --from=flash-attn-builder /workspace/flash-attention/csrc/xentropy/dist/xentropy_cuda_lib-*.whl wheels
COPY --from=flash-attn-builder /workspace/flash-attention/csrc/rotary/dist/rotary_emb-*.whl wheels
COPY --from=flash-attn-builder /workspace/flash-attention/csrc/layer_norm/dist/dropout_layer_norm-*.whl wheels

RUN pip3 install wheels/deepspeed-*.whl wheels/flash_attn-*.whl wheels/fused_dense_lib-*.whl wheels/xentropy_cuda_lib-*.whl wheels/rotary_emb-*.whl wheels/dropout_layer_norm-*.whl
RUN cd /workspace/builds/bitsandbytes && python3 setup.py install
RUN git lfs install --skip-repo
RUN pip3 install awscli && \
# The base image ships with `pydantic==1.8.2` which is not working
pip3 install -U --no-cache-dir pydantic==1.10.10
18 changes: 18 additions & 0 deletions docker/Dockerfile-runpod
@@ -0,0 +1,18 @@
ARG BASE_TAG=main
FROM winglian/axolotl:$BASE_TAG

ENV HF_DATASETS_CACHE="/workspace/data/huggingface-cache/datasets"
ENV HUGGINGFACE_HUB_CACHE="/workspace/data/huggingface-cache/hub"
ENV TRANSFORMERS_CACHE="/workspace/data/huggingface-cache/hub"

COPY scripts/runpod-entrypoint.sh /root/runpod-entrypoint.sh

RUN apt install --yes --no-install-recommends openssh-server tmux && \
mkdir -p ~/.ssh && \
chmod 700 ~/.ssh && \
printf "\n[[ -z \"\$TMUX\" ]] && { tmux attach-session -t ssh_tmux || tmux new-session -s ssh_tmux; exit; }\n" >> ~/.bashrc && \
chmod +x /workspace/axolotl/scripts/runpod-entrypoint.sh && \
chmod +x /root/runpod-entrypoint.sh

ENTRYPOINT ["/root/runpod-entrypoint.sh"]
CMD ["sleep", "infinity"]
6 changes: 6 additions & 0 deletions launch_run.py
@@ -0,0 +1,6 @@
from sweep import get_args, create_name, train, DATASET_SIZES

if __name__ == "__main__":
    args = get_args()
    print(vars(args))
    train(args)
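launch_run.py drives a single, non-sweep run through the same refactored train(). get_args() itself is not part of this diff; judging from the attributes the code reads (entity, project, sweep_id, sweep_config, default_training_args, gpu, push_to_hub), its interface is presumably close to the sketch below. All argument names except --push_to_hub, and every default, are inferred and purely illustrative.

import argparse

def get_args():
    # Rough sketch of the implied CLI; only --push_to_hub appears in this diff,
    # the other arguments are inferred from how sweep.py uses them.
    parser = argparse.ArgumentParser()
    parser.add_argument("--entity", type=str, default="AblateIt")             # W&B entity
    parser.add_argument("--project", type=str, required=True)                 # W&B project
    parser.add_argument("--sweep_id", type=str, default=None)                 # join an existing sweep
    parser.add_argument("--sweep_config", type=str, default=None)             # YAML holding "wandb_args"
    parser.add_argument("--default_training_args", type=str, required=True)   # base axolotl config
    parser.add_argument("--gpu", type=int, nargs="*", default=None)           # CUDA device indices
    parser.add_argument("--push_to_hub", action="store_true")
    return parser.parse_args()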
151 changes: 77 additions & 74 deletions sweep.py
@@ -3,6 +3,7 @@
import yaml
import shutil
from subprocess import call
from functools import partial
import os

wandb.login()
@@ -54,8 +55,7 @@ def get_args():

    parser.add_argument(
        "--push_to_hub",
        type=bool,
        default=True,
        action='store_true',
        help="Whether to push the models to the hub during training.",
    )
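Switching --push_to_hub from type=bool to action='store_true' fixes a classic argparse pitfall: argparse applies bool() to the raw option string, so any non-empty value, including the literal "False", parses as True. With action='store_true' the option becomes a real flag, which also flips the default from True to False, so pushing to the Hub is now opt-in. A minimal, self-contained illustration:

import argparse

# Old behaviour: bool("False") is True, so the supplied value is effectively
# ignored and pushing stays enabled.
old = argparse.ArgumentParser()
old.add_argument("--push_to_hub", type=bool, default=True)
print(old.parse_args(["--push_to_hub", "False"]).push_to_hub)   # True

# New behaviour: a plain flag, False when absent and True when present.
new = argparse.ArgumentParser()
new.add_argument("--push_to_hub", action="store_true")
print(new.parse_args([]).push_to_hub)                  # False
print(new.parse_args(["--push_to_hub"]).push_to_hub)   # True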

@@ -81,89 +81,92 @@ def create_name(config_dict):
return name[:-1]


def train(args):
    wandb.init(entity=args.entity, project=args.project)
    config = dict(wandb.config)

    warmup_factor = (
        config.pop("warmpup_steps_factor_of_epoch")
        if "warmpup_steps_factor_of_epoch" in config
        else None
    )
    finetune_type = config.pop("ft_type")
    sweep_name = config.pop("sweep_name")

    run_name = sweep_name + "-" + finetune_type + "-" + create_name(config)

    wandb.run.name = run_name
    with open(args.default_training_args, "r") as file:
        run_config = yaml.safe_load(file)

    for hyperparameter, value in config.items():
        run_config[hyperparameter] = value

    epoch_train_steps = int((DATASET_SIZES["Puffin"] *
        (1 - run_config["val_set_size"])) / (run_config["gradient_accumulation_steps"] * run_config["micro_batch_size"]))

    if warmup_factor:
        run_config["warmup_steps"] = int(epoch_train_steps * warmup_factor)

    if run_config["eval_strategy"] == "epoch" and type(run_config["eval_steps"]) == float:
        run_config["eval_steps"] = int(epoch_train_steps * run_config["eval_steps"])
        run_config["eval_strategy"] = "steps"

    if run_config["save_strategy"] == "epoch" and type(run_config["save_steps"]) == float:
        run_config["save_steps"] = int(epoch_train_steps * run_config["save_steps"])
        run_config["save_strategy"] = "steps"

    if args.push_to_hub:
        run_config["hub_model_id"] = "AblateIt/" + run_name
        run_config["push_to_hub"] = True
        run_config["hub_strategy"] = "all_checkpoints"
        print(run_config["hub_model_id"])

    run_config["wandb_project"] = args.project
    run_config["wandb_entity"] = args.entity
    run_config["wandb_run_name"] = run_name
    run_config["output_dir"] = run_config["output_dir"] + "/" + run_name + "/"

    run_config_path = run_config["output_dir"] + "config.yaml"

    if not os.path.exists(run_config["output_dir"]):
        os.makedirs(run_config["output_dir"])

    with open(run_config_path, "w") as file:
        yaml.dump(run_config, file)
    print(run_config)

    # Run the training command with the temporary config file
    cuda_device_declaration = (
        "export CUDA_VISIBLE_DEVICES=" + ",".join([str(x) for x in args.gpu]) + "; "
        if args.gpu
        else ""
    )
    cmd = (
        cuda_device_declaration
        + f"accelerate launch axolotl/scripts/finetune.py {run_config_path} --main_process_port 0"
    )
    print(cmd)
    call(cmd, shell=True)
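For a sense of scale of the epoch_train_steps arithmetic in train() above: it is the number of training examples left after the validation split, divided by the effective batch size (gradient accumulation times micro batch size), and the warmup factor then scales against that per-epoch step count. A worked example with purely illustrative numbers (the real values come from DATASET_SIZES["Puffin"] and the merged run_config):

# Illustrative values only; the real ones come from DATASET_SIZES and run_config.
dataset_size = 3000                 # hypothetical size of the Puffin dataset
val_set_size = 0.05                 # fraction held out for evaluation
gradient_accumulation_steps = 4
micro_batch_size = 2

train_examples = dataset_size * (1 - val_set_size)                 # 2850.0
effective_batch = gradient_accumulation_steps * micro_batch_size   # 8
epoch_train_steps = int(train_examples / effective_batch)          # 356

# "warmpup_steps_factor_of_epoch" (spelled this way in the code above) then
# scales warmup relative to one epoch, e.g. a factor of 0.1:
warmup_steps = int(epoch_train_steps * 0.1)                        # 35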


def sweep():
    args = get_args()

    sweep_id = args.sweep_id

    if not sweep_id:
    if sweep_id is None:
        sweep_config = yaml.safe_load(open(args.sweep_config))["wandb_args"]
        sweep_id = wandb.sweep(sweep_config, project=args.project)
        sweep_id = wandb.sweep(sweep_config, entity=args.entity, project=args.project)
        print(sweep_id)
        with open("sweep_id.txt", "w") as file:
            file.write(sweep_id)

    def run_sweep():
        wandb.init(entity=args.entity)
        config = dict(wandb.config)

        warmup_factor = (
            config.pop("warmpup_steps_factor_of_epoch")
            if "warmpup_steps_factor_of_epoch" in config
            else None
        )
        finetune_type = config.pop("ft_type")
        sweep_name = config.pop("sweep_name")

        run_name = sweep_name + "-" + finetune_type + "-" + create_name(config)

        wandb.run.name = run_name
        with open(args.default_training_args, "r") as file:
            run_config = yaml.safe_load(file)

        for hyperparameter, value in config.items():
            run_config[hyperparameter] = value

        epoch_train_steps = int((DATASET_SIZES["Puffin"] *
            (1 - run_config["val_set_size"])) / (run_config["gradient_accumulation_steps"] * run_config["micro_batch_size"]))

        if warmup_factor:
            run_config["warmup_steps"] = int(epoch_train_steps * warmup_factor)

        if run_config["eval_strategy"] == "epoch" and type(run_config["eval_steps"]) == float:
            run_config["eval_steps"] = int(epoch_train_steps * run_config["eval_steps"])
            run_config["eval_strategy"] = "steps"

        if run_config["save_strategy"] == "epoch" and type(run_config["save_steps"]) == float:
            run_config["save_steps"] = int(epoch_train_steps * run_config["save_steps"])
            run_config["save_strategy"] = "steps"

        if args.push_to_hub:
            run_config["hub_model_id"] = "AblateIt/" + run_name
            run_config["push_to_hub"] = True
            run_config["hub_strategy"] = "all_checkpoints"
            print(run_config["hub_model_id"])

        run_config["wandb_project"] = args.project
        run_config["wandb_entity"] = args.entity
        run_config["wandb_run_name"] = run_name
        run_config["output_dir"] = run_config["output_dir"] + "/" + run_name + "/"

        run_config_path = run_config["output_dir"] + "config.yaml"

        if not os.path.exists(run_config["output_dir"]):
            os.makedirs(run_config["output_dir"])

        with open(run_config_path, "w") as file:
            yaml.dump(run_config, file)
        print(run_config)

        # Run the training command with the temporary config file
        cuda_device_declaration = (
            "export CUDA_VISIBLE_DEVICES=" + ",".join([str(x) for x in args.gpu]) + "; "
            if args.gpu
            else ""
        )
        cmd = (
            cuda_device_declaration
            + f"accelerate launch axolotl/scripts/finetune.py {run_config_path} --main_process_port 0"
        )
        print(cmd)
        call(cmd, shell=True)

    if args.sweep_id is not None:
    if sweep_id is not None:
        # Run the sweep
        wandb.agent(sweep_id, run_sweep, project=args.project, entity=args.entity)
        wandb.agent(sweep_id, partial(train, args), project=args.project, entity=args.entity)
    else:
        print("No Sweep ID provided")


if __name__ == "__main__":
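The net effect of the refactor: the training logic that previously lived in the nested run_sweep() closure is now the module-level train(args), so launch_run.py can import it for one-off runs, while sweep() hands wandb.agent a zero-argument callable by binding the parsed CLI arguments with functools.partial (wandb.agent invokes its callback without arguments). A minimal sketch of that binding pattern, with stand-in values rather than the real parsed arguments:

from functools import partial

def train(args):
    # the real train() calls wandb.init() and reads hyperparameters from wandb.config
    print("training with", args)

args = {"entity": "AblateIt", "project": "finetune-study"}   # stand-in for parsed CLI args

callback = partial(train, args)   # zero-argument callable, as wandb.agent expects
callback()                        # equivalent to train(args)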