From ba71a1e41f33e1138d793bdbc2ebd7c23f2d3f48 Mon Sep 17 00:00:00 2001 From: Morgan Date: Sat, 12 Aug 2023 20:14:10 +0000 Subject: [PATCH 01/17] add new docker and test files --- docker/Dockerfile | 30 +++++++++++ docker/Dockerfile-base | 104 +++++++++++++++++++++++++++++++++++++++ docker/Dockerfile-runpod | 18 +++++++ docker/testDockerfile | 32 ++++++++++++ test.py | 58 ++++++++++++++++++++++ test_run.py | 22 +++++++++ 6 files changed, 264 insertions(+) create mode 100644 docker/Dockerfile create mode 100644 docker/Dockerfile-base create mode 100644 docker/Dockerfile-runpod create mode 100644 docker/testDockerfile create mode 100644 test.py create mode 100644 test_run.py diff --git a/docker/Dockerfile b/docker/Dockerfile new file mode 100644 index 0000000..b5198e1 --- /dev/null +++ b/docker/Dockerfile @@ -0,0 +1,30 @@ +ARG BASE_TAG=main-base +FROM winglian/axolotl-base:$BASE_TAG + +ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX" +ARG AXOLOTL_EXTRAS="" +ARG CUDA="118" +ENV BNB_CUDA_VERSION=$CUDA + +RUN apt-get update && \ + apt-get install -y vim curl + +WORKDIR /workspace + +RUN pip3 install --force-reinstall "peft @ git+https://github.com/huggingface/peft.git@main" +RUN git clone --depth=1 https://github.com/OpenAccess-AI-Collective/axolotl.git +# If AXOLOTL_EXTRAS is set, append it in brackets +RUN cd axolotl && \ + if [ "$AXOLOTL_EXTRAS" != "" ] ; then \ + pip install -e .[$AXOLOTL_EXTRAS]; \ + else \ + pip install -e .; \ + fi + +# fix so that git fetch/pull from remote works +RUN cd axolotl && \ + git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \ + git config --get remote.origin.fetch + +# helper for huggingface-login cli +RUN git config --global credential.helper store diff --git a/docker/Dockerfile-base b/docker/Dockerfile-base new file mode 100644 index 0000000..aec727c --- /dev/null +++ b/docker/Dockerfile-base @@ -0,0 +1,104 @@ +ARG CUDA_VERSION="11.8.0" +ARG CUDNN_VERSION="8" +ARG UBUNTU_VERSION="22.04" +ARG MAX_JOBS=4 + +FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION as base-builder + +ENV PATH="/root/miniconda3/bin:${PATH}" + +ARG PYTHON_VERSION="3.9" +ARG PYTORCH_VERSION="2.0.1" +ARG CUDA="118" + +ENV PYTHON_VERSION=$PYTHON_VERSION + +RUN apt-get update +RUN apt-get install -y wget git build-essential ninja-build git-lfs libaio-dev && rm -rf /var/lib/apt/lists/* + +RUN wget \ + https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \ + && mkdir /root/.conda \ + && bash Miniconda3-latest-Linux-x86_64.sh -b \ + && rm -f Miniconda3-latest-Linux-x86_64.sh + +RUN conda create -n "py${PYTHON_VERSION}" python="${PYTHON_VERSION}" + +ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}" + +WORKDIR /workspace + +RUN python3 -m pip install --upgrade pip && pip3 install packaging && \ + python3 -m pip install --no-cache-dir -U torch==${PYTORCH_VERSION}+cu${CUDA} --extra-index-url https://download.pytorch.org/whl/cu$CUDA + + +FROM base-builder AS flash-attn-builder + +WORKDIR /workspace + +ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX" + +RUN git clone https://github.com/Dao-AILab/flash-attention.git && \ + cd flash-attention && \ + git checkout v2.0.1 && \ + python3 setup.py bdist_wheel && \ + cd csrc/fused_dense_lib && \ + python3 setup.py bdist_wheel && \ + cd ../xentropy && \ + python3 setup.py bdist_wheel && \ + cd ../rotary && \ + python3 setup.py bdist_wheel && \ + cd ../layer_norm && \ + python3 setup.py bdist_wheel + +FROM base-builder AS deepspeed-builder + +ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 
8.0 8.6 9.0+PTX" + +WORKDIR /workspace + +RUN git clone https://github.com/microsoft/DeepSpeed.git && \ + cd DeepSpeed && \ + MAX_CONCURRENCY=8 DS_BUILD_SPARSE_ATTN=0 DS_BUILD_OPS=1 python3 setup.py bdist_wheel + +FROM base-builder AS bnb-builder + +WORKDIR /workspace +ARG CUDA="118" +ENV CUDA=$CUDA + +RUN git clone https://github.com/TimDettmers/bitsandbytes.git && \ + cd bitsandbytes && \ + CUDA_VERSION=$CUDA make cuda11x && \ + python setup.py bdist_wheel + +FROM base-builder + +ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX" +ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST + +# recompile apex +RUN python3 -m pip uninstall -y apex +RUN git clone https://github.com/NVIDIA/apex +# `MAX_JOBS=1` disables parallel building to avoid cpu memory OOM when building image on GitHub Action (standard) runners +RUN cd apex && MAX_JOBS=1 python3 -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./ + +RUN mkdir -p /workspace/builds +COPY --from=bnb-builder /workspace/bitsandbytes /workspace/builds/bitsandbytes + +RUN mkdir -p /workspace/wheels/bitsandbytes +COPY --from=deepspeed-builder /workspace/DeepSpeed/dist/deepspeed-*.whl wheels +COPY --from=bnb-builder /workspace/bitsandbytes/dist/bitsandbytes-*.whl wheels +COPY --from=bnb-builder /workspace/bitsandbytes/bitsandbytes/libbitsandbytes*.so wheels/bitsandbytes +COPY --from=flash-attn-builder /workspace/flash-attention/dist/flash_attn-*.whl wheels +COPY --from=flash-attn-builder /workspace/flash-attention/csrc/fused_dense_lib/dist/fused_dense_lib-*.whl wheels +COPY --from=flash-attn-builder /workspace/flash-attention/csrc/xentropy/dist/xentropy_cuda_lib-*.whl wheels +COPY --from=flash-attn-builder /workspace/flash-attention/csrc/rotary/dist/rotary_emb-*.whl wheels +COPY --from=flash-attn-builder /workspace/flash-attention/csrc/layer_norm/dist/dropout_layer_norm-*.whl wheels + +RUN pip3 install wheels/deepspeed-*.whl wheels/flash_attn-*.whl wheels/fused_dense_lib-*.whl wheels/xentropy_cuda_lib-*.whl wheels/rotary_emb-*.whl wheels/dropout_layer_norm-*.whl +RUN cd /workspace/builds/bitsandbytes && python3 setup.py install +RUN git lfs install --skip-repo +RUN pip3 install awscli && \ + # The base image ships with `pydantic==1.8.2` which is not working + pip3 install -U --no-cache-dir pydantic==1.10.10 diff --git a/docker/Dockerfile-runpod b/docker/Dockerfile-runpod new file mode 100644 index 0000000..2ea6e99 --- /dev/null +++ b/docker/Dockerfile-runpod @@ -0,0 +1,18 @@ +ARG BASE_TAG=main +FROM winglian/axolotl:$BASE_TAG + +ENV HF_DATASETS_CACHE="/workspace/data/huggingface-cache/datasets" +ENV HUGGINGFACE_HUB_CACHE="/workspace/data/huggingface-cache/hub" +ENV TRANSFORMERS_CACHE="/workspace/data/huggingface-cache/hub" + +COPY scripts/runpod-entrypoint.sh /root/runpod-entrypoint.sh + +RUN apt install --yes --no-install-recommends openssh-server tmux && \ + mkdir -p ~/.ssh && \ + chmod 700 ~/.ssh && \ + printf "\n[[ -z \"\$TMUX\" ]] && { tmux attach-session -t ssh_tmux || tmux new-session -s ssh_tmux; exit; }\n" >> ~/.bashrc && \ + chmod +x /workspace/axolotl/scripts/runpod-entrypoint.sh && \ + chmod +x /root/runpod-entrypoint.sh + +ENTRYPOINT ["/root/runpod-entrypoint.sh"] +CMD ["sleep", "infinity"] diff --git a/docker/testDockerfile b/docker/testDockerfile new file mode 100644 index 0000000..14c621b --- /dev/null +++ b/docker/testDockerfile @@ -0,0 +1,32 @@ +ARG BASE_TAG=main-base +FROM winglian/axolotl-base:$BASE_TAG + +ARG 
TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX" +ARG AXOLOTL_EXTRAS="" +ARG CUDA="118" +ENV BNB_CUDA_VERSION=$CUDA + +RUN apt-get update && \ + apt-get install -y vim curl + +WORKDIR /workspace + +RUN pip3 install --force-reinstall "peft @ git+https://github.com/huggingface/peft.git@main" +RUN git clone --depth=1 https://github.com/OpenAccess-AI-Collective/axolotl.git +# If AXOLOTL_EXTRAS is set, append it in brackets +RUN cd axolotl && \ + if [ "$AXOLOTL_EXTRAS" != "" ] ; then \ + pip install -e .[$AXOLOTL_EXTRAS]; \ + else \ + pip install -e .; \ + fi + +# fix so that git fetch/pull from remote works +RUN cd axolotl && \ + git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \ + git config --get remote.origin.fetch + +# helper for huggingface-login cli +RUN git config --global credential.helper store + +RUN python test.py --default_training_args configs/default_training_configs/default_lora.yaml \ No newline at end of file diff --git a/test.py b/test.py new file mode 100644 index 0000000..fd9cf15 --- /dev/null +++ b/test.py @@ -0,0 +1,58 @@ +import wandb +import argparse +import yaml +import shutil +from subprocess import call +import os + +# wandb.login() + +""" +Still in progress and not yet tested. +""" + + +def get_args(): + parser = argparse.ArgumentParser() + # parser.add_argument('--sweep_id', type=str, default=None, + # help='Wandb sweep id for decentralized sweeping. If not provided, a new sweep will be created.') + + # parser.add_argument('--sweep_config', help='Path to sweep config yaml file', + # type=str, default='configs/sweep_configs/qlora_sweep.yaml') + + parser.add_argument('--wandb_project', type=str, help='Wandb project name', + default='test-launch-sweeps') + + parser.add_argument('--wandb_entity', type=str, help='Wandb project name', + default='ablateit') + + parser.add_argument('--default_training_args', type=str, help='Path to default training args yaml file', + default='configs/default_training_configs/default_qlora.yaml') + + return parser.parse_args() + +def main(): + args = get_args() + + temp_config_path = args.default_training_args.replace('.yaml', '_temp.yaml') + shutil.copyfile(args.default_training_args, temp_config_path) + + wandb.init() + config = wandb.config + + with open(temp_config_path, 'r') as file: + temp_config = yaml.safe_load(file) + + for hyperparameter, value in config.items(): + temp_config[hyperparameter] = value + + with open(temp_config_path, 'w') as file: + yaml.dump(temp_config, file) + + # Run the training command with the temporary config file + # cmd = f"accelerate launch axolotl/scripts/finetune.py {temp_config_path}" + cmd = f"python test_run.py --training_args_path {temp_config_path}" + call(cmd, shell=True) + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/test_run.py b/test_run.py new file mode 100644 index 0000000..ebaafd1 --- /dev/null +++ b/test_run.py @@ -0,0 +1,22 @@ +import argparse +import yaml +import os + +def get_args(): + parser = argparse.ArgumentParser() + + parser.add_argument('--training_args_path', type=str, help='Path to default training args yaml file', + default='configs/default_training_configs/default_qlora.yaml') + + return parser.parse_args() + +def main(): + args = get_args() + + with open(args.training_args_path, 'r') as file: + temp_config = yaml.safe_load(file) + + print(temp_config) + +if __name__ == '__main__': + main() \ No newline at end of file From 3cb246d34231c60cc3bca357362b9382adab7ea1 Mon Sep 17 00:00:00 2001 From: Morgan Date: Sat, 12 Aug 2023 
20:58:32 +0000 Subject: [PATCH 02/17] update test file --- docker/Dockerfile | 4 ++++ test.py | 4 +++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index b5198e1..14ea630 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -28,3 +28,7 @@ RUN cd axolotl && \ # helper for huggingface-login cli RUN git config --global credential.helper store + +RUN git clone --depth=1 https://github.com/morganmcg1/finetune-study.git + +ENTRYPOINT [ "python", "finetune-study/test.py", "--default_training_args", "finetune-study/configs/default_training_configs/default_lora.yaml"] diff --git a/test.py b/test.py index fd9cf15..ede0987 100644 --- a/test.py +++ b/test.py @@ -51,7 +51,9 @@ def main(): # Run the training command with the temporary config file # cmd = f"accelerate launch axolotl/scripts/finetune.py {temp_config_path}" - cmd = f"python test_run.py --training_args_path {temp_config_path}" + print("YAAAAY") + cmd = f"python finetune-study/test_run.py --training_args_path {temp_config_path}" + cmd("ls") call(cmd, shell=True) if __name__ == '__main__': From a39780c485be4735810a46ac33a82e78bb19abfb Mon Sep 17 00:00:00 2001 From: Morgan Date: Sat, 12 Aug 2023 21:06:58 +0000 Subject: [PATCH 03/17] fix --- test.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/test.py b/test.py index ede0987..dbefd5c 100644 --- a/test.py +++ b/test.py @@ -51,9 +51,7 @@ def main(): # Run the training command with the temporary config file # cmd = f"accelerate launch axolotl/scripts/finetune.py {temp_config_path}" - print("YAAAAY") cmd = f"python finetune-study/test_run.py --training_args_path {temp_config_path}" - cmd("ls") call(cmd, shell=True) if __name__ == '__main__': From a18b95dc009f677c4c5835883a718d83a80849fd Mon Sep 17 00:00:00 2001 From: Morgan Date: Sun, 13 Aug 2023 16:33:07 +0000 Subject: [PATCH 04/17] change run command --- test.py | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/test.py b/test.py index dbefd5c..2ae33b1 100644 --- a/test.py +++ b/test.py @@ -31,6 +31,17 @@ def get_args(): return parser.parse_args() +def create_name(config_dict): + short = { + 'gradient_accumulation_steps': 'graccsteps', + 'learning_rate': 'lr', + 'lora_r': 'lora_r' + } + name = '' + for hyperparam, value in config_dict.items(): + name += short.get(hyperparam, hyperparam) + str(value).replace('.', '_') + '-' + return name[:-1] + def main(): args = get_args() @@ -40,6 +51,9 @@ def main(): wandb.init() config = wandb.config + run_name = sweep_name + "-" + finetune_type + "-" + create_name(config) + wandb.run.name = run_name + with open(temp_config_path, 'r') as file: temp_config = yaml.safe_load(file) @@ -49,9 +63,19 @@ def main(): with open(temp_config_path, 'w') as file: yaml.dump(temp_config, file) + # log the artifact file + art = wandb.Artifact(name=f'config-{run_name}', type='run_config') + art.add_file(temp_config_path) + wandb.log_artifact(art) + # Run the training command with the temporary config file # cmd = f"accelerate launch axolotl/scripts/finetune.py {temp_config_path}" - cmd = f"python finetune-study/test_run.py --training_args_path {temp_config_path}" + + # Run the training command with the temporary config file + cuda_device_declaration = "CUDA_VISIBLE_DEVICES=" + ",".join( + [str(x) for x in args.CUDA_device_ids]) + " " if args.CUDA_device_ids else "" + cmd = cuda_device_declaration + f"accelerate launch axolotl/scripts/finetune.py {temp_config_path}" + # cmd = f"python finetune-study/test_run.py 
--training_args_path {temp_config_path}" call(cmd, shell=True) if __name__ == '__main__': From 4455782c81a4c8d46b7bdf06548d0455b3218ac6 Mon Sep 17 00:00:00 2001 From: Morgan Date: Sun, 13 Aug 2023 17:32:33 +0000 Subject: [PATCH 05/17] fix wandb name --- test.py | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 52 insertions(+), 2 deletions(-) diff --git a/test.py b/test.py index 2ae33b1..505b6eb 100644 --- a/test.py +++ b/test.py @@ -42,16 +42,66 @@ def create_name(config_dict): name += short.get(hyperparam, hyperparam) + str(value).replace('.', '_') + '-' return name[:-1] +import subprocess +import re + +def get_cuda_version(): + try: + nvcc_version = subprocess.check_output(["nvcc", "--version"]).decode() + # Extract the version using regex + match = re.search(r"release (\d+\.\d+)", nvcc_version) + if match: + return match.group(1) + else: + return "No CUDA version found" + except subprocess.CalledProcessError: + return "Failed to run nvcc" + except FileNotFoundError: + return "nvcc not found" + +import subprocess + +def get_git_commit_sha(git_dir=None): + cmd = ["git", "rev-parse", "HEAD"] + if git_dir: + cmd.extend(["--git-dir", f"{git_dir}/.git"]) + sha = subprocess.check_output(cmd).strip().decode("utf-8") + return sha + +from pynvml import * +def get_nvidia_details(): + nvidia_details = {} + nvmlInit() + # print(f"Driver Version: {nvmlSystemGetDriverVersion()}") + deviceCount = nvmlDeviceGetCount() + gpus_ls = [] + for i in range(deviceCount): + handle = nvmlDeviceGetHandleByIndex(i) + # print(f"Device {i} : {nvmlDeviceGetName(handle)}") + gpus_ls.append(nvmlDeviceGetName(handle)) + nvidia_details["nvidia_driver_version"]: nvmlSystemGetDriverVersion() + nvidia_details["num_gpus_on_machine"] = deviceCount + nvidia_details["gpus"] = gpus_ls + return nvidia_details + def main(): args = get_args() temp_config_path = args.default_training_args.replace('.yaml', '_temp.yaml') shutil.copyfile(args.default_training_args, temp_config_path) - wandb.init() + nvidia_details = get_nvidia_details() + wandb.init(config={ + "axolotl_git_commit_sha": get_git_commit_sha("axolotl"), + "finetune-study_git_commit_sha": get_git_commit_sha("finetune-study"), + "cuda_version": get_cuda_version(), + "nvidia_driver_version": nvidia_details["nvidia_driver_version"], + "num_gpus_on_machine": nvidia_details["num_gpus_on_machine"], + "gpus": nvidia_details["gpus"] + }) config = wandb.config - run_name = sweep_name + "-" + finetune_type + "-" + create_name(config) + run_name = create_name(config) wandb.run.name = run_name with open(temp_config_path, 'r') as file: From 92c4a4c95fe43bed9292ffb41dd7b67e9a6f080a Mon Sep 17 00:00:00 2001 From: Morgan Date: Sun, 13 Aug 2023 18:01:51 +0000 Subject: [PATCH 06/17] fix up test file --- test.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/test.py b/test.py index 505b6eb..4254083 100644 --- a/test.py +++ b/test.py @@ -29,6 +29,9 @@ def get_args(): parser.add_argument('--default_training_args', type=str, help='Path to default training args yaml file', default='configs/default_training_configs/default_qlora.yaml') + parser.add_argument('--CUDA_device_ids', type=list, default=None, + help='List of CUDA device ids to use for training. 
If not provided, all available GPUs will be used.') + return parser.parse_args() def create_name(config_dict): @@ -79,7 +82,8 @@ def get_nvidia_details(): handle = nvmlDeviceGetHandleByIndex(i) # print(f"Device {i} : {nvmlDeviceGetName(handle)}") gpus_ls.append(nvmlDeviceGetName(handle)) - nvidia_details["nvidia_driver_version"]: nvmlSystemGetDriverVersion() + + nvidia_details["nvidia_driver_version"] = nvmlSystemGetDriverVersion() nvidia_details["num_gpus_on_machine"] = deviceCount nvidia_details["gpus"] = gpus_ls return nvidia_details @@ -91,12 +95,13 @@ def main(): shutil.copyfile(args.default_training_args, temp_config_path) nvidia_details = get_nvidia_details() + print(nvidia_details) wandb.init(config={ - "axolotl_git_commit_sha": get_git_commit_sha("axolotl"), - "finetune-study_git_commit_sha": get_git_commit_sha("finetune-study"), + # "axolotl_git_commit_sha": get_git_commit_sha("axolotl"), + # "finetune-study_git_commit_sha": get_git_commit_sha("finetune-study"), "cuda_version": get_cuda_version(), "nvidia_driver_version": nvidia_details["nvidia_driver_version"], - "num_gpus_on_machine": nvidia_details["num_gpus_on_machine"], + "gpu_count": nvidia_details["num_gpus_on_machine"], "gpus": nvidia_details["gpus"] }) config = wandb.config @@ -114,7 +119,7 @@ def main(): yaml.dump(temp_config, file) # log the artifact file - art = wandb.Artifact(name=f'config-{run_name}', type='run_config') + art = wandb.Artifact(name=f'my-config', type='run_config') art.add_file(temp_config_path) wandb.log_artifact(art) From d149efd8160ecd1de6e9e24b4399c45914f7455f Mon Sep 17 00:00:00 2001 From: Morgan Date: Sun, 13 Aug 2023 18:19:00 +0000 Subject: [PATCH 07/17] fix config logging --- test.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/test.py b/test.py index 4254083..c285c80 100644 --- a/test.py +++ b/test.py @@ -96,14 +96,7 @@ def main(): nvidia_details = get_nvidia_details() print(nvidia_details) - wandb.init(config={ - # "axolotl_git_commit_sha": get_git_commit_sha("axolotl"), - # "finetune-study_git_commit_sha": get_git_commit_sha("finetune-study"), - "cuda_version": get_cuda_version(), - "nvidia_driver_version": nvidia_details["nvidia_driver_version"], - "gpu_count": nvidia_details["num_gpus_on_machine"], - "gpus": nvidia_details["gpus"] - }) + wandb.init() config = wandb.config run_name = create_name(config) @@ -118,6 +111,9 @@ def main(): with open(temp_config_path, 'w') as file: yaml.dump(temp_config, file) + # Update the wandb config with the yaml config + wandb.config.update({**temp_config, **nvidia_details}) + # log the artifact file art = wandb.Artifact(name=f'my-config', type='run_config') art.add_file(temp_config_path) From 6a52aff29de5fd5b7850cd8b370c58558dc4aea9 Mon Sep 17 00:00:00 2001 From: Morgan Date: Sun, 13 Aug 2023 19:36:20 +0000 Subject: [PATCH 08/17] remove nvidia tracking --- docker/Dockerfile | 15 +++++++++++---- test.py | 5 +++-- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 14ea630..5337919 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -11,8 +11,10 @@ RUN apt-get update && \ WORKDIR /workspace -RUN pip3 install --force-reinstall "peft @ git+https://github.com/huggingface/peft.git@main" -RUN git clone --depth=1 https://github.com/OpenAccess-AI-Collective/axolotl.git +# RUN git clone --depth=1 https://github.com/OpenAccess-AI-Collective/axolotl.git +# clone AblateIt axolotl fork +RUN git clone --depth=1 https://github.com/AblateIt/axolotl.git + # If AXOLOTL_EXTRAS is 
set, append it in brackets RUN cd axolotl && \ if [ "$AXOLOTL_EXTRAS" != "" ] ; then \ @@ -21,6 +23,10 @@ RUN cd axolotl && \ pip install -e .; \ fi +RUN pip3 install --force-reinstall "peft @ git+https://github.com/huggingface/peft.git@main" + +RUN git clone --depth=1 https://github.com/morganmcg1/finetune-study.git + # fix so that git fetch/pull from remote works RUN cd axolotl && \ git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \ @@ -29,6 +35,7 @@ RUN cd axolotl && \ # helper for huggingface-login cli RUN git config --global credential.helper store -RUN git clone --depth=1 https://github.com/morganmcg1/finetune-study.git +# RUN pip3 install nvidia-ml-py +# RUN pip install nvidia-ml-py -ENTRYPOINT [ "python", "finetune-study/test.py", "--default_training_args", "finetune-study/configs/default_training_configs/default_lora.yaml"] +# ENTRYPOINT [ "python", "finetune-study/test.py", "--default_training_args", "finetune-study/configs/default_training_configs/default_lora.yaml"] diff --git a/test.py b/test.py index c285c80..6f8927c 100644 --- a/test.py +++ b/test.py @@ -72,6 +72,7 @@ def get_git_commit_sha(git_dir=None): return sha from pynvml import * + def get_nvidia_details(): nvidia_details = {} nvmlInit() @@ -84,7 +85,7 @@ def get_nvidia_details(): gpus_ls.append(nvmlDeviceGetName(handle)) nvidia_details["nvidia_driver_version"] = nvmlSystemGetDriverVersion() - nvidia_details["num_gpus_on_machine"] = deviceCount + nvidia_details["gpu_count"] = deviceCount nvidia_details["gpus"] = gpus_ls return nvidia_details @@ -94,7 +95,7 @@ def main(): temp_config_path = args.default_training_args.replace('.yaml', '_temp.yaml') shutil.copyfile(args.default_training_args, temp_config_path) - nvidia_details = get_nvidia_details() + # nvidia_details = get_nvidia_details() print(nvidia_details) wandb.init() config = wandb.config From 66ef5c965c539ac2bbceeeeff06ae437b2cbb618 Mon Sep 17 00:00:00 2001 From: Morgan Date: Sun, 13 Aug 2023 19:46:11 +0000 Subject: [PATCH 09/17] fix --- test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test.py b/test.py index 6f8927c..d5ea199 100644 --- a/test.py +++ b/test.py @@ -96,7 +96,7 @@ def main(): shutil.copyfile(args.default_training_args, temp_config_path) # nvidia_details = get_nvidia_details() - print(nvidia_details) + wandb.init() config = wandb.config From 4dc75d672838773bfef3b4c1fbe67b7e21c7dbcb Mon Sep 17 00:00:00 2001 From: Morgan Date: Sun, 13 Aug 2023 19:53:59 +0000 Subject: [PATCH 10/17] fix --- test.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test.py b/test.py index d5ea199..0cccd20 100644 --- a/test.py +++ b/test.py @@ -113,7 +113,8 @@ def main(): yaml.dump(temp_config, file) # Update the wandb config with the yaml config - wandb.config.update({**temp_config, **nvidia_details}) + # wandb.config.update({**temp_config, **nvidia_details}) + wandb.config.update(temp_config) # log the artifact file art = wandb.Artifact(name=f'my-config', type='run_config') From 8055e8ed1d4046d2d2a22e3dc41131b310b69fad Mon Sep 17 00:00:00 2001 From: Morgan Date: Sun, 13 Aug 2023 21:01:52 +0000 Subject: [PATCH 11/17] added python to cmd --- test.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/test.py b/test.py index 0cccd20..2100563 100644 --- a/test.py +++ b/test.py @@ -2,7 +2,7 @@ import argparse import yaml import shutil -from subprocess import call +from subprocess import call, run import os # wandb.login() @@ -127,9 +127,10 @@ def main(): # Run the training command with the 
temporary config file cuda_device_declaration = "CUDA_VISIBLE_DEVICES=" + ",".join( [str(x) for x in args.CUDA_device_ids]) + " " if args.CUDA_device_ids else "" - cmd = cuda_device_declaration + f"accelerate launch axolotl/scripts/finetune.py {temp_config_path}" + cmd = "python " + cuda_device_declaration + f" accelerate launch axolotl/scripts/finetune.py {temp_config_path}" # cmd = f"python finetune-study/test_run.py --training_args_path {temp_config_path}" - call(cmd, shell=True) + # call(cmd, shell=True) + run(cmd, shell=True) if __name__ == '__main__': main() \ No newline at end of file From f5280e7dce7d98ed180986946b961ff97906f1ad Mon Sep 17 00:00:00 2001 From: Morgan Date: Sun, 13 Aug 2023 21:09:21 +0000 Subject: [PATCH 12/17] added python to cmd --- test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test.py b/test.py index 2100563..a6a20a5 100644 --- a/test.py +++ b/test.py @@ -127,7 +127,7 @@ def main(): # Run the training command with the temporary config file cuda_device_declaration = "CUDA_VISIBLE_DEVICES=" + ",".join( [str(x) for x in args.CUDA_device_ids]) + " " if args.CUDA_device_ids else "" - cmd = "python " + cuda_device_declaration + f" accelerate launch axolotl/scripts/finetune.py {temp_config_path}" + cmd = cuda_device_declaration + f" accelerate launch axolotl/scripts/finetune.py {temp_config_path}" # cmd = f"python finetune-study/test_run.py --training_args_path {temp_config_path}" # call(cmd, shell=True) run(cmd, shell=True) From f1713579f43b96066f2b282e1c19ada9e7119f73 Mon Sep 17 00:00:00 2001 From: Morgan Date: Tue, 15 Aug 2023 15:12:14 +0000 Subject: [PATCH 13/17] add launch_run file --- launch_run.py | 6 +++ sweep.py | 142 ++++++++++++++++++++++++++------------------------ 2 files changed, 79 insertions(+), 69 deletions(-) create mode 100644 launch_run.py diff --git a/launch_run.py b/launch_run.py new file mode 100644 index 0000000..c81c8e0 --- /dev/null +++ b/launch_run.py @@ -0,0 +1,6 @@ +from sweep import get_args, create_name, train, DATASET_SIZES + +if __name__ == "__main__": + args = get_args() + print(vars(args)) + train(args) \ No newline at end of file diff --git a/sweep.py b/sweep.py index 961103a..0a437a6 100644 --- a/sweep.py +++ b/sweep.py @@ -3,6 +3,7 @@ import yaml import shutil from subprocess import call +from functools import partial import os wandb.login() @@ -81,6 +82,75 @@ def create_name(config_dict): return name[:-1] +def train(args): + wandb.init(entity=args.entity, project=args.project) + config = dict(wandb.config) + + warmup_factor = ( + config.pop("warmpup_steps_factor_of_epoch") + if "warmpup_steps_factor_of_epoch" in config + else None + ) + finetune_type = config.pop("ft_type") + sweep_name = config.pop("sweep_name") + + run_name = sweep_name + "-" + finetune_type + "-" + create_name(config) + + wandb.run.name = run_name + with open(args.default_training_args, "r") as file: + run_config = yaml.safe_load(file) + + for hyperparameter, value in config.items(): + run_config[hyperparameter] = value + + epoch_train_steps = int((DATASET_SIZES["Puffin"] * + (1 - run_config["val_set_size"]))/ (run_config["gradient_accumulation_steps"] * run_config["micro_batch_size"])) + + if warmup_factor: + run_config["warmup_steps"] = int(epoch_train_steps * warmup_factor) + + if run_config["eval_strategy"] == "epoch" and type(run_config["eval_steps"]) == float: + run_config["eval_steps"] = int(epoch_train_steps * run_config["eval_steps"]) + run_config["eval_strategy"] = "steps" + + if run_config["save_strategy"] == "epoch" and 
type(run_config["save_steps"]) == float: + run_config["save_steps"] = int(epoch_train_steps * run_config["save_steps"]) + run_config["save_strategy"] = "steps" + + if args.push_to_hub: + run_config["hub_model_id"] = "AblateIt/" + run_name + run_config["push_to_hub"] = True + run_config["hub_strategy"] = "all_checkpoints" + print(run_config["hub_model_id"]) + + run_config["wandb_project"] = args.project + run_config["wandb_entity"] = args.entity + run_config["wandb_run_name"] = run_name + run_config["output_dir"] = run_config["output_dir"] + "/" + run_name + "/" + + run_config_path = run_config["output_dir"] + "config.yaml" + + if not os.path.exists(run_config["output_dir"]): + os.makedirs(run_config["output_dir"]) + + with open(run_config_path, "w") as file: + yaml.dump(run_config, file) + print(run_config) + + # Run the training command with the temporary config file + cuda_device_declaration = ( + "export CUDA_VISIBLE_DEVICES=" + ",".join([str(x) for x in args.gpu]) + "; " + if args.gpu + else "" + ) + cmd = ( + cuda_device_declaration + + f"accelerate launch axolotl/scripts/finetune.py {run_config_path} --main_process_port 0" + ) + print(cmd) + call(cmd, shell=True) + + def sweep(): args = get_args() @@ -93,77 +163,11 @@ def sweep(): with open("sweep_id.txt", "w") as file: file.write(sweep_id) - def run_sweep(): - wandb.init(entity=args.entity) - config = dict(wandb.config) - - warmup_factor = ( - config.pop("warmpup_steps_factor_of_epoch") - if "warmpup_steps_factor_of_epoch" in config - else None - ) - finetune_type = config.pop("ft_type") - sweep_name = config.pop("sweep_name") - - run_name = sweep_name + "-" + finetune_type + "-" + create_name(config) - - wandb.run.name = run_name - with open(args.default_training_args, "r") as file: - run_config = yaml.safe_load(file) - - for hyperparameter, value in config.items(): - run_config[hyperparameter] = value - - epoch_train_steps = int((DATASET_SIZES["Puffin"] * - (1 - run_config["val_set_size"]))/ (run_config["gradient_accumulation_steps"] * run_config["micro_batch_size"])) - - if warmup_factor: - run_config["warmup_steps"] = int(epoch_train_steps * warmup_factor) - - if run_config["eval_strategy"] == "epoch" and type(run_config["eval_steps"]) == float: - run_config["eval_steps"] = int(epoch_train_steps * run_config["eval_steps"]) - run_config["eval_strategy"] = "steps" - - if run_config["save_strategy"] == "epoch" and type(run_config["save_steps"]) == float: - run_config["save_steps"] = int(epoch_train_steps * run_config["save_steps"]) - run_config["save_strategy"] = "steps" - - if args.push_to_hub: - run_config["hub_model_id"] = "AblateIt/" + run_name - run_config["push_to_hub"] = True - run_config["hub_strategy"] = "all_checkpoints" - print(run_config["hub_model_id"]) - - run_config["wandb_project"] = args.project - run_config["wandb_entity"] = args.entity - run_config["wandb_run_name"] = run_name - run_config["output_dir"] = run_config["output_dir"] + "/" + run_name + "/" - - run_config_path = run_config["output_dir"] + "config.yaml" - - if not os.path.exists(run_config["output_dir"]): - os.makedirs(run_config["output_dir"]) - - with open(run_config_path, "w") as file: - yaml.dump(run_config, file) - print(run_config) - - # Run the training command with the temporary config file - cuda_device_declaration = ( - "export CUDA_VISIBLE_DEVICES=" + ",".join([str(x) for x in args.gpu]) + "; " - if args.gpu - else "" - ) - cmd = ( - cuda_device_declaration - + f"accelerate launch axolotl/scripts/finetune.py {run_config_path} 
--main_process_port 0" - ) - print(cmd) - call(cmd, shell=True) - if args.sweep_id is not None: # Run the sweep - wandb.agent(sweep_id, run_sweep, project=args.project, entity=args.entity) + wandb.agent(sweep_id, partial(train, args), project=args.project, entity=args.entity) + else: + print("No Sweep ID provided") if __name__ == "__main__": From b1222968fcd876c8e46b6c08566347264956d78a Mon Sep 17 00:00:00 2001 From: Morgan McGuire Date: Tue, 15 Aug 2023 16:24:59 +0100 Subject: [PATCH 14/17] Delete testDockerfile --- docker/testDockerfile | 32 -------------------------------- 1 file changed, 32 deletions(-) delete mode 100644 docker/testDockerfile diff --git a/docker/testDockerfile b/docker/testDockerfile deleted file mode 100644 index 14c621b..0000000 --- a/docker/testDockerfile +++ /dev/null @@ -1,32 +0,0 @@ -ARG BASE_TAG=main-base -FROM winglian/axolotl-base:$BASE_TAG - -ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX" -ARG AXOLOTL_EXTRAS="" -ARG CUDA="118" -ENV BNB_CUDA_VERSION=$CUDA - -RUN apt-get update && \ - apt-get install -y vim curl - -WORKDIR /workspace - -RUN pip3 install --force-reinstall "peft @ git+https://github.com/huggingface/peft.git@main" -RUN git clone --depth=1 https://github.com/OpenAccess-AI-Collective/axolotl.git -# If AXOLOTL_EXTRAS is set, append it in brackets -RUN cd axolotl && \ - if [ "$AXOLOTL_EXTRAS" != "" ] ; then \ - pip install -e .[$AXOLOTL_EXTRAS]; \ - else \ - pip install -e .; \ - fi - -# fix so that git fetch/pull from remote works -RUN cd axolotl && \ - git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \ - git config --get remote.origin.fetch - -# helper for huggingface-login cli -RUN git config --global credential.helper store - -RUN python test.py --default_training_args configs/default_training_configs/default_lora.yaml \ No newline at end of file From 2271c193df6521db1011a2e230cfeabcdb34a7a3 Mon Sep 17 00:00:00 2001 From: Morgan McGuire Date: Tue, 15 Aug 2023 16:25:44 +0100 Subject: [PATCH 15/17] Delete test.py --- test.py | 136 -------------------------------------------------------- 1 file changed, 136 deletions(-) delete mode 100644 test.py diff --git a/test.py b/test.py deleted file mode 100644 index a6a20a5..0000000 --- a/test.py +++ /dev/null @@ -1,136 +0,0 @@ -import wandb -import argparse -import yaml -import shutil -from subprocess import call, run -import os - -# wandb.login() - -""" -Still in progress and not yet tested. -""" - - -def get_args(): - parser = argparse.ArgumentParser() - # parser.add_argument('--sweep_id', type=str, default=None, - # help='Wandb sweep id for decentralized sweeping. If not provided, a new sweep will be created.') - - # parser.add_argument('--sweep_config', help='Path to sweep config yaml file', - # type=str, default='configs/sweep_configs/qlora_sweep.yaml') - - parser.add_argument('--wandb_project', type=str, help='Wandb project name', - default='test-launch-sweeps') - - parser.add_argument('--wandb_entity', type=str, help='Wandb project name', - default='ablateit') - - parser.add_argument('--default_training_args', type=str, help='Path to default training args yaml file', - default='configs/default_training_configs/default_qlora.yaml') - - parser.add_argument('--CUDA_device_ids', type=list, default=None, - help='List of CUDA device ids to use for training. 
If not provided, all available GPUs will be used.') - - return parser.parse_args() - -def create_name(config_dict): - short = { - 'gradient_accumulation_steps': 'graccsteps', - 'learning_rate': 'lr', - 'lora_r': 'lora_r' - } - name = '' - for hyperparam, value in config_dict.items(): - name += short.get(hyperparam, hyperparam) + str(value).replace('.', '_') + '-' - return name[:-1] - -import subprocess -import re - -def get_cuda_version(): - try: - nvcc_version = subprocess.check_output(["nvcc", "--version"]).decode() - # Extract the version using regex - match = re.search(r"release (\d+\.\d+)", nvcc_version) - if match: - return match.group(1) - else: - return "No CUDA version found" - except subprocess.CalledProcessError: - return "Failed to run nvcc" - except FileNotFoundError: - return "nvcc not found" - -import subprocess - -def get_git_commit_sha(git_dir=None): - cmd = ["git", "rev-parse", "HEAD"] - if git_dir: - cmd.extend(["--git-dir", f"{git_dir}/.git"]) - sha = subprocess.check_output(cmd).strip().decode("utf-8") - return sha - -from pynvml import * - -def get_nvidia_details(): - nvidia_details = {} - nvmlInit() - # print(f"Driver Version: {nvmlSystemGetDriverVersion()}") - deviceCount = nvmlDeviceGetCount() - gpus_ls = [] - for i in range(deviceCount): - handle = nvmlDeviceGetHandleByIndex(i) - # print(f"Device {i} : {nvmlDeviceGetName(handle)}") - gpus_ls.append(nvmlDeviceGetName(handle)) - - nvidia_details["nvidia_driver_version"] = nvmlSystemGetDriverVersion() - nvidia_details["gpu_count"] = deviceCount - nvidia_details["gpus"] = gpus_ls - return nvidia_details - -def main(): - args = get_args() - - temp_config_path = args.default_training_args.replace('.yaml', '_temp.yaml') - shutil.copyfile(args.default_training_args, temp_config_path) - - # nvidia_details = get_nvidia_details() - - wandb.init() - config = wandb.config - - run_name = create_name(config) - wandb.run.name = run_name - - with open(temp_config_path, 'r') as file: - temp_config = yaml.safe_load(file) - - for hyperparameter, value in config.items(): - temp_config[hyperparameter] = value - - with open(temp_config_path, 'w') as file: - yaml.dump(temp_config, file) - - # Update the wandb config with the yaml config - # wandb.config.update({**temp_config, **nvidia_details}) - wandb.config.update(temp_config) - - # log the artifact file - art = wandb.Artifact(name=f'my-config', type='run_config') - art.add_file(temp_config_path) - wandb.log_artifact(art) - - # Run the training command with the temporary config file - # cmd = f"accelerate launch axolotl/scripts/finetune.py {temp_config_path}" - - # Run the training command with the temporary config file - cuda_device_declaration = "CUDA_VISIBLE_DEVICES=" + ",".join( - [str(x) for x in args.CUDA_device_ids]) + " " if args.CUDA_device_ids else "" - cmd = cuda_device_declaration + f" accelerate launch axolotl/scripts/finetune.py {temp_config_path}" - # cmd = f"python finetune-study/test_run.py --training_args_path {temp_config_path}" - # call(cmd, shell=True) - run(cmd, shell=True) - -if __name__ == '__main__': - main() \ No newline at end of file From 9a69b4169edb55f2b2a4a81f0378e0cebec46739 Mon Sep 17 00:00:00 2001 From: Morgan McGuire Date: Tue, 15 Aug 2023 16:26:12 +0100 Subject: [PATCH 16/17] Delete test_run.py --- test_run.py | 22 ---------------------- 1 file changed, 22 deletions(-) delete mode 100644 test_run.py diff --git a/test_run.py b/test_run.py deleted file mode 100644 index ebaafd1..0000000 --- a/test_run.py +++ /dev/null @@ -1,22 +0,0 @@ -import argparse 
-import yaml -import os - -def get_args(): - parser = argparse.ArgumentParser() - - parser.add_argument('--training_args_path', type=str, help='Path to default training args yaml file', - default='configs/default_training_configs/default_qlora.yaml') - - return parser.parse_args() - -def main(): - args = get_args() - - with open(args.training_args_path, 'r') as file: - temp_config = yaml.safe_load(file) - - print(temp_config) - -if __name__ == '__main__': - main() \ No newline at end of file From e3e73b3ec0a990ae86bc74b7d60bb79547b8de2d Mon Sep 17 00:00:00 2001 From: Morgan Date: Tue, 15 Aug 2023 19:18:23 +0000 Subject: [PATCH 17/17] fix push to hub --- docker/Dockerfile | 2 ++ sweep.py | 9 ++++----- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 5337919..ee24266 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -25,6 +25,8 @@ RUN cd axolotl && \ RUN pip3 install --force-reinstall "peft @ git+https://github.com/huggingface/peft.git@main" +# RUN pip3 install flash-attn==2.0.7 + RUN git clone --depth=1 https://github.com/morganmcg1/finetune-study.git # fix so that git fetch/pull from remote works diff --git a/sweep.py b/sweep.py index 0a437a6..8a4dd8c 100644 --- a/sweep.py +++ b/sweep.py @@ -55,8 +55,7 @@ def get_args(): parser.add_argument( "--push_to_hub", - type=bool, - default=True, + action='store_true', help="Whether to push the models to the hub during training.", ) @@ -156,14 +155,14 @@ def sweep(): sweep_id = args.sweep_id - if not sweep_id: + if sweep_id is None: sweep_config = yaml.safe_load(open(args.sweep_config))["wandb_args"] - sweep_id = wandb.sweep(sweep_config, project=args.project) + sweep_id = wandb.sweep(sweep_config, entity=args.entity, project=args.project) print(sweep_id) with open("sweep_id.txt", "w") as file: file.write(sweep_id) - if args.sweep_id is not None: + if sweep_id is not None: # Run the sweep wandb.agent(sweep_id, partial(train, args), project=args.project, entity=args.entity) else:
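Note on the sweep configuration consumed by sweep.py: the sweep config file itself (referenced in the scripts as configs/sweep_configs/qlora_sweep.yaml) is not included in this patch series. The sketch below is a hypothetical, minimal illustration of the structure sweep.py appears to expect — a top-level "wandb_args" key holding a standard W&B sweep definition, with "sweep_name" and "ft_type" present as fixed parameters because train() pops both from wandb.config before applying the remaining hyperparameters to the training YAML. All names and values here are assumptions for illustration only, not part of the patches.

# Hypothetical minimal sweep definition mirroring what sweep.py reads via
# yaml.safe_load(open(args.sweep_config))["wandb_args"].
# Keys and values below are illustrative assumptions, not repo contents.
import yaml

sweep_file_contents = {
    "wandb_args": {
        "method": "grid",
        "metric": {"name": "eval/loss", "goal": "minimize"},
        "parameters": {
            # train() pops these two from wandb.config, so they must be defined
            "sweep_name": {"value": "puffin-qlora-sweep"},
            "ft_type": {"value": "qlora"},
            # optional: train() converts this factor into warmup_steps
            # (key spelled exactly as in sweep.py)
            "warmpup_steps_factor_of_epoch": {"value": 0.1},
            # hyperparameters actually being swept
            "learning_rate": {"values": [1e-4, 2e-4, 4e-4]},
            "lora_r": {"values": [16, 64]},
        },
    }
}

# Write the config where the scripts expect to find it.
with open("configs/sweep_configs/qlora_sweep.yaml", "w") as f:
    yaml.dump(sweep_file_contents, f)

With a file like this in place, running sweep.py with --sweep_config pointing at it would create the sweep and start an agent via wandb.agent(sweep_id, partial(train, args)), while passing an existing --sweep_id on other machines would presumably let them join the same sweep for decentralized sweeping, as described in the scripts' help text.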