From ba71a1e41f33e1138d793bdbc2ebd7c23f2d3f48 Mon Sep 17 00:00:00 2001 From: Morgan Date: Sat, 12 Aug 2023 20:14:10 +0000 Subject: [PATCH 01/17] add new docker and test files --- docker/Dockerfile | 30 +++++++++++ docker/Dockerfile-base | 104 +++++++++++++++++++++++++++++++++++++++ docker/Dockerfile-runpod | 18 +++++++ docker/testDockerfile | 32 ++++++++++++ test.py | 58 ++++++++++++++++++++++ test_run.py | 22 +++++++++ 6 files changed, 264 insertions(+) create mode 100644 docker/Dockerfile create mode 100644 docker/Dockerfile-base create mode 100644 docker/Dockerfile-runpod create mode 100644 docker/testDockerfile create mode 100644 test.py create mode 100644 test_run.py diff --git a/docker/Dockerfile b/docker/Dockerfile new file mode 100644 index 0000000..b5198e1 --- /dev/null +++ b/docker/Dockerfile @@ -0,0 +1,30 @@ +ARG BASE_TAG=main-base +FROM winglian/axolotl-base:$BASE_TAG + +ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX" +ARG AXOLOTL_EXTRAS="" +ARG CUDA="118" +ENV BNB_CUDA_VERSION=$CUDA + +RUN apt-get update && \ + apt-get install -y vim curl + +WORKDIR /workspace + +RUN pip3 install --force-reinstall "peft @ git+https://github.com/huggingface/peft.git@main" +RUN git clone --depth=1 https://github.com/OpenAccess-AI-Collective/axolotl.git +# If AXOLOTL_EXTRAS is set, append it in brackets +RUN cd axolotl && \ + if [ "$AXOLOTL_EXTRAS" != "" ] ; then \ + pip install -e .[$AXOLOTL_EXTRAS]; \ + else \ + pip install -e .; \ + fi + +# fix so that git fetch/pull from remote works +RUN cd axolotl && \ + git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \ + git config --get remote.origin.fetch + +# helper for huggingface-login cli +RUN git config --global credential.helper store diff --git a/docker/Dockerfile-base b/docker/Dockerfile-base new file mode 100644 index 0000000..aec727c --- /dev/null +++ b/docker/Dockerfile-base @@ -0,0 +1,104 @@ +ARG CUDA_VERSION="11.8.0" +ARG CUDNN_VERSION="8" +ARG UBUNTU_VERSION="22.04" +ARG MAX_JOBS=4 + +FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION as base-builder + +ENV PATH="/root/miniconda3/bin:${PATH}" + +ARG PYTHON_VERSION="3.9" +ARG PYTORCH_VERSION="2.0.1" +ARG CUDA="118" + +ENV PYTHON_VERSION=$PYTHON_VERSION + +RUN apt-get update +RUN apt-get install -y wget git build-essential ninja-build git-lfs libaio-dev && rm -rf /var/lib/apt/lists/* + +RUN wget \ + https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \ + && mkdir /root/.conda \ + && bash Miniconda3-latest-Linux-x86_64.sh -b \ + && rm -f Miniconda3-latest-Linux-x86_64.sh + +RUN conda create -n "py${PYTHON_VERSION}" python="${PYTHON_VERSION}" + +ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}" + +WORKDIR /workspace + +RUN python3 -m pip install --upgrade pip && pip3 install packaging && \ + python3 -m pip install --no-cache-dir -U torch==${PYTORCH_VERSION}+cu${CUDA} --extra-index-url https://download.pytorch.org/whl/cu$CUDA + + +FROM base-builder AS flash-attn-builder + +WORKDIR /workspace + +ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX" + +RUN git clone https://github.com/Dao-AILab/flash-attention.git && \ + cd flash-attention && \ + git checkout v2.0.1 && \ + python3 setup.py bdist_wheel && \ + cd csrc/fused_dense_lib && \ + python3 setup.py bdist_wheel && \ + cd ../xentropy && \ + python3 setup.py bdist_wheel && \ + cd ../rotary && \ + python3 setup.py bdist_wheel && \ + cd ../layer_norm && \ + python3 setup.py bdist_wheel + +FROM base-builder AS deepspeed-builder + +ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 
8.0 8.6 9.0+PTX" + +WORKDIR /workspace + +RUN git clone https://github.com/microsoft/DeepSpeed.git && \ + cd DeepSpeed && \ + MAX_CONCURRENCY=8 DS_BUILD_SPARSE_ATTN=0 DS_BUILD_OPS=1 python3 setup.py bdist_wheel + +FROM base-builder AS bnb-builder + +WORKDIR /workspace +ARG CUDA="118" +ENV CUDA=$CUDA + +RUN git clone https://github.com/TimDettmers/bitsandbytes.git && \ + cd bitsandbytes && \ + CUDA_VERSION=$CUDA make cuda11x && \ + python setup.py bdist_wheel + +FROM base-builder + +ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX" +ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST + +# recompile apex +RUN python3 -m pip uninstall -y apex +RUN git clone https://github.com/NVIDIA/apex +# `MAX_JOBS=1` disables parallel building to avoid cpu memory OOM when building image on GitHub Action (standard) runners +RUN cd apex && MAX_JOBS=1 python3 -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./ + +RUN mkdir -p /workspace/builds +COPY --from=bnb-builder /workspace/bitsandbytes /workspace/builds/bitsandbytes + +RUN mkdir -p /workspace/wheels/bitsandbytes +COPY --from=deepspeed-builder /workspace/DeepSpeed/dist/deepspeed-*.whl wheels +COPY --from=bnb-builder /workspace/bitsandbytes/dist/bitsandbytes-*.whl wheels +COPY --from=bnb-builder /workspace/bitsandbytes/bitsandbytes/libbitsandbytes*.so wheels/bitsandbytes +COPY --from=flash-attn-builder /workspace/flash-attention/dist/flash_attn-*.whl wheels +COPY --from=flash-attn-builder /workspace/flash-attention/csrc/fused_dense_lib/dist/fused_dense_lib-*.whl wheels +COPY --from=flash-attn-builder /workspace/flash-attention/csrc/xentropy/dist/xentropy_cuda_lib-*.whl wheels +COPY --from=flash-attn-builder /workspace/flash-attention/csrc/rotary/dist/rotary_emb-*.whl wheels +COPY --from=flash-attn-builder /workspace/flash-attention/csrc/layer_norm/dist/dropout_layer_norm-*.whl wheels + +RUN pip3 install wheels/deepspeed-*.whl wheels/flash_attn-*.whl wheels/fused_dense_lib-*.whl wheels/xentropy_cuda_lib-*.whl wheels/rotary_emb-*.whl wheels/dropout_layer_norm-*.whl +RUN cd /workspace/builds/bitsandbytes && python3 setup.py install +RUN git lfs install --skip-repo +RUN pip3 install awscli && \ + # The base image ships with `pydantic==1.8.2` which is not working + pip3 install -U --no-cache-dir pydantic==1.10.10 diff --git a/docker/Dockerfile-runpod b/docker/Dockerfile-runpod new file mode 100644 index 0000000..2ea6e99 --- /dev/null +++ b/docker/Dockerfile-runpod @@ -0,0 +1,18 @@ +ARG BASE_TAG=main +FROM winglian/axolotl:$BASE_TAG + +ENV HF_DATASETS_CACHE="/workspace/data/huggingface-cache/datasets" +ENV HUGGINGFACE_HUB_CACHE="/workspace/data/huggingface-cache/hub" +ENV TRANSFORMERS_CACHE="/workspace/data/huggingface-cache/hub" + +COPY scripts/runpod-entrypoint.sh /root/runpod-entrypoint.sh + +RUN apt install --yes --no-install-recommends openssh-server tmux && \ + mkdir -p ~/.ssh && \ + chmod 700 ~/.ssh && \ + printf "\n[[ -z \"\$TMUX\" ]] && { tmux attach-session -t ssh_tmux || tmux new-session -s ssh_tmux; exit; }\n" >> ~/.bashrc && \ + chmod +x /workspace/axolotl/scripts/runpod-entrypoint.sh && \ + chmod +x /root/runpod-entrypoint.sh + +ENTRYPOINT ["/root/runpod-entrypoint.sh"] +CMD ["sleep", "infinity"] diff --git a/docker/testDockerfile b/docker/testDockerfile new file mode 100644 index 0000000..14c621b --- /dev/null +++ b/docker/testDockerfile @@ -0,0 +1,32 @@ +ARG BASE_TAG=main-base +FROM winglian/axolotl-base:$BASE_TAG + +ARG 
TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX" +ARG AXOLOTL_EXTRAS="" +ARG CUDA="118" +ENV BNB_CUDA_VERSION=$CUDA + +RUN apt-get update && \ + apt-get install -y vim curl + +WORKDIR /workspace + +RUN pip3 install --force-reinstall "peft @ git+https://github.com/huggingface/peft.git@main" +RUN git clone --depth=1 https://github.com/OpenAccess-AI-Collective/axolotl.git +# If AXOLOTL_EXTRAS is set, append it in brackets +RUN cd axolotl && \ + if [ "$AXOLOTL_EXTRAS" != "" ] ; then \ + pip install -e .[$AXOLOTL_EXTRAS]; \ + else \ + pip install -e .; \ + fi + +# fix so that git fetch/pull from remote works +RUN cd axolotl && \ + git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \ + git config --get remote.origin.fetch + +# helper for huggingface-login cli +RUN git config --global credential.helper store + +RUN python test.py --default_training_args configs/default_training_configs/default_lora.yaml \ No newline at end of file diff --git a/test.py b/test.py new file mode 100644 index 0000000..fd9cf15 --- /dev/null +++ b/test.py @@ -0,0 +1,58 @@ +import wandb +import argparse +import yaml +import shutil +from subprocess import call +import os + +# wandb.login() + +""" +Still in progress and not yet tested. +""" + + +def get_args(): + parser = argparse.ArgumentParser() + # parser.add_argument('--sweep_id', type=str, default=None, + # help='Wandb sweep id for decentralized sweeping. If not provided, a new sweep will be created.') + + # parser.add_argument('--sweep_config', help='Path to sweep config yaml file', + # type=str, default='configs/sweep_configs/qlora_sweep.yaml') + + parser.add_argument('--wandb_project', type=str, help='Wandb project name', + default='test-launch-sweeps') + + parser.add_argument('--wandb_entity', type=str, help='Wandb project name', + default='ablateit') + + parser.add_argument('--default_training_args', type=str, help='Path to default training args yaml file', + default='configs/default_training_configs/default_qlora.yaml') + + return parser.parse_args() + +def main(): + args = get_args() + + temp_config_path = args.default_training_args.replace('.yaml', '_temp.yaml') + shutil.copyfile(args.default_training_args, temp_config_path) + + wandb.init() + config = wandb.config + + with open(temp_config_path, 'r') as file: + temp_config = yaml.safe_load(file) + + for hyperparameter, value in config.items(): + temp_config[hyperparameter] = value + + with open(temp_config_path, 'w') as file: + yaml.dump(temp_config, file) + + # Run the training command with the temporary config file + # cmd = f"accelerate launch axolotl/scripts/finetune.py {temp_config_path}" + cmd = f"python test_run.py --training_args_path {temp_config_path}" + call(cmd, shell=True) + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/test_run.py b/test_run.py new file mode 100644 index 0000000..ebaafd1 --- /dev/null +++ b/test_run.py @@ -0,0 +1,22 @@ +import argparse +import yaml +import os + +def get_args(): + parser = argparse.ArgumentParser() + + parser.add_argument('--training_args_path', type=str, help='Path to default training args yaml file', + default='configs/default_training_configs/default_qlora.yaml') + + return parser.parse_args() + +def main(): + args = get_args() + + with open(args.training_args_path, 'r') as file: + temp_config = yaml.safe_load(file) + + print(temp_config) + +if __name__ == '__main__': + main() \ No newline at end of file From 3cb246d34231c60cc3bca357362b9382adab7ea1 Mon Sep 17 00:00:00 2001 From: Morgan Date: Sat, 12 Aug 2023 
20:58:32 +0000 Subject: [PATCH 02/17] update test file --- docker/Dockerfile | 4 ++++ test.py | 4 +++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index b5198e1..14ea630 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -28,3 +28,7 @@ RUN cd axolotl && \ # helper for huggingface-login cli RUN git config --global credential.helper store + +RUN git clone --depth=1 https://github.com/morganmcg1/finetune-study.git + +ENTRYPOINT [ "python", "finetune-study/test.py", "--default_training_args", "finetune-study/configs/default_training_configs/default_lora.yaml"] diff --git a/test.py b/test.py index fd9cf15..ede0987 100644 --- a/test.py +++ b/test.py @@ -51,7 +51,9 @@ def main(): # Run the training command with the temporary config file # cmd = f"accelerate launch axolotl/scripts/finetune.py {temp_config_path}" - cmd = f"python test_run.py --training_args_path {temp_config_path}" + print("YAAAAY") + cmd = f"python finetune-study/test_run.py --training_args_path {temp_config_path}" + cmd("ls") call(cmd, shell=True) if __name__ == '__main__': From a39780c485be4735810a46ac33a82e78bb19abfb Mon Sep 17 00:00:00 2001 From: Morgan Date: Sat, 12 Aug 2023 21:06:58 +0000 Subject: [PATCH 03/17] fix --- test.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/test.py b/test.py index ede0987..dbefd5c 100644 --- a/test.py +++ b/test.py @@ -51,9 +51,7 @@ def main(): # Run the training command with the temporary config file # cmd = f"accelerate launch axolotl/scripts/finetune.py {temp_config_path}" - print("YAAAAY") cmd = f"python finetune-study/test_run.py --training_args_path {temp_config_path}" - cmd("ls") call(cmd, shell=True) if __name__ == '__main__': From a18b95dc009f677c4c5835883a718d83a80849fd Mon Sep 17 00:00:00 2001 From: Morgan Date: Sun, 13 Aug 2023 16:33:07 +0000 Subject: [PATCH 04/17] change run command --- test.py | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/test.py b/test.py index dbefd5c..2ae33b1 100644 --- a/test.py +++ b/test.py @@ -31,6 +31,17 @@ def get_args(): return parser.parse_args() +def create_name(config_dict): + short = { + 'gradient_accumulation_steps': 'graccsteps', + 'learning_rate': 'lr', + 'lora_r': 'lora_r' + } + name = '' + for hyperparam, value in config_dict.items(): + name += short.get(hyperparam, hyperparam) + str(value).replace('.', '_') + '-' + return name[:-1] + def main(): args = get_args() @@ -40,6 +51,9 @@ def main(): wandb.init() config = wandb.config + run_name = sweep_name + "-" + finetune_type + "-" + create_name(config) + wandb.run.name = run_name + with open(temp_config_path, 'r') as file: temp_config = yaml.safe_load(file) @@ -49,9 +63,19 @@ def main(): with open(temp_config_path, 'w') as file: yaml.dump(temp_config, file) + # log the artifact file + art = wandb.Artifact(name=f'config-{run_name}', type='run_config') + art.add_file(temp_config_path) + wandb.log_artifact(art) + # Run the training command with the temporary config file # cmd = f"accelerate launch axolotl/scripts/finetune.py {temp_config_path}" - cmd = f"python finetune-study/test_run.py --training_args_path {temp_config_path}" + + # Run the training command with the temporary config file + cuda_device_declaration = "CUDA_VISIBLE_DEVICES=" + ",".join( + [str(x) for x in args.CUDA_device_ids]) + " " if args.CUDA_device_ids else "" + cmd = cuda_device_declaration + f"accelerate launch axolotl/scripts/finetune.py {temp_config_path}" + # cmd = f"python finetune-study/test_run.py 
--training_args_path {temp_config_path}" call(cmd, shell=True) if __name__ == '__main__': From 4455782c81a4c8d46b7bdf06548d0455b3218ac6 Mon Sep 17 00:00:00 2001 From: Morgan Date: Sun, 13 Aug 2023 17:32:33 +0000 Subject: [PATCH 05/17] fix wandb name --- test.py | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 52 insertions(+), 2 deletions(-) diff --git a/test.py b/test.py index 2ae33b1..505b6eb 100644 --- a/test.py +++ b/test.py @@ -42,16 +42,66 @@ def create_name(config_dict): name += short.get(hyperparam, hyperparam) + str(value).replace('.', '_') + '-' return name[:-1] +import subprocess +import re + +def get_cuda_version(): + try: + nvcc_version = subprocess.check_output(["nvcc", "--version"]).decode() + # Extract the version using regex + match = re.search(r"release (\d+\.\d+)", nvcc_version) + if match: + return match.group(1) + else: + return "No CUDA version found" + except subprocess.CalledProcessError: + return "Failed to run nvcc" + except FileNotFoundError: + return "nvcc not found" + +import subprocess + +def get_git_commit_sha(git_dir=None): + cmd = ["git", "rev-parse", "HEAD"] + if git_dir: + cmd.extend(["--git-dir", f"{git_dir}/.git"]) + sha = subprocess.check_output(cmd).strip().decode("utf-8") + return sha + +from pynvml import * +def get_nvidia_details(): + nvidia_details = {} + nvmlInit() + # print(f"Driver Version: {nvmlSystemGetDriverVersion()}") + deviceCount = nvmlDeviceGetCount() + gpus_ls = [] + for i in range(deviceCount): + handle = nvmlDeviceGetHandleByIndex(i) + # print(f"Device {i} : {nvmlDeviceGetName(handle)}") + gpus_ls.append(nvmlDeviceGetName(handle)) + nvidia_details["nvidia_driver_version"]: nvmlSystemGetDriverVersion() + nvidia_details["num_gpus_on_machine"] = deviceCount + nvidia_details["gpus"] = gpus_ls + return nvidia_details + def main(): args = get_args() temp_config_path = args.default_training_args.replace('.yaml', '_temp.yaml') shutil.copyfile(args.default_training_args, temp_config_path) - wandb.init() + nvidia_details = get_nvidia_details() + wandb.init(config={ + "axolotl_git_commit_sha": get_git_commit_sha("axolotl"), + "finetune-study_git_commit_sha": get_git_commit_sha("finetune-study"), + "cuda_version": get_cuda_version(), + "nvidia_driver_version": nvidia_details["nvidia_driver_version"], + "num_gpus_on_machine": nvidia_details["num_gpus_on_machine"], + "gpus": nvidia_details["gpus"] + }) config = wandb.config - run_name = sweep_name + "-" + finetune_type + "-" + create_name(config) + run_name = create_name(config) wandb.run.name = run_name with open(temp_config_path, 'r') as file: From 92c4a4c95fe43bed9292ffb41dd7b67e9a6f080a Mon Sep 17 00:00:00 2001 From: Morgan Date: Sun, 13 Aug 2023 18:01:51 +0000 Subject: [PATCH 06/17] fix up test file --- test.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/test.py b/test.py index 505b6eb..4254083 100644 --- a/test.py +++ b/test.py @@ -29,6 +29,9 @@ def get_args(): parser.add_argument('--default_training_args', type=str, help='Path to default training args yaml file', default='configs/default_training_configs/default_qlora.yaml') + parser.add_argument('--CUDA_device_ids', type=list, default=None, + help='List of CUDA device ids to use for training. 
If not provided, all available GPUs will be used.') + return parser.parse_args() def create_name(config_dict): @@ -79,7 +82,8 @@ def get_nvidia_details(): handle = nvmlDeviceGetHandleByIndex(i) # print(f"Device {i} : {nvmlDeviceGetName(handle)}") gpus_ls.append(nvmlDeviceGetName(handle)) - nvidia_details["nvidia_driver_version"]: nvmlSystemGetDriverVersion() + + nvidia_details["nvidia_driver_version"] = nvmlSystemGetDriverVersion() nvidia_details["num_gpus_on_machine"] = deviceCount nvidia_details["gpus"] = gpus_ls return nvidia_details @@ -91,12 +95,13 @@ def main(): shutil.copyfile(args.default_training_args, temp_config_path) nvidia_details = get_nvidia_details() + print(nvidia_details) wandb.init(config={ - "axolotl_git_commit_sha": get_git_commit_sha("axolotl"), - "finetune-study_git_commit_sha": get_git_commit_sha("finetune-study"), + # "axolotl_git_commit_sha": get_git_commit_sha("axolotl"), + # "finetune-study_git_commit_sha": get_git_commit_sha("finetune-study"), "cuda_version": get_cuda_version(), "nvidia_driver_version": nvidia_details["nvidia_driver_version"], - "num_gpus_on_machine": nvidia_details["num_gpus_on_machine"], + "gpu_count": nvidia_details["num_gpus_on_machine"], "gpus": nvidia_details["gpus"] }) config = wandb.config @@ -114,7 +119,7 @@ def main(): yaml.dump(temp_config, file) # log the artifact file - art = wandb.Artifact(name=f'config-{run_name}', type='run_config') + art = wandb.Artifact(name=f'my-config', type='run_config') art.add_file(temp_config_path) wandb.log_artifact(art) From d149efd8160ecd1de6e9e24b4399c45914f7455f Mon Sep 17 00:00:00 2001 From: Morgan Date: Sun, 13 Aug 2023 18:19:00 +0000 Subject: [PATCH 07/17] fix config logging --- test.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/test.py b/test.py index 4254083..c285c80 100644 --- a/test.py +++ b/test.py @@ -96,14 +96,7 @@ def main(): nvidia_details = get_nvidia_details() print(nvidia_details) - wandb.init(config={ - # "axolotl_git_commit_sha": get_git_commit_sha("axolotl"), - # "finetune-study_git_commit_sha": get_git_commit_sha("finetune-study"), - "cuda_version": get_cuda_version(), - "nvidia_driver_version": nvidia_details["nvidia_driver_version"], - "gpu_count": nvidia_details["num_gpus_on_machine"], - "gpus": nvidia_details["gpus"] - }) + wandb.init() config = wandb.config run_name = create_name(config) @@ -118,6 +111,9 @@ def main(): with open(temp_config_path, 'w') as file: yaml.dump(temp_config, file) + # Update the wandb config with the yaml config + wandb.config.update({**temp_config, **nvidia_details}) + # log the artifact file art = wandb.Artifact(name=f'my-config', type='run_config') art.add_file(temp_config_path) From 6a52aff29de5fd5b7850cd8b370c58558dc4aea9 Mon Sep 17 00:00:00 2001 From: Morgan Date: Sun, 13 Aug 2023 19:36:20 +0000 Subject: [PATCH 08/17] remove nvidia tracking --- docker/Dockerfile | 15 +++++++++++---- test.py | 5 +++-- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 14ea630..5337919 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -11,8 +11,10 @@ RUN apt-get update && \ WORKDIR /workspace -RUN pip3 install --force-reinstall "peft @ git+https://github.com/huggingface/peft.git@main" -RUN git clone --depth=1 https://github.com/OpenAccess-AI-Collective/axolotl.git +# RUN git clone --depth=1 https://github.com/OpenAccess-AI-Collective/axolotl.git +# clone AblateIt axolotl fork +RUN git clone --depth=1 https://github.com/AblateIt/axolotl.git + # If AXOLOTL_EXTRAS is 
set, append it in brackets RUN cd axolotl && \ if [ "$AXOLOTL_EXTRAS" != "" ] ; then \ @@ -21,6 +23,10 @@ RUN cd axolotl && \ pip install -e .; \ fi +RUN pip3 install --force-reinstall "peft @ git+https://github.com/huggingface/peft.git@main" + +RUN git clone --depth=1 https://github.com/morganmcg1/finetune-study.git + # fix so that git fetch/pull from remote works RUN cd axolotl && \ git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \ @@ -29,6 +35,7 @@ RUN cd axolotl && \ # helper for huggingface-login cli RUN git config --global credential.helper store -RUN git clone --depth=1 https://github.com/morganmcg1/finetune-study.git +# RUN pip3 install nvidia-ml-py +# RUN pip install nvidia-ml-py -ENTRYPOINT [ "python", "finetune-study/test.py", "--default_training_args", "finetune-study/configs/default_training_configs/default_lora.yaml"] +# ENTRYPOINT [ "python", "finetune-study/test.py", "--default_training_args", "finetune-study/configs/default_training_configs/default_lora.yaml"] diff --git a/test.py b/test.py index c285c80..6f8927c 100644 --- a/test.py +++ b/test.py @@ -72,6 +72,7 @@ def get_git_commit_sha(git_dir=None): return sha from pynvml import * + def get_nvidia_details(): nvidia_details = {} nvmlInit() @@ -84,7 +85,7 @@ def get_nvidia_details(): gpus_ls.append(nvmlDeviceGetName(handle)) nvidia_details["nvidia_driver_version"] = nvmlSystemGetDriverVersion() - nvidia_details["num_gpus_on_machine"] = deviceCount + nvidia_details["gpu_count"] = deviceCount nvidia_details["gpus"] = gpus_ls return nvidia_details @@ -94,7 +95,7 @@ def main(): temp_config_path = args.default_training_args.replace('.yaml', '_temp.yaml') shutil.copyfile(args.default_training_args, temp_config_path) - nvidia_details = get_nvidia_details() + # nvidia_details = get_nvidia_details() print(nvidia_details) wandb.init() config = wandb.config From 66ef5c965c539ac2bbceeeeff06ae437b2cbb618 Mon Sep 17 00:00:00 2001 From: Morgan Date: Sun, 13 Aug 2023 19:46:11 +0000 Subject: [PATCH 09/17] fix --- test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test.py b/test.py index 6f8927c..d5ea199 100644 --- a/test.py +++ b/test.py @@ -96,7 +96,7 @@ def main(): shutil.copyfile(args.default_training_args, temp_config_path) # nvidia_details = get_nvidia_details() - print(nvidia_details) + wandb.init() config = wandb.config From 4dc75d672838773bfef3b4c1fbe67b7e21c7dbcb Mon Sep 17 00:00:00 2001 From: Morgan Date: Sun, 13 Aug 2023 19:53:59 +0000 Subject: [PATCH 10/17] fix --- test.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test.py b/test.py index d5ea199..0cccd20 100644 --- a/test.py +++ b/test.py @@ -113,7 +113,8 @@ def main(): yaml.dump(temp_config, file) # Update the wandb config with the yaml config - wandb.config.update({**temp_config, **nvidia_details}) + # wandb.config.update({**temp_config, **nvidia_details}) + wandb.config.update(temp_config) # log the artifact file art = wandb.Artifact(name=f'my-config', type='run_config') From 8055e8ed1d4046d2d2a22e3dc41131b310b69fad Mon Sep 17 00:00:00 2001 From: Morgan Date: Sun, 13 Aug 2023 21:01:52 +0000 Subject: [PATCH 11/17] added python to cmd --- test.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/test.py b/test.py index 0cccd20..2100563 100644 --- a/test.py +++ b/test.py @@ -2,7 +2,7 @@ import argparse import yaml import shutil -from subprocess import call +from subprocess import call, run import os # wandb.login() @@ -127,9 +127,10 @@ def main(): # Run the training command with the 
temporary config file cuda_device_declaration = "CUDA_VISIBLE_DEVICES=" + ",".join( [str(x) for x in args.CUDA_device_ids]) + " " if args.CUDA_device_ids else "" - cmd = cuda_device_declaration + f"accelerate launch axolotl/scripts/finetune.py {temp_config_path}" + cmd = "python " + cuda_device_declaration + f" accelerate launch axolotl/scripts/finetune.py {temp_config_path}" # cmd = f"python finetune-study/test_run.py --training_args_path {temp_config_path}" - call(cmd, shell=True) + # call(cmd, shell=True) + run(cmd, shell=True) if __name__ == '__main__': main() \ No newline at end of file From f5280e7dce7d98ed180986946b961ff97906f1ad Mon Sep 17 00:00:00 2001 From: Morgan Date: Sun, 13 Aug 2023 21:09:21 +0000 Subject: [PATCH 12/17] added python to cmd --- test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test.py b/test.py index 2100563..a6a20a5 100644 --- a/test.py +++ b/test.py @@ -127,7 +127,7 @@ def main(): # Run the training command with the temporary config file cuda_device_declaration = "CUDA_VISIBLE_DEVICES=" + ",".join( [str(x) for x in args.CUDA_device_ids]) + " " if args.CUDA_device_ids else "" - cmd = "python " + cuda_device_declaration + f" accelerate launch axolotl/scripts/finetune.py {temp_config_path}" + cmd = cuda_device_declaration + f" accelerate launch axolotl/scripts/finetune.py {temp_config_path}" # cmd = f"python finetune-study/test_run.py --training_args_path {temp_config_path}" # call(cmd, shell=True) run(cmd, shell=True) From f1713579f43b96066f2b282e1c19ada9e7119f73 Mon Sep 17 00:00:00 2001 From: Morgan Date: Tue, 15 Aug 2023 15:12:14 +0000 Subject: [PATCH 13/17] add launch_run file --- launch_run.py | 6 +++ sweep.py | 142 ++++++++++++++++++++++++++------------------------ 2 files changed, 79 insertions(+), 69 deletions(-) create mode 100644 launch_run.py diff --git a/launch_run.py b/launch_run.py new file mode 100644 index 0000000..c81c8e0 --- /dev/null +++ b/launch_run.py @@ -0,0 +1,6 @@ +from sweep import get_args, create_name, train, DATASET_SIZES + +if __name__ == "__main__": + args = get_args() + print(vars(args)) + train(args) \ No newline at end of file diff --git a/sweep.py b/sweep.py index 961103a..0a437a6 100644 --- a/sweep.py +++ b/sweep.py @@ -3,6 +3,7 @@ import yaml import shutil from subprocess import call +from functools import partial import os wandb.login() @@ -81,6 +82,75 @@ def create_name(config_dict): return name[:-1] +def train(args): + wandb.init(entity=args.entity, project=args.project) + config = dict(wandb.config) + + warmup_factor = ( + config.pop("warmpup_steps_factor_of_epoch") + if "warmpup_steps_factor_of_epoch" in config + else None + ) + finetune_type = config.pop("ft_type") + sweep_name = config.pop("sweep_name") + + run_name = sweep_name + "-" + finetune_type + "-" + create_name(config) + + wandb.run.name = run_name + with open(args.default_training_args, "r") as file: + run_config = yaml.safe_load(file) + + for hyperparameter, value in config.items(): + run_config[hyperparameter] = value + + epoch_train_steps = int((DATASET_SIZES["Puffin"] * + (1 - run_config["val_set_size"]))/ (run_config["gradient_accumulation_steps"] * run_config["micro_batch_size"])) + + if warmup_factor: + run_config["warmup_steps"] = int(epoch_train_steps * warmup_factor) + + if run_config["eval_strategy"] == "epoch" and type(run_config["eval_steps"]) == float: + run_config["eval_steps"] = int(epoch_train_steps * run_config["eval_steps"]) + run_config["eval_strategy"] = "steps" + + if run_config["save_strategy"] == "epoch" and 
type(run_config["save_steps"]) == float: + run_config["save_steps"] = int(epoch_train_steps * run_config["save_steps"]) + run_config["save_strategy"] = "steps" + + if args.push_to_hub: + run_config["hub_model_id"] = "AblateIt/" + run_name + run_config["push_to_hub"] = True + run_config["hub_strategy"] = "all_checkpoints" + print(run_config["hub_model_id"]) + + run_config["wandb_project"] = args.project + run_config["wandb_entity"] = args.entity + run_config["wandb_run_name"] = run_name + run_config["output_dir"] = run_config["output_dir"] + "/" + run_name + "/" + + run_config_path = run_config["output_dir"] + "config.yaml" + + if not os.path.exists(run_config["output_dir"]): + os.makedirs(run_config["output_dir"]) + + with open(run_config_path, "w") as file: + yaml.dump(run_config, file) + print(run_config) + + # Run the training command with the temporary config file + cuda_device_declaration = ( + "export CUDA_VISIBLE_DEVICES=" + ",".join([str(x) for x in args.gpu]) + "; " + if args.gpu + else "" + ) + cmd = ( + cuda_device_declaration + + f"accelerate launch axolotl/scripts/finetune.py {run_config_path} --main_process_port 0" + ) + print(cmd) + call(cmd, shell=True) + + def sweep(): args = get_args() @@ -93,77 +163,11 @@ def sweep(): with open("sweep_id.txt", "w") as file: file.write(sweep_id) - def run_sweep(): - wandb.init(entity=args.entity) - config = dict(wandb.config) - - warmup_factor = ( - config.pop("warmpup_steps_factor_of_epoch") - if "warmpup_steps_factor_of_epoch" in config - else None - ) - finetune_type = config.pop("ft_type") - sweep_name = config.pop("sweep_name") - - run_name = sweep_name + "-" + finetune_type + "-" + create_name(config) - - wandb.run.name = run_name - with open(args.default_training_args, "r") as file: - run_config = yaml.safe_load(file) - - for hyperparameter, value in config.items(): - run_config[hyperparameter] = value - - epoch_train_steps = int((DATASET_SIZES["Puffin"] * - (1 - run_config["val_set_size"]))/ (run_config["gradient_accumulation_steps"] * run_config["micro_batch_size"])) - - if warmup_factor: - run_config["warmup_steps"] = int(epoch_train_steps * warmup_factor) - - if run_config["eval_strategy"] == "epoch" and type(run_config["eval_steps"]) == float: - run_config["eval_steps"] = int(epoch_train_steps * run_config["eval_steps"]) - run_config["eval_strategy"] = "steps" - - if run_config["save_strategy"] == "epoch" and type(run_config["save_steps"]) == float: - run_config["save_steps"] = int(epoch_train_steps * run_config["save_steps"]) - run_config["save_strategy"] = "steps" - - if args.push_to_hub: - run_config["hub_model_id"] = "AblateIt/" + run_name - run_config["push_to_hub"] = True - run_config["hub_strategy"] = "all_checkpoints" - print(run_config["hub_model_id"]) - - run_config["wandb_project"] = args.project - run_config["wandb_entity"] = args.entity - run_config["wandb_run_name"] = run_name - run_config["output_dir"] = run_config["output_dir"] + "/" + run_name + "/" - - run_config_path = run_config["output_dir"] + "config.yaml" - - if not os.path.exists(run_config["output_dir"]): - os.makedirs(run_config["output_dir"]) - - with open(run_config_path, "w") as file: - yaml.dump(run_config, file) - print(run_config) - - # Run the training command with the temporary config file - cuda_device_declaration = ( - "export CUDA_VISIBLE_DEVICES=" + ",".join([str(x) for x in args.gpu]) + "; " - if args.gpu - else "" - ) - cmd = ( - cuda_device_declaration - + f"accelerate launch axolotl/scripts/finetune.py {run_config_path} 
--main_process_port 0" - ) - print(cmd) - call(cmd, shell=True) - if args.sweep_id is not None: # Run the sweep - wandb.agent(sweep_id, run_sweep, project=args.project, entity=args.entity) + wandb.agent(sweep_id, partial(train, args), project=args.project, entity=args.entity) + else: + print("No Sweep ID provided") if __name__ == "__main__": From b1222968fcd876c8e46b6c08566347264956d78a Mon Sep 17 00:00:00 2001 From: Morgan McGuire Date: Tue, 15 Aug 2023 16:24:59 +0100 Subject: [PATCH 14/17] Delete testDockerfile --- docker/testDockerfile | 32 -------------------------------- 1 file changed, 32 deletions(-) delete mode 100644 docker/testDockerfile diff --git a/docker/testDockerfile b/docker/testDockerfile deleted file mode 100644 index 14c621b..0000000 --- a/docker/testDockerfile +++ /dev/null @@ -1,32 +0,0 @@ -ARG BASE_TAG=main-base -FROM winglian/axolotl-base:$BASE_TAG - -ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX" -ARG AXOLOTL_EXTRAS="" -ARG CUDA="118" -ENV BNB_CUDA_VERSION=$CUDA - -RUN apt-get update && \ - apt-get install -y vim curl - -WORKDIR /workspace - -RUN pip3 install --force-reinstall "peft @ git+https://github.com/huggingface/peft.git@main" -RUN git clone --depth=1 https://github.com/OpenAccess-AI-Collective/axolotl.git -# If AXOLOTL_EXTRAS is set, append it in brackets -RUN cd axolotl && \ - if [ "$AXOLOTL_EXTRAS" != "" ] ; then \ - pip install -e .[$AXOLOTL_EXTRAS]; \ - else \ - pip install -e .; \ - fi - -# fix so that git fetch/pull from remote works -RUN cd axolotl && \ - git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \ - git config --get remote.origin.fetch - -# helper for huggingface-login cli -RUN git config --global credential.helper store - -RUN python test.py --default_training_args configs/default_training_configs/default_lora.yaml \ No newline at end of file From 2271c193df6521db1011a2e230cfeabcdb34a7a3 Mon Sep 17 00:00:00 2001 From: Morgan McGuire Date: Tue, 15 Aug 2023 16:25:44 +0100 Subject: [PATCH 15/17] Delete test.py --- test.py | 136 -------------------------------------------------------- 1 file changed, 136 deletions(-) delete mode 100644 test.py diff --git a/test.py b/test.py deleted file mode 100644 index a6a20a5..0000000 --- a/test.py +++ /dev/null @@ -1,136 +0,0 @@ -import wandb -import argparse -import yaml -import shutil -from subprocess import call, run -import os - -# wandb.login() - -""" -Still in progress and not yet tested. -""" - - -def get_args(): - parser = argparse.ArgumentParser() - # parser.add_argument('--sweep_id', type=str, default=None, - # help='Wandb sweep id for decentralized sweeping. If not provided, a new sweep will be created.') - - # parser.add_argument('--sweep_config', help='Path to sweep config yaml file', - # type=str, default='configs/sweep_configs/qlora_sweep.yaml') - - parser.add_argument('--wandb_project', type=str, help='Wandb project name', - default='test-launch-sweeps') - - parser.add_argument('--wandb_entity', type=str, help='Wandb project name', - default='ablateit') - - parser.add_argument('--default_training_args', type=str, help='Path to default training args yaml file', - default='configs/default_training_configs/default_qlora.yaml') - - parser.add_argument('--CUDA_device_ids', type=list, default=None, - help='List of CUDA device ids to use for training. 
If not provided, all available GPUs will be used.') - - return parser.parse_args() - -def create_name(config_dict): - short = { - 'gradient_accumulation_steps': 'graccsteps', - 'learning_rate': 'lr', - 'lora_r': 'lora_r' - } - name = '' - for hyperparam, value in config_dict.items(): - name += short.get(hyperparam, hyperparam) + str(value).replace('.', '_') + '-' - return name[:-1] - -import subprocess -import re - -def get_cuda_version(): - try: - nvcc_version = subprocess.check_output(["nvcc", "--version"]).decode() - # Extract the version using regex - match = re.search(r"release (\d+\.\d+)", nvcc_version) - if match: - return match.group(1) - else: - return "No CUDA version found" - except subprocess.CalledProcessError: - return "Failed to run nvcc" - except FileNotFoundError: - return "nvcc not found" - -import subprocess - -def get_git_commit_sha(git_dir=None): - cmd = ["git", "rev-parse", "HEAD"] - if git_dir: - cmd.extend(["--git-dir", f"{git_dir}/.git"]) - sha = subprocess.check_output(cmd).strip().decode("utf-8") - return sha - -from pynvml import * - -def get_nvidia_details(): - nvidia_details = {} - nvmlInit() - # print(f"Driver Version: {nvmlSystemGetDriverVersion()}") - deviceCount = nvmlDeviceGetCount() - gpus_ls = [] - for i in range(deviceCount): - handle = nvmlDeviceGetHandleByIndex(i) - # print(f"Device {i} : {nvmlDeviceGetName(handle)}") - gpus_ls.append(nvmlDeviceGetName(handle)) - - nvidia_details["nvidia_driver_version"] = nvmlSystemGetDriverVersion() - nvidia_details["gpu_count"] = deviceCount - nvidia_details["gpus"] = gpus_ls - return nvidia_details - -def main(): - args = get_args() - - temp_config_path = args.default_training_args.replace('.yaml', '_temp.yaml') - shutil.copyfile(args.default_training_args, temp_config_path) - - # nvidia_details = get_nvidia_details() - - wandb.init() - config = wandb.config - - run_name = create_name(config) - wandb.run.name = run_name - - with open(temp_config_path, 'r') as file: - temp_config = yaml.safe_load(file) - - for hyperparameter, value in config.items(): - temp_config[hyperparameter] = value - - with open(temp_config_path, 'w') as file: - yaml.dump(temp_config, file) - - # Update the wandb config with the yaml config - # wandb.config.update({**temp_config, **nvidia_details}) - wandb.config.update(temp_config) - - # log the artifact file - art = wandb.Artifact(name=f'my-config', type='run_config') - art.add_file(temp_config_path) - wandb.log_artifact(art) - - # Run the training command with the temporary config file - # cmd = f"accelerate launch axolotl/scripts/finetune.py {temp_config_path}" - - # Run the training command with the temporary config file - cuda_device_declaration = "CUDA_VISIBLE_DEVICES=" + ",".join( - [str(x) for x in args.CUDA_device_ids]) + " " if args.CUDA_device_ids else "" - cmd = cuda_device_declaration + f" accelerate launch axolotl/scripts/finetune.py {temp_config_path}" - # cmd = f"python finetune-study/test_run.py --training_args_path {temp_config_path}" - # call(cmd, shell=True) - run(cmd, shell=True) - -if __name__ == '__main__': - main() \ No newline at end of file From 9a69b4169edb55f2b2a4a81f0378e0cebec46739 Mon Sep 17 00:00:00 2001 From: Morgan McGuire Date: Tue, 15 Aug 2023 16:26:12 +0100 Subject: [PATCH 16/17] Delete test_run.py --- test_run.py | 22 ---------------------- 1 file changed, 22 deletions(-) delete mode 100644 test_run.py diff --git a/test_run.py b/test_run.py deleted file mode 100644 index ebaafd1..0000000 --- a/test_run.py +++ /dev/null @@ -1,22 +0,0 @@ -import argparse 
-import yaml -import os - -def get_args(): - parser = argparse.ArgumentParser() - - parser.add_argument('--training_args_path', type=str, help='Path to default training args yaml file', - default='configs/default_training_configs/default_qlora.yaml') - - return parser.parse_args() - -def main(): - args = get_args() - - with open(args.training_args_path, 'r') as file: - temp_config = yaml.safe_load(file) - - print(temp_config) - -if __name__ == '__main__': - main() \ No newline at end of file From e3e73b3ec0a990ae86bc74b7d60bb79547b8de2d Mon Sep 17 00:00:00 2001 From: Morgan Date: Tue, 15 Aug 2023 19:18:23 +0000 Subject: [PATCH 17/17] fix push to hub --- docker/Dockerfile | 2 ++ sweep.py | 9 ++++----- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 5337919..ee24266 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -25,6 +25,8 @@ RUN cd axolotl && \ RUN pip3 install --force-reinstall "peft @ git+https://github.com/huggingface/peft.git@main" +# RUN pip3 install flash-attn==2.0.7 + RUN git clone --depth=1 https://github.com/morganmcg1/finetune-study.git # fix so that git fetch/pull from remote works diff --git a/sweep.py b/sweep.py index 0a437a6..8a4dd8c 100644 --- a/sweep.py +++ b/sweep.py @@ -55,8 +55,7 @@ def get_args(): parser.add_argument( "--push_to_hub", - type=bool, - default=True, + action='store_true', help="Whether to push the models to the hub during training.", ) @@ -156,14 +155,14 @@ def sweep(): sweep_id = args.sweep_id - if not sweep_id: + if sweep_id is None: sweep_config = yaml.safe_load(open(args.sweep_config))["wandb_args"] - sweep_id = wandb.sweep(sweep_config, project=args.project) + sweep_id = wandb.sweep(sweep_config, entity=args.entity, project=args.project) print(sweep_id) with open("sweep_id.txt", "w") as file: file.write(sweep_id) - if args.sweep_id is not None: + if sweep_id is not None: # Run the sweep wandb.agent(sweep_id, partial(train, args), project=args.project, entity=args.entity) else:
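Note on the sweep configuration consumed by sweep.py: the sweep config file itself (referenced in the scripts as configs/sweep_configs/qlora_sweep.yaml) is not included in this patch series. The sketch below is a hypothetical, minimal illustration of the structure sweep.py appears to expect — a top-level "wandb_args" key holding a standard W&B sweep definition, with "sweep_name" and "ft_type" present as fixed parameters because train() pops both from wandb.config before applying the remaining hyperparameters to the training YAML. All names and values here are assumptions for illustration only, not part of the patches.

# Hypothetical minimal sweep definition mirroring what sweep.py reads via
# yaml.safe_load(open(args.sweep_config))["wandb_args"].
# Keys and values below are illustrative assumptions, not repo contents.
import yaml

sweep_file_contents = {
    "wandb_args": {
        "method": "grid",
        "metric": {"name": "eval/loss", "goal": "minimize"},
        "parameters": {
            # train() pops these two from wandb.config, so they must be defined
            "sweep_name": {"value": "puffin-qlora-sweep"},
            "ft_type": {"value": "qlora"},
            # optional: train() converts this factor into warmup_steps
            # (key spelled exactly as in sweep.py)
            "warmpup_steps_factor_of_epoch": {"value": 0.1},
            # hyperparameters actually being swept
            "learning_rate": {"values": [1e-4, 2e-4, 4e-4]},
            "lora_r": {"values": [16, 64]},
        },
    }
}

# Write the config where the scripts expect to find it.
with open("configs/sweep_configs/qlora_sweep.yaml", "w") as f:
    yaml.dump(sweep_file_contents, f)

With a file like this in place, running sweep.py with --sweep_config pointing at it would create the sweep and start an agent via wandb.agent(sweep_id, partial(train, args)), while passing an existing --sweep_id on other machines would presumably let them join the same sweep for decentralized sweeping, as described in the scripts' help text.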