Refactor out train function #2

Open · wants to merge 20 commits into base: main
43 changes: 43 additions & 0 deletions docker/Dockerfile
@@ -0,0 +1,43 @@
ARG BASE_TAG=main-base
FROM winglian/axolotl-base:$BASE_TAG

ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
ARG AXOLOTL_EXTRAS=""
ARG CUDA="118"
ENV BNB_CUDA_VERSION=$CUDA

RUN apt-get update && \
apt-get install -y vim curl

WORKDIR /workspace

# RUN git clone --depth=1 https://github.com/OpenAccess-AI-Collective/axolotl.git
# clone AblateIt axolotl fork
RUN git clone --depth=1 https://github.com/AblateIt/axolotl.git

# If AXOLOTL_EXTRAS is set, append it in brackets
RUN cd axolotl && \
    if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
        pip install -e .[$AXOLOTL_EXTRAS]; \
    else \
        pip install -e .; \
    fi

RUN pip3 install --force-reinstall "peft @ git+https://github.com/huggingface/peft.git@main"

# RUN pip3 install flash-attn==2.0.7

RUN git clone --depth=1 https://github.com/morganmcg1/finetune-study.git

# fix so that git fetch/pull from remote works
RUN cd axolotl && \
git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
git config --get remote.origin.fetch

# credential helper for huggingface-cli login
RUN git config --global credential.helper store

# RUN pip3 install nvidia-ml-py
# RUN pip install nvidia-ml-py

# ENTRYPOINT [ "python", "finetune-study/test.py", "--default_training_args", "finetune-study/configs/default_training_configs/default_lora.yaml"]
104 changes: 104 additions & 0 deletions docker/Dockerfile-base
@@ -0,0 +1,104 @@
ARG CUDA_VERSION="11.8.0"
ARG CUDNN_VERSION="8"
ARG UBUNTU_VERSION="22.04"
ARG MAX_JOBS=4

FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION as base-builder

ENV PATH="/root/miniconda3/bin:${PATH}"

ARG PYTHON_VERSION="3.9"
ARG PYTORCH_VERSION="2.0.1"
ARG CUDA="118"

ENV PYTHON_VERSION=$PYTHON_VERSION

RUN apt-get update
RUN apt-get install -y wget git build-essential ninja-build git-lfs libaio-dev && rm -rf /var/lib/apt/lists/*

RUN wget \
https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
&& mkdir /root/.conda \
&& bash Miniconda3-latest-Linux-x86_64.sh -b \
&& rm -f Miniconda3-latest-Linux-x86_64.sh

RUN conda create -n "py${PYTHON_VERSION}" python="${PYTHON_VERSION}"

ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"

WORKDIR /workspace

RUN python3 -m pip install --upgrade pip && pip3 install packaging && \
python3 -m pip install --no-cache-dir -U torch==${PYTORCH_VERSION}+cu${CUDA} --extra-index-url https://download.pytorch.org/whl/cu$CUDA


FROM base-builder AS flash-attn-builder

WORKDIR /workspace

ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"

RUN git clone https://github.com/Dao-AILab/flash-attention.git && \
cd flash-attention && \
git checkout v2.0.1 && \
python3 setup.py bdist_wheel && \
cd csrc/fused_dense_lib && \
python3 setup.py bdist_wheel && \
cd ../xentropy && \
python3 setup.py bdist_wheel && \
cd ../rotary && \
python3 setup.py bdist_wheel && \
cd ../layer_norm && \
python3 setup.py bdist_wheel

FROM base-builder AS deepspeed-builder

ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"

WORKDIR /workspace

RUN git clone https://github.com/microsoft/DeepSpeed.git && \
cd DeepSpeed && \
MAX_CONCURRENCY=8 DS_BUILD_SPARSE_ATTN=0 DS_BUILD_OPS=1 python3 setup.py bdist_wheel

FROM base-builder AS bnb-builder

WORKDIR /workspace
ARG CUDA="118"
ENV CUDA=$CUDA

RUN git clone https://github.com/TimDettmers/bitsandbytes.git && \
cd bitsandbytes && \
CUDA_VERSION=$CUDA make cuda11x && \
python setup.py bdist_wheel

FROM base-builder

ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"
ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST

# recompile apex
RUN python3 -m pip uninstall -y apex
RUN git clone https://github.com/NVIDIA/apex
# `MAX_JOBS=1` disables parallel building to avoid cpu memory OOM when building image on GitHub Action (standard) runners
RUN cd apex && MAX_JOBS=1 python3 -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./

RUN mkdir -p /workspace/builds
COPY --from=bnb-builder /workspace/bitsandbytes /workspace/builds/bitsandbytes

RUN mkdir -p /workspace/wheels/bitsandbytes
COPY --from=deepspeed-builder /workspace/DeepSpeed/dist/deepspeed-*.whl wheels
COPY --from=bnb-builder /workspace/bitsandbytes/dist/bitsandbytes-*.whl wheels
COPY --from=bnb-builder /workspace/bitsandbytes/bitsandbytes/libbitsandbytes*.so wheels/bitsandbytes
COPY --from=flash-attn-builder /workspace/flash-attention/dist/flash_attn-*.whl wheels
COPY --from=flash-attn-builder /workspace/flash-attention/csrc/fused_dense_lib/dist/fused_dense_lib-*.whl wheels
COPY --from=flash-attn-builder /workspace/flash-attention/csrc/xentropy/dist/xentropy_cuda_lib-*.whl wheels
COPY --from=flash-attn-builder /workspace/flash-attention/csrc/rotary/dist/rotary_emb-*.whl wheels
COPY --from=flash-attn-builder /workspace/flash-attention/csrc/layer_norm/dist/dropout_layer_norm-*.whl wheels

RUN pip3 install wheels/deepspeed-*.whl wheels/flash_attn-*.whl wheels/fused_dense_lib-*.whl wheels/xentropy_cuda_lib-*.whl wheels/rotary_emb-*.whl wheels/dropout_layer_norm-*.whl
RUN cd /workspace/builds/bitsandbytes && python3 setup.py install
RUN git lfs install --skip-repo
RUN pip3 install awscli && \
# The base image ships with `pydantic==1.8.2` which is not working
pip3 install -U --no-cache-dir pydantic==1.10.10
18 changes: 18 additions & 0 deletions docker/Dockerfile-runpod
@@ -0,0 +1,18 @@
ARG BASE_TAG=main
FROM winglian/axolotl:$BASE_TAG

ENV HF_DATASETS_CACHE="/workspace/data/huggingface-cache/datasets"
ENV HUGGINGFACE_HUB_CACHE="/workspace/data/huggingface-cache/hub"
ENV TRANSFORMERS_CACHE="/workspace/data/huggingface-cache/hub"

COPY scripts/runpod-entrypoint.sh /root/runpod-entrypoint.sh

RUN apt install --yes --no-install-recommends openssh-server tmux && \
mkdir -p ~/.ssh && \
chmod 700 ~/.ssh && \
printf "\n[[ -z \"\$TMUX\" ]] && { tmux attach-session -t ssh_tmux || tmux new-session -s ssh_tmux; exit; }\n" >> ~/.bashrc && \
chmod +x /workspace/axolotl/scripts/runpod-entrypoint.sh && \
chmod +x /root/runpod-entrypoint.sh

ENTRYPOINT ["/root/runpod-entrypoint.sh"]
CMD ["sleep", "infinity"]
6 changes: 6 additions & 0 deletions launch_run.py
@@ -0,0 +1,6 @@
from sweep import get_args, create_name, train, DATASET_SIZES

if __name__ == "__main__":
    args = get_args()
    print(vars(args))
    train(args)
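launch_run.py drives a single, non-sweep run through the same refactored train(). get_args() itself is not part of this diff; judging from the attributes the code reads (entity, project, sweep_id, sweep_config, default_training_args, gpu, push_to_hub), its interface is presumably close to the sketch below. All argument names except --push_to_hub, and every default, are inferred and purely illustrative.

import argparse

def get_args():
    # Rough sketch of the implied CLI; only --push_to_hub appears in this diff,
    # the other arguments are inferred from how sweep.py uses them.
    parser = argparse.ArgumentParser()
    parser.add_argument("--entity", type=str, default="AblateIt")             # W&B entity
    parser.add_argument("--project", type=str, required=True)                 # W&B project
    parser.add_argument("--sweep_id", type=str, default=None)                 # join an existing sweep
    parser.add_argument("--sweep_config", type=str, default=None)             # YAML holding "wandb_args"
    parser.add_argument("--default_training_args", type=str, required=True)   # base axolotl config
    parser.add_argument("--gpu", type=int, nargs="*", default=None)           # CUDA device indices
    parser.add_argument("--push_to_hub", action="store_true")
    return parser.parse_args()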
151 changes: 77 additions & 74 deletions sweep.py
@@ -3,6 +3,7 @@
import yaml
import shutil
from subprocess import call
from functools import partial
import os

wandb.login()
@@ -54,8 +55,7 @@ def get_args():

    parser.add_argument(
        "--push_to_hub",
        type=bool,
        default=True,
        action='store_true',
        help="Whether to push the models to the hub during training.",
    )
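Switching --push_to_hub from type=bool to action='store_true' fixes a classic argparse pitfall: argparse applies bool() to the raw option string, so any non-empty value, including the literal "False", parses as True. With action='store_true' the option becomes a real flag, which also flips the default from True to False, so pushing to the Hub is now opt-in. A minimal, self-contained illustration:

import argparse

# Old behaviour: bool("False") is True, so the supplied value is effectively
# ignored and pushing stays enabled.
old = argparse.ArgumentParser()
old.add_argument("--push_to_hub", type=bool, default=True)
print(old.parse_args(["--push_to_hub", "False"]).push_to_hub)   # True

# New behaviour: a plain flag, False when absent and True when present.
new = argparse.ArgumentParser()
new.add_argument("--push_to_hub", action="store_true")
print(new.parse_args([]).push_to_hub)                  # False
print(new.parse_args(["--push_to_hub"]).push_to_hub)   # True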

@@ -81,89 +81,92 @@ def create_name(config_dict):
return name[:-1]


def train(args):
    wandb.init(entity=args.entity, project=args.project)
    config = dict(wandb.config)

    warmup_factor = (
        config.pop("warmpup_steps_factor_of_epoch")
        if "warmpup_steps_factor_of_epoch" in config
        else None
    )
    finetune_type = config.pop("ft_type")
    sweep_name = config.pop("sweep_name")

    run_name = sweep_name + "-" + finetune_type + "-" + create_name(config)

    wandb.run.name = run_name
    with open(args.default_training_args, "r") as file:
        run_config = yaml.safe_load(file)

    for hyperparameter, value in config.items():
        run_config[hyperparameter] = value

    epoch_train_steps = int((DATASET_SIZES["Puffin"] *
        (1 - run_config["val_set_size"])) / (run_config["gradient_accumulation_steps"] * run_config["micro_batch_size"]))

    if warmup_factor:
        run_config["warmup_steps"] = int(epoch_train_steps * warmup_factor)

    if run_config["eval_strategy"] == "epoch" and type(run_config["eval_steps"]) == float:
        run_config["eval_steps"] = int(epoch_train_steps * run_config["eval_steps"])
        run_config["eval_strategy"] = "steps"

    if run_config["save_strategy"] == "epoch" and type(run_config["save_steps"]) == float:
        run_config["save_steps"] = int(epoch_train_steps * run_config["save_steps"])
        run_config["save_strategy"] = "steps"

    if args.push_to_hub:
        run_config["hub_model_id"] = "AblateIt/" + run_name
        run_config["push_to_hub"] = True
        run_config["hub_strategy"] = "all_checkpoints"
        print(run_config["hub_model_id"])

    run_config["wandb_project"] = args.project
    run_config["wandb_entity"] = args.entity
    run_config["wandb_run_name"] = run_name
    run_config["output_dir"] = run_config["output_dir"] + "/" + run_name + "/"

    run_config_path = run_config["output_dir"] + "config.yaml"

    if not os.path.exists(run_config["output_dir"]):
        os.makedirs(run_config["output_dir"])

    with open(run_config_path, "w") as file:
        yaml.dump(run_config, file)
    print(run_config)

    # Run the training command with the temporary config file
    cuda_device_declaration = (
        "export CUDA_VISIBLE_DEVICES=" + ",".join([str(x) for x in args.gpu]) + "; "
        if args.gpu
        else ""
    )
    cmd = (
        cuda_device_declaration
        + f"accelerate launch axolotl/scripts/finetune.py {run_config_path} --main_process_port 0"
    )
    print(cmd)
    call(cmd, shell=True)
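For a sense of scale of the epoch_train_steps arithmetic in train() above: it is the number of training examples left after the validation split, divided by the effective batch size (gradient accumulation times micro batch size), and the warmup factor then scales against that per-epoch step count. A worked example with purely illustrative numbers (the real values come from DATASET_SIZES["Puffin"] and the merged run_config):

# Illustrative values only; the real ones come from DATASET_SIZES and run_config.
dataset_size = 3000                 # hypothetical size of the Puffin dataset
val_set_size = 0.05                 # fraction held out for evaluation
gradient_accumulation_steps = 4
micro_batch_size = 2

train_examples = dataset_size * (1 - val_set_size)                 # 2850.0
effective_batch = gradient_accumulation_steps * micro_batch_size   # 8
epoch_train_steps = int(train_examples / effective_batch)          # 356

# "warmpup_steps_factor_of_epoch" (spelled this way in the code above) then
# scales warmup relative to one epoch, e.g. a factor of 0.1:
warmup_steps = int(epoch_train_steps * 0.1)                        # 35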


def sweep():
    args = get_args()

    sweep_id = args.sweep_id

    if not sweep_id:
    if sweep_id is None:
        sweep_config = yaml.safe_load(open(args.sweep_config))["wandb_args"]
        sweep_id = wandb.sweep(sweep_config, project=args.project)
        sweep_id = wandb.sweep(sweep_config, entity=args.entity, project=args.project)
        print(sweep_id)
        with open("sweep_id.txt", "w") as file:
            file.write(sweep_id)

    def run_sweep():
        wandb.init(entity=args.entity)
        config = dict(wandb.config)

        warmup_factor = (
            config.pop("warmpup_steps_factor_of_epoch")
            if "warmpup_steps_factor_of_epoch" in config
            else None
        )
        finetune_type = config.pop("ft_type")
        sweep_name = config.pop("sweep_name")

        run_name = sweep_name + "-" + finetune_type + "-" + create_name(config)

        wandb.run.name = run_name
        with open(args.default_training_args, "r") as file:
            run_config = yaml.safe_load(file)

        for hyperparameter, value in config.items():
            run_config[hyperparameter] = value

        epoch_train_steps = int((DATASET_SIZES["Puffin"] *
            (1 - run_config["val_set_size"])) / (run_config["gradient_accumulation_steps"] * run_config["micro_batch_size"]))

        if warmup_factor:
            run_config["warmup_steps"] = int(epoch_train_steps * warmup_factor)

        if run_config["eval_strategy"] == "epoch" and type(run_config["eval_steps"]) == float:
            run_config["eval_steps"] = int(epoch_train_steps * run_config["eval_steps"])
            run_config["eval_strategy"] = "steps"

        if run_config["save_strategy"] == "epoch" and type(run_config["save_steps"]) == float:
            run_config["save_steps"] = int(epoch_train_steps * run_config["save_steps"])
            run_config["save_strategy"] = "steps"

        if args.push_to_hub:
            run_config["hub_model_id"] = "AblateIt/" + run_name
            run_config["push_to_hub"] = True
            run_config["hub_strategy"] = "all_checkpoints"
            print(run_config["hub_model_id"])

        run_config["wandb_project"] = args.project
        run_config["wandb_entity"] = args.entity
        run_config["wandb_run_name"] = run_name
        run_config["output_dir"] = run_config["output_dir"] + "/" + run_name + "/"

        run_config_path = run_config["output_dir"] + "config.yaml"

        if not os.path.exists(run_config["output_dir"]):
            os.makedirs(run_config["output_dir"])

        with open(run_config_path, "w") as file:
            yaml.dump(run_config, file)
        print(run_config)

        # Run the training command with the temporary config file
        cuda_device_declaration = (
            "export CUDA_VISIBLE_DEVICES=" + ",".join([str(x) for x in args.gpu]) + "; "
            if args.gpu
            else ""
        )
        cmd = (
            cuda_device_declaration
            + f"accelerate launch axolotl/scripts/finetune.py {run_config_path} --main_process_port 0"
        )
        print(cmd)
        call(cmd, shell=True)

    if args.sweep_id is not None:
    if sweep_id is not None:
        # Run the sweep
        wandb.agent(sweep_id, run_sweep, project=args.project, entity=args.entity)
        wandb.agent(sweep_id, partial(train, args), project=args.project, entity=args.entity)
    else:
        print("No Sweep ID provided")


if __name__ == "__main__":
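The net effect of the refactor: the training logic that previously lived in the nested run_sweep() closure is now the module-level train(args), so launch_run.py can import it for one-off runs, while sweep() hands wandb.agent a zero-argument callable by binding the parsed CLI arguments with functools.partial (wandb.agent invokes its callback without arguments). A minimal sketch of that binding pattern, with stand-in values rather than the real parsed arguments:

from functools import partial

def train(args):
    # the real train() calls wandb.init() and reads hyperparameters from wandb.config
    print("training with", args)

args = {"entity": "AblateIt", "project": "finetune-study"}   # stand-in for parsed CLI args

callback = partial(train, args)   # zero-argument callable, as wandb.agent expects
callback()                        # equivalent to train(args)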