Commit d3b47a2
add new models
add correct synchronization script (removed sleeps where necessary)
Martin Fajčík committed Aug 26, 2024
1 parent d13068d commit d3b47a2
Showing 18 changed files with 258 additions and 66 deletions.
17 changes: 15 additions & 2 deletions jobs/run_benchmark.sh
@@ -15,14 +15,27 @@ source ./jobs/TASKS.sh
#sbatch --array=0-$((${#TASKS[@]} - 1)) jobs/scripts/submit/models_S/eval_phi3mini_instruct_smartt.sh
#sbatch --array=0-$((${#TASKS[@]} - 1)) jobs/scripts/submit/models_S/eval_qwen2_instruct_smartt.sh
#sbatch --array=0-$((${#TASKS[@]} - 1)) jobs/scripts/submit/models_S/eval_olmo_instruct_smartt.sh
#sbatch --array=0-$((${#TASKS[@]} - 1)) jobs/scripts/submit/models_S/eval_cstllama_smartt.sh ## DONE
#sbatch --array=0-$((${#TASKS[@]} - 1)) jobs/scripts/submit/models_S/eval_gemma2-2b_smartt.sh ## DONE
#sbatch --array=0-$((${#TASKS[@]} - 1)) jobs/scripts/submit/models_S/eval_gemma2-2b_instruct_smartt.sh ## DONE
#sbatch --array=0-$((${#TASKS[@]} - 1)) jobs/scripts/submit/models_S/eval_gemma2_smartt.sh ## DONE
#sbatch --array=0-$((${#TASKS[@]} - 1)) jobs/scripts/submit/models_S/eval_hermes_llama31_instruct_smartt.sh ## DONE
#sbatch --array=0-$((${#TASKS[@]} - 1)) jobs/scripts/submit/models_S/eval_llama31_smartt.sh ## DONE
#sbatch --array=0-$((${#TASKS[@]} - 1)) jobs/scripts/submit/models_S/eval_qwen2_smartt.sh ## Still missing

# M MODELS - EACH JOB uses multiple GPUs
#sbatch --array=0-$((${#TASKS[@]} - 1)) jobs/scripts/submit/models_M/eval_mixtral8x7_instruct_smartt.sh
#sbatch --array=0-$((${#TASKS[@]} - 1)) jobs/scripts/submit/models_M/eval_mistral_nemo_instruct_smartt.sh
# --
#sbatch --array=0-$((${#TASKS[@]} - 1)) jobs/scripts/submit/models_M/eval_mixtral8x7_base_smartt.sh ## DONE
#sbatch --array=0-$((${#TASKS[@]} - 1)) jobs/scripts/submit/models_M/eval_mistral_nemo_base_smartt.sh ## DONE
#sbatch --array=0-$((${#TASKS[@]} - 1)) jobs/scripts/submit/models_M/eval_aya23_35B_instruct_smartt.sh ## DONE

# L MODELS - EACH JOB IS MULTINODE, and uses 8 GPUs/node
#sbatch --array=0-$((${#TASKS[@]} - 1)) jobs/scripts/submit/models_L/eval_llama31_instruct_TEST_smartt.sh
#sbatch --array=0-$((${#TASKS[@]} - 1)) jobs/scripts/submit/models_L/eval_llama31_instruct_70B_smartt.sh
# --
#sbatch --array=0-$((${#TASKS[@]} - 1)) jobs/scripts/submit/models_L/eval_llama31_70B_smartt.sh ## DONE
#sbatch --array=0-$((${#TASKS[@]} - 1)) jobs/scripts/submit/models_L/eval_mixtral_8x22B_instruct_smartt.sh

# XXL MODELS - EACH JOB IS MULTINODE WITH MANY NODES (e.g. Llama 3.1 405B uses 8 nodes), and uses 8 GPUs/node
#sbatch --array=0-$((${#TASKS[@]} - 1)) jobs/scripts/submit/models_XXL/eval_llama31_instruct_405B_smartt.sh
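
Note: the --array=0-$((${#TASKS[@]} - 1)) pattern submits one array element per benchmark task. A minimal sketch of the mapping, assuming jobs/TASKS.sh defines a plain bash array named TASKS (the task names below are illustrative, not from this diff):

#!/usr/bin/bash
# Sketch only: TASKS.sh is assumed to define something like
# TASKS=(arc_cs hellaswag_cs sqad)
source ./jobs/TASKS.sh
# sbatch --array=0-$((${#TASKS[@]} - 1)) launches ${#TASKS[@]} copies of a
# submit script; inside each copy SLURM sets SLURM_ARRAY_TASK_ID to the
# element index, so the script recovers its task as:
TASK="${TASKS[$SLURM_ARRAY_TASK_ID]}"
echo "This array element evaluates task: $TASK"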
10 changes: 5 additions & 5 deletions jobs/scripts/models/eval_L_vllm_master.sh
@@ -11,11 +11,11 @@ export TRUNCATE_STRATEGY="$5"
export NUM_FEWSHOT="$6"
export MODEL_NAME="$7"

source ~/.bashrc
micromamba activate harness

#ray start --head --port $MASTER_PORT

# Check if OUTPUT_PATH exists
if [ -e "$OUTPUT_PATH" ]; then
echo "Output path $OUTPUT_PATH already exists. Exiting."
exit 0
fi

srun /home/ifajcik/data_scratch_new/lm-evaluation-harness/jobs/scripts/models/eval_L_vllm_worker.sh
echo "Finished master script at $(hostname)"
112 changes: 57 additions & 55 deletions jobs/scripts/models/eval_L_vllm_worker.sh
@@ -7,77 +7,64 @@ num_nodes="$((SLURM_JOB_NUM_NODES))"
WORKDIR="/home/ifajcik/data_scratch_new/lm-evaluation-harness"
cd $WORKDIR

# create temp directory for ray
#export VLLM_LOGGING_LEVEL=DEBUG
#export CUDA_LAUNCH_BLOCKING=1
#export NCCL_DEBUG=TRACE
#export VLLM_TRACE_FUNCTION=1

export NCCL_IB_GID_INDEX=3
#export TORCH_NCCL_USE_COMM_NONBLOCKING=1
#export NCCL_SOCKET_IFNAME=eth0

# Create temp directory for ray
hostname=$(hostname)
target_dir="${WORKDIR}/ray_${hostname}"
rm -rf /tmp/ray
rm -rf $target_dir
mkdir -p $target_dir
ln -s "$target_dir" /tmp/ray


source ~/.bashrc
micromamba activate harness

# Start Ray: workers attach to the existing head; the head itself runs ray start --include-dashboard=True --head --port $MASTER_PORT
LOCAL_ADDR=$(hostname)
if [ "$MASTER_ADDR" != "$LOCAL_ADDR" ]; then
sleep 10
echo "Connecting from $LOCAL_ADDR to Ray head at $MASTER_ADDR:$MASTER_PORT"
ray start --address $MASTER_ADDR:$MASTER_PORT
else
echo "Starting Ray head at $MASTER_ADDR:$MASTER_PORT"
ray start --include-dashboard=True --head --port $MASTER_PORT
fi

echo "Executing in $(pwd)"
job_array_id=$SLURM_ARRAY_JOB_ID
task_id=$SLURM_ARRAY_TASK_ID
full_job_id="${job_array_id}_${task_id}"

export NUMEXPR_MAX_THREADS=$(nproc --all)
# Directory for job-specific locks and signals
LOCKDIR="${WORKDIR}/locks_${full_job_id}"
mkdir -p $LOCKDIR

set -x
HEAD_STARTED_FILE="${LOCKDIR}/ray_head_started_${full_job_id}.signal"
WORKER_CONNECTED_FILE="${LOCKDIR}/worker_connected_${LOCAL_ADDR}_${full_job_id}.signal"

# Normalize log probs based on sumlogp argument
if [ "$SUMLOGP" = "no" ]; then
NORMALIZE_LOG_PROBS="True"
else
NORMALIZE_LOG_PROBS="False"
fi

# Chat template arguments based on chat_template argument
CHAT_TEMPLATE_ARGS=""
if [ "$CHAT_TEMPLATE" = "singleturn" ]; then
CHAT_TEMPLATE_ARGS="--apply_chat_template"
elif [ "$CHAT_TEMPLATE" = "multiturn" ]; then
CHAT_TEMPLATE_ARGS="--apply_chat_template --fewshot_as_multiturn"
fi
TOTAL_GPUS=$((num_gpus * num_nodes))

# Truncate strategy argument based on truncate_strategy argument
TRUNCATE_STRATEGY_ARG=""
if [ "$TRUNCATE_STRATEGY" != "none" ]; then
TRUNCATE_STRATEGY_ARG=",truncate_strategy=$TRUNCATE_STRATEGY"
fi
if [ "$MASTER_ADDR" == "$LOCAL_ADDR" ]; then
# Start the Ray head
echo "Starting Ray head at $MASTER_ADDR:$MASTER_PORT"
ray start --include-dashboard=True --head --port $MASTER_PORT

#export VLLM_LOGGING_LEVEL=DEBUG
#export CUDA_LAUNCH_BLOCKING=1
#export NCCL_DEBUG=TRACE
#export VLLM_TRACE_FUNCTION=1
# Signal that the Ray head has started
touch $HEAD_STARTED_FILE

export NCCL_IB_GID_INDEX=3
#export TORCH_NCCL_USE_COMM_NONBLOCKING=1
#export NCCL_SOCKET_IFNAME=eth0
echo "Executing in $(pwd)"

export NUMEXPR_MAX_THREADS=$(nproc --all)

TOTAL_GPUS=$((num_gpus * num_nodes))
# ONLY MASTER NODE SHOULD RUN THE EVALUATION, OTHERS SHOULD JUST WAIT
if [ "$MASTER_ADDR" == "$LOCAL_ADDR" ]; then
sleep 15
# Get the job array ID and index
job_array_id=$SLURM_ARRAY_JOB_ID
task_id=$SLURM_ARRAY_TASK_ID
# Wait for all worker nodes to connect
echo "Waiting for all worker nodes to connect..."
for ((i=1; i<num_nodes; i++)); do
while [ ! -f "${LOCKDIR}/worker_connected_node${i}_${full_job_id}.signal" ]; do
sleep 2 # Check every 2 seconds
done
done

# Construct the full job ID
full_job_id="${job_array_id}_${task_id}"
echo "All workers connected. Proceeding with task execution."
for ((i=1; i<num_nodes; i++)); do
rm -f "${LOCKDIR}/worker_connected_node${i}_${full_job_id}.signal"
done

# Print the job array task ID
echo "Running job array task: $full_job_id"
@@ -92,14 +79,29 @@ if [ "$MASTER_ADDR" == "$LOCAL_ADDR" ]; then
--verbosity DEBUG \
--num_fewshot $NUM_FEWSHOT $CHAT_TEMPLATE_ARGS

# Cancel the current job array task
echo "Completed job array task: $full_job_id. Running cancel command."
scancel $full_job_id
# Release the lock after completion
echo "Completed job array task: $full_job_id. Running cancel command."
rm -f $HEAD_STARTED_FILE
scancel $full_job_id
else
set +x
echo "Waiting for Ray head to start"

# Wait for the signal file from the master node
while [ ! -f "$HEAD_STARTED_FILE" ]; do
sleep 2 # Check every 2 seconds
done

echo "Connecting from $LOCAL_ADDR to Ray head at $MASTER_ADDR:$MASTER_PORT"
ray start --address $MASTER_ADDR:$MASTER_PORT

# Signal that this worker has connected
touch "${LOCKDIR}/worker_connected_node${SLURM_NODEID}_${full_job_id}.signal"

echo "Waiting for master node to finish"
while [ ! -f "$OUTPUT_PATH" ]; do
sleep 10
sleep 10 # Check every 10 seconds
done

echo "Master node finished, exiting"
fi
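
Distilled, the new protocol replaces the old fixed sleep 10/sleep 15 startup delays with a file-based handshake; the sleeps that remain are only polling intervals. A minimal sketch, assuming $LOCKDIR lives on a filesystem visible to all nodes and that the remaining variables are set as in the script above:

#!/usr/bin/bash
# Minimal sketch of the handshake, not the full worker script. Assumes
# LOCKDIR, full_job_id, num_nodes, MASTER_ADDR, MASTER_PORT, LOCAL_ADDR,
# SLURM_NODEID, and OUTPUT_PATH are set as in eval_L_vllm_worker.sh.
HEAD_STARTED_FILE="${LOCKDIR}/ray_head_started_${full_job_id}.signal"
if [ "$MASTER_ADDR" == "$LOCAL_ADDR" ]; then
    ray start --include-dashboard=True --head --port $MASTER_PORT
    touch "$HEAD_STARTED_FILE"                    # 1. head announces it is up
    for ((i=1; i<num_nodes; i++)); do             # 2. head blocks until every
        while [ ! -f "${LOCKDIR}/worker_connected_node${i}_${full_job_id}.signal" ]; do
            sleep 2                               #    worker has attached
        done
    done
    # 3. all workers attached: run lm_eval, then remove the signals and scancel
else
    while [ ! -f "$HEAD_STARTED_FILE" ]; do       # worker polls for the head
        sleep 2
    done
    ray start --address $MASTER_ADDR:$MASTER_PORT
    touch "${LOCKDIR}/worker_connected_node${SLURM_NODEID}_${full_job_id}.signal"
    while [ ! -f "$OUTPUT_PATH" ]; do             # idle until results appear
        sleep 10
    done
fi

One design note: indexing the worker signals by 1..num_nodes-1 relies on the master running as SLURM_NODEID 0; if SLURM assigned the head a different node ID, the head would wait forever for a signal that never appears.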
13 changes: 9 additions & 4 deletions jobs/scripts/models/eval_M_accelerate.sh
@@ -12,6 +12,13 @@ export NUMEXPR_MAX_THREADS=$(nproc --all)

set -x

# Check if OUTPUT_PATH exists
if [ -e "$OUTPUT_PATH" ]; then
echo "Output path $OUTPUT_PATH already exists. Exiting."
exit 0
fi


# Normalize log probs based on sumlogp argument
if [ "$SUMLOGP" = "no" ]; then
NORMALIZE_LOG_PROBS="True"
@@ -33,14 +40,12 @@ if [ "$TRUNCATE_STRATEGY" != "none" ]; then
TRUNCATE_STRATEGY_ARG=",truncate_strategy=$TRUNCATE_STRATEGY"
fi

#TODO: Some tasks fail with batch_size auto, keeping 2 for now
#TODO: Phi3 requires specifically setting attn_implementation="flash_attention_2"
#TODO: Generative tasks require max_length 2048-256=1792, as they have a default generation limit of 256. The MPT model fails with 2048.

$PYTHON \
-m lm_eval --model hf \
--model_args pretrained=$MODEL_NAME,dtype=bfloat16,parallelize=True,max_length=2048,truncation=True,normalize_log_probs=$NORMALIZE_LOG_PROBS,trust_remote_code=True$TRUNCATE_STRATEGY_ARG \
--tasks "$TASK" \
--batch_size 8 \
--output_path "$OUTPUT_PATH" \
--log_samples \
--verbosity DEBUG \
6 changes: 6 additions & 0 deletions jobs/scripts/models/eval_S_accelerate.sh
@@ -12,6 +12,12 @@ export NUMEXPR_MAX_THREADS=$(nproc --all)

set -x

# Check if OUTPUT_PATH exists
if [ -e "$OUTPUT_PATH" ]; then
echo "Output path $OUTPUT_PATH already exists. Exiting."
exit 0
fi

# Normalize log probs based on sumlogp argument
if [ "$SUMLOGP" = "no" ]; then
NORMALIZE_LOG_PROBS="True"
14 changes: 14 additions & 0 deletions jobs/scripts/submit/models_L/eval_llama31_70B_smartt.sh
@@ -0,0 +1,14 @@
#!/usr/bin/bash
#SBATCH --job-name bcm_llama31
#SBATCH --account OPEN-30-35
#SBATCH --partition qgpu
#SBATCH --time 48:00:00
#SBATCH --gpus-per-node 8
#SBATCH --nodes 2

NAME='70bllama31_lm'
MODEL_NAME='meta-llama/Meta-Llama-3.1-70B'



source ./jobs/scripts/submit/fire/fire_L_smartt.sh
14 changes: 14 additions & 0 deletions jobs/scripts/submit/models_L/eval_mixtral_8x22B_instruct_smartt.sh
@@ -0,0 +1,14 @@
#!/usr/bin/bash
#SBATCH --job-name bcm_8x22mix
#SBATCH --account OPEN-30-35
#SBATCH --partition qgpu
#SBATCH --time 48:00:00
#SBATCH --gpus-per-node 8
#SBATCH --nodes 2

NAME='8x22Mixtral_instruct'
MODEL_NAME='mistralai/Mixtral-8x22B-Instruct-v0.1'



source ./jobs/scripts/submit/fire/fire_L_smartt.sh
13 changes: 13 additions & 0 deletions jobs/scripts/submit/models_M/eval_aya23_35B_instruct_smartt.sh
@@ -0,0 +1,13 @@
#!/usr/bin/bash
#SBATCH --job-name bcm_aya35b
#SBATCH --account OPEN-30-35
#SBATCH --partition qgpu
#SBATCH --time 48:00:00
#SBATCH --gpus-per-node 4
#SBATCH --nodes 1

NAME='aya23_35b_instruct'
MODEL_NAME='CohereForAI/aya-23-35B'
BACKEND='huggingface'

source ./jobs/scripts/submit/fire/fire_M_smartt.sh
13 changes: 13 additions & 0 deletions jobs/scripts/submit/models_M/eval_mistral_nemo_base_smartt.sh
@@ -0,0 +1,13 @@
#!/usr/bin/bash
#SBATCH --job-name bcm_mnemo
#SBATCH --account OPEN-30-35
#SBATCH --partition qgpu
#SBATCH --time 48:00:00
#SBATCH --gpus-per-node 2
#SBATCH --nodes 1

NAME='mistral_nemo_base'
MODEL_NAME='mistralai/Mistral-Nemo-Base-2407'
BACKEND='huggingface'

source ./jobs/scripts/submit/fire/fire_M_smartt.sh
13 changes: 13 additions & 0 deletions jobs/scripts/submit/models_M/eval_mixtral8x7_base_smartt.sh
@@ -0,0 +1,13 @@
#!/usr/bin/bash
#SBATCH --job-name bcm_mmix
#SBATCH --account OPEN-30-35
#SBATCH --partition qgpu
#SBATCH --time 48:00:00
#SBATCH --gpus-per-node 8
#SBATCH --nodes 1

NAME='mixtralM_base'
MODEL_NAME='mistralai/Mixtral-8x7B-v0.1'
BACKEND='vllm'

source ./jobs/scripts/submit/fire/fire_M_smartt.sh
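
The new M-size submit scripts additionally export BACKEND ('huggingface' for Aya 23 and Mistral Nemo, 'vllm' for Mixtral 8x7B), presumably so fire_M_smartt.sh can pick the matching runner. fire_M_smartt.sh is not part of this diff, so the dispatch below is only an assumed illustration, and the vllm runner filename is hypothetical:

#!/usr/bin/bash
# Assumed dispatch; the real logic lives in fire_M_smartt.sh, which this
# commit does not show. Defaulting to huggingface mirrors eval_M_accelerate.sh
# using the hf backend.
case "${BACKEND:-huggingface}" in
    huggingface) RUNNER=jobs/scripts/models/eval_M_accelerate.sh ;;
    vllm)        RUNNER=jobs/scripts/models/eval_M_vllm.sh ;;  # hypothetical name
    *) echo "Unknown BACKEND: $BACKEND" >&2; exit 1 ;;
esac
echo "Would submit runner $RUNNER for model $MODEL_NAME"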
12 changes: 12 additions & 0 deletions jobs/scripts/submit/models_S/eval_cstllama_smartt.sh
@@ -0,0 +1,12 @@
#!/usr/bin/bash
#SBATCH --job-name bcm_tllama
#SBATCH --account OPEN-30-35
#SBATCH --partition qgpu
#SBATCH --time 16:00:00
#SBATCH --gpus-per-node 1
#SBATCH --nodes 1

NAME='cstllama'
MODEL_NAME='BUT-FIT/CSTinyLlama-1.2B'

source ./jobs/scripts/submit/fire/fire_S_smartt.sh
12 changes: 12 additions & 0 deletions jobs/scripts/submit/models_S/eval_gemma2-2b_instruct_smartt.sh
@@ -0,0 +1,12 @@
#!/usr/bin/bash
#SBATCH --job-name bcm_g2b
#SBATCH --account OPEN-30-35
#SBATCH --partition qgpu
#SBATCH --time 16:00:00
#SBATCH --gpus-per-node 1
#SBATCH --nodes 1

NAME='gemma2-2b-instruct'
MODEL_NAME='google/gemma-2-2b-it'

source ./jobs/scripts/submit/fire/fire_S_smartt.sh
12 changes: 12 additions & 0 deletions jobs/scripts/submit/models_S/eval_gemma2-2b_smartt.sh
@@ -0,0 +1,12 @@
#!/usr/bin/bash
#SBATCH --job-name bcm_g2b
#SBATCH --account OPEN-30-35
#SBATCH --partition qgpu
#SBATCH --time 16:00:00
#SBATCH --gpus-per-node 1
#SBATCH --nodes 1

NAME='gemma2-2b'
MODEL_NAME='google/gemma-2-2b'

source ./jobs/scripts/submit/fire/fire_S_smartt.sh
13 changes: 13 additions & 0 deletions jobs/scripts/submit/models_S/eval_gemma2_smartt.sh
@@ -0,0 +1,13 @@
#!/usr/bin/bash
#SBATCH --job-name bcm_gemma2
#SBATCH --account OPEN-30-35
#SBATCH --partition qgpu
#SBATCH --time 16:00:00
#SBATCH --gpus-per-node 1
#SBATCH --nodes 1

NAME='gemma2_lm'
MODEL_NAME='google/gemma-2-9b'

source ./jobs/scripts/submit/fire/fire_S_smartt.sh

14 changes: 14 additions & 0 deletions jobs/scripts/submit/models_S/eval_hermes_llama31_instruct_smartt.sh
@@ -0,0 +1,14 @@
#!/usr/bin/bash
#SBATCH --job-name bcm_hermes31
#SBATCH --account OPEN-30-35
#SBATCH --partition qgpu
#SBATCH --time 16:00:00
#SBATCH --gpus-per-node 1
#SBATCH --nodes 1

NAME='hermes_llama31_instruct'
MODEL_NAME='NousResearch/Hermes-3-Llama-3.1-8B'

source ./jobs/scripts/submit/fire/fire_S_smartt.sh


12 changes: 12 additions & 0 deletions jobs/scripts/submit/models_S/eval_llama31_smartt.sh
@@ -0,0 +1,12 @@
#!/usr/bin/bash
#SBATCH --job-name bcm_llama31
#SBATCH --account OPEN-30-35
#SBATCH --partition qgpu
#SBATCH --time 16:00:00
#SBATCH --gpus-per-node 1
#SBATCH --nodes 1

NAME='llama31_lm'
MODEL_NAME='meta-llama/Meta-Llama-3.1-8B'

source ./jobs/scripts/submit/fire/fire_S_smartt.sh
