Merge with latest upstream #5

Open · wants to merge 80 commits into base: main

80 commits
27d7865
fix circular dependencies when using Lora (#284)
zhangsheng377 Nov 9, 2023
6824e31
Universal ckp fixes (#276)
mosheisland Nov 9, 2023
4c0bc7f
Enable the combination of sequence length warmup and RoPE (#285)
conglongli Nov 9, 2023
e7f0201
Add the __init__.py for including the vision folder in installation p…
yuanwu2017 Nov 14, 2023
8760390
Change the text name of sample for compatible with Huggingface traine…
yuanwu2017 Nov 14, 2023
15355af
Enable the args.deepspeed_config to use dict type (#290)
yuanwu2017 Nov 14, 2023
155ce98
Add tgs metrics (#286)
CokeDong Nov 14, 2023
37050b8
Fix lm_eval_harness for GPT models (#292)
conglongli Nov 15, 2023
2348eed
universal-ckp: support llama model (#287)
mosheisland Nov 16, 2023
8415d03
fuse q and kve parameters for qga case (#291)
RezaYazdaniAminabadi Nov 21, 2023
b93495a
fix dropout of flash attention (#295)
tohtana Nov 21, 2023
7ca477d
fix typo error (#299)
inkcherry Nov 27, 2023
61d5d61
fix reshape for split qga (#307)
inkcherry Nov 30, 2023
bdef2b0
support huggingface tokenizer (#306)
zhangjian94cn Dec 5, 2023
aad7ad9
add RMSnorm torch fallback path (#312)
inkcherry Dec 6, 2023
b2b8b01
Update Universal Checkpointing README visualization PR (#314)
lekurile Dec 12, 2023
71e8407
Universal Checkpoint for Sequence Parallelism (#305)
samadejacobs Dec 14, 2023
d65921c
Revert "Modify the args_default usage" (#325)
yuanwu2017 Jan 5, 2024
a4f8079
bug fix on args.deepspeed_config_dict (#328)
ys950902 Jan 10, 2024
a3635ad
Add worker option for preprocess_data.py (#339)
xu-song Jan 23, 2024
141bfbe
Align variable before all_reduce with other files (#340)
kamil-kaczor Jan 23, 2024
b09c6a6
use fused_adam in deepspeed (#335)
ys950902 Jan 24, 2024
f9323e3
fix rms norm import on non cuda device (#341)
nrailg Jan 25, 2024
11f2d93
Supervised Fine-tuning for HugginFace pretrained weight. (#318)
inkcherry Jan 26, 2024
89e03fd
Fixed incorrect argument supplied to deepspeed init (#323)
SulRash Jan 29, 2024
6d4c535
Track additional metrics with W&B in `megatron/training.py` (#343)
saforem2 Feb 1, 2024
d47f3cd
Clean up UC scripts and update UC README (#344)
lekurile Feb 16, 2024
3557944
Add TensorBoard analysis script to Universal Checkpointing Example (#…
lekurile Feb 21, 2024
3a30913
Remove duplicate ctx save backward in cross_entropy.py (#347)
Feb 21, 2024
ea82c14
Update Megatron type check (#346)
lekurile Feb 21, 2024
c934137
Track additional metrics with W&B in `megatron/training.py` (#348)
saforem2 Feb 21, 2024
9ba0dcb
Add steps for running TensorBoard analysis in Universal Checkpointing…
lekurile Feb 22, 2024
81d68a3
Support configuration of RoPE theta (#351)
mosheisland Feb 25, 2024
31e2584
Support loading checkpoint specific tag (#352)
mosheisland Feb 26, 2024
a9856ce
Update pretrain_bert.py (#355)
lzzmm Feb 27, 2024
df0e2e4
Support universal checkpoint for GPTModel (#361)
mosheisland Mar 10, 2024
ebe8025
MOE: Support disable top2 2nd expert sampling (#362)
mosheisland Mar 10, 2024
888a63a
fix an issue for DP on Megatron-DeepSpeed (#368)
ys950902 Apr 2, 2024
3c5f475
remove contiguous copy for flash-attn opbuilder (#372)
YizhouZ Apr 2, 2024
bcedecd
Support MoE for GPTModelPipe (#373)
mosheisland Apr 9, 2024
7eb36a1
get distributed backend name via accelerator and check loss_scale bef…
polisettyvarma May 13, 2024
634e37b
Add steps and results for running ZeRO stage 3 withUniversal Checkpoi…
xylian86 Jun 26, 2024
527957e
Add Zero Bubble Pipeline Parallelism H1 Schedule (#396)
nvmdava Jun 27, 2024
f2d7589
Fix ParallelMLP and enable accelerator test (#403)
xinyu-intel Jun 27, 2024
ea4b67a
Fix test_deallocate_output_tensor (#404)
xinyu-intel Jun 27, 2024
08f5a99
Fixed missing BookCorpus dataset. (#407)
costin-eseanu Jul 1, 2024
c3a13be
Set proper arguments when constructing models in unit tests (#408)
xinyu-intel Jul 1, 2024
330f9f2
use split/squeeze instead of slice for performance (#409)
polisettyvarma Jul 8, 2024
af06d14
improve performance by keeping attention_mask on device and run ops f…
polisettyvarma Jul 8, 2024
ec3f1f4
Improve RoPE perf by using cached sin/cos tensors (#410)
polisettyvarma Jul 11, 2024
354e420
Extend test utilities to support more accelerators (#418)
xinyu-intel Jul 12, 2024
73252c0
clear document (#395)
inkcherry Jul 12, 2024
0971e68
add PyTorch profiler support (#414)
polisettyvarma Jul 15, 2024
73029ed
[Wandb] Refine wandb logging function (#416)
billishyahao Jul 16, 2024
fc989b8
add kill switch file support to gracefully exit training at runtime (…
polisettyvarma Jul 17, 2024
7d23e33
add support to run custom Hf tokenizer for training and dataset pre-p…
polisettyvarma Jul 18, 2024
13f2673
improve repeat_kv GQA perf (#419)
polisettyvarma Jul 19, 2024
3af2e25
acquire device when required (#420)
polisettyvarma Jul 19, 2024
08b9376
Add basic compilation test (#426)
loadams Jul 19, 2024
3afd267
Update yml to be valid (#427)
loadams Jul 19, 2024
8822a5c
Update/add GPT/Llama universal checkpointing scripts (#391)
lekurile Jul 29, 2024
1bfc35c
fixing the bug of flash_attn import and the wrong gather index when u…
YJHMITWEB Aug 1, 2024
53b241f
add fused_rms_norm support on XPU device (#431)
ys950902 Aug 4, 2024
61350c5
pass batch_dim_idx to deepspeed sequence parallel distributed attenti…
YJHMITWEB Aug 7, 2024
f132876
[LLaMa] Adding support converting checkpoint from mds to hf (#432)
billishyahao Aug 10, 2024
cdf5194
add device check when import ipex (#436)
ys950902 Aug 14, 2024
b7b2d5e
fix TFLOPs calculation (#371)
polisettyvarma Aug 19, 2024
4f9f1f6
fix nan issue when running megatron-deepspeed (#434)
ys950902 Aug 24, 2024
8e9d973
enable empty cache on XPU device (#438)
ys950902 Aug 26, 2024
543543a
[wandb] disable wandb more gracefully (#422)
billishyahao Aug 27, 2024
1280f59
[Bug] Fix crash when logging optimizer state to tb (#417)
billishyahao Aug 27, 2024
0d6e379
Enable Sequence Parallelism (#429)
polisettyvarma Sep 4, 2024
598c092
grad_wei can't be NoneType when running with DeepSpeed, for zero3 wil…
ys950902 Sep 20, 2024
8be7f48
fix init issue for rms_norm in squence_parallel (#448)
ys950902 Oct 4, 2024
4448492
enable profiler for specific ranks (#451)
ranzhejiang Oct 8, 2024
deb95cd
fix init issue for silently ignoring the deepspeed config (#452)
xylian86 Oct 17, 2024
6acc370
fix moe tflops (#445)
ranzhejiang Oct 18, 2024
676a482
Adding the new feature of FPDT (#441)
YJHMITWEB Dec 5, 2024
c3df187
[tool]GQA convert support (#454)
inkcherry Dec 18, 2024
f4157be
Fix import error in `deepspeed_to_megatron.py` (#455)
hotsuyuki Dec 24, 2024
35 changes: 35 additions & 0 deletions .github/workflows/python.yml
@@ -0,0 +1,35 @@
name: python

on:
  workflow_dispatch:
  pull_request:
    branches:
      '**'
  schedule:
    - cron: "0 0 * * *"

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  unit-tests:
    strategy:
      matrix:
        pyVersion: ["3.7", "3.8", "3.9", "3.10"]
      fail-fast: false

    runs-on: ubuntu-22.04
    container:
      image: deepspeed/gh-builder:py${{ matrix.pyVersion }}

    steps:
      - uses: actions/checkout@v4

      - name: environment
        run: |
          which python
          python --version
      - name: Install Megatron-DeepSpeed
        run: |
          pip3 install .
6 changes: 4 additions & 2 deletions README.md
@@ -131,7 +131,8 @@ python tools/preprocess_data.py \
--output-prefix my-bert \
--vocab-file bert-vocab.txt \
--tokenizer-type BertWordPieceLowerCase \
--split-sentences
--split-sentences \
--workers 5
</pre>

The output will be two files named, in this case, `my-bert_text_sentence.bin` and `my-bert_text_sentence.idx`. The `--data-path` specified in later BERT training is the full path and new filename, but without the file extension.
@@ -150,7 +151,8 @@ python tools/preprocess_data.py \
--dataset-impl mmap \
--tokenizer-type GPT2BPETokenizer \
--merge-file gpt2-merges.txt \
--append-eod
--append-eod \
--workers 5
</pre>

Here the output files are named `my-gpt2_text_document.bin` and `my-gpt2_text_document.idx`. As before, in GPT training, use the longer name without the extension as `--data-path`.
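For orientation (not part of this PR's diff), the preprocessed prefix produced above is what training later consumes through `--data-path`, without the `.bin`/`.idx` extension. A minimal, hypothetical sketch of that usage; the remaining model and optimizer arguments are omitted and the exact flag set depends on the training script:

```
# Hypothetical usage sketch: pass the prefix (no .bin/.idx extension) to training.
DATA_PATH=my-gpt2_text_document        # produced by tools/preprocess_data.py above

python pretrain_gpt.py \
    --data-path ${DATA_PATH} \
    --vocab-file gpt2-vocab.json \
    --merge-file gpt2-merges.txt \
    --tokenizer-type GPT2BPETokenizer
    # remaining model/training arguments omitted for brevity
```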
3 changes: 2 additions & 1 deletion examples_deepspeed/MoE/ds_evalharness.sh
@@ -28,7 +28,7 @@ TASKS="lambada"
VOCAB_FILE=/data/Megatron-LM/data/gpt2-vocab.json
MERGE_FILE=/data/Megatron-LM/data/gpt2-merges.txt

export HF_DATASETS_OFFLINE=1
# export HF_DATASETS_OFFLINE=1

# Dummy arguments to make megatron happy. No need to configure them.
# The reason we don't need to configure them and many other arguments is
@@ -53,6 +53,7 @@ CMD="../../tasks/eval_harness/evaluate.py \
--no-load-rng \
--inference \
--disable-moe-token-dropping \
--tokenizer-type GPT2BPETokenizer \
--adaptive_seq_len\
--eval_fp32\
--task_list $TASKS\
8 changes: 4 additions & 4 deletions examples_deepspeed/MoE/readme_evalharness.md
@@ -11,11 +11,10 @@ This particular setup uses the normal deepspeed checkpoint and requires no conve
On login console with external network

Get lm-eval harness (https://github.com/EleutherAI/lm-evaluation-harness) and `best-download==0.0.7` needed to download some tasks.
Below package version numbers are what we tested that work.
```
(maybe need pip install --upgrade pip)
pip install best-download==0.0.7
pip install lm-eval
(previously we used "pip install git+https://github.com/EleutherAI/lm-evaluation-harness" to install, but later found the command above has less dependency issues)
pip install best-download==0.0.7 lm-eval==0.2.0 datasets==1.15.1 transformers==4.20.1 huggingface-hub==0.8.1
```

2. Pre-download needed datasets
@@ -33,7 +32,8 @@ Then install datasets for the tasks:
```
python ../../tasks/eval_harness/download.py --task_list hellaswag,lambada,triviaqa,webqs,winogrande,piqa,arc_challenge,arc_easy,openbookqa,race,boolq,cb,copa,rte,wic,wsc,multirc,record,anli_r1,anli_r2,anli_r3,wikitext,logiqa,mathqa,mc_taco,mrpc,prost,pubmedqa,qnli,qqp,sciq,sst,wnli
```
and make sure that `export HF_DATASETS_OFFLINE=1`

Previously we set `export HF_DATASETS_OFFLINE=1` to make the dataset offline after the above manual download. But somehow now this could trigger error on some kind of online verification for some of the datasets, so it's recommended to only set offline mode when necessary.
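As one illustrative way to follow that advice (not taken from this PR), the variable can be scoped to a single command instead of being exported for the whole shell session; the invocation below is only a placeholder for the actual evaluation command:

```
# Illustrative: enable the Hugging Face datasets offline switch for one run only,
# rather than exporting it globally in the script.
HF_DATASETS_OFFLINE=1 python ../../tasks/eval_harness/evaluate.py --task_list $TASKS   # remaining arguments as in ds_evalharness.sh
```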

<!-- If there are things like custom tokenizers, pre-download those too, e.g.:

5 changes: 3 additions & 2 deletions examples_deepspeed/compression/ds_evalharness.sh
@@ -1,4 +1,4 @@
# This is an example zero-shot eval script. Please first read the readme_evalharness.md under the same directory.
# This is an example zero-shot eval script. Please first read the readme_evalharness.md under the ../MoE directory.

# CHECKPOINT_PATH=/blob/users/minjiaz/compression_library/checkpoint/125M10L_Compression_Test_INT8_64gpu_lr6e-5_tokens5.25B_nocl_alpha-no_pp/global_step2000/
# CHECKPOINT_PATH=/blob/users/conglli/project/gpt3_with_pile/checkpoint/gpt3-with-pile-0.125B-lr-2.4e-3-minlr-6.0e-5-bs-2048-gpus-64-zero-0-mp-1-pp-1-no_pp-cl-startseqlen-72-step-27638-token-60B/global_step71000/
@@ -31,7 +31,7 @@ TASKS="lambada,wikitext"
VOCAB_FILE=/blob/data/the_pile_public_merged_nopreprocessing/gpt2-vocab.json
MERGE_FILE=/blob/data/the_pile_public_merged_nopreprocessing/gpt2-merges.txt

export HF_DATASETS_OFFLINE=1
# export HF_DATASETS_OFFLINE=1

# Dummy arguments to make megatron happy. No need to configure them.
# The reason we don't need to configure them and many other arguments is
@@ -56,6 +56,7 @@ CMD="../../tasks/eval_harness/evaluate.py \
--no-load-rng \
--inference \
--disable-moe-token-dropping \
--tokenizer-type GPT2BPETokenizer \
--adaptive_seq_len\
--eval_fp32\
--task_list $TASKS\
@@ -0,0 +1,34 @@
{
  "train_batch_size": GBSIZE,
  "train_micro_batch_size_per_gpu": MBSIZE,
  "steps_per_print": LOG_INTERVAL,

  "zero_optimization": {
    "stage": ZERO_STAGE
  },

  "gradient_clipping": 1.0,
  "prescale_gradients": PRESCALE_GRAD,

  "fp16": {
    "enabled": true,
    "loss_scale": 0,
    "loss_scale_window": 500,
    "hysteresis": 2,
    "min_loss_scale": 1,
    "initial_scale_power": 11
  },

  "wall_clock_breakdown" : false,
  "curriculum_learning": {
    "enabled": true,
    "curriculum_type": "seqlen",
    "min_difficulty": CONFIG_CL_MIN,
    "max_difficulty": CONFIG_CL_MAX,
    "schedule_type": "fixed_linear",
    "schedule_config": {
      "total_curriculum_step": CONFIG_CL_DURATION,
      "difficulty_step": 8
    }
  }
}
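The uppercase tokens (GBSIZE, MBSIZE, LOG_INTERVAL, ZERO_STAGE, PRESCALE_GRAD, CONFIG_CL_*) are template placeholders, so this JSON is not valid until a launcher script substitutes concrete values. A hedged sketch of such a substitution, with file names and values chosen purely for illustration:

```
# Illustrative only: fill the placeholders to produce a config DeepSpeed can parse.
TEMPLATE=ds_config_gpt_TEMPLATE.json    # hypothetical name for the template above
CONFIG=ds_config_gpt.json

sed -e "s/GBSIZE/256/" \
    -e "s/MBSIZE/4/" \
    -e "s/LOG_INTERVAL/10/" \
    -e "s/ZERO_STAGE/1/" \
    -e "s/PRESCALE_GRAD/true/" \
    -e "s/CONFIG_CL_MIN/80/" \
    -e "s/CONFIG_CL_MAX/2048/" \
    -e "s/CONFIG_CL_DURATION/10000/" \
    ${TEMPLATE} > ${CONFIG}
```

The materialized file is then handed to training via `--deepspeed_config`; note that this PR also allows `args.deepspeed_config` to be passed as a dict (commit 15355af).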