refine docs and fix link #38

Merged
merged 21 commits · Aug 28, 2024
4 changes: 2 additions & 2 deletions README_CN.md
@@ -1,5 +1,5 @@

[![docs](https://img.shields.io/badge/docs-latest-brightgreen.svg)](https://chatlearn.readthedocs.io/zh/latest/)
[![docs](https://img.shields.io/badge/docs-latest-brightgreen.svg)](https://chatlearn.readthedocs.io/zh-cn/latest/)
[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://github.com/alibaba/ChatLearn/blob/main/LICENSE)

<p align="center">
@@ -35,7 +35,7 @@ ChatLearn's features are as follows:

# Quick Start

Please refer to the [Documentation](https://chatlearn.readthedocs.io/zh/latest/) for a quick start.
Please refer to the [Documentation](https://chatlearn.readthedocs.io/zh-cn/latest/) for a quick start.

1. [Environment and Code Setup](docs/zh/installation.md)
2. [End-to-End Training Tutorial with LLaMA/LLaMA2 Models](docs/zh/tutorial/tutorial_llama2.md)
6 changes: 3 additions & 3 deletions docs/en/chatlearn.md
@@ -1,6 +1,6 @@
# ChatLearn
# ChatLearn: A flexible and efficient training framework for large-scale alignment

ChatLearn is an efficient training framework that supports large-scale alignment. It aims to provide a flexible and user-friendly platform for alignment training based on Large Language Models (LLMs) such as ChatGPT.
ChatLearn aims to provide a flexible and user-friendly platform for alignment training based on Large Language Models (LLMs) such as ChatGPT.

## Introduction

@@ -42,7 +42,7 @@ By providing a comprehensive and efficient framework, ChatLearn empowers researc

## Quick Start

Please refer to the [Documentation](https://chatlearn.readthedocs.io/zh/latest/) for a quick start guide.
Please refer to the [Documentation](https://chatlearn.readthedocs.io/en/latest/) for a quick start guide.

1. [Environment and Code Setup](installation.md)
2. [End-to-End Training Tutorial with Llama/Llama2 Model](tutorial/tutorial_llama2.md)
32 changes: 17 additions & 15 deletions docs/en/conf.py
@@ -25,21 +25,23 @@
sys.path.insert(0, os.path.abspath("../../"))

from unittest import mock

# Use unittest.mock to mock modules
imports = ['torch', 'cupy.cuda', 'pynvml', 'ray', 'ray.util', 'ray.util.collective.collective_group.nccl_util',
           'ray.util.scheduling_strategies', 'pynvml', 'ray.util.state', 'ray._private', 'ray._private.utils',
           'ray._private.ray_logging', 'ray._private.worker', 'ray.util.collective', 'ray.util.collective.collective_group',
           "ray.util.collective.collective_group.base_collective_group", 'ray.util.collective.collective_group.nccl_collective_group',
           'torch.utils.data', 'torch._utils', 'transformers', 'transformers.integrations', 'transformers.trainer', 'deepspeed',
           'deepspeed.ops.adam', 'deepspeed.runtime.zero.partition_parameters', 'torch.distributed', 'torch.nn', 'torch.nn.utils.rnn', 'ray.util.queue',
           'ray.experimental.state.api', 'torch.cuda', 'ray.util.placement_group', "cupy.cuda.nccl", 'tqdm', 'numpy']

for key in imports:
    sys.modules[key] = mock.MagicMock()

import chatlearn
from chatlearn.utils import arguments
try:
    import chatlearn
    from chatlearn.utils import arguments
except ImportError:
    imports = ['torch', 'cupy.cuda', 'pynvml', 'ray', 'ray.util', 'ray.util.collective.collective_group.nccl_util',
               'ray.util.scheduling_strategies', 'pynvml', 'ray.util.state', 'ray._private', 'ray._private.utils',
               'ray._private.ray_logging', 'ray._private.worker', 'ray.util.collective', 'ray.util.collective.collective_group',
               "ray.util.collective.collective_group.base_collective_group", 'ray.util.collective.collective_group.nccl_collective_group',
               'torch.utils.data', 'torch._utils', 'transformers', 'transformers.integrations', 'transformers.trainer', 'deepspeed',
               'deepspeed.ops.adam', 'deepspeed.runtime.zero.partition_parameters', 'torch.distributed', 'torch.nn', 'torch.nn.utils.rnn', 'ray.util.queue',
               'ray.experimental.state.api', 'torch.cuda', 'ray.util.placement_group', "cupy.cuda.nccl", 'tqdm', 'numpy']

    for key in imports:
        sys.modules[key] = mock.MagicMock()

    import chatlearn
    from chatlearn.utils import arguments

from importlib.machinery import SourceFileLoader
version = SourceFileLoader("chatlearn.version", "../../chatlearn/utils/version.py") \
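The net effect of this change: previously conf.py always mocked the heavy runtime dependencies; now it first tries the real imports and only falls back to mocks when they are unavailable. A minimal standalone sketch of the pattern (module list abbreviated here; the full list appears in the diff above):

```python
import sys
from unittest import mock

try:
    # Preferred path: the real packages are installed, so autodoc sees real APIs.
    import chatlearn
    from chatlearn.utils import arguments
except ImportError:
    # Docs-build host without the GPU stack: mock the heavy modules and retry.
    # Abbreviated list for illustration; the real conf.py mocks many more modules.
    for name in ['torch', 'ray', 'deepspeed', 'transformers']:
        sys.modules[name] = mock.MagicMock()
    import chatlearn
    from chatlearn.utils import arguments
```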
17 changes: 1 addition & 16 deletions docs/en/index.rst
@@ -4,22 +4,17 @@ ChatLearn Documentation

.. toctree::
:maxdepth: 1
:caption: ChatLearn: An Efficient Training Framework for Large-Scale Alignment
:caption: Introduction

chatlearn

|
|

.. toctree::
:maxdepth: 1
:caption: Installation

installation

|
|

.. toctree::
:maxdepth: 1
:caption: Tutorial
@@ -34,10 +34,6 @@
tutorial/ems
tutorial/profile

|
|


.. toctree::
:maxdepth: 1
:caption: Programming
@@ -55,14 +46,8 @@

api/index

|
|

.. toctree::
:maxdepth: 1
:caption: FAQ

faq

|
|
12 changes: 6 additions & 6 deletions docs/en/programming/vllm.md
@@ -6,7 +6,7 @@ For now, we enable vLLM to accelerate policy generation.

## Model Definition

Similar to inheriting `MegatronModule` for implementing the [PolicyInference Model](../../../examples/megatron/models/old_policy_inference.py), the vLLM backend can be enabled by inheriting the `VLLMModule` class and implementing the following key methods:
Similar to inheriting `MegatronModule` for implementing the [PolicyInference Model](https://github.com/alibaba/ChatLearn/blob/main/examples/megatron/models/old_policy_inference.py), the vLLM backend can be enabled by inheriting the `VLLMModule` class and implementing the following key methods:
- model_provider: model definition function.
- setup: call `model_provider` to define the model; optionally, call `load_checkpoint` or perform other setup.
- build_dataset: preprocess the train/eval dataset with the vLLM tokenizer.
@@ -48,9 +48,9 @@ class VLLMPolicyInference(VLLMModule):
pass
```

You can refer to [vllm_policy_inference.py](../../../examples/megatron/models/vllm_policy_inference.py), in which build_dataset/_add_request/forward_step/decode_internal are clarified as follows:
You can refer to [vllm_policy_inference.py](https://github.com/alibaba/ChatLearn/blob/main/examples/megatron/models/vllm_policy_inference.py), in which build_dataset/_add_request/forward_step/decode_internal are clarified as follows:
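
For orientation, a skeleton of such a subclass might look like the sketch below. Method names follow the four hooks described here; the exact signatures are assumptions for illustration, not the verbatim ChatLearn API:

```python
from chatlearn import VLLMModule  # assumed import path for this sketch

# Hypothetical outline only; see vllm_policy_inference.py for the real implementation.
class VLLMPolicyInference(VLLMModule):
    def build_dataset(self, prompts, is_eval=False):
        """Tokenize prompts and return a dataset of prompt_ids plus prompt strings."""

    def _add_request(self, data):
        """Enqueue tokenized prompts into the vLLM engine's scheduler."""

    def forward_step(self, data, iteration=0):
        """Generate for the scheduled requests and return batched RequestOutputs."""

    def decode_internal(self, batched_outputs):
        """Convert List[RequestOutput] into the fields needed by downstream training."""
```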

- build_dataset: Using the `tokenizer`, you only need to return prompt_ids and the prompt string. In `build_dataset`, the [VLLMPromptPipeline](../../../examples/megatron/data/prompt_dataset.py#141) is defined as follows:
- build_dataset: Using the `tokenizer`, you only need to return prompt_ids and the prompt string. In `build_dataset`, the [VLLMPromptPipeline](https://github.com/alibaba/ChatLearn/blob/main/examples/megatron/data/prompt_dataset.py#141) is defined as follows:
```python
class VLLMPromptPipeline(PromptPipeline):
    def __init__(self, prompts: List[str], max_prompt_length: int, tokenizer=None):
@@ -108,7 +108,7 @@ class VLLMPolicyInference(VLLMModule):
        return self._forward_step(data, iteration, eval_mode=False)
```

- decode_internal: Refer to [examples](../../../examples/megatron/models/vllm_policy_inference.py#L119) for more details. The param `batched_outputs` is a List[RequestOutput], in which [RequestOutput](https://github.com/vllm-project/vllm/blob/v0.5.1/vllm/outputs.py#L67) includes the following key attributes:
- decode_internal: Refer to [examples](https://github.com/alibaba/ChatLearn/blob/main/examples/megatron/models/vllm_policy_inference.py#L119) for more details. The param `batched_outputs` is a List[RequestOutput], in which [RequestOutput](https://github.com/vllm-project/vllm/blob/v0.5.1/vllm/outputs.py#L67) includes the following key attributes:

| Attribute | Type | Comment |
|:------:|:-----:|:-----:|
@@ -140,7 +140,7 @@ policy:
...
```

Or you can refer to [llama2 model yaml](../../../examples/megatron/configs/llama2/vllm_rlhf.yaml).
Or you can refer to [llama2 model yaml](https://github.com/alibaba/ChatLearn/blob/main/examples/megatron/configs/llama2/vllm_rlhf.yaml).

## Hyperparameter Configuration YAML

@@ -186,4 +186,4 @@ Hyperparameters for vLLM can be divided into 5 parts:
- Others: `includes` specifies model structure.


You can refer to [vLLM Hyperparameter Configuration](../../../examples/megatron/configs/llama2/vllm_policy_inference.yaml) for details.
You can refer to [vLLM Hyperparameter Configuration](https://github.com/alibaba/ChatLearn/blob/main/examples/megatron/configs/llama2/vllm_policy_inference.yaml) for details.
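
To make the `decode_internal` contract above concrete, here is a hedged sketch of iterating over vLLM `RequestOutput` objects. The attribute names (`prompt_token_ids`, `outputs`, `token_ids`) come from vLLM v0.5.1; the output dictionary layout is an assumption for illustration, not ChatLearn's exact format:

```python
from typing import Dict, List

def decode_batched_outputs(batched_outputs: List["RequestOutput"]) -> Dict[str, list]:
    """Illustrative only: flatten vLLM outputs into prompt/response token id lists."""
    prompt_ids, response_ids = [], []
    for request_output in batched_outputs:
        # Token ids of the original prompt for this request.
        prompt_ids.append(list(request_output.prompt_token_ids))
        # Each request may carry several sampled completions; take the first.
        response_ids.append(list(request_output.outputs[0].token_ids))
    return {"prompt_token_ids": prompt_ids, "response_token_ids": response_ids}
```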
2 changes: 1 addition & 1 deletion docs/en/tutorial/ems.md
@@ -26,4 +26,4 @@ Alternatively, it can also be configured in the training script using environmen
- PPO policy model: `export free_memory_ppo_policy=True`
- PPO value model: `export free_memory_ppo_value=True`

A complete example can be found in the [llama2 configuration](../../../examples/megatron/configs/llama2/rlhf.yaml).
A complete example can be found in the [llama2 configuration](https://github.com/alibaba/ChatLearn/blob/main/examples/megatron/configs/llama2/rlhf.yaml).
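
As an illustration of how boolean environment flags like these are typically consumed, a hedged sketch (not ChatLearn's actual parsing code, which lives in its argument handling):

```python
import os

def free_memory_enabled(model_name: str) -> bool:
    """e.g. model_name='ppo_policy' reads the env var 'free_memory_ppo_policy'."""
    return os.environ.get(f"free_memory_{model_name}", "False").lower() == "true"

# Usage: free_memory_enabled("ppo_policy") returns True after
# `export free_memory_ppo_policy=True` in the training script.
```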
3 changes: 0 additions & 3 deletions docs/en/tutorial/run.md
@@ -15,9 +15,6 @@ Select the job type as `PyTorch` and paste the command into the `Execution Comma



For RLHF, DPO, OnlineDPO, and GRPO training tasks, you need to set the advanced setting `customPortList=30000-30050,createSvcForAllWorkers=true`.


## Non-PAI-DLC environment

If you want to submit distributed training in a non-PAI-DLC environment,
26 changes: 13 additions & 13 deletions docs/en/tutorial/tutorial_llama2.md
@@ -1,6 +1,6 @@
# End-to-end Training Tutorial with Llama Model

This document provides instructions for end-to-end training using the ChatLearn, Megatron-LM framework, and the Llama/Llama2 model. ChatLearn supports three training policies as follows:
This document provides instructions for end-to-end training with the ChatLearn, Megatron-LM, and vLLM frameworks and the Llama/Llama2 models. ChatLearn supports three training policies as follows:
1. RLHF (Reinforcement Learning from Human Feedback): includes three stages of training: SFT, Reward, and RLHF training.
2. Direct Preference Optimization (DPO): includes two stages of training: SFT and DPO training.
3. OnlineDPO/GRPO: falls between RLHF and DPO and includes three stages of training: SFT, Reward, and DPO training.
@@ -59,24 +59,25 @@ bash scripts/convert_hf_to_megatron.sh
### Start SFT Training

The script below is an example of SFT training. The `DATASET_PATH` is the path to the SFT training set, such as `$DATASET_ROOT/sft/train.jsonl`.
The `MODEL_SIZE` is an environment variable specified in the script to indicate the size of the model, which can be `llama2-7B`, `llama2-13B`, or `llama2-70B`.
The `model_size` is an environment variable specified in the script to indicate the size of the model, which can be `llama2-7B`, `llama2-13B`, or `llama2-70B`.

```bash
export CHATLEARN=path-to-chatlearn
export MEGATRON=path-to-megatron-lm
cd ${CHATLEARN}/examples/megatron/

MODEL_SIZE=$MODEL_SIZE \
export model_size=llama2-7B

LOAD_PATH=$MEGATRON_LLAMA2_CKPT_PATH \
TOKENIZER_MODEL=$LLAMA2_TOKENIZER_MODEL \
DATASET_PATH=$DATASET_ROOT/sft/ \
bash scripts/train_sft_llama.sh
```

The training logs and the completed models will be stored in `${CHATLEARN}/output/sft` by default.
For specific definitions, please refer to the script `${CHATLEARN}/2024-08-21/rlhf/examples/megatron/scripts/train_sft_llama.sh`.
For specific definitions, please refer to the script `${CHATLEARN}/examples/megatron/scripts/train_sft_llama.sh`.

In our training script, the resource requirements (assuming the resources are A100-80GB/A800-80GB/H800-80GB GPUs) are as follows:
In our training script, the resource requirements (assuming the resources are A100-80GB/A800-80GB GPUs) are as follows:
1. llama2-7B SFT: 8 GPUs
2. llama2-13B SFT: 8 GPUs
3. llama2-70B SFT: 4*8 GPUs
@@ -97,7 +98,7 @@ Based on InstructGPT[1], the Reward model training is initialized with the SFT m

```bash
export CHATLEARN=path-to-chatlearn
export MEGATRON=path-to-megatron-lm-extension
export MEGATRON=path-to-megatron-lm
cd ${CHATLEARN}/examples/megatron/

LOAD_PATH=path-to-sft-ckpt \
@@ -128,7 +129,7 @@ In this example, the user needs to set `POLICY_LOAD` to the checkpoint path gene
The Policy and Reference models will be initialized with the SFT checkpoint.
`REWARD_LOAD` should be set to the checkpoint path generated by the Reward training, and the user can specify the iteration number for the loaded checkpoint.
The Reward and Value models will be initialized with the weights of the Reward model.
`TOKENIZER_MODEL` should be set to the folder path where the `tokenizer.model` for LlamaTokenizer is located.
`TOKENIZER_MODEL` should be set to the folder path where the `tokenizer.model` for Llama2Tokenizer is located.

```bash
export CHATLEARN=path-to-chatlearn
@@ -140,7 +141,7 @@ cd ${CHATLEARN}/examples/megatron/
export model_size=llama2-7B

POLICY_LOAD=path-to-sft-ckpt \
REWARD_LOAD=path-to-trained-rm-checkpoint \
REWARD_LOAD=path-to-rm-ckpt \
REWARD_LOAD_ITERATION=1000 \
TOKENIZER_MODEL=$LLAMA2_TOKENIZER_MODEL \
bash scripts/train_rlhf_llama.sh
@@ -160,7 +161,7 @@ cd ${CHATLEARN}/examples/megatron/
export model_size=llama2-7B

POLICY_LOAD=path-to-sft-ckpt \
REWARD_LOAD=path-to-trained-rm-checkpoint \
REWARD_LOAD=path-to-rm-ckpt \
REWARD_LOAD_ITERATION=1000 \
TOKENIZER_MODEL=$LLAMA2_TOKENIZER_MODEL \
bash scripts/train_online_dpo_llama.sh
@@ -172,7 +173,7 @@
Here is a training script for Llama2-7B Policy and 7B Reward models.
In this example, the user needs to set `POLICY_LOAD` to the checkpoint path generated by SFT.
The Policy and Reference models will be initialized with the SFT checkpoint.
`TOKENIZER_MODEL` should be set to the folder path where the `tokenizer.model` for LlamaTokenizer is located.
`TOKENIZER_MODEL` should be set to the folder path where the `tokenizer.model` for Llama2Tokenizer is located.

```bash
export CHATLEARN=path-to-chatlearn
@@ -201,7 +202,7 @@ cd ${CHATLEARN}/examples/megatron/
export model_size=llama2-7B

POLICY_LOAD=path-to-sft-ckpt \
REWARD_LOAD=path-to-trained-rm-checkpoint \
REWARD_LOAD=path-to-rm-ckpt \
REWARD_LOAD_ITERATION=1000 \
TOKENIZER_MODEL=$LLAMA2_TOKENIZER_MODEL \
bash scripts/train_grpo_math_llama.sh
@@ -212,15 +213,14 @@ bash scripts/train_grpo_math_llama.sh
If you need to train a llama2-13B / llama2-70B model, simply replace `export model_size=llama2-7B` with `export model_size=llama2-13B` / `export model_size=llama2-70B`.
You can also modify the model configuration and other parameters according to your needs.

In our training script, the resource requirements (assuming the resources are A100-80GB / A800-80GB / H800-80GB GPUs) are as follows:
In our training script, the resource requirements (assuming the resources are A100-80GB / A800-80GB GPUs) are as follows:

1. llama2-7B RLHF: 8 GPUs
2. llama2-13B RLHF: 2*8 GPUs
3. llama2-70B RLHF: 4*8 GPUs

For the environment variables and configurations required for distributed execution, please refer to [Distributed Execution](run.md).

Note that for RLHF tasks, if you are running on PAI DLC, you need to fill in the advanced configuration `customPortList=30000-30050,createSvcForAllWorkers=true`.

### Evaluation

6 changes: 2 additions & 4 deletions docs/zh/chatlearn.md
@@ -1,6 +1,4 @@
# ChatLearn

ChatLearn is a flexible, easy-to-use, and efficient training framework for large-scale alignment.
# ChatLearn: A flexible, easy-to-use, and efficient training framework for large-scale alignment

## Overview

@@ -38,7 +38,7 @@ ChatGPT is developed by OpenAI based on a Large Language Model (L

## Quick Start

Please refer to the [Documentation](https://chatlearn.readthedocs.io/zh/latest/) for a quick start.
Please refer to the [Documentation](https://chatlearn.readthedocs.io/zh-cn/latest/) for a quick start.

1. [Environment and Code Setup](installation.md)
2. [End-to-End Training Tutorial with Llama/Llama2 Models](tutorial/tutorial_llama2.md)
30 changes: 17 additions & 13 deletions docs/zh/conf.py
@@ -27,19 +27,23 @@
from unittest import mock

# Use unittest.mock to mock modules
imports = ['torch', 'cupy.cuda', 'pynvml', 'ray', 'ray.util', 'ray.util.collective.collective_group.nccl_util',
           'ray.util.scheduling_strategies', 'pynvml', 'ray.util.state', 'ray._private', 'ray._private.utils',
           'ray._private.ray_logging', 'ray._private.worker', 'ray.util.collective', 'ray.util.collective.collective_group',
           "ray.util.collective.collective_group.base_collective_group", 'ray.util.collective.collective_group.nccl_collective_group',
           'torch.utils.data', 'torch._utils', 'transformers', 'transformers.integrations', 'transformers.trainer', 'deepspeed',
           'deepspeed.ops.adam', 'deepspeed.runtime.zero.partition_parameters', 'torch.distributed', 'torch.nn', 'torch.nn.utils.rnn', 'ray.util.queue',
           'ray.experimental.state.api', 'torch.cuda', 'ray.util.placement_group', "cupy.cuda.nccl", 'tqdm', 'numpy']

for key in imports:
    sys.modules[key] = mock.MagicMock()

import chatlearn
from chatlearn.utils import arguments
try:
    import chatlearn
    from chatlearn.utils import arguments
except ImportError:
    imports = ['torch', 'cupy.cuda', 'pynvml', 'ray', 'ray.util', 'ray.util.collective.collective_group.nccl_util',
               'ray.util.scheduling_strategies', 'pynvml', 'ray.util.state', 'ray._private', 'ray._private.utils',
               'ray._private.ray_logging', 'ray._private.worker', 'ray.util.collective', 'ray.util.collective.collective_group',
               "ray.util.collective.collective_group.base_collective_group", 'ray.util.collective.collective_group.nccl_collective_group',
               'torch.utils.data', 'torch._utils', 'transformers', 'transformers.integrations', 'transformers.trainer', 'deepspeed',
               'deepspeed.ops.adam', 'deepspeed.runtime.zero.partition_parameters', 'torch.distributed', 'torch.nn', 'torch.nn.utils.rnn', 'ray.util.queue',
               'ray.experimental.state.api', 'torch.cuda', 'ray.util.placement_group', "cupy.cuda.nccl", 'tqdm', 'numpy']

    for key in imports:
        sys.modules[key] = mock.MagicMock()

    import chatlearn
    from chatlearn.utils import arguments

from importlib.machinery import SourceFileLoader
version = SourceFileLoader("chatlearn.version", "../../chatlearn/utils/version.py") \