Skip to content

Commit

Permalink
Add finetuning component (#502)
Browse files Browse the repository at this point in the history
Signed-off-by: Xinyu Ye <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: lkk <[email protected]>
Co-authored-by: test <[email protected]>
Co-authored-by: root <[email protected]>
Co-authored-by: Letong Han <[email protected]>
  • Loading branch information
6 people authored Aug 21, 2024
1 parent 40f1463 commit ad0bb7c
Show file tree
Hide file tree
Showing 22 changed files with 1,812 additions and 2 deletions.
6 changes: 4 additions & 2 deletions comps/cores/mega/micro_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

import asyncio
import multiprocessing
from typing import Any, Optional, Type
from typing import Any, List, Optional, Type

from ..proto.docarray import TextDoc
from .constants import ServiceRoleType, ServiceType
Expand Down Expand Up @@ -154,6 +154,7 @@ def register_microservice(
output_datatype: Type[Any] = TextDoc,
provider: Optional[str] = None,
provider_endpoint: Optional[str] = None,
methods: List[str] = ["POST"],
):
def decorator(func):
if name not in opea_microservices:
Expand All @@ -173,7 +174,8 @@ def decorator(func):
provider_endpoint=provider_endpoint,
)
opea_microservices[name] = micro_service
opea_microservices[name].app.router.add_api_route(endpoint, func, methods=["POST"])
opea_microservices[name].app.router.add_api_route(endpoint, func, methods=methods)

return func

return decorator
222 changes: 222 additions & 0 deletions comps/cores/proto/api_protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -539,3 +539,225 @@ def check_requests(request) -> Optional[JSONResponse]:
)

return None


class Hyperparameters(BaseModel):
    """Training hyperparameters accepted by (and reported for) a fine-tuning job.

    Each field takes either an explicit number or the literal string
    ``"auto"``; ``"auto"`` is the default for all three.
    """

    # Number of examples in each batch. A larger batch size means that model
    # parameters are updated less frequently, but with lower variance.
    batch_size: Optional[Union[Literal["auto"], int]] = "auto"

    # Scaling factor for the learning rate. A smaller learning rate may be
    # useful to avoid overfitting.
    learning_rate_multiplier: Optional[Union[Literal["auto"], float]] = "auto"

    # Number of epochs to train the model for; one epoch is one full cycle
    # through the training dataset. "auto" is meant to let the backend pick the
    # number of epochs from the dataset size.
    # NOTE(review): the upstream description advertises a supported range of
    # 1 to 50 epochs, but this model does not validate that range.
    n_epochs: Optional[Union[Literal["auto"], int]] = "auto"


class FineTuningJobWandbIntegration(BaseModel):
    """Weights and Biases (WandB) run settings attached to a fine-tuning job.

    Only ``project`` is required; the remaining fields default to ``None``.
    """

    # Name of the project that the new run will be created under.
    project: str

    # Entity (team or username of the WandB user) to associate with the run.
    # When left unset, the default entity for the registered WandB API key is
    # expected to be used.
    entity: Optional[str] = None

    # Display name for the run. When left unset, the Job ID is expected to be
    # used as the name.
    name: Optional[str] = None

    # Tags to attach to the newly created run; passed through directly to
    # WandB. Per the OpenAI API description this schema mirrors, some default
    # tags are generated by OpenAI: "openai/finetune", "openai/{base-model}",
    # "openai/{ftjob-abcdef}".
    tags: Optional[List[str]] = None


class FineTuningJobWandbIntegrationObject(BaseModel):
    """Envelope describing one integration enabled for a fine-tuning job.

    The only integration type this model admits is ``"wandb"``.
    """

    # Type of the integration being enabled for the fine-tuning job.
    type: Literal["wandb"]

    # Settings for the Weights and Biases integration. The payload names the
    # project that metrics are sent to and may optionally carry an explicit
    # display name, tags, and a default entity (team, username, etc.) for the
    # run.
    wandb: FineTuningJobWandbIntegration


class FineTuningJobsRequest(BaseModel):
    """Request body for creating a fine-tuning job.

    ``model`` and ``training_file`` are required; every other field is
    optional and defaults to ``None``.
    """

    # Ordered by official OpenAI API documentation
    # https://platform.openai.com/docs/api-reference/fine-tuning/create

    # Name of the model to fine-tune.
    model: str

    # ID of an uploaded file that contains training data.
    training_file: str

    # Hyperparameters used for the fine-tuning job.
    hyperparameters: Optional[Hyperparameters] = None

    # String of up to 64 characters that will be added to the fine-tuned model
    # name. NOTE(review): the length limit is not validated by this model.
    suffix: Optional[str] = None

    # ID of an uploaded file that contains validation data.
    validation_file: Optional[str] = None

    # Integrations to enable for the fine-tuning job.
    integrations: Optional[List[FineTuningJobWandbIntegrationObject]] = None

    # Seed controlling the reproducibility of the job. Typed as an integer to
    # agree with `FineTuningJob.seed` and with the OpenAI API; the previous
    # `Optional[str]` annotation made pydantic v2 reject integer seeds.
    seed: Optional[int] = None


class Error(BaseModel):
    """Failure details reported for a fine-tuning job."""

    # Machine-readable error code.
    code: str

    # Human-readable error message.
    message: str

    # Name of the parameter that was invalid, usually `training_file` or
    # `validation_file`. Left as None (null) when the failure was not
    # parameter-specific.
    param: Optional[str] = None


class FineTuningJob(BaseModel):
    """Representation of a fine-tuning job as returned by the service.

    Required fields are ``id``, ``created_at``, ``hyperparameters``, ``model``,
    ``status`` and ``training_file``; the remaining fields default to ``None``
    (or to the fixed ``object`` tag).
    """

    # Ordered by official OpenAI API documentation
    # https://platform.openai.com/docs/api-reference/fine-tuning/object

    # Object identifier, which can be referenced in the API endpoints.
    id: str

    # Unix timestamp (in seconds) for when the fine-tuning job was created.
    created_at: int

    # For jobs that have `failed`, more information on the cause of the failure.
    error: Optional[Error] = None

    # Name of the fine-tuned model that is being created. Null while the
    # fine-tuning job is still running.
    fine_tuned_model: Optional[str] = None

    # Unix timestamp (in seconds) for when the fine-tuning job was finished.
    # Null while the fine-tuning job is still running.
    finished_at: Optional[int] = None

    # Hyperparameters used for the fine-tuning job. See the fine-tuning guide
    # (https://platform.openai.com/docs/guides/fine-tuning) for more details.
    hyperparameters: Hyperparameters

    # Base model that is being fine-tuned.
    model: str

    # Object type, which is always "fine_tuning.job".
    object: Literal["fine_tuning.job"] = "fine_tuning.job"

    # Organization that owns the fine-tuning job.
    organization_id: Optional[str] = None

    # Compiled results file ID(s) for the fine-tuning job; retrievable through
    # the Files API
    # (https://platform.openai.com/docs/api-reference/files/retrieve-contents).
    # Annotated Optional because the default is None: with the previous
    # `List[str] = None` an explicitly supplied None failed validation.
    result_files: Optional[List[str]] = None

    # Current status of the fine-tuning job: `validating_files`, `queued`,
    # `running`, `succeeded`, `failed`, or `cancelled`.
    status: Literal["validating_files", "queued", "running", "succeeded", "failed", "cancelled"]

    # Total number of billable tokens processed by this fine-tuning job. Null
    # while the fine-tuning job is still running.
    trained_tokens: Optional[int] = None

    # File ID used for training; the training data is retrievable through the
    # Files API
    # (https://platform.openai.com/docs/api-reference/files/retrieve-contents).
    training_file: str

    # File ID used for validation; the validation results are retrievable
    # through the Files API
    # (https://platform.openai.com/docs/api-reference/files/retrieve-contents).
    validation_file: Optional[str] = None

    # Integrations enabled for this fine-tuning job.
    integrations: Optional[List[FineTuningJobWandbIntegrationObject]] = None

    # Seed used for the fine-tuning job.
    seed: Optional[int] = None

    # Unix timestamp (in seconds) for when the fine-tuning job is estimated to
    # finish. Null when the fine-tuning job is not running.
    estimated_finish: Optional[int] = None


class FineTuningJobIDRequest(BaseModel):
    """Request body that addresses a single fine-tuning job by its ID."""

    # Ordered by official OpenAI API documentation
    # https://platform.openai.com/docs/api-reference/fine-tuning/retrieve
    # https://platform.openai.com/docs/api-reference/fine-tuning/cancel

    # ID of the fine-tuning job.
    fine_tuning_job_id: str


class FineTuningJobListRequest(BaseModel):
    """Pagination parameters for listing fine-tuning jobs."""

    # Ordered by official OpenAI API documentation
    # https://platform.openai.com/docs/api-reference/fine-tuning/list

    # Identifier for the last job from the previous pagination request.
    after: Optional[str] = None

    # Number of fine-tuning jobs to retrieve (defaults to 20).
    limit: Optional[int] = 20


class FineTuningJobList(BaseModel):
    """One page of fine-tuning jobs returned by the list endpoint."""

    # Ordered by official OpenAI API documentation
    # https://platform.openai.com/docs/api-reference/fine-tuning/list

    # Object type, which is always "list"; it signals that the returned data
    # is a list of fine-tuning jobs.
    object: str = "list"

    # The FineTuningJob objects on this page.
    data: List[FineTuningJob]

    # Whether there are more fine-tuning jobs beyond the current list. If
    # true, additional requests can be made to retrieve more jobs.
    has_more: bool
121 changes: 121 additions & 0 deletions comps/finetuning/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
# LLM Fine-tuning Microservice

The LLM fine-tuning microservice adapts a base model to a specific task or dataset to improve its performance on that task.

# 🚀1. Start Microservice with Python (Option 1)

## 1.1 Install Requirements

```bash
python -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
python -m pip install intel-extension-for-pytorch
python -m pip install oneccl_bind_pt --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/
pip install -r requirements.txt
```

## 1.2 Start Finetuning Service with Python Script

### 1.2.1 Start Ray Cluster

OneCCL and Intel MPI libraries should be dynamically linked in every node before Ray starts:

```bash
source $(python -c "import oneccl_bindings_for_pytorch as torch_ccl; print(torch_ccl.cwd)")/env/setvars.sh
```

Start Ray locally using the following command.

```bash
ray start --head
```

For a multi-node cluster, start additional Ray worker nodes with the command below.

```bash
ray start --address="${head_node_ip}:6379"
```

### 1.2.2 Start Finetuning Service

```bash
export HF_TOKEN=${your_huggingface_token}
python finetuning_service.py
```

# 🚀2. Start Microservice with Docker (Option 2)

## 2.1 Setup on CPU

### 2.1.1 Build Docker Image

Build the Docker image with the command below:

```bash
export HF_TOKEN=${your_huggingface_token}
cd ../../
docker build -t opea/finetuning:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy --build-arg HF_TOKEN=$HF_TOKEN -f comps/finetuning/docker/Dockerfile_cpu .
```

### 2.1.2 Run Docker with CLI

Start the Docker container with the command below:

```bash
docker run -d --name="finetuning-server" -p 8005:8005 --runtime=runc --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/finetuning:latest
```

## 2.2 Setup on Gaudi2

### 2.2.1 Build Docker Image

Build the Docker image with the command below:

```bash
cd ../../
docker build -t opea/finetuning-gaudi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/finetuning/docker/Dockerfile_hpu .
```

### 2.2.2 Run Docker with CLI

Start the Docker container with the command below:

```bash
export HF_TOKEN=${your_huggingface_token}
docker run --runtime=habana -e HABANA_VISIBLE_DEVICES=all -p 8005:8005 -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host -e https_proxy=$https_proxy -e http_proxy=$http_proxy -e no_proxy=$no_proxy -e HF_TOKEN=$HF_TOKEN opea/finetuning-gaudi:latest
```

# 🚀3. Consume Finetuning Service

## 3.1 Create fine-tuning job

Assuming a training file `alpaca_data.json` is available locally (it can be downloaded from [here](https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json)), the following script uploads it and launches a finetuning job using `meta-llama/Llama-2-7b-chat-hf` as the base model:

```bash
# upload a training file
curl http://${your_ip}:8005/v1/finetune/upload_training_files -X POST -H "Content-Type: multipart/form-data" -F "files=@./alpaca_data.json"

# create a finetuning job
curl http://${your_ip}:8005/v1/fine_tuning/jobs \
-X POST \
-H "Content-Type: application/json" \
-d '{
"training_file": "alpaca_data.json",
"model": "meta-llama/Llama-2-7b-chat-hf"
}'

# list finetuning jobs
curl http://${your_ip}:8005/v1/fine_tuning/jobs -X GET

# retrieve one finetuning job
curl http://${your_ip}:8005/v1/fine_tuning/jobs/retrieve -X POST -H "Content-Type: application/json" -d "{\"fine_tuning_job_id\": \"${fine_tuning_job_id}\"}"

# cancel one finetuning job
curl http://${your_ip}:8005/v1/fine_tuning/jobs/cancel -X POST -H "Content-Type: application/json" -d "{\"fine_tuning_job_id\": \"${fine_tuning_job_id}\"}"

# list checkpoints of a finetuning job
curl http://${your_ip}:8005/v1/finetune/list_checkpoints -X POST -H "Content-Type: application/json" -d "{\"fine_tuning_job_id\": \"${fine_tuning_job_id}\"}"

```
Empty file.
Loading

0 comments on commit ad0bb7c

Please sign in to comment.