Skip to content

Commit

Permalink
Add prepare_dataset command (#38)
Browse files Browse the repository at this point in the history
Co-authored-by: Joel Lamy-Poirier <[email protected]>
  • Loading branch information
tscholak and jlamypoirier authored Nov 13, 2024
1 parent 7989595 commit 2905d38
Show file tree
Hide file tree
Showing 20 changed files with 496 additions and 56 deletions.
7 changes: 7 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
# Ignore everything by default
*

# Allow specific files and directories
!setup.py
!setup.cfg
!Megatron-LM
Expand All @@ -7,3 +10,7 @@
!tools
!tests
!pyproject.toml

# Exclude Python cache directories and shared object files within included directories
**/__pycache__/
**/*.so
8 changes: 2 additions & 6 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -57,12 +57,9 @@ jobs:
ghcr.io/servicenow/fast-llm
tags: |
type=schedule
type=ref,event=branch
type=semver,pattern={{version}}
type=semver,pattern={{major}}.{{minor}}
type=semver,pattern={{major}}
type=pep440,pattern={{version}}
type=sha
type=raw,value=latest,enabled={{github.ref == 'refs/heads/main'}}
type=raw,value=latest,enable={{is_default_branch}}
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
Expand All @@ -78,7 +75,6 @@ jobs:
uses: docker/build-push-action@v6
with:
context: .
# push: ${{ github.event_name != 'pull_request' }}
push: true
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
Expand Down
51 changes: 28 additions & 23 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,34 +1,39 @@
# syntax=docker/dockerfile:1.7-labs
FROM nvcr.io/nvidia/pytorch:24.07-py3

# Install git-lfs for Huggingface hub interaction and sudo for system adjustments
# Install dependencies.
RUN apt-get update \
&& apt-get install --no-install-recommends -y git-lfs sudo util-linux \
&& apt-get install --no-install-recommends -y acl git-lfs \
&& rm -rf /var/lib/apt/lists/* \
&& git lfs install

# Add a user for Fast-LLM with sudo privileges for runtime adjustments
ARG FAST_LLM_USER_ID=1000
RUN useradd -m -u $FAST_LLM_USER_ID -s /bin/bash fast_llm \
&& echo 'fast_llm ALL=(ALL) NOPASSWD: ALL' >> /etc/sudoers

USER fast_llm
# Set the working directory.
WORKDIR /app
# Set the permission to 777 for all files and directories in `/app`, `/home` and python install directories:
# 1. Create directories explicitly because Docker applies the wrong permissions when creating them implicitly.
# 2. For the rest, set the default ACL to 777 for all users.
RUN mkdir -m 777 /app/Megatron-LM /app/examples /app/fast_llm /app/tests /app/tools \
&& setfacl -m d:u::rwx,d:g::rwx,d:o::rwx,u::rwx,g::rwx,o::rwx \
/app \
/home \
/usr \
/usr/local \
/usr/local/bin \
/usr/local/lib \
/usr/local/lib/python3.10 \
/usr/local/lib/python3.10/dist-packages \
/usr/local/lib/python3.10/dist-packages/__pycache__

# Environment settings for Python and PATH
ENV PYTHONPATH=/app:/app/Megatron-LM \
PATH=$PATH:/home/fast_llm/.local/bin/

# Copy the dependency files and install dependencies
COPY --chown=fast_llm setup.py setup.cfg pyproject.toml ./
COPY --chown=fast_llm ./fast_llm/csrc/ fast_llm/csrc/
RUN PIP_NO_INPUT=1 pip3 install --no-cache-dir --no-build-isolation -e ".[CORE,OPTIONAL,DEV]"
# Copy dependency files with universal write permissions for all users.
COPY --chmod=777 setup.py setup.cfg pyproject.toml ./
COPY --chmod=777 ./fast_llm/csrc/ fast_llm/csrc/

# Copy the rest of the code
COPY --chown=fast_llm ./Megatron-LM Megatron-LM
COPY --chown=fast_llm ./examples examples
COPY --chown=fast_llm ./tests tests
COPY --chown=fast_llm ./tools tools
# Install dependencies within the virtual environment.
RUN pip install --no-cache-dir --no-build-isolation -e ".[CORE,OPTIONAL,DEV]"

# Copy the main source code for Fast-LLM
COPY --exclude=./fast_llm/csrc/ --chown=fast_llm ./fast_llm/ fast_llm/
# Copy the remaining source code with universal write permissions.
COPY --chmod=777 ./Megatron-LM Megatron-LM
COPY --chmod=777 ./examples examples
COPY --chmod=777 ./tests tests
COPY --chmod=777 ./tools tools
COPY --chmod=777 --exclude=./fast_llm/csrc/ ./fast_llm/ fast_llm/
10 changes: 8 additions & 2 deletions fast_llm/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -301,15 +301,21 @@ def __setattr__(self, key, value):
# Allow setting the exact same object to facilitate setup of cross-dependencies.
# Ex. allow re-setting cross-dependencies of already validated sub-configs.
return
raise RuntimeError()
raise RuntimeError(
f"Cannot set attribute `{key}`"
f" in configuration class `{get_type_name(type(self))}` after validation."
)
super().__setattr__(key, value)

def __delattr__(self, key):
"""
Make the class read-only after validation.
"""
if getattr(self, "_validated", False):
raise RuntimeError()
raise RuntimeError(
f"Cannot delete attribute `{key}`"
f" in configuration class `{get_type_name(type(self))}` after validation."
)
super().__delattr__(key)

def validate(self, *, _is_validating=False):
Expand Down
12 changes: 12 additions & 0 deletions fast_llm/data/auto.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from fast_llm.data.preparator.gpt_memmap.config import GPTMemmapDatasetPreparatorConfig
from fast_llm.utils import Registry

# Registry mapping each preparator's unique name to its configuration class.
# Add new preparator config classes to this tuple to register them.
_preparator_config_classes = (GPTMemmapDatasetPreparatorConfig,)

dataset_preparator_registry = Registry(
    "DatasetPreparator",
    {config_class.preparator_name: config_class for config_class in _preparator_config_classes},
)
2 changes: 1 addition & 1 deletion fast_llm/data/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ def _validate(self):
class TokenizerConfig(Config):
"""
Configuration for the tokenizer.
Currently, the tokenizer is only needed for FIM.
The tokenizer is needed for FIM and dataset preparation.
"""

format: str = Field(
Expand Down
24 changes: 7 additions & 17 deletions fast_llm/data/gpt/memmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
import numpy as np

from fast_llm.data.gpt.dataset import GPTIndexedDataset
from fast_llm.data.preparator.gpt_memmap.config import MEMMAP_DTYPES, MEMMAP_DTYPES_INV, MEMMAP_INDEX_HEADER
from fast_llm.engine.config_utils.data_type import DataType
from fast_llm.utils import Assert, div, padded_cumsum


Expand All @@ -16,18 +18,6 @@ class GPTMemmapDataset(GPTIndexedDataset):
See https://github.com/NVIDIA/Megatron-LM?tab=readme-ov-file#data-preprocessing for more details.
"""

_DTYPES = {
1: np.uint8,
2: np.int8,
3: np.int16,
4: np.int32,
5: np.int64,
6: np.float32,
7: np.float64,
8: np.uint16,
}
_INDEX_HEADER = b"MMIDIDX\x00\x00"

def __init__(self, name: str, prefix: pathlib.Path | str):
self._init(name, prefix)

Expand All @@ -37,10 +27,10 @@ def _init(self, name: str, prefix: pathlib.Path | str):
self._prefix = pathlib.Path(prefix)

with self._prefix.with_suffix(".idx").open("rb") as stream:
Assert.eq(stream.read(9), self._INDEX_HEADER)
Assert.eq(stream.read(9), MEMMAP_INDEX_HEADER)
Assert.eq(struct.unpack("<Q", stream.read(8))[0], 1)

self._dtype = self._DTYPES[struct.unpack("<B", stream.read(1))[0]]
self._dtype = MEMMAP_DTYPES[struct.unpack("<B", stream.read(1))[0]].numpy
self._num_documents = struct.unpack("<Q", stream.read(8))[0]
_ = struct.unpack("<Q", stream.read(8))[0]
offset = stream.tell()
Expand Down Expand Up @@ -106,13 +96,13 @@ def write_dataset(cls, prefix: pathlib.Path | str, documents: list[np.ndarray]):
dtype = documents[0].dtype
num_documents = len(documents)
lengths = np.array([len(document) for document in documents], dtype=np.int32)
pointers = padded_cumsum(lengths[:-1].astype(np.int64) * 2)
pointers = padded_cumsum(lengths[:-1].astype(np.int64)) * np.dtype(dtype).itemsize
prefix.parent.mkdir(parents=True, exist_ok=True)
with prefix.with_suffix(".idx").open("wb") as stream:
stream.write(cls._INDEX_HEADER)
stream.write(MEMMAP_INDEX_HEADER)
stream.write(struct.pack("<Q", 1))
# Data type
stream.write(struct.pack("<B", {y: x for x, y in cls._DTYPES.items()}[dtype.type]))
stream.write(struct.pack("<B", MEMMAP_DTYPES_INV[DataType.from_numpy(dtype.type)]))
# "Number of sequences", same as documents in our case.
stream.write(struct.pack("<Q", num_documents))
# "Number of documents", needs a +1 for some reason.
Expand Down
Empty file.
34 changes: 34 additions & 0 deletions fast_llm/data/preparator/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import abc
import argparse
import typing

from fast_llm.config import config_class
from fast_llm.engine.config_utils.runnable import RunnableConfig
from fast_llm.utils import Assert


@config_class()
class DatasetPreparatorConfig(RunnableConfig):
    """
    Base configuration for a dataset preparator.

    Subclasses define `preparator_name` (the key under which they are registered)
    and override `get_dataset_preparator_class` to point at their preparator
    implementation. Being a `RunnableConfig`, an instance can be turned into a
    CLI entry point via `_get_runnable`.
    """

    # Unique name identifying this preparator (used as a registry key).
    preparator_name: typing.ClassVar[str]

    @classmethod
    def get_dataset_preparator_class(cls) -> type["DatasetPreparator"]:
        # Subclasses must return the concrete `DatasetPreparator` type this config drives.
        raise NotImplementedError

    def _get_runnable(self, parsed: argparse.Namespace) -> typing.Callable[[], None]:
        # Instantiate the preparator from this config and expose its `run`
        # method as the zero-argument callable the runner framework invokes.
        # NOTE(review): `parsed` is unused here — presumably consumed by the
        # base `RunnableConfig` machinery; confirm against that class.
        dataset_preparator = self.get_dataset_preparator_class()(config=self)
        return dataset_preparator.run


class DatasetPreparator(abc.ABC):
    """
    Abstract base class for dataset preparators.

    A preparator is constructed from a (type-checked, validated)
    `DatasetPreparatorConfig` and performs its work in `run`.
    """

    # The validated configuration this preparator was built from.
    _config: DatasetPreparatorConfig
    # Config type accepted by the constructor; subclasses narrow this to
    # their own config class.
    config_class: typing.ClassVar[type[DatasetPreparatorConfig]] = DatasetPreparatorConfig

    def __init__(self, config: DatasetPreparatorConfig) -> None:
        """
        Check the config's type against `config_class`, validate it, and store it.
        """
        Assert.custom(isinstance, config, self.config_class)
        # Validation also freezes the config (see fast_llm.config: attributes
        # become read-only after validation).
        config.validate()
        self._config = config

    @abc.abstractmethod
    def run(self) -> None:
        # Subclasses implement the actual dataset-preparation work here.
        raise NotImplementedError
Empty file.
Loading

0 comments on commit 2905d38

Please sign in to comment.