Skip to content

Commit

Permalink
Add prepare_dataset command (#38)
Browse files Browse the repository at this point in the history
Co-authored-by: Joel Lamy-Poirier <[email protected]>
  • Loading branch information
tscholak and jlamypoirier authored Nov 13, 2024
1 parent 7989595 commit 2905d38
Show file tree
Hide file tree
Showing 20 changed files with 496 additions and 56 deletions.
7 changes: 7 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
# Ignore everything by default
*

# Allow specific files and directories
!setup.py
!setup.cfg
!Megatron-LM
Expand All @@ -7,3 +10,7 @@
!tools
!tests
!pyproject.toml

# Exclude Python cache directories and shared object files within included directories
**/__pycache__/
**/*.so
8 changes: 2 additions & 6 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -57,12 +57,9 @@ jobs:
ghcr.io/servicenow/fast-llm
tags: |
type=schedule
type=ref,event=branch
type=semver,pattern={{version}}
type=semver,pattern={{major}}.{{minor}}
type=semver,pattern={{major}}
type=pep440,pattern={{version}}
type=sha
type=raw,value=latest,enabled={{github.ref == 'refs/heads/main'}}
type=raw,value=latest,enable={{is_default_branch}}
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
Expand All @@ -78,7 +75,6 @@ jobs:
uses: docker/build-push-action@v6
with:
context: .
# push: ${{ github.event_name != 'pull_request' }}
push: true
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
Expand Down
51 changes: 28 additions & 23 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,34 +1,39 @@
# syntax=docker/dockerfile:1.7-labs
FROM nvcr.io/nvidia/pytorch:24.07-py3

# Install git-lfs for Huggingface hub interaction and sudo for system adjustments
# Install dependencies.
RUN apt-get update \
&& apt-get install --no-install-recommends -y git-lfs sudo util-linux \
&& apt-get install --no-install-recommends -y acl git-lfs \
&& rm -rf /var/lib/apt/lists/* \
&& git lfs install

# Add a user for Fast-LLM with sudo privileges for runtime adjustments
ARG FAST_LLM_USER_ID=1000
RUN useradd -m -u $FAST_LLM_USER_ID -s /bin/bash fast_llm \
&& echo 'fast_llm ALL=(ALL) NOPASSWD: ALL' >> /etc/sudoers

USER fast_llm
# Set the working directory.
WORKDIR /app
# Set the permission to 777 for all files and directories in `/app`, `/home` and python install directories:
# 1. Create directories explicitly because Docker applies the wrong permissions when creating them implicitly.
# 2. For the rest, set the default ACL to 777 for all users.
RUN mkdir -m 777 /app/Megatron-LM /app/examples /app/fast_llm /app/tests /app/tools \
&& setfacl -m d:u::rwx,d:g::rwx,d:o::rwx,u::rwx,g::rwx,o::rwx \
/app \
/home \
/usr \
/usr/local \
/usr/local/bin \
/usr/local/lib \
/usr/local/lib/python3.10 \
/usr/local/lib/python3.10/dist-packages \
/usr/local/lib/python3.10/dist-packages/__pycache__

# Environment settings for Python and PATH
ENV PYTHONPATH=/app:/app/Megatron-LM \
PATH=$PATH:/home/fast_llm/.local/bin/

# Copy the dependency files and install dependencies
COPY --chown=fast_llm setup.py setup.cfg pyproject.toml ./
COPY --chown=fast_llm ./fast_llm/csrc/ fast_llm/csrc/
RUN PIP_NO_INPUT=1 pip3 install --no-cache-dir --no-build-isolation -e ".[CORE,OPTIONAL,DEV]"
# Copy dependency files with universal write permissions for all users.
COPY --chmod=777 setup.py setup.cfg pyproject.toml ./
COPY --chmod=777 ./fast_llm/csrc/ fast_llm/csrc/

# Copy the rest of the code
COPY --chown=fast_llm ./Megatron-LM Megatron-LM
COPY --chown=fast_llm ./examples examples
COPY --chown=fast_llm ./tests tests
COPY --chown=fast_llm ./tools tools
# Install dependencies within the virtual environment.
RUN pip install --no-cache-dir --no-build-isolation -e ".[CORE,OPTIONAL,DEV]"

# Copy the main source code for Fast-LLM
COPY --exclude=./fast_llm/csrc/ --chown=fast_llm ./fast_llm/ fast_llm/
# Copy the remaining source code with universal write permissions.
COPY --chmod=777 ./Megatron-LM Megatron-LM
COPY --chmod=777 ./examples examples
COPY --chmod=777 ./tests tests
COPY --chmod=777 ./tools tools
COPY --chmod=777 --exclude=./fast_llm/csrc/ ./fast_llm/ fast_llm/
10 changes: 8 additions & 2 deletions fast_llm/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -301,15 +301,21 @@ def __setattr__(self, key, value):
# Allow setting the exact same object to facilitate setup of cross-dependencies.
# Ex. allow re-setting cross-dependencies of already validated sub-configs.
return
raise RuntimeError()
raise RuntimeError(
f"Cannot set attribute `{key}`"
f" in configuration class `{get_type_name(type(self))}` after validation."
)
super().__setattr__(key, value)

def __delattr__(self, key):
"""
Make the class read-only after validation.
"""
if getattr(self, "_validated", False):
raise RuntimeError()
raise RuntimeError(
f"Cannot delete attribute `{key}`"
f" in configuration class `{get_type_name(type(self))}` after validation."
)
super().__delattr__(key)

def validate(self, *, _is_validating=False):
Expand Down
12 changes: 12 additions & 0 deletions fast_llm/data/auto.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from fast_llm.data.preparator.gpt_memmap.config import GPTMemmapDatasetPreparatorConfig
from fast_llm.utils import Registry

# Registry mapping each preparator's unique name to its configuration class.
# Add new preparator config classes to this tuple to register them.
_preparator_config_classes = (GPTMemmapDatasetPreparatorConfig,)

dataset_preparator_registry = Registry(
    "DatasetPreparator",
    {config_class.preparator_name: config_class for config_class in _preparator_config_classes},
)
2 changes: 1 addition & 1 deletion fast_llm/data/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ def _validate(self):
class TokenizerConfig(Config):
"""
Configuration for the tokenizer.
Currently, the tokenizer is only needed for FIM.
The tokenizer is needed for FIM and dataset preparation.
"""

format: str = Field(
Expand Down
24 changes: 7 additions & 17 deletions fast_llm/data/gpt/memmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
import numpy as np

from fast_llm.data.gpt.dataset import GPTIndexedDataset
from fast_llm.data.preparator.gpt_memmap.config import MEMMAP_DTYPES, MEMMAP_DTYPES_INV, MEMMAP_INDEX_HEADER
from fast_llm.engine.config_utils.data_type import DataType
from fast_llm.utils import Assert, div, padded_cumsum


Expand All @@ -16,18 +18,6 @@ class GPTMemmapDataset(GPTIndexedDataset):
See https://github.com/NVIDIA/Megatron-LM?tab=readme-ov-file#data-preprocessing for more details.
"""

_DTYPES = {
1: np.uint8,
2: np.int8,
3: np.int16,
4: np.int32,
5: np.int64,
6: np.float32,
7: np.float64,
8: np.uint16,
}
_INDEX_HEADER = b"MMIDIDX\x00\x00"

def __init__(self, name: str, prefix: pathlib.Path | str):
self._init(name, prefix)

Expand All @@ -37,10 +27,10 @@ def _init(self, name: str, prefix: pathlib.Path | str):
self._prefix = pathlib.Path(prefix)

with self._prefix.with_suffix(".idx").open("rb") as stream:
Assert.eq(stream.read(9), self._INDEX_HEADER)
Assert.eq(stream.read(9), MEMMAP_INDEX_HEADER)
Assert.eq(struct.unpack("<Q", stream.read(8))[0], 1)

self._dtype = self._DTYPES[struct.unpack("<B", stream.read(1))[0]]
self._dtype = MEMMAP_DTYPES[struct.unpack("<B", stream.read(1))[0]].numpy
self._num_documents = struct.unpack("<Q", stream.read(8))[0]
_ = struct.unpack("<Q", stream.read(8))[0]
offset = stream.tell()
Expand Down Expand Up @@ -106,13 +96,13 @@ def write_dataset(cls, prefix: pathlib.Path | str, documents: list[np.ndarray]):
dtype = documents[0].dtype
num_documents = len(documents)
lengths = np.array([len(document) for document in documents], dtype=np.int32)
pointers = padded_cumsum(lengths[:-1].astype(np.int64) * 2)
pointers = padded_cumsum(lengths[:-1].astype(np.int64)) * np.dtype(dtype).itemsize
prefix.parent.mkdir(parents=True, exist_ok=True)
with prefix.with_suffix(".idx").open("wb") as stream:
stream.write(cls._INDEX_HEADER)
stream.write(MEMMAP_INDEX_HEADER)
stream.write(struct.pack("<Q", 1))
# Data type
stream.write(struct.pack("<B", {y: x for x, y in cls._DTYPES.items()}[dtype.type]))
stream.write(struct.pack("<B", MEMMAP_DTYPES_INV[DataType.from_numpy(dtype.type)]))
# "Number of sequences", same as documents in our case.
stream.write(struct.pack("<Q", num_documents))
# "Number of documents", needs a +1 for some reason.
Expand Down
Empty file.
34 changes: 34 additions & 0 deletions fast_llm/data/preparator/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import abc
import argparse
import typing

from fast_llm.config import config_class
from fast_llm.engine.config_utils.runnable import RunnableConfig
from fast_llm.utils import Assert


@config_class()
class DatasetPreparatorConfig(RunnableConfig):
    """
    Base configuration for a dataset preparator.

    Subclasses define `preparator_name` (the key under which they are registered)
    and override `get_dataset_preparator_class` to point at their preparator
    implementation. Being a `RunnableConfig`, an instance can be turned into a
    CLI entry point via `_get_runnable`.
    """

    # Unique name identifying this preparator (used as a registry key).
    preparator_name: typing.ClassVar[str]

    @classmethod
    def get_dataset_preparator_class(cls) -> type["DatasetPreparator"]:
        # Subclasses must return the concrete `DatasetPreparator` type this config drives.
        raise NotImplementedError

    def _get_runnable(self, parsed: argparse.Namespace) -> typing.Callable[[], None]:
        # Instantiate the preparator from this config and expose its `run`
        # method as the zero-argument callable the runner framework invokes.
        # NOTE(review): `parsed` is unused here — presumably consumed by the
        # base `RunnableConfig` machinery; confirm against that class.
        dataset_preparator = self.get_dataset_preparator_class()(config=self)
        return dataset_preparator.run


class DatasetPreparator(abc.ABC):
    """
    Abstract base class for dataset preparators.

    A preparator is constructed from a (type-checked, validated)
    `DatasetPreparatorConfig` and performs its work in `run`.
    """

    # The validated configuration this preparator was built from.
    _config: DatasetPreparatorConfig
    # Config type accepted by the constructor; subclasses narrow this to
    # their own config class.
    config_class: typing.ClassVar[type[DatasetPreparatorConfig]] = DatasetPreparatorConfig

    def __init__(self, config: DatasetPreparatorConfig) -> None:
        """
        Check the config's type against `config_class`, validate it, and store it.
        """
        Assert.custom(isinstance, config, self.config_class)
        # Validation also freezes the config (see fast_llm.config: attributes
        # become read-only after validation).
        config.validate()
        self._config = config

    @abc.abstractmethod
    def run(self) -> None:
        # Subclasses implement the actual dataset-preparation work here.
        raise NotImplementedError
Empty file.
Loading

0 comments on commit 2905d38

Please sign in to comment.