Merge branch 'main' into logger
Signed-off-by: Costa Shulyupin <[email protected]>
makelinux authored Dec 11, 2024
2 parents 07e5d74 + dcbabc5 commit 67e221a
Showing 60 changed files with 1,163 additions and 472 deletions.
10 changes: 7 additions & 3 deletions .github/workflows/e2e-nvidia-l4-x1.yml
@@ -46,7 +46,7 @@ jobs:
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: ${{ secrets.AWS_REGION }}
aws-region: ${{ vars.AWS_REGION }}

- name: Start EC2 runner
id: start-ec2-runner
@@ -130,7 +130,7 @@ jobs:
. ../instructlab/venv/bin/activate
pip install -v .
- name: Check disk
- name: Check disk before tests
run: |
df -h
@@ -142,6 +142,10 @@ jobs:
. venv/bin/activate
./scripts/e2e-ci.sh -m
- name: Check disk after tests
run: |
df -h
stop-medium-ec2-runner:
needs:
- start-medium-ec2-runner
@@ -154,7 +158,7 @@ jobs:
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: ${{ secrets.AWS_REGION }}
aws-region: ${{ vars.AWS_REGION }}

- name: Stop EC2 runner
uses: machulav/ec2-github-runner@1827d6ca7544d7044ddbd2e9360564651b463da2 # v2.3.7
10 changes: 7 additions & 3 deletions .github/workflows/e2e-nvidia-l40s-x4.yml
@@ -24,7 +24,7 @@ jobs:
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: ${{ secrets.AWS_REGION }}
aws-region: ${{ vars.AWS_REGION }}

- name: Start EC2 runner
id: start-ec2-runner
@@ -160,7 +160,7 @@ jobs:
pip install .
pip install .[cuda]
- name: Check disk
- name: Check disk before tests
run: |
df -h
@@ -172,6 +172,10 @@ jobs:
. venv/bin/activate
./scripts/e2e-ci.sh -l
- name: Check disk after tests
run: |
df -h
- name: Add comment to PR if the workflow failed
if: failure() && steps.check_pr.outputs.is_pr == 'true'
working-directory: ./sdg
@@ -228,7 +232,7 @@ jobs:
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: ${{ secrets.AWS_REGION }}
aws-region: ${{ vars.AWS_REGION }}

- name: Stop EC2 runner
uses: machulav/ec2-github-runner@1827d6ca7544d7044ddbd2e9360564651b463da2 # v2.3.7
10 changes: 7 additions & 3 deletions .github/workflows/e2e-nvidia-t4-x1.yml
@@ -46,7 +46,7 @@ jobs:
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: ${{ secrets.AWS_REGION }}
aws-region: ${{ vars.AWS_REGION }}

- name: Start EC2 runner
id: start-ec2-runner
@@ -128,7 +128,7 @@ jobs:
. ../instructlab/venv/bin/activate
pip install .
- name: Check disk
- name: Check disk before tests
run: |
df -h
@@ -138,6 +138,10 @@ jobs:
. venv/bin/activate
./scripts/e2e-ci.sh -s
- name: Check disk after tests
run: |
df -h
stop-small-ec2-runner:
needs:
- start-small-ec2-runner
@@ -150,7 +154,7 @@ jobs:
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: ${{ secrets.AWS_REGION }}
aws-region: ${{ vars.AWS_REGION }}

- name: Stop EC2 runner
uses: machulav/ec2-github-runner@1827d6ca7544d7044ddbd2e9360564651b463da2 # v2.3.7
4 changes: 2 additions & 2 deletions .github/workflows/pypi.yaml
@@ -78,7 +78,7 @@ jobs:
path: dist

- name: "Upload to Test PyPI"
uses: pypa/gh-action-pypi-publish@15c56dba361d8335944d31a2ecd17d700fc7bcbc # v1.12.2
uses: pypa/gh-action-pypi-publish@67339c736fd9354cd4f8cb0b744f2b82a74b5c70 # v1.12.3
with:
repository-url: https://test.pypi.org/legacy/

@@ -130,4 +130,4 @@ jobs:
rm ./dist/*.sigstore.json
- name: "Upload to PyPI"
uses: pypa/gh-action-pypi-publish@15c56dba361d8335944d31a2ecd17d700fc7bcbc # v1.12.2
uses: pypa/gh-action-pypi-publish@67339c736fd9354cd4f8cb0b744f2b82a74b5c70 # v1.12.3
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
@@ -90,7 +90,7 @@ jobs:
pip cache remove llama_cpp_python
- name: Cache huggingface
uses: actions/cache@6849a6489940f00c2f30c0fb92c6274307ccb58a # v4.1.2
uses: actions/cache@1bd1e32a3bdc45362d1e726936510720a7c30a57 # v4.2.0
with:
path: ~/.cache/huggingface
# config contains DEFAULT_MODEL
2 changes: 2 additions & 0 deletions .markdownlint-cli2.yaml
@@ -7,10 +7,12 @@ config:
code-block-style: false
no-duplicate-header: false
single-trailing-newline: false
no-duplicate-heading: false
globs:
- "**/*.md"
ignores:
- ".github/**"
- ".tox/**"
- "venv/**"
- ".venv/**"
- "**/testdata/**"
16 changes: 16 additions & 0 deletions .spellcheck-en-custom.txt
@@ -3,18 +3,24 @@
# SPDX-License-Identifier: Apache-2.0
Backport
backported
CLI
codebase
config
configs
Dataset
dataset
datasets
distractor
Docling
docling
Eval
eval
FIXME
freeform
ICL
icl
ie
Jinja
JSON
Langchain's
LLM
@@ -23,18 +29,28 @@ MCQ
Merlinite
Mixtral
MMLU
multiphase
Ouput
Pre
pre
precomputed
Pregenerated
qna
quantized
repo
sdg
Splitter
subdirectory
subfolder
Tatsu
templating
Tesseract
TODO
tokenizer
tokenizers
unchunked
upsampled
UUID
vLLM
yaml
yamls
47 changes: 47 additions & 0 deletions CHANGELOG.md
@@ -0,0 +1,47 @@
## Unreleased 0.7.x

### Features

#### Custom Blocks and Teacher Models via BlockRegistry and PromptRegistry

Advanced users are now able to supply custom Pipeline `Block` implementations by registering new blocks with the `BlockRegistry`. It's also possible to register new chat templates for custom teacher models using the new `PromptRegistry`.

See the `tests/testdata/custom_block.py` and `tests/testdata/custom_block_pipeline.yaml` files in this repository for an example of how to create custom blocks and use them from your own pipeline config yamls.

See the `tests/testdata/custom_prompt.py` file in this repository for an example of how to register custom chat templates used when formatting prompts.
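
As a rough illustration of the registration flow, a custom block might be defined like the sketch below. The `EchoBlock` name and its pass-through `generate` body are invented for this example, and the exact `Block` base-class interface may differ slightly; the test data files listed above remain the authoritative examples.

```python
# Minimal sketch of a custom block (hypothetical "EchoBlock").
# Assumes BlockRegistry.register() acts as a class decorator keyed by the
# name used to reference the block from pipeline config yamls.
from datasets import Dataset

from instructlab.sdg import Block, BlockRegistry


@BlockRegistry.register("EchoBlock")
class EchoBlock(Block):
    def generate(self, samples: Dataset) -> Dataset:
        # A pass-through block: return the incoming samples unchanged.
        return samples
```

Once registered, the block can be referenced by that name from a custom pipeline config yaml, as `tests/testdata/custom_block_pipeline.yaml` demonstrates.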

### Breaking Changes

#### Pipeline configs and Prompt templates switched to Jinja

All of our [Pipeline config yamls](src/instructlab/sdg/pipelines) and [prompt template files](src/instructlab/sdg/configs) have moved to [Jinja templates](https://pypi.org/project/Jinja2/) instead of Python string `format()` calls. This brings more expressiveness into our templating language - especially for prompt templates - but it does mean any variable substitutions need to be updated from single braces to double braces, i.e. `{document}` becomes `{{document}}`. This only impacts you if you were using custom pipeline config yaml files or custom prompt templates in your config blocks.
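
Purely for illustration (these strings are not taken from any shipped config), the practical difference between the two substitution styles is roughly:

```python
# The same prompt expressed in the old str.format() style and the new
# Jinja style; both render to the same text.
from jinja2 import Template

old_template = "Summarize the following document: {document}"    # pre-Jinja style
new_template = "Summarize the following document: {{document}}"  # Jinja style

assert old_template.format(document="some text") == Template(new_template).render(
    document="some text"
)
```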

#### ImportBlock removed from Pipeline blocks

Any users who were specifying custom pipeline configs (instead of using the default `full` or `simple` pipelines that we ship) and also using the `ImportBlock` will now need to rewrite their pipelines to no longer use that block. We do not anticipate that anyone was actually using this block, but please reach out if you were, so we can capture your needs in a future release.

### Fixes

## v0.6.2

### Fixes

* Fixed a bug in our version specification of the `docling` and `docling_parse` dependencies that caused new installs of InstructLab to pull in incompatible versions of them. We also fixed a similar bug in the `mypy` dependency, but that one only impacts developers of SDG as opposed to users of InstructLab.

## v0.6.1

### Fixes

* Fixed a bug where generating data from a taxonomy with two or more changed knowledge leaf nodes would fail with a message that a destination path `already exists and is not an empty directory`.

## v0.6.0

### Features

* Small knowledge datasets will automatically get upsampled during final data mixing based on the length of any precomputed skills datasets used during data mixing. This avoids issues where very large precomputed skills datasets were swamping the comparatively small number of knowledge samples, resulting in lower than optimal knowledge retention during multiphase training. If a large precomputed dataset isn't in use during mixing (which is how things operate by default), this change is a no-op.
* When chunking PDF documents, we'll now look for the docling models on disk in `$XDG_DATA_HOME/instructlab/sdg/models` (as well as `$XDG_DATA_DIRS` with the same `instructlab/sdg/models` subdirectory). If they are not found on disk, they'll automatically be downloaded from HuggingFace; a rough sketch of this lookup order follows this list.
* When chunking PDF documents with Docling, we'll automatically configure Docling to use `tesserocr` if a working implementation is found instead of relying on `easyocr`. We fall back to `easyocr` if Tesseract is not properly configured for use by `tesserocr`.
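
Purely as a sketch of the lookup order described in the chunking bullet above (the function name and the exact fallback handling here are illustrative, not the actual SDG internals):

```python
# Illustrative sketch of the on-disk docling model lookup described above;
# the real implementation may differ in details.
import os
from pathlib import Path
from typing import Optional


def find_docling_models() -> Optional[Path]:
    """Return the first existing instructlab/sdg/models directory, if any."""
    data_home = os.environ.get("XDG_DATA_HOME", str(Path.home() / ".local" / "share"))
    data_dirs = os.environ.get("XDG_DATA_DIRS", "/usr/local/share:/usr/share")
    for base in [data_home, *data_dirs.split(":")]:  # XDG dirs are colon-separated
        candidate = Path(base) / "instructlab" / "sdg" / "models"
        if candidate.is_dir():
            return candidate
    return None  # caller falls back to downloading the models from HuggingFace
```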

### Breaking Changes

* Teacher model tokenizers are loaded from the local teacher model on disk and not downloaded automatically from HuggingFace. The typical workflows in use so far expect the teacher model to exist on disk, and this change enforces that at least its tokenizer exists.
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -97,8 +97,8 @@ exclude = [
"^src/instructlab/sdg/generate_data\\.py$",
"^src/instructlab/sdg/utils/taxonomy\\.py$",
"^src/instructlab/sdg/default_flows\\.py$",
"^src/instructlab/sdg/llmblock\\.py$",
"^src/instructlab/sdg/utilblocks\\.py$",
"^src/instructlab/sdg/blocks/llmblock\\.py$",
"^src/instructlab/sdg/blocks/utilblocks\\.py$",
]
# honor excludes by not following there through imports
follow_imports = "silent"
4 changes: 3 additions & 1 deletion requirements.txt
@@ -1,11 +1,13 @@
# SPDX-License-Identifier: Apache-2.0
click>=8.1.7,<9.0.0
datasets>=2.18.0,<3.0.0
docling[tesserocr]>=2.4.2,<3.0.0
docling[tesserocr]>=2.4.2,<=2.8.3
docling-parse>=2.0.0,<3.0.0
GitPython>=3.1.42,<4.0.0
gguf>=0.6.0
httpx>=0.25.0,<1.0.0
instructlab-schema>=0.4.0
jinja2>=3.0.0
langchain-text-splitters
# Note: this dependency goes along with langchain-text-splitters and may be
# removed once that one is removed.
41 changes: 26 additions & 15 deletions src/instructlab/sdg/__init__.py
@@ -3,6 +3,8 @@
# NOTE: This package imports Torch and other heavy packages.
__all__ = (
"Block",
"BlockConfigParserError",
"BlockRegistry",
"CombineColumnsBlock",
"ConditionalLLMBlock",
"DuplicateColumnsBlock",
@@ -11,27 +13,44 @@
"FilterByValueBlockError",
"FlattenColumnsBlock",
"GenerateException",
"ImportBlock",
"IterBlock",
"LLMBlock",
"LLMLogProbBlock",
"LLMMessagesBlock",
"Pipeline",
"PipelineBlockError",
"PipelineConfigParserError",
"PipelineContext",
"PromptRegistry",
"RenameColumnsBlock",
"SamplePopulatorBlock",
"SelectorBlock",
"SetToMajorityValueBlock",
"SIMPLE_PIPELINES_PACKAGE",
"FULL_PIPELINES_PACKAGE",
"SIMPLE_PIPELINES_PACKAGE",
"generate_data",
)

# Local
from .block import Block
from .filterblock import FilterByValueBlock, FilterByValueBlockError
from .blocks.block import Block, BlockConfigParserError
from .blocks.filterblock import FilterByValueBlock, FilterByValueBlockError
from .blocks.iterblock import IterBlock
from .blocks.llmblock import (
ConditionalLLMBlock,
LLMBlock,
LLMLogProbBlock,
LLMMessagesBlock,
)
from .blocks.utilblocks import (
CombineColumnsBlock,
DuplicateColumnsBlock,
FlattenColumnsBlock,
RenameColumnsBlock,
SamplePopulatorBlock,
SelectorBlock,
SetToMajorityValueBlock,
)
from .generate_data import generate_data
from .importblock import ImportBlock
from .llmblock import ConditionalLLMBlock, LLMBlock
from .pipeline import (
FULL_PIPELINES_PACKAGE,
SIMPLE_PIPELINES_PACKAGE,
@@ -41,14 +60,6 @@
PipelineConfigParserError,
PipelineContext,
)
from .utilblocks import (
CombineColumnsBlock,
DuplicateColumnsBlock,
FlattenColumnsBlock,
RenameColumnsBlock,
SamplePopulatorBlock,
SelectorBlock,
SetToMajorityValueBlock,
)
from .registry import BlockRegistry, PromptRegistry
from .utils import GenerateException
from .utils.taxonomy import TaxonomyReadingException
37 changes: 0 additions & 37 deletions src/instructlab/sdg/block.py

This file was deleted.
