From d9bc8887be2c2eddf92c75ef34aa5529f5cdba0c Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Tue, 22 Oct 2024 09:38:21 -0400 Subject: [PATCH] fix --- .github/PULL_REQUEST_TEMPLATE.md | 64 ++++++++++ .github/workflows/ci.yaml | 85 +++++++++++++ .github/workflows/docs.yaml | 59 +++++++++ CITATION.cff | 28 +++++ CODE_OF_CONDUCT.md | 46 +++++++ CONTRIBUTING.md | 62 ++++++++++ README.md | 161 +++++++++++++++++++++++++ SECURITY.md | 27 +++++ examples/fast-llm-pvc.yaml | 12 ++ examples/fast-llm.pytorchjob.yaml | 127 +++++++++++++++++++ examples/fast-llm.sbat | 37 ++++++ examples/mistral-4-node-benchmark.yaml | 55 +++++++++ setup.cfg | 11 +- tests/common.py | 3 + tests/test_checkpoint.py | 2 + tests/test_config.py | 11 ++ tests/test_functional.py | 3 + tests/test_triton_kernels.py | 11 ++ 18 files changed, 803 insertions(+), 1 deletion(-) create mode 100644 .github/PULL_REQUEST_TEMPLATE.md create mode 100644 .github/workflows/ci.yaml create mode 100644 .github/workflows/docs.yaml create mode 100644 CITATION.cff create mode 100644 CODE_OF_CONDUCT.md create mode 100644 CONTRIBUTING.md create mode 100644 README.md create mode 100644 SECURITY.md create mode 100644 examples/fast-llm-pvc.yaml create mode 100644 examples/fast-llm.pytorchjob.yaml create mode 100644 examples/fast-llm.sbat create mode 100644 examples/mistral-4-node-benchmark.yaml diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 00000000..31a65d7c --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,64 @@ +# ✨ Description + +Please provide a brief summary of the changes, relevant motivation, and context. +Include any related issue numbers or links to discussions, and explain why this change is necessary. + +Closes # + +## πŸ” Type of change + +Select all that apply: + +- [ ] πŸ› **Bug fix** (non-breaking change that addresses a specific issue) +- [ ] πŸš€ **New feature** (non-breaking change that adds functionality) +- [ ] ⚠️ **Breaking change** (a change that could affect existing functionality) +- [ ] πŸ“ˆ **Performance improvement/optimization** (improves speed, memory usage, or efficiency) +- [ ] πŸ› οΈ **Code refactor** (non-functional changes that improve code readability, structure, etc.) +- [ ] πŸ“¦ **Dependency bump** (updates dependencies, including Dockerfile or package changes) +- [ ] πŸ“ **Documentation change** (updates documentation, including new content or typo fixes) +- [ ] πŸ”§ **Infrastructure/Build change** (affects build process, CI/CD, or dependencies) + +## πŸ“ Changes + +List the key changes introduced in this PR: + +1. Change A +2. Change B + +# βœ… Checklist + +Make sure the following tasks are completed before submitting the PR: + +### General: +- [ ] πŸ“œ I have read and followed the [contributing guidelines](CONTRIBUTING.md). +- [ ] πŸŽ‰ The functionality is complete, and I have tested the changes. +- [ ] πŸ“ I have updated the documentation if needed. +- [ ] ⚠️ The change does not introduce any new issues (e.g., runtime warnings, type checker errors, linting problems, unhandled edge cases). +- [ ] 🧩 I have commented my code, especially in hard-to-understand areas. + +### Dependencies and Configuration: +- [ ] πŸ‹ I have updated the Docker configuration or dependencies, if applicable. +- [ ] πŸ”„ I have ensured compatibility with the existing setup after dependency changes. + +### Testing: +- [ ] πŸ§ͺ I have added or updated tests to cover my changes. +- [ ] βœ”οΈ New and existing tests pass locally with my changes. 
+- [ ] 🚦 I have tested these changes on GPUs and verified training stability. +- [ ] πŸ‹οΈ I have tested the changes on realistic training workloads, if applicable. + +### Performance Impact: +- [ ] πŸ“Š I have run benchmarks where applicable to evaluate the performance impact. +- [ ] βœ… The benchmarks show no performance regression. +- [ ] πŸš€ The benchmarks indicate a potential performance improvement. +- [ ] ⚠️ The benchmarks indicate a potential performance degradation. +- [ ] πŸ“ˆ I have provided benchmark results and detailed any performance impact below, if applicable. + +# πŸ“Š Performance Impact Details + +If there is any impact on performance, describe it and provide benchmark results, if applicable: + +--- + +# πŸ“ Additional Notes + +Include any additional context, information, or considerations here, such as known issues, follow-up tasks, or backward compatibility concerns. diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml new file mode 100644 index 00000000..18dca43b --- /dev/null +++ b/.github/workflows/ci.yaml @@ -0,0 +1,85 @@ +name: CI + +on: + schedule: + - cron: "0 10 * * *" + push: + branches: + - "main" + tags: + - "v*.*.*" + pull_request: + branches: + - "main" + +jobs: + test: + name: Test + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: '3.12' + cache: 'pip' + + - name: Install dependencies + run: | + pip install "torch>=2.2.2" + FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE FLASH_ATTENTION_FORCE_BUILD=TRUE pip install --no-build-isolation -e ".[CORE,OPTIONAL,DEV,DOCS]" + + - name: Run tests + run: pytest . + + docker: + name: Docker + runs-on: ubuntu-latest + needs: test + steps: + - name: Clean unused files + run: | + sudo rm -rf /usr/local/lib/android || true # will release about 10 GB + sudo rm -rf /usr/share/dotnet || true # will release about 20GB + sudo rm -rf /opt/ghc || true + sudo rm -rf /usr/local/.ghcup || true + + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Docker meta + id: meta + uses: docker/metadata-action@v5 + with: + images: | + ghcr.io/servicenow/fast-llm + tags: | + type=schedule + type=ref,event=branch + type=semver,pattern={{version}} + type=semver,pattern={{major}}.{{minor}} + type=semver,pattern={{major}} + type=sha + type=raw,value=latest,enabled={{github.ref == 'refs/heads/main'}} + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Login to GHCR + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Build and push + uses: docker/build-push-action@v6 + with: + context: . 
+ # push: ${{ github.event_name != 'pull_request' }} + push: true + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + cache-from: type=registry,ref=ghcr.io/servicenow/fast-llm:cache + cache-to: type=registry,ref=ghcr.io/servicenow/fast-llm:cache,mode=max diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml new file mode 100644 index 00000000..76853324 --- /dev/null +++ b/.github/workflows/docs.yaml @@ -0,0 +1,59 @@ +name: Documentation + +on: + push: + branches: + - main + pull_request: + branches: + - main + +permissions: + contents: write + +jobs: + build: + name: Build + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + cache: "pip" + - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV + - uses: actions/cache@v4 + with: + key: mkdocs-material-${{ env.cache_id }} + path: .cache + restore-keys: | + mkdocs-material- + - run: | + pip install "torch>=2.2.2" + FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE FLASH_ATTENTION_FORCE_BUILD=TRUE pip install --no-build-isolation -e ".[CORE,OPTIONAL,DEV,DOCS]" + - name: Build the documentation + run: mkdocs build + + deploy: + if: github.event_name == 'push' + name: Deploy + needs: build + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + cache: "pip" + - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV + - uses: actions/cache@v4 + with: + key: mkdocs-material-${{ env.cache_id }} + path: .cache + restore-keys: | + mkdocs-material- + - run: | + pip install "torch>=2.2.2" + FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE FLASH_ATTENTION_FORCE_BUILD=TRUE pip install --no-build-isolation -e ".[CORE,OPTIONAL,DEV,DOCS]" + - name: Publish the documentation + run: mkdocs gh-deploy --force --dirty diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 00000000..39d30ccb --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,28 @@ +cff-version: 1.2.0 +message: "If you use Fast-LLM in your research, please cite it as follows:" +title: "Fast-LLM" +repository-code: "https://github.com/ServiceNow/Fast-LLM" +url: "https://github.com/ServiceNow/Fast-LLM" +license: "Apache-2.0" +keywords: + - large language models + - machine learning + - deep learning + - distributed training + - open source +authors: + - family-names: "Lamy Poirier" + given-names: "Joel" + - family-names: "Tian" + given-names: "Max" + - family-names: "Li" + given-names: "Raymond" + - family-names: "Guille-Escuret" + given-names: "Charles" + - family-names: "Kumar" + given-names: "Luke Nitish" + - family-names: "Kocetkov" + given-names: "Denis" + - family-names: "Scholak" + given-names: "Torsten" +date-released: "2024-10-19" diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 00000000..b3b61bc8 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,46 @@ +### ServiceNow Open Source Code-of-Conduct + +This code of conduct provides guidelines for participation in ServiceNow-managed open-source communities and projects. + +**Discussion forum guidelines** + +Communities thrive when members support each other and provide useful feedback. + +- Be polite and courteous. Respect and treat others as you would expect to be treated yourself. +- Respect your audience. Posts should not upset, annoy, threaten, harass, abuse or embarrass other members. 
+- User Contributions must not include material that is defamatory, obscene, indecent, abusive, offensive, harassing, violent, hateful, inflammatory or otherwise objectionable. +- Lively and collegial discussions are always encouraged in a healthy community. It is okay to argue facts but not okay to argue personalities or personal beliefs. +- Do not use text formats such as all caps or bold that may be read as annoying, rude or send a strong message. +- Do not publish anyone’s private personal information without their explicit consent. +- Avoid using abbreviations or terminology that others may not understand. An abbreviation may mean something to you but in another context or country, it may have another meaning. +- Be accountable for your actions by correcting your mistakes and indicating where you have changed a previous post of yours. +- Mark content as correct and helpful, and provide feedback. If you read a discussion post that you find helpful, we encourage you to leave a positive vote and comment in the replies. If you find a post that is unhelpful, please provide more information in the issue comments. + +**Issue board guidelines** + +Many open-source projects provide an Issues board, with similar functionality to a Discussions forum. The same rules from the discussion forum guidelines apply to the Issues board. + +ServiceNow suggests the following technical support pathways for open-source projects: + +1. Clearly identify and document the issue or question you have. +2. View the Documentation. +3. Search the Discussions. +4. Search the project knowledge base or Wiki for known errors, useful solutions, and troubleshooting tips. +5. Check the project guidelines in the [`CONTRIBUTING.md`](CONTRIBUTING.md) file if you would like details on how you can submit a change. Community contributions are valued and appreciated! +6. Log an Issue if it hasn’t already been logged. If the issue has already been logged by another user, vote it up, and add a comment with additional or missing information. Do your best to choose the correct category when logging a new issue. This will make it easier to differentiate bugs from new feature requests or ideas. If after logging an issue you find the solution, please close your issue and provide a comment with the solution. This will help the project owners and other users. +7. Contact the project team contributors of the project to see if they can help as a last resort only. + +**Repositories** + +- Read and follow the license instructions +- Remember to include citations if you use someone else’s work in your own project. Use the [`CITATION.cff`](CITATION.cff) to find the correct project citation reference. +- β€˜Star’ project repos to save for future reference. +- β€˜Watch’ project repos to get notifications of changes – this can get noisy for some projects, so only watch the ones you really need to track closely. + +**Enforcement and reporting** + +We encourage community members and users to help each other and to resolve issues amongst themselves as much as possible. If a matter cannot be resolved in good faith within the means available, please reach out to a team member or email fast-llm-team@servicenow.com. 
+ +**ServiceNow Disclaimer.** + +We may, but are under no obligation to, monitor or censor comments made by users or content provided by contributors and we are not responsible for the accuracy, completeness, appropriateness or legality of anything posted, depicted or otherwise provided by third‑party users and we disclaim any and all liability relating thereto. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 00000000..e5ab7694 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,62 @@ +# Contributing to Fast-LLM πŸš€ + +Thank you for your interest in contributing to Fast-LLM! We're thrilled to have you here, and your support is invaluable in helping us accelerate LLM training to full speed. This guide will walk you through the steps to contribute, from reporting issues to submitting changes and setting up your development environment. + +If you have questions or want to start a discussion, feel free to [open a discussion](https://github.com/ServiceNow/Fast-LLM/discussions) on our GitHub page. + +## Getting Started + +To get started with contributing to Fast-LLM, follow these steps to set up your environment: + +1. **Set Up the Development Environment**: Fast-LLM is built on [PyTorch](https://pytorch.org/) and [Triton](https://triton-lang.org/). Check out our [setup guide](https://servicenow.github.io/Fast-LLM/development/setup) for instructions on getting everything ready, including the development environment and dependencies. +2. **Learn Our Best Practices**: Get familiar with our [development best practices](https://servicenow.github.io/Fast-LLM/development/dev-practices/), which cover code style, pre-commit hooks, and testing strategies. +3. **Launch Fast-LLM Locally or with Docker**: Need help getting started? Follow the instructions in the [launching section](https://servicenow.github.io/Fast-LLM/development/launching) to get Fast-LLM up and running. + +## How to Report a Bug 🐞 + +Found a bug? Let's squash it together! [Open an issue](https://github.com/ServiceNow/Fast-LLM/issues/new/choose) and select "Bug report." Please include as much information as possible: + +- Steps to reproduce the issue. +- What you expected to happen versus what actually happened. +- Screenshots, log files, or error messages (if applicable). +- Details about your environment setup (e.g., OS, Docker version, and relevant configurations). + +If you're familiar with the codebase, consider adding a failing unit test to demonstrate the problem (optional, but helpful!). + +## Proposing Changes + +Before diving into code, [open an issue](https://github.com/ServiceNow/Fast-LLM/issues) to discuss your proposal. This is especially important if you're planning significant changes or adding new dependencies. Once your idea is approved, follow these steps: + +1. **Fork the Repository**: [Fork Fast-LLM](https://github.com/ServiceNow/Fast-LLM/fork) to your own GitHub account. +2. **Clone Your Fork Locally**: Use `git clone` to bring the code to your local machine. +3. **Create a New Branch**: Name your branch descriptively, such as `feature/awesome-feature` or `fix/nasty-bug`. +4. **Make Your Changes**: Work your magic! Don't forget to add or update tests, benchmarks, or configurations as needed. +5. **Create a Properly Titled Pull Request**: When you're ready to open a PR, make sure to use a clear and descriptive title that follows our [PR title guidelines](https://servicenow.github.io/Fast-LLM/development/pr-title-guidelines). This title will become the commit message for the squashed merge. +6. 
**Push to Your Fork**: Push the branch to your GitHub fork.
+7. **Open a Pull Request**: [Submit a pull request](https://github.com/ServiceNow/Fast-LLM/compare) to the `main` branch. Reference the original issue number and provide a brief summary of your changes.
+
+### Guidelines for a Successful Pull Request
+
+Here are some tips to ensure your pull request gets reviewed and merged promptly:
+
+- **Follow our coding standards**: Stick to our [development best practices](https://servicenow.github.io/Fast-LLM/development/dev-practices/) to keep the code clean and consistent.
+- **Write tests**: Verify your changes with unit tests for new features or bug fixes.
+- **Test on GPUs and real-world workloads**: Since Fast-LLM is all about training large language models, make sure your changes work smoothly in GPU environments and on typical training setups.
+- **Run benchmarks and performance tests**: Make sure your changes don't slow things down. If there's any impact on performance, provide benchmark results to back it up.
+- **Avoid introducing new issues**: Check that there are no new runtime warnings, type checker errors, linting problems, or unhandled edge cases.
+- **Comment non-trivial code**: Make your code easy to understand for others.
+- **Keep sensitive data out**: Make sure your code or commit messages don't expose private or proprietary information.
+- **Use the [PR template](https://github.com/ServiceNow/Fast-LLM/blob/main/.github/PULL_REQUEST_TEMPLATE.md)**: Complete the checklist to make sure everything is in order before hitting submit.
+
+## Seeking Help or Clarification
+
+If you're unsure about something or need help, you've got options:
+
+- **GitHub Discussions**: [Start a discussion](https://github.com/ServiceNow/Fast-LLM/discussions) if you need advice or just want to chat.
+- **Project Maintainers**: Mention a maintainer in an issue or pull request if you need a review or guidance.
+
+## Contributors
+
+We're grateful for all the awesome contributors who help make Fast-LLM better. Join our contributors' list and make your first contribution!
+
+To learn more about the team and maintainers, visit our [About page](https://servicenow.github.io/Fast-LLM/about-us/).
diff --git a/README.md b/README.md
new file mode 100644
index 00000000..0973a158
--- /dev/null
+++ b/README.md
@@ -0,0 +1,161 @@
+
+ +Fast-LLM Logo + +[![Docker][ci-badge]][ci-workflow] +[![Documentation][docs-badge]][docs-workflow] +[![License][license-badge]][license] + +*Accelerating your LLM training to full speed* + +Made with ❀️ by [ServiceNow Research][servicenow-research] + +
+ +## Overview + +Fast-LLM is a new open-source library for training large language models, built on [PyTorch][pytorch] and [Triton][triton]. It is extremely fast, scales to large clusters, supports a wide range of model architectures, and is easy to use. Unlike commercial frameworks like Megatron-LM, which are largely closed off and fragmented across forks, Fast-LLM is fully open-source and encourages community-driven development. Researchers can freely customize and optimize as needed, making it a flexible and hackable alternative that combines the speed of specialized tools with the openness of libraries like [Hugging Face Transformers][transformers]. + +> [!NOTE] +> Fast-LLM is not affiliated with Fast.AI, FastHTML, FastAPI, FastText, or other similarly named projects. Our library's name refers to its speed and efficiency in language model training. + +## Why Fast-LLM? + +1. πŸš€ **Fast-LLM is Blazingly Fast**: + - ⚑️ Optimized kernel efficiency and reduced overheads. + - πŸ”‹ Optimized memory usage for best performance. + - ⏳ Minimizes training time and cost. + +2. πŸ“ˆ **Fast-LLM is Highly Scalable**: + - πŸ“‘ Distributed training across multiple GPUs and nodes using 3D parallelism (Data, Tensor, and Pipeline). + - πŸ”— Supports sequence length parallelism to handle longer sequences effectively. + - 🧠 ZeRO-1, ZeRO-2, and ZeRO-3 implementations for improved memory efficiency. + - πŸŽ›οΈ Mixed precision training support for better performance. + - πŸ‹οΈβ€β™‚οΈ Large batch training and gradient accumulation support. + - πŸ”„ Reproducible training with deterministic behavior. + +3. 🎨 **Fast-LLM is Incredibly Flexible**: + - πŸ€– Compatible with all common language model architectures in a unified class. + - ⚑ Efficient dropless Mixture-of-Experts (MoE) implementation with SoTA performance. + - 🧩 Customizable language model architectures, data loaders, loss functions, and optimizers (in progress). + - πŸ€— Seamless integration with [Hugging Face Transformers][transformers]. + +4. 🎯 **Fast-LLM is Super Easy to Use**: + - πŸ“¦ [Pre-built Docker images](https://github.com/ServiceNow/Fast-LLM/pkgs/container/fast-llm) for quick deployment. + - πŸ“ Simple YAML configuration for hassle-free setup. + - πŸ’» Command-line interface for easy launches. + - πŸ“Š Detailed logging and real-time monitoring features. + - πŸ“š Extensive [documentation][docs] and practical tutorials (in progress). + +5. 🌐 **Fast-LLM is Truly Open Source**: + - βš–οΈ Licensed under [Apache 2.0][license] for maximum freedom to use Fast-LLM at work, in your projects, or for research. + - πŸ’» Fully developed on GitHub with a public [roadmap][roadmap] and transparent [issue tracking][issues]. + - 🀝 Contributions and collaboration are always welcome! + +## Usage + +We'll walk you through how to use Fast-LLM to train a large language model on a cluster with multiple nodes and GPUs. We'll show an example setup using a Slurm cluster and a Kubernetes cluster. + +For this demo, we will train a Mistral-7B model from scratch for 100 steps on random data. The config file `examples/mistral-4-node-benchmark.yaml` is pre-configured for a multi-node setup with 4 DGX nodes, each with 8 A100-80GB or H100-80GB GPUs. + +> [!NOTE] +> Fast-LLM scales from a single GPU to large clusters. You can start small and expand based on your resources. + +Expect to see a significant speedup in training time compared to other libraries! 
For training Mistral-7B, Fast-LLM is expected to achieve a throughput of **9,800 tokens/s/H100** (batch size 32, sequence length 8k) on a 4-node cluster with 32 H100s. + +### Running Fast-LLM on a Slurm Cluster + +#### Prerequisites + +- A [Slurm](https://slurm.schedmd.com/) cluster with at least 4 DGX nodes with 8 A100-80GB or H100-80GB GPUs each. +- CUDA 12.1 or higher. +- Dependencies: [PyTorch][pytorch], [Triton][triton], and [Apex](https://github.com/NVIDIA/apex) installed on all nodes. + +#### Steps + +1. Deploy the [nvcr.io/nvidia/pytorch:24.07-py3](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch) Docker image to all nodes (recommended), because it contains all the necessary dependencies. +2. Install Fast-LLM on all nodes: + + ```bash + sbatch < + +## Reporting a Vulnerability + +To report a security vulnerability in Fast-LLM, please email our [Product Security Incident Response Team (PSIRT)](https://securitylab.servicenow.com) at [disclosure@servicenow.com](mailto:disclosure@servicenow.com). Include a detailed description of the issue, steps to reproduce it, and any relevant information that may help in investigating the matter. + +## Guidelines + +Please follow the guidelines below when [disclosing vulnerabilities](https://www.servicenow.com/company/trust/privacy/responsible-disclosure.html): + +- Report any potential security issue as soon as possible. ServiceNow will make every effort to quickly resolve the issue. +- Provide sufficient detail to reproduce the vulnerability, including proof of concept. The use ofβ€―ReproNowβ€―to demonstrate reproducibility is encouraged but not required. +- Please do not disclose an issue to the public or any third party until ServiceNow has resolved it. +- Make a good faith effort to avoid privacy violations, data destruction, and interruption or degradation of our services. Only interact with accounts youβ€―own or have explicitβ€―permission from the account holder to access. +- Redact any language or images that may identify the program or ServiceNow customers from information about a resolved vulnerability. +- Do not engage in disruptive testing (such as Denial of Service attacks) or any action that could impact the confidentiality, integrity, or availability of information and systems. +- Do not engage in social engineering or phishing against customers or employees. +- Please do not request compensation for time, materials, or discovered vulnerabilities through the Responsible Disclosure Program. 
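
Both the Slurm and Kubernetes examples added below end up launching the same `fast-llm train gpt --config examples/mistral-4-node-benchmark.yaml` entry point through `torchrun`. As a rough sketch only (not part of the patch), a single-node smoke test might look like the following, assuming one machine with 8 GPUs, Fast-LLM installed, and a checkout of the repository so the example config is available; the multi-node rendezvous flags used in the examples are replaced here by `--standalone` and may need adjusting for your setup:

```bash
# Minimal single-node sketch (assumptions: 8 local GPUs, Fast-LLM installed,
# examples/mistral-4-node-benchmark.yaml present in the working directory).
torchrun --standalone \
         --nproc_per_node=8 \
         --no_python \
         fast-llm train gpt \
         --config examples/mistral-4-node-benchmark.yaml
```
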
diff --git a/examples/fast-llm-pvc.yaml b/examples/fast-llm-pvc.yaml new file mode 100644 index 00000000..b26e27eb --- /dev/null +++ b/examples/fast-llm-pvc.yaml @@ -0,0 +1,12 @@ +# Create persistent volume claim for Fast-LLM +apiVersion: "v1" +kind: "PersistentVolumeClaim" +metadata: + name: "pvc-fast-llm-home" +spec: + storageClassName: local-path + accessModes: + - ReadWriteMany + resources: + requests: + storage: 1000Gi diff --git a/examples/fast-llm.pytorchjob.yaml b/examples/fast-llm.pytorchjob.yaml new file mode 100644 index 00000000..9decff91 --- /dev/null +++ b/examples/fast-llm.pytorchjob.yaml @@ -0,0 +1,127 @@ +apiVersion: "kubeflow.org/v1" +kind: "PyTorchJob" +metadata: + name: "fast-llm" +spec: + nprocPerNode: "8" + pytorchReplicaSpecs: + Master: + replicas: 1 + restartPolicy: Never + template: + spec: + tolerations: + - key: nvidia.com/gpu + value: "true" + operator: Equal + effect: NoSchedule + containers: + - name: pytorch + image: servicenowdocker/fast-llm:latest + resources: + limits: + nvidia.com/gpu: 8 + rdma/rdma_shared_device_a: 1 + memory: "1024Gi" + cpu: + requests: + nvidia.com/gpu: 8 + rdma/rdma_shared_device_a: 1 + memory: "1024Gi" + cpu: 128 + command: + - /bin/bash + - -c + - | + torchrun --rdzv_backend=static \ + --rdzv_endpoint=${MASTER_ADDR}:${MASTER_PORT} \ + --node_rank=${RANK} \ + --nproc_per_node=${PET_NPROC_PER_NODE} \ + --nnodes=${PET_NNODES} \ + --max_restarts=0 \ + --rdzv_conf=timeout=3600 \ + --no_python \ + fast-llm train gpt \ + --config examples/mistral-4-node-benchmark.yaml + env: + - name: NCCL_DEBUG + value: "INFO" + - name: PYTHONHASHSEED + value: "0" + securityContext: + capabilities: + add: + - IPC_LOCK + volumeMounts: + - mountPath: /home/fast-llm + name: fast-llm-home + - mountPath: /dev/shm + name: dshm + volumes: + - name: fast-llm-home + persistentVolumeClaim: + claimName: pvc-fast-llm-home + - name: dshm + emptyDir: + medium: Memory + sizeLimit: "1024Gi" + Worker: + replicas: 3 + restartPolicy: Never + template: + spec: + tolerations: + - key: nvidia.com/gpu + value: "true" + operator: Equal + effect: NoSchedule + containers: + - name: pytorch + image: servicenowdocker/fast-llm:latest + resources: + limits: + nvidia.com/gpu: 8 + rdma/rdma_shared_device_a: 1 + memory: "1024Gi" + cpu: + requests: + nvidia.com/gpu: 8 + rdma/rdma_shared_device_a: 1 + memory: "1024Gi" + cpu: 128 + command: + - /bin/bash + - -c + - | + torchrun --rdzv_backend=static \ + --rdzv_endpoint=${MASTER_ADDR}:${MASTER_PORT} \ + --node_rank=${RANK} \ + --nproc_per_node=${PET_NPROC_PER_NODE} \ + --nnodes=${PET_NNODES} \ + --max_restarts=0 \ + --rdzv_conf=timeout=3600 \ + --no_python \ + fast-llm train gpt \ + --config examples/mistral-4-node-benchmark.yaml + env: + - name: NCCL_DEBUG + value: "INFO" + - name: PYTHONHASHSEED + value: "0" + securityContext: + capabilities: + add: + - IPC_LOCK + volumeMounts: + - mountPath: /home/fast-llm + name: fast-llm-home + - mountPath: /dev/shm + name: dshm + volumes: + - name: fast-llm-home + persistentVolumeClaim: + claimName: pvc-fast-llm-home + - name: dshm + emptyDir: + medium: Memory + sizeLimit: "1024Gi" diff --git a/examples/fast-llm.sbat b/examples/fast-llm.sbat new file mode 100644 index 00000000..13a966ec --- /dev/null +++ b/examples/fast-llm.sbat @@ -0,0 +1,37 @@ +#!/bin/bash +#SBATCH --job-name=fast_llm_train +#SBATCH --nodes=4 +#SBATCH --gpus-per-node=8 +#SBATCH --ntasks-per-node=1 +#SBATCH --exclusive +#SBATCH --output=job_output.log +#SBATCH --error=job_error.log + +MASTER_ADDR=$(scontrol show hostnames 
$SLURM_JOB_NODELIST | head -n 1) +MASTER_PORT=8001 + +echo $MASTER_ADDR + +export NCCL_DEBUG=WARN +export NCCL_SOCKET_IFNAME=eno1 +export UCX_TLS=self,shm,tcp +export NCCL_NET_GDR_LEVEL=PIX +export NCCL_IB_PCI_RELAXED_ORDERING=1 +export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 +export PYTHONHASHSEED=0 +export TRITON_ALLOW_NON_CONSTEXPR_GLOBALS=1 + +srun --gpus-per-node=$SLURM_GPUS_PER_NODE \ + --ntasks-per-node=$SLURM_NTASKS_PER_NODE \ + bash -c " + torchrun --rdzv_backend=static \ + --rdzv_id=0 \ + --rdzv_endpoint=${MASTER_ADDR}:${MASTER_PORT} \ + --node_rank=\$SLURM_NODEID \ + --nproc_per_node=\$SLURM_GPUS_PER_NODE \ + --nnodes=\$SLURM_NNODES \ + --max_restarts=0 \ + --rdzv_conf=timeout=3600 \ + --no_python \ + fast-llm train gpt \ + --config examples/mistral_4_node_benchmark.yaml" diff --git a/examples/mistral-4-node-benchmark.yaml b/examples/mistral-4-node-benchmark.yaml new file mode 100644 index 00000000..99dd0ee7 --- /dev/null +++ b/examples/mistral-4-node-benchmark.yaml @@ -0,0 +1,55 @@ +training: + train_iters: 100 + num_workers: 8 + logs: + interval: 10 + validation: + iterations: null + test_iters: 0 +batch: + sequence_length: 8192 + micro_batch_size: 1 + batch_size: 32 +data: + format: random + split: [1, 0, 0] +optimizer: + learning_rate: + base: 1.0e-05 + decay_style: constant + warmup_iterations: 0 + weight_decay: 0.1 + beta_1: 0.9 + beta_2: 0.95 +model: + base_model: + transformer: + normalization: + type: rms_norm + epsilon: 1.0e-05 + num_layers: 32 + hidden_size: 4096 + ffn_hidden_size: 14336 + num_attention_heads: 32 + head_groups: 8 + add_linear_biases: false + use_rotary_embeddings: true + gated: true + activation_type: silu + triton_rotary: true + kv_channels: 128 + rotary_embedding_scale: -9.210340371976184 + window_size: 4096 + init_method_std: 0.009021 + attention_dropout: 0.0 + hidden_dropout: 0.0 + vocab_size: 32000 + tie_word_embeddings: false + multi_stage: + zero_stage: 2 + distributed: + training_dtype: bf16 + distributed_timeout: 3600 + seed: 984059 +run: + experiment_dir: mistral_4_nodes_benchmark diff --git a/setup.cfg b/setup.cfg index d5de782e..55816ff4 100644 --- a/setup.cfg +++ b/setup.cfg @@ -15,7 +15,7 @@ install_requires = CORE = # Available through the nvidia base image # Keeping an older min version because later ones have no x86 wheel for Mac OS - torch >=2.2.2 + torch>=2.2.2 # Numpy major needs to match torch numpy>=1.24.4,<2.0.0 # Used for checkpoints @@ -39,6 +39,15 @@ DEV = pytest>=8.3.2 pytest-depends>=1.0.1 +# Required for building the documentation +DOCS = + mkdocs + mkdocs-material + mkdocs-material[imaging] + mkdocs-section-index + mkdocstrings[python] + mkdocs-git-committers-plugin-2 + mkdocs-git-revision-date-localized-plugin [options.entry_points] console_scripts = diff --git a/tests/common.py b/tests/common.py index 127dfb73..edc6d211 100644 --- a/tests/common.py +++ b/tests/common.py @@ -164,6 +164,9 @@ TEST_MODEL_TYPE, CONFIG_FAST_LLM, CONFIG_GPT2, CONFIG_COMMON, HUGGINGFACE_MODEL_TYPE = _CONFIGS[TEST_MODEL] +requires_cuda = pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is not available") + + def get_test_data(): if not TOKENIZER_FILE.is_file(): import transformers diff --git a/tests/test_checkpoint.py b/tests/test_checkpoint.py index 5fd272a4..6c5bc82d 100644 --- a/tests/test_checkpoint.py +++ b/tests/test_checkpoint.py @@ -19,6 +19,7 @@ TEST_MODEL, TEST_MODEL_TYPE, TEST_RESULTS_PATH, + requires_cuda, run_test_script, ) from tests.compare_tensor_logs import CompareConfig, compare_logged_tensor @@ -30,6 +31,7 @@ 
TEST_ARCHITECTURE_CONFIG_CLS = TEST_BASE_MODEL_CONFIG_CLS.architecture_cls +@requires_cuda @pytest.mark.depends() def test_checkpoint_and_eval(): # A baseline config (single-gpu, bf16, flash-attn). diff --git a/tests/test_config.py b/tests/test_config.py index 840323d6..7dc50eca 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -1,6 +1,10 @@ import pathlib import subprocess +import yaml + +from fast_llm.models.auto import trainer_registry + def test_validate_without_import(): # Make sure validation imports only the bare minimum. @@ -26,3 +30,10 @@ def test_validate_without_import(): completed_proc = subprocess.run(command) if completed_proc.returncode: raise RuntimeError(f"Process failed with return code {completed_proc.returncode}") + + +def test_validate_example_config(): + fast_llm_config_dict = yaml.safe_load( + (pathlib.Path(__file__).parents[1] / "examples" / "mistral-4-node-benchmark.yaml").read_text() + ) + trainer_registry["gpt"].from_dict(fast_llm_config_dict) diff --git a/tests/test_functional.py b/tests/test_functional.py index 4b696216..531ebccb 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -5,8 +5,10 @@ from fast_llm.functional.triton.mlp import mlp_autograd, mlp_autograd_looped, torch_mlp_activation from fast_llm.functional.triton.sparse_copy import get_sparse_map from fast_llm.utils import Assert +from tests.common import requires_cuda +@requires_cuda @pytest.mark.parametrize("gated", [True, False]) @pytest.mark.parametrize( "activation_type", [ActivationType.gelu, ActivationType.silu, ActivationType.relu, ActivationType.squared_relu] @@ -62,6 +64,7 @@ def test_mlp_recomputation(gated, activation_type): Assert.all_equal(param.grad_buffer, param_grad_ref) +@requires_cuda def test_dropless_mlp(): num_experts = 4 experts_per_token = 4 diff --git a/tests/test_triton_kernels.py b/tests/test_triton_kernels.py index 5cb7d1bb..3ad4605a 100644 --- a/tests/test_triton_kernels.py +++ b/tests/test_triton_kernels.py @@ -24,8 +24,10 @@ from fast_llm.functional.triton.rotary import triton_rotary_ from fast_llm.functional.triton.sparse_copy import get_sparse_map from fast_llm.utils import Assert, rms_diff +from tests.common import requires_cuda +@requires_cuda def test_triton_fill(): assert TritonConfig.TRITON_ENABLED x = torch.randn(425, 549, dtype=torch.bfloat16, device="cuda") @@ -33,6 +35,7 @@ def test_triton_fill(): assert x.min().item() == x.max().item() == 32 +@requires_cuda def test_triton_copy(): assert TritonConfig.TRITON_ENABLED x = torch.randn(7563, dtype=torch.bfloat16, device="cuda") @@ -44,6 +47,7 @@ def test_triton_copy(): Assert.all_equal(x, x1) +@requires_cuda def test_triton_copy_cast(): assert TritonConfig.TRITON_ENABLED x = torch.randn(7563, dtype=torch.bfloat16, device="cuda") @@ -55,6 +59,7 @@ def test_triton_copy_cast(): Assert.all_equal(x, x1) +@requires_cuda def test_triton_add(): assert TritonConfig.TRITON_ENABLED x = torch.randn(8934, dtype=torch.float32, device="cuda") @@ -69,6 +74,7 @@ def test_triton_add(): Assert.all_equal(y, y1) +@requires_cuda @pytest.mark.parametrize( ("batch_size", "sequence_length", "num_heads", "kv_channels"), [(4, 1024, 8, 128), (1, 32, 1, 16), (2, 2048, 2, 192), (3, 519, 7, 134)], @@ -90,6 +96,7 @@ def test_triton_rotary(batch_size, sequence_length, num_heads, kv_channels): Assert.rms_close(y1, y2, 1e-3) +@requires_cuda @pytest.mark.parametrize("has_bias", [True, False]) @pytest.mark.parametrize("zero_centered", [True, False]) def test_triton_normalization(has_bias, zero_centered): @@ -139,6 
+146,7 @@ def test_triton_normalization(has_bias, zero_centered): Assert.rms_close(bias_grad0, bias.grad, 1e-3) +@requires_cuda @pytest.mark.parametrize("gated", [True, False]) @pytest.mark.parametrize( "activation_type", [ActivationType.gelu, ActivationType.silu, ActivationType.relu, ActivationType.squared_relu] @@ -161,6 +169,7 @@ def test_triton_mlp_activation(gated, activation_type, recompute): Assert.rms_close(output1, output3, 1e-5) +@requires_cuda def test_triton_cross_entropy(): assert TritonConfig.TRITON_ENABLED logits = torch.randn(1024, 8192, dtype=torch.bfloat16, device="cuda", requires_grad=True) @@ -181,6 +190,7 @@ def test_triton_cross_entropy(): Assert.rms_close(g2, g3, 1e-3) +@requires_cuda def test_triton_adam(): assert TritonConfig.TRITON_ENABLED params = torch.randn(4576427, dtype=torch.float32, device="cuda") @@ -238,6 +248,7 @@ def compare(i, j, fn, arg): compare(0, 4, Assert.eq, 0) +@requires_cuda @pytest.mark.parametrize( ("num_rows_dense", "num_experts", "num_experts_per_token"), [(2048, 8, 2), (2048, 6, 2), (2048, 8, 8), (256, 8, 2), (5627, 8, 2)],
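
The `requires_cuda` marker added in `tests/common.py` is a `pytest.mark.skipif` on `torch.cuda.is_available()`, so the GPU-only kernel and checkpoint tests are skipped rather than failed on machines without a GPU, presumably including the `ubuntu-latest` CI runner configured above. A sketch of reproducing the CI test job locally, using the same commands as `.github/workflows/ci.yaml`:

```bash
# Reproduce the CI test job locally; on a CPU-only machine the tests marked
# with `requires_cuda` should be reported as skipped instead of failing.
pip install "torch>=2.2.2"
FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE FLASH_ATTENTION_FORCE_BUILD=TRUE \
    pip install --no-build-isolation -e ".[CORE,OPTIONAL,DEV,DOCS]"
pytest .
```
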