From ac13217cbfb0654db5cf0b467f209ebdebe52b0e Mon Sep 17 00:00:00 2001 From: Eitan Turok <150733043+eitanturok@users.noreply.github.com> Date: Thu, 1 Aug 2024 13:08:31 -0400 Subject: [PATCH] Replace pydocstyle with Ruff (#1417) --- .pre-commit-config.yaml | 11 --- .../data_prep/convert_dataset_json.py | 2 +- .../data_prep/convert_delta_to_json.py | 26 +++--- llmfoundry/data/finetuning/dataloader.py | 86 ++++++++++++++++--- llmfoundry/data/finetuning/tasks.py | 32 ++++++- llmfoundry/data/packing.py | 2 +- .../in_context_learning_evaluation.py | 27 ++++-- llmfoundry/eval/datasets/utils.py | 2 +- llmfoundry/eval/metrics/nlp.py | 2 +- llmfoundry/models/hf/hf_causal_lm.py | 1 + llmfoundry/models/layers/attention.py | 1 + llmfoundry/models/layers/ffn.py | 1 + llmfoundry/models/mpt/configuration_mpt.py | 1 + llmfoundry/tokenizers/tiktoken.py | 1 + llmfoundry/utils/builders.py | 11 ++- .../utils/checkpoint_conversion_helpers.py | 1 + llmfoundry/utils/config_utils.py | 3 + llmfoundry/utils/model_download_utils.py | 4 +- llmfoundry/utils/registry_utils.py | 2 + pyproject.toml | 40 ++++++--- 20 files changed, 187 insertions(+), 69 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index dc2e3f55cd..b45021dd8c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -77,17 +77,6 @@ repos: hooks: - id: docformatter args: [--in-place, --wrap-summaries=80, --wrap-descriptions=80] -- repo: https://github.com/PyCQA/pydocstyle - hooks: - - id: pydocstyle - name: pydocstyle - entry: pydocstyle - language: python - types: [python] - exclude: (.ci|.github) - additional_dependencies: - - toml - rev: 6.1.1 - repo: https://github.com/adrienverge/yamllint.git rev: v1.28.0 hooks: diff --git a/llmfoundry/command_utils/data_prep/convert_dataset_json.py b/llmfoundry/command_utils/data_prep/convert_dataset_json.py index 9f174d1aaf..35d7e637e6 100644 --- a/llmfoundry/command_utils/data_prep/convert_dataset_json.py +++ b/llmfoundry/command_utils/data_prep/convert_dataset_json.py @@ -34,7 +34,7 @@ def build_hf_dataset( """Build an IterableDataset over the HF C4 or pile source data. Args: - dataset_name (str): Dataset name + path (str): Dataset name split (str): Split name. mode (ConcatMode): NO_CONCAT, or CONCAT_TOKENS max_length (int): The length of concatenated tokens diff --git a/llmfoundry/command_utils/data_prep/convert_delta_to_json.py b/llmfoundry/command_utils/data_prep/convert_delta_to_json.py index b76e457e2c..635efd54d4 100644 --- a/llmfoundry/command_utils/data_prep/convert_delta_to_json.py +++ b/llmfoundry/command_utils/data_prep/convert_delta_to_json.py @@ -78,15 +78,16 @@ def to_cf(self: 'SparkConnectClient', return the schema and drops all other responses. Args: - plan (pb2.Plan): The plan object to be executed by spark. - type (str): The output format of the result, supported formats are 'json', 'csv', and 'arrow'. + self (SparkConnectClient): The SparkConnectClient we are processing. + plan (pb2.Plan): The plan object to be executed by spark. + type (str): The output format of the result, supported formats are 'json', 'csv', and 'arrow'. Returns: - Tuple[List[Result], int, bool]: A tuple containing: - - A list of Result namedtuples, each containing a URL, row count, compressed size, - and uncompressed size of the part of the result. - - Total row count of all parts of the result. - - A boolean indicating whether the result has been truncated. 
+ Tuple[List[Result], int, bool]: A tuple containing: + - A list of Result namedtuples, each containing a URL, row count, compressed size, + and uncompressed size of the part of the result. + - Total row count of all parts of the result. + - A boolean indicating whether the result has been truncated. """ req = self._execute_plan_request_with_metadata() req.plan.CopyFrom(plan) @@ -120,8 +121,9 @@ def to_cf(self: 'SparkConnectClient', ) # Create the iterator - from pyspark.sql.connect.client.reattach import \ - ExecutePlanResponseReattachableIterator + from pyspark.sql.connect.client.reattach import ( + ExecutePlanResponseReattachableIterator, + ) iterator = ExecutePlanResponseReattachableIterator( req, self._stub, @@ -169,6 +171,7 @@ def collect_as_cf(self: 'DataFrame', uses the `to_cf` method to execute the plan and fetch results as presigned URLs. Args: + self (pd.DataFrame): The dataframe we are processing. type (str): The output format of the result, supported formats are 'json', 'csv', and 'arrow'. Returns: @@ -693,8 +696,9 @@ def _check_imports(): import pyspark.sql.connect.proto.cloud_pb2 as cloud_pb2 from pyspark.sql import SparkSession from pyspark.sql.connect.client.core import SparkConnectClient - from pyspark.sql.connect.client.reattach import \ - ExecutePlanResponseReattachableIterator + from pyspark.sql.connect.client.reattach import ( + ExecutePlanResponseReattachableIterator, + ) from pyspark.sql.connect.dataframe import DataFrame from pyspark.sql.dataframe import DataFrame as SparkDataFrame from pyspark.sql.types import Row diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py index 11104ac706..d9450bc657 100644 --- a/llmfoundry/data/finetuning/dataloader.py +++ b/llmfoundry/data/finetuning/dataloader.py @@ -64,9 +64,12 @@ def build_finetuning_dataloader( on which you intend to use, as explained below. Args: - name (str): The type of dataloader to build. Must = "finetuning". - --- - *** HuggingFace dataset config fields *** + tokenizer (transformers.PreTrainedTokenizer): The tokenizer used to + prepare the data from raw text. Any missing sentinel tokens will + be added by the collator. + device_batch_size (int, float): The size of the batches (number of examples) + that the dataloader will produce. + dataset (Dict[str, Any]): A HuggingFace dataset config which contains the following fields: dataset.hf_name (str, optional): The name of the HuggingFace dataset to use. Can also be a remote http(s) directory or object store bucket containing the file {split}.jsonl in the format (prompt, response), @@ -130,16 +133,32 @@ def build_finetuning_dataloader( The script `scripts/misc/profile_packing.py` can help you choose the best packing_ratio. dataset.shuffle (bool): Whether to shuffle the dataset. - ___ See :class:`StreamingFinetuningDataset` for info on other standard config options within `dataset` that will be passed as kwargs if using the streaming codepath. - --- - tokenizer (transformers.PreTrainedTokenizer): The tokenizer used to - prepare the data from raw text. Any missing sentinel tokens will - be added by the collator. - device_batch_size (int, float): The size of the batches (number of examples) - that the dataloader will produce. + num_workers (int, optional): How many subprocesses to use for data loading. + 0 means that the data will be loaded in the main process. The default is 0. + This argument is passed directly to the pytorch :class:`DataLoader`. 
+ drop_last (bool, optional): If true, drop the last incomplete batch, if the dataset + size is not divisible by the batch size. If False and the size of dataset is + not divisible by the batch size, then the last batch will be smaller. The + default is False. This argument is passed directly to the pytorch :class:`DataLoader`. + pin_memory (bool, optional): If True, the data loader will copy Tensors into device/CUDA + pinned memory before returning them. If your data elements are a custom type, or your + `collate_fn` returns a batch that is a custom type. This argument is passed directly to + the pytorch :class:`DataLoader`. + prefetch_factor (int, optional): Number of batches loaded in advance by each worker. + 2 means there will be a total of 2 * num_workers batches prefetched across all workers. + (default value depends on the set value for num_workers. If value of num_workers=0 default + is None. Otherwise, if value of num_workers > 0 default is 2). This argument is passed + directly to the pytorch :class:`DataLoader`. + persistent_workers (bool, optional): If True, the data loader will not shut down the worker + processes after a dataset has been consumed once. This allows to maintain the workers + Dataset instances alive. The default is False. This argument is passed directly to the + pytorch :class:`DataLoader`. + timeout (int, optional): If positive, the timeout value for collecting a batch from workers. + Should always be non-negative. The default is 0. This argument is passed directly to the + pytorch :class:`DataLoader`. See :class:`DataLoader` for standard argument options to the pytorch dataloader, such as `drop_last`, `num_workers`, etc. @@ -357,7 +376,50 @@ def _validate_config( the other. Args: - dataset_cfg (DictConfig): The dataset configuration to be validated. + max_seq_len (int): The maximum length of sequences + in the batch. See :class:`Seq2SeqFinetuningCollator` docstring + for details. + decoder_only_format (bool): Whether to format the + examples for a decoder-only model. See :class:`Seq2SeqFinetuningCollator` + docstring for details. + hf_name (str, optional): The name of the HuggingFace dataset + to use. Can also be a remote http(s) directory or object store bucket + containing the file {split}.jsonl in the format (prompt, response), + in which case the builder will create a HuggingFace dataset. + local (str, optional): Local path where remote data + will be streamed to. Only valid if `cfg.dataset.remote` has + also been set. + remote (str, optional): Location of a MDS-formatted + streaming dataset to use. Setting this will tell the builder + to create a streaming dataset rather than a HuggingFace dataset. + hf_kwargs (DictConfig, optional): Additional kwargs to + pass to `datasets.load_dataset`, which can be used to load + a dataset from local files. + preprocessing_fn (str, optional): The name/import path of + the preprocessing function to use for formatting the data examples. + If ``None`` (default), the builder will use the preprocessing function + registered under `hf_name` (see `tasks.py`), if one exists, + otherwise it will skip preprocessing. + If `preprocessing_fn` corresponds to a registered preprocessing + function in `tasks.py`, the builder will use that. + Otherwise, it will interpret `preprocessing_fn` as a + "import.path:function_name" import path; e.g., it will call + `from import.path import function_name` and use the imported + function as the preprocessing function. + safe_load (bool, optional): Whether to enforce safe loading of the dataset. 
+ If `None`, will default to not applying any safe loading. + streams (Dict[str, Any], optional): A dictionary with multiple data streams. + If `None`, will assume no streams. + target_prompts (str): Which prompts are used as training targets. + Defaults to "none", meaning prompts are never used as training targets. + See :class:`Seq2SeqFinetuningCollator` docstring for details. + target_responses (str): Which responses are used as training targets. + Defaults to "last", meaning only the final response in multi-turn examples + will serve as training targets. See :class:`Seq2SeqFinetuningCollator` docstring for + details. + kwargs (DictConfig, optional): Additional kwargs to + pass to `datasets.load_dataset`, which can be used to load + a dataset from local files. Raises: ValueError: If the dataset configuration does not meet the requirements. @@ -504,7 +566,7 @@ def _download_remote_hf_dataset(remote_path: str, split: str) -> str: completed, the function removes the signal file. Args: - hf_name (str): The path of the HuggingFace dataset to download. + remote_path (str): The path of the HuggingFace dataset to download. split (str): The dataset split to download (e.g., 'train', 'validation', 'test'). Returns: diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py index 78bfb9c74c..397b619e73 100644 --- a/llmfoundry/data/finetuning/tasks.py +++ b/llmfoundry/data/finetuning/tasks.py @@ -162,7 +162,7 @@ def _is_empty_or_nonexistent(dirpath: str) -> bool: Args: dirpath (str): Directory path to check. - Returns + Returns: True if directory is empty or non-existent. False otherwise. """ return not os.path.isdir(dirpath) or len(os.listdir(dirpath)) == 0 @@ -820,9 +820,33 @@ def build_from_hf( Note: This function will drop examples where the prompt is longer than the max_seq_len Args: - cfg (DictConfig): The dataset configuration. - max_seq_len (int): The maximum sequence length. Examples with prompts longer than this will be dropped. - tokenizer (Tokenizer): The tokenizer to be used for tokenizing the dataset. + dataset_name (str): The name of the HuggingFace dataset + to use. Can also be a remote http(s) directory or object store bucket + containing the file {split}.jsonl in the format (prompt, response), + in which case the builder will create a HuggingFace dataset. + split (str): The split of the HuggingFace dataset. + safe_load (bool, optional): Whether to enforce safe loading of the dataset. + If `None`, will default to not applying any safe loading. + max_seq_len (int): The maximum length of sequences + in the batch. See :class:`Seq2SeqFinetuningCollator` docstring + for details. + preprocessing_fn (Callable, optional): The preprocessing function to use for + formatting the data examples. + tokenizer (PreTrainedTokenizerBase): The tokenizer to be used for tokenizing + the HuggingFace dataset. + target_prompts (str): Which prompts are used as training targets. + Defaults to "none", meaning prompts are never used as training targets. + See :class:`Seq2SeqFinetuningCollator` docstring for details. + target_responses (str): Which responses are used as training targets. + Defaults to "last", meaning only the final response in multi-turn examples + will serve as training targets. See :class:`Seq2SeqFinetuningCollator` docstring for + details. + decoder_only_format (bool): Whether to format the + examples for a decoder-only model. See :class:`Seq2SeqFinetuningCollator` + docstring for details. 
+ hf_kwargs (DictConfig, optional): Additional kwargs to + pass to `datasets.load_dataset`, which can be used to load + a dataset from local files. Returns: Dataset: The tokenized dataset. diff --git a/llmfoundry/data/packing.py b/llmfoundry/data/packing.py index a6fdf34953..5579066f89 100644 --- a/llmfoundry/data/packing.py +++ b/llmfoundry/data/packing.py @@ -337,7 +337,7 @@ def auto_packing_ratio( dataloader_cfg (DictConfig): The dataloader configuration for profiling. tokenizer (PreTrainedTokenizerBase): The tokenizer for profiling. device_batch_size (int): The size of the batches (number of examples) per device. - num_packing_ratio (int): The number of packing ratios to try. + num_packing_ratios (int): The number of packing ratios to try. Returns: A packing ratio that minimizes padding while maintaining zero waste. diff --git a/llmfoundry/eval/datasets/in_context_learning_evaluation.py b/llmfoundry/eval/datasets/in_context_learning_evaluation.py index 8a8b9de551..4e49be3fba 100644 --- a/llmfoundry/eval/datasets/in_context_learning_evaluation.py +++ b/llmfoundry/eval/datasets/in_context_learning_evaluation.py @@ -251,8 +251,9 @@ def read_dataset( """ from datasets import \ Dataset as HFDataset # pyright: ignore[reportGeneralTypeIssues] - from datasets import \ - load_dataset # pyright: ignore[reportGeneralTypeIssues] + from datasets import ( # pyright: ignore[reportGeneralTypeIssues] + load_dataset, + ) if 'hf://' in dataset_uri: dataset_uri = dataset_uri.replace('hf://', '') if hf_loading_vars is None: @@ -363,6 +364,7 @@ def get_answer_from_example( Args: example (Dict): The example from which to retrieve the answer + in_context (bool): Whether this is an in-context example. Default to False. Returns: str: The answer in the example @@ -712,6 +714,7 @@ def get_answer_from_example( Args: example (Dict): The example from which to retrieve the answer + in_context (bool): Whether this is an in-context example. Default to False. Returns: str: The answer in from the example with chain of thought and delimiter if needed @@ -731,7 +734,7 @@ def tokenize_example( Args: prompt_and_fewshot (str): The collection of the prompt and fewshot examples that belongs before the example's context - ctx (str): The specific example's derived context + ctxt (str): The specific example's derived context example (Dict): The example as a dictionary. Returns: @@ -1035,6 +1038,7 @@ def get_answer_from_example( Args: example (Dict): The example from which to retrieve the answer + in_context (bool): Whether this is an in-context example. Default to False. Returns: str: The full string of the correct answer based on the 'gold' key @@ -1053,7 +1057,7 @@ def tokenize_example( Args: prompt_and_fewshot (str): The collection of the prompt and fewshot examples that belongs before the example's context - ctx (str): The specific example's derived context + ctxt (str): The specific example's derived context example (Dict): The example as a dictionary. Returns: @@ -1129,6 +1133,7 @@ def collate_fn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]: since the batch may consist of multiple questions, the choice_groupings indicates which contiguous sequences of elements in the batch correspond to which question gold_indices indicates which of the [0, N-1] choices is the correct one for each question. + Args: data (List): List of tokenized datapoints (dicts returned by self._tokenize_example) @@ -1168,6 +1173,7 @@ def split_batch(self, batch: Any, and real example, which refers to one possible continuation. 
As example count and microbatch_size are tracked in logical example, we split logical attributes by microbatch_size and real attributes by microbatch_size * num_choices. + Args: batch (Dict): Batch of data microbatch_size (int | float): Size of microbatches @@ -1419,7 +1425,7 @@ def tokenize_example( Args: prompt_and_fewshot (str): The collection of the prompt and fewshot examples that belongs before the example's context - ctx (str): The specific example's derived context + context_options (str): A list of contexts for this specific example. example (Dict): The example as a dictionary. Returns: @@ -1548,6 +1554,10 @@ def partition_dataset_by_category( Args: dataset_uri (str): Location of dataset. destination_path (str): Base destination path, we will write a separate partition off this URI for each category. + hf_loading_vars (Dict): A dictionary containing keyword arguments to be passed into `load_dataset` if dataset is being pulled from HF. + hf_parsing_map (Dict): A dictionary containing a mapping from HF columns to ICL dataset keys. The dictionary should be formatted {icl_key:[hf_key1, hf_key1]}. + Column contents will be concatenated with ' ' separating them. If not included, will load the columns already present in the HF dataset. + Raises: MissingConditionalImportError: If datasets not installed raise exception. @@ -1643,8 +1653,7 @@ def get_icl_task_dataloader( # At this point, hf_model is randomly initialized composer_model = HuggingFaceModel(hf_model, hf_tokenizer) - Example: - + Example: .. testcode:: @@ -1685,8 +1694,8 @@ def get_icl_task_dataloader( hf_loading_vars (Dict, default = None): A dictionary containing keyword arguments to be passed into `load_dataset` if dataset is being pulled from HF. hf_parsing_map (Dict, default = None): A dictionary containing a mapping from HF columns to ICL dataset keys. The dictionary should be formatted {icl_key:[hf_key1, hf_key1]}. Column contents will be concatenated with ' ' separating them. If not included, will load the columns already present in the HF dataset. - kwargs (Dict[str, Any], default=None): Dictionary containing a mapping - from ICL dataset constructor's parameter names and their desired values. + destination_path: Where the dataloader will be saved. + kwargs (Dict[str, Any], default=None): Dictionary containing a mapping from ICL dataset constructor's parameter names and their desired values. Returns: DataLoader: A dataloader used for performing in-context learning evaluation on the dataset provided. diff --git a/llmfoundry/eval/datasets/utils.py b/llmfoundry/eval/datasets/utils.py index 1ce249437d..c19ae15dd9 100644 --- a/llmfoundry/eval/datasets/utils.py +++ b/llmfoundry/eval/datasets/utils.py @@ -130,7 +130,7 @@ def make_padded_input( Args: context_enc (List): The encoded input to the model continuation_enc (List): The encoded desired output for the example - max_seq_list (int): Maximum length sequences can be + max_seq_len (int): Maximum length sequences can be pad_tok_id (int): The token id we pad with padding_side (str): Which side to pad the context on. Can be 'right' or 'left diff --git a/llmfoundry/eval/metrics/nlp.py b/llmfoundry/eval/metrics/nlp.py index 3ee30ebf5e..f0fbba3ece 100644 --- a/llmfoundry/eval/metrics/nlp.py +++ b/llmfoundry/eval/metrics/nlp.py @@ -80,7 +80,7 @@ def update( Args: batch (dict): Batch must consist minimally of `input_ids` as well as any other structure needed to compute the metric. 
- output_logits (torch.Tensor): The model outputs evaluated on the batch `input_ids` + outputs (torch.Tensor): The model outputs evaluated on the batch `input_ids`. labels (torch.Tensor): The correct outputs. Raises: diff --git a/llmfoundry/models/hf/hf_causal_lm.py b/llmfoundry/models/hf/hf_causal_lm.py index f7f372f5fa..34ce22d694 100644 --- a/llmfoundry/models/hf/hf_causal_lm.py +++ b/llmfoundry/models/hf/hf_causal_lm.py @@ -205,6 +205,7 @@ def build_inner_model( use_auth_token (bool): Whether to use an authentication token. config_overrides (Dict[str, Any]): The configuration overrides. load_in_8bit (bool): Whether to load in 8-bit. + pretrained (bool): Whether the model is pretrained. prepare_for_fsdp (bool, optional): Whether to prepare the model for FSDP wrapping. Default: False. Returns: diff --git a/llmfoundry/models/layers/attention.py b/llmfoundry/models/layers/attention.py index c7fdb5b987..3e365edc47 100644 --- a/llmfoundry/models/layers/attention.py +++ b/llmfoundry/models/layers/attention.py @@ -606,6 +606,7 @@ def get_qkv( Args: x (torch.Tensor): The input tensor. + prev_layer_key_value (Optional[Tuple[torch.Tensor, torch.Tensor]]): The key value of the previous layer. Returns: query (torch.Tensor): The query tensor. diff --git a/llmfoundry/models/layers/ffn.py b/llmfoundry/models/layers/ffn.py index 8028a65a8b..f5d6d67040 100644 --- a/llmfoundry/models/layers/ffn.py +++ b/llmfoundry/models/layers/ffn.py @@ -429,6 +429,7 @@ def set_ffn_device_mesh( ffn (nn.Module): The FFN module. moe_world_size (int): The MoE world size. device_mesh (DeviceMesh): The full device mesh. + get_fsdp_submesh (Callable[[DeviceMesh], DeviceMesh]): A function to get the fsdp submesh. Raises: RuntimeError: If the device mesh is 3D. diff --git a/llmfoundry/models/mpt/configuration_mpt.py b/llmfoundry/models/mpt/configuration_mpt.py index 86cc3519ba..9671eb6ed5 100644 --- a/llmfoundry/models/mpt/configuration_mpt.py +++ b/llmfoundry/models/mpt/configuration_mpt.py @@ -147,6 +147,7 @@ def __init__( reuse_kv_layer: attn_config: reuse_kv_layer_idx: -6 # Relative index of the layer whose kv cache to reuse + kwargs (Any): Other relevant keyword arguments. """ self.d_model = d_model self.n_heads = n_heads diff --git a/llmfoundry/tokenizers/tiktoken.py b/llmfoundry/tokenizers/tiktoken.py index f087664344..fd0fc5948a 100644 --- a/llmfoundry/tokenizers/tiktoken.py +++ b/llmfoundry/tokenizers/tiktoken.py @@ -90,6 +90,7 @@ def __init__( errors (str, optional): Paradigm to follow when decoding bytes to UTF-8. See [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information. Defaults to `"replace"`. + kwargs (Any): Other relevant keyword arguments. """ try: import tiktoken diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index 9f18c31ec6..000155f1a4 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -35,8 +35,9 @@ from llmfoundry import registry from llmfoundry.callbacks import EvalGauntlet from llmfoundry.data.dataloader import build_dataloader -from llmfoundry.eval.datasets.in_context_learning_evaluation import \ - get_icl_task_dataloader +from llmfoundry.eval.datasets.in_context_learning_evaluation import ( + get_icl_task_dataloader, +) from llmfoundry.utils.config_utils import to_dict_container, to_list_container from llmfoundry.utils.registry_utils import construct_from_registry @@ -191,7 +192,8 @@ def build_load_planner(name: str, **kwargs: Any) -> LoadPlanner: """Builds a load planner from the registry. 
Args: - name: Name of the load planner to build. + name (str): Name of the load planner to build. + kwargs (Any): Other relevant keyword arguments. Returns: LoadPlanner: The load planner. @@ -210,7 +212,8 @@ def build_save_planner(name: str, **kwargs: Any) -> SavePlanner: """Builds a save planner from the registry. Args: - name: Name of the save planner to build. + name (str): Name of the save planner to build. + kwargs (Any): Other relevant keyword arguments. Returns: savePlanner: The save planner. diff --git a/llmfoundry/utils/checkpoint_conversion_helpers.py b/llmfoundry/utils/checkpoint_conversion_helpers.py index 905afd6edb..5c65a7475e 100644 --- a/llmfoundry/utils/checkpoint_conversion_helpers.py +++ b/llmfoundry/utils/checkpoint_conversion_helpers.py @@ -177,6 +177,7 @@ def _convert_weight_to_ft_each( tensor_name (str): Name of the weight tensor. Used in naming the output file. config (Dict[str, Any]): Configuration for the model. This is used in getting model specific parameters. data (np.ndarray): Tensor data in np.ndarray format. + np_weight_data_type (np.dtype): Data type of the numpy array `data`. Returns: None: Writes to a file in `save_dir`. File name is based on the `tensor_name` diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index 6811b09e7d..f10fe32735 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -466,6 +466,9 @@ def update_config_with_batch_size_info( Args: cfg (Dict[str, Any]): The config to update. + device_train_batch_size (Union[int, float]): The batch size of the training dataset for each device. + device_train_microbatch_size (Union[int, float, Literal['auto']]): The microbatch size of the training dataset for each device. + device_train_grad_accum (Union[int, Literal['auto']]): The gradient accumulation settings for each device. Returns: Dict[str, Any]: The updated config. diff --git a/llmfoundry/utils/model_download_utils.py b/llmfoundry/utils/model_download_utils.py index dde8240d8b..9609982fda 100644 --- a/llmfoundry/utils/model_download_utils.py +++ b/llmfoundry/utils/model_download_utils.py @@ -69,7 +69,7 @@ def download_from_hf_hub( Safetensors weights will be downloaded unless `prefer_safetensors` is set to False. Args: - repo_id (str): The Hugging Face Hub repo ID. + model (str): The Hugging Face Hub repo ID. save_dir (str, optional): The local path to the directory where the model files will be downloaded. prefer_safetensors (bool): Whether to prefer Safetensors weights over PyTorch weights if both are available. Defaults to True. @@ -157,7 +157,7 @@ def _recursive_download( Args: session: A requests.Session through which to make requests to the remote server. - url (str): The base URL where the files are located. + base_url (str): The base URL where the files are located. path (str): The path from the base URL to the files to download. The full URL for the download is equal to '/'. save_dir (str): The directory to save downloaded files to. diff --git a/llmfoundry/utils/registry_utils.py b/llmfoundry/utils/registry_utils.py index 3ea7cc58a7..f96e72b3a2 100644 --- a/llmfoundry/utils/registry_utils.py +++ b/llmfoundry/utils/registry_utils.py @@ -127,6 +127,7 @@ def construct_from_registry( before constructing the item to return. This should throw an exception if validation fails. Defaults to None. post_validation_function (Optional[Callable[[Any], None]], optional): An optional validation function called after constructing the item to return. 
This should throw an exception if validation fails. Defaults to None. + kwargs (Optional[Dict[str, Any]]): Other relevant keyword arguments. Raises: ValueError: If the validation functions failed or the registered item is invalid @@ -176,6 +177,7 @@ def import_file(loc: Union[str, Path]) -> ModuleType: """Import module from a file. Used to run arbitrary python code. + Args: name (str): Name of module to load. loc (str / Path): Path to the file. diff --git a/pyproject.toml b/pyproject.toml index e5c931f4c5..fdbabfff96 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,23 +11,44 @@ skip = [ "env", "wandb", "runs", "build", "node_modules" ] include_trailing_comma = true split_on_trailing_comma = true +# Ruff global +[tool.ruff] +exclude = [ + "build/**", + "docs/**", + "node_modules/**", +] + +# Ruff linter [tool.ruff.lint] select = [ "C4", - # TODO port pydocstyle - # "D", # pydocstyle "LOG", "PERF", "PLE", "COM812", + "D", # pydocstyle ] -[tool.ruff] -exclude = [ - "build/**", - "docs/**", - "node_modules/**", + +extend-select = ["D404"] # pydocstyle + +ignore = [ + "D100", + "D101", + "D102", + "D103", + "D104", + "D105", + "D107", + "D400", + "D401", + "D415", ] +[tool.ruff.lint.pydocstyle] +convention = "google" + + # Coverage [tool.coverage.run] parallel = true @@ -506,8 +527,3 @@ ignore_patterns = [ "wandb/**/*.py", "build/**/*.py", ] - -[tool.pydocstyle] -convention="google" -add_ignore="D100,D101,D102,D103,D104,D105,D107,D400,D401,D415" -add_select="D404"
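
For reference, a minimal sketch (not part of the patch; the function and names are hypothetical) of a docstring that passes the Ruff "D" configuration added above: convention = "google" drives the Args/Returns section layout used throughout this diff, the extra D404 check rejects summaries that begin with "This", and the ignored codes (D100-D105, D107, D400, D401, D415) waive the missing-docstring, summary-punctuation, and imperative-mood checks. The rules can be exercised locally with, e.g., `ruff check .`.

# Hypothetical example only; mirrors the Google-style sections selected in pyproject.toml.
def describe_split(split: str, num_examples: int) -> str:
    """Builds a human-readable description of a dataset split.

    The summary starts with a word other than "This", so D404 passes, and
    the sections below follow the Google convention configured above.

    Args:
        split (str): The dataset split, e.g. 'train' or 'validation'.
        num_examples (int): Number of examples in the split.

    Returns:
        str: A one-line description of the split.
    """
    return f'{split}: {num_examples} examples'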