Replace pydocstyle with Ruff (#1417)
eitanturok authored Aug 1, 2024
1 parent f670108 commit ac13217
Showing 20 changed files with 187 additions and 69 deletions.
11 changes: 0 additions & 11 deletions .pre-commit-config.yaml
@@ -77,17 +77,6 @@ repos:
hooks:
- id: docformatter
args: [--in-place, --wrap-summaries=80, --wrap-descriptions=80]
- repo: https://github.com/PyCQA/pydocstyle
hooks:
- id: pydocstyle
name: pydocstyle
entry: pydocstyle
language: python
types: [python]
exclude: (.ci|.github)
additional_dependencies:
- toml
rev: 6.1.1
- repo: https://github.com/adrienverge/yamllint.git
rev: v1.28.0
hooks:
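For context, the deleted hook's checks now come from Ruff, which implements the pydocstyle rules as its `D` rule family; how the repository wires Ruff in (a pre-commit hook or `pyproject.toml` lint settings) is outside this hunk. A minimal sketch, assuming `ruff` is installed, of running the equivalent docstring check locally:

```python
# Minimal sketch: run Ruff's pydocstyle-compatible "D" rules over the package,
# roughly what the removed pydocstyle hook enforced. Assumes `ruff` is on PATH
# (e.g. `pip install ruff`); the target path is illustrative.
import subprocess

result = subprocess.run(
    ['ruff', 'check', '--select', 'D', 'llmfoundry/'],
    capture_output=True,
    text=True,
)
print(result.stdout or 'No docstring violations found.')
```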
2 changes: 1 addition & 1 deletion llmfoundry/command_utils/data_prep/convert_dataset_json.py
@@ -34,7 +34,7 @@ def build_hf_dataset(
"""Build an IterableDataset over the HF C4 or pile source data.
Args:
dataset_name (str): Dataset name
path (str): Dataset name
split (str): Split name.
mode (ConcatMode): NO_CONCAT, or CONCAT_TOKENS
max_length (int): The length of concatenated tokens
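For reference, a hedged call sketch using only the parameters named in the docstring above; the import location of `ConcatMode` and the full argument list (the real function also takes tokenizer and bos/eos options not shown in this hunk) are assumptions:

```python
# Illustrative only: parameter names follow the docstring above; the exact
# signature and the ConcatMode import path are assumptions about this module.
from llmfoundry.command_utils.data_prep.convert_dataset_json import (
    ConcatMode,
    build_hf_dataset,
)

dataset = build_hf_dataset(
    path='data/train.jsonl',    # dataset name, or a path to {split}.jsonl files
    split='train',
    mode=ConcatMode.NO_CONCAT,  # or ConcatMode.CONCAT_TOKENS with a tokenizer
    max_length=None,            # length of concatenated tokens when concatenating
)
```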
26 changes: 15 additions & 11 deletions llmfoundry/command_utils/data_prep/convert_delta_to_json.py
@@ -78,15 +78,16 @@ def to_cf(self: 'SparkConnectClient',
return the schema and drops all other responses.
Args:
plan (pb2.Plan): The plan object to be executed by spark.
type (str): The output format of the result, supported formats are 'json', 'csv', and 'arrow'.
self (SparkConnectClient): The SparkConnectClient we are processing.
plan (pb2.Plan): The plan object to be executed by spark.
type (str): The output format of the result, supported formats are 'json', 'csv', and 'arrow'.
Returns:
Tuple[List[Result], int, bool]: A tuple containing:
- A list of Result namedtuples, each containing a URL, row count, compressed size,
and uncompressed size of the part of the result.
- Total row count of all parts of the result.
- A boolean indicating whether the result has been truncated.
Tuple[List[Result], int, bool]: A tuple containing:
- A list of Result namedtuples, each containing a URL, row count, compressed size,
and uncompressed size of the part of the result.
- Total row count of all parts of the result.
- A boolean indicating whether the result has been truncated.
"""
req = self._execute_plan_request_with_metadata()
req.plan.CopyFrom(plan)
@@ -120,8 +121,9 @@ def to_cf(self: 'SparkConnectClient',
)

# Create the iterator
from pyspark.sql.connect.client.reattach import \
ExecutePlanResponseReattachableIterator
from pyspark.sql.connect.client.reattach import (
ExecutePlanResponseReattachableIterator,
)
iterator = ExecutePlanResponseReattachableIterator(
req,
self._stub,
@@ -169,6 +171,7 @@ def collect_as_cf(self: 'DataFrame',
uses the `to_cf` method to execute the plan and fetch results as presigned URLs.
Args:
self (pd.DataFrame): The dataframe we are processing.
type (str): The output format of the result, supported formats are 'json', 'csv', and 'arrow'.
Returns:
@@ -693,8 +696,9 @@ def _check_imports():
import pyspark.sql.connect.proto.cloud_pb2 as cloud_pb2
from pyspark.sql import SparkSession
from pyspark.sql.connect.client.core import SparkConnectClient
from pyspark.sql.connect.client.reattach import \
ExecutePlanResponseReattachableIterator
from pyspark.sql.connect.client.reattach import (
ExecutePlanResponseReattachableIterator,
)
from pyspark.sql.connect.dataframe import DataFrame
from pyspark.sql.dataframe import DataFrame as SparkDataFrame
from pyspark.sql.types import Row
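A hedged usage sketch of the patched API documented above, assuming a Databricks Spark Connect session is reachable and that this module has already attached `collect_as_cf` to the Connect `DataFrame` (as the `self` parameters above suggest); connection details and table names are placeholders:

```python
# Sketch only: assumes convert_delta_to_json has monkey-patched `collect_as_cf`
# onto the Spark Connect DataFrame, as the docstrings above describe, and that a
# Databricks Spark Connect endpoint is reachable. All identifiers are placeholders.
from pyspark.sql import SparkSession

spark = SparkSession.builder.remote(
    'sc://my-workspace.cloud.databricks.com:443/;token=<personal-access-token>',
).getOrCreate()

df = spark.sql('SELECT * FROM my_catalog.my_schema.my_table')

# Returns presigned-URL result parts, the total row count, and a truncation flag.
results, total_rows, truncated = df.collect_as_cf('json')
print(f'{total_rows} rows across {len(results)} parts; truncated={truncated}')
```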
86 changes: 74 additions & 12 deletions llmfoundry/data/finetuning/dataloader.py
@@ -64,9 +64,12 @@ def build_finetuning_dataloader(
on which you intend to use, as explained below.
Args:
name (str): The type of dataloader to build. Must = "finetuning".
---
*** HuggingFace dataset config fields ***
tokenizer (transformers.PreTrainedTokenizer): The tokenizer used to
prepare the data from raw text. Any missing sentinel tokens will
be added by the collator.
device_batch_size (int, float): The size of the batches (number of examples)
that the dataloader will produce.
dataset (Dict[str, Any]): A HuggingFace dataset config which contains the following fields:
dataset.hf_name (str, optional): The name of the HuggingFace dataset
to use. Can also be a remote http(s) directory or object store bucket
containing the file {split}.jsonl in the format (prompt, response),
@@ -130,16 +133,32 @@ def build_finetuning_dataloader(
The script `scripts/misc/profile_packing.py` can help
you choose the best packing_ratio.
dataset.shuffle (bool): Whether to shuffle the dataset.
___
See :class:`StreamingFinetuningDataset` for info on other standard config
options within `dataset` that will be passed as kwargs if
using the streaming codepath.
---
tokenizer (transformers.PreTrainedTokenizer): The tokenizer used to
prepare the data from raw text. Any missing sentinel tokens will
be added by the collator.
device_batch_size (int, float): The size of the batches (number of examples)
that the dataloader will produce.
num_workers (int, optional): How many subprocesses to use for data loading.
0 means that the data will be loaded in the main process. The default is 0.
This argument is passed directly to the pytorch :class:`DataLoader`.
drop_last (bool, optional): If true, drop the last incomplete batch, if the dataset
size is not divisible by the batch size. If False and the size of dataset is
not divisible by the batch size, then the last batch will be smaller. The
default is False. This argument is passed directly to the pytorch :class:`DataLoader`.
pin_memory (bool, optional): If True, the data loader will copy Tensors into device/CUDA
pinned memory before returning them. If your data elements are a custom type, or your
`collate_fn` returns a batch that is a custom type. This argument is passed directly to
the pytorch :class:`DataLoader`.
prefetch_factor (int, optional): Number of batches loaded in advance by each worker.
2 means there will be a total of 2 * num_workers batches prefetched across all workers.
(default value depends on the set value for num_workers. If value of num_workers=0 default
is None. Otherwise, if value of num_workers > 0 default is 2). This argument is passed
directly to the pytorch :class:`DataLoader`.
persistent_workers (bool, optional): If True, the data loader will not shut down the worker
processes after a dataset has been consumed once. This allows to maintain the workers
Dataset instances alive. The default is False. This argument is passed directly to the
pytorch :class:`DataLoader`.
timeout (int, optional): If positive, the timeout value for collecting a batch from workers.
Should always be non-negative. The default is 0. This argument is passed directly to the
pytorch :class:`DataLoader`.
See :class:`DataLoader` for standard argument options to the pytorch
dataloader, such as `drop_last`, `num_workers`, etc.
@@ -357,7 +376,50 @@ def _validate_config(
the other.
Args:
dataset_cfg (DictConfig): The dataset configuration to be validated.
max_seq_len (int): The maximum length of sequences
in the batch. See :class:`Seq2SeqFinetuningCollator` docstring
for details.
decoder_only_format (bool): Whether to format the
examples for a decoder-only model. See :class:`Seq2SeqFinetuningCollator`
docstring for details.
hf_name (str, optional): The name of the HuggingFace dataset
to use. Can also be a remote http(s) directory or object store bucket
containing the file {split}.jsonl in the format (prompt, response),
in which case the builder will create a HuggingFace dataset.
local (str, optional): Local path where remote data
will be streamed to. Only valid if `cfg.dataset.remote` has
also been set.
remote (str, optional): Location of a MDS-formatted
streaming dataset to use. Setting this will tell the builder
to create a streaming dataset rather than a HuggingFace dataset.
hf_kwargs (DictConfig, optional): Additional kwargs to
pass to `datasets.load_dataset`, which can be used to load
a dataset from local files.
preprocessing_fn (str, optional): The name/import path of
the preprocessing function to use for formatting the data examples.
If ``None`` (default), the builder will use the preprocessing function
registered under `hf_name` (see `tasks.py`), if one exists,
otherwise it will skip preprocessing.
If `preprocessing_fn` corresponds to a registered preprocessing
function in `tasks.py`, the builder will use that.
Otherwise, it will interpret `preprocessing_fn` as a
"import.path:function_name" import path; e.g., it will call
`from import.path import function_name` and use the imported
function as the preprocessing function.
safe_load (bool, optional): Whether to enforce safe loading of the dataset.
If `None`, will default to not applying any safe loading.
streams (Dict[str, Any], optional): A dictionary with multiple data streams.
If `None`, will assume no streams.
target_prompts (str): Which prompts are used as training targets.
Defaults to "none", meaning prompts are never used as training targets.
See :class:`Seq2SeqFinetuningCollator` docstring for details.
target_responses (str): Which responses are used as training targets.
Defaults to "last", meaning only the final response in multi-turn examples
will serve as training targets. See :class:`Seq2SeqFinetuningCollator` docstring for
details.
kwargs (DictConfig, optional): Additional kwargs to
pass to `datasets.load_dataset`, which can be used to load
a dataset from local files.
Raises:
ValueError: If the dataset configuration does not meet the requirements.
@@ -504,7 +566,7 @@ def _download_remote_hf_dataset(remote_path: str, split: str) -> str:
completed, the function removes the signal file.
Args:
hf_name (str): The path of the HuggingFace dataset to download.
remote_path (str): The path of the HuggingFace dataset to download.
split (str): The dataset split to download (e.g., 'train', 'validation', 'test').
Returns:
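Putting the documented arguments together, a minimal sketch of calling the dataloader builder; the dataset keys shown are a subset of those listed in the docstring above, and all values are illustrative:

```python
# Minimal sketch of the documented interface; dataset keys and values are
# illustrative, not the full set supported by the builder.
from transformers import AutoTokenizer

from llmfoundry.data.finetuning.dataloader import build_finetuning_dataloader

tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neox-20b')

dataspec = build_finetuning_dataloader(
    tokenizer=tokenizer,
    device_batch_size=8,
    dataset={
        'hf_name': 'tatsu-lab/alpaca',  # HF dataset, or a remote path with {split}.jsonl
        'split': 'train',
        'max_seq_len': 2048,
        'decoder_only_format': True,
        'shuffle': True,
    },
    num_workers=8,
    drop_last=False,
)
train_dataloader = dataspec.dataloader  # composer DataSpec wrapping the PyTorch DataLoader
```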
32 changes: 28 additions & 4 deletions llmfoundry/data/finetuning/tasks.py
@@ -162,7 +162,7 @@ def _is_empty_or_nonexistent(dirpath: str) -> bool:
Args:
dirpath (str): Directory path to check.
Returns
Returns:
True if directory is empty or non-existent. False otherwise.
"""
return not os.path.isdir(dirpath) or len(os.listdir(dirpath)) == 0
@@ -820,9 +820,33 @@ def build_from_hf(
Note: This function will drop examples where the prompt is longer than the max_seq_len
Args:
cfg (DictConfig): The dataset configuration.
max_seq_len (int): The maximum sequence length. Examples with prompts longer than this will be dropped.
tokenizer (Tokenizer): The tokenizer to be used for tokenizing the dataset.
dataset_name (str): The name of the HuggingFace dataset
to use. Can also be a remote http(s) directory or object store bucket
containing the file {split}.jsonl in the format (prompt, response),
in which case the builder will create a HuggingFace dataset.
split (str): The split of the HuggingFace dataset.
safe_load (bool, optional): Whether to enforce safe loading of the dataset.
If `None`, will default to not applying any safe loading.
max_seq_len (int): The maximum length of sequences
in the batch. See :class:`Seq2SeqFinetuningCollator` docstring
for details.
preprocessing_fn (Callable, optional): The preprocessing function to use for
formatting the data examples.
tokenizer (PreTrainedTokenizerBase): The tokenizer to be used for tokenizing
the HuggingFace dataset.
target_prompts (str): Which prompts are used as training targets.
Defaults to "none", meaning prompts are never used as training targets.
See :class:`Seq2SeqFinetuningCollator` docstring for details.
target_responses (str): Which responses are used as training targets.
Defaults to "last", meaning only the final response in multi-turn examples
will serve as training targets. See :class:`Seq2SeqFinetuningCollator` docstring for
details.
decoder_only_format (bool): Whether to format the
examples for a decoder-only model. See :class:`Seq2SeqFinetuningCollator`
docstring for details.
hf_kwargs (DictConfig, optional): Additional kwargs to
pass to `datasets.load_dataset`, which can be used to load
a dataset from local files.
Returns:
Dataset: The tokenized dataset.
2 changes: 1 addition & 1 deletion llmfoundry/data/packing.py
@@ -337,7 +337,7 @@ def auto_packing_ratio(
dataloader_cfg (DictConfig): The dataloader configuration for profiling.
tokenizer (PreTrainedTokenizerBase): The tokenizer for profiling.
device_batch_size (int): The size of the batches (number of examples) per device.
num_packing_ratio (int): The number of packing ratios to try.
num_packing_ratios (int): The number of packing ratios to try.
Returns:
A packing ratio that minimizes padding while maintaining zero waste.
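For reference, a hedged sketch of profiling a packing ratio directly with the documented signature; the dataloader config below is illustrative, and `scripts/misc/profile_packing.py` (mentioned in the dataloader docstring above) wraps the same profiling:

```python
# Sketch only: config keys are illustrative, and the config type (DictConfig vs.
# plain dict) follows the docstring above.
from omegaconf import OmegaConf
from transformers import AutoTokenizer

from llmfoundry.data.packing import auto_packing_ratio

tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neox-20b')

dataloader_cfg = OmegaConf.create({
    'name': 'finetuning',
    'dataset': {
        'hf_name': 'tatsu-lab/alpaca',
        'split': 'train',
        'max_seq_len': 2048,
        'decoder_only_format': True,
        'shuffle': True,
    },
})

ratio = auto_packing_ratio(
    dataloader_cfg=dataloader_cfg,
    tokenizer=tokenizer,
    device_batch_size=8,
    num_packing_ratios=10,  # number of candidate ratios to profile
)
print(f'Chosen packing ratio: {ratio}')
```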
27 changes: 18 additions & 9 deletions llmfoundry/eval/datasets/in_context_learning_evaluation.py
@@ -251,8 +251,9 @@ def read_dataset(
"""
from datasets import \
Dataset as HFDataset # pyright: ignore[reportGeneralTypeIssues]
from datasets import \
load_dataset # pyright: ignore[reportGeneralTypeIssues]
from datasets import ( # pyright: ignore[reportGeneralTypeIssues]
load_dataset,
)
if 'hf://' in dataset_uri:
dataset_uri = dataset_uri.replace('hf://', '')
if hf_loading_vars is None:
@@ -363,6 +364,7 @@ def get_answer_from_example(
Args:
example (Dict): The example from which to retrieve the answer
in_context (bool): Whether this is an in-context example. Default to False.
Returns:
str: The answer in the example
@@ -712,6 +714,7 @@ def get_answer_from_example(
Args:
example (Dict): The example from which to retrieve the answer
in_context (bool): Whether this is an in-context example. Default to False.
Returns:
str: The answer in from the example with chain of thought and delimiter if needed
@@ -731,7 +734,7 @@ def tokenize_example(
Args:
prompt_and_fewshot (str): The collection of the prompt and fewshot examples that belongs before the example's context
ctx (str): The specific example's derived context
ctxt (str): The specific example's derived context
example (Dict): The example as a dictionary.
Returns:
@@ -1035,6 +1038,7 @@ def get_answer_from_example(
Args:
example (Dict): The example from which to retrieve the answer
in_context (bool): Whether this is an in-context example. Default to False.
Returns:
str: The full string of the correct answer based on the 'gold' key
@@ -1053,7 +1057,7 @@ def tokenize_example(
Args:
prompt_and_fewshot (str): The collection of the prompt and fewshot examples that belongs before the example's context
ctx (str): The specific example's derived context
ctxt (str): The specific example's derived context
example (Dict): The example as a dictionary.
Returns:
@@ -1129,6 +1133,7 @@ def collate_fn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
since the batch may consist of multiple questions, the choice_groupings indicates
which contiguous sequences of elements in the batch correspond to which question
gold_indices indicates which of the [0, N-1] choices is the correct one for each question.
Args:
data (List): List of tokenized datapoints (dicts returned by self._tokenize_example)
@@ -1168,6 +1173,7 @@ def split_batch(self, batch: Any,
and real example, which refers to one possible continuation. As example count and
microbatch_size are tracked in logical example, we split logical attributes by
microbatch_size and real attributes by microbatch_size * num_choices.
Args:
batch (Dict): Batch of data
microbatch_size (int | float): Size of microbatches
@@ -1419,7 +1425,7 @@ def tokenize_example(
Args:
prompt_and_fewshot (str): The collection of the prompt and fewshot examples that belongs before the example's context
ctx (str): The specific example's derived context
context_options (str): A list of contexts for this specific example.
example (Dict): The example as a dictionary.
Returns:
@@ -1548,6 +1554,10 @@ def partition_dataset_by_category(
Args:
dataset_uri (str): Location of dataset.
destination_path (str): Base destination path, we will write a separate partition off this URI for each category.
hf_loading_vars (Dict): A dictionary containing keyword arguments to be passed into `load_dataset` if dataset is being pulled from HF.
hf_parsing_map (Dict): A dictionary containing a mapping from HF columns to ICL dataset keys. The dictionary should be formatted {icl_key:[hf_key1, hf_key1]}.
Column contents will be concatenated with ' ' separating them. If not included, will load the columns already present in the HF dataset.
Raises:
MissingConditionalImportError: If datasets not installed raise exception.
@@ -1643,8 +1653,7 @@ def get_icl_task_dataloader(
# At this point, hf_model is randomly initialized
composer_model = HuggingFaceModel(hf_model, hf_tokenizer)
Example:
Example:
.. testcode::
@@ -1685,8 +1694,8 @@ def get_icl_task_dataloader(
hf_loading_vars (Dict, default = None): A dictionary containing keyword arguments to be passed into `load_dataset` if dataset is being pulled from HF.
hf_parsing_map (Dict, default = None): A dictionary containing a mapping from HF columns to ICL dataset keys. The dictionary should be formatted {icl_key:[hf_key1, hf_key1]}.
Column contents will be concatenated with ' ' separating them. If not included, will load the columns already present in the HF dataset.
kwargs (Dict[str, Any], default=None): Dictionary containing a mapping
from ICL dataset constructor's parameter names and their desired values.
destination_path: Where the dataloader will be saved.
kwargs (Dict[str, Any], default=None): Dictionary containing a mapping from ICL dataset constructor's parameter names and their desired values.
Returns:
DataLoader: A dataloader used for performing in-context learning evaluation on the dataset provided.
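A hedged sketch of the call documented above; argument names follow the docstring, while the dataset URI, tokenizer, and delimiters are placeholders (the `.. testcode::` block in the docstring itself is the authoritative example):

```python
# Sketch only: argument names follow the docstring above; the dataset URI and
# delimiter choices are placeholders, not values from this commit.
from transformers import AutoTokenizer

from llmfoundry.eval.datasets.in_context_learning_evaluation import (
    get_icl_task_dataloader,
)

tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neox-20b')

dl = get_icl_task_dataloader(
    icl_task_type='language_modeling',
    dataset_uri='hf://some_org/some_icl_dataset',  # hypothetical dataset location
    tokenizer=tokenizer,
    batch_size=8,
    max_seq_len=2048,
    pad_tok_id=tokenizer.eos_token_id,  # used to pad examples to max_seq_len
    num_fewshot=5,
    prompt_string='',
    example_delimiter='\n',
    continuation_delimiter=' ',
    destination_path='/tmp/icl_task.jsonl',
)
```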