Replace pydocstyle with Ruff (#1417)
eitanturok authored Aug 1, 2024
1 parent f670108 commit ac13217
Showing 20 changed files with 187 additions and 69 deletions.
11 changes: 0 additions & 11 deletions .pre-commit-config.yaml
@@ -77,17 +77,6 @@ repos:
hooks:
- id: docformatter
args: [--in-place, --wrap-summaries=80, --wrap-descriptions=80]
- repo: https://github.com/PyCQA/pydocstyle
hooks:
- id: pydocstyle
name: pydocstyle
entry: pydocstyle
language: python
types: [python]
exclude: (.ci|.github)
additional_dependencies:
- toml
rev: 6.1.1
- repo: https://github.com/adrienverge/yamllint.git
rev: v1.28.0
hooks:
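For context, the deleted hook's checks now come from Ruff, which implements the pydocstyle rules as its `D` rule family; how the repository wires Ruff in (a pre-commit hook or `pyproject.toml` lint settings) is outside this hunk. A minimal sketch, assuming `ruff` is installed, of running the equivalent docstring check locally:

```python
# Minimal sketch: run Ruff's pydocstyle-compatible "D" rules over the package,
# roughly what the removed pydocstyle hook enforced. Assumes `ruff` is on PATH
# (e.g. `pip install ruff`); the target path is illustrative.
import subprocess

result = subprocess.run(
    ['ruff', 'check', '--select', 'D', 'llmfoundry/'],
    capture_output=True,
    text=True,
)
print(result.stdout or 'No docstring violations found.')
```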
2 changes: 1 addition & 1 deletion llmfoundry/command_utils/data_prep/convert_dataset_json.py
@@ -34,7 +34,7 @@ def build_hf_dataset(
"""Build an IterableDataset over the HF C4 or pile source data.
Args:
dataset_name (str): Dataset name
path (str): Dataset name
split (str): Split name.
mode (ConcatMode): NO_CONCAT, or CONCAT_TOKENS
max_length (int): The length of concatenated tokens
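For reference, a hedged call sketch using only the parameters named in the docstring above; the import location of `ConcatMode` and the full argument list (the real function also takes tokenizer and bos/eos options not shown in this hunk) are assumptions:

```python
# Illustrative only: parameter names follow the docstring above; the exact
# signature and the ConcatMode import path are assumptions about this module.
from llmfoundry.command_utils.data_prep.convert_dataset_json import (
    ConcatMode,
    build_hf_dataset,
)

dataset = build_hf_dataset(
    path='data/train.jsonl',    # dataset name, or a path to {split}.jsonl files
    split='train',
    mode=ConcatMode.NO_CONCAT,  # or ConcatMode.CONCAT_TOKENS with a tokenizer
    max_length=None,            # length of concatenated tokens when concatenating
)
```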
26 changes: 15 additions & 11 deletions llmfoundry/command_utils/data_prep/convert_delta_to_json.py
@@ -78,15 +78,16 @@ def to_cf(self: 'SparkConnectClient',
return the schema and drops all other responses.
Args:
plan (pb2.Plan): The plan object to be executed by spark.
type (str): The output format of the result, supported formats are 'json', 'csv', and 'arrow'.
self (SparkConnectClient): The SparkConnectClient we are processing.
plan (pb2.Plan): The plan object to be executed by spark.
type (str): The output format of the result, supported formats are 'json', 'csv', and 'arrow'.
Returns:
Tuple[List[Result], int, bool]: A tuple containing:
- A list of Result namedtuples, each containing a URL, row count, compressed size,
and uncompressed size of the part of the result.
- Total row count of all parts of the result.
- A boolean indicating whether the result has been truncated.
Tuple[List[Result], int, bool]: A tuple containing:
- A list of Result namedtuples, each containing a URL, row count, compressed size,
and uncompressed size of the part of the result.
- Total row count of all parts of the result.
- A boolean indicating whether the result has been truncated.
"""
req = self._execute_plan_request_with_metadata()
req.plan.CopyFrom(plan)
@@ -120,8 +121,9 @@ def to_cf(self: 'SparkConnectClient',
)

# Create the iterator
from pyspark.sql.connect.client.reattach import \
ExecutePlanResponseReattachableIterator
from pyspark.sql.connect.client.reattach import (
ExecutePlanResponseReattachableIterator,
)
iterator = ExecutePlanResponseReattachableIterator(
req,
self._stub,
@@ -169,6 +171,7 @@ def collect_as_cf(self: 'DataFrame',
uses the `to_cf` method to execute the plan and fetch results as presigned URLs.
Args:
self (pd.DataFrame): The dataframe we are processing.
type (str): The output format of the result, supported formats are 'json', 'csv', and 'arrow'.
Returns:
@@ -693,8 +696,9 @@ def _check_imports():
import pyspark.sql.connect.proto.cloud_pb2 as cloud_pb2
from pyspark.sql import SparkSession
from pyspark.sql.connect.client.core import SparkConnectClient
from pyspark.sql.connect.client.reattach import \
ExecutePlanResponseReattachableIterator
from pyspark.sql.connect.client.reattach import (
ExecutePlanResponseReattachableIterator,
)
from pyspark.sql.connect.dataframe import DataFrame
from pyspark.sql.dataframe import DataFrame as SparkDataFrame
from pyspark.sql.types import Row
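A hedged usage sketch of the patched API documented above, assuming a Databricks Spark Connect session is reachable and that this module has already attached `collect_as_cf` to the Connect `DataFrame` (as the `self` parameters above suggest); connection details and table names are placeholders:

```python
# Sketch only: assumes convert_delta_to_json has monkey-patched `collect_as_cf`
# onto the Spark Connect DataFrame, as the docstrings above describe, and that a
# Databricks Spark Connect endpoint is reachable. All identifiers are placeholders.
from pyspark.sql import SparkSession

spark = SparkSession.builder.remote(
    'sc://my-workspace.cloud.databricks.com:443/;token=<personal-access-token>',
).getOrCreate()

df = spark.sql('SELECT * FROM my_catalog.my_schema.my_table')

# Returns presigned-URL result parts, the total row count, and a truncation flag.
results, total_rows, truncated = df.collect_as_cf('json')
print(f'{total_rows} rows across {len(results)} parts; truncated={truncated}')
```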
86 changes: 74 additions & 12 deletions llmfoundry/data/finetuning/dataloader.py
@@ -64,9 +64,12 @@ def build_finetuning_dataloader(
on which you intend to use, as explained below.
Args:
name (str): The type of dataloader to build. Must = "finetuning".
---
*** HuggingFace dataset config fields ***
tokenizer (transformers.PreTrainedTokenizer): The tokenizer used to
prepare the data from raw text. Any missing sentinel tokens will
be added by the collator.
device_batch_size (int, float): The size of the batches (number of examples)
that the dataloader will produce.
dataset (Dict[str, Any]): A HuggingFace dataset config which contains the following fields:
dataset.hf_name (str, optional): The name of the HuggingFace dataset
to use. Can also be a remote http(s) directory or object store bucket
containing the file {split}.jsonl in the format (prompt, response),
@@ -130,16 +133,32 @@ def build_finetuning_dataloader(
The script `scripts/misc/profile_packing.py` can help
you choose the best packing_ratio.
dataset.shuffle (bool): Whether to shuffle the dataset.
___
See :class:`StreamingFinetuningDataset` for info on other standard config
options within `dataset` that will be passed as kwargs if
using the streaming codepath.
---
tokenizer (transformers.PreTrainedTokenizer): The tokenizer used to
prepare the data from raw text. Any missing sentinel tokens will
be added by the collator.
device_batch_size (int, float): The size of the batches (number of examples)
that the dataloader will produce.
num_workers (int, optional): How many subprocesses to use for data loading.
0 means that the data will be loaded in the main process. The default is 0.
This argument is passed directly to the pytorch :class:`DataLoader`.
drop_last (bool, optional): If true, drop the last incomplete batch, if the dataset
size is not divisible by the batch size. If False and the size of dataset is
not divisible by the batch size, then the last batch will be smaller. The
default is False. This argument is passed directly to the pytorch :class:`DataLoader`.
pin_memory (bool, optional): If True, the data loader will copy Tensors into device/CUDA
pinned memory before returning them. If your data elements are a custom type, or your
`collate_fn` returns a batch that is a custom type. This argument is passed directly to
the pytorch :class:`DataLoader`.
prefetch_factor (int, optional): Number of batches loaded in advance by each worker.
2 means there will be a total of 2 * num_workers batches prefetched across all workers.
(default value depends on the set value for num_workers. If value of num_workers=0 default
is None. Otherwise, if value of num_workers > 0 default is 2). This argument is passed
directly to the pytorch :class:`DataLoader`.
persistent_workers (bool, optional): If True, the data loader will not shut down the worker
processes after a dataset has been consumed once. This allows to maintain the workers
Dataset instances alive. The default is False. This argument is passed directly to the
pytorch :class:`DataLoader`.
timeout (int, optional): If positive, the timeout value for collecting a batch from workers.
Should always be non-negative. The default is 0. This argument is passed directly to the
pytorch :class:`DataLoader`.
See :class:`DataLoader` for standard argument options to the pytorch
dataloader, such as `drop_last`, `num_workers`, etc.
@@ -357,7 +376,50 @@ def _validate_config(
the other.
Args:
dataset_cfg (DictConfig): The dataset configuration to be validated.
max_seq_len (int): The maximum length of sequences
in the batch. See :class:`Seq2SeqFinetuningCollator` docstring
for details.
decoder_only_format (bool): Whether to format the
examples for a decoder-only model. See :class:`Seq2SeqFinetuningCollator`
docstring for details.
hf_name (str, optional): The name of the HuggingFace dataset
to use. Can also be a remote http(s) directory or object store bucket
containing the file {split}.jsonl in the format (prompt, response),
in which case the builder will create a HuggingFace dataset.
local (str, optional): Local path where remote data
will be streamed to. Only valid if `cfg.dataset.remote` has
also been set.
remote (str, optional): Location of a MDS-formatted
streaming dataset to use. Setting this will tell the builder
to create a streaming dataset rather than a HuggingFace dataset.
hf_kwargs (DictConfig, optional): Additional kwargs to
pass to `datasets.load_dataset`, which can be used to load
a dataset from local files.
preprocessing_fn (str, optional): The name/import path of
the preprocessing function to use for formatting the data examples.
If ``None`` (default), the builder will use the preprocessing function
registered under `hf_name` (see `tasks.py`), if one exists,
otherwise it will skip preprocessing.
If `preprocessing_fn` corresponds to a registered preprocessing
function in `tasks.py`, the builder will use that.
Otherwise, it will interpret `preprocessing_fn` as a
"import.path:function_name" import path; e.g., it will call
`from import.path import function_name` and use the imported
function as the preprocessing function.
safe_load (bool, optional): Whether to enforce safe loading of the dataset.
If `None`, will default to not applying any safe loading.
streams (Dict[str, Any], optional): A dictionary with multiple data streams.
If `None`, will assume no streams.
target_prompts (str): Which prompts are used as training targets.
Defaults to "none", meaning prompts are never used as training targets.
See :class:`Seq2SeqFinetuningCollator` docstring for details.
target_responses (str): Which responses are used as training targets.
Defaults to "last", meaning only the final response in multi-turn examples
will serve as training targets. See :class:`Seq2SeqFinetuningCollator` docstring for
details.
kwargs (DictConfig, optional): Additional kwargs to
pass to `datasets.load_dataset`, which can be used to load
a dataset from local files.
Raises:
ValueError: If the dataset configuration does not meet the requirements.
@@ -504,7 +566,7 @@ def _download_remote_hf_dataset(remote_path: str, split: str) -> str:
completed, the function removes the signal file.
Args:
hf_name (str): The path of the HuggingFace dataset to download.
remote_path (str): The path of the HuggingFace dataset to download.
split (str): The dataset split to download (e.g., 'train', 'validation', 'test').
Returns:
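Putting the documented arguments together, a minimal sketch of calling the dataloader builder; the dataset keys shown are a subset of those listed in the docstring above, and all values are illustrative:

```python
# Minimal sketch of the documented interface; dataset keys and values are
# illustrative, not the full set supported by the builder.
from transformers import AutoTokenizer

from llmfoundry.data.finetuning.dataloader import build_finetuning_dataloader

tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neox-20b')

dataspec = build_finetuning_dataloader(
    tokenizer=tokenizer,
    device_batch_size=8,
    dataset={
        'hf_name': 'tatsu-lab/alpaca',  # HF dataset, or a remote path with {split}.jsonl
        'split': 'train',
        'max_seq_len': 2048,
        'decoder_only_format': True,
        'shuffle': True,
    },
    num_workers=8,
    drop_last=False,
)
train_dataloader = dataspec.dataloader  # composer DataSpec wrapping the PyTorch DataLoader
```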
32 changes: 28 additions & 4 deletions llmfoundry/data/finetuning/tasks.py
@@ -162,7 +162,7 @@ def _is_empty_or_nonexistent(dirpath: str) -> bool:
Args:
dirpath (str): Directory path to check.
Returns
Returns:
True if directory is empty or non-existent. False otherwise.
"""
return not os.path.isdir(dirpath) or len(os.listdir(dirpath)) == 0
@@ -820,9 +820,33 @@ def build_from_hf(
Note: This function will drop examples where the prompt is longer than the max_seq_len
Args:
cfg (DictConfig): The dataset configuration.
max_seq_len (int): The maximum sequence length. Examples with prompts longer than this will be dropped.
tokenizer (Tokenizer): The tokenizer to be used for tokenizing the dataset.
dataset_name (str): The name of the HuggingFace dataset
to use. Can also be a remote http(s) directory or object store bucket
containing the file {split}.jsonl in the format (prompt, response),
in which case the builder will create a HuggingFace dataset.
split (str): The split of the HuggingFace dataset.
safe_load (bool, optional): Whether to enforce safe loading of the dataset.
If `None`, will default to not applying any safe loading.
max_seq_len (int): The maximum length of sequences
in the batch. See :class:`Seq2SeqFinetuningCollator` docstring
for details.
preprocessing_fn (Callable, optional): The preprocessing function to use for
formatting the data examples.
tokenizer (PreTrainedTokenizerBase): The tokenizer to be used for tokenizing
the HuggingFace dataset.
target_prompts (str): Which prompts are used as training targets.
Defaults to "none", meaning prompts are never used as training targets.
See :class:`Seq2SeqFinetuningCollator` docstring for details.
target_responses (str): Which responses are used as training targets.
Defaults to "last", meaning only the final response in multi-turn examples
will serve as training targets. See :class:`Seq2SeqFinetuningCollator` docstring for
details.
decoder_only_format (bool): Whether to format the
examples for a decoder-only model. See :class:`Seq2SeqFinetuningCollator`
docstring for details.
hf_kwargs (DictConfig, optional): Additional kwargs to
pass to `datasets.load_dataset`, which can be used to load
a dataset from local files.
Returns:
Dataset: The tokenized dataset.
2 changes: 1 addition & 1 deletion llmfoundry/data/packing.py
@@ -337,7 +337,7 @@ def auto_packing_ratio(
dataloader_cfg (DictConfig): The dataloader configuration for profiling.
tokenizer (PreTrainedTokenizerBase): The tokenizer for profiling.
device_batch_size (int): The size of the batches (number of examples) per device.
num_packing_ratio (int): The number of packing ratios to try.
num_packing_ratios (int): The number of packing ratios to try.
Returns:
A packing ratio that minimizes padding while maintaining zero waste.
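For reference, a hedged sketch of profiling a packing ratio directly with the documented signature; the dataloader config below is illustrative, and `scripts/misc/profile_packing.py` (mentioned in the dataloader docstring above) wraps the same profiling:

```python
# Sketch only: config keys are illustrative, and the config type (DictConfig vs.
# plain dict) follows the docstring above.
from omegaconf import OmegaConf
from transformers import AutoTokenizer

from llmfoundry.data.packing import auto_packing_ratio

tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neox-20b')

dataloader_cfg = OmegaConf.create({
    'name': 'finetuning',
    'dataset': {
        'hf_name': 'tatsu-lab/alpaca',
        'split': 'train',
        'max_seq_len': 2048,
        'decoder_only_format': True,
        'shuffle': True,
    },
})

ratio = auto_packing_ratio(
    dataloader_cfg=dataloader_cfg,
    tokenizer=tokenizer,
    device_batch_size=8,
    num_packing_ratios=10,  # number of candidate ratios to profile
)
print(f'Chosen packing ratio: {ratio}')
```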
27 changes: 18 additions & 9 deletions llmfoundry/eval/datasets/in_context_learning_evaluation.py
@@ -251,8 +251,9 @@ def read_dataset(
"""
from datasets import \
Dataset as HFDataset # pyright: ignore[reportGeneralTypeIssues]
from datasets import \
load_dataset # pyright: ignore[reportGeneralTypeIssues]
from datasets import ( # pyright: ignore[reportGeneralTypeIssues]
load_dataset,
)
if 'hf://' in dataset_uri:
dataset_uri = dataset_uri.replace('hf://', '')
if hf_loading_vars is None:
@@ -363,6 +364,7 @@ def get_answer_from_example(
Args:
example (Dict): The example from which to retrieve the answer
in_context (bool): Whether this is an in-context example. Default to False.
Returns:
str: The answer in the example
@@ -712,6 +714,7 @@ def get_answer_from_example(
Args:
example (Dict): The example from which to retrieve the answer
in_context (bool): Whether this is an in-context example. Default to False.
Returns:
str: The answer in from the example with chain of thought and delimiter if needed
@@ -731,7 +734,7 @@ def tokenize_example(
Args:
prompt_and_fewshot (str): The collection of the prompt and fewshot examples that belongs before the example's context
ctx (str): The specific example's derived context
ctxt (str): The specific example's derived context
example (Dict): The example as a dictionary.
Returns:
@@ -1035,6 +1038,7 @@ def get_answer_from_example(
Args:
example (Dict): The example from which to retrieve the answer
in_context (bool): Whether this is an in-context example. Default to False.
Returns:
str: The full string of the correct answer based on the 'gold' key
@@ -1053,7 +1057,7 @@ def tokenize_example(
Args:
prompt_and_fewshot (str): The collection of the prompt and fewshot examples that belongs before the example's context
ctx (str): The specific example's derived context
ctxt (str): The specific example's derived context
example (Dict): The example as a dictionary.
Returns:
@@ -1129,6 +1133,7 @@ def collate_fn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
since the batch may consist of multiple questions, the choice_groupings indicates
which contiguous sequences of elements in the batch correspond to which question
gold_indices indicates which of the [0, N-1] choices is the correct one for each question.
Args:
data (List): List of tokenized datapoints (dicts returned by self._tokenize_example)
@@ -1168,6 +1173,7 @@ def split_batch(self, batch: Any,
and real example, which refers to one possible continuation. As example count and
microbatch_size are tracked in logical example, we split logical attributes by
microbatch_size and real attributes by microbatch_size * num_choices.
Args:
batch (Dict): Batch of data
microbatch_size (int | float): Size of microbatches
@@ -1419,7 +1425,7 @@ def tokenize_example(
Args:
prompt_and_fewshot (str): The collection of the prompt and fewshot examples that belongs before the example's context
ctx (str): The specific example's derived context
context_options (str): A list of contexts for this specific example.
example (Dict): The example as a dictionary.
Returns:
@@ -1548,6 +1554,10 @@ def partition_dataset_by_category(
Args:
dataset_uri (str): Location of dataset.
destination_path (str): Base destination path, we will write a separate partition off this URI for each category.
hf_loading_vars (Dict): A dictionary containing keyword arguments to be passed into `load_dataset` if dataset is being pulled from HF.
hf_parsing_map (Dict): A dictionary containing a mapping from HF columns to ICL dataset keys. The dictionary should be formatted {icl_key:[hf_key1, hf_key1]}.
Column contents will be concatenated with ' ' separating them. If not included, will load the columns already present in the HF dataset.
Raises:
MissingConditionalImportError: If datasets not installed raise exception.
@@ -1643,8 +1653,7 @@ def get_icl_task_dataloader(
# At this point, hf_model is randomly initialized
composer_model = HuggingFaceModel(hf_model, hf_tokenizer)
Example:
Example:
.. testcode::
@@ -1685,8 +1694,8 @@ def get_icl_task_dataloader(
hf_loading_vars (Dict, default = None): A dictionary containing keyword arguments to be passed into `load_dataset` if dataset is being pulled from HF.
hf_parsing_map (Dict, default = None): A dictionary containing a mapping from HF columns to ICL dataset keys. The dictionary should be formatted {icl_key:[hf_key1, hf_key1]}.
Column contents will be concatenated with ' ' separating them. If not included, will load the columns already present in the HF dataset.
kwargs (Dict[str, Any], default=None): Dictionary containing a mapping
from ICL dataset constructor's parameter names and their desired values.
destination_path: Where the dataloader will be saved.
kwargs (Dict[str, Any], default=None): Dictionary containing a mapping from ICL dataset constructor's parameter names and their desired values.
Returns:
DataLoader: A dataloader used for performing in-context learning evaluation on the dataset provided.
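A hedged sketch of the call documented above; argument names follow the docstring, while the dataset URI, tokenizer, and delimiters are placeholders (the `.. testcode::` block in the docstring itself is the authoritative example):

```python
# Sketch only: argument names follow the docstring above; the dataset URI and
# delimiter choices are placeholders, not values from this commit.
from transformers import AutoTokenizer

from llmfoundry.eval.datasets.in_context_learning_evaluation import (
    get_icl_task_dataloader,
)

tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neox-20b')

dl = get_icl_task_dataloader(
    icl_task_type='language_modeling',
    dataset_uri='hf://some_org/some_icl_dataset',  # hypothetical dataset location
    tokenizer=tokenizer,
    batch_size=8,
    max_seq_len=2048,
    pad_tok_id=tokenizer.eos_token_id,  # used to pad examples to max_seq_len
    num_fewshot=5,
    prompt_string='',
    example_delimiter='\n',
    continuation_delimiter=' ',
    destination_path='/tmp/icl_task.jsonl',
)
```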