From 425e1a0426c8da23a00c456fd69745a1b3e6db5f Mon Sep 17 00:00:00 2001 From: Mayank Mishra <32954280+mayank31398@users.noreply.github.com> Date: Fri, 3 May 2024 05:02:17 -0400 Subject: [PATCH 01/49] add mlp bias for llama models (#30031) * add bias * fix quality --- src/transformers/models/cohere/modeling_cohere.py | 1 - src/transformers/models/llama/configuration_llama.py | 6 +++++- src/transformers/models/llama/modeling_llama.py | 6 +++--- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/cohere/modeling_cohere.py b/src/transformers/models/cohere/modeling_cohere.py index a184fe4450f3ef..ce6fe29b18859c 100644 --- a/src/transformers/models/cohere/modeling_cohere.py +++ b/src/transformers/models/cohere/modeling_cohere.py @@ -161,7 +161,6 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): return q_embed.to(dtype=dtype), k_embed.to(dtype=dtype) -# Copied from transformers.models.llama.modeling_llama.LlamaMLP Llama->Cohere class CohereMLP(nn.Module): def __init__(self, config): super().__init__() diff --git a/src/transformers/models/llama/configuration_llama.py b/src/transformers/models/llama/configuration_llama.py index 6d0f68162cce43..66d668be882658 100644 --- a/src/transformers/models/llama/configuration_llama.py +++ b/src/transformers/models/llama/configuration_llama.py @@ -94,10 +94,12 @@ class LlamaConfig(PretrainedConfig): these scaling strategies behave: https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an experimental feature, subject to breaking API changes in future versions. - attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): + attention_bias (`bool`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. attention_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio for the attention probabilities. + mlp_bias (`bool`, *optional*, defaults to `False`): + Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers. 
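Editor's note (not part of the patch): the sketch below shows how the new `mlp_bias` flag introduced by this change could be exercised once it is applied. It is a minimal, illustrative example; the tiny model dimensions are arbitrary placeholders chosen only to keep the config small.

```python
# Illustrative sketch, not part of the patch: build a small Llama model with
# MLP biases enabled via the new `mlp_bias` flag added in this change.
from transformers import LlamaConfig, LlamaForCausalLM

config = LlamaConfig(
    hidden_size=64,
    intermediate_size=128,
    num_hidden_layers=2,
    num_attention_heads=4,
    mlp_bias=True,  # new flag; defaults to False, matching the previous bias-free MLP
)
model = LlamaForCausalLM(config)

# gate_proj/up_proj/down_proj now carry bias terms when mlp_bias=True.
print(model.model.layers[0].mlp.gate_proj.bias is not None)  # True
```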
```python >>> from transformers import LlamaModel, LlamaConfig @@ -137,6 +139,7 @@ def __init__( rope_scaling=None, attention_bias=False, attention_dropout=0.0, + mlp_bias=False, **kwargs, ): self.vocab_size = vocab_size @@ -161,6 +164,7 @@ def __init__( self._rope_scaling_validation() self.attention_bias = attention_bias self.attention_dropout = attention_dropout + self.mlp_bias = mlp_bias super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index 898bf23aaea7f2..0e971478e26ef3 100644 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -214,9 +214,9 @@ def __init__(self, config): self.config = config self.hidden_size = config.hidden_size self.intermediate_size = config.intermediate_size - self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias) self.act_fn = ACT2FN[config.hidden_act] def forward(self, x): From 66f675eb656078a00469b223a65f88fd11790857 Mon Sep 17 00:00:00 2001 From: Pavel Iakubovskii Date: Fri, 3 May 2024 12:04:15 +0100 Subject: [PATCH 02/49] Fix W&B run name (#30462) * Remove comparison to output_dir * Update docs for `run_name` * Add warning --- src/transformers/integrations/integration_utils.py | 11 ++++++++--- src/transformers/training_args.py | 4 ++-- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/src/transformers/integrations/integration_utils.py b/src/transformers/integrations/integration_utils.py index 2839ee876ed456..4543cf9f98b5f3 100755 --- a/src/transformers/integrations/integration_utils.py +++ b/src/transformers/integrations/integration_utils.py @@ -762,9 +762,14 @@ def setup(self, args, state, model, **kwargs): if trial_name is not None: init_args["name"] = trial_name init_args["group"] = args.run_name - else: - if not (args.run_name is None or args.run_name == args.output_dir): - init_args["name"] = args.run_name + elif args.run_name is not None: + init_args["name"] = args.run_name + if args.run_name == args.output_dir: + self._wandb.termwarn( + "The `run_name` is currently set to the same value as `TrainingArguments.output_dir`. If this was " + "not intended, please specify a different run name by setting the `TrainingArguments.run_name` parameter.", + repeat=False, + ) if self._wandb.run is None: self._wandb.init( diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index d59b9ab37c7e6e..b92f9e18c6726c 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -420,9 +420,9 @@ class TrainingArguments: the past hidden states for their predictions. If this argument is set to a positive int, the `Trainer` will use the corresponding output (usually index 2) as the past state and feed it to the model at the next training step under the keyword argument `mems`. - run_name (`str`, *optional*): + run_name (`str`, *optional*, defaults to `output_dir`): A descriptor for the run. 
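Editor's note (not part of the patch): as a usage-level illustration of the W&B run-name fix in this commit, after the change `TrainingArguments.run_name` is forwarded to `wandb.init(name=...)` even when it happens to equal `output_dir`, with a warning emitted instead of the name being silently dropped. The directory and run name below are placeholders.

```python
# Hedged sketch, not part of the patch: the explicit run_name below becomes the
# W&B run name; before this fix it was only used when it differed from output_dir.
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="out",             # placeholder directory
    run_name="my-experiment-01",  # forwarded to wandb.init(name=...)
    report_to=["wandb"],
)
```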
Typically used for [wandb](https://www.wandb.com/) and - [mlflow](https://www.mlflow.org/) logging. + [mlflow](https://www.mlflow.org/) logging. If not specified, will be the same as `output_dir`. disable_tqdm (`bool`, *optional*): Whether or not to disable the tqdm progress bars and table of metrics produced by [`~notebook.NotebookTrainingTracker`] in Jupyter Notebooks. Will default to `True` if the logging level is From d0c72c15c25e100860b6af692d084f06546a0b7a Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Fri, 3 May 2024 16:01:15 +0200 Subject: [PATCH 03/49] HQQ: PEFT support for HQQ (#30632) Update quantizer_hqq.py --- src/transformers/quantizers/quantizer_hqq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/quantizers/quantizer_hqq.py b/src/transformers/quantizers/quantizer_hqq.py index dd58c2c1bc5a27..06949d059a5de3 100755 --- a/src/transformers/quantizers/quantizer_hqq.py +++ b/src/transformers/quantizers/quantizer_hqq.py @@ -197,4 +197,4 @@ def is_serializable(self): @property def is_trainable(self) -> bool: - return False + return True From deb7605a2a805eb0ef11f2706471c3d7cd68bfc0 Mon Sep 17 00:00:00 2001 From: Yen Ting Date: Fri, 3 May 2024 08:49:28 -0700 Subject: [PATCH 04/49] Prevent `TextGenerationPipeline._sanitize_parameters` from overriding previously provided parameters (#30362) * Fixed TextGenerationPipeline._sanitize_parameters default params * removed empty spaces --------- Co-authored-by: Ng, Yen Ting --- src/transformers/pipelines/text_generation.py | 22 ++++++++++++------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/src/transformers/pipelines/text_generation.py b/src/transformers/pipelines/text_generation.py index ff5af53ff74ef9..0c9e2f3e7a8229 100644 --- a/src/transformers/pipelines/text_generation.py +++ b/src/transformers/pipelines/text_generation.py @@ -129,19 +129,25 @@ def _sanitize_parameters( prefix=None, handle_long_generation=None, stop_sequence=None, - add_special_tokens=False, truncation=None, - padding=False, max_length=None, **generate_kwargs, ): - preprocess_params = { - "add_special_tokens": add_special_tokens, - "truncation": truncation, - "padding": padding, - "max_length": max_length, - } + preprocess_params = {} + + add_special_tokens = False + if "add_special_tokens" in generate_kwargs: + preprocess_params["add_special_tokens"] = generate_kwargs["add_special_tokens"] + add_special_tokens = generate_kwargs["add_special_tokens"] + + if "padding" in generate_kwargs: + preprocess_params["padding"] = generate_kwargs["padding"] + + if truncation is not None: + preprocess_params["truncation"] = truncation + if max_length is not None: + preprocess_params["max_length"] = max_length generate_kwargs["max_length"] = max_length if prefix is not None: From 91d155ea92da372b319a79dd4eef69533ee15170 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Fri, 3 May 2024 18:19:30 +0200 Subject: [PATCH 05/49] Avoid duplication in PR slow CI model list (#30634) update Co-authored-by: ydshieh --- utils/pr_slow_ci_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/pr_slow_ci_models.py b/utils/pr_slow_ci_models.py index f7cd960ec91cdf..391e99fc2276f8 100644 --- a/utils/pr_slow_ci_models.py +++ b/utils/pr_slow_ci_models.py @@ -142,4 +142,4 @@ def get_models(commit_message: str): new_model = get_new_model() specified_models = get_models(args.commit_message) models = ([] if new_model == "" else 
[new_model]) + specified_models - print(models) + print(sorted(set(models))) From 307f632bb2fa39f6f8dcd54644d55edd461d73d0 Mon Sep 17 00:00:00 2001 From: Arthur <48595927+ArthurZucker@users.noreply.github.com> Date: Mon, 6 May 2024 10:10:32 +0200 Subject: [PATCH 06/49] [`CI update`] Try to use dockers and no cache (#29202) * change cis * nits * update * minor updates * [push-ci-image] * nit [push-ci-image] * nitsssss * [build-ci-image] * [push-ci-image] * [push-ci-image] * both * [push-ci-image] * this? * [push-ci-image] * pypi-kenlm needs g++ * [push-ci-image] * nit * more nits [push-ci-image] * nits [push-ci-image] * [push-ci-image] * [push-ci-image] * [push-ci-image] * add vision * [push-ci-image] * [push-ci-image] * add new dummy file but will need to update them [push-ci-image] * [push-ci-image] * show package size as well * [push-ci-image] * potentially ignore failures * workflow updates * nits [push-ci-image] * [push-ci-image] * fix consistency * clean nciida triton * also show big packages [push-ci-image] * nit * update * another one * line escape? * add accelerate [push-ci-image] * updates [push-ci-image] * nits to run tests, no push-ci * try to parse skip reason to make sure nothing is skipped that should no be skippped * nit? * always show skipped reasons * nits * better parsing of the test outputs * action="store_true", * failure on failed * show matched * debug * update short summary with skipped, failed and errors * nits * nits * coolu pdates * remove docbuilder * fix * always run checks * oups * nits * don't error out on library printing * non zero exi codes * no warning * nit * WAT? * format nit * [push-ci-image] * fail if fail is needed * [push-ci-image] * sound file for torch light? * [push-ci-image] * order is important [push-ci-image] * [push-ci-image] reduce even further * [push-ci-image] * use pytest rich ! * yes [push-ci-image] * oupsy * bring back the full traceback, but pytest rich should help * nit * [push-ci-image] * re run * nit * [push-ci-image] * [push-ci-image] * [push-ci-image] * empty push to trigger * [push-ci-image] * nit? [push-ci-image] * empty * try to install timm with no deps * [push-ci-image] * oups [push-ci-image] * [push-ci-image] * [push-ci-image] ? * [push-ci-image] open ssh client for git checkout fast * empty for torch light * updates [push-ci-image] * nit * @v4 for checkout * [push-ci-image] * [push-ci-image] * fix fetch tests with parallelism * [push-ci-image] * more parallelism * nit * more nits * empty to re-trigger * empty to re-trigger * split by timing * did not work with previous commit * junit.xml * no path? * mmm this? * junitxml format * split by timing * nit * fix junit family * now we can test if the xunit1 is compatible! * this? * fully list tests * update * update * oups * finally * use classname * remove working directory to make sure the path does not interfere * okay no juni should have the correct path * name split? * sort by classname is what make most sense * some testing * naem * oups * test something fun * autodetect * 18? * nit * file size? 
* uip * 4 is best * update to see versions * better print * [push-ci-image] * [push-ci-image] * please install the correct keras version * [push-ci-image] * [push-ci-image] * [push-ci-image] * [push-ci-image] * [push-ci-image] * uv is fucking me up * [push-ci-image] * [push-ci-image] * [push-ci-image] * nits * [push-ci-image] * [push-ci-image] * install issues an pins * tapas as well * nits * more paralellism * short tb * soundfile * soundfile * [push-ci-image] * [push-ci-image] * [push-ci-image] * oups * [push-ci-image] * fix some things * [push-ci-image] * [push-ci-image] * [push-ci-image] * [push-ci-image] * use torch-light for hub * small git lfs for hub job * [push-ci-image] * [push-ci-image] * [push-ci-image] * [push-ci-image] * fix tf tapas * [push-ci-image] * nits * [push-ci-image] * don't update the test * [push-ci-image] * [push-ci-image] * [push-ci-image] * no use them * [push-ci-image] * [push-ci-image] * [push-ci-image] * [push-ci-image] * update tf proba * [push-ci-image] * [push-ci-image] * woops * [push-ci-image] * [push-ci-image] * [push-ci-image] * [push-ci-image] * [push-ci-image] * [push-ci-image] * test with built dockers * [push-ci-image] * skip annoying tests * revert fix copy * update test values * update * last skip and fixup * nit * ALL GOOOD * quality * Update tests/models/layoutlmv2/test_image_processing_layoutlmv2.py * Update docker/quality.dockerfile Co-authored-by: Lysandre Debut * Update src/transformers/models/tapas/modeling_tf_tapas.py Co-authored-by: Lysandre Debut * Apply suggestions from code review Co-authored-by: Lysandre Debut * use torch-speed * updates * [push-ci-image] * [push-ci-image] * [push-ci-image] * [push-ci-image] * fuck ken-lm [push-ci-image] * [push-ci-image] * [push-ci-image] --------- Co-authored-by: Lysandre Debut --- .circleci/config.yml | 63 +--- .circleci/create_circleci_config.py | 275 ++++-------------- .circleci/parse_test_outputs.py | 70 +++++ .github/workflows/build-ci-docker-images.yml | 54 ++++ docker/consistency.dockerfile | 14 + docker/custom-tokenizers.dockerfile | 26 ++ docker/examples-tf.dockerfile | 12 + docker/examples-torch.dockerfile | 11 + docker/exotic-models.dockerfile | 17 ++ docker/jax-light.dockerfile | 9 + docker/pipeline-tf.dockerfile | 9 + docker/pipeline-torch.dockerfile | 10 + docker/quality.dockerfile | 8 + docker/tf-light.dockerfile | 11 + docker/torch-jax-light.dockerfile | 15 + docker/torch-light.dockerfile | 10 + docker/torch-tf-light.dockerfile | 19 ++ setup.py | 39 ++- src/transformers/dependency_versions_table.py | 9 +- .../models/groupvit/modeling_tf_groupvit.py | 9 + .../models/tapas/modeling_tf_tapas.py | 11 +- .../test_image_processing_layoutlmv2.py | 5 +- .../test_image_processing_layoutlmv3.py | 4 +- tests/models/tapas/test_modeling_tapas.py | 7 +- tests/models/tapas/test_modeling_tf_tapas.py | 4 + 25 files changed, 430 insertions(+), 291 deletions(-) create mode 100644 .circleci/parse_test_outputs.py create mode 100644 .github/workflows/build-ci-docker-images.yml create mode 100644 docker/consistency.dockerfile create mode 100644 docker/custom-tokenizers.dockerfile create mode 100644 docker/examples-tf.dockerfile create mode 100644 docker/examples-torch.dockerfile create mode 100644 docker/exotic-models.dockerfile create mode 100644 docker/jax-light.dockerfile create mode 100644 docker/pipeline-tf.dockerfile create mode 100644 docker/pipeline-torch.dockerfile create mode 100644 docker/quality.dockerfile create mode 100644 docker/tf-light.dockerfile create mode 100644 
docker/torch-jax-light.dockerfile create mode 100644 docker/torch-light.dockerfile create mode 100644 docker/torch-tf-light.dockerfile diff --git a/.circleci/config.yml b/.circleci/config.yml index 044493315d2003..ddb8869a6ae5aa 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -12,7 +12,7 @@ jobs: # Ensure running with CircleCI/huggingface check_circleci_user: docker: - - image: cimg/python:3.8.12 + - image: python:3.10-slim parallelism: 1 steps: - run: echo $CIRCLE_PROJECT_USERNAME @@ -26,13 +26,11 @@ jobs: fetch_tests: working_directory: ~/transformers docker: - - image: cimg/python:3.8.12 + - image: huggingface/transformers-quality parallelism: 1 steps: - checkout - - run: pip install --upgrade --upgrade-strategy eager pip - - run: pip install -U --upgrade-strategy eager GitPython - - run: pip install -U --upgrade-strategy eager . + - run: uv pip install -U -e . - run: mkdir -p test_preparation - run: python utils/tests_fetcher.py | tee tests_fetched_summary.txt - store_artifacts: @@ -88,25 +86,22 @@ jobs: echo "No tests to run, exiting early!" circleci-agent step halt fi - - run: cp test_preparation/generated_config.yml test_preparation/generated_config.txt - store_artifacts: - path: test_preparation/generated_config.txt + path: test_preparation/generated_config.yml - store_artifacts: - path: test_preparation/filtered_test_list_cross_tests.txt + path: test_preparation/filtered_test_list_cross_tests.txt - continuation/continue: - configuration_path: test_preparation/generated_config.yml + configuration_path: test_preparation/generated_config.yml # To run all tests for the nightly build fetch_all_tests: working_directory: ~/transformers docker: - - image: cimg/python:3.8.12 + - image: huggingface/transformers-consistency parallelism: 1 steps: - checkout - - run: pip install --upgrade --upgrade-strategy eager pip - - run: pip install -U --upgrade-strategy eager GitPython - - run: pip install -U --upgrade-strategy eager . + - run: uv pip install -e . - run: | mkdir test_preparation echo -n "tests" > test_preparation/test_list.txt @@ -126,7 +121,7 @@ jobs: check_code_quality: working_directory: ~/transformers docker: - - image: cimg/python:3.8.12 + - image: huggingface/transformers-quality resource_class: large environment: TRANSFORMERS_IS_CI: yes @@ -134,24 +129,7 @@ jobs: parallelism: 1 steps: - checkout - - restore_cache: - keys: - - v0.7-code_quality-pip-{{ checksum "setup.py" }} - - v0.7-code-quality-pip - - restore_cache: - keys: - - v0.7-code_quality-site-packages-{{ checksum "setup.py" }} - - v0.7-code-quality-site-packages - - run: pip install --upgrade --upgrade-strategy eager pip - - run: pip install -U --upgrade-strategy eager .[all,quality] - - save_cache: - key: v0.7-code_quality-pip-{{ checksum "setup.py" }} - paths: - - '~/.cache/pip' - - save_cache: - key: v0.7-code_quality-site-packages-{{ checksum "setup.py" }} - paths: - - '~/.pyenv/versions/' + - run: uv pip install -e . 
- run: name: Show installed libraries and their versions command: pip freeze | tee installed.txt @@ -167,7 +145,7 @@ jobs: check_repository_consistency: working_directory: ~/transformers docker: - - image: cimg/python:3.8.12 + - image: huggingface/transformers-consistency resource_class: large environment: TRANSFORMERS_IS_CI: yes @@ -175,24 +153,7 @@ jobs: parallelism: 1 steps: - checkout - - restore_cache: - keys: - - v0.7-repository_consistency-pip-{{ checksum "setup.py" }} - - v0.7-repository_consistency-pip - - restore_cache: - keys: - - v0.7-repository_consistency-site-packages-{{ checksum "setup.py" }} - - v0.7-repository_consistency-site-packages - - run: pip install --upgrade --upgrade-strategy eager pip - - run: pip install -U --upgrade-strategy eager .[all,quality] - - save_cache: - key: v0.7-repository_consistency-pip-{{ checksum "setup.py" }} - paths: - - '~/.cache/pip' - - save_cache: - key: v0.7-repository_consistency-site-packages-{{ checksum "setup.py" }} - paths: - - '~/.pyenv/versions/' + - run: uv pip install -e . - run: name: Show installed libraries and their versions command: pip freeze | tee installed.txt diff --git a/.circleci/create_circleci_config.py b/.circleci/create_circleci_config.py index 349e844720dc19..49c46c2b83ddb3 100644 --- a/.circleci/create_circleci_config.py +++ b/.circleci/create_circleci_config.py @@ -19,7 +19,7 @@ import random from dataclasses import dataclass from typing import Any, Dict, List, Optional - +import glob import yaml @@ -41,7 +41,6 @@ class EmptyJob: def to_dict(self): return { - "working_directory": "~/transformers", "docker": copy.deepcopy(DEFAULT_DOCKER_IMAGE), "steps":["checkout"], } @@ -61,7 +60,6 @@ class CircleCIJob: pytest_options: Dict[str, Any] = None resource_class: Optional[str] = "2xlarge" tests_to_run: Optional[List[str]] = None - working_directory: str = "~/transformers" # This should be only used for doctest job! 
command_timeout: Optional[int] = None @@ -92,7 +90,6 @@ def to_dict(self): cache_branch_prefix = "pull" job = { - "working_directory": self.working_directory, "docker": self.docker_image, "environment": env, } @@ -102,34 +99,14 @@ def to_dict(self): job["parallelism"] = self.parallelism steps = [ "checkout", - {"attach_workspace": {"at": "~/transformers/test_preparation"}}, - { - "restore_cache": { - "keys": [ - # check the fully-matched cache first - f"v{self.cache_version}-{self.cache_name}-{cache_branch_prefix}-pip-" + '{{ checksum "setup.py" }}', - # try the partially-matched cache from `main` - f"v{self.cache_version}-{self.cache_name}-main-pip-", - # try the general partially-matched cache - f"v{self.cache_version}-{self.cache_name}-{cache_branch_prefix}-pip-", - ] - } - }, - { - "restore_cache": { - "keys": [ - f"v{self.cache_version}-{self.cache_name}-{cache_branch_prefix}-site-packages-" + '{{ checksum "setup.py" }}', - f"v{self.cache_version}-{self.cache_name}-main-site-packages-", - f"v{self.cache_version}-{self.cache_name}-{cache_branch_prefix}-site-packages-", - ] - } - }, + {"attach_workspace": {"at": "test_preparation"}}, ] steps.extend([{"run": l} for l in self.install_steps]) - steps.extend([{"run": 'pip install "fsspec>=2023.5.0,<2023.10.0"'}]) - steps.extend([{"run": "pip install pytest-subtests"}]) - steps.append({"run": {"name": "Show installed libraries and their versions", "command": "pip freeze | tee installed.txt"}}) - steps.append({"store_artifacts": {"path": "~/transformers/installed.txt"}}) + steps.append({"run": {"name": "Show installed libraries and their size", "command": """du -h -d 1 "$(pip -V | cut -d ' ' -f 4 | sed 's/pip//g')" | grep -vE "dist-info|_distutils_hack|__pycache__" | sort -h | tee installed.txt || true"""}}) + steps.append({"run": {"name": "Show installed libraries and their versions", "command": """pip list --format=freeze | tee installed.txt || true"""}}) + + steps.append({"run":{"name":"Show biggest libraries","command":"""dpkg-query --show --showformat='${Installed-Size}\t${Package}\n' | sort -rh | head -25 | sort -h | awk '{ package=$2; sub(".*/", "", package); printf("%.5f GB %s\n", $1/1024/1024, package)}' || true"""}}) + steps.append({"store_artifacts": {"path": "installed.txt"}}) all_options = {**COMMON_PYTEST_OPTIONS, **self.pytest_options} pytest_flags = [f"--{key}={value}" if (value is not None or key in ["doctest-modules"]) else f"-{key}" for key, value in all_options.items()] @@ -138,11 +115,11 @@ def to_dict(self): ) steps.append({"run": {"name": "Create `test-results` directory", "command": "mkdir test-results"}}) - test_command = "" if self.command_timeout: test_command = f"timeout {self.command_timeout} " - test_command += f"python -m pytest -rs --junitxml=test-results/junit.xml -n {self.pytest_num_workers} " + " ".join(pytest_flags) + # junit familiy xunit1 is necessary to support splitting on test name or class name with circleci split + test_command += f"python3 -m pytest -rsfE -p no:warnings -o junit_family=xunit1 --tb=short --junitxml=test-results/junit.xml -n {self.pytest_num_workers} " + " ".join(pytest_flags) if self.parallelism == 1: if self.tests_to_run is None: @@ -167,13 +144,11 @@ def to_dict(self): if test.endswith(".py"): expanded_tests.append(test) elif test == "tests/models": - expanded_tests.extend([os.path.join(test, x) for x in os.listdir(test)]) + expanded_tests.extend(glob.glob("tests/models/**/test*.py", recursive=True)) elif test == "tests/pipelines": expanded_tests.extend([os.path.join(test, x) for x 
in os.listdir(test)]) else: expanded_tests.append(test) - # Avoid long tests always being collected together - random.shuffle(expanded_tests) tests = " ".join(expanded_tests) # Each executor to run ~10 tests @@ -190,13 +165,13 @@ def to_dict(self): command = 'TESTS=$(circleci tests split tests.txt) && echo $TESTS > splitted_tests.txt' steps.append({"run": {"name": "Split tests", "command": command}}) - steps.append({"store_artifacts": {"path": "~/transformers/tests.txt"}}) - steps.append({"store_artifacts": {"path": "~/transformers/splitted_tests.txt"}}) + steps.append({"store_artifacts": {"path": "tests.txt"}}) + steps.append({"store_artifacts": {"path": "splitted_tests.txt"}}) test_command = "" - if self.timeout: - test_command = f"timeout {self.timeout} " - test_command += f"python -m pytest -rs -n {self.pytest_num_workers} " + " ".join(pytest_flags) + if self.command_timeout: + test_command = f"timeout {self.command_timeout} " + test_command += f"python3 -m pytest -rsfE -p no:warnings --tb=short -o junit_family=xunit1 --junitxml=test-results/junit.xml -n {self.pytest_num_workers} " + " ".join(pytest_flags) test_command += " $(cat splitted_tests.txt)" if self.marker is not None: test_command += f" -m {self.marker}" @@ -211,61 +186,17 @@ def to_dict(self): # failure. test_command = f"({test_command}) || true" else: - test_command = f"({test_command} | tee tests_output.txt) || true" + test_command = f"({test_command} | tee tests_output.txt)" steps.append({"run": {"name": "Run tests", "command": test_command}}) - # Deal with errors - check_test_command = f'if [ -s reports/{self.job_name}/errors.txt ]; ' - check_test_command += 'then echo "Some tests errored out!"; echo ""; ' - check_test_command += f'cat reports/{self.job_name}/errors.txt; ' - check_test_command += 'echo ""; echo ""; ' - - py_command = f'import os; fp = open("reports/{self.job_name}/summary_short.txt"); failed = os.linesep.join([x for x in fp.read().split(os.linesep) if x.startswith("ERROR ")]); fp.close(); fp = open("summary_short.txt", "w"); fp.write(failed); fp.close()' - check_test_command += f"$(python3 -c '{py_command}'); " - check_test_command += 'cat summary_short.txt; echo ""; exit -1; ' - - # Deeal with failed tests - check_test_command += f'elif [ -s reports/{self.job_name}/failures_short.txt ]; ' - check_test_command += 'then echo "Some tests failed!"; echo ""; ' - check_test_command += f'cat reports/{self.job_name}/failures_short.txt; ' - check_test_command += 'echo ""; echo ""; ' - - py_command = f'import os; fp = open("reports/{self.job_name}/summary_short.txt"); failed = os.linesep.join([x for x in fp.read().split(os.linesep) if x.startswith("FAILED ")]); fp.close(); fp = open("summary_short.txt", "w"); fp.write(failed); fp.close()' - check_test_command += f"$(python3 -c '{py_command}'); " - check_test_command += 'cat summary_short.txt; echo ""; exit -1; ' - - check_test_command += f'elif [ -s reports/{self.job_name}/stats.txt ]; then echo "All tests pass!"; ' - - # return code `124` means the previous (pytest run) step is timeout - if self.name == "pr_documentation_tests": - check_test_command += 'elif [ -f 124.txt ]; then echo "doctest timeout!"; ' - - check_test_command += 'else echo "other fatal error"; echo ""; exit -1; fi;' - - steps.append({"run": {"name": "Check test results", "command": check_test_command}}) + steps.append({"run": {"name": "Skipped tests", "when": "always", "command": f"python3 .circleci/parse_test_outputs.py --file tests_output.txt --skip"}}) + steps.append({"run": {"name": 
"Failed tests", "when": "always", "command": f"python3 .circleci/parse_test_outputs.py --file tests_output.txt --fail"}}) + steps.append({"run": {"name": "Errors", "when": "always", "command": f"python3 .circleci/parse_test_outputs.py --file tests_output.txt --errors"}}) steps.append({"store_test_results": {"path": "test-results"}}) - - steps.append({"store_artifacts": {"path": "~/transformers/tests_output.txt"}}) - steps.append({"store_artifacts": {"path": "~/transformers/reports"}}) - - # save cache at the end: so pytest step runs before cache saving and we can see results earlier - steps.append( - { - "save_cache": { - "key": f"v{self.cache_version}-{self.cache_name}-{cache_branch_prefix}-pip-" + '{{ checksum "setup.py" }}', - "paths": ["~/.cache/pip"], - } - } - ) - steps.append( - { - "save_cache": { - "key": f"v{self.cache_version}-{self.cache_name}-{cache_branch_prefix}-site-packages-" + '{{ checksum "setup.py" }}', - "paths": ["~/.pyenv/versions/"], - } - } - ) + steps.append({"store_artifacts": {"path": "tests_output.txt"}}) + steps.append({"store_artifacts": {"path": "test-results/junit.xml"}}) + steps.append({"store_artifacts": {"path": "reports"}}) job["steps"] = steps return job @@ -278,18 +209,9 @@ def job_name(self): # JOBS torch_and_tf_job = CircleCIJob( "torch_and_tf", + docker_image=[{"image":"huggingface/transformers-torch-tf-light"}], + install_steps=["uv venv && uv pip install ."], additional_env={"RUN_PT_TF_CROSS_TESTS": True}, - install_steps=[ - "sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng git-lfs cmake", - "git lfs install", - "pip install --upgrade --upgrade-strategy eager pip", - "pip install -U --upgrade-strategy eager .[sklearn,tf-cpu,torch,testing,sentencepiece,torch-speech,vision]", - "pip install -U --upgrade-strategy eager tensorflow_probability", - # Without --no-deps we can't pin dependency versions in the future - "pip install -U --upgrade-strategy eager --no-deps -e git+https://github.com/huggingface/accelerate@main#egg=accelerate", - # TODO: remove this one after fixing the dependency issue(s) above - "pip install -U --upgrade-strategy eager torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu", - ], marker="is_pt_tf_cross_test", pytest_options={"rA": None, "durations": 0}, ) @@ -298,77 +220,53 @@ def job_name(self): torch_and_flax_job = CircleCIJob( "torch_and_flax", additional_env={"RUN_PT_FLAX_CROSS_TESTS": True}, - install_steps=[ - "sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng", - "pip install -U --upgrade-strategy eager --upgrade pip", - "pip install -U --upgrade-strategy eager .[sklearn,flax,torch,testing,sentencepiece,torch-speech,vision]", - # Without --no-deps we can't pin dependency versions in the future - "pip install -U --upgrade-strategy eager --no-deps -e git+https://github.com/huggingface/accelerate@main#egg=accelerate", - ], + docker_image=[{"image":"huggingface/transformers-torch-jax-light"}], + install_steps=["uv venv && uv pip install ."], marker="is_pt_flax_cross_test", pytest_options={"rA": None, "durations": 0}, ) - torch_job = CircleCIJob( "torch", - install_steps=[ - "sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng time", - "pip install --upgrade --upgrade-strategy eager pip", - "pip install -U --upgrade-strategy eager .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm]", - # Without --no-deps we can't pin dependency versions in the future - "pip install -U --upgrade-strategy eager --no-deps -e 
git+https://github.com/huggingface/accelerate@main#egg=accelerate", - ], - parallelism=1, - pytest_num_workers=12, + docker_image=[{"image": "huggingface/transformers-torch-light"}], + install_steps=["uv venv && uv pip install ."], + parallelism=6, + pytest_num_workers=16 ) tf_job = CircleCIJob( "tf", - install_steps=[ - "sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng cmake", - "pip install --upgrade --upgrade-strategy eager pip", - "pip install -U --upgrade-strategy eager .[sklearn,tf-cpu,testing,sentencepiece,tf-speech,vision]", - "pip install -U --upgrade-strategy eager tensorflow_probability", - ], - parallelism=1, + docker_image=[{"image":"huggingface/transformers-tf-light"}], + install_steps=["uv venv", "uv pip install -e."], + parallelism=6, + pytest_num_workers=16, ) flax_job = CircleCIJob( "flax", - install_steps=[ - "sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng", - "pip install --upgrade --upgrade-strategy eager pip", - "pip install -U --upgrade-strategy eager .[flax,testing,sentencepiece,flax-speech,vision]", - ], - parallelism=1, + docker_image=[{"image":"huggingface/transformers-jax-light"}], + install_steps=["uv venv && uv pip install ."], + parallelism=6, + pytest_num_workers=16 ) pipelines_torch_job = CircleCIJob( "pipelines_torch", additional_env={"RUN_PIPELINE_TESTS": True}, - install_steps=[ - "sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng", - "pip install --upgrade --upgrade-strategy eager pip", - "pip install -U --upgrade-strategy eager .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm,video]", - ], + docker_image=[{"image":"huggingface/transformers-torch-light"}], + install_steps=["uv venv && uv pip install ."], marker="is_pipeline_test", - pytest_num_workers=12, ) pipelines_tf_job = CircleCIJob( "pipelines_tf", additional_env={"RUN_PIPELINE_TESTS": True}, - install_steps=[ - "sudo apt-get -y update && sudo apt-get install -y cmake", - "pip install --upgrade --upgrade-strategy eager pip", - "pip install -U --upgrade-strategy eager .[sklearn,tf-cpu,testing,sentencepiece,vision]", - "pip install -U --upgrade-strategy eager tensorflow_probability", - ], + docker_image=[{"image":"huggingface/transformers-tf-light"}], + install_steps=["uv venv && uv pip install ."], marker="is_pipeline_test", ) @@ -376,22 +274,8 @@ def job_name(self): custom_tokenizers_job = CircleCIJob( "custom_tokenizers", additional_env={"RUN_CUSTOM_TOKENIZERS": True}, - install_steps=[ - "sudo apt-get -y update && sudo apt-get install -y cmake", - { - "name": "install jumanpp", - "command": - "wget https://github.com/ku-nlp/jumanpp/releases/download/v2.0.0-rc3/jumanpp-2.0.0-rc3.tar.xz\n" - "tar xvf jumanpp-2.0.0-rc3.tar.xz\n" - "mkdir jumanpp-2.0.0-rc3/bld\n" - "cd jumanpp-2.0.0-rc3/bld\n" - "sudo cmake .. 
-DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/usr/local\n" - "sudo make install\n", - }, - "pip install --upgrade --upgrade-strategy eager pip", - "pip install -U --upgrade-strategy eager .[ja,testing,sentencepiece,jieba,spacy,ftfy,rjieba]", - "python -m unidic download", - ], + docker_image=[{"image": "huggingface/transformers-custom-tokenizers"}], + install_steps=["uv venv","uv pip install -e ."], parallelism=None, resource_class=None, tests_to_run=[ @@ -406,14 +290,8 @@ def job_name(self): "examples_torch", additional_env={"OMP_NUM_THREADS": 8}, cache_name="torch_examples", - install_steps=[ - "sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng", - "pip install --upgrade --upgrade-strategy eager pip", - "pip install -U --upgrade-strategy eager .[sklearn,torch,sentencepiece,testing,torch-speech]", - "pip install -U --upgrade-strategy eager -r examples/pytorch/_tests_requirements.txt", - # Without --no-deps we can't pin dependency versions in the future - "pip install -U --upgrade-strategy eager --no-deps -e git+https://github.com/huggingface/accelerate@main#egg=accelerate", - ], + docker_image=[{"image":"huggingface/transformers-examples-torch"}], + install_steps=["uv venv && uv pip install ."], pytest_num_workers=1, ) @@ -421,24 +299,20 @@ def job_name(self): examples_tensorflow_job = CircleCIJob( "examples_tensorflow", cache_name="tensorflow_examples", - install_steps=[ - "sudo apt-get -y update && sudo apt-get install -y cmake", - "pip install --upgrade --upgrade-strategy eager pip", - "pip install -U --upgrade-strategy eager .[sklearn,tensorflow,sentencepiece,testing]", - "pip install -U --upgrade-strategy eager -r examples/tensorflow/_tests_requirements.txt", - ], + docker_image=[{"image":"huggingface/transformers-examples-tf"}], + install_steps=["uv venv && uv pip install ."], + parallelism=8 ) hub_job = CircleCIJob( "hub", additional_env={"HUGGINGFACE_CO_STAGING": True}, + docker_image=[{"image":"huggingface/transformers-torch-light"}], install_steps=[ - "sudo apt-get -y update && sudo apt-get install git-lfs", + "uv venv && uv pip install .", 'git config --global user.email "ci@dummy.com"', 'git config --global user.name "ci"', - "pip install --upgrade --upgrade-strategy eager pip", - "pip install -U --upgrade-strategy eager .[torch,sentencepiece,testing,vision]", ], marker="is_staging_test", pytest_num_workers=1, @@ -447,10 +321,11 @@ def job_name(self): onnx_job = CircleCIJob( "onnx", + docker_image=[{"image":"huggingface/transformers-torch-tf-light"}], install_steps=[ - "sudo apt-get -y update && sudo apt-get install -y cmake", - "pip install --upgrade --upgrade-strategy eager pip", - "pip install -U --upgrade-strategy eager .[torch,tf,testing,sentencepiece,onnxruntime,vision,rjieba]", + "uv venv && uv pip install .", + "uv pip install --upgrade eager pip", + "uv pip install .[torch,tf,testing,sentencepiece,onnxruntime,vision,rjieba]", ], pytest_options={"k onnx": None}, pytest_num_workers=1, @@ -459,22 +334,8 @@ def job_name(self): exotic_models_job = CircleCIJob( "exotic_models", - install_steps=[ - "sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev", - "pip install --upgrade --upgrade-strategy eager pip", - "pip install -U --upgrade-strategy eager .[torch,testing,vision]", - "pip install -U --upgrade-strategy eager torchvision", - "pip install -U --upgrade-strategy eager scipy", - "pip install -U --upgrade-strategy eager 'git+https://github.com/facebookresearch/detectron2.git'", - "sudo apt install tesseract-ocr", - "pip install 
-U --upgrade-strategy eager pytesseract", - "pip install --upgrade-strategy eager sentencepiece", - "pip install -U --upgrade-strategy eager natten==0.15.1+torch210cpu -f https://shi-labs.com/natten/wheels", - "pip install -U --upgrade-strategy eager python-Levenshtein", - "pip install -U --upgrade-strategy eager opencv-python", - "pip install -U --upgrade-strategy eager nltk", - "pip uninstall -y torch torchvision torchaudio && pip install -U --upgrade-strategy eager 'torch<2.2.0' 'torchvision<0.17' 'torchaudio<2.2.0'" - ], + install_steps=["uv venv && uv pip install ."], + docker_image=[{"image":"huggingface/transformers-exotic-models"}], tests_to_run=[ "tests/models/*layoutlmv*", "tests/models/*nat", @@ -482,17 +343,16 @@ def job_name(self): "tests/models/udop", "tests/models/nougat", ], - pytest_num_workers=1, + pytest_num_workers=12, + parallelism=4, pytest_options={"durations": 100}, ) repo_utils_job = CircleCIJob( "repo_utils", - install_steps=[ - "pip install --upgrade --upgrade-strategy eager pip", - "pip install -U --upgrade-strategy eager .[quality,testing,torch]", - ], + docker_image=[{"image":"huggingface/transformers-consistency"}], + install_steps=["uv venv && uv pip install ."], parallelism=None, pytest_num_workers=1, resource_class="large", @@ -508,20 +368,9 @@ def job_name(self): command = f'echo "{py_command}" > pr_documentation_tests_temp.txt' doc_test_job = CircleCIJob( "pr_documentation_tests", + docker_image=[{"image":"huggingface/transformers-consistency"}], additional_env={"TRANSFORMERS_VERBOSITY": "error", "DATASETS_VERBOSITY": "error", "SKIP_CUDA_DOCTEST": "1"}, install_steps=[ - "sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng time ffmpeg", - "pip install --upgrade --upgrade-strategy eager pip", - "pip install -U --upgrade-strategy eager -e .[dev]", - # Without --no-deps we can't pin dependency versions in the future - "pip install -U --upgrade-strategy eager --no-deps -e git+https://github.com/huggingface/accelerate@main#egg=accelerate", - "pip install --upgrade --upgrade-strategy eager 'pytest<8.0.0' pytest-sugar", - "pip install -U --upgrade-strategy eager natten==0.15.1+torch210cpu -f https://shi-labs.com/natten/wheels", - "pip install -U --upgrade-strategy eager g2p-en", - # TODO: remove this one after fixing the dependency issue(s) above - "pip install -U --upgrade-strategy eager torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu", - "find -name __pycache__ -delete", - "find . -name \*.pyc -delete", # Add an empty file to keep the test step running correctly even no file is selected to be tested. 
"touch dummy.py", { diff --git a/.circleci/parse_test_outputs.py b/.circleci/parse_test_outputs.py new file mode 100644 index 00000000000000..b80ce8513a1f91 --- /dev/null +++ b/.circleci/parse_test_outputs.py @@ -0,0 +1,70 @@ +import re +import argparse + +def parse_pytest_output(file_path): + skipped_tests = {} + skipped_count = 0 + with open(file_path, 'r') as file: + for line in file: + match = re.match(r'^SKIPPED \[(\d+)\] (tests/.*): (.*)$', line) + if match: + skipped_count += 1 + test_file, test_line, reason = match.groups() + skipped_tests[reason] = skipped_tests.get(reason, []) + [(test_file, test_line)] + for k,v in sorted(skipped_tests.items(), key=lambda x:len(x[1])): + print(f"{len(v):4} skipped because: {k}") + print("Number of skipped tests:", skipped_count) + +def parse_pytest_failure_output(file_path): + failed_tests = {} + failed_count = 0 + with open(file_path, 'r') as file: + for line in file: + match = re.match(r'^FAILED (tests/.*) - (.*): (.*)$', line) + if match: + failed_count += 1 + _, error, reason = match.groups() + failed_tests[reason] = failed_tests.get(reason, []) + [error] + for k,v in sorted(failed_tests.items(), key=lambda x:len(x[1])): + print(f"{len(v):4} failed because `{v[0]}` -> {k}") + print("Number of failed tests:", failed_count) + if failed_count>0: + exit(1) + +def parse_pytest_errors_output(file_path): + print(file_path) + error_tests = {} + error_count = 0 + with open(file_path, 'r') as file: + for line in file: + match = re.match(r'^ERROR (tests/.*) - (.*): (.*)$', line) + if match: + error_count += 1 + _, test_error, reason = match.groups() + error_tests[reason] = error_tests.get(reason, []) + [test_error] + for k,v in sorted(error_tests.items(), key=lambda x:len(x[1])): + print(f"{len(v):4} errored out because of `{v[0]}` -> {k}") + print("Number of errors:", error_count) + if error_count>0: + exit(1) + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--file", help="file to parse") + parser.add_argument("--skip", action="store_true", help="show skipped reasons") + parser.add_argument("--fail", action="store_true", help="show failed tests") + parser.add_argument("--errors", action="store_true", help="show failed tests") + args = parser.parse_args() + + if args.skip: + parse_pytest_output(args.file) + + if args.fail: + parse_pytest_failure_output(args.file) + + if args.errors: + parse_pytest_errors_output(args.file) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/.github/workflows/build-ci-docker-images.yml b/.github/workflows/build-ci-docker-images.yml new file mode 100644 index 00000000000000..f7b75a3a30e763 --- /dev/null +++ b/.github/workflows/build-ci-docker-images.yml @@ -0,0 +1,54 @@ +name: Build pr ci-docker + +on: + push: + branches: + - change-ci # for now let's only build on this branch + repository_dispatch: + workflow_call: + inputs: + image_postfix: + required: true + type: string + schedule: + - cron: "6 0 * * *" + + +concurrency: + group: ${{ github.workflow }} + cancel-in-progress: true + +jobs: + build: + runs-on: ubuntu-22.04 + + if: ${{ contains(github.event.head_commit.message, '[push-ci-image]') && '!cancelled()' }} + + strategy: + matrix: + file: ["quality", "consistency", "custom-tokenizers", "torch-light", "tf-light", "exotic-models", "torch-tf-light", "torch-jax-light", "jax-light", "examples-torch", "examples-tf"] + continue-on-error: true + + steps: + - + name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - + name: Check out code + uses: 
actions/checkout@v4 + - + name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + - + name: Build ${{ matrix.file }}.dockerfile + uses: docker/build-push-action@v5 + with: + context: ./docker + build-args: | + REF=${{ github.sha }} + file: "./docker/${{ matrix.file }}.dockerfile" + push: true + tags: huggingface/transformers-${{ matrix.file }} \ No newline at end of file diff --git a/docker/consistency.dockerfile b/docker/consistency.dockerfile new file mode 100644 index 00000000000000..fa94259e2dedae --- /dev/null +++ b/docker/consistency.dockerfile @@ -0,0 +1,14 @@ +FROM python:3.10-slim +ENV PYTHONDONTWRITEBYTECODE=1 +USER root +RUN apt-get update && apt-get install -y time git pkg-config make git-lfs +ENV VIRTUAL_ENV=/usr/local +RUN pip install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools GitPython +RUN uv pip install --no-cache-dir --upgrade 'torch' --index-url https://download.pytorch.org/whl/cpu +RUN uv pip install --no-cache-dir tensorflow-cpu tf-keras +RUN uv pip install --no-cache-dir "transformers[flax,quality,vision,testing]" +RUN git lfs install + +RUN pip uninstall -y transformers +RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get autoremove && apt-get autoclean + diff --git a/docker/custom-tokenizers.dockerfile b/docker/custom-tokenizers.dockerfile new file mode 100644 index 00000000000000..19860841da629e --- /dev/null +++ b/docker/custom-tokenizers.dockerfile @@ -0,0 +1,26 @@ +FROM python:3.10-slim +ENV PYTHONDONTWRITEBYTECODE=1 +USER root +RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git cmake wget xz-utils build-essential g++5 libprotobuf-dev protobuf-compiler +ENV VIRTUAL_ENV=/usr/local +RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools + +RUN wget https://github.com/ku-nlp/jumanpp/releases/download/v2.0.0-rc3/jumanpp-2.0.0-rc3.tar.xz +RUN tar xvf jumanpp-2.0.0-rc3.tar.xz +RUN mkdir jumanpp-2.0.0-rc3/bld +WORKDIR ./jumanpp-2.0.0-rc3/bld +RUN wget -LO catch.hpp https://github.com/catchorg/Catch2/releases/download/v2.13.8/catch.hpp +RUN mv catch.hpp ../libs/ +RUN cmake .. -DCMAKE_INSTALL_PREFIX=/usr/local +RUN make install -j 10 + + +RUN uv pip install --no-cache --upgrade 'torch' --index-url https://download.pytorch.org/whl/cpu +RUN uv pip install --no-cache-dir --no-deps accelerate --extra-index-url https://download.pytorch.org/whl/cpu +RUN uv pip install --no-cache-dir "transformers[ja,testing,sentencepiece,jieba,spacy,ftfy,rjieba]" unidic unidic-lite +# spacy is not used so not tested. Causes to failures. 
TODO fix later +RUN python3 -m unidic download +RUN pip uninstall -y transformers + +RUN apt-get clean && rm -rf /var/lib/apt/lists/* +RUN apt remove -y g++ cmake xz-utils libprotobuf-dev protobuf-compiler \ No newline at end of file diff --git a/docker/examples-tf.dockerfile b/docker/examples-tf.dockerfile new file mode 100644 index 00000000000000..898f199504da54 --- /dev/null +++ b/docker/examples-tf.dockerfile @@ -0,0 +1,12 @@ +FROM python:3.10-slim +ENV PYTHONDONTWRITEBYTECODE=1 +USER root +RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git +RUN apt-get install -y g++ cmake +ENV VIRTUAL_ENV=/usr/local +RUN pip --no-cache-dir install uv && uv venv +RUN uv pip install --no-cache-dir -U pip setuptools albumentations seqeval +RUN pip install --upgrade --no-cache-dir "transformers[tf-cpu,sklearn,testing,sentencepiece,tf-speech,vision]" +RUN uv pip install --no-cache-dir "protobuf==3.20.3" +RUN pip uninstall -y transformers +RUN apt-get clean && rm -rf /var/lib/apt/lists/* \ No newline at end of file diff --git a/docker/examples-torch.dockerfile b/docker/examples-torch.dockerfile new file mode 100644 index 00000000000000..fa8d63865da12b --- /dev/null +++ b/docker/examples-torch.dockerfile @@ -0,0 +1,11 @@ +FROM python:3.10-slim +ENV PYTHONDONTWRITEBYTECODE=1 +USER root +RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git +ENV VIRTUAL_ENV=/usr/local +RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools +RUN pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu +RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu +RUN uv pip install --no-cache-dir librosa "transformers[sklearn,sentencepiece,vision,testing]" seqeval albumentations jiwer +RUN pip uninstall -y transformers +RUN apt-get clean && rm -rf /var/lib/apt/lists/* \ No newline at end of file diff --git a/docker/exotic-models.dockerfile b/docker/exotic-models.dockerfile new file mode 100644 index 00000000000000..ea2db367402563 --- /dev/null +++ b/docker/exotic-models.dockerfile @@ -0,0 +1,17 @@ +FROM python:3.10-slim +ENV PYTHONDONTWRITEBYTECODE=1 +ARG REF=main +USER root +RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git libgl1-mesa-glx libgl1 g++ tesseract-ocr +ENV VIRTUAL_ENV=/usr/local +RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools +RUN pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu +RUN uv pip install --no-cache-dir --no-deps timm accelerate +RUN pip install -U --upgrade-strategy eager --no-cache-dir pytesseract python-Levenshtein opencv-python nltk +# RUN uv pip install --no-cache-dir natten==0.15.1+torch210cpu -f https://shi-labs.com/natten/wheels +RUN pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[testing, vision]" 'scikit-learn' 'torch-stft' 'nose' 'dataset' +# RUN git clone https://github.com/facebookresearch/detectron2.git +# RUN python3 -m pip install --no-cache-dir -e detectron2 +RUN pip install 'git+https://github.com/facebookresearch/detectron2.git@92ae9f0b92aba5867824b4f12aa06a22a60a45d3' +RUN pip uninstall -y transformers +RUN apt-get clean && rm -rf /var/lib/apt/lists/* diff --git a/docker/jax-light.dockerfile b/docker/jax-light.dockerfile new file mode 100644 index 
00000000000000..838333062839e8 --- /dev/null +++ b/docker/jax-light.dockerfile @@ -0,0 +1,9 @@ +FROM python:3.10-slim +ENV PYTHONDONTWRITEBYTECODE=1 +USER root +RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git g++ cmake +ENV VIRTUAL_ENV=/usr/local +RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools +RUN pip install --no-cache-dir "scipy<1.13" "transformers[flax,testing,sentencepiece,flax-speech,vision]" +RUN pip uninstall -y transformers +RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get autoremove && apt-get autoclean \ No newline at end of file diff --git a/docker/pipeline-tf.dockerfile b/docker/pipeline-tf.dockerfile new file mode 100644 index 00000000000000..81af0ea9c15b91 --- /dev/null +++ b/docker/pipeline-tf.dockerfile @@ -0,0 +1,9 @@ +FROM python:3.10-slim +ENV PYTHONDONTWRITEBYTECODE=1 +USER root +RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git cmake g++ +ENV VIRTUAL_ENV=/usr/local +RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools +RUN pip install --no-cache-dir "transformers[sklearn,tf-cpu,testing,sentencepiece,tf-speech,vision]" +RUN uv pip install --no-cache-dir "protobuf==3.20.3" tensorflow_probability +RUN apt-get clean && rm -rf /var/lib/apt/lists/* \ No newline at end of file diff --git a/docker/pipeline-torch.dockerfile b/docker/pipeline-torch.dockerfile new file mode 100644 index 00000000000000..554d9783144232 --- /dev/null +++ b/docker/pipeline-torch.dockerfile @@ -0,0 +1,10 @@ +FROM python:3.10-slim +ENV PYTHONDONTWRITEBYTECODE=1 +USER root +RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git pkg-config openssh-client git +ENV VIRTUAL_ENV=/usr/local +RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools +RUN pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu +RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu +RUN uv pip install --no-cache-dir librosa "transformers[sklearn,sentencepiece,vision,testing]" +RUN pip uninstall -y transformers \ No newline at end of file diff --git a/docker/quality.dockerfile b/docker/quality.dockerfile new file mode 100644 index 00000000000000..471af0526b4f53 --- /dev/null +++ b/docker/quality.dockerfile @@ -0,0 +1,8 @@ +FROM python:3.10-slim +ENV PYTHONDONTWRITEBYTECODE=1 +USER root +RUN apt-get update && apt-get install -y time git +ENV VIRTUAL_ENV=/usr/local +RUN pip install uv && uv venv +RUN uv pip install --no-cache-dir -U pip setuptools GitPython transformers "ruff==0.1.5" urllib3 +RUN apt-get install -y jq curl && apt-get clean && rm -rf /var/lib/apt/lists/* \ No newline at end of file diff --git a/docker/tf-light.dockerfile b/docker/tf-light.dockerfile new file mode 100644 index 00000000000000..23dcd40db2094b --- /dev/null +++ b/docker/tf-light.dockerfile @@ -0,0 +1,11 @@ +FROM python:3.10-slim +ENV PYTHONDONTWRITEBYTECODE=1 +USER root +RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ pkg-config openssh-client git +RUN apt-get install -y cmake +ENV VIRTUAL_ENV=/usr/local +RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools +RUN pip install --upgrade --no-cache-dir "transformers[tf-cpu,sklearn,testing,sentencepiece,tf-speech,vision]" +RUN uv pip install --no-cache-dir "protobuf==3.20.3" 
+RUN pip uninstall -y transformers +RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get autoremove && apt-get autoclean \ No newline at end of file diff --git a/docker/torch-jax-light.dockerfile b/docker/torch-jax-light.dockerfile new file mode 100644 index 00000000000000..ef28563ec102ba --- /dev/null +++ b/docker/torch-jax-light.dockerfile @@ -0,0 +1,15 @@ +FROM python:3.10-slim +ENV PYTHONDONTWRITEBYTECODE=1 +USER root +RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git +ENV VIRTUAL_ENV=/usr/local +RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools +RUN uv pip install --no-deps accelerate +RUN pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu +RUN pip install --no-cache-dir "scipy<1.13" "transformers[flax, audio, sklearn,sentencepiece,vision,testing]" + + +# RUN pip install --no-cache-dir "scipy<1.13" "transformers[flax,testing,sentencepiece,flax-speech,vision]" + +RUN pip uninstall -y transformers +RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get autoremove && apt-get autoclean diff --git a/docker/torch-light.dockerfile b/docker/torch-light.dockerfile new file mode 100644 index 00000000000000..2172d66ca8a769 --- /dev/null +++ b/docker/torch-light.dockerfile @@ -0,0 +1,10 @@ +FROM python:3.10-slim +ENV PYTHONDONTWRITEBYTECODE=1 +USER root +RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git git-lfs +ENV VIRTUAL_ENV=/usr/local +RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools +RUN pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu +RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu +RUN uv pip install --no-cache-dir librosa "transformers[sklearn,sentencepiece,vision,testing]" +RUN pip uninstall -y transformers \ No newline at end of file diff --git a/docker/torch-tf-light.dockerfile b/docker/torch-tf-light.dockerfile new file mode 100644 index 00000000000000..f7122930b444ec --- /dev/null +++ b/docker/torch-tf-light.dockerfile @@ -0,0 +1,19 @@ +FROM python:3.10-slim +ENV PYTHONDONTWRITEBYTECODE=1 +ARG REF=main +RUN echo ${REF} +USER root +RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git git-lfs +ENV VIRTUAL_ENV=/usr/local +RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools +RUN uv pip install --no-cache-dir --no-deps accelerate --extra-index-url https://download.pytorch.org/whl/cpu +RUN pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu +RUN git lfs install + +RUN uv pip install --no-cache-dir pypi-kenlm +RUN pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[tf-cpu,sklearn,sentencepiece,vision,testing]" +RUN uv pip install --no-cache-dir "protobuf==3.20.3" librosa + + +RUN pip uninstall -y transformers +RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get autoremove && apt-get autoclean \ No newline at end of file diff --git a/setup.py b/setup.py index 37fa11379d6417..aae789a07d5111 100644 --- a/setup.py +++ b/setup.py @@ -116,7 +116,6 @@ "ftfy", "fugashi>=1.0", "GitPython<3.1.19", - "hf-doc-builder>=0.3.0", 
"huggingface-hub>=0.19.3,<1.0", "importlib_metadata", "ipadic>=1.0.0,<2.0", @@ -126,7 +125,7 @@ "jieba", "kenlm", # Keras pin - this is to make sure Keras 3 doesn't destroy us. Remove or change when we have proper support. - "keras<2.16", + "keras>2.9,<2.16", "keras-nlp>=0.3.1", "librosa", "nltk", @@ -169,9 +168,10 @@ "sudachidict_core>=20220729", "tensorboard", # TensorFlow pin. When changing this value, update examples/tensorflow/_tests_requirements.txt accordingly - "tensorflow-cpu>=2.6,<2.16", - "tensorflow>=2.6,<2.16", + "tensorflow-cpu>2.9,<2.16", + "tensorflow>2.9,<2.16", "tensorflow-text<2.16", + "tensorflow-probability<2.16", "tf2onnx", "timeout-decorator", "timm", @@ -185,6 +185,7 @@ "unidic_lite>=1.0.7", "urllib3<2.0.0", "uvicorn", + "pytest-rich", ] @@ -258,7 +259,7 @@ def run(self): extras["sklearn"] = deps_list("scikit-learn") extras["tf"] = deps_list("tensorflow", "onnxconverter-common", "tf2onnx", "tensorflow-text", "keras-nlp") -extras["tf-cpu"] = deps_list("tensorflow-cpu", "onnxconverter-common", "tf2onnx", "tensorflow-text", "keras-nlp") +extras["tf-cpu"] = deps_list("keras", "tensorflow-cpu", "onnxconverter-common", "tf2onnx", "tensorflow-text", "keras-nlp", "tensorflow-probability") extras["torch"] = deps_list("torch", "accelerate") extras["accelerate"] = deps_list("accelerate") @@ -302,6 +303,7 @@ def run(self): extras["testing"] = ( deps_list( "pytest", + "pytest-rich", "pytest-xdist", "timeout-decorator", "parameterized", @@ -315,8 +317,6 @@ def run(self): "rouge-score", "nltk", "GitPython", - "hf-doc-builder", - "protobuf", # Can be removed once we can unpin protobuf "sacremoses", "rjieba", "beautifulsoup4", @@ -330,7 +330,7 @@ def run(self): extras["deepspeed-testing"] = extras["deepspeed"] + extras["testing"] + extras["optuna"] + extras["sentencepiece"] -extras["quality"] = deps_list("datasets", "isort", "ruff", "GitPython", "hf-doc-builder", "urllib3") +extras["quality"] = deps_list("datasets", "isort", "ruff", "GitPython", "urllib3") extras["all"] = ( extras["tf"] @@ -348,11 +348,6 @@ def run(self): + extras["video"] ) -# Might need to add doc-builder and some specific deps in the future -extras["docs_specific"] = ["hf-doc-builder"] - -# "docs" needs "all" to resolve all the references -extras["docs"] = extras["all"] + extras["docs_specific"] extras["dev-torch"] = ( extras["testing"] @@ -367,7 +362,6 @@ def run(self): + extras["codecarbon"] + extras["quality"] + extras["ja"] - + extras["docs_specific"] + extras["sklearn"] + extras["modelcreation"] + extras["onnxruntime"] @@ -379,7 +373,6 @@ def run(self): + extras["tokenizers"] + extras["vision"] + extras["quality"] - + extras["docs_specific"] + extras["sklearn"] + extras["modelcreation"] + extras["onnx"] @@ -390,7 +383,6 @@ def run(self): + extras["testing"] + extras["quality"] + extras["ja"] - + extras["docs_specific"] + extras["sklearn"] + extras["modelcreation"] ) @@ -463,3 +455,18 @@ def run(self): ], cmdclass={"deps_table_update": DepsTableUpdateCommand}, ) + +extras["tests_torch"] = deps_list() +extras["tests_tf"] = deps_list() +extras["tests_flax"] = deps_list() +extras["tests_torch_and_tf"] = deps_list() +extras["tests_torch_and_flax"] = deps_list() +extras["tests_hub"] = deps_list() +extras["tests_pipelines_torch"] = deps_list() +extras["tests_pipelines_tf"] = deps_list() +extras["tests_onnx"] = deps_list() +extras["tests_examples_torch"] = deps_list() +extras["tests_examples_tf"] = deps_list() +extras["tests_custom_tokenizers"] = deps_list() +extras["tests_exotic_models"] = deps_list() 
+extras["consistency"] = deps_list() \ No newline at end of file diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index 7f78c8285bb31f..8db7b73637f462 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -23,7 +23,6 @@ "ftfy": "ftfy", "fugashi": "fugashi>=1.0", "GitPython": "GitPython<3.1.19", - "hf-doc-builder": "hf-doc-builder>=0.3.0", "huggingface-hub": "huggingface-hub>=0.19.3,<1.0", "importlib_metadata": "importlib_metadata", "ipadic": "ipadic>=1.0.0,<2.0", @@ -32,7 +31,7 @@ "jaxlib": "jaxlib>=0.4.1,<=0.4.13", "jieba": "jieba", "kenlm": "kenlm", - "keras": "keras<2.16", + "keras": "keras>2.9,<2.16", "keras-nlp": "keras-nlp>=0.3.1", "librosa": "librosa", "nltk": "nltk", @@ -74,9 +73,10 @@ "sudachipy": "sudachipy>=0.6.6", "sudachidict_core": "sudachidict_core>=20220729", "tensorboard": "tensorboard", - "tensorflow-cpu": "tensorflow-cpu>=2.6,<2.16", - "tensorflow": "tensorflow>=2.6,<2.16", + "tensorflow-cpu": "tensorflow-cpu>2.9,<2.16", + "tensorflow": "tensorflow>2.9,<2.16", "tensorflow-text": "tensorflow-text<2.16", + "tensorflow-probability": "tensorflow-probability<2.16", "tf2onnx": "tf2onnx", "timeout-decorator": "timeout-decorator", "timm": "timm", @@ -90,4 +90,5 @@ "unidic_lite": "unidic_lite>=1.0.7", "urllib3": "urllib3<2.0.0", "uvicorn": "uvicorn", + "pytest-rich": "pytest-rich", } diff --git a/src/transformers/models/groupvit/modeling_tf_groupvit.py b/src/transformers/models/groupvit/modeling_tf_groupvit.py index 31c76083e02287..0b22a28260aa9e 100644 --- a/src/transformers/models/groupvit/modeling_tf_groupvit.py +++ b/src/transformers/models/groupvit/modeling_tf_groupvit.py @@ -63,6 +63,15 @@ "It seems you have `tensorflow_probability` installed with the wrong tensorflow version." "Please try to reinstall it following the instructions here: https://github.com/tensorflow/probability." ) +else: + try: + import tensorflow_probability as tfp + + # On the first call, check whether a compatible version of TensorFlow is installed + # TensorFlow Probability depends on a recent stable release of TensorFlow + _ = tfp.distributions.Normal(loc=0.0, scale=1.0) + except ImportError: + pass _CHECKPOINT_FOR_DOC = "nvidia/groupvit-gcc-yfcc" diff --git a/src/transformers/models/tapas/modeling_tf_tapas.py b/src/transformers/models/tapas/modeling_tf_tapas.py index 6b2ed5fab455a8..7cb64a482f3845 100644 --- a/src/transformers/models/tapas/modeling_tf_tapas.py +++ b/src/transformers/models/tapas/modeling_tf_tapas.py @@ -50,7 +50,6 @@ is_tensorflow_probability_available, logging, replace_return_docstrings, - requires_backends, ) from .configuration_tapas import TapasConfig @@ -71,6 +70,15 @@ "It seems you have `tensorflow_probability` installed with the wrong tensorflow version. " "Please try to reinstall it following the instructions here: https://github.com/tensorflow/probability." 
) +else: + try: + import tensorflow_probability as tfp + + # On the first call, check whether a compatible version of TensorFlow is installed + # TensorFlow Probability depends on a recent stable release of TensorFlow + _ = tfp.distributions.Normal(loc=0.0, scale=1.0) + except ImportError: + pass _CONFIG_FOR_DOC = "TapasConfig" _CHECKPOINT_FOR_DOC = "google/tapas-base" @@ -830,7 +838,6 @@ class TFTapasMainLayer(keras.layers.Layer): config_class = TapasConfig def __init__(self, config: TapasConfig, add_pooling_layer: bool = True, **kwargs): - requires_backends(self, "tensorflow_probability") super().__init__(**kwargs) self.config = config diff --git a/tests/models/layoutlmv2/test_image_processing_layoutlmv2.py b/tests/models/layoutlmv2/test_image_processing_layoutlmv2.py index b6200c3ee5602c..eebb7420be30b0 100644 --- a/tests/models/layoutlmv2/test_image_processing_layoutlmv2.py +++ b/tests/models/layoutlmv2/test_image_processing_layoutlmv2.py @@ -95,6 +95,7 @@ def test_image_processor_from_dict_with_kwargs(self): image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42) self.assertEqual(image_processor.size, {"height": 42, "width": 42}) + @unittest.skip("Tesseract version is not correct in ci. @Arthur FIXME") def test_layoutlmv2_integration_test(self): # with apply_OCR = True image_processing = LayoutLMv2ImageProcessor() @@ -111,9 +112,9 @@ def test_layoutlmv2_integration_test(self): self.assertEqual(len(encoding.words), len(encoding.boxes)) # fmt: off - # the words and boxes were obtained with Tesseract 4.1.1 + # the words and boxes were obtained with Tesseract 5.3.0 expected_words = [['11:14', 'to', '11:39', 'a.m', '11:39', 'to', '11:44', 'a.m.', '11:44', 'a.m.', 'to', '12:25', 'p.m.', '12:25', 'to', '12:58', 'p.m.', '12:58', 'to', '4:00', 'p.m.', '2:00', 'to', '5:00', 'p.m.', 'Coffee', 'Break', 'Coffee', 'will', 'be', 'served', 'for', 'men', 'and', 'women', 'in', 'the', 'lobby', 'adjacent', 'to', 'exhibit', 'area.', 'Please', 'move', 'into', 'exhibit', 'area.', '(Exhibits', 'Open)', 'TRRF', 'GENERAL', 'SESSION', '(PART', '|)', 'Presiding:', 'Lee', 'A.', 'Waller', 'TRRF', 'Vice', 'President', '“Introductory', 'Remarks”', 'Lee', 'A.', 'Waller,', 'TRRF', 'Vice', 'Presi-', 'dent', 'Individual', 'Interviews', 'with', 'TRRF', 'Public', 'Board', 'Members', 'and', 'Sci-', 'entific', 'Advisory', 'Council', 'Mem-', 'bers', 'Conducted', 'by', 'TRRF', 'Treasurer', 'Philip', 'G.', 'Kuehn', 'to', 'get', 'answers', 'which', 'the', 'public', 'refrigerated', 'warehousing', 'industry', 'is', 'looking', 'for.', 'Plus', 'questions', 'from', 'the', 'floor.', 'Dr.', 'Emil', 'M.', 'Mrak,', 'University', 'of', 'Cal-', 'ifornia,', 'Chairman,', 'TRRF', 'Board;', 'Sam', 'R.', 'Cecil,', 'University', 'of', 'Georgia', 'College', 'of', 'Agriculture;', 'Dr.', 'Stanley', 'Charm,', 'Tufts', 'University', 'School', 'of', 'Medicine;', 'Dr.', 'Robert', 'H.', 'Cotton,', 'ITT', 'Continental', 'Baking', 'Company;', 'Dr.', 'Owen', 'Fennema,', 'University', 'of', 'Wis-', 'consin;', 'Dr.', 'Robert', 'E.', 'Hardenburg,', 'USDA.', 'Questions', 'and', 'Answers', 'Exhibits', 'Open', 'Capt.', 'Jack', 'Stoney', 'Room', 'TRRF', 'Scientific', 'Advisory', 'Council', 'Meeting', 'Ballroom', 'Foyer']] # noqa: E231 - expected_boxes = [[[141, 57, 214, 69], [228, 58, 252, 69], [141, 75, 216, 88], [230, 79, 280, 88], [142, 260, 218, 273], [230, 261, 255, 273], [143, 279, 218, 290], [231, 282, 290, 291], [143, 342, 218, 354], [231, 345, 289, 355], [202, 362, 227, 373], [143, 379, 220, 392], [231, 382, 291, 
394], [144, 714, 220, 726], [231, 715, 256, 726], [144, 732, 220, 745], [232, 736, 291, 747], [144, 769, 218, 782], [231, 770, 256, 782], [141, 788, 202, 801], [215, 791, 274, 804], [143, 826, 204, 838], [215, 826, 240, 838], [142, 844, 202, 857], [215, 847, 274, 859], [334, 57, 427, 69], [440, 57, 522, 69], [369, 75, 461, 88], [469, 75, 516, 88], [528, 76, 562, 88], [570, 76, 667, 88], [675, 75, 711, 87], [721, 79, 778, 88], [789, 75, 840, 88], [369, 97, 470, 107], [484, 94, 507, 106], [518, 94, 562, 107], [576, 94, 655, 110], [668, 94, 792, 109], [804, 95, 829, 107], [369, 113, 465, 125], [477, 116, 547, 125], [562, 113, 658, 125], [671, 116, 748, 125], [761, 113, 811, 125], [369, 131, 465, 143], [477, 133, 548, 143], [563, 130, 698, 145], [710, 130, 802, 146], [336, 171, 412, 183], [423, 171, 572, 183], [582, 170, 716, 184], [728, 171, 817, 187], [829, 171, 844, 186], [338, 197, 482, 212], [507, 196, 557, 209], [569, 196, 595, 208], [610, 196, 702, 209], [505, 214, 583, 226], [595, 214, 656, 227], [670, 215, 807, 227], [335, 259, 543, 274], [556, 259, 708, 272], [372, 279, 422, 291], [435, 279, 460, 291], [474, 279, 574, 292], [587, 278, 664, 291], [676, 278, 738, 291], [751, 279, 834, 291], [372, 298, 434, 310], [335, 341, 483, 354], [497, 341, 655, 354], [667, 341, 728, 354], [740, 341, 825, 354], [335, 360, 430, 372], [442, 360, 534, 372], [545, 359, 687, 372], [697, 360, 754, 372], [765, 360, 823, 373], [334, 378, 428, 391], [440, 378, 577, 394], [590, 378, 705, 391], [720, 378, 801, 391], [334, 397, 400, 409], [370, 416, 529, 429], [544, 416, 576, 432], [587, 416, 665, 428], [677, 416, 814, 429], [372, 435, 452, 450], [465, 434, 495, 447], [511, 434, 600, 447], [611, 436, 637, 447], [649, 436, 694, 451], [705, 438, 824, 447], [369, 453, 452, 466], [464, 454, 509, 466], [522, 453, 611, 469], [625, 453, 792, 469], [370, 472, 556, 488], [570, 472, 684, 487], [697, 472, 718, 485], [732, 472, 835, 488], [369, 490, 411, 503], [425, 490, 484, 503], [496, 490, 635, 506], [645, 490, 707, 503], [718, 491, 761, 503], [771, 490, 840, 503], [336, 510, 374, 521], [388, 510, 447, 522], [460, 510, 489, 521], [503, 510, 580, 522], [592, 509, 736, 525], [745, 509, 770, 522], [781, 509, 840, 522], [338, 528, 434, 541], [448, 528, 596, 541], [609, 527, 687, 540], [700, 528, 792, 541], [336, 546, 397, 559], [407, 546, 431, 559], [443, 546, 525, 560], [537, 546, 680, 562], [688, 546, 714, 559], [722, 546, 837, 562], [336, 565, 449, 581], [461, 565, 485, 577], [497, 565, 665, 581], [681, 565, 718, 577], [732, 565, 837, 580], [337, 584, 438, 597], [452, 583, 521, 596], [535, 584, 677, 599], [690, 583, 787, 596], [801, 583, 825, 596], [338, 602, 478, 615], [492, 602, 530, 614], [543, 602, 638, 615], [650, 602, 676, 614], [688, 602, 788, 615], [802, 602, 843, 614], [337, 621, 502, 633], [516, 621, 615, 637], [629, 621, 774, 636], [789, 621, 827, 633], [337, 639, 418, 652], [432, 640, 571, 653], [587, 639, 731, 655], [743, 639, 769, 652], [780, 639, 841, 652], [338, 658, 440, 673], [455, 658, 491, 670], [508, 658, 602, 671], [616, 658, 638, 670], [654, 658, 835, 674], [337, 677, 429, 689], [337, 714, 482, 726], [495, 714, 548, 726], [561, 714, 683, 726], [338, 770, 461, 782], [474, 769, 554, 785], [489, 788, 562, 803], [576, 788, 643, 801], [656, 787, 751, 804], [764, 788, 844, 801], [334, 825, 421, 838], [430, 824, 574, 838], [584, 824, 723, 841], [335, 844, 450, 857], [464, 843, 583, 860], [628, 862, 755, 875], [769, 861, 848, 878]]] # noqa: E231 + expected_boxes = [[[141, 57, 210, 69], [228, 58, 252, 69], 
[141, 75, 216, 88], [230, 79, 280, 88], [142, 260, 218, 273], [230, 261, 255, 273], [143, 279, 218, 290], [231, 282, 290, 291], [143, 342, 218, 354], [231, 345, 289, 355], [202, 362, 227, 373], [143, 379, 220, 392], [231, 382, 291, 394], [144, 714, 220, 726], [231, 715, 256, 726], [144, 732, 220, 745], [232, 736, 291, 747], [144, 769, 218, 782], [231, 770, 256, 782], [141, 788, 202, 801], [215, 791, 274, 804], [143, 826, 204, 838], [215, 826, 240, 838], [142, 844, 202, 857], [215, 847, 274, 859], [334, 57, 427, 69], [440, 57, 522, 69], [369, 75, 461, 88], [469, 75, 516, 88], [528, 76, 562, 88], [570, 76, 667, 88], [675, 75, 711, 87], [721, 79, 778, 88], [789, 75, 840, 88], [369, 97, 470, 107], [484, 94, 507, 106], [518, 94, 562, 107], [576, 94, 655, 110], [668, 94, 792, 109], [804, 95, 829, 107], [369, 113, 465, 125], [477, 116, 547, 125], [562, 113, 658, 125], [671, 116, 748, 125], [761, 113, 811, 125], [369, 131, 465, 143], [477, 133, 548, 143], [563, 130, 698, 145], [710, 130, 802, 146], [336, 171, 412, 183], [423, 171, 572, 183], [582, 170, 716, 184], [728, 171, 817, 187], [829, 171, 844, 186], [338, 197, 482, 212], [507, 196, 557, 209], [569, 196, 595, 208], [610, 196, 702, 209], [505, 214, 583, 226], [595, 214, 656, 227], [670, 215, 807, 227], [335, 259, 543, 274], [556, 259, 708, 272], [372, 279, 422, 291], [435, 279, 460, 291], [474, 279, 574, 292], [587, 278, 664, 291], [676, 278, 738, 291], [751, 279, 834, 291], [372, 298, 434, 310], [335, 341, 483, 354], [497, 341, 655, 354], [667, 341, 728, 354], [740, 341, 825, 354], [335, 360, 430, 372], [442, 360, 534, 372], [545, 359, 687, 372], [697, 360, 754, 372], [765, 360, 823, 373], [334, 378, 428, 391], [440, 378, 577, 394], [590, 378, 705, 391], [720, 378, 801, 391], [334, 397, 400, 409], [370, 416, 529, 429], [544, 416, 576, 432], [587, 416, 665, 428], [677, 416, 814, 429], [372, 435, 452, 450], [465, 434, 495, 447], [511, 434, 600, 447], [611, 436, 637, 447], [649, 436, 694, 451], [705, 438, 824, 447], [369, 453, 452, 466], [464, 454, 509, 466], [522, 453, 611, 469], [625, 453, 792, 469], [370, 472, 556, 488], [570, 472, 684, 487], [697, 472, 718, 485], [732, 472, 835, 488], [369, 490, 411, 503], [425, 490, 484, 503], [496, 490, 635, 506], [645, 490, 707, 503], [718, 491, 761, 503], [771, 490, 840, 503], [336, 510, 374, 521], [388, 510, 447, 522], [460, 510, 489, 521], [503, 510, 580, 522], [592, 509, 736, 525], [745, 509, 770, 522], [781, 509, 840, 522], [338, 528, 434, 541], [448, 528, 596, 541], [609, 527, 687, 540], [700, 528, 792, 541], [336, 546, 397, 559], [407, 546, 431, 559], [443, 546, 525, 560], [537, 546, 680, 562], [695, 546, 714, 559], [722, 546, 837, 562], [336, 565, 449, 581], [461, 565, 485, 577], [497, 565, 665, 581], [681, 565, 718, 577], [732, 565, 837, 580], [337, 584, 438, 597], [452, 583, 521, 596], [535, 584, 677, 599], [690, 583, 787, 596], [801, 583, 825, 596], [338, 602, 478, 615], [492, 602, 530, 614], [543, 602, 638, 615], [650, 602, 676, 614], [688, 602, 788, 615], [802, 602, 843, 614], [337, 621, 502, 633], [516, 621, 615, 637], [629, 621, 774, 636], [789, 621, 827, 633], [337, 639, 418, 652], [432, 640, 571, 653], [587, 639, 731, 655], [743, 639, 769, 652], [780, 639, 841, 652], [338, 658, 440, 673], [455, 658, 491, 670], [508, 658, 602, 671], [616, 658, 638, 670], [654, 658, 835, 674], [337, 677, 429, 689], [337, 714, 482, 726], [495, 714, 548, 726], [561, 714, 683, 726], [338, 770, 461, 782], [474, 769, 554, 785], [489, 788, 562, 803], [576, 788, 643, 801], [656, 787, 751, 804], [764, 788, 844, 
801], [334, 825, 421, 838], [430, 824, 574, 838], [584, 824, 723, 841], [335, 844, 450, 857], [464, 843, 583, 860], [628, 862, 755, 875], [769, 861, 848, 878]]] # noqa: E231 # fmt: on self.assertListEqual(encoding.words, expected_words) diff --git a/tests/models/layoutlmv3/test_image_processing_layoutlmv3.py b/tests/models/layoutlmv3/test_image_processing_layoutlmv3.py index 9b19c376d90ba1..8d4b64c2ccd409 100644 --- a/tests/models/layoutlmv3/test_image_processing_layoutlmv3.py +++ b/tests/models/layoutlmv3/test_image_processing_layoutlmv3.py @@ -111,9 +111,9 @@ def test_LayoutLMv3_integration_test(self): self.assertEqual(len(encoding.words), len(encoding.boxes)) # fmt: off - # the words and boxes were obtained with Tesseract 4.1.1 + # the words and boxes were obtained with Tesseract 5.3.0 expected_words = [['11:14', 'to', '11:39', 'a.m', '11:39', 'to', '11:44', 'a.m.', '11:44', 'a.m.', 'to', '12:25', 'p.m.', '12:25', 'to', '12:58', 'p.m.', '12:58', 'to', '4:00', 'p.m.', '2:00', 'to', '5:00', 'p.m.', 'Coffee', 'Break', 'Coffee', 'will', 'be', 'served', 'for', 'men', 'and', 'women', 'in', 'the', 'lobby', 'adjacent', 'to', 'exhibit', 'area.', 'Please', 'move', 'into', 'exhibit', 'area.', '(Exhibits', 'Open)', 'TRRF', 'GENERAL', 'SESSION', '(PART', '|)', 'Presiding:', 'Lee', 'A.', 'Waller', 'TRRF', 'Vice', 'President', '“Introductory', 'Remarks”', 'Lee', 'A.', 'Waller,', 'TRRF', 'Vice', 'Presi-', 'dent', 'Individual', 'Interviews', 'with', 'TRRF', 'Public', 'Board', 'Members', 'and', 'Sci-', 'entific', 'Advisory', 'Council', 'Mem-', 'bers', 'Conducted', 'by', 'TRRF', 'Treasurer', 'Philip', 'G.', 'Kuehn', 'to', 'get', 'answers', 'which', 'the', 'public', 'refrigerated', 'warehousing', 'industry', 'is', 'looking', 'for.', 'Plus', 'questions', 'from', 'the', 'floor.', 'Dr.', 'Emil', 'M.', 'Mrak,', 'University', 'of', 'Cal-', 'ifornia,', 'Chairman,', 'TRRF', 'Board;', 'Sam', 'R.', 'Cecil,', 'University', 'of', 'Georgia', 'College', 'of', 'Agriculture;', 'Dr.', 'Stanley', 'Charm,', 'Tufts', 'University', 'School', 'of', 'Medicine;', 'Dr.', 'Robert', 'H.', 'Cotton,', 'ITT', 'Continental', 'Baking', 'Company;', 'Dr.', 'Owen', 'Fennema,', 'University', 'of', 'Wis-', 'consin;', 'Dr.', 'Robert', 'E.', 'Hardenburg,', 'USDA.', 'Questions', 'and', 'Answers', 'Exhibits', 'Open', 'Capt.', 'Jack', 'Stoney', 'Room', 'TRRF', 'Scientific', 'Advisory', 'Council', 'Meeting', 'Ballroom', 'Foyer']] # noqa: E231 - expected_boxes = [[[141, 57, 214, 69], [228, 58, 252, 69], [141, 75, 216, 88], [230, 79, 280, 88], [142, 260, 218, 273], [230, 261, 255, 273], [143, 279, 218, 290], [231, 282, 290, 291], [143, 342, 218, 354], [231, 345, 289, 355], [202, 362, 227, 373], [143, 379, 220, 392], [231, 382, 291, 394], [144, 714, 220, 726], [231, 715, 256, 726], [144, 732, 220, 745], [232, 736, 291, 747], [144, 769, 218, 782], [231, 770, 256, 782], [141, 788, 202, 801], [215, 791, 274, 804], [143, 826, 204, 838], [215, 826, 240, 838], [142, 844, 202, 857], [215, 847, 274, 859], [334, 57, 427, 69], [440, 57, 522, 69], [369, 75, 461, 88], [469, 75, 516, 88], [528, 76, 562, 88], [570, 76, 667, 88], [675, 75, 711, 87], [721, 79, 778, 88], [789, 75, 840, 88], [369, 97, 470, 107], [484, 94, 507, 106], [518, 94, 562, 107], [576, 94, 655, 110], [668, 94, 792, 109], [804, 95, 829, 107], [369, 113, 465, 125], [477, 116, 547, 125], [562, 113, 658, 125], [671, 116, 748, 125], [761, 113, 811, 125], [369, 131, 465, 143], [477, 133, 548, 143], [563, 130, 698, 145], [710, 130, 802, 146], [336, 171, 412, 183], [423, 171, 572, 183], [582, 170, 716, 
184], [728, 171, 817, 187], [829, 171, 844, 186], [338, 197, 482, 212], [507, 196, 557, 209], [569, 196, 595, 208], [610, 196, 702, 209], [505, 214, 583, 226], [595, 214, 656, 227], [670, 215, 807, 227], [335, 259, 543, 274], [556, 259, 708, 272], [372, 279, 422, 291], [435, 279, 460, 291], [474, 279, 574, 292], [587, 278, 664, 291], [676, 278, 738, 291], [751, 279, 834, 291], [372, 298, 434, 310], [335, 341, 483, 354], [497, 341, 655, 354], [667, 341, 728, 354], [740, 341, 825, 354], [335, 360, 430, 372], [442, 360, 534, 372], [545, 359, 687, 372], [697, 360, 754, 372], [765, 360, 823, 373], [334, 378, 428, 391], [440, 378, 577, 394], [590, 378, 705, 391], [720, 378, 801, 391], [334, 397, 400, 409], [370, 416, 529, 429], [544, 416, 576, 432], [587, 416, 665, 428], [677, 416, 814, 429], [372, 435, 452, 450], [465, 434, 495, 447], [511, 434, 600, 447], [611, 436, 637, 447], [649, 436, 694, 451], [705, 438, 824, 447], [369, 453, 452, 466], [464, 454, 509, 466], [522, 453, 611, 469], [625, 453, 792, 469], [370, 472, 556, 488], [570, 472, 684, 487], [697, 472, 718, 485], [732, 472, 835, 488], [369, 490, 411, 503], [425, 490, 484, 503], [496, 490, 635, 506], [645, 490, 707, 503], [718, 491, 761, 503], [771, 490, 840, 503], [336, 510, 374, 521], [388, 510, 447, 522], [460, 510, 489, 521], [503, 510, 580, 522], [592, 509, 736, 525], [745, 509, 770, 522], [781, 509, 840, 522], [338, 528, 434, 541], [448, 528, 596, 541], [609, 527, 687, 540], [700, 528, 792, 541], [336, 546, 397, 559], [407, 546, 431, 559], [443, 546, 525, 560], [537, 546, 680, 562], [688, 546, 714, 559], [722, 546, 837, 562], [336, 565, 449, 581], [461, 565, 485, 577], [497, 565, 665, 581], [681, 565, 718, 577], [732, 565, 837, 580], [337, 584, 438, 597], [452, 583, 521, 596], [535, 584, 677, 599], [690, 583, 787, 596], [801, 583, 825, 596], [338, 602, 478, 615], [492, 602, 530, 614], [543, 602, 638, 615], [650, 602, 676, 614], [688, 602, 788, 615], [802, 602, 843, 614], [337, 621, 502, 633], [516, 621, 615, 637], [629, 621, 774, 636], [789, 621, 827, 633], [337, 639, 418, 652], [432, 640, 571, 653], [587, 639, 731, 655], [743, 639, 769, 652], [780, 639, 841, 652], [338, 658, 440, 673], [455, 658, 491, 670], [508, 658, 602, 671], [616, 658, 638, 670], [654, 658, 835, 674], [337, 677, 429, 689], [337, 714, 482, 726], [495, 714, 548, 726], [561, 714, 683, 726], [338, 770, 461, 782], [474, 769, 554, 785], [489, 788, 562, 803], [576, 788, 643, 801], [656, 787, 751, 804], [764, 788, 844, 801], [334, 825, 421, 838], [430, 824, 574, 838], [584, 824, 723, 841], [335, 844, 450, 857], [464, 843, 583, 860], [628, 862, 755, 875], [769, 861, 848, 878]]] # noqa: E231 + expected_boxes = [[[141, 57, 210, 69], [228, 58, 252, 69], [141, 75, 216, 88], [230, 79, 280, 88], [142, 260, 218, 273], [230, 261, 255, 273], [143, 279, 218, 290], [231, 282, 290, 291], [143, 342, 218, 354], [231, 345, 289, 355], [202, 362, 227, 373], [143, 379, 220, 392], [231, 382, 291, 394], [144, 714, 220, 726], [231, 715, 256, 726], [144, 732, 220, 745], [232, 736, 291, 747], [144, 769, 218, 782], [231, 770, 256, 782], [141, 788, 202, 801], [215, 791, 274, 804], [143, 826, 204, 838], [215, 826, 240, 838], [142, 844, 202, 857], [215, 847, 274, 859], [334, 57, 427, 69], [440, 57, 522, 69], [369, 75, 461, 88], [469, 75, 516, 88], [528, 76, 562, 88], [570, 76, 667, 88], [675, 75, 711, 87], [721, 79, 778, 88], [789, 75, 840, 88], [369, 97, 470, 107], [484, 94, 507, 106], [518, 94, 562, 107], [576, 94, 655, 110], [668, 94, 792, 109], [804, 95, 829, 107], [369, 113, 465, 125], 
[477, 116, 547, 125], [562, 113, 658, 125], [671, 116, 748, 125], [761, 113, 811, 125], [369, 131, 465, 143], [477, 133, 548, 143], [563, 130, 698, 145], [710, 130, 802, 146], [336, 171, 412, 183], [423, 171, 572, 183], [582, 170, 716, 184], [728, 171, 817, 187], [829, 171, 844, 186], [338, 197, 482, 212], [507, 196, 557, 209], [569, 196, 595, 208], [610, 196, 702, 209], [505, 214, 583, 226], [595, 214, 656, 227], [670, 215, 807, 227], [335, 259, 543, 274], [556, 259, 708, 272], [372, 279, 422, 291], [435, 279, 460, 291], [474, 279, 574, 292], [587, 278, 664, 291], [676, 278, 738, 291], [751, 279, 834, 291], [372, 298, 434, 310], [335, 341, 483, 354], [497, 341, 655, 354], [667, 341, 728, 354], [740, 341, 825, 354], [335, 360, 430, 372], [442, 360, 534, 372], [545, 359, 687, 372], [697, 360, 754, 372], [765, 360, 823, 373], [334, 378, 428, 391], [440, 378, 577, 394], [590, 378, 705, 391], [720, 378, 801, 391], [334, 397, 400, 409], [370, 416, 529, 429], [544, 416, 576, 432], [587, 416, 665, 428], [677, 416, 814, 429], [372, 435, 452, 450], [465, 434, 495, 447], [511, 434, 600, 447], [611, 436, 637, 447], [649, 436, 694, 451], [705, 438, 824, 447], [369, 453, 452, 466], [464, 454, 509, 466], [522, 453, 611, 469], [625, 453, 792, 469], [370, 472, 556, 488], [570, 472, 684, 487], [697, 472, 718, 485], [732, 472, 835, 488], [369, 490, 411, 503], [425, 490, 484, 503], [496, 490, 635, 506], [645, 490, 707, 503], [718, 491, 761, 503], [771, 490, 840, 503], [336, 510, 374, 521], [388, 510, 447, 522], [460, 510, 489, 521], [503, 510, 580, 522], [592, 509, 736, 525], [745, 509, 770, 522], [781, 509, 840, 522], [338, 528, 434, 541], [448, 528, 596, 541], [609, 527, 687, 540], [700, 528, 792, 541], [336, 546, 397, 559], [407, 546, 431, 559], [443, 546, 525, 560], [537, 546, 680, 562], [695, 546, 714, 559], [722, 546, 837, 562], [336, 565, 449, 581], [461, 565, 485, 577], [497, 565, 665, 581], [681, 565, 718, 577], [732, 565, 837, 580], [337, 584, 438, 597], [452, 583, 521, 596], [535, 584, 677, 599], [690, 583, 787, 596], [801, 583, 825, 596], [338, 602, 478, 615], [492, 602, 530, 614], [543, 602, 638, 615], [650, 602, 676, 614], [688, 602, 788, 615], [802, 602, 843, 614], [337, 621, 502, 633], [516, 621, 615, 637], [629, 621, 774, 636], [789, 621, 827, 633], [337, 639, 418, 652], [432, 640, 571, 653], [587, 639, 731, 655], [743, 639, 769, 652], [780, 639, 841, 652], [338, 658, 440, 673], [455, 658, 491, 670], [508, 658, 602, 671], [616, 658, 638, 670], [654, 658, 835, 674], [337, 677, 429, 689], [337, 714, 482, 726], [495, 714, 548, 726], [561, 714, 683, 726], [338, 770, 461, 782], [474, 769, 554, 785], [489, 788, 562, 803], [576, 788, 643, 801], [656, 787, 751, 804], [764, 788, 844, 801], [334, 825, 421, 838], [430, 824, 574, 838], [584, 824, 723, 841], [335, 844, 450, 857], [464, 843, 583, 860], [628, 862, 755, 875], [769, 861, 848, 878]]] # noqa: E231 # fmt: on self.assertListEqual(encoding.words, expected_words) diff --git a/tests/models/tapas/test_modeling_tapas.py b/tests/models/tapas/test_modeling_tapas.py index 6a482d03bed987..7918cad2b98ce9 100644 --- a/tests/models/tapas/test_modeling_tapas.py +++ b/tests/models/tapas/test_modeling_tapas.py @@ -520,8 +520,13 @@ def test_for_sequence_classification(self): self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) @require_tensorflow_probability + @unittest.skip("tfp is not defined even if installed. 
FIXME @Arthur in a followup PR!") def test_pt_tf_model_equivalence(self): - super().test_pt_tf_model_equivalence() + pass + + @unittest.skip("tfp is not defined even if installed. FIXME @Arthur in a followup PR!") + def test_tf_from_pt_safetensors(self): + pass def prepare_tapas_single_inputs_for_inference(): diff --git a/tests/models/tapas/test_modeling_tf_tapas.py b/tests/models/tapas/test_modeling_tf_tapas.py index 7687144eaf2f9d..0da8b2879e8c07 100644 --- a/tests/models/tapas/test_modeling_tf_tapas.py +++ b/tests/models/tapas/test_modeling_tf_tapas.py @@ -528,6 +528,10 @@ def test_keras_fit(self): def test_loss_computation(self): pass + @unittest.skip("tfp is not defined even if installed. FIXME @Arthur in a followup PR!") + def test_pt_tf_model_equivalence(self): + pass + def prepare_tapas_single_inputs_for_inference(): # Here we prepare a single table-question pair to test TAPAS inference on: From 09edd77f64481cc9ce03067f4cfe74cef9d5d776 Mon Sep 17 00:00:00 2001 From: jiaqianjing Date: Mon, 6 May 2024 16:32:19 +0800 Subject: [PATCH 07/49] Check if the current compiled version of pytorch supports MPS (#30664) --- src/transformers/utils/import_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py index 158896347a7a6b..44641b427c59da 100755 --- a/src/transformers/utils/import_utils.py +++ b/src/transformers/utils/import_utils.py @@ -373,7 +373,7 @@ def is_torch_mps_available(): import torch if hasattr(torch.backends, "mps"): - return torch.backends.mps.is_available() + return torch.backends.mps.is_available() and torch.backends.mps.is_built() return False From a45c5148993a67c9fb5ac157754e68dfeba6f076 Mon Sep 17 00:00:00 2001 From: Arthur <48595927+ArthurZucker@users.noreply.github.com> Date: Mon, 6 May 2024 11:26:04 +0200 Subject: [PATCH 08/49] Hotfix-change-ci (#30669) * dmmy change * fiux * revert change --- .circleci/create_circleci_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/create_circleci_config.py b/.circleci/create_circleci_config.py index 49c46c2b83ddb3..f03506af52eb23 100644 --- a/.circleci/create_circleci_config.py +++ b/.circleci/create_circleci_config.py @@ -152,7 +152,7 @@ def to_dict(self): tests = " ".join(expanded_tests) # Each executor to run ~10 tests - n_executors = max(len(tests) // 10, 1) + n_executors = max(len(expanded_tests) // 10, 1) # Avoid empty test list on some executor(s) or launching too many executors if n_executors > self.parallelism: n_executors = self.parallelism From 9c772ac88883dd35166b58cc8cf10cffa3ca7844 Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Mon, 6 May 2024 11:33:52 +0200 Subject: [PATCH 09/49] Quantization / HQQ: Fix HQQ tests on our runner (#30668) Update test_hqq.py --- tests/quantization/hqq/test_hqq.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/quantization/hqq/test_hqq.py b/tests/quantization/hqq/test_hqq.py index e4e01f86496388..45c64676a7e42a 100755 --- a/tests/quantization/hqq/test_hqq.py +++ b/tests/quantization/hqq/test_hqq.py @@ -35,7 +35,7 @@ class HQQLLMRunner: - def __init__(self, model_id, quant_config, compute_dtype, device, cache_dir): + def __init__(self, model_id, quant_config, compute_dtype, device, cache_dir=None): self.model = AutoModelForCausalLM.from_pretrained( model_id, torch_dtype=compute_dtype, @@ -118,7 +118,7 @@ def test_fp16_quantized_model(self): check_hqqlayer(self, 
hqq_runner.model.model.layers[0].self_attn.v_proj) check_forward(self, hqq_runner.model) - def test_bfp16_quantized_model_with_offloading(self): + def test_f16_quantized_model_with_offloading(self): """ Simple LLM model testing bfp16 with meta-data offloading """ @@ -137,7 +137,7 @@ def test_bfp16_quantized_model_with_offloading(self): ) hqq_runner = HQQLLMRunner( - model_id=MODEL_ID, quant_config=quant_config, compute_dtype=torch.bfloat16, device=torch_device + model_id=MODEL_ID, quant_config=quant_config, compute_dtype=torch.float16, device=torch_device ) check_hqqlayer(self, hqq_runner.model.model.layers[0].self_attn.v_proj) From aa64f086a2a229315e5a52050600a64872833580 Mon Sep 17 00:00:00 2001 From: Marc Sun <57196510+SunMarc@users.noreply.github.com> Date: Mon, 6 May 2024 14:01:26 +0200 Subject: [PATCH 10/49] Fix llava next tie_word_embeddings config (#30640) * fix llava next embedding * add docstring * Update src/transformers/models/llava_next/configuration_llava_next.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --------- Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- .../models/llava_next/configuration_llava_next.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/llava_next/configuration_llava_next.py b/src/transformers/models/llava_next/configuration_llava_next.py index d7b3ff7233f3a4..1097a716bad95d 100644 --- a/src/transformers/models/llava_next/configuration_llava_next.py +++ b/src/transformers/models/llava_next/configuration_llava_next.py @@ -55,6 +55,8 @@ class LlavaNextConfig(PretrainedConfig): image_grid_pinpoints (`List`, *optional*, defaults to `[[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]]`): A list of possible resolutions to use for processing high resolution images. Each item in the list should be a tuple or list of the form `(height, width)`. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether the model's input and output word embeddings should be tied. 
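A minimal sketch of enabling the new flag (illustrative only; the value is simply forwarded to `PretrainedConfig.__init__` by this change):

```python
from transformers import LlavaNextConfig

# tie_word_embeddings is passed through to the parent config by this change
config = LlavaNextConfig(tie_word_embeddings=True)
```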
Example: @@ -90,6 +92,7 @@ def __init__( vision_feature_select_strategy="default", vision_feature_layer=-2, image_grid_pinpoints=None, + tie_word_embeddings=False, **kwargs, ): self.ignore_index = ignore_index @@ -138,4 +141,4 @@ def __init__( self.text_config = text_config - super().__init__(**kwargs) + super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) From e076953079469983b8430fdb032f8a61dc2ae5f4 Mon Sep 17 00:00:00 2001 From: Clara Pohland <54847419+claralp@users.noreply.github.com> Date: Mon, 6 May 2024 14:22:52 +0200 Subject: [PATCH 11/49] Trainer._load_from_checkpoint - support loading multiple Peft adapters (#30505) * Trainer: load checkpoint model with multiple adapters * Trainer._load_from_checkpoint support multiple active adapters * PeftModel.set_adapter does not support multiple adapters yet * Trainer._load_from_checkpoint test multiple adapters --------- Co-authored-by: Clara Luise Pohland --- src/transformers/trainer.py | 24 ++++++++++++++- tests/trainer/test_trainer.py | 57 +++++++++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+), 1 deletion(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index d967c9314b1025..c18404dfcd6e13 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -2413,6 +2413,20 @@ def _load_from_checkpoint(self, resume_from_checkpoint, model=None): # this checks the FSDP state dict when `FULL_STATE_DICT` is used or os.path.isfile(os.path.join(resume_from_checkpoint, f"{FSDP_MODEL_NAME}.bin")) ) + # if multiple adapters exist, they get saved in sub directories + adapter_subdirs = ( + [ + folder_name + for folder_name in os.listdir(resume_from_checkpoint) + if os.path.isdir(os.path.join(resume_from_checkpoint, folder_name)) + and ( + os.path.isfile(os.path.join(resume_from_checkpoint, folder_name, ADAPTER_WEIGHTS_NAME)) + or os.path.isfile(os.path.join(resume_from_checkpoint, folder_name, ADAPTER_SAFE_WEIGHTS_NAME)) + ) + ] + if os.path.isdir(resume_from_checkpoint) + else [] + ) if is_fsdp_ckpt and not self.is_fsdp_enabled: raise ValueError(f"Checkpoint found at {resume_from_checkpoint} is only supported when using PyTorch FSDP") @@ -2430,6 +2444,7 @@ def _load_from_checkpoint(self, resume_from_checkpoint, model=None): ] ) or is_fsdp_ckpt + or adapter_subdirs ): raise ValueError(f"Can't find a valid checkpoint at {resume_from_checkpoint}") @@ -2503,7 +2518,14 @@ def _load_from_checkpoint(self, resume_from_checkpoint, model=None): # If train a model using PEFT & LoRA, assume that adapter have been saved properly. 
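# (Illustrative layout, adapter names as in the accompanying test: when several adapters were
#  trained, each one lives in its own sub-directory of the checkpoint, e.g.
#      checkpoint-5/adapter1/adapter_model.safetensors
#      checkpoint-5/adapter2/adapter_model.safetensors
#  which is what `adapter_subdirs` above detects before the per-adapter `load_adapter` calls below.)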
if hasattr(model, "active_adapter") and hasattr(model, "load_adapter"): if os.path.exists(resume_from_checkpoint): - model.load_adapter(resume_from_checkpoint, model.active_adapter, is_trainable=True) + if adapter_subdirs: + active_adapter = model.active_adapter + for subdir_name in adapter_subdirs: + peft_id = os.path.join(resume_from_checkpoint, subdir_name) + model.load_adapter(peft_id, subdir_name, is_trainable=(subdir_name == active_adapter)) + model.set_adapter(active_adapter) + else: + model.load_adapter(resume_from_checkpoint, model.active_adapter, is_trainable=True) else: logger.warning( "The intermediate checkpoints of PEFT may not be saved correctly, " diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 8913de4db1a1f1..89b26221f30497 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -964,6 +964,63 @@ def test_bnb_compile(self): with self.assertRaises(ValueError): _ = Trainer(tiny_model, args, train_dataset=train_dataset) # noqa + @require_peft + def test_multiple_peft_adapters(self): + from peft import LoraConfig, get_peft_model + + # Tests if resuming from checkpoint works if the model has multiple adapters + + MODEL_ID = "hf-internal-testing/tiny-random-LlamaForCausalLM" + tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + tiny_model = AutoModelForCausalLM.from_pretrained(MODEL_ID) + + peft_config = LoraConfig( + r=4, + lora_alpha=16, + lora_dropout=0.05, + bias="none", + task_type="CAUSAL_LM", + ) + tiny_model = get_peft_model(tiny_model, peft_config, "adapter1") + tiny_model.add_adapter("adapter2", peft_config) + + train_dataset = LineByLineTextDataset( + tokenizer=tokenizer, + file_path=PATH_SAMPLE_TEXT, + block_size=tokenizer.max_len_single_sentence, + ) + for example in train_dataset.examples: + example["labels"] = example["input_ids"] + + tokenizer.pad_token = tokenizer.eos_token + + with tempfile.TemporaryDirectory() as tmpdir: + args = TrainingArguments( + tmpdir, + per_device_train_batch_size=1, + learning_rate=1e-9, + save_steps=5, + logging_steps=5, + max_steps=10, + use_cpu=True, + ) + trainer = Trainer(tiny_model, args, tokenizer=tokenizer, train_dataset=train_dataset) + + trainer.train() + parameters = dict(tiny_model.named_parameters()) + state = dataclasses.asdict(trainer.state) + + # Reinitialize trainer + trainer = Trainer(tiny_model, args, tokenizer=tokenizer, train_dataset=train_dataset) + + checkpoint = os.path.join(tmpdir, "checkpoint-5") + + trainer.train(resume_from_checkpoint=checkpoint) + parameters1 = dict(tiny_model.named_parameters()) + state1 = dataclasses.asdict(trainer.state) + self.assertEqual(parameters, parameters1) + self.check_trainer_state_are_the_same(state, state1) + @require_bitsandbytes def test_rmsprop_bnb(self): config = GPT2Config(vocab_size=100, n_positions=128, n_embd=32, n_layer=3, n_head=4) From df475bf8e62e843bfd4f604b81696b2d950ec990 Mon Sep 17 00:00:00 2001 From: Nate Cibik <50897218+FoamoftheSea@users.noreply.github.com> Date: Mon, 6 May 2024 05:23:40 -0700 Subject: [PATCH 12/49] Trainer - add cache clearing and the option for batched eval metrics computation (#28769) * Added cache clearing for GPU efficiency. * Added cache clearing for GPU efficiency. * Added batch_eval_metrics capability * Ran make fixup * Fixed bug * Fixed whitespace issue * Fixed outdated condition * Updated docstrings with instructions for batch_eval_metrics. 
Updated end of dataloader logic * Added first version of batch_eval_metrics Trainer test * Fixed batch_eval_metrics Trainer tests for both eval and predict * Fixed batch_eval_metrics behavior for new Trainer variables * Fixed batch_eval_metrics Trainer tests * Ran fixup --- src/transformers/trainer.py | 89 ++++++++++++++++++++--- src/transformers/training_args.py | 11 +++ tests/trainer/test_trainer.py | 116 ++++++++++++++++++++++++++++++ 3 files changed, 205 insertions(+), 11 deletions(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index c18404dfcd6e13..d324a65235c3fd 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -327,7 +327,10 @@ class Trainer: inner layers, dropout probabilities etc). compute_metrics (`Callable[[EvalPrediction], Dict]`, *optional*): The function that will be used to compute metrics at evaluation. Must take a [`EvalPrediction`] and return - a dictionary string to metric values. + a dictionary string to metric values. *Note* When passing TrainingArgs with `batch_eval_metrics` set to + `True`, your compute_metrics function must take a boolean `compute_result` argument. This will be triggered + after the last eval batch to signal that the function needs to calculate and return the global summary + statistics rather than accumulating the batch-level statistics. callbacks (List of [`TrainerCallback`], *optional*): A list of callbacks to customize the training loop. Will add those to the list of default callbacks detailed in [here](callback). @@ -382,6 +385,13 @@ def __init__( output_dir = "tmp_trainer" logger.info(f"No `TrainingArguments` passed, using `output_dir={output_dir}`.") args = TrainingArguments(output_dir=output_dir) + if args.batch_eval_metrics and compute_metrics is not None: + if "compute_result" not in inspect.signature(compute_metrics).parameters.keys(): + raise ValueError( + "When using `batch_eval_metrics`, your `compute_metrics` function must take a `compute_result`" + " boolean argument which will be triggered after the last batch of the eval set to signal that the" + " summary statistics should be returned by the function." + ) self.args = args # Seed must be set before instantiating the model when using model enable_full_determinism(self.args.seed) if self.args.full_determinism else set_seed(self.args.seed) @@ -3205,6 +3215,9 @@ def training_step(self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, with self.compute_loss_context_manager(): loss = self.compute_loss(model, inputs) + del inputs + torch.cuda.empty_cache() + if self.args.n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu parallel training @@ -3703,6 +3716,8 @@ def evaluation_loop( all_labels = EvalLoopContainer(self.args.eval_do_concat_batches, padding_index=-100) all_inputs = EvalLoopContainer(self.args.eval_do_concat_batches, padding_index=-100) + metrics = None + # Will be useful when we have an iterable dataset so don't know its length. 
observed_num_examples = 0 @@ -3731,27 +3746,50 @@ def evaluation_loop( if inputs_decode is not None: inputs_decode = self.accelerator.pad_across_processes(inputs_decode, dim=1, pad_index=-100) inputs_decode = self.gather_function((inputs_decode)) - all_inputs.add(inputs_decode) + if not self.args.batch_eval_metrics or description == "Prediction": + all_inputs.add(inputs_decode) if logits is not None: logits = self.accelerator.pad_across_processes(logits, dim=1, pad_index=-100) if self.preprocess_logits_for_metrics is not None: logits = self.preprocess_logits_for_metrics(logits, labels) logits = self.gather_function((logits)) - all_preds.add(logits) + if not self.args.batch_eval_metrics or description == "Prediction": + all_preds.add(logits) if labels is not None: labels = self.accelerator.pad_across_processes(labels, dim=1, pad_index=-100) labels = self.gather_function((labels)) - all_labels.add(labels) + if not self.args.batch_eval_metrics or description == "Prediction": + all_labels.add(labels) self.control = self.callback_handler.on_prediction_step(args, self.state, self.control) + if self.args.batch_eval_metrics: + if self.compute_metrics is not None and logits is not None and labels is not None: + is_last_step = self.accelerator.gradient_state.end_of_dataloader + if args.include_inputs_for_metrics: + metrics = self.compute_metrics( + EvalPrediction(predictions=logits, label_ids=labels, inputs=inputs), + compute_result=is_last_step, + ) + else: + metrics = self.compute_metrics( + EvalPrediction(predictions=logits, label_ids=labels), + compute_result=is_last_step, + ) + + del losses, logits, labels, inputs + torch.cuda.empty_cache() + # Gather all tensors and put them back on the CPU if we have done enough accumulation steps. - if args.eval_accumulation_steps is not None and (step + 1) % args.eval_accumulation_steps == 0: + elif args.eval_accumulation_steps is not None and (step + 1) % args.eval_accumulation_steps == 0: all_losses.to_cpu_and_numpy() all_preds.to_cpu_and_numpy() all_labels.to_cpu_and_numpy() all_inputs.to_cpu_and_numpy() + del losses, logits, labels, inputs + torch.cuda.empty_cache() + # After all calls to `.gather_function`, reset to `gather_for_metrics`: self.gather_function = self.accelerator.gather_for_metrics if args.past_index and hasattr(self, "_past"): @@ -3780,14 +3818,19 @@ def evaluation_loop( num_samples = observed_num_examples # Metrics! - if self.compute_metrics is not None and all_preds is not None and all_labels is not None: + if ( + self.compute_metrics is not None + and all_preds is not None + and all_labels is not None + and not self.args.batch_eval_metrics + ): if args.include_inputs_for_metrics: metrics = self.compute_metrics( EvalPrediction(predictions=all_preds, label_ids=all_labels, inputs=all_inputs) ) else: metrics = self.compute_metrics(EvalPrediction(predictions=all_preds, label_ids=all_labels)) - else: + elif metrics is None: metrics = {} # To be JSON-serializable, we need to remove numpy types or zero-d tensors @@ -4243,6 +4286,7 @@ def prediction_loop( preds_host: Union[torch.Tensor, List[torch.Tensor]] = None labels_host: Union[torch.Tensor, List[torch.Tensor]] = None inputs_host: Union[torch.Tensor, List[torch.Tensor]] = None + metrics: Optional[dict] = None world_size = max(1, args.world_size) @@ -4284,8 +4328,24 @@ def prediction_loop( ) self.control = self.callback_handler.on_prediction_step(args, self.state, self.control) - # Gather all tensors and put them back on the CPU if we have done enough accumulation steps. 
- if args.eval_accumulation_steps is not None and (step + 1) % args.eval_accumulation_steps == 0: + if self.args.batch_eval_metrics: + if self.compute_metrics is not None and preds_host is not None and labels_host is not None: + is_last_step = self.accelerator.gradient_state.end_of_dataloader + if args.include_inputs_for_metrics: + metrics = self.compute_metrics( + EvalPrediction(predictions=preds_host, label_ids=labels_host, inputs=inputs_host), + compute_result=is_last_step, + ) + else: + metrics = self.compute_metrics( + EvalPrediction(predictions=preds_host, label_ids=labels_host), + compute_result=is_last_step, + ) + + if self.args.batch_eval_metrics or ( + args.eval_accumulation_steps is not None and (step + 1) % args.eval_accumulation_steps == 0 + ): + # Gather all tensors and put them back on the CPU if we have done enough accumulation steps. eval_losses_gatherer.add_arrays(self._gather_and_numpify(losses_host, "eval_losses")) if not prediction_loss_only: preds_gatherer.add_arrays(self._gather_and_numpify(preds_host, "eval_preds")) @@ -4293,6 +4353,8 @@ def prediction_loop( inputs_gatherer.add_arrays(self._gather_and_numpify(inputs_host, "eval_inputs_ids")) # Set back to None to begin a new accumulation + del losses_host, preds_host, labels_host, inputs_host + torch.cuda.empty_cache() losses_host, preds_host, labels_host, inputs_host = None, None, None, None if args.past_index and hasattr(self, "_past"): @@ -4311,14 +4373,19 @@ def prediction_loop( label_ids = labels_gatherer.finalize() if not prediction_loss_only else None inputs_ids = inputs_gatherer.finalize() if not prediction_loss_only else None - if self.compute_metrics is not None and preds is not None and label_ids is not None: + if ( + self.compute_metrics is not None + and preds is not None + and label_ids is not None + and not self.args.batch_eval_metrics + ): if args.include_inputs_for_metrics: metrics = self.compute_metrics( EvalPrediction(predictions=preds, label_ids=label_ids, inputs=inputs_ids) ) else: metrics = self.compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids)) - else: + elif metrics is None: metrics = {} # To be JSON-serializable, we need to remove numpy types or zero-d tensors diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index b92f9e18c6726c..6ea2a6674b4034 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -756,6 +756,12 @@ class TrainingArguments: See: https://github.com/jiaweizzhao/GaLore for more details. You need to make sure to pass a valid GaloRe optimizer, e.g. one of: "galore_adamw", "galore_adamw_8bit", "galore_adafactor" and make sure that the target modules are `nn.Linear` modules only. + + batch_eval_metrics (`Optional[bool]`, defaults to `False`): + If set to `True`, evaluation will call compute_metrics at the end of each batch to accumulate statistics + rather than saving all eval logits in memory. When set to `True`, you must pass a compute_metrics function + that takes a boolean argument `compute_result`, which when passed `True`, will trigger the final global + summary statistics from the batch-level summary statistics you've accumulated over the evaluation set. 
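Concretely, the expected callable keeps its own running statistics and only returns the metric dict once `compute_result=True` is passed on the last evaluation batch, as exercised by the `AlmostAccuracyBatched` helper added to the tests below. A minimal sketch (metric and names are illustrative):

```python
import torch


class BatchedAccuracy:
    """Accumulates accuracy batch by batch instead of keeping all eval logits in memory."""

    def __init__(self):
        self.correct, self.total = 0, 0

    def __call__(self, eval_pred, compute_result: bool):
        predictions, labels = eval_pred  # tensors for the current batch
        self.correct += (predictions.argmax(dim=-1) == labels).sum().item()
        self.total += labels.numel()
        if compute_result:  # last batch: return the global summary and reset
            result = {"accuracy": self.correct / max(self.total, 1)}
            self.correct, self.total = 0, 0
            return result


# e.g. Trainer(model, TrainingArguments(..., batch_eval_metrics=True),
#              compute_metrics=BatchedAccuracy(), eval_dataset=eval_dataset)
```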
""" framework = "pt" @@ -1434,6 +1440,11 @@ class TrainingArguments: }, ) + batch_eval_metrics: bool = field( + default=False, + metadata={"help": "Break eval metrics calculation into batches to save memory."}, + ) + def __post_init__(self): # Parse in args that could be `dict` sent in from the CLI as a string for field in _VALID_DICT_FIELDS: diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 89b26221f30497..da6dcb2a4b72fb 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -230,6 +230,27 @@ def __call__(self, eval_pred): return {"accuracy": true.astype(np.float32).mean().item()} +class AlmostAccuracyBatched: + def __init__(self, thresh=0.25): + self.thresh = thresh + self.batch_acc = [] + + def __call__(self, eval_pred, compute_result): + predictions, labels = eval_pred + if isinstance(predictions, tuple): + predictions = predictions[0] + if isinstance(labels, tuple): + labels = labels[0] + batch_size = len(predictions) + true = torch.abs(predictions - labels) <= self.thresh + acc = true.type(torch.FloatTensor).mean().item() + self.batch_acc.extend([acc] * batch_size) + if compute_result: + result = {"accuracy": np.mean(self.batch_acc).item()} + self.batch_acc = [] + return result + + class RegressionModelConfig(PretrainedConfig): def __init__(self, a=0, b=0, double_output=False, random_torch=True, **kwargs): super().__init__(**kwargs) @@ -1524,6 +1545,49 @@ def test_evaluate(self): expected_acc = AlmostAccuracy()((pred + 1, y))["accuracy"] self.assertAlmostEqual(results["eval_accuracy"], expected_acc) + def test_evaluate_with_batch_eval_metrics(self): + trainer = get_regression_trainer( + a=1.5, b=2.5, compute_metrics=AlmostAccuracyBatched(), batch_eval_metrics=True + ) + results = trainer.evaluate() + + x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0] + pred = 1.5 * x + 2.5 + expected_loss = ((pred - y) ** 2).mean() + self.assertAlmostEqual(results["eval_loss"], expected_loss) + expected_acc = AlmostAccuracy()((pred, y))["accuracy"] + self.assertAlmostEqual(results["eval_accuracy"], expected_acc) + + # With a number of elements not a round multiple of the batch size + trainer = get_regression_trainer( + a=1.5, b=2.5, eval_len=66, compute_metrics=AlmostAccuracyBatched(), batch_eval_metrics=True + ) + results = trainer.evaluate() + + x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0] + pred = 1.5 * x + 2.5 + expected_loss = ((pred - y) ** 2).mean() + self.assertAlmostEqual(results["eval_loss"], expected_loss) + expected_acc = AlmostAccuracy()((pred, y))["accuracy"] + self.assertAlmostEqual(results["eval_accuracy"], expected_acc) + + # With logits preprocess + trainer = get_regression_trainer( + a=1.5, + b=2.5, + compute_metrics=AlmostAccuracyBatched(), + batch_eval_metrics=True, + preprocess_logits_for_metrics=lambda logits, labels: logits + 1, + ) + results = trainer.evaluate() + + x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0] + pred = 1.5 * x + 2.5 + expected_loss = ((pred - y) ** 2).mean() + self.assertAlmostEqual(results["eval_loss"], expected_loss) + expected_acc = AlmostAccuracy()((pred + 1, y))["accuracy"] + self.assertAlmostEqual(results["eval_accuracy"], expected_acc) + def test_evaluate_with_jit(self): trainer = get_regression_trainer(a=1.5, b=2.5, compute_metrics=AlmostAccuracy(), jit_mode_eval=True) results = trainer.evaluate() @@ -1651,6 +1715,58 @@ def test_predict(self): self.assertTrue(np.array_equal(labels[0], trainer.eval_dataset.ys[0])) self.assertTrue(np.array_equal(labels[1], 
trainer.eval_dataset.ys[1])) + def test_predict_with_batch_eval_metrics(self): + trainer = get_regression_trainer( + a=1.5, b=2.5, compute_metrics=AlmostAccuracyBatched(), batch_eval_metrics=True + ) + results = trainer.predict(trainer.eval_dataset) + preds = results.predictions + x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0] + gt = 1.5 * x + 2.5 + self.assertTrue(np.allclose(preds, gt)) + expected_acc = AlmostAccuracy()((preds, y))["accuracy"] + self.assertAlmostEqual(results.metrics["test_accuracy"], expected_acc) + + # With a number of elements not a round multiple of the batch size + trainer = get_regression_trainer( + a=1.5, b=2.5, eval_len=66, compute_metrics=AlmostAccuracyBatched(), batch_eval_metrics=True + ) + results = trainer.predict(trainer.eval_dataset) + preds = results.predictions + x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0] + self.assertTrue(np.allclose(preds, 1.5 * x + 2.5)) + expected_acc = AlmostAccuracy()((preds, y))["accuracy"] + self.assertAlmostEqual(results.metrics["test_accuracy"], expected_acc) + + # With more than one output of the model + trainer = get_regression_trainer( + a=1.5, b=2.5, double_output=True, compute_metrics=AlmostAccuracyBatched(), batch_eval_metrics=True + ) + preds = trainer.predict(trainer.eval_dataset).predictions + x = trainer.eval_dataset.x + self.assertEqual(len(preds), 2) + self.assertTrue(np.allclose(preds[0], 1.5 * x + 2.5)) + self.assertTrue(np.allclose(preds[1], 1.5 * x + 2.5)) + + # With more than one output/label of the model + trainer = get_regression_trainer( + a=1.5, + b=2.5, + double_output=True, + label_names=["labels", "labels_2"], + compute_metrics=AlmostAccuracyBatched(), + batch_eval_metrics=True, + ) + outputs = trainer.predict(trainer.eval_dataset) + preds = outputs.predictions + labels = outputs.label_ids + x = trainer.eval_dataset.x + self.assertEqual(len(preds), 2) + self.assertTrue(np.allclose(preds[0], 1.5 * x + 2.5)) + self.assertTrue(np.allclose(preds[1], 1.5 * x + 2.5)) + self.assertTrue(np.array_equal(labels[0], trainer.eval_dataset.ys[0])) + self.assertTrue(np.array_equal(labels[1], trainer.eval_dataset.ys[1])) + def test_predict_with_jit(self): trainer = get_regression_trainer(a=1.5, b=2.5, jit_mode_eval=True) preds = trainer.predict(trainer.eval_dataset).predictions From 277db238b70c5c8bb5a445b75becbfd79644db42 Mon Sep 17 00:00:00 2001 From: Sungkyun Chang <1ronyar@gmail.com> Date: Mon, 6 May 2024 22:54:39 +0900 Subject: [PATCH 13/49] Fix typo: llama3.md (#30653) Update llama3.md fix typo --- docs/source/en/model_doc/llama3.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/model_doc/llama3.md b/docs/source/en/model_doc/llama3.md index 1a7546c7e68a4f..d8c682c80cd194 100644 --- a/docs/source/en/model_doc/llama3.md +++ b/docs/source/en/model_doc/llama3.md @@ -82,4 +82,4 @@ pipeline("Hey how are you doing today?") ``` ## Resources -A ton of cool resources are already available on the documentation page of [~llama2], inviting contributors to add new recourses curated for Llama3 here! 🤗 +A ton of cool resources are already available on the documentation page of [~llama2], inviting contributors to add new resourses curated for Llama3 here! 
🤗 From 835de4c8335f72a9c53178f54cc3b4c0688960ec Mon Sep 17 00:00:00 2001 From: Lucain Date: Mon, 6 May 2024 18:01:15 +0200 Subject: [PATCH 14/49] Respect `resume_download` deprecation (#30620) * Deprecate resume_download * remove default resume_download value --------- Co-authored-by: Lysandre Debut --- setup.py | 3 ++- src/transformers/dependency_versions_table.py | 3 ++- src/transformers/dynamic_module_utils.py | 14 ++++++++------ src/transformers/feature_extraction_utils.py | 8 ++++---- .../generation/configuration_utils.py | 8 ++++---- src/transformers/image_processing_utils.py | 8 ++++---- src/transformers/modeling_flax_utils.py | 8 ++++---- src/transformers/modeling_tf_utils.py | 8 ++++---- src/transformers/modeling_utils.py | 8 ++++---- src/transformers/models/auto/auto_factory.py | 18 +++++++++--------- .../models/auto/configuration_auto.py | 6 +++--- .../models/auto/feature_extraction_auto.py | 13 +++++++------ .../models/auto/image_processing_auto.py | 13 +++++++------ .../models/auto/processing_auto.py | 6 +++--- .../models/auto/tokenization_auto.py | 13 +++++++------ .../models/bark/processing_bark.py | 4 ++-- .../models/wav2vec2/modeling_wav2vec2.py | 8 ++++---- src/transformers/processing_utils.py | 2 +- src/transformers/tokenization_utils_base.py | 8 ++++---- src/transformers/utils/hub.py | 16 +++++++++------- src/transformers/utils/peft_utils.py | 7 ++++--- 21 files changed, 96 insertions(+), 86 deletions(-) diff --git a/setup.py b/setup.py index aae789a07d5111..23021e8affb0a8 100644 --- a/setup.py +++ b/setup.py @@ -116,7 +116,8 @@ "ftfy", "fugashi>=1.0", "GitPython<3.1.19", - "huggingface-hub>=0.19.3,<1.0", + "hf-doc-builder>=0.3.0", + "huggingface-hub>=0.23.0,<1.0", "importlib_metadata", "ipadic>=1.0.0,<2.0", "isort>=5.5.4", diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index 8db7b73637f462..e4c9fbf0d923a7 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -23,7 +23,8 @@ "ftfy": "ftfy", "fugashi": "fugashi>=1.0", "GitPython": "GitPython<3.1.19", - "huggingface-hub": "huggingface-hub>=0.19.3,<1.0", + "hf-doc-builder": "hf-doc-builder>=0.3.0", + "huggingface-hub": "huggingface-hub>=0.23.0,<1.0", "importlib_metadata": "importlib_metadata", "ipadic": "ipadic>=1.0.0,<2.0", "isort": "isort>=5.5.4", diff --git a/src/transformers/dynamic_module_utils.py b/src/transformers/dynamic_module_utils.py index f9abade2809166..d80ffdc58cc94a 100644 --- a/src/transformers/dynamic_module_utils.py +++ b/src/transformers/dynamic_module_utils.py @@ -214,7 +214,7 @@ def get_cached_module_file( module_file: str, cache_dir: Optional[Union[str, os.PathLike]] = None, force_download: bool = False, - resume_download: bool = False, + resume_download: Optional[bool] = None, proxies: Optional[Dict[str, str]] = None, token: Optional[Union[bool, str]] = None, revision: Optional[str] = None, @@ -244,8 +244,9 @@ def get_cached_module_file( force_download (`bool`, *optional*, defaults to `False`): Whether or not to force to (re-)download the configuration files and override the cached versions if they exist. - resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to delete incompletely received file. Attempts to resume the download if such a file exists. + resume_download: + Deprecated and ignored. All downloads are now resumed by default when possible. + Will be removed in v5 of Transformers. 
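Across the affected `from_pretrained` and download helpers the practical effect is that callers can simply drop the argument: downloads already resume by default when possible, and an explicitly passed `resume_download` is now ignored. A minimal sketch (checkpoint name illustrative):

```python
from transformers import AutoModel

# before: AutoModel.from_pretrained("bert-base-uncased", resume_download=True)
# the flag is now deprecated and ignored, so it can be omitted:
model = AutoModel.from_pretrained("bert-base-uncased")
```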
proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. @@ -389,7 +390,7 @@ def get_class_from_dynamic_module( pretrained_model_name_or_path: Union[str, os.PathLike], cache_dir: Optional[Union[str, os.PathLike]] = None, force_download: bool = False, - resume_download: bool = False, + resume_download: Optional[bool] = None, proxies: Optional[Dict[str, str]] = None, token: Optional[Union[bool, str]] = None, revision: Optional[str] = None, @@ -432,8 +433,9 @@ def get_class_from_dynamic_module( force_download (`bool`, *optional*, defaults to `False`): Whether or not to force to (re-)download the configuration files and override the cached versions if they exist. - resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to delete incompletely received file. Attempts to resume the download if such a file exists. + resume_download: + Deprecated and ignored. All downloads are now resumed by default when possible. + Will be removed in v5 of Transformers. proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. diff --git a/src/transformers/feature_extraction_utils.py b/src/transformers/feature_extraction_utils.py index 12fef5103d858a..76070ebeb81b7a 100644 --- a/src/transformers/feature_extraction_utils.py +++ b/src/transformers/feature_extraction_utils.py @@ -293,9 +293,9 @@ def from_pretrained( force_download (`bool`, *optional*, defaults to `False`): Whether or not to force to (re-)download the feature extractor files and override the cached versions if they exist. - resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to delete incompletely received file. Attempts to resume the download if such a file - exists. + resume_download: + Deprecated and ignored. All downloads are now resumed by default when possible. + Will be removed in v5 of Transformers. proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. @@ -451,7 +451,7 @@ def get_feature_extractor_dict( """ cache_dir = kwargs.pop("cache_dir", None) force_download = kwargs.pop("force_download", False) - resume_download = kwargs.pop("resume_download", False) + resume_download = kwargs.pop("resume_download", None) proxies = kwargs.pop("proxies", None) subfolder = kwargs.pop("subfolder", None) token = kwargs.pop("token", None) diff --git a/src/transformers/generation/configuration_utils.py b/src/transformers/generation/configuration_utils.py index 1c91361ee13c75..cf9c6e38758d2f 100644 --- a/src/transformers/generation/configuration_utils.py +++ b/src/transformers/generation/configuration_utils.py @@ -729,9 +729,9 @@ def from_pretrained( force_download (`bool`, *optional*, defaults to `False`): Whether or not to force to (re-)download the configuration files and override the cached versions if they exist. - resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to delete incompletely received file. Attempts to resume the download if such a file - exists. + resume_download: + Deprecated and ignored. All downloads are now resumed by default when possible. + Will be removed in v5 of Transformers. 
proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. @@ -795,7 +795,7 @@ def from_pretrained( ```""" config_file_name = config_file_name if config_file_name is not None else GENERATION_CONFIG_NAME - resume_download = kwargs.pop("resume_download", False) + resume_download = kwargs.pop("resume_download", None) proxies = kwargs.pop("proxies", None) use_auth_token = kwargs.pop("use_auth_token", None) subfolder = kwargs.pop("subfolder", "") diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py index 70f1a339de706a..e040cdd31aa04b 100644 --- a/src/transformers/image_processing_utils.py +++ b/src/transformers/image_processing_utils.py @@ -123,9 +123,9 @@ def from_pretrained( force_download (`bool`, *optional*, defaults to `False`): Whether or not to force to (re-)download the image processor files and override the cached versions if they exist. - resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to delete incompletely received file. Attempts to resume the download if such a file - exists. + resume_download: + Deprecated and ignored. All downloads are now resumed by default when possible. + Will be removed in v5 of Transformers. proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. @@ -287,7 +287,7 @@ def get_image_processor_dict( """ cache_dir = kwargs.pop("cache_dir", None) force_download = kwargs.pop("force_download", False) - resume_download = kwargs.pop("resume_download", False) + resume_download = kwargs.pop("resume_download", None) proxies = kwargs.pop("proxies", None) token = kwargs.pop("token", None) use_auth_token = kwargs.pop("use_auth_token", None) diff --git a/src/transformers/modeling_flax_utils.py b/src/transformers/modeling_flax_utils.py index da373603420ba2..f669329ac01bda 100644 --- a/src/transformers/modeling_flax_utils.py +++ b/src/transformers/modeling_flax_utils.py @@ -591,9 +591,9 @@ def from_pretrained( force_download (`bool`, *optional*, defaults to `False`): Whether or not to force the (re-)download of the model weights and configuration files, overriding the cached versions if they exist. - resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to delete incompletely received files. Will attempt to resume the download if such a - file exists. + resume_download: + Deprecated and ignored. All downloads are now resumed by default when possible. + Will be removed in v5 of Transformers. proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. 
@@ -645,7 +645,7 @@ def from_pretrained( >>> model = FlaxBertModel.from_pretrained("./pt_model/pytorch_model.bin", from_pt=True, config=config) ```""" from_pt = kwargs.pop("from_pt", False) - resume_download = kwargs.pop("resume_download", False) + resume_download = kwargs.pop("resume_download", None) proxies = kwargs.pop("proxies", None) use_auth_token = kwargs.pop("use_auth_token", None) trust_remote_code = kwargs.pop("trust_remote_code", None) diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py index d5e17d256869a1..f6b9b00117d0a3 100644 --- a/src/transformers/modeling_tf_utils.py +++ b/src/transformers/modeling_tf_utils.py @@ -2606,9 +2606,9 @@ def from_pretrained( force_download (`bool`, *optional*, defaults to `False`): Whether or not to force the (re-)download of the model weights and configuration files, overriding the cached versions if they exist. - resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to delete incompletely received files. Will attempt to resume the download if such a - file exists. + resume_download: + Deprecated and ignored. All downloads are now resumed by default when possible. + Will be removed in v5 of Transformers. proxies: (`Dict[str, str], `optional`): A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. @@ -2676,7 +2676,7 @@ def from_pretrained( >>> model = TFBertModel.from_pretrained("./pt_model/my_pytorch_model.bin", from_pt=True, config=config) ```""" from_pt = kwargs.pop("from_pt", False) - resume_download = kwargs.pop("resume_download", False) + resume_download = kwargs.pop("resume_download", None) proxies = kwargs.pop("proxies", None) output_loading_info = kwargs.pop("output_loading_info", False) use_auth_token = kwargs.pop("use_auth_token", None) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 59b6bf80752053..7d60380e0b7d47 100755 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -2803,9 +2803,9 @@ def from_pretrained( force_download (`bool`, *optional*, defaults to `False`): Whether or not to force the (re-)download of the model weights and configuration files, overriding the cached versions if they exist. - resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to delete incompletely received files. Will attempt to resume the download if such a - file exists. + resume_download: + Deprecated and ignored. All downloads are now resumed by default when possible. + Will be removed in v5 of Transformers. proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. 
@@ -2967,7 +2967,7 @@ def from_pretrained( state_dict = kwargs.pop("state_dict", None) from_tf = kwargs.pop("from_tf", False) from_flax = kwargs.pop("from_flax", False) - resume_download = kwargs.pop("resume_download", False) + resume_download = kwargs.pop("resume_download", None) proxies = kwargs.pop("proxies", None) output_loading_info = kwargs.pop("output_loading_info", False) use_auth_token = kwargs.pop("use_auth_token", None) diff --git a/src/transformers/models/auto/auto_factory.py b/src/transformers/models/auto/auto_factory.py index e53dcab379bb06..de813c43cfe0e8 100644 --- a/src/transformers/models/auto/auto_factory.py +++ b/src/transformers/models/auto/auto_factory.py @@ -122,9 +122,9 @@ force_download (`bool`, *optional*, defaults to `False`): Whether or not to force the (re-)download of the model weights and configuration files, overriding the cached versions if they exist. - resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to delete incompletely received files. Will attempt to resume the download if such a - file exists. + resume_download: + Deprecated and ignored. All downloads are now resumed by default when possible. + Will be removed in v5 of Transformers. proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. @@ -221,9 +221,9 @@ force_download (`bool`, *optional*, defaults to `False`): Whether or not to force the (re-)download of the model weights and configuration files, overriding the cached versions if they exist. - resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to delete incompletely received files. Will attempt to resume the download if such a - file exists. + resume_download: + Deprecated and ignored. All downloads are now resumed by default when possible. + Will be removed in v5 of Transformers. proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. @@ -320,9 +320,9 @@ force_download (`bool`, *optional*, defaults to `False`): Whether or not to force the (re-)download of the model weights and configuration files, overriding the cached versions if they exist. - resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to delete incompletely received files. Will attempt to resume the download if such a - file exists. + resume_download: + Deprecated and ignored. All downloads are now resumed by default when possible. + Will be removed in v5 of Transformers. proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index c8280a1270ac66..c655689d46c172 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -855,9 +855,9 @@ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): force_download (`bool`, *optional*, defaults to `False`): Whether or not to force the (re-)download the model weights and configuration files and override the cached versions if they exist. 
- resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to delete incompletely received files. Will attempt to resume the download if such a - file exists. + resume_download: + Deprecated and ignored. All downloads are now resumed by default when possible. + Will be removed in v5 of Transformers. proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index f8cb55091b02fd..1dbb4eb7dc5022 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -140,7 +140,7 @@ def get_feature_extractor_config( pretrained_model_name_or_path: Union[str, os.PathLike], cache_dir: Optional[Union[str, os.PathLike]] = None, force_download: bool = False, - resume_download: bool = False, + resume_download: Optional[bool] = None, proxies: Optional[Dict[str, str]] = None, token: Optional[Union[bool, str]] = None, revision: Optional[str] = None, @@ -165,8 +165,9 @@ def get_feature_extractor_config( force_download (`bool`, *optional*, defaults to `False`): Whether or not to force to (re-)download the configuration files and override the cached versions if they exist. - resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to delete incompletely received file. Attempts to resume the download if such a file exists. + resume_download: + Deprecated and ignored. All downloads are now resumed by default when possible. + Will be removed in v5 of Transformers. proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. @@ -278,9 +279,9 @@ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): force_download (`bool`, *optional*, defaults to `False`): Whether or not to force to (re-)download the feature extractor files and override the cached versions if they exist. - resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to delete incompletely received file. Attempts to resume the download if such a file - exists. + resume_download: + Deprecated and ignored. All downloads are now resumed by default when possible. + Will be removed in v5 of Transformers. proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. 
diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index c8538a9a55143a..9e8daefb397a58 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -159,7 +159,7 @@ def get_image_processor_config( pretrained_model_name_or_path: Union[str, os.PathLike], cache_dir: Optional[Union[str, os.PathLike]] = None, force_download: bool = False, - resume_download: bool = False, + resume_download: Optional[bool] = None, proxies: Optional[Dict[str, str]] = None, token: Optional[Union[bool, str]] = None, revision: Optional[str] = None, @@ -184,8 +184,9 @@ def get_image_processor_config( force_download (`bool`, *optional*, defaults to `False`): Whether or not to force to (re-)download the configuration files and override the cached versions if they exist. - resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to delete incompletely received file. Attempts to resume the download if such a file exists. + resume_download: + Deprecated and ignored. All downloads are now resumed by default when possible. + Will be removed in v5 of Transformers. proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. @@ -297,9 +298,9 @@ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): force_download (`bool`, *optional*, defaults to `False`): Whether or not to force to (re-)download the image processor files and override the cached versions if they exist. - resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to delete incompletely received file. Attempts to resume the download if such a file - exists. + resume_download: + Deprecated and ignored. All downloads are now resumed by default when possible. + Will be removed in v5 of Transformers. proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py index a7134f26a7d60c..1bbce2d85aa1bd 100644 --- a/src/transformers/models/auto/processing_auto.py +++ b/src/transformers/models/auto/processing_auto.py @@ -167,9 +167,9 @@ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): force_download (`bool`, *optional*, defaults to `False`): Whether or not to force to (re-)download the feature extractor files and override the cached versions if they exist. - resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to delete incompletely received file. Attempts to resume the download if such a file - exists. + resume_download: + Deprecated and ignored. All downloads are now resumed by default when possible. + Will be removed in v5 of Transformers. proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. 
diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index 1a4f983d9b8507..c883c2e836263e 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -556,7 +556,7 @@ def get_tokenizer_config( pretrained_model_name_or_path: Union[str, os.PathLike], cache_dir: Optional[Union[str, os.PathLike]] = None, force_download: bool = False, - resume_download: bool = False, + resume_download: Optional[bool] = None, proxies: Optional[Dict[str, str]] = None, token: Optional[Union[bool, str]] = None, revision: Optional[str] = None, @@ -582,8 +582,9 @@ def get_tokenizer_config( force_download (`bool`, *optional*, defaults to `False`): Whether or not to force to (re-)download the configuration files and override the cached versions if they exist. - resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to delete incompletely received file. Attempts to resume the download if such a file exists. + resume_download: + Deprecated and ignored. All downloads are now resumed by default when possible. + Will be removed in v5 of Transformers. proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. @@ -708,9 +709,9 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): force_download (`bool`, *optional*, defaults to `False`): Whether or not to force the (re-)download the model weights and configuration files and override the cached versions if they exist. - resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to delete incompletely received files. Will attempt to resume the download if such a - file exists. + resume_download: + Deprecated and ignored. All downloads are now resumed by default when possible. + Will be removed in v5 of Transformers. proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. 
diff --git a/src/transformers/models/bark/processing_bark.py b/src/transformers/models/bark/processing_bark.py index d58b89bf6f8f9b..bb7690f9a16bb3 100644 --- a/src/transformers/models/bark/processing_bark.py +++ b/src/transformers/models/bark/processing_bark.py @@ -92,7 +92,7 @@ def from_pretrained( cache_dir=kwargs.pop("cache_dir", None), force_download=kwargs.pop("force_download", False), proxies=kwargs.pop("proxies", None), - resume_download=kwargs.pop("resume_download", False), + resume_download=kwargs.pop("resume_download", None), local_files_only=kwargs.pop("local_files_only", False), token=kwargs.pop("use_auth_token", None), revision=kwargs.pop("revision", None), @@ -188,7 +188,7 @@ def _load_voice_preset(self, voice_preset: str = None, **kwargs): cache_dir=kwargs.pop("cache_dir", None), force_download=kwargs.pop("force_download", False), proxies=kwargs.pop("proxies", None), - resume_download=kwargs.pop("resume_download", False), + resume_download=kwargs.pop("resume_download", None), local_files_only=kwargs.pop("local_files_only", False), token=kwargs.pop("use_auth_token", None), revision=kwargs.pop("revision", None), diff --git a/src/transformers/models/wav2vec2/modeling_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_wav2vec2.py index e924765808bb25..8a4c3e0615b9c8 100755 --- a/src/transformers/models/wav2vec2/modeling_wav2vec2.py +++ b/src/transformers/models/wav2vec2/modeling_wav2vec2.py @@ -1560,9 +1560,9 @@ def load_adapter(self, target_lang: str, force_load=True, **kwargs): force_download (`bool`, *optional*, defaults to `False`): Whether or not to force the (re-)download of the model weights and configuration files, overriding the cached versions if they exist. - resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to delete incompletely received files. Will attempt to resume the download if such a - file exists. + resume_download: + Deprecated and ignored. All downloads are now resumed by default when possible. + Will be removed in v5 of Transformers. proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. 
@@ -1616,7 +1616,7 @@ def load_adapter(self, target_lang: str, force_load=True, **kwargs): cache_dir = kwargs.pop("cache_dir", None) force_download = kwargs.pop("force_download", False) - resume_download = kwargs.pop("resume_download", False) + resume_download = kwargs.pop("resume_download", None) proxies = kwargs.pop("proxies", None) local_files_only = kwargs.pop("local_files_only", False) token = kwargs.pop("token", None) diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 5b46d5ea4a4801..463591032ceb7c 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -273,7 +273,7 @@ def get_processor_dict( """ cache_dir = kwargs.pop("cache_dir", None) force_download = kwargs.pop("force_download", False) - resume_download = kwargs.pop("resume_download", False) + resume_download = kwargs.pop("resume_download", None) proxies = kwargs.pop("proxies", None) token = kwargs.pop("token", None) local_files_only = kwargs.pop("local_files_only", False) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 0a57345e395453..12e8e92e1a04d7 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -1904,9 +1904,9 @@ def from_pretrained( force_download (`bool`, *optional*, defaults to `False`): Whether or not to force the (re-)download the vocabulary files and override the cached versions if they exist. - resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to delete incompletely received files. Attempt to resume the download if such a file - exists. + resume_download: + Deprecated and ignored. All downloads are now resumed by default when possible. + Will be removed in v5 of Transformers. proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. @@ -1961,7 +1961,7 @@ def from_pretrained( # Otherwise use tokenizer.add_special_tokens({'unk_token': ''}) instead) assert tokenizer.unk_token == "" ```""" - resume_download = kwargs.pop("resume_download", False) + resume_download = kwargs.pop("resume_download", None) proxies = kwargs.pop("proxies", None) use_auth_token = kwargs.pop("use_auth_token", None) subfolder = kwargs.pop("subfolder", None) diff --git a/src/transformers/utils/hub.py b/src/transformers/utils/hub.py index 616bfa79ecb559..ef32ecde725fce 100644 --- a/src/transformers/utils/hub.py +++ b/src/transformers/utils/hub.py @@ -268,7 +268,7 @@ def cached_file( filename: str, cache_dir: Optional[Union[str, os.PathLike]] = None, force_download: bool = False, - resume_download: bool = False, + resume_download: Optional[bool] = None, proxies: Optional[Dict[str, str]] = None, token: Optional[Union[bool, str]] = None, revision: Optional[str] = None, @@ -299,8 +299,9 @@ def cached_file( force_download (`bool`, *optional*, defaults to `False`): Whether or not to force to (re-)download the configuration files and override the cached versions if they exist. - resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to delete incompletely received file. Attempts to resume the download if such a file exists. + resume_download: + Deprecated and ignored. All downloads are now resumed by default when possible. + Will be removed in v5 of Transformers. 
proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. @@ -474,7 +475,7 @@ def get_file_from_repo( filename: str, cache_dir: Optional[Union[str, os.PathLike]] = None, force_download: bool = False, - resume_download: bool = False, + resume_download: Optional[bool] = None, proxies: Optional[Dict[str, str]] = None, token: Optional[Union[bool, str]] = None, revision: Optional[str] = None, @@ -499,8 +500,9 @@ def get_file_from_repo( force_download (`bool`, *optional*, defaults to `False`): Whether or not to force to (re-)download the configuration files and override the cached versions if they exist. - resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to delete incompletely received file. Attempts to resume the download if such a file exists. + resume_download: + Deprecated and ignored. All downloads are now resumed by default when possible. + Will be removed in v5 of Transformers. proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. @@ -977,7 +979,7 @@ def get_checkpoint_shard_files( cache_dir=None, force_download=False, proxies=None, - resume_download=False, + resume_download=None, local_files_only=False, token=None, user_agent=None, diff --git a/src/transformers/utils/peft_utils.py b/src/transformers/utils/peft_utils.py index 2078f1ae960955..7efa80e92347e2 100644 --- a/src/transformers/utils/peft_utils.py +++ b/src/transformers/utils/peft_utils.py @@ -30,7 +30,7 @@ def find_adapter_config_file( model_id: str, cache_dir: Optional[Union[str, os.PathLike]] = None, force_download: bool = False, - resume_download: bool = False, + resume_download: Optional[bool] = None, proxies: Optional[Dict[str, str]] = None, token: Optional[Union[bool, str]] = None, revision: Optional[str] = None, @@ -51,8 +51,9 @@ def find_adapter_config_file( force_download (`bool`, *optional*, defaults to `False`): Whether or not to force to (re-)download the configuration files and override the cached versions if they exist. - resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to delete incompletely received file. Attempts to resume the download if such a file exists. + resume_download: + Deprecated and ignored. All downloads are now resumed by default when possible. + Will be removed in v5 of Transformers. proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. 
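For readers following along, the caller-facing effect of the `resume_download` deprecation above is simply that the flag no longer needs to be passed anywhere: with `huggingface_hub>=0.23.0` (the minimum this patch pins), interrupted downloads are resumed automatically, and the keyword is kept only so that existing call sites keep working until its removal in v5. Below is a minimal sketch of what this looks like from the caller's side; the checkpoint name is an arbitrary illustrative choice, and the exact deprecation warning text comes from `huggingface_hub`, not from this diff.

```python
from transformers import AutoModel

# New-style call: no flag needed, partially downloaded files are resumed automatically.
model = AutoModel.from_pretrained("hf-internal-testing/tiny-random-bert")

# Old-style call: still accepted for backward compatibility, but the value is ignored and
# recent huggingface_hub versions emit a deprecation warning; slated for removal in v5.
model = AutoModel.from_pretrained("hf-internal-testing/tiny-random-bert", resume_download=True)
```

Switching the internal default from `False` to `None` (rather than dropping the argument outright) is presumably what lets the underlying download code distinguish "caller never passed the flag" from "caller explicitly asked for it", so only the latter case surfaces a warning.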
From 4980d62af32564379181782d8ac22b5daaee80b5 Mon Sep 17 00:00:00 2001 From: Simon <80467011+sorgfresser@users.noreply.github.com> Date: Tue, 7 May 2024 10:19:24 +0200 Subject: [PATCH 15/49] top-k instead of top-p in MixtralConfig docstring (#30687) top-k instead of top-p in docstring --- src/transformers/models/mixtral/configuration_mixtral.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/mixtral/configuration_mixtral.py b/src/transformers/models/mixtral/configuration_mixtral.py index a452260fb8ac6f..66c779248423e1 100644 --- a/src/transformers/models/mixtral/configuration_mixtral.py +++ b/src/transformers/models/mixtral/configuration_mixtral.py @@ -83,7 +83,7 @@ class MixtralConfig(PretrainedConfig): attention_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio for the attention probabilities. num_experts_per_tok (`int`, *optional*, defaults to 2): - The number of experts to root per-token, can be also interpreted as the `top-p` routing + The number of experts to route per-token, can be also interpreted as the `top-k` routing parameter num_local_experts (`int`, *optional*, defaults to 8): Number of experts per Sparse MLP layer. From a898fb95bdb8ded526eda6b0affd5802367f967d Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 7 May 2024 09:28:56 +0100 Subject: [PATCH 16/49] Bump jinja2 from 3.1.3 to 3.1.4 in /examples/research_projects/decision_transformer (#30680) Bump jinja2 in /examples/research_projects/decision_transformer Bumps [jinja2](https://github.com/pallets/jinja) from 3.1.3 to 3.1.4. - [Release notes](https://github.com/pallets/jinja/releases) - [Changelog](https://github.com/pallets/jinja/blob/main/CHANGES.rst) - [Commits](https://github.com/pallets/jinja/compare/3.1.3...3.1.4) --- updated-dependencies: - dependency-name: jinja2 dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .../research_projects/decision_transformer/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/research_projects/decision_transformer/requirements.txt b/examples/research_projects/decision_transformer/requirements.txt index 306c966e25a2ce..7e2d0e386d5e87 100644 --- a/examples/research_projects/decision_transformer/requirements.txt +++ b/examples/research_projects/decision_transformer/requirements.txt @@ -92,7 +92,7 @@ itsdangerous==2.1.1 jax==0.3.4 jaxlib==0.3.2 jedi==0.18.1 -Jinja2==3.1.3 +Jinja2==3.1.4 jinja2-time==0.2.0 jmespath==0.10.0 joblib==1.2.0 From ce47582d812f8c14aa4a89f46508c3ada12de15b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 7 May 2024 09:39:35 +0100 Subject: [PATCH 17/49] Bump werkzeug from 3.0.1 to 3.0.3 in /examples/research_projects/decision_transformer (#30679) Bump werkzeug in /examples/research_projects/decision_transformer Bumps [werkzeug](https://github.com/pallets/werkzeug) from 3.0.1 to 3.0.3. - [Release notes](https://github.com/pallets/werkzeug/releases) - [Changelog](https://github.com/pallets/werkzeug/blob/main/CHANGES.rst) - [Commits](https://github.com/pallets/werkzeug/compare/3.0.1...3.0.3) --- updated-dependencies: - dependency-name: werkzeug dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .../research_projects/decision_transformer/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/research_projects/decision_transformer/requirements.txt b/examples/research_projects/decision_transformer/requirements.txt index 7e2d0e386d5e87..a31e3d0de8b963 100644 --- a/examples/research_projects/decision_transformer/requirements.txt +++ b/examples/research_projects/decision_transformer/requirements.txt @@ -233,7 +233,7 @@ urllib3==1.26.18 wasabi==0.9.0 wcwidth==0.2.5 websocket-client==1.3.1 -Werkzeug==3.0.1 +Werkzeug==3.0.3 wrapt==1.14.0 xxhash==3.0.0 yarl==1.7.2 From 54a2361a29abe9e0e6a299f5c9986fd85e5d1699 Mon Sep 17 00:00:00 2001 From: "JB (Don)" <1557853+hackyon@users.noreply.github.com> Date: Tue, 7 May 2024 17:12:21 +0800 Subject: [PATCH 18/49] Adding _tie_weights() to prediction heads to support low_cpu_mem_usage=True (#29024) * Adding _tie_weights() to prediction heads to support low_cpu_mem_usage=True * Testing for the non-safe-tensors case, since the default is safe-tensors already * Running fixup/fix-copies * Adding accelerate annotations to tests --- .../models/albert/modeling_albert.py | 9 +- src/transformers/models/bert/modeling_bert.py | 6 ++ .../modeling_bert_generation.py | 9 +- .../models/big_bird/modeling_big_bird.py | 6 ++ .../models/blip/modeling_blip_text.py | 4 + .../models/deberta/modeling_deberta.py | 4 + .../models/deberta_v2/modeling_deberta_v2.py | 4 + .../models/ernie/modeling_ernie.py | 6 ++ .../models/flava/modeling_flava.py | 3 + src/transformers/models/fnet/modeling_fnet.py | 12 ++- src/transformers/models/fsmt/modeling_fsmt.py | 3 + .../models/ibert/modeling_ibert.py | 11 ++- .../models/layoutlm/modeling_layoutlm.py | 4 + .../models/markuplm/modeling_markuplm.py | 3 + .../megatron_bert/modeling_megatron_bert.py | 6 ++ .../models/mobilebert/modeling_mobilebert.py | 13 ++- .../models/mpnet/modeling_mpnet.py | 4 + src/transformers/models/mra/modeling_mra.py | 4 + .../models/nezha/modeling_nezha.py | 5 ++ .../nystromformer/modeling_nystromformer.py | 4 + .../models/qdqbert/modeling_qdqbert.py | 5 ++ .../models/realm/modeling_realm.py | 4 + .../models/reformer/modeling_reformer.py | 12 ++- .../models/roc_bert/modeling_roc_bert.py | 6 ++ .../models/roformer/modeling_roformer.py | 5 ++ .../squeezebert/modeling_squeezebert.py | 4 + .../models/tapas/modeling_tapas.py | 4 + src/transformers/models/vilt/modeling_vilt.py | 4 + .../visual_bert/modeling_visual_bert.py | 4 + .../xlm_roberta_xl/modeling_xlm_roberta_xl.py | 12 ++- src/transformers/models/yoso/modeling_yoso.py | 4 + .../test_modeling_deformable_detr.py | 12 +++ tests/models/deta/test_modeling_deta.py | 12 +++ tests/models/encodec/test_modeling_encodec.py | 12 +++ tests/models/lxmert/test_modeling_lxmert.py | 12 +++ tests/models/marian/test_modeling_marian.py | 12 +++ .../models/musicgen/test_modeling_musicgen.py | 12 +++ .../test_modeling_musicgen_melody.py | 12 +++ tests/models/sew/test_modeling_sew.py | 12 +++ tests/models/sew_d/test_modeling_sew_d.py | 12 +++ .../test_modeling_timm_backbone.py | 12 +++ tests/test_modeling_common.py | 82 +++++++++++++++++++ 42 files changed, 366 insertions(+), 20 deletions(-) diff --git a/src/transformers/models/albert/modeling_albert.py b/src/transformers/models/albert/modeling_albert.py index 87f5a9e30c8f54..ff50f2f1293e17 100755 --- a/src/transformers/models/albert/modeling_albert.py +++ 
b/src/transformers/models/albert/modeling_albert.py @@ -877,8 +877,12 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: return prediction_scores def _tie_weights(self) -> None: - # To tie those two weights if they get disconnected (on TPU or when the bias is resized) - self.bias = self.decoder.bias + # For accelerate compatibility and to not break backward compatibility + if self.decoder.bias.device.type == "meta": + self.decoder.bias = self.bias + else: + # To tie those two weights if they get disconnected (on TPU or when the bias is resized) + self.bias = self.decoder.bias class AlbertSOPHead(nn.Module): @@ -915,6 +919,7 @@ def get_output_embeddings(self) -> nn.Linear: def set_output_embeddings(self, new_embeddings: nn.Linear) -> None: self.predictions.decoder = new_embeddings + self.predictions.bias = new_embeddings.bias def get_input_embeddings(self) -> nn.Embedding: return self.albert.embeddings.word_embeddings diff --git a/src/transformers/models/bert/modeling_bert.py b/src/transformers/models/bert/modeling_bert.py index 9e2847b11b53f0..129336cc5280c0 100755 --- a/src/transformers/models/bert/modeling_bert.py +++ b/src/transformers/models/bert/modeling_bert.py @@ -778,6 +778,9 @@ def __init__(self, config): # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` self.decoder.bias = self.bias + def _tie_weights(self): + self.decoder.bias = self.bias + def forward(self, hidden_states): hidden_states = self.transform(hidden_states) hidden_states = self.decoder(hidden_states) @@ -1186,6 +1189,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings + self.cls.predictions.bias = new_embeddings.bias @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=BertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) @@ -1295,6 +1299,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings + self.cls.predictions.bias = new_embeddings.bias @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( @@ -1448,6 +1453,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings + self.cls.predictions.bias = new_embeddings.bias @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( diff --git a/src/transformers/models/bert_generation/modeling_bert_generation.py b/src/transformers/models/bert_generation/modeling_bert_generation.py index 73c4d1d1e5da91..a5fb3d0531153e 100755 --- a/src/transformers/models/bert_generation/modeling_bert_generation.py +++ b/src/transformers/models/bert_generation/modeling_bert_generation.py @@ -851,8 +851,12 @@ def forward(self, hidden_states): return logits def _tie_weights(self): - # To tie those two weights if they get disconnected (on TPU or when the bias is resized) - self.bias = self.decoder.bias + # For accelerate compatibility and to not break backward compatibility + if self.decoder.bias.device.type == "meta": + self.decoder.bias = self.bias + else: + # To tie those two weights if they get disconnected (on TPU or when the bias is resized) + self.bias = self.decoder.bias @add_start_docstrings( @@ -879,6 +883,7 @@ def get_output_embeddings(self): def 
set_output_embeddings(self, new_embeddings): self.lm_head.decoder = new_embeddings + self.lm_head.bias = new_embeddings.bias @add_start_docstrings_to_model_forward(BERT_GENERATION_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC) diff --git a/src/transformers/models/big_bird/modeling_big_bird.py b/src/transformers/models/big_bird/modeling_big_bird.py index 510c98079501ef..6e5363f0bc6e57 100755 --- a/src/transformers/models/big_bird/modeling_big_bird.py +++ b/src/transformers/models/big_bird/modeling_big_bird.py @@ -1704,6 +1704,9 @@ def __init__(self, config): # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` self.decoder.bias = self.bias + def _tie_weights(self): + self.decoder.bias = self.bias + def forward(self, hidden_states): hidden_states = self.transform(hidden_states) hidden_states = self.decoder(hidden_states) @@ -2263,6 +2266,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings + self.cls.predictions.bias = new_embeddings.bias @add_start_docstrings_to_model_forward(BIG_BIRD_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=BigBirdForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) @@ -2375,6 +2379,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings + self.cls.predictions.bias = new_embeddings.bias @add_start_docstrings_to_model_forward(BIG_BIRD_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC) @@ -2516,6 +2521,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings + self.cls.predictions.bias = new_embeddings.bias @add_start_docstrings_to_model_forward(BIG_BIRD_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( diff --git a/src/transformers/models/blip/modeling_blip_text.py b/src/transformers/models/blip/modeling_blip_text.py index 3eb6ad45791030..a800ba89825dcb 100644 --- a/src/transformers/models/blip/modeling_blip_text.py +++ b/src/transformers/models/blip/modeling_blip_text.py @@ -523,6 +523,9 @@ def __init__(self, config): # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` self.decoder.bias = self.bias + def _tie_weights(self): + self.decoder.bias = self.bias + def forward(self, hidden_states): hidden_states = self.transform(hidden_states) hidden_states = self.decoder(hidden_states) @@ -818,6 +821,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings + self.cls.predictions.bias = new_embeddings.bias def forward( self, diff --git a/src/transformers/models/deberta/modeling_deberta.py b/src/transformers/models/deberta/modeling_deberta.py index 42dae5c80894a8..02047a5cffd448 100644 --- a/src/transformers/models/deberta/modeling_deberta.py +++ b/src/transformers/models/deberta/modeling_deberta.py @@ -1021,6 +1021,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings + self.cls.predictions.bias = new_embeddings.bias @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, 
sequence_length")) @add_code_sample_docstrings( @@ -1117,6 +1118,9 @@ def __init__(self, config): # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` self.decoder.bias = self.bias + def _tie_weights(self): + self.decoder.bias = self.bias + def forward(self, hidden_states): hidden_states = self.transform(hidden_states) hidden_states = self.decoder(hidden_states) diff --git a/src/transformers/models/deberta_v2/modeling_deberta_v2.py b/src/transformers/models/deberta_v2/modeling_deberta_v2.py index dfe18b0d4964af..f898c33af09492 100644 --- a/src/transformers/models/deberta_v2/modeling_deberta_v2.py +++ b/src/transformers/models/deberta_v2/modeling_deberta_v2.py @@ -1120,6 +1120,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings + self.cls.predictions.bias = new_embeddings.bias @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( @@ -1217,6 +1218,9 @@ def __init__(self, config): # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` self.decoder.bias = self.bias + def _tie_weights(self): + self.decoder.bias = self.bias + def forward(self, hidden_states): hidden_states = self.transform(hidden_states) hidden_states = self.decoder(hidden_states) diff --git a/src/transformers/models/ernie/modeling_ernie.py b/src/transformers/models/ernie/modeling_ernie.py index 3db6501985604d..95e27121bc2046 100644 --- a/src/transformers/models/ernie/modeling_ernie.py +++ b/src/transformers/models/ernie/modeling_ernie.py @@ -603,6 +603,9 @@ def __init__(self, config): # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` self.decoder.bias = self.bias + def _tie_weights(self): + self.decoder.bias = self.bias + def forward(self, hidden_states): hidden_states = self.transform(hidden_states) hidden_states = self.decoder(hidden_states) @@ -990,6 +993,7 @@ def get_output_embeddings(self): # Copied from transformers.models.bert.modeling_bert.BertForPreTraining.set_output_embeddings def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings + self.cls.predictions.bias = new_embeddings.bias @add_start_docstrings_to_model_forward(ERNIE_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=ErnieForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) @@ -1104,6 +1108,7 @@ def get_output_embeddings(self): # Copied from transformers.models.bert.modeling_bert.BertLMHeadModel.set_output_embeddings def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings + self.cls.predictions.bias = new_embeddings.bias @add_start_docstrings_to_model_forward(ERNIE_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( @@ -1264,6 +1269,7 @@ def get_output_embeddings(self): # Copied from transformers.models.bert.modeling_bert.BertForMaskedLM.set_output_embeddings def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings + self.cls.predictions.bias = new_embeddings.bias @add_start_docstrings_to_model_forward(ERNIE_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( diff --git a/src/transformers/models/flava/modeling_flava.py b/src/transformers/models/flava/modeling_flava.py index 19f19d4c9d5666..d967335d8e0068 100644 --- 
a/src/transformers/models/flava/modeling_flava.py +++ b/src/transformers/models/flava/modeling_flava.py @@ -1661,6 +1661,9 @@ def __init__(self, config, weight=None): # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` self.decoder.bias = self.bias + def _tie_weights(self): + self.decoder.bias = self.bias + def forward(self, x): x = self.transform(x) x = self.decoder(x) diff --git a/src/transformers/models/fnet/modeling_fnet.py b/src/transformers/models/fnet/modeling_fnet.py index 5724faee56cf85..a11b1c87a0254c 100755 --- a/src/transformers/models/fnet/modeling_fnet.py +++ b/src/transformers/models/fnet/modeling_fnet.py @@ -355,9 +355,13 @@ def forward(self, hidden_states): hidden_states = self.decoder(hidden_states) return hidden_states - def _tie_weights(self): - # To tie those two weights if they get disconnected (on TPU or when the bias is resized) - self.bias = self.decoder.bias + def _tie_weights(self) -> None: + # For accelerate compatibility and to not break backward compatibility + if self.decoder.bias.device.type == "meta": + self.decoder.bias = self.bias + else: + # To tie those two weights if they get disconnected (on TPU or when the bias is resized) + self.bias = self.decoder.bias class FNetOnlyMLMHead(nn.Module): @@ -624,6 +628,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings + self.cls.predictions.bias = new_embeddings.bias @add_start_docstrings_to_model_forward(FNET_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=FNetForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) @@ -718,6 +723,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings + self.cls.predictions.bias = new_embeddings.bias @add_start_docstrings_to_model_forward(FNET_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( diff --git a/src/transformers/models/fsmt/modeling_fsmt.py b/src/transformers/models/fsmt/modeling_fsmt.py index 4c180c52678b82..4a0e591d62f580 100644 --- a/src/transformers/models/fsmt/modeling_fsmt.py +++ b/src/transformers/models/fsmt/modeling_fsmt.py @@ -692,6 +692,9 @@ def __init__(self, config: FSMTConfig, embed_tokens: nn.Embedding): self.output_projection = nn.Linear(embed_tokens_weight_shape[1], embed_tokens_weight_shape[0], bias=False) self.output_projection.weight = self.embed_tokens.weight + def _tie_weights(self): + self.embed_tokens.weight = self.output_projection.weight + def forward( self, input_ids: torch.Tensor, diff --git a/src/transformers/models/ibert/modeling_ibert.py b/src/transformers/models/ibert/modeling_ibert.py index 54c37f507e3a63..f06557c2616078 100644 --- a/src/transformers/models/ibert/modeling_ibert.py +++ b/src/transformers/models/ibert/modeling_ibert.py @@ -868,6 +868,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.lm_head.decoder = new_embeddings + self.lm_head.bias = new_embeddings.bias @add_start_docstrings_to_model_forward(IBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( @@ -952,9 +953,13 @@ def forward(self, features, **kwargs): return x - def _tie_weights(self): - # To tie those two weights if they get disconnected (on TPU or when the bias is resized) - self.bias = self.decoder.bias + def _tie_weights(self) -> None: + # For accelerate compatibility and to not break backward 
compatibility + if self.decoder.bias.device.type == "meta": + self.decoder.bias = self.bias + else: + # To tie those two weights if they get disconnected (on TPU or when the bias is resized) + self.bias = self.decoder.bias @add_start_docstrings( diff --git a/src/transformers/models/layoutlm/modeling_layoutlm.py b/src/transformers/models/layoutlm/modeling_layoutlm.py index 6914f5ee3efb62..98765b3f75ff29 100644 --- a/src/transformers/models/layoutlm/modeling_layoutlm.py +++ b/src/transformers/models/layoutlm/modeling_layoutlm.py @@ -594,6 +594,9 @@ def __init__(self, config): # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` self.decoder.bias = self.bias + def _tie_weights(self): + self.decoder.bias = self.bias + def forward(self, hidden_states): hidden_states = self.transform(hidden_states) hidden_states = self.decoder(hidden_states) @@ -873,6 +876,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings + self.cls.predictions.bias = new_embeddings.bias @add_start_docstrings_to_model_forward(LAYOUTLM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC) diff --git a/src/transformers/models/markuplm/modeling_markuplm.py b/src/transformers/models/markuplm/modeling_markuplm.py index 318110daf5d8d1..707f612459ddc0 100755 --- a/src/transformers/models/markuplm/modeling_markuplm.py +++ b/src/transformers/models/markuplm/modeling_markuplm.py @@ -316,6 +316,9 @@ def __init__(self, config): # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` self.decoder.bias = self.bias + def _tie_weights(self): + self.decoder.bias = self.bias + def forward(self, hidden_states): hidden_states = self.transform(hidden_states) hidden_states = self.decoder(hidden_states) diff --git a/src/transformers/models/megatron_bert/modeling_megatron_bert.py b/src/transformers/models/megatron_bert/modeling_megatron_bert.py index 528bcca3d9bc00..a9d228bf3bb652 100755 --- a/src/transformers/models/megatron_bert/modeling_megatron_bert.py +++ b/src/transformers/models/megatron_bert/modeling_megatron_bert.py @@ -657,6 +657,9 @@ def __init__(self, config): # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` self.decoder.bias = self.bias + def _tie_weights(self): + self.decoder.bias = self.bias + def forward(self, hidden_states): hidden_states = self.transform(hidden_states) hidden_states = self.decoder(hidden_states) @@ -1021,6 +1024,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings + self.cls.predictions.bias = new_embeddings.bias @add_start_docstrings_to_model_forward(MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=MegatronBertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) @@ -1130,6 +1134,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings + self.cls.predictions.bias = new_embeddings.bias @add_start_docstrings_to_model_forward(MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC) @@ -1288,6 +1293,7 @@ def get_output_embeddings(self): def 
set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings + self.cls.predictions.bias = new_embeddings.bias @add_start_docstrings_to_model_forward(MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( diff --git a/src/transformers/models/mobilebert/modeling_mobilebert.py b/src/transformers/models/mobilebert/modeling_mobilebert.py index 8dc0aafa70fc25..92a18dfe599041 100644 --- a/src/transformers/models/mobilebert/modeling_mobilebert.py +++ b/src/transformers/models/mobilebert/modeling_mobilebert.py @@ -650,6 +650,9 @@ def __init__(self, config): # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` self.decoder.bias = self.bias + def _tie_weights(self) -> None: + self.decoder.bias = self.bias + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states = self.transform(hidden_states) hidden_states = hidden_states.matmul(torch.cat([self.decoder.weight.t(), self.dense.weight], dim=0)) @@ -938,8 +941,9 @@ def __init__(self, config): def get_output_embeddings(self): return self.cls.predictions.decoder - def set_output_embeddings(self, new_embeddigs): - self.cls.predictions.decoder = new_embeddigs + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + self.cls.predictions.bias = new_embeddings.bias def resize_token_embeddings(self, new_num_tokens: Optional[int] = None) -> nn.Embedding: # resize dense output embedings at first @@ -1047,8 +1051,9 @@ def __init__(self, config): def get_output_embeddings(self): return self.cls.predictions.decoder - def set_output_embeddings(self, new_embeddigs): - self.cls.predictions.decoder = new_embeddigs + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + self.cls.predictions.bias = new_embeddings.bias def resize_token_embeddings(self, new_num_tokens: Optional[int] = None) -> nn.Embedding: # resize dense output embedings at first diff --git a/src/transformers/models/mpnet/modeling_mpnet.py b/src/transformers/models/mpnet/modeling_mpnet.py index d9b9f90d398d90..12e6bdbffaaa7b 100644 --- a/src/transformers/models/mpnet/modeling_mpnet.py +++ b/src/transformers/models/mpnet/modeling_mpnet.py @@ -584,6 +584,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.lm_head.decoder = new_embeddings + self.lm_head.bias = new_embeddings.bias @add_start_docstrings_to_model_forward(MPNET_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( @@ -656,6 +657,9 @@ def __init__(self, config): # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` self.decoder.bias = self.bias + def _tie_weights(self): + self.decoder.bias = self.bias + def forward(self, features, **kwargs): x = self.dense(features) x = gelu(x) diff --git a/src/transformers/models/mra/modeling_mra.py b/src/transformers/models/mra/modeling_mra.py index 846578997c4a84..db918484d986cd 100644 --- a/src/transformers/models/mra/modeling_mra.py +++ b/src/transformers/models/mra/modeling_mra.py @@ -809,6 +809,9 @@ def __init__(self, config): # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` self.decoder.bias = self.bias + def _tie_weights(self): + self.decoder.bias = self.bias + def forward(self, hidden_states): hidden_states = self.transform(hidden_states) hidden_states = self.decoder(hidden_states) @@ 
-1042,6 +1045,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings + self.cls.predictions.bias = new_embeddings.bias @add_start_docstrings_to_model_forward(MRA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( diff --git a/src/transformers/models/nezha/modeling_nezha.py b/src/transformers/models/nezha/modeling_nezha.py index 6d983bd2378903..5ab2dc8958dff0 100644 --- a/src/transformers/models/nezha/modeling_nezha.py +++ b/src/transformers/models/nezha/modeling_nezha.py @@ -674,6 +674,9 @@ def __init__(self, config): # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` self.decoder.bias = self.bias + def _tie_weights(self): + self.decoder.bias = self.bias + def forward(self, hidden_states): hidden_states = self.transform(hidden_states) hidden_states = self.decoder(hidden_states) @@ -1039,6 +1042,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings + self.cls.predictions.bias = new_embeddings.bias @add_start_docstrings_to_model_forward(NEZHA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=NezhaForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) @@ -1147,6 +1151,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings + self.cls.predictions.bias = new_embeddings.bias @add_start_docstrings_to_model_forward(NEZHA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( diff --git a/src/transformers/models/nystromformer/modeling_nystromformer.py b/src/transformers/models/nystromformer/modeling_nystromformer.py index 1da61bc59e6a7a..df0dd0e405c0ef 100755 --- a/src/transformers/models/nystromformer/modeling_nystromformer.py +++ b/src/transformers/models/nystromformer/modeling_nystromformer.py @@ -426,6 +426,9 @@ def __init__(self, config): # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` self.decoder.bias = self.bias + def _tie_weights(self): + self.decoder.bias = self.bias + def forward(self, hidden_states): hidden_states = self.transform(hidden_states) hidden_states = self.decoder(hidden_states) @@ -664,6 +667,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings + self.cls.predictions.bias = new_embeddings.bias @add_start_docstrings_to_model_forward(NYSTROMFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( diff --git a/src/transformers/models/qdqbert/modeling_qdqbert.py b/src/transformers/models/qdqbert/modeling_qdqbert.py index c5e9af7025842b..7f1916dc80bf5c 100755 --- a/src/transformers/models/qdqbert/modeling_qdqbert.py +++ b/src/transformers/models/qdqbert/modeling_qdqbert.py @@ -681,6 +681,9 @@ def __init__(self, config): # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` self.decoder.bias = self.bias + def _tie_weights(self): + self.decoder.bias = self.bias + def forward(self, hidden_states): hidden_states = self.transform(hidden_states) hidden_states = self.decoder(hidden_states) @@ -1022,6 +1025,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings + self.cls.predictions.bias = 
new_embeddings.bias @add_start_docstrings_to_model_forward(QDQBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC) @@ -1188,6 +1192,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings + self.cls.predictions.bias = new_embeddings.bias @add_start_docstrings_to_model_forward(QDQBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index adec5647a28134..3753ba9dd28d01 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ b/src/transformers/models/realm/modeling_realm.py @@ -797,6 +797,9 @@ def __init__(self, config): # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` self.decoder.bias = self.bias + def _tie_weights(self): + self.decoder.bias = self.bias + def forward(self, hidden_states): hidden_states = self.transform(hidden_states) hidden_states = self.decoder(hidden_states) @@ -1391,6 +1394,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings + self.cls.predictions.bias = new_embeddings.bias @add_start_docstrings_to_model_forward( REALM_INPUTS_DOCSTRING.format("batch_size, num_candidates, sequence_length") diff --git a/src/transformers/models/reformer/modeling_reformer.py b/src/transformers/models/reformer/modeling_reformer.py index e6768e897eca0c..bfb8fb5ebe1cfa 100755 --- a/src/transformers/models/reformer/modeling_reformer.py +++ b/src/transformers/models/reformer/modeling_reformer.py @@ -1768,9 +1768,13 @@ def forward_chunk(self, hidden_states): hidden_states = self.decoder(hidden_states) return hidden_states - def _tie_weights(self): - # To tie those two weights if they get disconnected (on TPU or when the bias is resized) - self.bias = self.decoder.bias + def _tie_weights(self) -> None: + # For accelerate compatibility and to not break backward compatibility + if self.decoder.bias.device.type == "meta": + self.decoder.bias = self.bias + else: + # To tie those two weights if they get disconnected (on TPU or when the bias is resized) + self.bias = self.decoder.bias class ReformerPreTrainedModel(PreTrainedModel): @@ -2208,6 +2212,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.lm_head.decoder = new_embeddings + self.lm_head.bias = new_embeddings.bias @add_start_docstrings_to_model_forward(REFORMER_INPUTS_DOCSTRING) @add_code_sample_docstrings( @@ -2328,6 +2333,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.lm_head.decoder = new_embeddings + self.lm_head.bias = new_embeddings.bias @add_start_docstrings_to_model_forward(REFORMER_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC) diff --git a/src/transformers/models/roc_bert/modeling_roc_bert.py b/src/transformers/models/roc_bert/modeling_roc_bert.py index 739e60b550baf3..9d8284461f6679 100644 --- a/src/transformers/models/roc_bert/modeling_roc_bert.py +++ b/src/transformers/models/roc_bert/modeling_roc_bert.py @@ -749,6 +749,9 @@ def __init__(self, config): # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` self.decoder.bias = self.bias + def _tie_weights(self): + 
self.decoder.bias = self.bias + def forward(self, hidden_states): hidden_states = self.transform(hidden_states) hidden_states = self.decoder(hidden_states) @@ -1094,6 +1097,7 @@ def get_output_embeddings(self): # Copied from transformers.models.bert.modeling_bert.BertForPreTraining.set_output_embeddings def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings + self.cls.predictions.bias = new_embeddings.bias @add_start_docstrings_to_model_forward(ROC_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC) @@ -1286,6 +1290,7 @@ def get_output_embeddings(self): # Copied from transformers.models.bert.modeling_bert.BertForMaskedLM.set_output_embeddings def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings + self.cls.predictions.bias = new_embeddings.bias @add_start_docstrings_to_model_forward(ROC_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) def forward( @@ -1423,6 +1428,7 @@ def get_output_embeddings(self): # Copied from transformers.models.bert.modeling_bert.BertLMHeadModel.set_output_embeddings def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings + self.cls.predictions.bias = new_embeddings.bias @add_start_docstrings_to_model_forward(ROC_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC) diff --git a/src/transformers/models/roformer/modeling_roformer.py b/src/transformers/models/roformer/modeling_roformer.py index b2a63221a8dc90..f7589d4853d581 100644 --- a/src/transformers/models/roformer/modeling_roformer.py +++ b/src/transformers/models/roformer/modeling_roformer.py @@ -657,6 +657,9 @@ def __init__(self, config): # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` self.decoder.bias = self.bias + def _tie_weights(self) -> None: + self.decoder.bias = self.bias + def forward(self, hidden_states): hidden_states = self.transform(hidden_states) hidden_states = self.decoder(hidden_states) @@ -954,6 +957,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings + self.cls.predictions.bias = new_embeddings.bias @add_start_docstrings_to_model_forward(ROFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( @@ -1053,6 +1057,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings + self.cls.predictions.bias = new_embeddings.bias @add_start_docstrings_to_model_forward(ROFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC) diff --git a/src/transformers/models/squeezebert/modeling_squeezebert.py b/src/transformers/models/squeezebert/modeling_squeezebert.py index b5657f6e6f5003..d95a58daaf6164 100644 --- a/src/transformers/models/squeezebert/modeling_squeezebert.py +++ b/src/transformers/models/squeezebert/modeling_squeezebert.py @@ -400,6 +400,9 @@ def __init__(self, config): # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` self.decoder.bias = self.bias + def _tie_weights(self) -> None: + self.decoder.bias = self.bias + def forward(self, hidden_states): 
hidden_states = self.transform(hidden_states) hidden_states = self.decoder(hidden_states) @@ -658,6 +661,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings + self.cls.predictions.bias = new_embeddings.bias @add_start_docstrings_to_model_forward(SQUEEZEBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( diff --git a/src/transformers/models/tapas/modeling_tapas.py b/src/transformers/models/tapas/modeling_tapas.py index e2ce847926b38f..97636d8b28e18e 100644 --- a/src/transformers/models/tapas/modeling_tapas.py +++ b/src/transformers/models/tapas/modeling_tapas.py @@ -699,6 +699,9 @@ def __init__(self, config): # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` self.decoder.bias = self.bias + def _tie_weights(self): + self.decoder.bias = self.bias + def forward(self, hidden_states): hidden_states = self.transform(hidden_states) hidden_states = self.decoder(hidden_states) @@ -978,6 +981,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings + self.cls.predictions.bias = new_embeddings.bias @add_start_docstrings_to_model_forward(TAPAS_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC) diff --git a/src/transformers/models/vilt/modeling_vilt.py b/src/transformers/models/vilt/modeling_vilt.py index 5545b881bd670a..e5f775cfc6f079 100755 --- a/src/transformers/models/vilt/modeling_vilt.py +++ b/src/transformers/models/vilt/modeling_vilt.py @@ -894,6 +894,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.mlm_score.decoder = new_embeddings + self.mlm_score.bias = new_embeddings.bias @add_start_docstrings_to_model_forward(VILT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC) @@ -1040,6 +1041,9 @@ def __init__(self, config, weight=None): # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` self.decoder.bias = self.bias + def _tie_weights(self): + self.decoder.bias = self.bias + def forward(self, x): x = self.transform(x) x = self.decoder(x) diff --git a/src/transformers/models/visual_bert/modeling_visual_bert.py b/src/transformers/models/visual_bert/modeling_visual_bert.py index 07c8b7a4b5173c..33df2ac13cf5b9 100755 --- a/src/transformers/models/visual_bert/modeling_visual_bert.py +++ b/src/transformers/models/visual_bert/modeling_visual_bert.py @@ -489,6 +489,9 @@ def __init__(self, config): # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` self.decoder.bias = self.bias + def _tie_weights(self): + self.decoder.bias = self.bias + def forward(self, hidden_states): hidden_states = self.transform(hidden_states) hidden_states = self.decoder(hidden_states) @@ -869,6 +872,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings + self.cls.predictions.bias = new_embeddings.bias @add_start_docstrings_to_model_forward(VISUAL_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=VisualBertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) diff --git 
a/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py b/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py index d8994e335b1242..da76ca29ae27ee 100644 --- a/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +++ b/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py @@ -852,6 +852,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.lm_head.decoder = new_embeddings + self.lm_head.bias = new_embeddings.bias @add_start_docstrings_to_model_forward(XLM_ROBERTA_XL_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC) @@ -1011,6 +1012,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.lm_head.decoder = new_embeddings + self.lm_head.bias = new_embeddings.bias @add_start_docstrings_to_model_forward(XLM_ROBERTA_XL_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( @@ -1099,9 +1101,13 @@ def forward(self, features, **kwargs): return x - def _tie_weights(self): - # To tie those two weights if they get disconnected (on TPU or when the bias is resized) - self.bias = self.decoder.bias + def _tie_weights(self) -> None: + # For accelerate compatibility and to not break backward compatibility + if self.decoder.bias.device.type == "meta": + self.decoder.bias = self.bias + else: + # To tie those two weights if they get disconnected (on TPU or when the bias is resized) + self.bias = self.decoder.bias @add_start_docstrings( diff --git a/src/transformers/models/yoso/modeling_yoso.py b/src/transformers/models/yoso/modeling_yoso.py index b1fed0acc468df..3615ea80719be1 100644 --- a/src/transformers/models/yoso/modeling_yoso.py +++ b/src/transformers/models/yoso/modeling_yoso.py @@ -627,6 +627,9 @@ def __init__(self, config): # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` self.decoder.bias = self.bias + def _tie_weights(self): + self.decoder.bias = self.bias + def forward(self, hidden_states): hidden_states = self.transform(hidden_states) hidden_states = self.decoder(hidden_states) @@ -861,6 +864,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings + self.cls.predictions.bias = new_embeddings.bias @add_start_docstrings_to_model_forward(YOSO_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( diff --git a/tests/models/deformable_detr/test_modeling_deformable_detr.py b/tests/models/deformable_detr/test_modeling_deformable_detr.py index 23e95267d8788c..5831d338dc3b50 100644 --- a/tests/models/deformable_detr/test_modeling_deformable_detr.py +++ b/tests/models/deformable_detr/test_modeling_deformable_detr.py @@ -578,6 +578,18 @@ def test_initialization(self): msg=f"Parameter {name} of model {model_class} seems not properly initialized", ) + @unittest.skip("No support for low_cpu_mem_usage=True.") + def test_save_load_low_cpu_mem_usage(self): + pass + + @unittest.skip("No support for low_cpu_mem_usage=True.") + def test_save_load_low_cpu_mem_usage_checkpoints(self): + pass + + @unittest.skip("No support for low_cpu_mem_usage=True.") + def test_save_load_low_cpu_mem_usage_no_safetensors(self): + pass + def test_two_stage_training(self): model_class = DeformableDetrForObjectDetection config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() diff --git 
a/tests/models/deta/test_modeling_deta.py b/tests/models/deta/test_modeling_deta.py index 70c1009a508907..faab773efd79a3 100644 --- a/tests/models/deta/test_modeling_deta.py +++ b/tests/models/deta/test_modeling_deta.py @@ -528,6 +528,18 @@ def test_initialization(self): msg=f"Parameter {name} of model {model_class} seems not properly initialized", ) + @unittest.skip("No support for low_cpu_mem_usage=True.") + def test_save_load_low_cpu_mem_usage(self): + pass + + @unittest.skip("No support for low_cpu_mem_usage=True.") + def test_save_load_low_cpu_mem_usage_checkpoints(self): + pass + + @unittest.skip("No support for low_cpu_mem_usage=True.") + def test_save_load_low_cpu_mem_usage_no_safetensors(self): + pass + # Inspired by tests.test_modeling_common.ModelTesterMixin.test_tied_weights_keys def test_tied_weights_keys(self): for model_class in self.all_model_classes: diff --git a/tests/models/encodec/test_modeling_encodec.py b/tests/models/encodec/test_modeling_encodec.py index 0c021eaad21ab5..dd07f5a6e25246 100644 --- a/tests/models/encodec/test_modeling_encodec.py +++ b/tests/models/encodec/test_modeling_encodec.py @@ -325,6 +325,18 @@ def test_feed_forward_chunking(self): def test_hidden_states_output(self): pass + @unittest.skip("No support for low_cpu_mem_usage=True.") + def test_save_load_low_cpu_mem_usage(self): + pass + + @unittest.skip("No support for low_cpu_mem_usage=True.") + def test_save_load_low_cpu_mem_usage_checkpoints(self): + pass + + @unittest.skip("No support for low_cpu_mem_usage=True.") + def test_save_load_low_cpu_mem_usage_no_safetensors(self): + pass + def test_determinism(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/lxmert/test_modeling_lxmert.py b/tests/models/lxmert/test_modeling_lxmert.py index 723fef6061b3e4..a98643b33cd74a 100644 --- a/tests/models/lxmert/test_modeling_lxmert.py +++ b/tests/models/lxmert/test_modeling_lxmert.py @@ -766,6 +766,18 @@ def prepare_tf_inputs_from_pt_inputs(self, pt_inputs_dict): return tf_inputs_dict + @unittest.skip("No support for low_cpu_mem_usage=True.") + def test_save_load_low_cpu_mem_usage(self): + pass + + @unittest.skip("No support for low_cpu_mem_usage=True.") + def test_save_load_low_cpu_mem_usage_checkpoints(self): + pass + + @unittest.skip("No support for low_cpu_mem_usage=True.") + def test_save_load_low_cpu_mem_usage_no_safetensors(self): + pass + @require_torch class LxmertModelIntegrationTest(unittest.TestCase): diff --git a/tests/models/marian/test_modeling_marian.py b/tests/models/marian/test_modeling_marian.py index 593ef8e3405e38..3144dd48dab21f 100644 --- a/tests/models/marian/test_modeling_marian.py +++ b/tests/models/marian/test_modeling_marian.py @@ -372,6 +372,18 @@ def test_training_gradient_checkpointing_use_reentrant(self): def test_training_gradient_checkpointing_use_reentrant_false(self): pass + @unittest.skip("No support for low_cpu_mem_usage=True.") + def test_save_load_low_cpu_mem_usage(self): + pass + + @unittest.skip("No support for low_cpu_mem_usage=True.") + def test_save_load_low_cpu_mem_usage_checkpoints(self): + pass + + @unittest.skip("No support for low_cpu_mem_usage=True.") + def test_save_load_low_cpu_mem_usage_no_safetensors(self): + pass + def assert_tensors_close(a, b, atol=1e-12, prefix=""): """If tensors have different shapes, different values or a and b are not both tensors, raise a nice Assertion error.""" diff --git a/tests/models/musicgen/test_modeling_musicgen.py 
b/tests/models/musicgen/test_modeling_musicgen.py index b04f99c05ec1e7..8482072d73cfe7 100644 --- a/tests/models/musicgen/test_modeling_musicgen.py +++ b/tests/models/musicgen/test_modeling_musicgen.py @@ -1273,6 +1273,18 @@ def test_tied_model_weights_key_ignore(self): def test_tied_weights_keys(self): pass + @unittest.skip("No support for low_cpu_mem_usage=True.") + def test_save_load_low_cpu_mem_usage(self): + pass + + @unittest.skip("No support for low_cpu_mem_usage=True.") + def test_save_load_low_cpu_mem_usage_checkpoints(self): + pass + + @unittest.skip("No support for low_cpu_mem_usage=True.") + def test_save_load_low_cpu_mem_usage_no_safetensors(self): + pass + # override since changing `output_hidden_states` / `output_attentions` from the top-level model config won't work def test_retain_grad_hidden_states_attentions(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/musicgen_melody/test_modeling_musicgen_melody.py b/tests/models/musicgen_melody/test_modeling_musicgen_melody.py index 628cc76a093ea7..b32b5082584668 100644 --- a/tests/models/musicgen_melody/test_modeling_musicgen_melody.py +++ b/tests/models/musicgen_melody/test_modeling_musicgen_melody.py @@ -1258,6 +1258,18 @@ def test_tied_model_weights_key_ignore(self): def test_tied_weights_keys(self): pass + @unittest.skip("No support for low_cpu_mem_usage=True.") + def test_save_load_low_cpu_mem_usage(self): + pass + + @unittest.skip("No support for low_cpu_mem_usage=True.") + def test_save_load_low_cpu_mem_usage_checkpoints(self): + pass + + @unittest.skip("No support for low_cpu_mem_usage=True.") + def test_save_load_low_cpu_mem_usage_no_safetensors(self): + pass + # override since changing `output_hidden_states` / `output_attentions` from the top-level model config won't work # Ignore copy def test_retain_grad_hidden_states_attentions(self): diff --git a/tests/models/sew/test_modeling_sew.py b/tests/models/sew/test_modeling_sew.py index 528d5f84185e6e..5342df9e088039 100644 --- a/tests/models/sew/test_modeling_sew.py +++ b/tests/models/sew/test_modeling_sew.py @@ -356,6 +356,18 @@ def test_resize_tokens_embeddings(self): def test_model_common_attributes(self): pass + @unittest.skip("No support for low_cpu_mem_usage=True.") + def test_save_load_low_cpu_mem_usage(self): + pass + + @unittest.skip("No support for low_cpu_mem_usage=True.") + def test_save_load_low_cpu_mem_usage_checkpoints(self): + pass + + @unittest.skip("No support for low_cpu_mem_usage=True.") + def test_save_load_low_cpu_mem_usage_no_safetensors(self): + pass + def test_retain_grad_hidden_states_attentions(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config.output_hidden_states = True diff --git a/tests/models/sew_d/test_modeling_sew_d.py b/tests/models/sew_d/test_modeling_sew_d.py index 6fda7963a8009c..1980bd3ab12166 100644 --- a/tests/models/sew_d/test_modeling_sew_d.py +++ b/tests/models/sew_d/test_modeling_sew_d.py @@ -460,6 +460,18 @@ def _mock_init_weights(self, module): def test_feed_forward_chunking(self): pass + @unittest.skip("No support for low_cpu_mem_usage=True.") + def test_save_load_low_cpu_mem_usage(self): + pass + + @unittest.skip("No support for low_cpu_mem_usage=True.") + def test_save_load_low_cpu_mem_usage_checkpoints(self): + pass + + @unittest.skip("No support for low_cpu_mem_usage=True.") + def test_save_load_low_cpu_mem_usage_no_safetensors(self): + pass + @slow def test_model_from_pretrained(self): model = 
SEWDModel.from_pretrained("asapp/sew-d-tiny-100k") diff --git a/tests/models/timm_backbone/test_modeling_timm_backbone.py b/tests/models/timm_backbone/test_modeling_timm_backbone.py index 60ab9e2a217eb5..1cd04cd4843933 100644 --- a/tests/models/timm_backbone/test_modeling_timm_backbone.py +++ b/tests/models/timm_backbone/test_modeling_timm_backbone.py @@ -169,6 +169,18 @@ def test_from_pretrained_no_checkpoint(self): def test_save_load(self): pass + @unittest.skip("No support for low_cpu_mem_usage=True.") + def test_save_load_low_cpu_mem_usage(self): + pass + + @unittest.skip("No support for low_cpu_mem_usage=True.") + def test_save_load_low_cpu_mem_usage_checkpoints(self): + pass + + @unittest.skip("No support for low_cpu_mem_usage=True.") + def test_save_load_low_cpu_mem_usage_no_safetensors(self): + pass + @unittest.skip("model weights aren't tied in TimmBackbone.") def test_tie_model_weights(self): pass diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 66cac0d6c585a1..df585f4afc65e1 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -437,6 +437,88 @@ class CopyClass(model_class): max_diff = (model_slow_init.state_dict()[key] - model_fast_init.state_dict()[key]).sum().item() self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical") + @slow + @require_accelerate + @mark.accelerate_tests + def test_save_load_low_cpu_mem_usage(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + with tempfile.TemporaryDirectory() as saved_model_path: + for model_class in self.all_model_classes: + model_to_save = model_class(config) + model_to_save.save_pretrained(saved_model_path) + + self._check_save_load_low_cpu_mem_usage(model_class, saved_model_path) + + @slow + @require_accelerate + @mark.accelerate_tests + def test_save_load_low_cpu_mem_usage_checkpoints(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + with tempfile.TemporaryDirectory() as saved_model_path: + for model_class in self.all_model_classes: + model_to_save = model_class(config) + model_to_save.config.save_pretrained(saved_model_path) + torch.save(model_to_save.state_dict(), os.path.join(saved_model_path, "pytorch_model.bin")) + + self._check_save_load_low_cpu_mem_usage(model_class, saved_model_path) + + @slow + @require_accelerate + @mark.accelerate_tests + def test_save_load_low_cpu_mem_usage_no_safetensors(self): + with tempfile.TemporaryDirectory() as saved_model_path: + for model_class in self.all_model_classes: + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + model_to_save = model_class(config) + + model_to_save.save_pretrained(saved_model_path, safe_serialization=False) + self._check_save_load_low_cpu_mem_usage(model_class, saved_model_path) + + def _check_save_load_low_cpu_mem_usage(self, model_class, saved_model_path): + # Load the low usage and the normal models. + model_low_usage, loading_info = model_class.from_pretrained( + saved_model_path, + low_cpu_mem_usage=True, + output_loading_info=True, + ) + model_non_low_usage = model_class.from_pretrained(saved_model_path) + + # Check that there were no missing keys. + self.assertEqual(loading_info["missing_keys"], []) + + # The low_cpu_mem_usage=True causes the model params to be initialized with device=meta, and then + # subsequently loaded with the correct values and onto the correct device. We check if there are any + # remaining params that were not properly loaded. 
+ for name, param in model_low_usage.named_parameters(): + self.assertNotEqual( + param.device, + torch.device("meta"), + "Parameter '" + name + "' has not been properly loaded and has device=meta.", + ) + + # Tests moving the model to a device other than meta. + model_low_usage.to(torch_device) + + # Check that the parameters are equal. + for p1, p2 in zip(model_low_usage.parameters(), model_non_low_usage.parameters()): + self.assertEquals(p1.data.ne(p2.data).sum(), 0) + + # Check that the state dict keys are equal. + self.assertEqual(set(model_low_usage.state_dict().keys()), set(model_non_low_usage.state_dict().keys())) + + # Check that the shared tensors are equal. + tensor_ptrs1 = collections.defaultdict(list) + for name, tensor in model_low_usage.state_dict().items(): + tensor_ptrs1[id_tensor_storage(tensor)].append(name) + tied_params1 = [names for _, names in tensor_ptrs1.items() if len(names) > 1] + + tensor_ptrs2 = collections.defaultdict(list) + for name, tensor in model_non_low_usage.state_dict().items(): + tensor_ptrs2[id_tensor_storage(tensor)].append(name) + tied_params2 = [names for _, names in tensor_ptrs2.items() if len(names) > 1] + + self.assertEqual(tied_params1, tied_params2) + def test_fast_init_context_manager(self): # 1. Create a dummy class. Should have buffers as well? To make sure we test __init__ class MyClass(PreTrainedModel): From 4fda78c3f845232a07318e5f8f3f12bc0c3f6cce Mon Sep 17 00:00:00 2001 From: Zhakshylyk Nurlanov Date: Tue, 7 May 2024 11:13:11 +0200 Subject: [PATCH 19/49] Fix `cache_position` initialisation for generation with `use_cache=False` (#30485) * Fix cache_position init for generation * Update src/transformers/generation/utils.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * Fix cache position update --------- Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> --- src/transformers/generation/utils.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index 7af79099a0bd29..7f4caf26aeac7d 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -667,7 +667,11 @@ def _update_model_kwargs_for_generation( dim=-1, ) - if "cache_position" in model_kwargs and model_kwargs["cache_position"] is not None: + if ( + model_kwargs.get("use_cache", True) + and "cache_position" in model_kwargs + and model_kwargs["cache_position"] is not None + ): model_kwargs["cache_position"] = model_kwargs["cache_position"][-1:] + num_new_tokens return model_kwargs @@ -1293,6 +1297,10 @@ def _prepare_generation_config( def _get_initial_cache_position(self, input_ids, model_kwargs): """Calculates `cache_position` for the pre-fill stage based on `input_ids` and optionally past length""" + if not model_kwargs.get("use_cache", True): + model_kwargs["cache_position"] = None + return model_kwargs + past_length = 0 if "past_key_values" in model_kwargs: if isinstance(model_kwargs["past_key_values"], Cache): From 9c8979e35fc4d0f991214368b58054573b8747ce Mon Sep 17 00:00:00 2001 From: Kamil Akesbi <45195979+kamilakesbi@users.noreply.github.com> Date: Tue, 7 May 2024 11:17:27 +0200 Subject: [PATCH 20/49] Word-level timestamps broken for short-form audio (#30325) * force chunk_length_s in AutomaticSpeechRecognitionPipeline * compute num_frames even when stride is None * add slow tests * fix test * Update src/transformers/pipelines/automatic_speech_recognition.py Co-authored-by: amyeroberts 
<22614925+amyeroberts@users.noreply.github.com> * Update tests/pipelines/test_pipelines_automatic_speech_recognition.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * add input validation * fixup * small fix --------- Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --- .../pipelines/automatic_speech_recognition.py | 12 ++ ..._pipelines_automatic_speech_recognition.py | 131 ++++++++++++++++++ 2 files changed, 143 insertions(+) diff --git a/src/transformers/pipelines/automatic_speech_recognition.py b/src/transformers/pipelines/automatic_speech_recognition.py index f2d0f136790922..de1a9b57ac6e3e 100644 --- a/src/transformers/pipelines/automatic_speech_recognition.py +++ b/src/transformers/pipelines/automatic_speech_recognition.py @@ -446,6 +446,8 @@ def preprocess(self, inputs, chunk_length_s=0, stride_length_s=None): processed = self.feature_extractor( inputs, sampling_rate=self.feature_extractor.sampling_rate, return_tensors="pt" ) + if stride is None: + extra["segment_size"] = len(inputs) if self.torch_dtype is not None: processed = processed.to(dtype=self.torch_dtype) @@ -459,8 +461,12 @@ def preprocess(self, inputs, chunk_length_s=0, stride_length_s=None): def _forward(self, model_inputs, return_timestamps=False, **generate_kwargs): attention_mask = model_inputs.pop("attention_mask", None) stride = model_inputs.pop("stride", None) + segment_size = model_inputs.pop("segment_size", None) is_last = model_inputs.pop("is_last") + if stride is not None and segment_size is not None: + raise ValueError("segment_size must be used only when stride is None") + if self.type in {"seq2seq", "seq2seq_whisper"}: encoder = self.model.get_encoder() # Consume values so we can let extra information flow freely through @@ -488,6 +494,12 @@ def _forward(self, model_inputs, return_timestamps=False, **generate_kwargs): else: generate_kwargs["num_frames"] = [s[0] // self.feature_extractor.hop_length for s in stride] + else: + if isinstance(segment_size, int): + generate_kwargs["num_frames"] = segment_size // self.feature_extractor.hop_length + else: + generate_kwargs["num_frames"] = segment_size[0] // self.feature_extractor.hop_length + if self.type == "seq2seq_whisper" and inputs.shape[-1] > self.feature_extractor.nb_max_frames: generate_kwargs["input_features"] = inputs else: diff --git a/tests/pipelines/test_pipelines_automatic_speech_recognition.py b/tests/pipelines/test_pipelines_automatic_speech_recognition.py index ddf901180893a4..a1ab2947830ba9 100644 --- a/tests/pipelines/test_pipelines_automatic_speech_recognition.py +++ b/tests/pipelines/test_pipelines_automatic_speech_recognition.py @@ -755,6 +755,94 @@ def test_whisper_timestamp_prediction(self): }, ) + @slow + @require_torch + def test_whisper_large_timestamp_prediction(self): + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") + array = np.concatenate( + [ds[40]["audio"]["array"], ds[41]["audio"]["array"], ds[42]["audio"]["array"], ds[43]["audio"]["array"]] + ) + pipe = pipeline(model="openai/whisper-large-v3", return_timestamps=True) + + output = pipe(ds[40]["audio"]) + self.assertDictEqual( + output, + { + "text": " A man said to the universe, Sir, I exist.", + "chunks": [{"text": " A man said to the universe, Sir, I exist.", "timestamp": (0.0, 4.08)}], + }, + ) + + output = pipe(array, chunk_length_s=10) + + self.assertDictEqual( + nested_simplify(output), + { + "chunks": [ + {"timestamp": (0.0, 2.0), "text": (" A man said to the 
universe,")}, + {"timestamp": (2.0, 4.1), "text": (" Sir, I exist.")}, + {"timestamp": (5.14, 5.96), "text": (" Sweat covered")}, + {"timestamp": (5.96, 8.02), "text": (" Breon's body, trickling into")}, + {"timestamp": (8.02, 10.67), "text": (" the tight loincloth that was the only garment he wore,")}, + {"timestamp": (10.67, 13.67), "text": (" the cut on his chest still dripping blood,")}, + {"timestamp": (13.67, 17.61), "text": (" the ache of his overstrained eyes.")}, + { + "timestamp": (17.61, 24.0), + "text": ( + " Even the soaring arena around him with thousands of spectators were trivialities not worth thinking about." + ), + }, + { + "timestamp": (24.0, 29.94), + "text": (" His instant of panic was followed by a small, sharp blow high on his chest."), + }, + ], + "text": ( + " A man said to the universe, Sir, I exist. Sweat covered Breon's" + " body, trickling into the tight loincloth that was the only garment" + " he wore, the cut on his chest still dripping blood, the ache of his" + " overstrained eyes. Even the soaring arena around him with thousands" + " of spectators were trivialities not worth thinking about. His " + "instant of panic was followed by a small, sharp blow high on his chest." + ), + }, + ) + + output = pipe(array) + self.assertDictEqual( + output, + { + "chunks": [ + {"timestamp": (0.0, 1.96), "text": " A man said to the universe,"}, + {"timestamp": (2.7, 4.1), "text": " Sir, I exist."}, + {"timestamp": (5.14, 6.84), "text": " Sweat covered Brion's body,"}, + { + "timestamp": (7.4, 10.68), + "text": " trickling into the tight loincloth that was the only garment he wore,", + }, + {"timestamp": (11.6, 13.94), "text": " the cut on his chest still dripping blood,"}, + {"timestamp": (14.78, 16.72), "text": " the ache of his overstrained eyes,"}, + { + "timestamp": (17.32, 21.16), + "text": " even the soaring arena around him with the thousands of spectators", + }, + {"timestamp": (21.16, 23.94), "text": " were trivialities not worth thinking about."}, + { + "timestamp": (24.42, 29.94), + "text": " His instant panic was followed by a small sharp blow high on his chest.", + }, + ], + "text": ( + " A man said to the universe, Sir, I exist. Sweat covered Brion's body," + " trickling into the tight loincloth that was the only garment he wore, " + "the cut on his chest still dripping blood, the ache of his overstrained " + "eyes, even the soaring arena around him with the thousands of spectators " + "were trivialities not worth thinking about. His instant panic was followed " + "by a small sharp blow high on his chest." + ), + }, + ) + @slow @require_torch def test_whisper_word_timestamps_batched(self): @@ -799,6 +887,49 @@ def test_whisper_word_timestamps_batched(self): output = pipe(sample, batch_size=2) self.assertDictEqual(output, EXPECTED_OUTPUT) + @slow + @require_torch + def test_whisper_large_word_timestamps_batched(self): + pipe = pipeline( + task="automatic-speech-recognition", + model="openai/whisper-large-v3", + return_timestamps="word", + ) + data = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + sample = data[0]["audio"] + + # not the same output as test_simple_whisper_asr because of chunking + EXPECTED_OUTPUT = { + "text": " Mr. 
Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.", + "chunks": [ + {"text": " Mr.", "timestamp": (0.0, 0.74)}, + {"text": " Quilter", "timestamp": (0.74, 1.04)}, + {"text": " is", "timestamp": (1.04, 1.3)}, + {"text": " the", "timestamp": (1.3, 1.44)}, + {"text": " apostle", "timestamp": (1.44, 1.74)}, + {"text": " of", "timestamp": (1.74, 2.18)}, + {"text": " the", "timestamp": (2.18, 2.28)}, + {"text": " middle", "timestamp": (2.28, 2.5)}, + {"text": " classes,", "timestamp": (2.5, 3.0)}, + {"text": " and", "timestamp": (3.0, 3.4)}, + {"text": " we", "timestamp": (3.4, 3.5)}, + {"text": " are", "timestamp": (3.5, 3.6)}, + {"text": " glad", "timestamp": (3.6, 3.84)}, + {"text": " to", "timestamp": (3.84, 4.1)}, + {"text": " welcome", "timestamp": (4.1, 4.4)}, + {"text": " his", "timestamp": (4.4, 4.7)}, + {"text": " gospel.", "timestamp": (4.7, 5.34)}, + ], + } + + # batch size 1: copy the audio sample since pipeline consumes it + output = pipe(sample.copy(), batch_size=1) + self.assertDictEqual(output, EXPECTED_OUTPUT) + + # batch size 2: input audio is chunked into smaller pieces so it's testing batching + output = pipe(sample, batch_size=2) + self.assertDictEqual(output, EXPECTED_OUTPUT) + @require_torch @slow def test_torch_speech_encoder_decoder(self): From e5f71ecaae50ea476d1e12351003790273c4b2ed Mon Sep 17 00:00:00 2001 From: Zafir Stojanovski Date: Tue, 7 May 2024 11:23:52 +0200 Subject: [PATCH 21/49] Updated docs of `forward` in `Idefics2ForConditionalGeneration` with correct `ignore_index` value (#30678) updated docs of `forward` in `Idefics2ForConditionalGeneration` with correct `ignore_index` value --- src/transformers/models/idefics2/modeling_idefics2.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/idefics2/modeling_idefics2.py b/src/transformers/models/idefics2/modeling_idefics2.py index 7f61e95a9b8edf..d0e99158062774 100644 --- a/src/transformers/models/idefics2/modeling_idefics2.py +++ b/src/transformers/models/idefics2/modeling_idefics2.py @@ -1776,9 +1776,9 @@ def forward( Args: labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - + config.vocab_size]` or `model.image_token_id` (where `model` is your instance of `Idefics2ForConditionalGeneration`). + Tokens with indices set to `model.image_token_id` are ignored (masked), the loss is only + computed for the tokens with labels in `[0, ..., config.vocab_size]`. Returns: Example: From 4051d362cb36d78aae5a5979231394713178f35d Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 7 May 2024 11:57:10 +0100 Subject: [PATCH 22/49] Bump tqdm from 4.63.0 to 4.66.3 in /examples/research_projects/decision_transformer (#30646) Bump tqdm in /examples/research_projects/decision_transformer Bumps [tqdm](https://github.com/tqdm/tqdm) from 4.63.0 to 4.66.3. - [Release notes](https://github.com/tqdm/tqdm/releases) - [Commits](https://github.com/tqdm/tqdm/compare/v4.63.0...v4.66.3) --- updated-dependencies: - dependency-name: tqdm dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .../research_projects/decision_transformer/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/research_projects/decision_transformer/requirements.txt b/examples/research_projects/decision_transformer/requirements.txt index a31e3d0de8b963..fc231148519d1b 100644 --- a/examples/research_projects/decision_transformer/requirements.txt +++ b/examples/research_projects/decision_transformer/requirements.txt @@ -220,7 +220,7 @@ toolz==0.11.2 torch==1.13.1 torchaudio==0.11.0 torchvision==0.12.0 -tqdm==4.63.0 +tqdm==4.66.3 traitlets==5.1.1 -e git+git@github.com:edbeeching/transformers.git@77b90113ca0a0e4058b046796c874bdc98f1da61#egg=transformers typing-extensions==4.1.1 From 3733391c532f74ac8c4b13a5961c78602d5e5c82 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 7 May 2024 11:57:30 +0100 Subject: [PATCH 23/49] Bump tqdm from 4.48.2 to 4.66.3 in /examples/research_projects/visual_bert (#30645) Bump tqdm in /examples/research_projects/visual_bert Bumps [tqdm](https://github.com/tqdm/tqdm) from 4.48.2 to 4.66.3. - [Release notes](https://github.com/tqdm/tqdm/releases) - [Commits](https://github.com/tqdm/tqdm/compare/v4.48.2...v4.66.3) --- updated-dependencies: - dependency-name: tqdm dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- examples/research_projects/visual_bert/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/research_projects/visual_bert/requirements.txt b/examples/research_projects/visual_bert/requirements.txt index 20664d8427bdc0..60a1858b4e6d8d 100644 --- a/examples/research_projects/visual_bert/requirements.txt +++ b/examples/research_projects/visual_bert/requirements.txt @@ -87,7 +87,7 @@ tokenizers==0.8.1rc2 torch==1.13.1 torchvision==0.7.0 tornado==6.3.3 -tqdm==4.48.2 +tqdm==4.66.3 traitlets git+https://github.com/huggingface/transformers.git urllib3==1.26.18 From 0ba15cedbcbe9a9db2d5cde76ea4bb712c311934 Mon Sep 17 00:00:00 2001 From: Aymeric Roucher <69208727+aymeric-roucher@users.noreply.github.com> Date: Tue, 7 May 2024 12:59:49 +0200 Subject: [PATCH 24/49] Reboot Agents (#30387) * Create CodeAgent and ReactAgent * Fix formatting errors * Update documentation for agents * Add custom errors, improve logging * Support variable usage in ReactAgent * add messages * Add message passing format * Create React Code Agent * Update * Refactoring * Fix errors * Improve python interpreter * Only non-tensor inputs should be sent to device * Calculator tool slight refactor * Improve docstrings * Refactor * Fix tests * Fix more tests * Fix even more tests * Fix tests by replacing output and input types * Fix operand type issue * two small fixes * EM TTS * Fix agent running type errors * Change text to speech tests to allow changed outputs * Update doc with new agent types * Improve code interpreter * If max iterations reached, provide a real answer instead of an error * Add edge case in interpreter * Add safe imports to the interpreter * Interpreter tweaks: tuples and listcomp * Make style * Make quality * Add dictcomp to interpreter * Rename ReactJSONAgent to ReactJsonAgent * Misc changes * ToolCollection * Rename agent's logger to self.logger * Add while loops to interpreter * Update doc with new tools. 
still need to mention collections * Add collections to the doc * Small fixes on logs and interpretor * Fix toolbox return type * Docs + fixup * Skip doctests * Correct prompts with improved examples and formatting * Update prompt * Remove outdated docs * Change agent to accept Toolbox object for tools * Remove calculator tool * Propagate removal of calculator in doc * Fix 2 failing workflows * Simplify additional argument passing * AgentType audio * Minor changes: function name, types * Remove calculator tests * Fix test * Fix torch requirement * Fix final answer tests * Style fixes * Fix tests * Update docstrings with calculator removal * Small type hint fixes * Update tests/agents/test_translation.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * Update tests/agents/test_python_interpreter.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * Update src/transformers/agents/default_tools.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * Update src/transformers/agents/tools.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * Update tests/agents/test_agents.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * Update src/transformers/models/bert/configuration_bert.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * Update src/transformers/agents/tools.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * Update src/transformers/agents/speech_to_text.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * Update tests/agents/test_speech_to_text.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * Update tests/agents/test_tools_common.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * pygments * Answer comments * Cleaning up * Simplifying init for all agents * Improving prompts and making code nicer * Style fixes * Add multiple comparator test in interpreter * Style fixes * Improve BERT example in documentation * Add examples to doc * Fix python interpreter quality * Logging improvements * Change test flag to agents * Quality fix * Add example for HfEngine * Improve conversation example for HfEngine * typo fix * Verify doc * Update docs/source/en/agents.md Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * Update src/transformers/agents/agents.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * Update src/transformers/agents/prompts.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * Update src/transformers/agents/python_interpreter.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * Update docs/source/en/agents.md Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * Fix style issues * local s2t tool --------- Co-authored-by: Cyril Kondratenko Co-authored-by: Lysandre Co-authored-by: Lysandre Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> --- conftest.py | 4 +- docs/source/en/_toctree.yml | 4 +- docs/source/en/agents.md | 490 ++++++++++ docs/source/en/custom_tools.md | 798 ----------------- docs/source/en/main_classes/agent.md | 75 +- docs/source/en/transformers_agents.md | 323 ------- docs/source/ja/custom_tools.md | 744 +--------------- docs/source/ja/main_classes/agent.md | 87 +- docs/source/ko/custom_tools.md | 732 +-------------- docs/source/zh/main_classes/agent.md | 81 +- src/transformers/__init__.py | 57 
+- .../{tools => agents}/__init__.py | 22 +- .../{tools => agents}/agent_types.py | 74 +- src/transformers/agents/agents.py | 838 ++++++++++++++++++ src/transformers/agents/default_tools.py | 168 ++++ .../document_question_answering.py | 25 +- .../{tools => agents}/evaluate_agent.py | 300 +------ .../image_question_answering.py | 21 +- src/transformers/agents/llm_engine.py | 92 ++ src/transformers/agents/prompts.py | 364 ++++++++ src/transformers/agents/python_interpreter.py | 520 +++++++++++ .../{tools => agents}/speech_to_text.py | 20 +- .../{tools => agents}/text_to_speech.py | 14 +- .../{tools/base.py => agents/tools.py} | 363 ++++---- .../{tools => agents}/translation.py | 26 +- src/transformers/testing_utils.py | 12 +- src/transformers/tools/agents.py | 778 ---------------- src/transformers/tools/image_captioning.py | 51 -- src/transformers/tools/image_segmentation.py | 58 -- src/transformers/tools/prompts.py | 48 - src/transformers/tools/python_interpreter.py | 253 ------ src/transformers/tools/text_classification.py | 70 -- .../tools/text_question_answering.py | 52 -- src/transformers/tools/text_summarization.py | 52 -- src/transformers/utils/import_utils.py | 10 + tests/{tools => agents}/__init__.py | 0 tests/{tools => agents}/test_agent_types.py | 2 +- tests/agents/test_agents.py | 161 ++++ .../test_document_question_answering.py | 15 - tests/agents/test_final_answer.py | 71 ++ .../test_image_question_answering.py | 11 - tests/agents/test_python_interpreter.py | 355 ++++++++ .../{tools => agents}/test_speech_to_text.py | 16 +- .../{tools => agents}/test_text_to_speech.py | 16 +- tests/agents/test_tools_common.py | 107 +++ tests/agents/test_translation.py | 68 ++ tests/tools/test_image_captioning.py | 53 -- tests/tools/test_image_segmentation.py | 53 -- tests/tools/test_python_interpreter.py | 131 --- tests/tools/test_text_classification.py | 43 - tests/tools/test_text_question_answering.py | 52 -- tests/tools/test_text_summarization.py | 64 -- tests/tools/test_tools_common.py | 133 --- tests/tools/test_translation.py | 86 -- utils/not_doctested.txt | 31 +- 55 files changed, 3675 insertions(+), 5419 deletions(-) create mode 100644 docs/source/en/agents.md delete mode 100644 docs/source/en/custom_tools.md delete mode 100644 docs/source/en/transformers_agents.md rename src/transformers/{tools => agents}/__init__.py (65%) rename src/transformers/{tools => agents}/agent_types.py (71%) create mode 100644 src/transformers/agents/agents.py create mode 100644 src/transformers/agents/default_tools.py rename src/transformers/{tools => agents}/document_question_answering.py (80%) rename src/transformers/{tools => agents}/evaluate_agent.py (57%) rename src/transformers/{tools => agents}/image_question_answering.py (81%) create mode 100644 src/transformers/agents/llm_engine.py create mode 100644 src/transformers/agents/prompts.py create mode 100644 src/transformers/agents/python_interpreter.py rename src/transformers/{tools => agents}/speech_to_text.py (67%) rename src/transformers/{tools => agents}/text_to_speech.py (87%) rename src/transformers/{tools/base.py => agents/tools.py} (73%) rename src/transformers/{tools => agents}/translation.py (91%) delete mode 100644 src/transformers/tools/agents.py delete mode 100644 src/transformers/tools/image_captioning.py delete mode 100644 src/transformers/tools/image_segmentation.py delete mode 100644 src/transformers/tools/prompts.py delete mode 100644 src/transformers/tools/python_interpreter.py delete mode 100644 
src/transformers/tools/text_classification.py delete mode 100644 src/transformers/tools/text_question_answering.py delete mode 100644 src/transformers/tools/text_summarization.py rename tests/{tools => agents}/__init__.py (100%) rename tests/{tools => agents}/test_agent_types.py (98%) create mode 100644 tests/agents/test_agents.py rename tests/{tools => agents}/test_document_question_answering.py (67%) create mode 100644 tests/agents/test_final_answer.py rename tests/{tools => agents}/test_image_question_answering.py (71%) create mode 100644 tests/agents/test_python_interpreter.py rename tests/{tools => agents}/test_speech_to_text.py (76%) rename tests/{tools => agents}/test_text_to_speech.py (75%) create mode 100644 tests/agents/test_tools_common.py create mode 100644 tests/agents/test_translation.py delete mode 100644 tests/tools/test_image_captioning.py delete mode 100644 tests/tools/test_image_segmentation.py delete mode 100644 tests/tools/test_python_interpreter.py delete mode 100644 tests/tools/test_text_classification.py delete mode 100644 tests/tools/test_text_question_answering.py delete mode 100644 tests/tools/test_text_summarization.py delete mode 100644 tests/tools/test_tools_common.py delete mode 100644 tests/tools/test_translation.py diff --git a/conftest.py b/conftest.py index 11e2b29756c11f..3f2dae258b211c 100644 --- a/conftest.py +++ b/conftest.py @@ -71,7 +71,7 @@ "ModelTester::test_pipeline_", "/repo_utils/", "/utils/", - "/tools/", + "/agents/", } # allow having multiple repository checkouts and not needing to remember to rerun @@ -94,7 +94,7 @@ def pytest_configure(config): config.addinivalue_line("markers", "is_pipeline_test: mark test to run only when pipelines are tested") config.addinivalue_line("markers", "is_staging_test: mark test to run only in the staging environment") config.addinivalue_line("markers", "accelerate_tests: mark test that require accelerate") - config.addinivalue_line("markers", "tool_tests: mark the tool tests that are run on their specific schedule") + config.addinivalue_line("markers", "agent_tests: mark the agent tests that are run on their specific schedule") config.addinivalue_line("markers", "not_device_test: mark the tests always running on cpu") diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index afc997a7aa54b7..d654478970d86d 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -23,7 +23,7 @@ title: Load and train adapters with 🤗 PEFT - local: model_sharing title: Share your model - - local: transformers_agents + - local: agents title: Agents - local: llm_tutorial title: Generation with LLMs @@ -133,8 +133,6 @@ title: Notebooks with examples - local: community title: Community resources - - local: custom_tools - title: Custom Tools and Prompts - local: troubleshooting title: Troubleshoot - local: hf_quantizer diff --git a/docs/source/en/agents.md b/docs/source/en/agents.md new file mode 100644 index 00000000000000..ae9e5db2b7897b --- /dev/null +++ b/docs/source/en/agents.md @@ -0,0 +1,490 @@ + +# Agents and tools + +[[open-in-colab]] + +### What is an agent? + +Large Language Models (LLMs) trained to perform [causal language modeling](./tasks/language_modeling.) can tackle a wide range of tasks, but they often struggle with basic tasks like logic, calculation, and search. When prompted in domains in which they do not perform well, they often fail to generate the answer we expect them to. + +One approach to overcome this weakness is to create an *agent*. 
+ +An agent is a system that uses an LLM as its engine, and it has access to functions called *tools*. + +These *tools* are functions for performing a task, and they contain all the necessary descriptions for the agent to use them properly. + +The agent can be programmed to: +- devise a series of actions/tools and run them all at once like the `CodeAgent` for example +- plan and execute actions/tools one by one and wait for the outcome of each action before launching the next one like the `ReactJsonAgent` for example + +### Types of agents + +#### Code agent + +This agent has a planning step, then generates Python code to execute all its actions at once. It natively handles different input and output types for its tools, thus it is the recommended choice for multimodal tasks. + +#### React agents + +This is the go-to agent to solve reasoning tasks, since the ReAct framework ([Yao et al., 2022](https://huggingface.co/papers/2210.03629)) makes it really efficient to think on the basis of its previous observations. + +We implement two versions of this agent: +- [`~ReactJsonAgent`] generates tool calls as a JSON in its output. +- [`~ReactCodeAgent`] is a new type of agent that generates its tool calls as blobs of code, which works really well for LLMs that have strong coding performance. + +> [!TIP] +> Read the [Open-source LLMs as LangChain Agents](https://huggingface.co/blog/open-source-llms-as-agents) blog post to learn more about the ReAct agent. + +![Framework of a React Agent](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/blog/open-source-llms-as-agents/ReAct.png) + +For example, here is how a ReAct agent would work its way through the following question. + +```py3 +>>> agent.run( +... "How many more blocks (also denoted as layers) in BERT base encoder than the encoder from the architecture proposed in Attention is All You Need?", +... ) +=====New task===== +How many more blocks (also denoted as layers) in BERT base encoder than the encoder from the architecture proposed in Attention is All You Need? +====Agent is executing the code below: +bert_blocks = search(query="number of blocks in BERT base encoder") +print("BERT blocks:", bert_blocks) +==== +Print outputs: +BERT blocks: twelve encoder blocks + +====Agent is executing the code below: +attention_layer = search(query="number of layers in Attention is All You Need") +print("Attention layers:", attention_layer) +==== +Print outputs: +Attention layers: Encoder: The encoder is composed of a stack of N = 6 identical layers. Each layer has two sub-layers. The first is a multi-head self-attention mechanism, and the second is a simple, position- 2 Page 3 Figure 1: The Transformer - model architecture. + +====Agent is executing the code below: +bert_blocks = 12 +attention_layers = 6 +diff = bert_blocks - attention_layers +print("Difference in blocks:", diff) +final_answer(diff) +==== + +Print outputs: +Difference in blocks: 6 + +Final answer: 6 +``` + +### How can I build an agent? + +To initialize an agent, you need these arguments: + +- an LLM to power your agent - the agent is not exactly the LLM, it’s more like the agent is a program that uses an LLM as its engine.
+
+- a system prompt: what the LLM engine will be prompted with to generate its output
+- a toolbox from which the agent picks tools to execute
+- a parser to extract from the LLM output which tools to call and with which arguments
+
+Upon initialization of the agent system, the tool attributes are used to generate a tool description, which is then baked into the agent's `system_prompt` to let it know which tools it can use and why.
+
+To start with, install the `agents` extras to get all the default dependencies.
+
+```bash
+pip install transformers[agents]
+```
+
+Build your LLM engine by defining a `llm_engine` method which accepts a list of [messages](./chat_templating) and returns text. This callable also needs to accept a `stop` argument that indicates when to stop generating.
+
+```python
+from huggingface_hub import login, InferenceClient
+
+login("")
+
+client = InferenceClient(model="meta-llama/Meta-Llama-3-70B-Instruct")
+
+def llm_engine(messages, stop_sequences=["Task"]) -> str:
+    response = client.chat_completion(messages, stop=stop_sequences, max_tokens=1000)
+    answer = response.choices[0].message.content
+    return answer
+```
+
+You could use any `llm_engine` method as long as:
+1. it follows the [messages format](./chat_templating.md) for its input (`List[Dict[str, str]]`) and returns a `str`
+2. it stops generating outputs at the sequences passed in the argument `stop`
+
+You also need a `tools` argument which accepts a list of `Tools`. You can provide an empty list for `tools`, or use the default toolbox by setting the optional argument `add_base_tools=True`.
+
+Now you can create an agent, like `CodeAgent`, and run it. For convenience, we also provide the `HfEngine` class that uses `huggingface_hub.InferenceClient` under the hood.
+
+```python
+from transformers import CodeAgent, HfEngine
+
+llm_engine = HfEngine(model="meta-llama/Meta-Llama-3-70B-Instruct")
+agent = CodeAgent(tools=[], llm_engine=llm_engine, add_base_tools=True)
+
+agent.run(
+    "Could you translate this sentence from French, say it out loud and return the audio.",
+    sentence="Où est la boulangerie la plus proche?",
+)
+```
+
+This will be handy in case of emergency baguette need!
+You can even leave the argument `llm_engine` undefined, and an [`HfEngine`] will be created by default.
+
+```python
+from transformers import CodeAgent
+
+agent = CodeAgent(tools=[], add_base_tools=True)
+
+agent.run(
+    "Could you translate this sentence from French, say it out loud and give me the audio.",
+    sentence="Où est la boulangerie la plus proche?",
+)
+```
+
+Note that we used an additional `sentence` argument: you can pass text as additional arguments to the model.
+
+You can also use this to indicate the path to local or remote files for the model to use:
+
+```py
+from transformers import ReactCodeAgent
+
+agent = ReactCodeAgent(tools=[], llm_engine=llm_engine, add_base_tools=True)
+
+agent.run("Why does Mike not know many people in New York?", audio="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/recording.mp3")
+```
+
+The prompt and output parser were automatically defined, but you can easily inspect the system prompt by printing your agent's `system_prompt_template`.
+
+```python
+print(agent.system_prompt_template)
+```
+
+It's important to explain as clearly as possible the task you want to perform.
+Every [`~Agent.run`] operation is independent, and since an agent is powered by an LLM, minor variations in your prompt might yield completely different results.
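+
+As a rough illustration (reusing the `agent` created above; the exact tool choices and generated code will vary between runs and models), the two hypothetical calls below describe the same goal with slightly different wording, and each one is treated as a completely separate task:
+
+```python
+# Each run starts from a clean slate: the agent keeps no memory of the previous call.
+# Small wording differences like these can lead the LLM to pick different tools or write different code.
+agent.run(
+    "Translate this French sentence into English and read the translation out loud.",
+    sentence="Où est la boulangerie la plus proche?",
+)
+agent.run(
+    "Say the English translation of this French sentence out loud.",
+    sentence="Où est la boulangerie la plus proche?",
+)
+```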
+
+You can also run an agent consecutively for different tasks: each time, the attributes `agent.task` and `agent.logs` will be re-initialized.
+
+
+#### Code execution
+
+A Python interpreter executes the code on a set of inputs passed along with your tools.
+This should be safe because the only functions that can be called are the tools you provided (especially if they are only tools provided by Hugging Face) and the print function, so you're already limited in what can be executed.
+
+The Python interpreter also doesn't allow any attribute lookup or imports (which shouldn't be needed for passing inputs/outputs to a small set of functions) so all the most obvious attacks shouldn't be an issue.
+
+The execution will stop at any code trying to perform an illegal operation or if there is a regular Python error with the code generated by the agent.
+
+### The system prompt
+
+An agent, or rather the LLM that drives the agent, generates an output based on the system prompt. The system prompt can be customized and tailored to the intended task. For example, check the system prompt for the `ReactCodeAgent` (the version below is slightly simplified).
+
+```text
+You will be given a task to solve as best you can.
+You have access to the following tools:
+<<tool_descriptions>>
+
+To solve the task, you must plan forward to proceed in a series of steps, in a cycle of 'Thought:', 'Code:', and 'Observation:' sequences.
+
+At each step, in the 'Thought:' sequence, you should first explain your reasoning towards solving the task, then the tools that you want to use.
+Then in the 'Code:' sequence, you should write the code in simple Python. The code sequence must end with '/End code' sequence.
+During each intermediate step, you can use 'print()' to save whatever important information you will then need.
+These print outputs will then be available in the 'Observation:' field, for using this information as input for the next step.
+
+In the end you have to return a final answer using the `final_answer` tool.
+
+Here are a few examples using notional tools:
+---
+{examples}
+
+The examples above used notional tools that might not exist for you. You only have access to these tools:
+<<tool_names>>
+You can also perform computations in the Python code you generate.
+
+Always provide a 'Thought:' and a 'Code:\n```py' sequence ending with '```' sequence. You MUST provide at least the 'Code:' sequence to move forward.
+
+Remember not to perform too many operations in a single code block! You should split the task into intermediate code blocks.
+Print results at the end of each step to save the intermediate results. Then use final_answer() to return the final result.
+
+Remember to make sure that variables you use are all defined.
+
+Now Begin!
+```
+
+The system prompt includes:
+- An *introduction* that explains how the agent should behave and what tools are.
+- A description of all the tools, defined by a `<<tool_descriptions>>` token that is dynamically replaced at runtime with the tools defined/chosen by the user.
+    - The tool description comes from the tool attributes, `name`, `description`, `inputs` and `output_type`, and a simple `jinja2` template that you can refine.
+- The expected output format.
+
+You could improve the system prompt, for example, by adding an explanation of the output format.
+
+For maximum flexibility, you can overwrite the whole system prompt template by passing your custom prompt as an argument to the `system_prompt` parameter.
+
+```python
+from transformers import ReactJsonAgent
+from transformers.agents import PythonInterpreterTool
+
+agent = ReactJsonAgent(tools=[PythonInterpreterTool()], system_prompt="{your_custom_prompt}")
+```
+
+> [!WARNING]
+> Please make sure to define the `<<tool_descriptions>>` string somewhere in the `template` so the agent is aware
+> of the available tools.
+
+## Tools
+
+A tool is an atomic function to be used by an agent.
+
+You can, for instance, check the [`PythonInterpreterTool`]: it has a name, a description, input descriptions, an output type, and a `__call__` method to perform the action.
+
+When the agent is initialized, the tool attributes are used to generate a tool description which is baked into the agent's system prompt. This lets the agent know which tools it can use and why.
+
+### Default toolbox
+
+Transformers comes with a default toolbox for empowering agents, which you can add to your agent upon initialization with the argument `add_base_tools=True`:
+
+- **Document question answering**: given a document (such as a PDF) in image format, answer a question on this document ([Donut](./model_doc/donut))
+- **Image question answering**: given an image, answer a question on this image ([VILT](./model_doc/vilt))
+- **Speech to text**: given an audio recording of a person talking, transcribe the speech into text ([Whisper](./model_doc/whisper))
+- **Text to speech**: convert text to speech ([SpeechT5](./model_doc/speecht5))
+- **Translation**: translates a given sentence from source language to target language.
+- **Python code interpreter**: runs the LLM-generated Python code in a secure environment. This tool will only be added to [`ReactJsonAgent`] if you use `add_base_tools=True`, since code-based agents can already execute Python code natively.
+
+You can manually use a tool by calling the [`load_tool`] function with the task to perform.
+
+```python
+from transformers import load_tool
+
+tool = load_tool("text-to-speech")
+audio = tool("This is a text to speech tool")
+```
+
+### Create a new tool
+
+You can create your own tool for use cases not covered by the default tools from Hugging Face.
+For example, let's create a tool that returns the most downloaded model for a given task from the Hub.
+
+You'll start with the code below.
+
+```python
+from huggingface_hub import list_models
+
+task = "text-classification"
+
+model = next(iter(list_models(filter=task, sort="downloads", direction=-1)))
+print(model.id)
+```
+
+This code can be converted into a class that inherits from the [`Tool`] superclass.
+
+The custom tool needs:
+- An attribute `name`, which corresponds to the name of the tool itself. The name usually describes what the tool does. Since the code returns the model with the most downloads for a task, let's name it `model_download_counter`.
+- An attribute `description`, which is used to populate the agent's system prompt.
+- An `inputs` attribute, which is a dictionary mapping each input name to a dictionary with keys `"type"` and `"description"`. It contains information that helps the Python interpreter make educated choices about the input.
+- An `output_type` attribute, which specifies the output type.
+- A `forward` method which contains the inference code to be executed.
+
+```python
+from transformers import Tool
+from huggingface_hub import list_models
+
+class HFModelDownloadsTool(Tool):
+    name = "model_download_counter"
+    description = (
+        "This is a tool that returns the most downloaded model of a given task on the Hugging Face Hub. "
+        "It returns the name of the checkpoint."
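+        # Note: along with `name`, this description is what gets injected into the agent's
+        # system prompt, so keep it short and explicit about what the tool returns.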
+ ) + + inputs = { + "task": { + "type": "text", + "description": "the task category (such as text-classification, depth-estimation, etc)", + } + } + output_type = "text" + + def forward(self, task: str): + model = next(iter(list_models(filter=task, sort="downloads", direction=-1))) + return model.id +``` + +Now that the custom `HfModelDownloadsTool` class is ready, you can save it to a file named `model_downloads.py` and import it for use. + + +```python +from model_downloads import HFModelDownloadsTool + +tool = HFModelDownloadsTool() +``` + +You can also share your custom tool to the Hub by calling [`~Tool.push_to_hub`] on the tool. Make sure you've created a repository for it on the Hub and are using a token with read access. + +```python +tool.push_to_hub("{your_username}/hf-model-downloads") +``` + +Load the tool with the [`~Tool.load_tool`] function and pass it to the `tools` parameter in your agent. + +```python +from transformers import load_tool, CodeAgent + +model_download_tool = load_tool("m-ric/hf-model-downloads") +agent = CodeAgent(tools=[model_download_tool], llm_engine=llm_engine) +agent.run( + "Can you give me the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub?" +) +``` + +You get the following: +```text +======== New task ======== +Can you give me the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub? +==== Agent is executing the code below: +most_downloaded_model = model_download_counter(task="text-to-video") +print(f"The most downloaded model for the 'text-to-video' task is {most_downloaded_model}.") +==== +``` + +And the output: +`"The most downloaded model for the 'text-to-video' task is ByteDance/AnimateDiff-Lightning."` + + +### Manage agent toolbox + +If you have already initialized an agent, it is inconvenient to reinitialize it from scratch with a tool you want to use. With Transformers, you can manage an agent's toolbox by adding or replacing a tool. + +Let's add the `model_download_tool` to an existing agent initialized with only the default toolbox. + +```python +from transformers import CodeAgent + +agent = CodeAgent(tools=[], llm_engine=llm_engine, add_base_tools=True) +agent.toolbox.add_tool(model_download_tool) +``` +Now we can leverage both the new tool and the previous text-to-speech tool: + +```python +agent.run( + "Can you read out loud the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub and return the audio?" +) +``` + + +| **Audio** | +|------------------------------------------------------------------------------------------------------------------------------------------------------| +|