From fc157d44afde5e7c41e184e3bb43837c6ca60e9b Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Thu, 12 Sep 2024 08:28:47 +0200 Subject: [PATCH 1/7] Replace punkt with punkt_tab in word_length --- measurements/word_length/word_length.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/measurements/word_length/word_length.py b/measurements/word_length/word_length.py index 3448a679b..5b31cd2db 100644 --- a/measurements/word_length/word_length.py +++ b/measurements/word_length/word_length.py @@ -20,6 +20,8 @@ import evaluate +NLTK_VERSION = version.parse(importlib_metadata.version("nltk")) + _DESCRIPTION = """ Returns the average length (in terms of the number of words) of the input data. """ @@ -75,7 +77,10 @@ def _info(self): def _download_and_prepare(self, dl_manager): import nltk - nltk.download("punkt") + if NLTK_VERSION >= version.Version("3.9.0"): + nltk.download("punkt_tab") + else: + nltk.download("punkt") def _compute(self, data, tokenizer=word_tokenize): """Returns the average word length of the input data""" From d1e6ec3c142362b13c7d0397d4eb0005c9f4d9da Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Thu, 12 Sep 2024 08:29:23 +0200 Subject: [PATCH 2/7] Replace punkt with punkt_tab in meteor --- metrics/meteor/meteor.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/metrics/meteor/meteor.py b/metrics/meteor/meteor.py index 058ee80ed..bc023a8e2 100644 --- a/metrics/meteor/meteor.py +++ b/metrics/meteor/meteor.py @@ -120,7 +120,9 @@ def _download_and_prepare(self, dl_manager): import nltk nltk.download("wordnet") - if NLTK_VERSION >= version.Version("3.6.5"): + if NLTK_VERSION >= version.Version("3.9.0"): + nltk.download("punkt_tab") + elif NLTK_VERSION >= version.Version("3.6.5"): nltk.download("punkt") if NLTK_VERSION >= version.Version("3.6.6"): nltk.download("omw-1.4") From 31a954be920ccd6bcba7ea2a7338cde8f4c52034 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Thu, 12 Sep 2024 08:31:22 +0200 Subject: [PATCH 3/7] Replace punkt with punkt_tab in docs --- docs/source/creating_and_sharing.mdx | 4 ++-- docs/source/transformers_integrations.mdx | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/creating_and_sharing.mdx b/docs/source/creating_and_sharing.mdx index c1b96d3fc..f09a34174 100644 --- a/docs/source/creating_and_sharing.mdx +++ b/docs/source/creating_and_sharing.mdx @@ -54,12 +54,12 @@ def _download_and_prepare(self, dl_manager): self.scorer = score.BleurtScorer(os.path.join(model_path, self.config_name)) ``` -Or if you need to download the NLTK `"punkt"` resources: +Or if you need to download the NLTK `"punkt_tab"` resources: ```py def _download_and_prepare(self, dl_manager): import nltk - nltk.download("punkt") + nltk.download("punkt_tab") ``` Next, we need to define how the computation of the evaluation module works. diff --git a/docs/source/transformers_integrations.mdx b/docs/source/transformers_integrations.mdx index 7993b2f28..bca014187 100644 --- a/docs/source/transformers_integrations.mdx +++ b/docs/source/transformers_integrations.mdx @@ -81,7 +81,7 @@ def preprocess_function(examples): tokenized_billsum = billsum.map(preprocess_function, batched=True) # Setup evaluation -nltk.download("punkt", quiet=True) +nltk.download("punkt_tab", quiet=True) metric = evaluate.load("rouge") def compute_metrics(eval_preds): From 46975f6ce5ac484fa8e47d8b499bc23913ca4a63 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Thu, 12 Sep 2024 08:31:55 +0200 Subject: [PATCH 4/7] Revert temporary pin nltk<3.9 This reverts commit d1a15f68c0018691740a06472d1c4c8fe3982771. --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 5a97a847f..bf4680a8a 100644 --- a/setup.py +++ b/setup.py @@ -74,7 +74,7 @@ "absl-py", "charcut>=1.1.1", # for charcut_mt "cer>=1.2.0", # for characTER - "nltk<3.9", # for NIST and probably others; temporarily pin < 3.9 to avoid "Resource punkt_tab not found" (GH-622) + "nltk", # for NIST and probably others "pytest", "pytest-datadir", "pytest-xdist", From 384920c7ebace0f21bbf6345b033b5987c397190 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Thu, 12 Sep 2024 08:56:49 +0200 Subject: [PATCH 5/7] Fix import in word_length --- measurements/word_length/word_length.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/measurements/word_length/word_length.py b/measurements/word_length/word_length.py index 5b31cd2db..873fa3c9b 100644 --- a/measurements/word_length/word_length.py +++ b/measurements/word_length/word_length.py @@ -20,6 +20,12 @@ import evaluate +if evaluate.config.PY_VERSION < version.parse("3.8"): + import importlib_metadata +else: + import importlib.metadata as importlib_metadata + + NLTK_VERSION = version.parse(importlib_metadata.version("nltk")) _DESCRIPTION = """ From 748b52b35ccd48d2b0b0f09cd418d01ee07b0605 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Thu, 12 Sep 2024 09:03:49 +0200 Subject: [PATCH 6/7] Fix import in word_length --- measurements/word_length/word_length.py | 1 + 1 file changed, 1 insertion(+) diff --git a/measurements/word_length/word_length.py b/measurements/word_length/word_length.py index 873fa3c9b..0a466913e 100644 --- a/measurements/word_length/word_length.py +++ b/measurements/word_length/word_length.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from packaging import version from statistics import mean import datasets From 3968103a07e9561f2f79d1acbf9261229f108d04 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Thu, 12 Sep 2024 09:17:34 +0200 Subject: [PATCH 7/7] Fix style --- measurements/word_length/word_length.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/measurements/word_length/word_length.py b/measurements/word_length/word_length.py index 0a466913e..46a840879 100644 --- a/measurements/word_length/word_length.py +++ b/measurements/word_length/word_length.py @@ -12,11 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -from packaging import version from statistics import mean import datasets from nltk import word_tokenize +from packaging import version import evaluate