diff --git a/docs/source/creating_and_sharing.mdx b/docs/source/creating_and_sharing.mdx
index c1b96d3fc..f09a34174 100644
--- a/docs/source/creating_and_sharing.mdx
+++ b/docs/source/creating_and_sharing.mdx
@@ -54,12 +54,12 @@ def _download_and_prepare(self, dl_manager):
     self.scorer = score.BleurtScorer(os.path.join(model_path, self.config_name))
 ```
 
-Or if you need to download the NLTK `"punkt"` resources:
+Or if you need to download the NLTK `"punkt_tab"` resources:
 
 ```py
 def _download_and_prepare(self, dl_manager):
     import nltk
-    nltk.download("punkt")
+    nltk.download("punkt_tab")
 ```
 
 Next, we need to define how the computation of the evaluation module works.
diff --git a/docs/source/transformers_integrations.mdx b/docs/source/transformers_integrations.mdx
index 7993b2f28..bca014187 100644
--- a/docs/source/transformers_integrations.mdx
+++ b/docs/source/transformers_integrations.mdx
@@ -81,7 +81,7 @@ def preprocess_function(examples):
 tokenized_billsum = billsum.map(preprocess_function, batched=True)
 
 # Setup evaluation
-nltk.download("punkt", quiet=True)
+nltk.download("punkt_tab", quiet=True)
 metric = evaluate.load("rouge")
 
 def compute_metrics(eval_preds):
diff --git a/measurements/word_length/word_length.py b/measurements/word_length/word_length.py
index 3448a679b..46a840879 100644
--- a/measurements/word_length/word_length.py
+++ b/measurements/word_length/word_length.py
@@ -16,10 +16,19 @@
 
 import datasets
 from nltk import word_tokenize
+from packaging import version
 
 import evaluate
 
 
+if evaluate.config.PY_VERSION < version.parse("3.8"):
+    import importlib_metadata
+else:
+    import importlib.metadata as importlib_metadata
+
+
+NLTK_VERSION = version.parse(importlib_metadata.version("nltk"))
+
 _DESCRIPTION = """
 Returns the average length (in terms of the number of words) of the input data.
 """
@@ -75,7 +84,10 @@ def _info(self):
     def _download_and_prepare(self, dl_manager):
         import nltk
 
-        nltk.download("punkt")
+        if NLTK_VERSION >= version.Version("3.9.0"):
+            nltk.download("punkt_tab")
+        else:
+            nltk.download("punkt")
 
     def _compute(self, data, tokenizer=word_tokenize):
         """Returns the average word length of the input data"""
diff --git a/metrics/meteor/meteor.py b/metrics/meteor/meteor.py
index 058ee80ed..bc023a8e2 100644
--- a/metrics/meteor/meteor.py
+++ b/metrics/meteor/meteor.py
@@ -120,7 +120,9 @@ def _download_and_prepare(self, dl_manager):
         import nltk
 
         nltk.download("wordnet")
-        if NLTK_VERSION >= version.Version("3.6.5"):
+        if NLTK_VERSION >= version.Version("3.9.0"):
+            nltk.download("punkt_tab")
+        elif NLTK_VERSION >= version.Version("3.6.5"):
             nltk.download("punkt")
         if NLTK_VERSION >= version.Version("3.6.6"):
             nltk.download("omw-1.4")
diff --git a/setup.py b/setup.py
index 5a97a847f..bf4680a8a 100644
--- a/setup.py
+++ b/setup.py
@@ -74,7 +74,7 @@
     "absl-py",
     "charcut>=1.1.1",  # for charcut_mt
    "cer>=1.2.0",  # for characTER
-    "nltk<3.9",  # for NIST and probably others; temporarily pin < 3.9 to avoid "Resource punkt_tab not found" (GH-622)
+    "nltk",  # for NIST and probably others
     "pytest",
     "pytest-datadir",
     "pytest-xdist",