From 2bdd0f0542b89d90295a9a69a3b929e7cf5175ff Mon Sep 17 00:00:00 2001 From: ZiyiXia Date: Tue, 3 Dec 2024 11:49:43 +0000 Subject: [PATCH] update tutorials --- Tutorials/1_Embedding/1.2.3_BGE_v1&1.5.ipynb | 4 +- Tutorials/4_Evaluation/4.5.1_MIRACL.ipynb | 2 +- Tutorials/4_Evaluation/4.5.2_MLDR.ipynb | 15 +- .../4_Evaluation/utils/compute_metrics.py | 95 ++++++++++ .../4_Evaluation/utils/normalize_text.py | 162 ++++++++++++++++++ 5 files changed, 263 insertions(+), 15 deletions(-) create mode 100644 Tutorials/4_Evaluation/utils/compute_metrics.py create mode 100644 Tutorials/4_Evaluation/utils/normalize_text.py diff --git a/Tutorials/1_Embedding/1.2.3_BGE_v1&1.5.ipynb b/Tutorials/1_Embedding/1.2.3_BGE_v1&1.5.ipynb index 7db93f71..29182377 100644 --- a/Tutorials/1_Embedding/1.2.3_BGE_v1&1.5.ipynb +++ b/Tutorials/1_Embedding/1.2.3_BGE_v1&1.5.ipynb @@ -71,7 +71,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Run the following cell to check the model of bge-base-en-v1.5. It has the exactly same structure of BERT-base, 12 encoder layers and hidden dimension of 768.\n", + "Run the following cell to check the model of bge-base-en-v1.5. It uses BERT-base as base model, with 12 encoder layers and hidden dimension of 768.\n", "\n", "Note that the corresponding models of BGE and BGE-v1.5 have same structures. For example, bge-base-en and bge-base-en-v1.5 have the same structure." ] @@ -391,7 +391,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "As we expect, the two encoding functions return exactly the same results. The full implementation in FlagEmbedding handles large datasets by batching and contains GPU support and parallelization. Feel free to check the [source code](https://github.com/FlagOpen/FlagEmbedding/blob/master/FlagEmbedding/flag_models.py#L370) for more details." + "As we expect, the two encoding functions return exactly the same results. 
The full implementation in FlagEmbedding handles large datasets by batching and contains GPU support and parallelization. Feel free to check the [source code](https://github.com/FlagOpen/FlagEmbedding/blob/master/FlagEmbedding/inference/embedder/encoder_only/base.py) for more details." ] } ], diff --git a/Tutorials/4_Evaluation/4.5.1_MIRACL.ipynb b/Tutorials/4_Evaluation/4.5.1_MIRACL.ipynb index 8442b078..58dfdc08 100644 --- a/Tutorials/4_Evaluation/4.5.1_MIRACL.ipynb +++ b/Tutorials/4_Evaluation/4.5.1_MIRACL.ipynb @@ -568,7 +568,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Evaluate using FlagEmbedding" + "## 3. Evaluate using FlagEmbedding" ] }, { diff --git a/Tutorials/4_Evaluation/4.5.2_MLDR.ipynb b/Tutorials/4_Evaluation/4.5.2_MLDR.ipynb index 00017577..4da1ec1e 100644 --- a/Tutorials/4_Evaluation/4.5.2_MLDR.ipynb +++ b/Tutorials/4_Evaluation/4.5.2_MLDR.ipynb @@ -34,7 +34,7 @@ "metadata": {}, "outputs": [], "source": [ - "% pip install FlagEmbedding" + "% pip install FlagEmbedding pytrec_eval" ] }, { @@ -318,7 +318,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Use the Faiss index to search for each query." + "Use the Faiss index to search answers for each query." ] }, { @@ -456,7 +456,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Evaluate using FlagEmbedding" + "## 3. 
Evaluate using FlagEmbedding" ] }, { @@ -496,15 +496,6 @@ "sys.argv = arguments.split()" ] }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "os.environ[\"SETUPTOOLS_USE_DISTUTILS\"] = \"\"" - ] - }, { "cell_type": "code", "execution_count": 4, diff --git a/Tutorials/4_Evaluation/utils/compute_metrics.py b/Tutorials/4_Evaluation/utils/compute_metrics.py new file mode 100644 index 00000000..7620cc56 --- /dev/null +++ b/Tutorials/4_Evaluation/utils/compute_metrics.py @@ -0,0 +1,95 @@ +""" +Ref: https://github.com/facebookresearch/contriever +""" +import regex +import unicodedata +from functools import partial +from typing import List, Union + + +class SimpleTokenizer: + ALPHA_NUM = r'[\p{L}\p{N}\p{M}]+' + NON_WS = r'[^\p{Z}\p{C}]' + + def __init__(self): + """ + Args: + annotators: None or empty set (only tokenizes). + """ + self._regexp = regex.compile( + '(%s)|(%s)' % (self.ALPHA_NUM, self.NON_WS), + flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE + ) + + def tokenize(self, text, uncased=False): + matches = [m for m in self._regexp.finditer(text)] + if uncased: + tokens = [m.group().lower() for m in matches] + else: + tokens = [m.group() for m in matches] + return tokens + + +def _normalize(text): + return unicodedata.normalize('NFD', text) + + +def has_answer(answers, text, tokenizer) -> bool: + """Check if a document contains an answer string.""" + text = _normalize(text) + text = tokenizer.tokenize(text, uncased=True) + + for answer in answers: + answer = _normalize(answer) + answer = tokenizer.tokenize(answer, uncased=True) + for i in range(0, len(text) - len(answer) + 1): + if answer == text[i: i + len(answer)]: + return True + return False + + +def check_answer(example, tokenizer) -> List[bool]: + """Search through all the top docs to see if they have any of the answers.""" + answers = example['answers'] + ctxs = example['ctxs'] + + hits = [] + for i, text in enumerate(ctxs): + if text is None: # 
cannot find the document for some reason + hits.append(False) + continue + hits.append(has_answer(answers, text, tokenizer)) + return hits + + +def evaluate_qa_recall(ctxs, answers, k_values: Union[int, List[int]]=100): + # compute Recall@k for QA task + data = [] + assert len(ctxs) == len(answers) + for i in range(len(ctxs)): + _ctxs, _answers = ctxs[i], answers[i] + data.append({ + 'answers': _answers, + 'ctxs': _ctxs, + }) + tokenizer = SimpleTokenizer() + get_score_partial = partial(check_answer, tokenizer=tokenizer) + + scores = map(get_score_partial, data) + + n_docs = len(data[0]['ctxs']) + top_k_hits = [0] * n_docs + for question_hits in scores: + best_hit = next((i for i, x in enumerate(question_hits) if x), None) + if best_hit is not None: + top_k_hits[best_hit:] = [v + 1 for v in top_k_hits[best_hit:]] + + if isinstance(k_values, int): + k = min(k_values, len(top_k_hits)) + return top_k_hits[k - 1] / len(data) + else: + scores = [] + for k in k_values: + k = min(k, len(top_k_hits)) + scores.append(top_k_hits[k - 1] / len(data)) + return scores diff --git a/Tutorials/4_Evaluation/utils/normalize_text.py b/Tutorials/4_Evaluation/utils/normalize_text.py new file mode 100644 index 00000000..a16fe58d --- /dev/null +++ b/Tutorials/4_Evaluation/utils/normalize_text.py @@ -0,0 +1,162 @@ +""" +adapted from chemdataextractor.text.normalize +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Tools for normalizing text. +https://github.com/mcs07/ChemDataExtractor +:copyright: Copyright 2016 by Matt Swain. 
+:license: MIT + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +'Software'), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +""" + +#: Control characters. +CONTROLS = { + '\u0001', '\u0002', '\u0003', '\u0004', '\u0005', '\u0006', '\u0007', '\u0008', '\u000e', '\u000f', '\u0011', + '\u0012', '\u0013', '\u0014', '\u0015', '\u0016', '\u0017', '\u0018', '\u0019', '\u001a', '\u001b', +} +# There are further control characters, but they are instead replaced with a space by unicode normalization +# '\u0009', '\u000a', '\u000b', '\u000c', '\u000d', '\u001c', '\u001d', '\u001e', '\u001f' + + +#: Hyphen and dash characters. +HYPHENS = { + '-', # \u002d Hyphen-minus + '‐', # \u2010 Hyphen + '‑', # \u2011 Non-breaking hyphen + '⁃', # \u2043 Hyphen bullet + '‒', # \u2012 figure dash + '–', # \u2013 en dash + '—', # \u2014 em dash + '―', # \u2015 horizontal bar +} + +#: Minus characters. +MINUSES = { + '-', # \u002d Hyphen-minus + '−', # \u2212 Minus + '-', # \uff0d Full-width Hyphen-minus + '⁻', # \u207b Superscript minus +} + +#: Plus characters. 
PLUSES = {
    # NOTE: every non-ASCII member in the sets below is written as an explicit
    # \uXXXX escape (taken from the code point documented next to it) so the
    # sets survive re-encoding / compatibility normalization of this file.
    # Previously the full-width plus, tilde and apostrophe had degraded to
    # their ASCII look-alikes, silently dropping them from the sets.
    '+',       # \u002b Plus
    '\uff0b',  # \uff0b Full-width Plus
    '\u207a',  # \u207a Superscript plus
}

#: Slash characters.
SLASHES = {
    '/',       # \u002f Solidus
    '\u2044',  # \u2044 Fraction slash
    '\u2215',  # \u2215 Division slash
}

#: Tilde characters.
TILDES = {
    '~',       # \u007e Tilde
    '\u02dc',  # \u02dc Small tilde
    '\u2053',  # \u2053 Swung dash
    '\u223c',  # \u223c Tilde operator #in mbert vocab
    '\u223d',  # \u223d Reversed tilde
    '\u223f',  # \u223f Sine wave
    '\u301c',  # \u301c Wave dash #in mbert vocab
    '\uff5e',  # \uff5e Full-width tilde #in mbert vocab
}

#: Apostrophe characters.
APOSTROPHES = {
    "'",       # \u0027
    '\u2019',  # \u2019
    '\u055a',  # \u055a
    '\ua78b',  # \ua78b
    '\ua78c',  # \ua78c
    '\uff07',  # \uff07
}

#: Single quote characters.
SINGLE_QUOTES = {
    "'",       # \u0027
    '\u2018',  # \u2018
    '\u2019',  # \u2019
    '\u201a',  # \u201a
    '\u201b',  # \u201b
}

#: Double quote characters.
DOUBLE_QUOTES = {
    '"',       # \u0022
    '\u201c',  # \u201c
    '\u201d',  # \u201d
    '\u201e',  # \u201e
    '\u201f',  # \u201f
}

#: Accent characters.
ACCENTS = {
    '`',       # \u0060
    '\u00b4',  # \u00b4
}

#: Prime characters.
PRIMES = {
    '\u2032',  # \u2032
    '\u2033',  # \u2033
    '\u2034',  # \u2034
    '\u2035',  # \u2035
    '\u2036',  # \u2036
    '\u2037',  # \u2037
    '\u2057',  # \u2057
}

#: Quote characters, including apostrophes, single quotes, double quotes, accents and primes.
QUOTES = PRIMES | ACCENTS | DOUBLE_QUOTES | SINGLE_QUOTES | APOSTROPHES


def normalize_text(text: str):
    """Return ``text`` with typographic punctuation variants folded to ASCII.

    The replacements are applied in this order:

    1. every member of ``CONTROLS`` is removed; ``\\u000b``, ``\\u000c`` and
       ``\\u0085`` become a single space;
    2. every member of ``HYPHENS | MINUSES`` becomes ``-`` and the soft
       hyphen ``\\u00ad`` is removed;
    3. members of ``DOUBLE_QUOTES`` become ``"``; members of
       ``SINGLE_QUOTES | APOSTROPHES | ACCENTS`` become ``'``;
    4. prime characters become one to four ``'`` characters;
    5. the ellipsis character becomes ``...`` and the spaced form
       ``' . . . '`` becomes ``' ... '``;
    6. every member of ``SLASHES`` becomes ``/``.

    No tilde replacement is performed.
    """
    # 1. Control characters: drop most, turn the three "spacing" ones into a space.
    for ctrl in CONTROLS:
        text = text.replace(ctrl, '')
    for spacing_ctrl in ('\u000b', '\u000c', '\u0085'):
        text = text.replace(spacing_ctrl, ' ')

    # 2. Dashes of every flavour collapse to the ASCII hyphen-minus.
    for dash in HYPHENS | MINUSES:
        text = text.replace(dash, '-')
    text = text.replace('\u00ad', '')

    # 3. Quotes.
    for dq in DOUBLE_QUOTES:
        text = text.replace(dq, '"')  # \u0022
    for sq in (SINGLE_QUOTES | APOSTROPHES | ACCENTS):
        text = text.replace(sq, "'")  # \u0027

    # 4. Primes expand to as many ASCII apostrophes as their multiplicity.
    prime_expansions = (
        ('′', "'"),     # \u2032 prime
        ('‵', "'"),     # \u2035 reversed prime
        ('″', "''"),    # \u2033 double prime
        ('‶', "''"),    # \u2036 reversed double prime
        ('‴', "'''"),   # \u2034 triple prime
        ('‷', "'''"),   # \u2037 reversed triple prime
        ('⁗', "''''"),  # \u2057 quadruple prime
    )
    for prime, ascii_form in prime_expansions:
        text = text.replace(prime, ascii_form)

    # 5. Ellipses: the single character first, then the spaced-out spelling.
    text = text.replace('…', '...')  # \u2026
    text = text.replace(' . . . ', ' ... ')

    # 6. Slash look-alikes.
    for slash_variant in SLASHES:
        text = text.replace(slash_variant, '/')

    return text