flairNLP · alanakbik · Apr 5, 2024 · Feb 23, 2024 · Feb 23, 2024 · Feb 23, 2024
diff --git a/README.md b/README.md
@@ -14,7 +14,7 @@ Flair is:
 
 * **A powerful NLP library.** Flair allows you to apply our state-of-the-art natural language processing (NLP)
 models to your text, such as named entity recognition (NER), sentiment analysis, part-of-speech tagging (PoS),
-  special support for [biomedical data](/resources/docs/HUNFLAIR.md),
+  special support for [biomedical texts](/resources/docs/HUNFLAIR2.md),
  sense disambiguation and classification, with support for a rapidly growing number of languages.
 
 * **A text embedding library.** Flair has simple interfaces that allow you to use and combine different word and

diff --git a/flair/models/__init__.py b/flair/models/__init__.py
@@ -1,4 +1,4 @@
 from .clustering import ClusteringModel
 from .entity_linker_model import SpanClassifier
 from .entity_mention_linking import EntityMentionLinker
 from .language_model import LanguageModel
@@ -10,6 +10,7 @@
 from .relation_classifier_model import RelationClassifier
 from .relation_extractor_model import RelationExtractor
 from .sequence_tagger_model import SequenceTagger
+from .prefixed_tagger import PrefixedSequenceTagger # This import has to be after SequenceTagger!
 from .tars_model import FewshotClassifier, TARSClassifier, TARSTagger
 from .text_classification_model import TextClassifier
 from .text_regression_model import TextRegressor
@@ -26,6 +27,7 @@
     "RelationExtractor",
     "RegexpTagger",
     "SequenceTagger",
+    "PrefixedSequenceTagger",
     "TokenClassifier",
     "WordTagger",
     "FewshotClassifier",

diff --git a/flair/models/entity_mention_linking.py b/flair/models/entity_mention_linking.py
@@ -1,6 +1,7 @@
 import inspect
 import logging
 import os
+import platform
 import re
 import stat
 import string
@@ -836,9 +837,13 @@ def extract_entities_mentions(self, sentence: Sentence, entity_label_types: Dict
         if any(label in ["diseases", "genes", "species", "chemical"] for label in sentence.annotation_layers):
             if not self._warned_legacy_sequence_tagger:
                 logger.warning(
-                    "The tagger `Classifier.load('hunflair') is deprecated. Please update to: `Classifier.load('hunflair2')`."
+                    "It appears that the sentences have been annotated with HunFlair (version 1). "
+                    "Consider using HunFlair2 for improved extraction performance: Classifier.load('hunflair2'). "
+                    "See https://github.com/flairNLP/flair/blob/master/resources/docs/HUNFLAIR2.md for further "
+                    "information."
                 )
                 self._warned_legacy_sequence_tagger = True
+
             entity_types = {e for sublist in entity_label_types.values() for e in sublist}
             entities_mentions = [
                 label for label in sentence.get_labels() if normalize_entity_type(label.value) in entity_types
@@ -935,6 +940,14 @@ def _fetch_model(model_name: str) -> str:
         if model_name in hf_model_map:
             model_name = hf_model_map[model_name]
 
+            if platform.system() == "Windows":
+                logger.warning(
+                    "You seem to run your application on a Windows system. Unfortunately, the abbreviation "
+                    "resolution of HunFlair2 is only available on Linux/Mac systems. Therefore, a model "
+                    "without abbreviation resolution is loaded."
+                )
+                model_name += "-no-ab3p"
+
         return hf_download(model_name)
 
     @classmethod

diff --git a/flair/models/multitask_model.py b/flair/models/multitask_model.py
@@ -1,4 +1,4 @@
 import logging
 import random
 import typing
 from pathlib import Path
@@ -245,7 +245,7 @@
         return self._label_type
 
     @staticmethod
-    def _fetch_model(model_name) -> str:
+    def _fetch_model(model_name: str) -> str:
         model_map = {}
         hu_path: str = "https://nlp.informatik.hu-berlin.de/resources/models"
 
@@ -260,6 +260,14 @@
 
         cache_dir = Path("models")
         if model_name in model_map:
+            if model_name.startswith("hunflair") or model_name == "bioner":
+                log.warning(
+                    "HunFlair (version 1) is deprecated. Consider using HunFlair2 for improved extraction performance: "
+                    "Classifier.load('hunflair2'). "
+                    "See https://github.com/flairNLP/flair/blob/master/resources/docs/HUNFLAIR2.md for further "
+                    "information."
+                )
+
             model_name = cached_path(model_map[model_name], cache_dir=cache_dir)
 
         return model_name

diff --git a/flair/models/prefixed_tagger.py b/flair/models/prefixed_tagger.py
@@ -9,6 +9,7 @@
 import flair.data
 from flair.data import Corpus, Sentence, Token
 from flair.datasets import DataLoader, FlairDatapointDataset
+from flair.file_utils import hf_download
 from flair.models import SequenceTagger
 
 
@@ -317,3 +318,21 @@ def augment_sentences(
             sentences = [sentences]
 
         return [self.augmentation_strategy.augment_sentence(sentence, annotation_layers) for sentence in sentences]
+
+    @staticmethod
+    def _fetch_model(model_name: str) -> str:
+        """Resolve a model name to something loadable from disk.
+
+        An existing local path is returned unchanged. Otherwise the name is
+        translated via the pre-configured short names (e.g. "hunflair2") and
+        passed to `hf_download`, whose result is returned.
+        """
+        huggingface_model_map = {"hunflair2": "hunflair/hunflair2-ner"}
+
+        # a valid local file takes precedence over any download
+        if Path(model_name).exists():
+            return model_name
+
+        # pre-configured short names are replaced by their mapped model name;
+        # any other name is handed to hf_download as-is
+        return hf_download(huggingface_model_map.get(model_name, model_name))
diff --git a/flair/models/sequence_tagger_model.py b/flair/models/sequence_tagger_model.py
@@ -781,6 +781,14 @@ def _fetch_model(model_name) -> str:
         elif model_name in hu_model_map:
             model_path = cached_path(hu_model_map[model_name], cache_dir=cache_dir)
 
+            if model_name.startswith("hunflair"):
+                log.warning(
+                    "HunFlair (version 1) is deprecated. Consider using HunFlair2 for improved extraction performance: "
+                    "Classifier.load('hunflair2'). "
+                    "See https://github.com/flairNLP/flair/blob/master/resources/docs/HUNFLAIR2.md for further "
+                    "information."
+                )
+
         # special handling for the taggers by the @redewiegergabe project (TODO: move to model hub)
         elif model_name == "de-historic-indirect":
             model_file = flair.cache_root / cache_dir / "indirect" / "final-model.pt"

diff --git a/resources/docs/HUNFLAIR.md b/resources/docs/HUNFLAIR.md
@@ -8,6 +8,9 @@ NER data sets](HUNFLAIR_CORPORA.md) and comes with a Flair language model ("pubm
 FastText embeddings ("pubmed") that were trained on roughly 3 million full texts and about
 25 million abstracts from the biomedical domain.
 
+**<span style="color:red">Using HunFlair (version 1) is deprecated, please refer to [HunFlair2](HUNFLAIR2.md)
+for an updated and improved version.</span>**
+
 <b>Content:</b>
 [Quick Start](#quick-start) |
 [BioNER-Tool Comparison](#comparison-to-other-biomedical-ner-tools) |

diff --git a/resources/docs/HUNFLAIR2.md b/resources/docs/HUNFLAIR2.md
@@ -0,0 +1,135 @@
+# HunFlair2
+
+*HunFlair2* is a state-of-the-art named entity tagger and linker for biomedical texts. It comes with
+models for genes/proteins, chemicals, diseases, species and cell lines. *HunFlair2*
+builds on pretrained domain-specific language models and outperforms other biomedical
+NER tools on unseen corpora.
+
+<b>Content:</b>
+[Quick Start](#quick-start) |
+[Tool Comparison](#comparison-to-other-biomedical-entity-extraction-tools) |
+[Tutorials](#tutorials) |
+[Citing HunFlair2](#citing-hunflair2)
+
+## Quick Start
+
+#### Requirements and Installation
+*HunFlair2* is based on Flair 0.13+ and Python 3.8+. If you do not have Python 3.8, install it first.
+Then, in your favorite virtual environment, simply do:
+```
+pip install flair
+```
+
+#### Example 1: Biomedical NER 
+Let's run named entity recognition (NER) over an example sentence. All you need to do is
+make a Sentence, load a pre-trained model and use it to predict tags for the sentence:
+```python
+from flair.data import Sentence
+from flair.nn import Classifier
+
+# make a sentence 
+sentence = Sentence("Behavioral abnormalities in the Fmr1 KO2 Mouse Model of Fragile X Syndrome")
+
+# load biomedical NER tagger
+tagger = Classifier.load("hunflair2")
+
+# tag sentence
+tagger.predict(sentence)
+```
+Done! The Sentence now has entity annotations. Let's print the entities found by the tagger:
+```python
+for entity in sentence.get_labels():
+    print(entity)
+```
+This should print:
+```console
+Span[0:2]: "Behavioral abnormalities" → Disease (1.0)
+Span[4:5]: "Fmr1" → Gene (1.0)
+Span[6:7]: "Mouse" → Species (1.0)
+Span[9:12]: "Fragile X Syndrome" → Disease (1.0)
+```
+
+#### Example 2: Biomedical NEN
+For improved integration and aggregation of information from multiple documents, linking / normalizing the entities to
+standardized ontologies or knowledge bases is required. Let's perform entity normalization by using
+specialized models per entity type:
+```python
+from flair.data import Sentence
+from flair.models import EntityMentionLinker
+from flair.nn import Classifier
+
+# make a sentence
+sentence = Sentence("Behavioral abnormalities in the Fmr1 KO2 Mouse Model of Fragile X Syndrome")
+
+# load biomedical NER tagger + predict entities
+tagger = Classifier.load("hunflair2")
+tagger.predict(sentence)
+
+# load gene linker and perform normalization
+gene_linker = EntityMentionLinker.load("gene-linker")
+gene_linker.predict(sentence)
+
+# load disease linker and perform normalization
+disease_linker = EntityMentionLinker.load("disease-linker")
+disease_linker.predict(sentence)
+
+# load species linker and perform normalization
+species_linker = EntityMentionLinker.load("species-linker")
+species_linker.predict(sentence)
+```
+**Note**: the ontologies and knowledge bases used are pre-processed the first time the normalization is executed,
+which might take a certain amount of time. All further calls are then based on this pre-processing and run
+much faster.
+
+Done! The Sentence now has entity normalizations. Let's print the entity identifiers found by the linkers:
+```python
+for entity in sentence.get_labels("link"):
+    print(entity)
+```
+This should print:
+```console
+Span[0:2]: "Behavioral abnormalities" → MESH:D001523/name=Mental Disorders (197.9467010498047)
+Span[4:5]: "Fmr1" → 108684022/name=FRAXA (219.9510040283203)
+Span[6:7]: "Mouse" → 10090/name=Mus musculus (213.6201934814453)
+Span[9:12]: "Fragile X Syndrome" → MESH:D005600/name=Fragile X Syndrome (193.7115020751953)
+```
+
+## Comparison to other biomedical entity extraction tools
+Tools for biomedical entity extraction are typically trained and evaluated on single, rather small gold standard 
+data sets.  However, they are applied "in the wild" to a much larger collection of texts, often varying in
+topic, entity distribution, genre (e.g. patents vs. scientific articles) and text type (e.g. abstract
+vs. full text), which can lead to severe drops in performance.
+
+*HunFlair2* outperforms other biomedical entity extraction tools on corpora that were used for training neither
+*HunFlair2* nor any of the competitor tools.
+
+| Corpus                                                                                       | Entity Type | BENT  | BERN2 | PubTator Central | SciSpacy | HunFlair2   |
+|----------------------------------------------------------------------------------------------|-------------|-------|-------|------------------|----------|-------------|
+| [MedMentions](https://github.com/chanzuckerberg/MedMentions)                                 | Chemical    | 40.90 | 41.79 | 31.28            | 34.95    | *__51.17__* |
+|                                                                                              | Disease     | 45.94 | 47.33 | 41.11            | 40.78    | *__57.27__* |
+| [tmVar (v3)](https://github.com/ncbi/tmVar3?tab=readme-ov-file)                              | Gene        | 0.54  | 43.96 | *__86.02__*      | -        | 76.75       |
+| [BioID](https://biocreative.bioinformatics.udel.edu/media/store/files/2018/BC6_track1_1.pdf) | Species     | 10.35 | 14.35 | *__58.90__*      | 37.14    | 49.66       |
+|||||
+| Average                                                                                      | All         | 24.43 | 36.86 | 54.33            | 37.61    | *__58.79__* |
+
+<sub>All results are F1 scores highlighting end-to-end performance, i.e., named entity recognition and normalization,
+using partial matching of predicted text offsets with the original char offsets of the gold standard data. 
+We allow a shift by max one character.</sub>
+
+You can find detailed evaluations and discussions in [our paper](https://arxiv.org/abs/2402.12372).
+
+## Tutorials
+We provide a set of quick tutorials to get you started with *HunFlair2*:
+* [Tutorial 1: Tagging biomedical named entities](HUNFLAIR2_TUTORIAL_1_TAGGING.md)
+* [Tutorial 2: Linking biomedical named entities](HUNFLAIR2_TUTORIAL_2_LINKING.md)
+
+## Citing HunFlair2
+Please cite the following paper when using *HunFlair2*:
+~~~
+@article{sanger2024hunflair2,
+  title={HunFlair2 in a cross-corpus evaluation of biomedical named entity recognition and normalization tools},
+  author={S{\"a}nger, Mario and Garda, Samuele and Wang, Xing David and Weber-Genzel, Leon and Droop, Pia and Fuchs, Benedikt and Akbik, Alan and Leser, Ulf},
+  journal={arXiv preprint arXiv:2402.12372},
+  year={2024}
+}
+~~~
diff --git a/resources/docs/HUNFLAIR2_TUTORIAL_1_TAGGING.md b/resources/docs/HUNFLAIR2_TUTORIAL_1_TAGGING.md
@@ -0,0 +1,121 @@
+# HunFlair2 - Tutorial 1: Tagging
+
+This is part 1 of the tutorial, in which we show how to use our pre-trained *HunFlair2* models to tag your text.
+
+### Tagging with Pre-trained HunFlair2-Models
+Let's use the pre-trained *HunFlair2* model for biomedical named entity recognition (NER).
+This model was trained over multiple biomedical NER data sets and can recognize 5 different entity types,
+i.e. cell lines, chemicals, diseases, genes / proteins and species.
+```python
+from flair.nn import Classifier
+
+tagger = Classifier.load("hunflair2")
+```
+All you need to do is use the predict() method of the tagger on a sentence.
+This will add predicted tags to the tokens in the sentence.
+Let's use a sentence with four named entities:
+```python
+from flair.data import Sentence
+
+sentence = Sentence("Behavioral abnormalities in the Fmr1 KO2 Mouse Model of Fragile X Syndrome")
+
+# predict NER tags
+tagger.predict(sentence)
+
+# print the predicted tags
+for entity in sentence.get_labels():
+    print(entity)
+```
+This should print:
+```console
+Span[0:2]: "Behavioral abnormalities" → Disease (1.0)
+Span[4:5]: "Fmr1" → Gene (1.0)
+Span[6:7]: "Mouse" → Species (1.0)
+Span[9:12]: "Fragile X Syndrome" → Disease (1.0)
+```
+The output indicates that there are two diseases mentioned in the text ("_Behavioral abnormalities_" and
+"_Fragile X Syndrome_") as well as one gene ("_Fmr1_") and one species ("_Mouse_"). For each entity, the
+text span mentioning it is given together with a label consisting of a value and a score (confidence in the
+prediction). You can also get additional information, such as the position offsets of each entity
+in the sentence, in a structured way by calling the `to_dict()` method:
+
+```python
+print(sentence.to_dict())
+```
+This should print:
+```python
+{
+    'text': 'Behavioral abnormalities in the Fmr1 KO2 Mouse Model of Fragile X Syndrome', 
+    'labels': [], 
+    'entities': [
+        {'text': 'Behavioral abnormalities', 'start_pos': 0, 'end_pos': 24, 'labels': [{'value': 'Disease', 'confidence': 0.9999860525131226}]}, 
+        {'text': 'Fmr1', 'start_pos': 32, 'end_pos': 36, 'labels': [{'value': 'Gene', 'confidence': 0.9999895095825195}]}, 
+        {'text': 'Mouse', 'start_pos': 41, 'end_pos': 46, 'labels': [{'value': 'Species', 'confidence': 0.9999873638153076}]}, 
+        {'text': 'Fragile X Syndrome', 'start_pos': 56, 'end_pos': 74, 'labels': [{'value': 'Disease', 'confidence': 0.9999928871790568}]}
+      ],
+    # further sentence information
+}
+```
+
+### Using a Biomedical Tokenizer
+Tokenization, i.e. separating a text into tokens / words, is an important issue in natural language processing
+in general and biomedical text mining in particular. So far, we used a tokenizer for general domain text.
+This can be unfavourable if applied to biomedical texts.
+
+*HunFlair2* integrates [SciSpaCy](https://allenai.github.io/scispacy/), a library specially designed to work with scientific text.
+To use the library, we first have to install it and download one of its models:
+~~~
+pip install scispacy==0.5.1
+pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_sm-0.5.1.tar.gz
+~~~
+
+To use the tokenizer, we just have to pass it as a parameter when instantiating a sentence:
+```python
+from flair.tokenization import SciSpacyTokenizer
+
+sentence = Sentence("Behavioral abnormalities in the Fmr1 KO2 Mouse Model of Fragile X Syndrome",
+                    use_tokenizer=SciSpacyTokenizer())
+```
+
+### Working with longer Texts
+Often, we are concerned with complete scientific abstracts or full-texts when performing biomedical text mining, e.g.
+```python
+abstract = "Fragile X syndrome (FXS) is a developmental disorder caused by a mutation in the X-linked FMR1 gene, " \
+           "coding for the FMRP protein which is largely involved in synaptic function. FXS patients present several " \
+           "behavioral abnormalities, including hyperactivity, anxiety, sensory hyper-responsiveness, and cognitive " \
+           "deficits. Autistic symptoms, e.g., altered social interaction and communication, are also often observed: " \
+           "FXS is indeed the most common monogenic cause of autism."
+```
+
+To work with complete abstracts or full-text, we first have to split them into separate sentences.
+Again we can apply the integration of the [SciSpaCy](https://allenai.github.io/scispacy/) library:
+```python
+from flair.splitter import SciSpacySentenceSplitter
+
+# initialize the sentence splitter
+splitter = SciSpacySentenceSplitter()
+
+# split text into a list of Sentence objects
+sentences = splitter.split(abstract)
+
+# you can apply the HunFlair2 tagger directly to this list
+tagger.predict(sentences)
+```
+We can access the annotations of the single sentences by just iterating over the list:
+```python
+for sentence in sentences:
+    print(sentence.to_tagged_string())
+```
+This should print:
+~~~
+Sentence[35]: "Fragile X syndrome (FXS) is a developmental disorder caused by a mutation in the X-linked FMR1 gene, coding for the FMRP protein which is largely involved in synaptic function." \
+              → ["Fragile X syndrome"/Disease, "FXS"/Disease, "developmental disorder"/Disease, "X-linked"/Gene, "FMR1"/Gene, "FMRP"/Gene]
+Sentence[23]: "FXS patients present several behavioral abnormalities, including hyperactivity, anxiety, sensory hyper-responsiveness, and cognitive deficits." \
+              → ["FXS"/Disease, "patients"/Species, "behavioral abnormalities"/Disease, "hyperactivity"/Disease, "anxiety"/Disease, "sensory hyper-responsiveness"/Disease, "cognitive deficits"/Disease]
+Sentence[27]: "Autistic symptoms, e.g., altered social interaction and communication, are also often observed: FXS is indeed the most common monogenic cause of autism." \
+              → ["Autistic symptoms"/Disease, "altered social interaction and communication"/Disease, "FXS"/Disease, "autism"/Disease]
+~~~
+
+### Next
+Now, let us look at how to [link / normalize the entities to standard ontologies](HUNFLAIR2_TUTORIAL_2_LINKING.md) 
+in the second tutorial.