From 6703b173aa551f009582f4f45269d33993721376 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mario=20S=C3=A4nger?=
Date: Fri, 19 Jan 2024 10:39:28 +0100
Subject: [PATCH 1/2] Fix linking of sentences in sentence splitter

---
 flair/splitter.py               | 25 +++++++++---
 tests/test_tokenize_sentence.py | 68 ++++++++++++++++++++++++---------
 2 files changed, 70 insertions(+), 23 deletions(-)

diff --git a/flair/splitter.py b/flair/splitter.py
index 90464bfc2..32431f503 100644
--- a/flair/splitter.py
+++ b/flair/splitter.py
@@ -25,8 +25,23 @@ class SentenceSplitter(ABC):
         the sentence splitter's configuration.
     """
 
+    def split(self, text: str, link_sentences: Optional[bool] = True) -> List[Sentence]:
+        sentences = self._perform_split(text)
+        if not link_sentences:
+            return sentences
+
+        num_sentences = len(sentences)
+        for i, sentence in enumerate(sentences):
+            if i > 0:
+                sentence._previous_sentence = sentences[i - 1]
+
+            if i + 1 < num_sentences:
+                sentence._next_sentence = sentences[i + 1]
+
+        return sentences
+
     @abstractmethod
-    def split(self, text: str) -> List[Sentence]:
+    def _perform_split(self, text: str) -> List[Sentence]:
         raise NotImplementedError
 
     @property
@@ -54,7 +69,7 @@ def __init__(self, tokenizer: Tokenizer = SegtokTokenizer()) -> None:
         super().__init__()
         self._tokenizer = tokenizer
 
-    def split(self, text: str) -> List[Sentence]:
+    def _perform_split(self, text: str) -> List[Sentence]:
         plain_sentences: List[str] = split_multi(text)
 
         sentence_offset = 0
@@ -125,7 +140,7 @@ def __init__(self, model: Union[Any, str], tokenizer: Optional[Tokenizer] = None
         else:
             self._tokenizer = tokenizer
 
-    def split(self, text: str) -> List[Sentence]:
+    def _perform_split(self, text: str) -> List[Sentence]:
         document = self.model(text)
 
         sentences = [
@@ -184,7 +199,7 @@ def __init__(self, tag: str, tokenizer: Tokenizer = SegtokTokenizer()) -> None:
         self._tokenizer = tokenizer
         self.tag = tag
 
-    def split(self, text: str) -> List[Sentence]:
+    def _perform_split(self, text: str) -> List[Sentence]:
         plain_sentences = text.split(self.tag)
 
         sentences = []
@@ -244,7 +259,7 @@ def __init__(self, tokenizer: Tokenizer = SegtokTokenizer()) -> None:
         super().__init__()
         self._tokenizer = tokenizer
 
-    def split(self, text: str) -> List[Sentence]:
+    def _perform_split(self, text: str) -> List[Sentence]:
         return [Sentence(text=text, use_tokenizer=self._tokenizer, start_position=0)]
 
     @property
diff --git a/tests/test_tokenize_sentence.py b/tests/test_tokenize_sentence.py
index a3a4d52ba..baeacb408 100644
--- a/tests/test_tokenize_sentence.py
+++ b/tests/test_tokenize_sentence.py
@@ -202,7 +202,7 @@ def test_create_sentence_using_scispacy_tokenizer():
 
 def test_split_text_segtok():
     segtok_splitter = SegtokSentenceSplitter()
-    sentences = segtok_splitter.split("I love Berlin. Berlin is a great city.")
+    sentences = segtok_splitter._perform_split("I love Berlin. Berlin is a great city.")
     assert len(sentences) == 2
     assert sentences[0].start_position == 0
     assert len(sentences[0].tokens) == 4
@@ -210,7 +210,7 @@ def test_split_text_segtok():
     assert len(sentences[1].tokens) == 6
 
     segtok_splitter = SegtokSentenceSplitter(tokenizer=TokenizerWrapper(no_op_tokenizer))
-    sentences = segtok_splitter.split("I love Berlin. Berlin is a great city.")
+    sentences = segtok_splitter._perform_split("I love Berlin. Berlin is a great city.")
     assert len(sentences) == 2
     assert sentences[0].start_position == 0
     assert len(sentences[0].tokens) == 1
@@ -220,13 +220,13 @@ def test_split_text_segtok():
 
 def test_split_text_nosplit():
     no_splitter = NoSentenceSplitter()
-    sentences = no_splitter.split("I love Berlin")
+    sentences = no_splitter._perform_split("I love Berlin")
     assert len(sentences) == 1
     assert sentences[0].start_position == 0
     assert len(sentences[0].tokens) == 3
 
     no_splitter = NoSentenceSplitter(TokenizerWrapper(no_op_tokenizer))
-    sentences = no_splitter.split("I love Berlin")
+    sentences = no_splitter._perform_split("I love Berlin")
     assert len(sentences) == 1
     assert sentences[0].start_position == 0
     assert len(sentences[0].tokens) == 1
@@ -235,7 +235,7 @@ def test_split_text_nosplit():
 def test_split_text_on_tag():
     tag_splitter = TagSentenceSplitter(tag="#!")
 
-    sentences = tag_splitter.split("I love Berlin#!Me too")
+    sentences = tag_splitter._perform_split("I love Berlin#!Me too")
     assert len(sentences) == 2
     assert sentences[0].start_position == 0
     assert len(sentences[0].tokens) == 3
@@ -243,27 +243,27 @@ def test_split_text_on_tag():
     assert len(sentences[1].tokens) == 2
 
     tag_splitter = TagSentenceSplitter(tag="#!", tokenizer=TokenizerWrapper(no_op_tokenizer))
-    sentences = tag_splitter.split("I love Berlin#!Me too")
+    sentences = tag_splitter._perform_split("I love Berlin#!Me too")
     assert len(sentences) == 2
     assert sentences[0].start_position == 0
     assert len(sentences[0].tokens) == 1
     assert sentences[1].start_position == 15
     assert len(sentences[1].tokens) == 1
 
-    sentences = tag_splitter.split("I love Berlin Me too")
+    sentences = tag_splitter._perform_split("I love Berlin Me too")
     assert len(sentences) == 1
 
-    sentences = tag_splitter.split("I love Berlin#!#!Me too")
+    sentences = tag_splitter._perform_split("I love Berlin#!#!Me too")
     assert len(sentences) == 2
 
-    sentences = tag_splitter.split("I love Berl#! #!inMe too")
+    sentences = tag_splitter._perform_split("I love Berl#! #!inMe too")
     assert len(sentences) == 2
 
 
 def test_split_text_on_newline():
     newline_splitter = NewlineSentenceSplitter()
 
-    sentences = newline_splitter.split("I love Berlin\nMe too")
+    sentences = newline_splitter._perform_split("I love Berlin\nMe too")
     assert len(sentences) == 2
     assert sentences[0].start_position == 0
     assert len(sentences[0].tokens) == 3
@@ -271,34 +271,66 @@ def test_split_text_on_newline():
     assert len(sentences[1].tokens) == 2
 
     newline_splitter = NewlineSentenceSplitter(tokenizer=TokenizerWrapper(no_op_tokenizer))
-    sentences = newline_splitter.split("I love Berlin\nMe too")
+    sentences = newline_splitter._perform_split("I love Berlin\nMe too")
     assert len(sentences) == 2
     assert len(sentences[0].tokens) == 1
     assert sentences[1].start_position == 14
     assert len(sentences[1].tokens) == 1
 
-    sentences = newline_splitter.split("I love Berlin Me too")
+    sentences = newline_splitter._perform_split("I love Berlin Me too")
     assert len(sentences) == 1
 
-    sentences = newline_splitter.split("I love Berlin\n\nMe too")
+    sentences = newline_splitter._perform_split("I love Berlin\n\nMe too")
     assert len(sentences) == 2
 
-    sentences = newline_splitter.split("I love Berlin\n \nMe too")
+    sentences = newline_splitter._perform_split("I love Berlin\n \nMe too")
     assert len(sentences) == 2
 
 
+def test_split_sentence_linkage():
+    splitter = SegtokSentenceSplitter()
+
+    text = "This is a single sentence."
+    sentences = splitter.split(text)
+
+    assert len(sentences) == 1
+    assert sentences[0].previous_sentence() is None
+    assert sentences[0].next_sentence() is None
+
+    text = "This is a sentence. This is another sentence. This is yet another sentence."
+    sentences = splitter.split(text)
+
+    assert len(sentences) == 3
+    assert sentences[0].previous_sentence() is None
+    assert sentences[0].next_sentence() == sentences[1]
+    assert sentences[1].previous_sentence() == sentences[0]
+    assert sentences[1].next_sentence() == sentences[2]
+    assert sentences[2].previous_sentence() == sentences[1]
+    assert sentences[2].next_sentence() is None
+
+
+def test_split_sentence_linkage_false():
+    splitter = SegtokSentenceSplitter()
+
+    text = "This is a sentence. This is another sentence. This is yet another sentence."
+    sentences = splitter.split(text, link_sentences=False)
+
+    assert len(sentences) == 3
+    assert all(s.next_sentence() is None and s.previous_sentence() is None for s in sentences)
+
+
 @pytest.mark.skip(reason="SpacySentenceSplitter need optional requirements, so we skip the test by default")
 def test_split_text_spacy():
     spacy_splitter = SpacySentenceSplitter("en_core_sci_sm")
 
-    sentences = spacy_splitter.split("This a sentence. And here is another one.")
+    sentences = spacy_splitter._perform_split("This a sentence. And here is another one.")
     assert len(sentences) == 2
     assert sentences[0].start_position == 0
     assert len(sentences[0].tokens) == 4
     assert sentences[1].start_position == 17
     assert len(sentences[1].tokens) == 6
 
-    sentences = spacy_splitter.split("VF inhibits something. ACE-dependent (GH+) issuses too.")
+    sentences = spacy_splitter._perform_split("VF inhibits something. ACE-dependent (GH+) issuses too.")
     assert len(sentences) == 2
     assert sentences[0].start_position == 0
     assert len(sentences[0].tokens) == 4
@@ -306,7 +338,7 @@ def test_split_text_spacy():
     assert len(sentences[1].tokens) == 7
 
     spacy_splitter = SpacySentenceSplitter("en_core_sci_sm", tokenizer=TokenizerWrapper(no_op_tokenizer))
-    sentences = spacy_splitter.split("This a sentence. And here is another one.")
+    sentences = spacy_splitter._perform_split("This a sentence. And here is another one.")
     assert len(sentences) == 2
     assert sentences[0].start_position == 0
     assert len(sentences[0].tokens) == 1
@@ -317,7 +349,7 @@ def test_split_text_spacy():
 @pytest.mark.skip(reason="SciSpacySentenceSplitter need optional requirements, so we skip the test by default")
 def test_split_text_scispacy():
     scispacy_splitter = SciSpacySentenceSplitter()
-    sentences = scispacy_splitter.split("VF inhibits something. ACE-dependent (GH+) issuses too.")
+    sentences = scispacy_splitter._perform_split("VF inhibits something. ACE-dependent (GH+) issuses too.")
     assert len(sentences) == 2
     assert sentences[0].start_position == 0
     assert len(sentences[0].tokens) == 4

From fb49beff06ff8dcc8bdd18ff0803f09b33c221ed Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mario=20S=C3=A4nger?=
Date: Wed, 24 Jan 2024 13:40:51 +0100
Subject: [PATCH 2/2] Use existing method for setting the sentence contexts

---
 flair/splitter.py | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/flair/splitter.py b/flair/splitter.py
index 32431f503..9f7e502c8 100644
--- a/flair/splitter.py
+++ b/flair/splitter.py
@@ -30,14 +30,7 @@ def split(self, text: str, link_sentences: Optional[bool] = True) -> List[Senten
         if not link_sentences:
             return sentences
 
-        num_sentences = len(sentences)
-        for i, sentence in enumerate(sentences):
-            if i > 0:
-                sentence._previous_sentence = sentences[i - 1]
-
-            if i + 1 < num_sentences:
-                sentence._next_sentence = sentences[i + 1]
-
+        Sentence.set_context_for_sentences(sentences)
         return sentences
 
     @abstractmethod
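
For reference, a minimal usage sketch of the behaviour introduced by these two patches (assuming the patched flair API; the sample text and the assertions mirror the added test_split_sentence_linkage tests, so nothing here goes beyond what the patch itself exercises):

    from flair.splitter import SegtokSentenceSplitter

    splitter = SegtokSentenceSplitter()

    # split() now links neighbouring sentences, so document-level context is
    # available to downstream components.
    sentences = splitter.split("This is a sentence. This is another sentence.")
    assert sentences[0].next_sentence() == sentences[1]
    assert sentences[1].previous_sentence() == sentences[0]

    # Passing link_sentences=False returns unconnected Sentence objects,
    # matching the behaviour before this change.
    unlinked = splitter.split("This is a sentence. This is another sentence.", link_sentences=False)
    assert unlinked[0].next_sentence() is None
    assert unlinked[1].previous_sentence() is None

The second patch keeps this behaviour but delegates the linking to the existing Sentence.set_context_for_sentences helper instead of setting the private attributes by hand.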