Merge pull request #3397 from flairNLP/fix-splitter
Fix linking of sentences in sentence splitter
alanakbik authored Jan 30, 2024

2 parents ddf3bb3 + fb49bef commit 347b9a8
Showing 2 changed files with 63 additions and 23 deletions.
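
As a quick orientation, here is a minimal usage sketch of the behavior this commit introduces, mirroring the new tests added below (the example text is illustrative, not taken from the code):

from flair.splitter import SegtokSentenceSplitter

splitter = SegtokSentenceSplitter()

# split() now links consecutive sentences to each other by default
sentences = splitter.split("This is a sentence. This is another sentence.")
assert sentences[0].next_sentence() == sentences[1]
assert sentences[1].previous_sentence() == sentences[0]

# the linking can be switched off explicitly
unlinked = splitter.split("This is a sentence. This is another sentence.", link_sentences=False)
assert unlinked[0].next_sentence() is None and unlinked[0].previous_sentence() is None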
18 changes: 13 additions & 5 deletions flair/splitter.py
@@ -25,8 +25,16 @@ class SentenceSplitter(ABC):
     the sentence splitter's configuration.
     """
 
+    def split(self, text: str, link_sentences: Optional[bool] = True) -> List[Sentence]:
+        sentences = self._perform_split(text)
+        if not link_sentences:
+            return sentences
+
+        Sentence.set_context_for_sentences(sentences)
+        return sentences
+
     @abstractmethod
-    def split(self, text: str) -> List[Sentence]:
+    def _perform_split(self, text: str) -> List[Sentence]:
         raise NotImplementedError
 
     @property
@@ -54,7 +62,7 @@ def __init__(self, tokenizer: Tokenizer = SegtokTokenizer()) -> None:
         super().__init__()
         self._tokenizer = tokenizer
 
-    def split(self, text: str) -> List[Sentence]:
+    def _perform_split(self, text: str) -> List[Sentence]:
         plain_sentences: List[str] = split_multi(text)
         sentence_offset = 0
 
@@ -125,7 +133,7 @@ def __init__(self, model: Union[Any, str], tokenizer: Optional[Tokenizer] = None
         else:
             self._tokenizer = tokenizer
 
-    def split(self, text: str) -> List[Sentence]:
+    def _perform_split(self, text: str) -> List[Sentence]:
         document = self.model(text)
 
         sentences = [
@@ -184,7 +192,7 @@ def __init__(self, tag: str, tokenizer: Tokenizer = SegtokTokenizer()) -> None:
         self._tokenizer = tokenizer
         self.tag = tag
 
-    def split(self, text: str) -> List[Sentence]:
+    def _perform_split(self, text: str) -> List[Sentence]:
         plain_sentences = text.split(self.tag)
 
         sentences = []
@@ -244,7 +252,7 @@ def __init__(self, tokenizer: Tokenizer = SegtokTokenizer()) -> None:
         super().__init__()
         self._tokenizer = tokenizer
 
-    def split(self, text: str) -> List[Sentence]:
+    def _perform_split(self, text: str) -> List[Sentence]:
         return [Sentence(text=text, use_tokenizer=self._tokenizer, start_position=0)]
 
     @property
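
With this change, concrete splitters implement _perform_split() and inherit the linking logic from the base class's split(). The following is a rough sketch of what a hypothetical custom splitter could look like under that contract; the class name and the blank-line splitting logic are invented for illustration, and a real subclass may also need to provide the base class's other members (such as the properties truncated in the hunk above):

from typing import List

from flair.data import Sentence
from flair.splitter import SentenceSplitter


class BlankLineSentenceSplitter(SentenceSplitter):
    """Hypothetical splitter that cuts the input text at blank lines."""

    def _perform_split(self, text: str) -> List[Sentence]:
        sentences: List[Sentence] = []
        offset = 0
        for chunk in text.split("\n\n"):
            if chunk.strip():
                sentences.append(Sentence(text=chunk, start_position=offset))
            offset += len(chunk) + 2  # advance past the chunk and the two newline characters
        return sentences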
68 changes: 50 additions & 18 deletions tests/test_tokenize_sentence.py
@@ -202,15 +202,15 @@ def test_create_sentence_using_scispacy_tokenizer():
 
 def test_split_text_segtok():
     segtok_splitter = SegtokSentenceSplitter()
-    sentences = segtok_splitter.split("I love Berlin. Berlin is a great city.")
+    sentences = segtok_splitter._perform_split("I love Berlin. Berlin is a great city.")
     assert len(sentences) == 2
     assert sentences[0].start_position == 0
     assert len(sentences[0].tokens) == 4
     assert sentences[1].start_position == 15
     assert len(sentences[1].tokens) == 6
 
     segtok_splitter = SegtokSentenceSplitter(tokenizer=TokenizerWrapper(no_op_tokenizer))
-    sentences = segtok_splitter.split("I love Berlin. Berlin is a great city.")
+    sentences = segtok_splitter._perform_split("I love Berlin. Berlin is a great city.")
     assert len(sentences) == 2
     assert sentences[0].start_position == 0
     assert len(sentences[0].tokens) == 1
@@ -220,13 +220,13 @@ def test_split_text_segtok():
 
 def test_split_text_nosplit():
     no_splitter = NoSentenceSplitter()
-    sentences = no_splitter.split("I love Berlin")
+    sentences = no_splitter._perform_split("I love Berlin")
     assert len(sentences) == 1
     assert sentences[0].start_position == 0
     assert len(sentences[0].tokens) == 3
 
     no_splitter = NoSentenceSplitter(TokenizerWrapper(no_op_tokenizer))
-    sentences = no_splitter.split("I love Berlin")
+    sentences = no_splitter._perform_split("I love Berlin")
     assert len(sentences) == 1
     assert sentences[0].start_position == 0
     assert len(sentences[0].tokens) == 1
@@ -235,78 +235,110 @@ def test_split_text_nosplit():
 def test_split_text_on_tag():
     tag_splitter = TagSentenceSplitter(tag="#!")
 
-    sentences = tag_splitter.split("I love Berlin#!Me too")
+    sentences = tag_splitter._perform_split("I love Berlin#!Me too")
     assert len(sentences) == 2
     assert sentences[0].start_position == 0
     assert len(sentences[0].tokens) == 3
     assert sentences[1].start_position == 15
     assert len(sentences[1].tokens) == 2
 
     tag_splitter = TagSentenceSplitter(tag="#!", tokenizer=TokenizerWrapper(no_op_tokenizer))
-    sentences = tag_splitter.split("I love Berlin#!Me too")
+    sentences = tag_splitter._perform_split("I love Berlin#!Me too")
     assert len(sentences) == 2
     assert sentences[0].start_position == 0
     assert len(sentences[0].tokens) == 1
     assert sentences[1].start_position == 15
     assert len(sentences[1].tokens) == 1
 
-    sentences = tag_splitter.split("I love Berlin Me too")
+    sentences = tag_splitter._perform_split("I love Berlin Me too")
     assert len(sentences) == 1
 
-    sentences = tag_splitter.split("I love Berlin#!#!Me too")
+    sentences = tag_splitter._perform_split("I love Berlin#!#!Me too")
     assert len(sentences) == 2
 
-    sentences = tag_splitter.split("I love Berl#! #!inMe too")
+    sentences = tag_splitter._perform_split("I love Berl#! #!inMe too")
     assert len(sentences) == 2
 
 
 def test_split_text_on_newline():
     newline_splitter = NewlineSentenceSplitter()
 
-    sentences = newline_splitter.split("I love Berlin\nMe too")
+    sentences = newline_splitter._perform_split("I love Berlin\nMe too")
     assert len(sentences) == 2
     assert sentences[0].start_position == 0
     assert len(sentences[0].tokens) == 3
     assert sentences[0].start_position == 0
     assert len(sentences[1].tokens) == 2
 
     newline_splitter = NewlineSentenceSplitter(tokenizer=TokenizerWrapper(no_op_tokenizer))
-    sentences = newline_splitter.split("I love Berlin\nMe too")
+    sentences = newline_splitter._perform_split("I love Berlin\nMe too")
     assert len(sentences) == 2
     assert len(sentences[0].tokens) == 1
     assert sentences[1].start_position == 14
     assert len(sentences[1].tokens) == 1
 
-    sentences = newline_splitter.split("I love Berlin Me too")
+    sentences = newline_splitter._perform_split("I love Berlin Me too")
     assert len(sentences) == 1
 
-    sentences = newline_splitter.split("I love Berlin\n\nMe too")
+    sentences = newline_splitter._perform_split("I love Berlin\n\nMe too")
     assert len(sentences) == 2
 
-    sentences = newline_splitter.split("I love Berlin\n \nMe too")
+    sentences = newline_splitter._perform_split("I love Berlin\n \nMe too")
     assert len(sentences) == 2
 
 
+def test_split_sentence_linkage():
+    splitter = SegtokSentenceSplitter()
+
+    text = "This is a single sentence."
+    sentences = splitter.split(text)
+
+    assert len(sentences) == 1
+    assert sentences[0].previous_sentence() is None
+    assert sentences[0].next_sentence() is None
+
+    text = "This is a sentence. This is another sentence. This is yet another sentence."
+    sentences = splitter.split(text)
+
+    assert len(sentences) == 3
+    assert sentences[0].previous_sentence() is None
+    assert sentences[0].next_sentence() == sentences[1]
+    assert sentences[1].previous_sentence() == sentences[0]
+    assert sentences[1].next_sentence() == sentences[2]
+    assert sentences[2].previous_sentence() == sentences[1]
+    assert sentences[2].next_sentence() is None
+
+
+def test_split_sentence_linkage_false():
+    splitter = SegtokSentenceSplitter()
+
+    text = "This is a sentence. This is another sentence. This is yet another sentence."
+    sentences = splitter.split(text, link_sentences=False)
+
+    assert len(sentences) == 3
+    assert all(s.next_sentence() is None and s.previous_sentence() is None for s in sentences)
+
+
 @pytest.mark.skip(reason="SpacySentenceSplitter need optional requirements, so we skip the test by default")
 def test_split_text_spacy():
     spacy_splitter = SpacySentenceSplitter("en_core_sci_sm")
 
-    sentences = spacy_splitter.split("This a sentence. And here is another one.")
+    sentences = spacy_splitter._perform_split("This a sentence. And here is another one.")
     assert len(sentences) == 2
     assert sentences[0].start_position == 0
     assert len(sentences[0].tokens) == 4
     assert sentences[1].start_position == 17
     assert len(sentences[1].tokens) == 6
 
-    sentences = spacy_splitter.split("VF inhibits something. ACE-dependent (GH+) issuses too.")
+    sentences = spacy_splitter._perform_split("VF inhibits something. ACE-dependent (GH+) issuses too.")
     assert len(sentences) == 2
     assert sentences[0].start_position == 0
     assert len(sentences[0].tokens) == 4
     assert sentences[1].start_position == 23
     assert len(sentences[1].tokens) == 7
 
     spacy_splitter = SpacySentenceSplitter("en_core_sci_sm", tokenizer=TokenizerWrapper(no_op_tokenizer))
-    sentences = spacy_splitter.split("This a sentence. And here is another one.")
+    sentences = spacy_splitter._perform_split("This a sentence. And here is another one.")
     assert len(sentences) == 2
     assert sentences[0].start_position == 0
     assert len(sentences[0].tokens) == 1
@@ -317,7 +349,7 @@ def test_split_text_spacy():
 @pytest.mark.skip(reason="SciSpacySentenceSplitter need optional requirements, so we skip the test by default")
 def test_split_text_scispacy():
     scispacy_splitter = SciSpacySentenceSplitter()
-    sentences = scispacy_splitter.split("VF inhibits something. ACE-dependent (GH+) issuses too.")
+    sentences = scispacy_splitter._perform_split("VF inhibits something. ACE-dependent (GH+) issuses too.")
     assert len(sentences) == 2
     assert sentences[0].start_position == 0
     assert len(sentences[0].tokens) == 4
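
After this fix, downstream code can walk the split result as a linked chain; a small illustrative sketch using only the accessors exercised in the tests above:

from flair.splitter import SegtokSentenceSplitter

sentences = SegtokSentenceSplitter().split("One sentence. Another sentence. A third sentence.")

# follow the next_sentence() links from the first sentence to the end of the chain
chain = [sentences[0]]
while chain[-1].next_sentence() is not None:
    chain.append(chain[-1].next_sentence())

assert len(chain) == len(sentences)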
