Fix linking of sentences in sentence splitter #3397

Merged · 2 commits · Jan 30, 2024
flair/splitter.py — 18 changes: 13 additions & 5 deletions
@@ -25,8 +25,16 @@ class SentenceSplitter(ABC):
     the sentence splitter's configuration.
     """
 
+    def split(self, text: str, link_sentences: Optional[bool] = True) -> List[Sentence]:
+        sentences = self._perform_split(text)
+        if not link_sentences:
+            return sentences
+
+        Sentence.set_context_for_sentences(sentences)
+        return sentences
+
     @abstractmethod
-    def split(self, text: str) -> List[Sentence]:
+    def _perform_split(self, text: str) -> List[Sentence]:
         raise NotImplementedError
 
     @property
@@ -54,7 +62,7 @@ def __init__(self, tokenizer: Tokenizer = SegtokTokenizer()) -> None:
         super().__init__()
         self._tokenizer = tokenizer
 
-    def split(self, text: str) -> List[Sentence]:
+    def _perform_split(self, text: str) -> List[Sentence]:
         plain_sentences: List[str] = split_multi(text)
         sentence_offset = 0
 
@@ -125,7 +133,7 @@ def __init__(self, model: Union[Any, str], tokenizer: Optional[Tokenizer] = None
         else:
             self._tokenizer = tokenizer
 
-    def split(self, text: str) -> List[Sentence]:
+    def _perform_split(self, text: str) -> List[Sentence]:
         document = self.model(text)
 
         sentences = [
@@ -184,7 +192,7 @@ def __init__(self, tag: str, tokenizer: Tokenizer = SegtokTokenizer()) -> None:
         self._tokenizer = tokenizer
         self.tag = tag
 
-    def split(self, text: str) -> List[Sentence]:
+    def _perform_split(self, text: str) -> List[Sentence]:
         plain_sentences = text.split(self.tag)
 
         sentences = []
@@ -244,7 +252,7 @@ def __init__(self, tokenizer: Tokenizer = SegtokTokenizer()) -> None:
         super().__init__()
         self._tokenizer = tokenizer
 
-    def split(self, text: str) -> List[Sentence]:
+    def _perform_split(self, text: str) -> List[Sentence]:
         return [Sentence(text=text, use_tokenizer=self._tokenizer, start_position=0)]
 
     @property
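With this change, split() becomes the public entry point: it delegates the actual segmentation to _perform_split() and then links neighbouring sentences via Sentence.set_context_for_sentences(), unless link_sentences=False is passed. A minimal usage sketch of the new behaviour (assumes flair with this PR merged; the example text is only illustrative, and the assertions mirror the new tests below):

from flair.splitter import SegtokSentenceSplitter

splitter = SegtokSentenceSplitter()

# split() now links consecutive sentences, so context is available downstream
sentences = splitter.split("I love Berlin. Berlin is a great city.")
assert sentences[0].next_sentence() == sentences[1]
assert sentences[1].previous_sentence() == sentences[0]

# linking can be switched off explicitly
unlinked = splitter.split("I love Berlin. Berlin is a great city.", link_sentences=False)
assert unlinked[0].next_sentence() is None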
tests/test_tokenize_sentence.py — 68 changes: 50 additions & 18 deletions
@@ -202,15 +202,15 @@ def test_create_sentence_using_scispacy_tokenizer():
 
 def test_split_text_segtok():
     segtok_splitter = SegtokSentenceSplitter()
-    sentences = segtok_splitter.split("I love Berlin. Berlin is a great city.")
+    sentences = segtok_splitter._perform_split("I love Berlin. Berlin is a great city.")
     assert len(sentences) == 2
     assert sentences[0].start_position == 0
     assert len(sentences[0].tokens) == 4
     assert sentences[1].start_position == 15
     assert len(sentences[1].tokens) == 6
 
     segtok_splitter = SegtokSentenceSplitter(tokenizer=TokenizerWrapper(no_op_tokenizer))
-    sentences = segtok_splitter.split("I love Berlin. Berlin is a great city.")
+    sentences = segtok_splitter._perform_split("I love Berlin. Berlin is a great city.")
     assert len(sentences) == 2
     assert sentences[0].start_position == 0
     assert len(sentences[0].tokens) == 1
@@ -220,13 +220,13 @@ def test_split_text_nosplit():
 
 def test_split_text_nosplit():
     no_splitter = NoSentenceSplitter()
-    sentences = no_splitter.split("I love Berlin")
+    sentences = no_splitter._perform_split("I love Berlin")
     assert len(sentences) == 1
     assert sentences[0].start_position == 0
     assert len(sentences[0].tokens) == 3
 
     no_splitter = NoSentenceSplitter(TokenizerWrapper(no_op_tokenizer))
-    sentences = no_splitter.split("I love Berlin")
+    sentences = no_splitter._perform_split("I love Berlin")
     assert len(sentences) == 1
     assert sentences[0].start_position == 0
     assert len(sentences[0].tokens) == 1
@@ -235,78 +235,110 @@ def test_split_text_nosplit():
 def test_split_text_on_tag():
     tag_splitter = TagSentenceSplitter(tag="#!")
 
-    sentences = tag_splitter.split("I love Berlin#!Me too")
+    sentences = tag_splitter._perform_split("I love Berlin#!Me too")
     assert len(sentences) == 2
     assert sentences[0].start_position == 0
     assert len(sentences[0].tokens) == 3
     assert sentences[1].start_position == 15
     assert len(sentences[1].tokens) == 2
 
     tag_splitter = TagSentenceSplitter(tag="#!", tokenizer=TokenizerWrapper(no_op_tokenizer))
-    sentences = tag_splitter.split("I love Berlin#!Me too")
+    sentences = tag_splitter._perform_split("I love Berlin#!Me too")
     assert len(sentences) == 2
     assert sentences[0].start_position == 0
     assert len(sentences[0].tokens) == 1
     assert sentences[1].start_position == 15
     assert len(sentences[1].tokens) == 1
 
-    sentences = tag_splitter.split("I love Berlin Me too")
+    sentences = tag_splitter._perform_split("I love Berlin Me too")
     assert len(sentences) == 1
 
-    sentences = tag_splitter.split("I love Berlin#!#!Me too")
+    sentences = tag_splitter._perform_split("I love Berlin#!#!Me too")
     assert len(sentences) == 2
 
-    sentences = tag_splitter.split("I love Berl#! #!inMe too")
+    sentences = tag_splitter._perform_split("I love Berl#! #!inMe too")
     assert len(sentences) == 2
 
 
 def test_split_text_on_newline():
     newline_splitter = NewlineSentenceSplitter()
 
-    sentences = newline_splitter.split("I love Berlin\nMe too")
+    sentences = newline_splitter._perform_split("I love Berlin\nMe too")
     assert len(sentences) == 2
     assert sentences[0].start_position == 0
     assert len(sentences[0].tokens) == 3
     assert sentences[0].start_position == 0
     assert len(sentences[1].tokens) == 2
 
     newline_splitter = NewlineSentenceSplitter(tokenizer=TokenizerWrapper(no_op_tokenizer))
-    sentences = newline_splitter.split("I love Berlin\nMe too")
+    sentences = newline_splitter._perform_split("I love Berlin\nMe too")
     assert len(sentences) == 2
     assert len(sentences[0].tokens) == 1
     assert sentences[1].start_position == 14
     assert len(sentences[1].tokens) == 1
 
-    sentences = newline_splitter.split("I love Berlin Me too")
+    sentences = newline_splitter._perform_split("I love Berlin Me too")
     assert len(sentences) == 1
 
-    sentences = newline_splitter.split("I love Berlin\n\nMe too")
+    sentences = newline_splitter._perform_split("I love Berlin\n\nMe too")
     assert len(sentences) == 2
 
-    sentences = newline_splitter.split("I love Berlin\n \nMe too")
+    sentences = newline_splitter._perform_split("I love Berlin\n \nMe too")
     assert len(sentences) == 2
 
 
+def test_split_sentence_linkage():
+    splitter = SegtokSentenceSplitter()
+
+    text = "This is a single sentence."
+    sentences = splitter.split(text)
+
+    assert len(sentences) == 1
+    assert sentences[0].previous_sentence() is None
+    assert sentences[0].next_sentence() is None
+
+    text = "This is a sentence. This is another sentence. This is yet another sentence."
+    sentences = splitter.split(text)
+
+    assert len(sentences) == 3
+    assert sentences[0].previous_sentence() is None
+    assert sentences[0].next_sentence() == sentences[1]
+    assert sentences[1].previous_sentence() == sentences[0]
+    assert sentences[1].next_sentence() == sentences[2]
+    assert sentences[2].previous_sentence() == sentences[1]
+    assert sentences[2].next_sentence() is None
+
+
+def test_split_sentence_linkage_false():
+    splitter = SegtokSentenceSplitter()
+
+    text = "This is a sentence. This is another sentence. This is yet another sentence."
+    sentences = splitter.split(text, link_sentences=False)
+
+    assert len(sentences) == 3
+    assert all(s.next_sentence() is None and s.previous_sentence() is None for s in sentences)
+
+
 @pytest.mark.skip(reason="SpacySentenceSplitter need optional requirements, so we skip the test by default")
 def test_split_text_spacy():
     spacy_splitter = SpacySentenceSplitter("en_core_sci_sm")
 
-    sentences = spacy_splitter.split("This a sentence. And here is another one.")
+    sentences = spacy_splitter._perform_split("This a sentence. And here is another one.")
     assert len(sentences) == 2
     assert sentences[0].start_position == 0
     assert len(sentences[0].tokens) == 4
     assert sentences[1].start_position == 17
     assert len(sentences[1].tokens) == 6
 
-    sentences = spacy_splitter.split("VF inhibits something. ACE-dependent (GH+) issuses too.")
+    sentences = spacy_splitter._perform_split("VF inhibits something. ACE-dependent (GH+) issuses too.")
     assert len(sentences) == 2
     assert sentences[0].start_position == 0
     assert len(sentences[0].tokens) == 4
     assert sentences[1].start_position == 23
     assert len(sentences[1].tokens) == 7
 
     spacy_splitter = SpacySentenceSplitter("en_core_sci_sm", tokenizer=TokenizerWrapper(no_op_tokenizer))
-    sentences = spacy_splitter.split("This a sentence. And here is another one.")
+    sentences = spacy_splitter._perform_split("This a sentence. And here is another one.")
     assert len(sentences) == 2
     assert sentences[0].start_position == 0
     assert len(sentences[0].tokens) == 1
@@ -317,7 +349,7 @@ def test_split_text_spacy():
 @pytest.mark.skip(reason="SciSpacySentenceSplitter need optional requirements, so we skip the test by default")
 def test_split_text_scispacy():
     scispacy_splitter = SciSpacySentenceSplitter()
-    sentences = scispacy_splitter.split("VF inhibits something. ACE-dependent (GH+) issuses too.")
+    sentences = scispacy_splitter._perform_split("VF inhibits something. ACE-dependent (GH+) issuses too.")
     assert len(sentences) == 2
     assert sentences[0].start_position == 0
     assert len(sentences[0].tokens) == 4
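The existing tests now call _perform_split() directly where they only check raw segmentation, while the new test_split_sentence_linkage tests exercise the public split() with and without linking. For downstream code, the practical consequence is that custom splitters should override _perform_split() and inherit the linking from the base class. A rough sketch of such a subclass, assuming the base-class property layout matches the built-in splitters in this diff (ParagraphSentenceSplitter is hypothetical and not part of this PR):

from typing import List

from flair.data import Sentence
from flair.splitter import SentenceSplitter
from flair.tokenization import SegtokTokenizer, Tokenizer


class ParagraphSentenceSplitter(SentenceSplitter):
    """Hypothetical splitter that treats blank-line-separated paragraphs as sentences."""

    def __init__(self, tokenizer: Tokenizer = SegtokTokenizer()) -> None:
        super().__init__()
        self._tokenizer = tokenizer

    def _perform_split(self, text: str) -> List[Sentence]:
        sentences = []
        offset = 0
        for part in text.split("\n\n"):
            if part.strip():
                sentences.append(Sentence(text=part, use_tokenizer=self._tokenizer, start_position=offset))
            offset += len(part) + 2  # advance past the paragraph and the "\n\n" separator
        return sentences

    @property
    def tokenizer(self) -> Tokenizer:
        return self._tokenizer

Because the linking now lives in SentenceSplitter.split(), a subclass like this gets previous_sentence()/next_sentence() wiring for free.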