Skip to content

Commit

Permalink
Merge pull request #3404 from flairNLP/ignore_separator_symbols
Browse files Browse the repository at this point in the history
ignore separator symbols
  • Loading branch information
alanakbik authored Mar 6, 2024
2 parents d55c0e9 + f34e9c7 commit 74050e1
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 0 deletions.
5 changes: 5 additions & 0 deletions flair/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -1106,6 +1106,11 @@ def __remove_zero_width_characters(text: str) -> str:
text = text.replace("\u200b", "")
text = text.replace("\ufe0f", "")
text = text.replace("\ufeff", "")

text = text.replace(
"\u2028", ""
) # LINE SEPARATOR & PARAGRAPH SEPARATOR are usually used for wrapping & displaying texts,
text = text.replace("\u2029", "") # but not for semantic meaning -> ignore them.
return text

@staticmethod
Expand Down
7 changes: 7 additions & 0 deletions tests/test_tokenize_sentence.py
Original file line number Diff line number Diff line change
Expand Up @@ -485,5 +485,12 @@ def test_token_positions_when_creating_word_by_word():
assert sentence.tokens[2].end_position == 13


def test_line_separator_is_ignored():
    """Check that Unicode line/paragraph separators do not alter the sentence text.

    A word containing U+2028 (LINE SEPARATOR) or U+2029 (PARAGRAPH SEPARATOR)
    must yield the same original text as the same word without the separator.
    """
    without_separator = "Untersuchungs-ausschüsse"
    expected_text = Sentence(without_separator).to_original_text()

    # Both separators are layout-only characters, so each one is exercised
    # rather than just the line separator.
    for separator in ("\u2028", "\u2029"):
        with_separator = f"Untersuchungs-{separator}ausschüsse"
        assert Sentence(with_separator).to_original_text() == expected_text


def no_op_tokenizer(text: str) -> List[str]:
    """Return the input text unchanged as a single-element token list.

    No splitting is performed: the whole string, including an empty one,
    becomes exactly one token.
    """
    return [text]

0 comments on commit 74050e1

Please sign in to comment.