Skip to content

Commit

Permalink
ignore separator symbols
Browse files Browse the repository at this point in the history
  • Loading branch information
Benedikt Fuchs committed Feb 9, 2024
1 parent 17e2895 commit 2579a0b
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 0 deletions.
3 changes: 3 additions & 0 deletions flair/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -1106,6 +1106,9 @@ def __remove_zero_width_characters(text: str) -> str:
text = text.replace("\u200b", "")
text = text.replace("\ufe0f", "")
text = text.replace("\ufeff", "")

text = text.replace("\u2028", "") # LINE SEPARATOR & PARAGRAPH SEPARATOR are usually used for wrapping & displaying texts,
text = text.replace("\u2029", "") # but not for semantic meaning -> ignore them.
return text

@staticmethod
Expand Down
7 changes: 7 additions & 0 deletions tests/test_tokenize_sentence.py
Original file line number Diff line number Diff line change
Expand Up @@ -485,5 +485,12 @@ def test_token_positions_when_creating_word_by_word():
assert sentence.tokens[2].end_position == 13


def test_line_separator_is_ignored():
    """Check that a U+2028 LINE SEPARATOR does not change a sentence's original text.

    Two sentences are built from the same hyphenated word, one of them with a
    U+2028 character inserted after the hyphen; both must round-trip to the
    same string via ``to_original_text()``.
    """
    with_separator = "Untersuchungs-\u2028ausschüsse"
    without_separator = "Untersuchungs-ausschüsse"

    assert Sentence(with_separator).to_original_text() == Sentence(without_separator).to_original_text()


def no_op_tokenizer(text: str) -> List[str]:
    """Return ``text`` unchanged, wrapped in a one-element list.

    No splitting is performed: whatever string is passed in comes back as the
    single entry of the returned list (including the empty string).
    """
    return [text]

0 comments on commit 2579a0b

Please sign in to comment.