flairNLP · alanakbik · Mar 6, 2024 · Feb 9, 2024 · Feb 16, 2024
diff --git a/flair/data.py b/flair/data.py
@@ -1106,6 +1106,11 @@ def __remove_zero_width_characters(text: str) -> str:
         text = text.replace("\u200b", "")
         text = text.replace("\ufe0f", "")
         text = text.replace("\ufeff", "")
+
+        text = text.replace(
+            "\u2028", ""
+        )  # LINE SEPARATOR & PARAGRAPH SEPARATOR are usually used for wrapping & displaying texts,
+        text = text.replace("\u2029", "")  # but not for semantic meaning -> ignore them.
         return text
 
     @staticmethod

diff --git a/tests/test_tokenize_sentence.py b/tests/test_tokenize_sentence.py
@@ -485,5 +485,12 @@ def test_token_positions_when_creating_word_by_word():
     assert sentence.tokens[2].end_position == 13
 
 
+def test_line_separator_is_ignored():
+    """Both U+2028 LINE SEPARATOR and U+2029 PARAGRAPH SEPARATOR are layout-only and must be dropped."""
+    expected = Sentence("Untersuchungs-ausschüsse").to_original_text()
+    for separator in ("\u2028", "\u2029"):
+        assert Sentence(f"Untersuchungs-{separator}ausschüsse").to_original_text() == expected
+
+
 def no_op_tokenizer(text: str) -> List[str]:
     return [text]