From 2579a0b565bff26098588b2555ac85c1ae3e117f Mon Sep 17 00:00:00 2001
From: Benedikt Fuchs <benedikt.fuchs@rise-world.com>
Date: Fri, 9 Feb 2024 15:22:10 +0100
Subject: [PATCH] ignore separator symbols

---
 flair/data.py                   | 3 +++
 tests/test_tokenize_sentence.py | 7 +++++++
 2 files changed, 10 insertions(+)

diff --git a/flair/data.py b/flair/data.py
index dd8304405..0ce60dfa0 100644
--- a/flair/data.py
+++ b/flair/data.py
@@ -1106,6 +1106,9 @@ def __remove_zero_width_characters(text: str) -> str:
         text = text.replace("\u200b", "")
         text = text.replace("\ufe0f", "")
         text = text.replace("\ufeff", "")
+
+        text = text.replace("\u2028", "")  # U+2028 LINE SEPARATOR: usually layout-only (wrapping/display) -> ignore
+        text = text.replace("\u2029", "")  # U+2029 PARAGRAPH SEPARATOR: likewise layout-only -> ignore
         return text
 
     @staticmethod
diff --git a/tests/test_tokenize_sentence.py b/tests/test_tokenize_sentence.py
index baeacb408..fd049b642 100644
--- a/tests/test_tokenize_sentence.py
+++ b/tests/test_tokenize_sentence.py
@@ -485,5 +485,12 @@ def test_token_positions_when_creating_word_by_word():
     assert sentence.tokens[2].end_position == 13
 
 
+def test_line_separator_is_ignored():
+    with_separator = "Untersuchungs-\u2028ausschüsse"
+    without_separator = "Untersuchungs-ausschüsse"
+    # Both inputs must yield the same original text, i.e. U+2028 (LINE SEPARATOR) leaves no trace.
+    assert Sentence(with_separator).to_original_text() == Sentence(without_separator).to_original_text()
+
+
 def no_op_tokenizer(text: str) -> List[str]:
     return [text]