diff --git a/flair/data.py b/flair/data.py
index dd8304405..0ce60dfa0 100644
--- a/flair/data.py
+++ b/flair/data.py
@@ -1106,6 +1106,11 @@ def __remove_zero_width_characters(text: str) -> str:
         text = text.replace("\u200b", "")
         text = text.replace("\ufe0f", "")
         text = text.replace("\ufeff", "")
+
+        # U+2028 LINE SEPARATOR and U+2029 PARAGRAPH SEPARATOR are usually used for wrapping
+        # and displaying text, not for semantic meaning, so they are removed as well.
+        text = text.replace("\u2028", "")
+        text = text.replace("\u2029", "")
         return text
 
     @staticmethod
diff --git a/tests/test_tokenize_sentence.py b/tests/test_tokenize_sentence.py
index baeacb408..fd049b642 100644
--- a/tests/test_tokenize_sentence.py
+++ b/tests/test_tokenize_sentence.py
@@ -485,5 +485,14 @@ def test_token_positions_when_creating_word_by_word():
     assert sentence.tokens[2].end_position == 13
 
 
+def test_line_separator_is_ignored():
+    # A U+2028 or U+2029 separator inside the text must not change the sentence's original text.
+    without_separator = "Untersuchungs-ausschüsse"
+
+    for separator in ("\u2028", "\u2029"):
+        with_separator = f"Untersuchungs-{separator}ausschüsse"
+        assert Sentence(with_separator).to_original_text() == Sentence(without_separator).to_original_text()
+
+
 def no_op_tokenizer(text: str) -> List[str]:
     return [text]