
Commit be8ee4d
Merge branch 'master' of https://github.com/mideind/Tokenizer
sveinbjornt committed May 12, 2024
2 parents c0db5e1 + 43a931e commit be8ee4d
Showing 2 changed files with 22 additions and 17 deletions.
17 changes: 5 additions & 12 deletions src/tokenizer/tokenizer.py
@@ -1453,7 +1453,6 @@ def generate_raw_tokens(
     big_text: str

     for big_text in text_or_gen:
-
         if not one_sent_per_line and not big_text:
             # An explicit empty string in the input always
             # causes a sentence split
@@ -1831,7 +1830,6 @@ def parse_mixed(
     pp = PunctuationParser()

     while rt.txt:
-
         # Handle punctuation
         yield from pp.parse(rt)
         rt, ate = pp.rt, pp.ate
@@ -2350,7 +2348,6 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]:
         if (
             token.kind == TOK.NUMBER or token.kind == TOK.YEAR
         ) and next_token.txt in SI_UNITS:
-
             value = token.number
             orig_unit = next_token.txt
             unit: str
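The effect of this branch can be observed through the library's public API. A hedged sketch (the sample sentence is an illustration, not from the commit; TOK.descr maps token kinds to their names):

import tokenizer as t

# A number followed by an SI unit such as "km" should surface as a
# single measurement token rather than as two separate tokens.
for tok in t.tokenize("Vegalengdin er 20 km."):
    print(t.TOK.descr[tok.kind], tok.txt)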
@@ -2499,12 +2496,11 @@ def parse_sentences(token_stream: Iterator[Tok]) -> Iterator[Tok]:
     exclamation marks, etc.)"""

     in_sentence = False
-    token = cast(Tok, None)
+    token: Optional[Tok] = None
     tok_begin_sentence = TOK.Begin_Sentence()
     tok_end_sentence = TOK.End_Sentence()

     try:
-
         # Maintain a one-token lookahead
         token = next(token_stream)
         while True:
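The try-block above introduces the one-token-lookahead loop that recurs in parse_phrases_1, parse_date_and_time, and parse_phrases_2 below. A minimal standalone sketch of the pattern, with illustrative names rather than the library's actual code:

from typing import Iterator, TypeVar

T = TypeVar("T")

def with_lookahead(stream: Iterator[T]) -> Iterator[T]:
    # Keep one token of lookahead so each token can be inspected
    # (and possibly merged) together with its successor.
    try:
        token = next(stream)
    except StopIteration:
        return
    for next_token in stream:
        # ... a real parser may merge token with next_token here ...
        yield token
        token = next_token
    # Flush the final token once the stream is exhausted
    yield token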
@@ -2641,7 +2637,6 @@ def parse_phrases_1(token_stream: Iterator[Tok]) -> Iterator[Tok]:

     token = cast(Tok, None)
     try:
-
         # Maintain a one-token lookahead
         token = next(token_stream)
         while True:
@@ -2700,7 +2695,6 @@ def parse_phrases_1(token_stream: Iterator[Tok]) -> Iterator[Tok]:

             # Check for [date] [year]
             if token.kind == TOK.DATE and next_token.kind == TOK.YEAR:
-
                 dt = cast(DateTimeTuple, token.val)
                 if not dt[0]:
                     # No year yet: add it
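A hedged end-to-end check of the [date] [year] merge (the sample sentence and expected behaviour are inferred from the code above, not stated in the commit):

import tokenizer as t

# The date "1. júní" and the following year should be merged into a
# single date token carrying the full (y, m, d) value.
for tok in t.tokenize("Fundurinn var haldinn 1. júní 2024."):
    print(t.TOK.descr[tok.kind], tok.txt, tok.val)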
@@ -2760,7 +2754,6 @@ def parse_date_and_time(token_stream: Iterator[Tok]) -> Iterator[Tok]:

     token = cast(Tok, None)
     try:
-
         # Maintain a one-token lookahead
         token = next(token_stream)

@@ -2920,12 +2913,10 @@ def parse_phrases_2(

     token = cast(Tok, None)
     try:
-
         # Maintain a one-token lookahead
         token = next(token_stream)

         while True:
-
             next_token = next(token_stream)

             # Logic for numbers and fractions that are partially or entirely
@@ -2945,7 +2936,6 @@

             # Check for [number] [ISK_AMOUNT|CURRENCY|PERCENTAGE]
             elif token.kind == TOK.NUMBER and next_token.kind == TOK.WORD:
-
                 if next_token.txt in AMOUNT_ABBREV:
                     # Abbreviations for ISK amounts
                     # For abbreviations, we do not know the case,
@@ -3122,7 +3112,6 @@ def mark_paragraphs(txt: str) -> str:


 def paragraphs(tokens: Iterable[Tok]) -> Iterator[List[Tuple[int, List[Tok]]]]:
-
     """Generator yielding paragraphs from token iterable. Each paragraph is a list
     of sentence tuples. Sentence tuples consist of the index of the first token
     of the sentence (the TOK.S_BEGIN token) and a list of the tokens within the
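A usage sketch of this generator, assuming paragraphs, tokenize, and mark_paragraphs are importable from the package as the definitions above suggest:

from tokenizer import mark_paragraphs, paragraphs, tokenize

# mark_paragraphs() inserts paragraph markers into newline-separated text;
# paragraphs() then groups the token stream by those markers.
text = mark_paragraphs("Fyrsta efnisgrein.\nÖnnur efnisgrein.")
for pg in paragraphs(tokenize(text)):
    for ix, sent in pg:
        # ix is the index of the sentence's TOK.S_BEGIN token
        print(ix, " ".join(tok.txt for tok in sent if tok.txt))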
@@ -3178,6 +3167,10 @@ def valid_sent(sent: Optional[List[Tok]]) -> bool:
     r"|([\+\-\$€]?\d+\,\d+(?!\.\d))" # -1234,56
     # The following regex catches English numbers with a dot only
     r"|([\+\-\$€]?\d+\.\d+(?!\,\d))" # -1234.56
+    # The following regex catches Icelandic abbreviations, e.g. a.m.k., A.M.K., þ.e.a.s.
+    r"|([^\W\d_]+\.(?:[^\W\d_]+\.)+)(?![^\W\d_]+\s)"
+    # The following regex catches degree characters, i.e. °C, °F
+    r"|(°[CF])"
     # Finally, space and punctuation
     r"|([~\s"
     + "".join("\\" + c for c in PUNCTUATION)
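The two new branches can be sanity-checked in isolation. A sketch (the pattern name and sample strings are assumptions, not part of the module):

import re

# The two branches added above, compiled on their own
NEW_BRANCHES = re.compile(
    # Multi-part Icelandic abbreviations, e.g. a.m.k., Þ.Á.M., þ.e.a.s.
    r"([^\W\d_]+\.(?:[^\W\d_]+\.)+)(?![^\W\d_]+\s)"
    # Degree characters followed by C or F, i.e. °C, °F
    r"|(°[CF])"
)

for s in ("a.m.k.", "Þ.Á.M.", "20 °C"):
    m = NEW_BRANCHES.search(s)
    print(s, "->", m.group(0) if m else None)
# a.m.k. -> a.m.k.; Þ.Á.M. -> Þ.Á.M.; 20 °C -> °C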
22 changes: 17 additions & 5 deletions test/test_tokenizer.py
@@ -63,7 +63,6 @@ def get_text_and_norm(orig: str) -> Tuple[str, str]:


 def test_single_tokens() -> None:
-
     TEST_CASES = [
         (".", TOK.PUNCTUATION),
         (",", TOK.PUNCTUATION),
@@ -616,7 +615,6 @@ def run_test(test_cases: Iterable[TestCase], **options: Any) -> None:


 def test_sentences() -> None:
-
     KIND = {
         "B": TOK.S_BEGIN,
         "E": TOK.S_END,
@@ -646,7 +644,6 @@ def test_sentences() -> None:
     }

     def test_sentence(text: str, expected: str, **options: Any) -> None:
-
         exp = expected.split()
         s = list(t.tokenize(text, **options))
         assert len(s) == len(exp)
@@ -1131,7 +1128,23 @@ def test_correct_spaces() -> None:
     s = t.correct_spaces("Jón- sem var formaður — mótmælti málinu.")
     assert s == "Jón-sem var formaður—mótmælti málinu."
     s = t.correct_spaces("Það á að geyma mjólkina við 20 ± 3 °C")
-    assert s == "Það á að geyma mjólkina við 20±3° C"
+    assert s == "Það á að geyma mjólkina við 20±3 °C"
+    s = t.correct_spaces("Við förum t.d. til Íslands o.s.frv.")
+    assert s == "Við förum t.d. til Íslands o.s.frv."
+    s = t.correct_spaces("Við förum t. d. til Íslands o. s. frv.")
+    assert (
+        s == "Við förum t. d. til Íslands o. s. frv."
+    ) # This shouldn't be corrected here
+    s = t.correct_spaces("M.a. lögum við bil.")
+    assert s == "M.a. lögum við bil."
+    s = t.correct_spaces("HANN BORÐAR Þ.Á.M. EPLI.")
+    assert s == "HANN BORÐAR Þ.Á.M. EPLI."
+    s = t.correct_spaces("Ég fór til Írlands 6.júní og þar var 17.4°C hiti eða 230.3K.")
+    assert s == "Ég fór til Írlands 6. júní og þar var 17.4 °C hiti eða 230.3 K."
+    s = t.correct_spaces(
+        "Þetta er setning.Þetta er önnur setning.Líka.En hvað með þetta?"
+    )
+    assert s == "Þetta er setning. Þetta er önnur setning. Líka. En hvað með þetta?"


 def test_abbrev() -> None:
@@ -2556,7 +2569,6 @@ def test_one_sent_per_line() -> None:


 if __name__ == "__main__":
-
     test_single_tokens()
     test_sentences()
     test_correct_spaces()
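The new expectations can be reproduced against this commit through the library's public correct_spaces function:

import tokenizer as t

print(t.correct_spaces("Ég fór til Írlands 6.júní og þar var 17.4°C hiti eða 230.3K."))
# Ég fór til Írlands 6. júní og þar var 17.4 °C hiti eða 230.3 K.
print(t.correct_spaces("Þetta er setning.Þetta er önnur setning.Líka.En hvað með þetta?"))
# Þetta er setning. Þetta er önnur setning. Líka. En hvað með þetta?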
