From b123fbcee97120d67e41d02febf2fb03da76476d Mon Sep 17 00:00:00 2001 From: thorunna Date: Thu, 2 Nov 2023 13:42:47 +0000 Subject: [PATCH 1/9] added handling for abbreviations --- src/tokenizer/tokenizer.py | 234 +++++++++---------------------------- 1 file changed, 53 insertions(+), 181 deletions(-) diff --git a/src/tokenizer/tokenizer.py b/src/tokenizer/tokenizer.py index 1d38321..73f5fa1 100644 --- a/src/tokenizer/tokenizer.py +++ b/src/tokenizer/tokenizer.py @@ -232,9 +232,7 @@ def substitute(self, span: Tuple[int, int], new: str) -> None: self.txt = self.txt[: span[0]] + new + self.txt[span[1] :] if self.origin_spans is not None: # Remove origin entries that correspond to characters that are gone. - self.origin_spans = ( - self.origin_spans[: span[0] + len(new)] + self.origin_spans[span[1] :] - ) + self.origin_spans = self.origin_spans[: span[0] + len(new)] + self.origin_spans[span[1] :] def substitute_longer(self, span: Tuple[int, int], new: str) -> None: """Substitute a span with a potentially longer string""" @@ -312,13 +310,9 @@ def concatenate( self_origin_spans = self.origin_spans or [] other_origin_spans = other.origin_spans or [] - separator_origin_spans: List[int] = ( - [len(self_original)] * len(separator) if len(other_origin_spans) > 0 else [] - ) + separator_origin_spans: List[int] = [len(self_original)] * len(separator) if len(other_origin_spans) > 0 else [] new_origin_spans = ( - self_origin_spans - + separator_origin_spans - + [i + len(self_original) for i in other_origin_spans] + self_origin_spans + separator_origin_spans + [i + len(self_original) for i in other_origin_spans] ) return Tok(new_kind, new_txt, new_val, new_original, new_origin_spans) @@ -343,9 +337,7 @@ def __getitem__(self, i: int) -> Union[int, str, ValType]: def equal(self, other: "Tok") -> bool: """Equality of content between two tokens, i.e. 
ignoring the 'original' and 'origin_spans' attributes""" - return ( - self.kind == other.kind and self.txt == other.txt and self.val == other.val - ) + return self.kind == other.kind and self.txt == other.txt and self.val == other.val def __eq__(self, o: Any) -> bool: """Full equality between two Tok instances""" @@ -587,9 +579,7 @@ def Daterel(t: Union[Tok, str], y: int, m: int, d: int) -> Tok: return t @staticmethod - def Timestamp( - t: Union[Tok, str], y: int, mo: int, d: int, h: int, m: int, s: int - ) -> Tok: + def Timestamp(t: Union[Tok, str], y: int, mo: int, d: int, h: int, m: int, s: int) -> Tok: if isinstance(t, str): return Tok(TOK.TIMESTAMP, t, (y, mo, d, h, m, s)) t.kind = TOK.TIMESTAMP @@ -597,9 +587,7 @@ def Timestamp( return t @staticmethod - def Timestampabs( - t: Union[Tok, str], y: int, mo: int, d: int, h: int, m: int, s: int - ) -> Tok: + def Timestampabs(t: Union[Tok, str], y: int, mo: int, d: int, h: int, m: int, s: int) -> Tok: if isinstance(t, str): return Tok(TOK.TIMESTAMPABS, t, (y, mo, d, h, m, s)) t.kind = TOK.TIMESTAMPABS @@ -607,9 +595,7 @@ def Timestampabs( return t @staticmethod - def Timestamprel( - t: Union[Tok, str], y: int, mo: int, d: int, h: int, m: int, s: int - ) -> Tok: + def Timestamprel(t: Union[Tok, str], y: int, mo: int, d: int, h: int, m: int, s: int) -> Tok: if isinstance(t, str): return Tok(TOK.TIMESTAMPREL, t, (y, mo, d, h, m, s)) t.kind = TOK.TIMESTAMPREL @@ -967,11 +953,7 @@ def could_be_end_of_sentence(self, i: int = 0, *args: Any) -> bool: def normalized_text(token: Tok) -> str: """Returns token text after normalizing punctuation""" - return ( - cast(Tuple[int, str], token.val)[1] - if token.kind == TOK.PUNCTUATION - else token.txt - ) + return cast(Tuple[int, str], token.val)[1] if token.kind == TOK.PUNCTUATION else token.txt def text_from_tokens(tokens: Iterable[Tok]) -> str: @@ -1234,11 +1216,7 @@ def parse_digits(tok: Tok, convert_numbers: bool) -> Tuple[Tok, Tok]: p = g.split("/") m = int(p[1]) d = int(p[0]) - if ( - p[0][0] != "0" - and p[1][0] != "0" - and ((d <= 5 and m <= 6) or (d == 1 and m <= 10)) - ): + if p[0][0] != "0" and p[1][0] != "0" and ((d <= 5 and m <= 6) or (d == 1 and m <= 10)): # This is probably a fraction, not a date # (1/2, 1/3, 1/4, 1/5, 1/6, 2/3, 2/5, 5/6 etc.) # Return a number @@ -1359,9 +1337,7 @@ def unicode_replacement(token: Tok) -> Tok: total_reduction = 0 for m in UNICODE_REGEX.finditer(token.txt): span, new_letter = m.span(), UNICODE_REPLACEMENTS[m.group(0)] - token.substitute( - (span[0] - total_reduction, span[1] - total_reduction), new_letter - ) + token.substitute((span[0] - total_reduction, span[1] - total_reduction), new_letter) total_reduction += span[1] - span[0] - len(new_letter) return token @@ -1371,9 +1347,7 @@ def html_replacement(token: Tok) -> Tok: total_reduction = 0 for m in HTML_ESCAPE_REGEX.finditer(token.txt): span, new_letter = html_escape(m) - token.substitute( - (span[0] - total_reduction, span[1] - total_reduction), new_letter - ) + token.substitute((span[0] - total_reduction, span[1] - total_reduction), new_letter) total_reduction += span[1] - span[0] - len(new_letter) return token @@ -1415,12 +1389,8 @@ def shift_span(span: Tuple[int, int], pos: int): assert match is not None # Since the match indexes the text of the original token, # we need to shift the indices so that they match the current token. 
- shifted_all_group_span = shift_span( - match.span(ROUGH_TOKEN_REGEX_ENTIRE_MATCH), -pos - ) - shifted_white_space_span = shift_span( - match.span(ROUGH_TOKEN_REGEX_WHITE_SPACE_GROUP), -pos - ) + shifted_all_group_span = shift_span(match.span(ROUGH_TOKEN_REGEX_ENTIRE_MATCH), -pos) + shifted_white_space_span = shift_span(match.span(ROUGH_TOKEN_REGEX_WHITE_SPACE_GROUP), -pos) # Then we split the current token using the shifted spans small_tok, tok = tok.split(shifted_all_group_span[SPAN_END]) # Remove whitespace characters from the start of the token @@ -1453,7 +1423,6 @@ def generate_raw_tokens( big_text: str for big_text in text_or_gen: - if not one_sent_per_line and not big_text: # An explicit empty string in the input always # causes a sentence split @@ -1592,12 +1561,7 @@ def parse(self) -> Iterable[Tok]: lw = len(rt.txt) i = 1 while i < lw and ( - rt.txt[i].isalpha() - or ( - rt.txt[i] in PUNCT_INSIDE_WORD - and i + 1 < lw - and rt.txt[i + 1].isalpha() - ) + rt.txt[i].isalpha() or (rt.txt[i] in PUNCT_INSIDE_WORD and i + 1 < lw and rt.txt[i + 1].isalpha()) ): # We allow dots to occur inside words in the case of # abbreviations; also apostrophes are allowed within @@ -1667,9 +1631,7 @@ class NumberParser: """Parses a sequence of digits off the front of a raw token""" - def __init__( - self, rt: Tok, handle_kludgy_ordinals: int, convert_numbers: bool - ) -> None: + def __init__(self, rt: Tok, handle_kludgy_ordinals: int, convert_numbers: bool) -> None: self.rt = rt self.handle_kludgy_ordinals = handle_kludgy_ordinals self.convert_numbers = convert_numbers @@ -1690,10 +1652,7 @@ def parse(self) -> Iterable[Tok]: # '1sti' -> 'fyrsti', '3ji' -> 'þriðji', etc. key_tok.substitute_longer((0, len(key)), val) yield TOK.Word(key_tok) - elif ( - handle_kludgy_ordinals == KLUDGY_ORDINALS_TRANSLATE - and key in ORDINAL_NUMBERS - ): + elif handle_kludgy_ordinals == KLUDGY_ORDINALS_TRANSLATE and key in ORDINAL_NUMBERS: # Convert word-form ordinals into ordinal tokens, # i.e. '1sti' -> TOK.Ordinal('1sti', 1), # but leave other kludgy constructs ('2ja') @@ -1822,16 +1781,13 @@ def parse(self, rt: Tok) -> Iterable[Tok]: self.ate = ate -def parse_mixed( - rt: Tok, handle_kludgy_ordinals: int, convert_numbers: bool -) -> Iterable[Tok]: +def parse_mixed(rt: Tok, handle_kludgy_ordinals: int, convert_numbers: bool) -> Iterable[Tok]: """Parse a mixed raw token string, from the token rt""" # Initialize a singleton parser for punctuation pp = PunctuationParser() while rt.txt: - # Handle punctuation yield from pp.parse(rt) rt, ate = pp.rt, pp.ate @@ -1910,8 +1866,7 @@ def parse_mixed( # Numbers or other stuff starting with a digit # (eventually prefixed by a '+' or '-') if rtxt and ( - rtxt[0] in DIGITS_PREFIX - or (rtxt[0] in SIGN_PREFIX and len(rtxt) >= 2 and rtxt[1] in DIGITS_PREFIX) + rtxt[0] in DIGITS_PREFIX or (rtxt[0] in SIGN_PREFIX and len(rtxt) >= 2 and rtxt[1] in DIGITS_PREFIX) ): np = NumberParser(rt, handle_kludgy_ordinals, convert_numbers) yield from np.parse() @@ -1977,9 +1932,7 @@ def parse_tokens(txt: Union[str, Iterable[str]], **options: Any) -> Iterator[Tok # The default behavior for kludgy ordinals is to pass them # through as word tokens - handle_kludgy_ordinals: int = options.get( - "handle_kludgy_ordinals", KLUDGY_ORDINALS_PASS_THROUGH - ) + handle_kludgy_ordinals: int = options.get("handle_kludgy_ordinals", KLUDGY_ORDINALS_PASS_THROUGH) # This code proceeds roughly as follows: # 1) The text is split into raw tokens on whitespace boundaries. 
@@ -2002,9 +1955,7 @@ def parse_tokens(txt: Union[str, Iterable[str]], **options: Any) -> Iterator[Tok rtxt: str = "" - for rt in generate_raw_tokens( - txt, replace_composite_glyphs, replace_html_escapes, one_sent_per_line - ): + for rt in generate_raw_tokens(txt, replace_composite_glyphs, replace_html_escapes, one_sent_per_line): # rt: raw token if rt.kind in {TOK.S_SPLIT, TOK.P_BEGIN, TOK.P_END}: @@ -2134,18 +2085,13 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]: and (next_token.kind == TOK.NUMBER or next_token.kind == TOK.YEAR) ): currabbr = CURRENCY_SYMBOLS[token.txt] - token = TOK.Amount( - token.concatenate(next_token), currabbr, next_token.number - ) + token = TOK.Amount(token.concatenate(next_token), currabbr, next_token.number) next_token = next(token_stream) # Special case for a DATEREL token of the form "25.10.", # i.e. with a trailing period: It can end a sentence if token.kind == TOK.DATEREL and "." in token.txt: - if ( - next_token.txt == "." - and not token_stream.could_be_end_of_sentence() - ): + if next_token.txt == "." and not token_stream.could_be_end_of_sentence(): # This is something like 'Ég fæddist 25.9. í Svarfaðardal.' y, m, d = cast(Tuple[int, int, int], token.val) token = TOK.Daterel(token.concatenate(next_token), y, m, d) @@ -2154,11 +2100,7 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]: # Coalesce abbreviations ending with a period into a single # abbreviation token if next_token.punctuation == ".": - if ( - token.kind == TOK.WORD - and token.txt[-1] != "." - and is_abbr_with_period(token.txt) - ): + if token.kind == TOK.WORD and token.txt[-1] != "." and is_abbr_with_period(token.txt): # Abbreviation ending with period: make a special token for it # and advance the input stream follow_token = next(token_stream) @@ -2181,9 +2123,7 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]: # þær þarf að vera hægt að sameina í þessa flóknari tóka en viljum # geta merkt það sem villu. Ætti líklega að setja í sérlista, # WRONG_MONTHS, og sérif-lykkju og setja inn villu í tókann. - finish = could_be_end_of_sentence( - follow_token, test_set, abbrev in NUMBER_ABBREV - ) + finish = could_be_end_of_sentence(follow_token, test_set, abbrev in NUMBER_ABBREV) if finish: # Potentially at the end of a sentence if abbrev in Abbreviations.FINISHERS: @@ -2195,10 +2135,7 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]: yield token # Set token to the period token = next_token - elif ( - abbrev in Abbreviations.NOT_FINISHERS - or abbrev.lower() in Abbreviations.NOT_FINISHERS - ): + elif abbrev in Abbreviations.NOT_FINISHERS or abbrev.lower() in Abbreviations.NOT_FINISHERS: # This is a potential abbreviation that we don't interpret # as such if it's at the end of a sentence # ('dags.', 'próf.', 'mín.'). Note that this also @@ -2207,9 +2144,7 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]: token = next_token else: # Substitute the abbreviation and eat the period - token = TOK.Word( - token.concatenate(next_token), lookup(abbrev) - ) + token = TOK.Word(token.concatenate(next_token), lookup(abbrev)) else: # 'Regular' abbreviation in the middle of a sentence: # Eat the period and yield the abbreviation as a single token @@ -2245,9 +2180,7 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]: next_token = next(token_stream) # Coalesce 'klukkan/kl. 
átta/hálfátta' into a time - elif ( - next_token.kind == TOK.WORD and next_token.txt.lower() in CLOCK_NUMBERS - ): + elif next_token.kind == TOK.WORD and next_token.txt.lower() in CLOCK_NUMBERS: if token.kind == TOK.WORD and token.txt.lower() in CLOCK_ABBREVS: # Match: coalesce and step to next token next_txt = next_token.txt.lower() @@ -2309,9 +2242,7 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]: sign = next_token.txt # Store promille as one-tenth of a percentage factor = 1.0 if sign == "%" else 0.1 - token = TOK.Percent( - token.concatenate(next_token), token.number * factor - ) + token = TOK.Percent(token.concatenate(next_token), token.number * factor) next_token = next(token_stream) # Coalesce ordinals (1. = first, 2. = second...) into a single token @@ -2337,20 +2268,13 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]: ): # OK: replace the number/Roman numeral and the period # with an ordinal token - num = ( - token.integer - if token.kind == TOK.NUMBER - else roman_to_int(token.txt) - ) + num = token.integer if token.kind == TOK.NUMBER else roman_to_int(token.txt) token = TOK.Ordinal(token.concatenate(next_token), num) # Continue with the following word next_token = next(token_stream) # Convert "1920 mm" or "30 °C" to a single measurement token - if ( - token.kind == TOK.NUMBER or token.kind == TOK.YEAR - ) and next_token.txt in SI_UNITS: - + if (token.kind == TOK.NUMBER or token.kind == TOK.YEAR) and next_token.txt in SI_UNITS: value = token.number orig_unit = next_token.txt unit: str @@ -2363,9 +2287,7 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]: assert isinstance(factor_func, float) value *= factor_func if unit in ("%", "‰"): - token = TOK.Percent( - token.concatenate(next_token, separator=" "), value - ) + token = TOK.Percent(token.concatenate(next_token, separator=" "), value) else: token = TOK.Measurement( token.concatenate(next_token, separator=" "), @@ -2463,11 +2385,7 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]: next_token = next(token_stream) # Cases such as 19 $, 199.99 $ - if ( - token.kind == TOK.NUMBER - and next_token.kind == TOK.PUNCTUATION - and next_token.txt in CURRENCY_SYMBOLS - ): + if token.kind == TOK.NUMBER and next_token.kind == TOK.PUNCTUATION and next_token.txt in CURRENCY_SYMBOLS: token = TOK.Amount( token.concatenate(next_token, separator=" "), CURRENCY_SYMBOLS.get(next_token.txt, ""), @@ -2504,7 +2422,6 @@ def parse_sentences(token_stream: Iterator[Tok]) -> Iterator[Tok]: tok_end_sentence = TOK.End_Sentence() try: - # Maintain a one-token lookahead token = next(token_stream) while True: @@ -2528,9 +2445,7 @@ def parse_sentences(token_stream: Iterator[Tok]) -> Iterator[Tok]: _skip_me.substitute((0, len(_skip_me.txt)), "") token = cast(Tok, None) # 3. 
attach them to the front of the next token - token = _skip_me.concatenate( - next(token_stream), metadata_from_other=True - ) + token = _skip_me.concatenate(next(token_stream), metadata_from_other=True) continue elif token.kind == TOK.X_END: assert not in_sentence @@ -2551,10 +2466,7 @@ def parse_sentences(token_stream: Iterator[Tok]) -> Iterator[Tok]: # This token starts a new sentence yield tok_begin_sentence in_sentence = True - if ( - token.punctuation in PUNCT_INDIRECT_SPEECH - and next_token.punctuation in DQUOTES - ): + if token.punctuation in PUNCT_INDIRECT_SPEECH and next_token.punctuation in DQUOTES: yield token token = next_token next_token = next(token_stream) @@ -2570,15 +2482,10 @@ def parse_sentences(token_stream: Iterator[Tok]) -> Iterator[Tok]: in_sentence = False if token.punctuation in END_OF_SENTENCE and not ( token.punctuation == "…" - and not could_be_end_of_sentence( - next_token - ) # Excluding sentences with ellipsis in the middle + and not could_be_end_of_sentence(next_token) # Excluding sentences with ellipsis in the middle ): # Combining punctuation ('??!!!') - while ( - token.punctuation in PUNCT_COMBINATIONS - and next_token.punctuation in PUNCT_COMBINATIONS - ): + while token.punctuation in PUNCT_COMBINATIONS and next_token.punctuation in PUNCT_COMBINATIONS: # The normalized form comes from the first token except with "…?" v = token.punctuation if v == "…" and next_token.punctuation == "?": @@ -2641,7 +2548,6 @@ def parse_phrases_1(token_stream: Iterator[Tok]) -> Iterator[Tok]: token = cast(Tok, None) try: - # Maintain a one-token lookahead token = next(token_stream) while True: @@ -2672,9 +2578,7 @@ def parse_phrases_1(token_stream: Iterator[Tok]) -> Iterator[Tok]: token = TOK.Year(token.concatenate(next_token), nval) next_token = next(token_stream) # Check for [number | ordinal] [month name] - if ( - token.kind == TOK.ORDINAL or token.kind == TOK.NUMBER - ) and next_token.kind == TOK.WORD: + if (token.kind == TOK.ORDINAL or token.kind == TOK.NUMBER) and next_token.kind == TOK.WORD: if next_token.txt == "gr.": # Corner case: If we have an ordinal followed by # the abbreviation "gr.", we assume that the only @@ -2700,7 +2604,6 @@ def parse_phrases_1(token_stream: Iterator[Tok]) -> Iterator[Tok]: # Check for [date] [year] if token.kind == TOK.DATE and next_token.kind == TOK.YEAR: - dt = cast(DateTimeTuple, token.val) if not dt[0]: # No year yet: add it @@ -2730,11 +2633,7 @@ def parse_phrases_1(token_stream: Iterator[Tok]) -> Iterator[Tok]: # Eat the time token next_token = next(token_stream) - if ( - token.kind == TOK.NUMBER - and next_token.kind == TOK.TELNO - and token.txt in COUNTRY_CODES - ): + if token.kind == TOK.NUMBER and next_token.kind == TOK.TELNO and token.txt in COUNTRY_CODES: # Check for country code in front of telephone number token = TOK.Telno( token.concatenate(next_token, separator=" "), @@ -2760,7 +2659,6 @@ def parse_date_and_time(token_stream: Iterator[Tok]) -> Iterator[Tok]: token = cast(Tok, None) try: - # Maintain a one-token lookahead token = next(token_stream) @@ -2768,9 +2666,7 @@ def parse_date_and_time(token_stream: Iterator[Tok]) -> Iterator[Tok]: next_token = next(token_stream) # DATEABS and DATEREL made # Check for [number | ordinal] [month name] - if ( - token.kind == TOK.ORDINAL or token.kind == TOK.NUMBER - ) and next_token.kind == TOK.WORD: + if (token.kind == TOK.ORDINAL or token.kind == TOK.NUMBER) and next_token.kind == TOK.WORD: month = month_for_token(next_token, True) if month is not None: token = TOK.Date( @@ 
-2783,9 +2679,7 @@ def parse_date_and_time(token_stream: Iterator[Tok]) -> Iterator[Tok]: next_token = next(token_stream) # Check for [DATE] [year] - if token.kind == TOK.DATE and ( - next_token.kind == TOK.NUMBER or next_token.kind == TOK.YEAR - ): + if token.kind == TOK.DATE and (next_token.kind == TOK.NUMBER or next_token.kind == TOK.YEAR): dt = cast(DateTimeTuple, token.val) if not dt[0]: # No year yet: add it @@ -2805,9 +2699,7 @@ def parse_date_and_time(token_stream: Iterator[Tok]) -> Iterator[Tok]: next_token = next(token_stream) # Check for [month name] [year|YEAR] - if token.kind == TOK.WORD and ( - next_token.kind == TOK.NUMBER or next_token.kind == TOK.YEAR - ): + if token.kind == TOK.WORD and (next_token.kind == TOK.NUMBER or next_token.kind == TOK.YEAR): month = month_for_token(token) if month is not None: year = next_token.integer @@ -2913,28 +2805,22 @@ def parse_date_and_time(token_stream: Iterator[Tok]) -> Iterator[Tok]: yield token -def parse_phrases_2( - token_stream: Iterator[Tok], coalesce_percent: bool = False -) -> Iterator[Tok]: +def parse_phrases_2(token_stream: Iterator[Tok], coalesce_percent: bool = False) -> Iterator[Tok]: """Handle numbers, amounts and composite words.""" token = cast(Tok, None) try: - # Maintain a one-token lookahead token = next(token_stream) while True: - next_token = next(token_stream) # Logic for numbers and fractions that are partially or entirely # written out in words # Check for [CURRENCY] [number] (e.g. kr. 9.900 or USD 50) - if next_token.kind == TOK.NUMBER and ( - token.txt in ISK_AMOUNT_PRECEDING or token.txt in CURRENCY_ABBREV - ): + if next_token.kind == TOK.NUMBER and (token.txt in ISK_AMOUNT_PRECEDING or token.txt in CURRENCY_ABBREV): curr = "ISK" if token.txt in ISK_AMOUNT_PRECEDING else token.txt token = TOK.Amount( token.concatenate(next_token, separator=" "), @@ -2945,7 +2831,6 @@ def parse_phrases_2( # Check for [number] [ISK_AMOUNT|CURRENCY|PERCENTAGE] elif token.kind == TOK.NUMBER and next_token.kind == TOK.WORD: - if next_token.txt in AMOUNT_ABBREV: # Abbreviations for ISK amounts # For abbreviations, we do not know the case, @@ -3020,9 +2905,7 @@ def parse_phrases_2( # part of the composition, so it can be an unknown word. _acc = tq[0] for t in tq[1:] + [token, next_token]: - _acc = _acc.concatenate( - t, separator=" ", metadata_from_other=True - ) + _acc = _acc.concatenate(t, separator=" ", metadata_from_other=True) _acc.substitute_all(" -", "-") _acc.substitute_all(" ,", ",") token = _acc @@ -3068,17 +2951,13 @@ def tokenize(text_or_gen: Union[str, Iterable[str]], **options: Any) -> Iterator return (t for t in token_stream if t.kind != TOK.X_END) -def tokenize_without_annotation( - text_or_gen: Union[str, Iterable[str]], **options: Any -) -> Iterator[Tok]: +def tokenize_without_annotation(text_or_gen: Union[str, Iterable[str]], **options: Any) -> Iterator[Tok]: """Tokenize without the last pass which can be done more thoroughly if BÍN annotation is available, for instance in GreynirPackage.""" return tokenize(text_or_gen, with_annotation=False, **options) -def split_into_sentences( - text_or_gen: Union[str, Iterable[str]], **options: Any -) -> Iterator[str]: +def split_into_sentences(text_or_gen: Union[str, Iterable[str]], **options: Any) -> Iterator[str]: """Shallow tokenization of the input text, which can be either a text string or a generator of lines of text (such as a file). 
This function returns a generator of strings, where each string @@ -3122,7 +3001,6 @@ def mark_paragraphs(txt: str) -> str: def paragraphs(tokens: Iterable[Tok]) -> Iterator[List[Tuple[int, List[Tok]]]]: - """Generator yielding paragraphs from token iterable. Each paragraph is a list of sentence tuples. Sentence tuples consist of the index of the first token of the sentence (the TOK.S_BEGIN token) and a list of the tokens within the @@ -3170,7 +3048,7 @@ def valid_sent(sent: Optional[List[Tok]]) -> bool: RE_SPLIT_STR = ( - # The following regex catches Icelandic numbers with dots and a comma + # The following regex catches Icelandic numbers with dots r"([\+\-\$€]?\d{1,3}(?:\.\d\d\d)+\,\d+)" # +123.456,789 # The following regex catches English numbers with commas and a dot r"|([\+\-\$€]?\d{1,3}(?:\,\d\d\d)+\.\d+)" # +123,456.789 @@ -3178,6 +3056,8 @@ def valid_sent(sent: Optional[List[Tok]]) -> bool: r"|([\+\-\$€]?\d+\,\d+(?!\.\d))" # -1234,56 # The following regex catches English numbers with a dot only r"|([\+\-\$€]?\d+\.\d+(?!\,\d))" # -1234.56 + # The following regex catches Icelandic abbreviations, e.g. a.m.k., A.M.K., þ.e.a.s. + r"|([a-záðéíóúýþæöA-ZÁÐÉÍÓÚÝÞÆÖ]+\.(?:[a-záðéíóúýþæöA-ZÁÐÉÍÓÚÝÞÆÖ]+\.)+)" # Finally, space and punctuation r"|([~\s" + "".join("\\" + c for c in PUNCTUATION) @@ -3217,12 +3097,8 @@ def correct_spaces(s: str) -> str: this = TP_CENTER else: this = TP_WORD - if ( - (w == "og" or w == "eða") - and len(r) >= 2 - and r[-1] == "-" - and r[-2].lstrip().isalpha() - ): + # print("this: ", this) + if (w == "og" or w == "eða") and len(r) >= 2 and r[-1] == "-" and r[-2].lstrip().isalpha(): # Special case for compounds such as "fjármála- og efnahagsráðuneytið" # and "Iðnaðar-, ferðamála- og atvinnuráðuneytið": # detach the hyphen from "og"/"eða" @@ -3283,9 +3159,7 @@ def detokenize(tokens: Iterable[Tok], normalize: bool = False) -> str: return "".join(r) -def calculate_indexes( - tokens: Iterable[Tok], last_is_end: bool = False -) -> Tuple[List[int], List[int]]: +def calculate_indexes(tokens: Iterable[Tok], last_is_end: bool = False) -> Tuple[List[int], List[int]]: """Calculate character and byte indexes for a token stream. The indexes are the start positions of each token in the original text that was tokenized. @@ -3307,9 +3181,7 @@ def byte_len(string: str) -> int: if t.txt: # Origin tracking failed for this token. # TODO: Can we do something better here? Or guarantee that it doesn't happen? - raise ValueError( - f"Origin tracking failed at {t.txt} near index {char_indexes[-1]}" - ) + raise ValueError(f"Origin tracking failed at {t.txt} near index {char_indexes[-1]}") else: # This is some marker token that has no text pass From 15d54adc5e4251f791962a5afb5ba57a232bd4c2 Mon Sep 17 00:00:00 2001 From: thorunna Date: Thu, 2 Nov 2023 14:23:14 +0000 Subject: [PATCH 2/9] reversed formatting --- src/tokenizer/tokenizer.py | 232 ++++++++++++++++++++++++++++--------- 1 file changed, 180 insertions(+), 52 deletions(-) diff --git a/src/tokenizer/tokenizer.py b/src/tokenizer/tokenizer.py index 73f5fa1..8ec12ad 100644 --- a/src/tokenizer/tokenizer.py +++ b/src/tokenizer/tokenizer.py @@ -232,7 +232,9 @@ def substitute(self, span: Tuple[int, int], new: str) -> None: self.txt = self.txt[: span[0]] + new + self.txt[span[1] :] if self.origin_spans is not None: # Remove origin entries that correspond to characters that are gone. 
- self.origin_spans = self.origin_spans[: span[0] + len(new)] + self.origin_spans[span[1] :] + self.origin_spans = ( + self.origin_spans[: span[0] + len(new)] + self.origin_spans[span[1] :] + ) def substitute_longer(self, span: Tuple[int, int], new: str) -> None: """Substitute a span with a potentially longer string""" @@ -310,9 +312,13 @@ def concatenate( self_origin_spans = self.origin_spans or [] other_origin_spans = other.origin_spans or [] - separator_origin_spans: List[int] = [len(self_original)] * len(separator) if len(other_origin_spans) > 0 else [] + separator_origin_spans: List[int] = ( + [len(self_original)] * len(separator) if len(other_origin_spans) > 0 else [] + ) new_origin_spans = ( - self_origin_spans + separator_origin_spans + [i + len(self_original) for i in other_origin_spans] + self_origin_spans + + separator_origin_spans + + [i + len(self_original) for i in other_origin_spans] ) return Tok(new_kind, new_txt, new_val, new_original, new_origin_spans) @@ -337,7 +343,9 @@ def __getitem__(self, i: int) -> Union[int, str, ValType]: def equal(self, other: "Tok") -> bool: """Equality of content between two tokens, i.e. ignoring the 'original' and 'origin_spans' attributes""" - return self.kind == other.kind and self.txt == other.txt and self.val == other.val + return ( + self.kind == other.kind and self.txt == other.txt and self.val == other.val + ) def __eq__(self, o: Any) -> bool: """Full equality between two Tok instances""" @@ -579,7 +587,9 @@ def Daterel(t: Union[Tok, str], y: int, m: int, d: int) -> Tok: return t @staticmethod - def Timestamp(t: Union[Tok, str], y: int, mo: int, d: int, h: int, m: int, s: int) -> Tok: + def Timestamp( + t: Union[Tok, str], y: int, mo: int, d: int, h: int, m: int, s: int + ) -> Tok: if isinstance(t, str): return Tok(TOK.TIMESTAMP, t, (y, mo, d, h, m, s)) t.kind = TOK.TIMESTAMP @@ -587,7 +597,9 @@ def Timestamp(t: Union[Tok, str], y: int, mo: int, d: int, h: int, m: int, s: in return t @staticmethod - def Timestampabs(t: Union[Tok, str], y: int, mo: int, d: int, h: int, m: int, s: int) -> Tok: + def Timestampabs( + t: Union[Tok, str], y: int, mo: int, d: int, h: int, m: int, s: int + ) -> Tok: if isinstance(t, str): return Tok(TOK.TIMESTAMPABS, t, (y, mo, d, h, m, s)) t.kind = TOK.TIMESTAMPABS @@ -595,7 +607,9 @@ def Timestampabs(t: Union[Tok, str], y: int, mo: int, d: int, h: int, m: int, s: return t @staticmethod - def Timestamprel(t: Union[Tok, str], y: int, mo: int, d: int, h: int, m: int, s: int) -> Tok: + def Timestamprel( + t: Union[Tok, str], y: int, mo: int, d: int, h: int, m: int, s: int + ) -> Tok: if isinstance(t, str): return Tok(TOK.TIMESTAMPREL, t, (y, mo, d, h, m, s)) t.kind = TOK.TIMESTAMPREL @@ -953,7 +967,11 @@ def could_be_end_of_sentence(self, i: int = 0, *args: Any) -> bool: def normalized_text(token: Tok) -> str: """Returns token text after normalizing punctuation""" - return cast(Tuple[int, str], token.val)[1] if token.kind == TOK.PUNCTUATION else token.txt + return ( + cast(Tuple[int, str], token.val)[1] + if token.kind == TOK.PUNCTUATION + else token.txt + ) def text_from_tokens(tokens: Iterable[Tok]) -> str: @@ -1216,7 +1234,11 @@ def parse_digits(tok: Tok, convert_numbers: bool) -> Tuple[Tok, Tok]: p = g.split("/") m = int(p[1]) d = int(p[0]) - if p[0][0] != "0" and p[1][0] != "0" and ((d <= 5 and m <= 6) or (d == 1 and m <= 10)): + if ( + p[0][0] != "0" + and p[1][0] != "0" + and ((d <= 5 and m <= 6) or (d == 1 and m <= 10)) + ): # This is probably a fraction, not a date # (1/2, 1/3, 1/4, 1/5, 1/6, 2/3, 2/5, 
5/6 etc.) # Return a number @@ -1337,7 +1359,9 @@ def unicode_replacement(token: Tok) -> Tok: total_reduction = 0 for m in UNICODE_REGEX.finditer(token.txt): span, new_letter = m.span(), UNICODE_REPLACEMENTS[m.group(0)] - token.substitute((span[0] - total_reduction, span[1] - total_reduction), new_letter) + token.substitute( + (span[0] - total_reduction, span[1] - total_reduction), new_letter + ) total_reduction += span[1] - span[0] - len(new_letter) return token @@ -1347,7 +1371,9 @@ def html_replacement(token: Tok) -> Tok: total_reduction = 0 for m in HTML_ESCAPE_REGEX.finditer(token.txt): span, new_letter = html_escape(m) - token.substitute((span[0] - total_reduction, span[1] - total_reduction), new_letter) + token.substitute( + (span[0] - total_reduction, span[1] - total_reduction), new_letter + ) total_reduction += span[1] - span[0] - len(new_letter) return token @@ -1389,8 +1415,12 @@ def shift_span(span: Tuple[int, int], pos: int): assert match is not None # Since the match indexes the text of the original token, # we need to shift the indices so that they match the current token. - shifted_all_group_span = shift_span(match.span(ROUGH_TOKEN_REGEX_ENTIRE_MATCH), -pos) - shifted_white_space_span = shift_span(match.span(ROUGH_TOKEN_REGEX_WHITE_SPACE_GROUP), -pos) + shifted_all_group_span = shift_span( + match.span(ROUGH_TOKEN_REGEX_ENTIRE_MATCH), -pos + ) + shifted_white_space_span = shift_span( + match.span(ROUGH_TOKEN_REGEX_WHITE_SPACE_GROUP), -pos + ) # Then we split the current token using the shifted spans small_tok, tok = tok.split(shifted_all_group_span[SPAN_END]) # Remove whitespace characters from the start of the token @@ -1423,6 +1453,7 @@ def generate_raw_tokens( big_text: str for big_text in text_or_gen: + if not one_sent_per_line and not big_text: # An explicit empty string in the input always # causes a sentence split @@ -1561,7 +1592,12 @@ def parse(self) -> Iterable[Tok]: lw = len(rt.txt) i = 1 while i < lw and ( - rt.txt[i].isalpha() or (rt.txt[i] in PUNCT_INSIDE_WORD and i + 1 < lw and rt.txt[i + 1].isalpha()) + rt.txt[i].isalpha() + or ( + rt.txt[i] in PUNCT_INSIDE_WORD + and i + 1 < lw + and rt.txt[i + 1].isalpha() + ) ): # We allow dots to occur inside words in the case of # abbreviations; also apostrophes are allowed within @@ -1631,7 +1667,9 @@ class NumberParser: """Parses a sequence of digits off the front of a raw token""" - def __init__(self, rt: Tok, handle_kludgy_ordinals: int, convert_numbers: bool) -> None: + def __init__( + self, rt: Tok, handle_kludgy_ordinals: int, convert_numbers: bool + ) -> None: self.rt = rt self.handle_kludgy_ordinals = handle_kludgy_ordinals self.convert_numbers = convert_numbers @@ -1652,7 +1690,10 @@ def parse(self) -> Iterable[Tok]: # '1sti' -> 'fyrsti', '3ji' -> 'þriðji', etc. key_tok.substitute_longer((0, len(key)), val) yield TOK.Word(key_tok) - elif handle_kludgy_ordinals == KLUDGY_ORDINALS_TRANSLATE and key in ORDINAL_NUMBERS: + elif ( + handle_kludgy_ordinals == KLUDGY_ORDINALS_TRANSLATE + and key in ORDINAL_NUMBERS + ): # Convert word-form ordinals into ordinal tokens, # i.e. 
'1sti' -> TOK.Ordinal('1sti', 1), # but leave other kludgy constructs ('2ja') @@ -1780,8 +1821,9 @@ def parse(self, rt: Tok) -> Iterable[Tok]: self.rt = rt self.ate = ate - -def parse_mixed(rt: Tok, handle_kludgy_ordinals: int, convert_numbers: bool) -> Iterable[Tok]: +def parse_mixed( + rt: Tok, handle_kludgy_ordinals: int, convert_numbers: bool +) -> Iterable[Tok]: """Parse a mixed raw token string, from the token rt""" # Initialize a singleton parser for punctuation @@ -1866,7 +1908,8 @@ def parse_mixed(rt: Tok, handle_kludgy_ordinals: int, convert_numbers: bool) -> # Numbers or other stuff starting with a digit # (eventually prefixed by a '+' or '-') if rtxt and ( - rtxt[0] in DIGITS_PREFIX or (rtxt[0] in SIGN_PREFIX and len(rtxt) >= 2 and rtxt[1] in DIGITS_PREFIX) + rtxt[0] in DIGITS_PREFIX + or (rtxt[0] in SIGN_PREFIX and len(rtxt) >= 2 and rtxt[1] in DIGITS_PREFIX) ): np = NumberParser(rt, handle_kludgy_ordinals, convert_numbers) yield from np.parse() @@ -1932,7 +1975,9 @@ def parse_tokens(txt: Union[str, Iterable[str]], **options: Any) -> Iterator[Tok # The default behavior for kludgy ordinals is to pass them # through as word tokens - handle_kludgy_ordinals: int = options.get("handle_kludgy_ordinals", KLUDGY_ORDINALS_PASS_THROUGH) + handle_kludgy_ordinals: int = options.get( + "handle_kludgy_ordinals", KLUDGY_ORDINALS_PASS_THROUGH + ) # This code proceeds roughly as follows: # 1) The text is split into raw tokens on whitespace boundaries. @@ -1955,7 +2000,9 @@ def parse_tokens(txt: Union[str, Iterable[str]], **options: Any) -> Iterator[Tok rtxt: str = "" - for rt in generate_raw_tokens(txt, replace_composite_glyphs, replace_html_escapes, one_sent_per_line): + for rt in generate_raw_tokens( + txt, replace_composite_glyphs, replace_html_escapes, one_sent_per_line + ): # rt: raw token if rt.kind in {TOK.S_SPLIT, TOK.P_BEGIN, TOK.P_END}: @@ -2085,13 +2132,18 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]: and (next_token.kind == TOK.NUMBER or next_token.kind == TOK.YEAR) ): currabbr = CURRENCY_SYMBOLS[token.txt] - token = TOK.Amount(token.concatenate(next_token), currabbr, next_token.number) + token = TOK.Amount( + token.concatenate(next_token), currabbr, next_token.number + ) next_token = next(token_stream) # Special case for a DATEREL token of the form "25.10.", # i.e. with a trailing period: It can end a sentence if token.kind == TOK.DATEREL and "." in token.txt: - if next_token.txt == "." and not token_stream.could_be_end_of_sentence(): + if ( + next_token.txt == "." + and not token_stream.could_be_end_of_sentence() + ): # This is something like 'Ég fæddist 25.9. í Svarfaðardal.' y, m, d = cast(Tuple[int, int, int], token.val) token = TOK.Daterel(token.concatenate(next_token), y, m, d) @@ -2100,7 +2152,11 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]: # Coalesce abbreviations ending with a period into a single # abbreviation token if next_token.punctuation == ".": - if token.kind == TOK.WORD and token.txt[-1] != "." and is_abbr_with_period(token.txt): + if ( + token.kind == TOK.WORD + and token.txt[-1] != "." + and is_abbr_with_period(token.txt) + ): # Abbreviation ending with period: make a special token for it # and advance the input stream follow_token = next(token_stream) @@ -2123,7 +2179,9 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]: # þær þarf að vera hægt að sameina í þessa flóknari tóka en viljum # geta merkt það sem villu. Ætti líklega að setja í sérlista, # WRONG_MONTHS, og sérif-lykkju og setja inn villu í tókann. 
- finish = could_be_end_of_sentence(follow_token, test_set, abbrev in NUMBER_ABBREV) + finish = could_be_end_of_sentence( + follow_token, test_set, abbrev in NUMBER_ABBREV + ) if finish: # Potentially at the end of a sentence if abbrev in Abbreviations.FINISHERS: @@ -2135,7 +2193,10 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]: yield token # Set token to the period token = next_token - elif abbrev in Abbreviations.NOT_FINISHERS or abbrev.lower() in Abbreviations.NOT_FINISHERS: + elif ( + abbrev in Abbreviations.NOT_FINISHERS + or abbrev.lower() in Abbreviations.NOT_FINISHERS + ): # This is a potential abbreviation that we don't interpret # as such if it's at the end of a sentence # ('dags.', 'próf.', 'mín.'). Note that this also @@ -2144,7 +2205,9 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]: token = next_token else: # Substitute the abbreviation and eat the period - token = TOK.Word(token.concatenate(next_token), lookup(abbrev)) + token = TOK.Word( + token.concatenate(next_token), lookup(abbrev) + ) else: # 'Regular' abbreviation in the middle of a sentence: # Eat the period and yield the abbreviation as a single token @@ -2180,7 +2243,9 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]: next_token = next(token_stream) # Coalesce 'klukkan/kl. átta/hálfátta' into a time - elif next_token.kind == TOK.WORD and next_token.txt.lower() in CLOCK_NUMBERS: + elif ( + next_token.kind == TOK.WORD and next_token.txt.lower() in CLOCK_NUMBERS + ): if token.kind == TOK.WORD and token.txt.lower() in CLOCK_ABBREVS: # Match: coalesce and step to next token next_txt = next_token.txt.lower() @@ -2242,7 +2307,9 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]: sign = next_token.txt # Store promille as one-tenth of a percentage factor = 1.0 if sign == "%" else 0.1 - token = TOK.Percent(token.concatenate(next_token), token.number * factor) + token = TOK.Percent( + token.concatenate(next_token), token.number * factor + ) next_token = next(token_stream) # Coalesce ordinals (1. = first, 2. = second...) 
into a single token @@ -2268,13 +2335,20 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]: ): # OK: replace the number/Roman numeral and the period # with an ordinal token - num = token.integer if token.kind == TOK.NUMBER else roman_to_int(token.txt) + num = ( + token.integer + if token.kind == TOK.NUMBER + else roman_to_int(token.txt) + ) token = TOK.Ordinal(token.concatenate(next_token), num) # Continue with the following word next_token = next(token_stream) # Convert "1920 mm" or "30 °C" to a single measurement token - if (token.kind == TOK.NUMBER or token.kind == TOK.YEAR) and next_token.txt in SI_UNITS: + if ( + token.kind == TOK.NUMBER or token.kind == TOK.YEAR + ) and next_token.txt in SI_UNITS: + value = token.number orig_unit = next_token.txt unit: str @@ -2287,7 +2361,9 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]: assert isinstance(factor_func, float) value *= factor_func if unit in ("%", "‰"): - token = TOK.Percent(token.concatenate(next_token, separator=" "), value) + token = TOK.Percent( + token.concatenate(next_token, separator=" "), value + ) else: token = TOK.Measurement( token.concatenate(next_token, separator=" "), @@ -2385,7 +2461,11 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]: next_token = next(token_stream) # Cases such as 19 $, 199.99 $ - if token.kind == TOK.NUMBER and next_token.kind == TOK.PUNCTUATION and next_token.txt in CURRENCY_SYMBOLS: + if ( + token.kind == TOK.NUMBER + and next_token.kind == TOK.PUNCTUATION + and next_token.txt in CURRENCY_SYMBOLS + ): token = TOK.Amount( token.concatenate(next_token, separator=" "), CURRENCY_SYMBOLS.get(next_token.txt, ""), @@ -2422,6 +2502,7 @@ def parse_sentences(token_stream: Iterator[Tok]) -> Iterator[Tok]: tok_end_sentence = TOK.End_Sentence() try: + # Maintain a one-token lookahead token = next(token_stream) while True: @@ -2445,7 +2526,9 @@ def parse_sentences(token_stream: Iterator[Tok]) -> Iterator[Tok]: _skip_me.substitute((0, len(_skip_me.txt)), "") token = cast(Tok, None) # 3. attach them to the front of the next token - token = _skip_me.concatenate(next(token_stream), metadata_from_other=True) + token = _skip_me.concatenate( + next(token_stream), metadata_from_other=True + ) continue elif token.kind == TOK.X_END: assert not in_sentence @@ -2466,7 +2549,10 @@ def parse_sentences(token_stream: Iterator[Tok]) -> Iterator[Tok]: # This token starts a new sentence yield tok_begin_sentence in_sentence = True - if token.punctuation in PUNCT_INDIRECT_SPEECH and next_token.punctuation in DQUOTES: + if ( + token.punctuation in PUNCT_INDIRECT_SPEECH + and next_token.punctuation in DQUOTES + ): yield token token = next_token next_token = next(token_stream) @@ -2482,10 +2568,15 @@ def parse_sentences(token_stream: Iterator[Tok]) -> Iterator[Tok]: in_sentence = False if token.punctuation in END_OF_SENTENCE and not ( token.punctuation == "…" - and not could_be_end_of_sentence(next_token) # Excluding sentences with ellipsis in the middle + and not could_be_end_of_sentence( + next_token + ) # Excluding sentences with ellipsis in the middle ): # Combining punctuation ('??!!!') - while token.punctuation in PUNCT_COMBINATIONS and next_token.punctuation in PUNCT_COMBINATIONS: + while ( + token.punctuation in PUNCT_COMBINATIONS + and next_token.punctuation in PUNCT_COMBINATIONS + ): # The normalized form comes from the first token except with "…?" 
v = token.punctuation if v == "…" and next_token.punctuation == "?": @@ -2548,6 +2639,7 @@ def parse_phrases_1(token_stream: Iterator[Tok]) -> Iterator[Tok]: token = cast(Tok, None) try: + # Maintain a one-token lookahead token = next(token_stream) while True: @@ -2578,7 +2670,9 @@ def parse_phrases_1(token_stream: Iterator[Tok]) -> Iterator[Tok]: token = TOK.Year(token.concatenate(next_token), nval) next_token = next(token_stream) # Check for [number | ordinal] [month name] - if (token.kind == TOK.ORDINAL or token.kind == TOK.NUMBER) and next_token.kind == TOK.WORD: + if ( + token.kind == TOK.ORDINAL or token.kind == TOK.NUMBER + ) and next_token.kind == TOK.WORD: if next_token.txt == "gr.": # Corner case: If we have an ordinal followed by # the abbreviation "gr.", we assume that the only @@ -2604,6 +2698,7 @@ def parse_phrases_1(token_stream: Iterator[Tok]) -> Iterator[Tok]: # Check for [date] [year] if token.kind == TOK.DATE and next_token.kind == TOK.YEAR: + dt = cast(DateTimeTuple, token.val) if not dt[0]: # No year yet: add it @@ -2633,7 +2728,11 @@ def parse_phrases_1(token_stream: Iterator[Tok]) -> Iterator[Tok]: # Eat the time token next_token = next(token_stream) - if token.kind == TOK.NUMBER and next_token.kind == TOK.TELNO and token.txt in COUNTRY_CODES: + if ( + token.kind == TOK.NUMBER + and next_token.kind == TOK.TELNO + and token.txt in COUNTRY_CODES + ): # Check for country code in front of telephone number token = TOK.Telno( token.concatenate(next_token, separator=" "), @@ -2659,6 +2758,7 @@ def parse_date_and_time(token_stream: Iterator[Tok]) -> Iterator[Tok]: token = cast(Tok, None) try: + # Maintain a one-token lookahead token = next(token_stream) @@ -2666,7 +2766,9 @@ def parse_date_and_time(token_stream: Iterator[Tok]) -> Iterator[Tok]: next_token = next(token_stream) # DATEABS and DATEREL made # Check for [number | ordinal] [month name] - if (token.kind == TOK.ORDINAL or token.kind == TOK.NUMBER) and next_token.kind == TOK.WORD: + if ( + token.kind == TOK.ORDINAL or token.kind == TOK.NUMBER + ) and next_token.kind == TOK.WORD: month = month_for_token(next_token, True) if month is not None: token = TOK.Date( @@ -2679,7 +2781,9 @@ def parse_date_and_time(token_stream: Iterator[Tok]) -> Iterator[Tok]: next_token = next(token_stream) # Check for [DATE] [year] - if token.kind == TOK.DATE and (next_token.kind == TOK.NUMBER or next_token.kind == TOK.YEAR): + if token.kind == TOK.DATE and ( + next_token.kind == TOK.NUMBER or next_token.kind == TOK.YEAR + ): dt = cast(DateTimeTuple, token.val) if not dt[0]: # No year yet: add it @@ -2699,7 +2803,9 @@ def parse_date_and_time(token_stream: Iterator[Tok]) -> Iterator[Tok]: next_token = next(token_stream) # Check for [month name] [year|YEAR] - if token.kind == TOK.WORD and (next_token.kind == TOK.NUMBER or next_token.kind == TOK.YEAR): + if token.kind == TOK.WORD and ( + next_token.kind == TOK.NUMBER or next_token.kind == TOK.YEAR + ): month = month_for_token(token) if month is not None: year = next_token.integer @@ -2805,22 +2911,28 @@ def parse_date_and_time(token_stream: Iterator[Tok]) -> Iterator[Tok]: yield token -def parse_phrases_2(token_stream: Iterator[Tok], coalesce_percent: bool = False) -> Iterator[Tok]: +def parse_phrases_2( + token_stream: Iterator[Tok], coalesce_percent: bool = False +) -> Iterator[Tok]: """Handle numbers, amounts and composite words.""" token = cast(Tok, None) try: + # Maintain a one-token lookahead token = next(token_stream) while True: + next_token = next(token_stream) # Logic for numbers and 
fractions that are partially or entirely # written out in words # Check for [CURRENCY] [number] (e.g. kr. 9.900 or USD 50) - if next_token.kind == TOK.NUMBER and (token.txt in ISK_AMOUNT_PRECEDING or token.txt in CURRENCY_ABBREV): + if next_token.kind == TOK.NUMBER and ( + token.txt in ISK_AMOUNT_PRECEDING or token.txt in CURRENCY_ABBREV + ): curr = "ISK" if token.txt in ISK_AMOUNT_PRECEDING else token.txt token = TOK.Amount( token.concatenate(next_token, separator=" "), @@ -2831,6 +2943,7 @@ def parse_phrases_2(token_stream: Iterator[Tok], coalesce_percent: bool = False) # Check for [number] [ISK_AMOUNT|CURRENCY|PERCENTAGE] elif token.kind == TOK.NUMBER and next_token.kind == TOK.WORD: + if next_token.txt in AMOUNT_ABBREV: # Abbreviations for ISK amounts # For abbreviations, we do not know the case, @@ -2905,7 +3018,9 @@ def parse_phrases_2(token_stream: Iterator[Tok], coalesce_percent: bool = False) # part of the composition, so it can be an unknown word. _acc = tq[0] for t in tq[1:] + [token, next_token]: - _acc = _acc.concatenate(t, separator=" ", metadata_from_other=True) + _acc = _acc.concatenate( + t, separator=" ", metadata_from_other=True + ) _acc.substitute_all(" -", "-") _acc.substitute_all(" ,", ",") token = _acc @@ -2951,13 +3066,17 @@ def tokenize(text_or_gen: Union[str, Iterable[str]], **options: Any) -> Iterator return (t for t in token_stream if t.kind != TOK.X_END) -def tokenize_without_annotation(text_or_gen: Union[str, Iterable[str]], **options: Any) -> Iterator[Tok]: +def tokenize_without_annotation( + text_or_gen: Union[str, Iterable[str]], **options: Any +) -> Iterator[Tok]: """Tokenize without the last pass which can be done more thoroughly if BÍN annotation is available, for instance in GreynirPackage.""" return tokenize(text_or_gen, with_annotation=False, **options) -def split_into_sentences(text_or_gen: Union[str, Iterable[str]], **options: Any) -> Iterator[str]: +def split_into_sentences( + text_or_gen: Union[str, Iterable[str]], **options: Any +) -> Iterator[str]: """Shallow tokenization of the input text, which can be either a text string or a generator of lines of text (such as a file). This function returns a generator of strings, where each string @@ -3001,6 +3120,7 @@ def mark_paragraphs(txt: str) -> str: def paragraphs(tokens: Iterable[Tok]) -> Iterator[List[Tuple[int, List[Tok]]]]: + """Generator yielding paragraphs from token iterable. Each paragraph is a list of sentence tuples. 
Sentence tuples consist of the index of the first token of the sentence (the TOK.S_BEGIN token) and a list of the tokens within the @@ -3048,7 +3168,7 @@ def valid_sent(sent: Optional[List[Tok]]) -> bool: RE_SPLIT_STR = ( - # The following regex catches Icelandic numbers with dots + # The following regex catches Icelandic numbers with dots and a comma r"([\+\-\$€]?\d{1,3}(?:\.\d\d\d)+\,\d+)" # +123.456,789 # The following regex catches English numbers with commas and a dot r"|([\+\-\$€]?\d{1,3}(?:\,\d\d\d)+\.\d+)" # +123,456.789 @@ -3097,8 +3217,12 @@ def correct_spaces(s: str) -> str: this = TP_CENTER else: this = TP_WORD - # print("this: ", this) - if (w == "og" or w == "eða") and len(r) >= 2 and r[-1] == "-" and r[-2].lstrip().isalpha(): + if ( + (w == "og" or w == "eða") + and len(r) >= 2 + and r[-1] == "-" + and r[-2].lstrip().isalpha() + ): # Special case for compounds such as "fjármála- og efnahagsráðuneytið" # and "Iðnaðar-, ferðamála- og atvinnuráðuneytið": # detach the hyphen from "og"/"eða" @@ -3159,7 +3283,9 @@ def detokenize(tokens: Iterable[Tok], normalize: bool = False) -> str: return "".join(r) -def calculate_indexes(tokens: Iterable[Tok], last_is_end: bool = False) -> Tuple[List[int], List[int]]: +def calculate_indexes( + tokens: Iterable[Tok], last_is_end: bool = False +) -> Tuple[List[int], List[int]]: """Calculate character and byte indexes for a token stream. The indexes are the start positions of each token in the original text that was tokenized. @@ -3181,7 +3307,9 @@ def byte_len(string: str) -> int: if t.txt: # Origin tracking failed for this token. # TODO: Can we do something better here? Or guarantee that it doesn't happen? - raise ValueError(f"Origin tracking failed at {t.txt} near index {char_indexes[-1]}") + raise ValueError( + f"Origin tracking failed at {t.txt} near index {char_indexes[-1]}" + ) else: # This is some marker token that has no text pass From 7098dd3764621007bef788a31549eadccce3ae2d Mon Sep 17 00:00:00 2001 From: thorunna Date: Thu, 2 Nov 2023 14:27:19 +0000 Subject: [PATCH 3/9] added test cases for abbreviations --- test/test_tokenizer.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test/test_tokenizer.py b/test/test_tokenizer.py index dbf9822..6802a97 100755 --- a/test/test_tokenizer.py +++ b/test/test_tokenizer.py @@ -1132,6 +1132,12 @@ def test_correct_spaces() -> None: assert s == "Jón-sem var formaður—mótmælti málinu." s = t.correct_spaces("Það á að geyma mjólkina við 20 ± 3 °C") assert s == "Það á að geyma mjólkina við 20±3° C" + s = t.correct_spaces("Við förum t.d. til Íslands o.s.frv.") + assert s == "Við förum t.d. til Íslands o.s.frv." + s = t.correct_spaces("M.a. lögum við bil.") + assert s == "M.a. lögum við bil." + s = t.correct_spaces("HANN BORÐAR Þ.Á.M. EPLI.") + assert s == "HANN BORÐAR Þ.Á.M. EPLI." 
def test_abbrev() -> None: From 7696a2785804125803be1430ff9822e2f9b823f5 Mon Sep 17 00:00:00 2001 From: thorunna Date: Thu, 2 Nov 2023 15:54:21 +0000 Subject: [PATCH 4/9] improved handling for abbreviations and degree symbols --- src/tokenizer/tokenizer.py | 48 +++++++++++++++++--------------------- test/test_tokenizer.py | 16 +++++++++---- 2 files changed, 33 insertions(+), 31 deletions(-) diff --git a/src/tokenizer/tokenizer.py b/src/tokenizer/tokenizer.py index 8ec12ad..d2c2401 100644 --- a/src/tokenizer/tokenizer.py +++ b/src/tokenizer/tokenizer.py @@ -58,6 +58,7 @@ import datetime import re +import regex import unicodedata # type: ignore from collections import deque @@ -316,8 +317,8 @@ def concatenate( [len(self_original)] * len(separator) if len(other_origin_spans) > 0 else [] ) new_origin_spans = ( - self_origin_spans - + separator_origin_spans + self_origin_spans + + separator_origin_spans + [i + len(self_original) for i in other_origin_spans] ) @@ -1453,7 +1454,6 @@ def generate_raw_tokens( big_text: str for big_text in text_or_gen: - if not one_sent_per_line and not big_text: # An explicit empty string in the input always # causes a sentence split @@ -1821,6 +1821,7 @@ def parse(self, rt: Tok) -> Iterable[Tok]: self.rt = rt self.ate = ate + def parse_mixed( rt: Tok, handle_kludgy_ordinals: int, convert_numbers: bool ) -> Iterable[Tok]: @@ -2141,7 +2142,7 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]: # i.e. with a trailing period: It can end a sentence if token.kind == TOK.DATEREL and "." in token.txt: if ( - next_token.txt == "." + next_token.txt == "." and not token_stream.could_be_end_of_sentence() ): # This is something like 'Ég fæddist 25.9. í Svarfaðardal.' @@ -2153,8 +2154,8 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]: # abbreviation token if next_token.punctuation == ".": if ( - token.kind == TOK.WORD - and token.txt[-1] != "." + token.kind == TOK.WORD + and token.txt[-1] != "." 
and is_abbr_with_period(token.txt) ): # Abbreviation ending with period: make a special token for it @@ -2194,7 +2195,7 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]: # Set token to the period token = next_token elif ( - abbrev in Abbreviations.NOT_FINISHERS + abbrev in Abbreviations.NOT_FINISHERS or abbrev.lower() in Abbreviations.NOT_FINISHERS ): # This is a potential abbreviation that we don't interpret @@ -2336,8 +2337,8 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]: # OK: replace the number/Roman numeral and the period # with an ordinal token num = ( - token.integer - if token.kind == TOK.NUMBER + token.integer + if token.kind == TOK.NUMBER else roman_to_int(token.txt) ) token = TOK.Ordinal(token.concatenate(next_token), num) @@ -2348,7 +2349,6 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]: if ( token.kind == TOK.NUMBER or token.kind == TOK.YEAR ) and next_token.txt in SI_UNITS: - value = token.number orig_unit = next_token.txt unit: str @@ -2462,8 +2462,8 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]: # Cases such as 19 $, 199.99 $ if ( - token.kind == TOK.NUMBER - and next_token.kind == TOK.PUNCTUATION + token.kind == TOK.NUMBER + and next_token.kind == TOK.PUNCTUATION and next_token.txt in CURRENCY_SYMBOLS ): token = TOK.Amount( @@ -2502,7 +2502,6 @@ def parse_sentences(token_stream: Iterator[Tok]) -> Iterator[Tok]: tok_end_sentence = TOK.End_Sentence() try: - # Maintain a one-token lookahead token = next(token_stream) while True: @@ -2550,7 +2549,7 @@ def parse_sentences(token_stream: Iterator[Tok]) -> Iterator[Tok]: yield tok_begin_sentence in_sentence = True if ( - token.punctuation in PUNCT_INDIRECT_SPEECH + token.punctuation in PUNCT_INDIRECT_SPEECH and next_token.punctuation in DQUOTES ): yield token @@ -2574,7 +2573,7 @@ def parse_sentences(token_stream: Iterator[Tok]) -> Iterator[Tok]: ): # Combining punctuation ('??!!!') while ( - token.punctuation in PUNCT_COMBINATIONS + token.punctuation in PUNCT_COMBINATIONS and next_token.punctuation in PUNCT_COMBINATIONS ): # The normalized form comes from the first token except with "…?" 
@@ -2639,7 +2638,6 @@ def parse_phrases_1(token_stream: Iterator[Tok]) -> Iterator[Tok]: token = cast(Tok, None) try: - # Maintain a one-token lookahead token = next(token_stream) while True: @@ -2698,7 +2696,6 @@ def parse_phrases_1(token_stream: Iterator[Tok]) -> Iterator[Tok]: # Check for [date] [year] if token.kind == TOK.DATE and next_token.kind == TOK.YEAR: - dt = cast(DateTimeTuple, token.val) if not dt[0]: # No year yet: add it @@ -2729,8 +2726,8 @@ def parse_phrases_1(token_stream: Iterator[Tok]) -> Iterator[Tok]: next_token = next(token_stream) if ( - token.kind == TOK.NUMBER - and next_token.kind == TOK.TELNO + token.kind == TOK.NUMBER + and next_token.kind == TOK.TELNO and token.txt in COUNTRY_CODES ): # Check for country code in front of telephone number @@ -2758,7 +2755,6 @@ def parse_date_and_time(token_stream: Iterator[Tok]) -> Iterator[Tok]: token = cast(Tok, None) try: - # Maintain a one-token lookahead token = next(token_stream) @@ -2918,12 +2914,10 @@ def parse_phrases_2( token = cast(Tok, None) try: - # Maintain a one-token lookahead token = next(token_stream) while True: - next_token = next(token_stream) # Logic for numbers and fractions that are partially or entirely @@ -2943,7 +2937,6 @@ def parse_phrases_2( # Check for [number] [ISK_AMOUNT|CURRENCY|PERCENTAGE] elif token.kind == TOK.NUMBER and next_token.kind == TOK.WORD: - if next_token.txt in AMOUNT_ABBREV: # Abbreviations for ISK amounts # For abbreviations, we do not know the case, @@ -3120,7 +3113,6 @@ def mark_paragraphs(txt: str) -> str: def paragraphs(tokens: Iterable[Tok]) -> Iterator[List[Tuple[int, List[Tok]]]]: - """Generator yielding paragraphs from token iterable. Each paragraph is a list of sentence tuples. Sentence tuples consist of the index of the first token of the sentence (the TOK.S_BEGIN token) and a list of the tokens within the @@ -3177,13 +3169,17 @@ def valid_sent(sent: Optional[List[Tok]]) -> bool: # The following regex catches English numbers with a dot only r"|([\+\-\$€]?\d+\.\d+(?!\,\d))" # -1234.56 # The following regex catches Icelandic abbreviations, e.g. a.m.k., A.M.K., þ.e.a.s. - r"|([a-záðéíóúýþæöA-ZÁÐÉÍÓÚÝÞÆÖ]+\.(?:[a-záðéíóúýþæöA-ZÁÐÉÍÓÚÝÞÆÖ]+\.)+)" + r"|(\p{L}+\.(?:\p{L}+\.)+)(?!\p{L}+\s)" + # The following regex catches degree characters, i.e. °C, °F + r"|(°[CF])" # Finally, space and punctuation r"|([~\s" + "".join("\\" + c for c in PUNCTUATION) + r"])" ) -RE_SPLIT = re.compile(RE_SPLIT_STR) +# The re module doesn't support \p{L}, which matches any letter in any language, +# but regex does. +RE_SPLIT = regex.compile(RE_SPLIT_STR) def correct_spaces(s: str) -> str: diff --git a/test/test_tokenizer.py b/test/test_tokenizer.py index 6802a97..01116fa 100755 --- a/test/test_tokenizer.py +++ b/test/test_tokenizer.py @@ -63,7 +63,6 @@ def get_text_and_norm(orig: str) -> Tuple[str, str]: def test_single_tokens() -> None: - TEST_CASES = [ (".", TOK.PUNCTUATION), (",", TOK.PUNCTUATION), @@ -616,7 +615,6 @@ def run_test(test_cases: Iterable[TestCase], **options: Any) -> None: def test_sentences() -> None: - KIND = { "B": TOK.S_BEGIN, "E": TOK.S_END, @@ -646,7 +644,6 @@ def test_sentences() -> None: } def test_sentence(text: str, expected: str, **options: Any) -> None: - exp = expected.split() s = list(t.tokenize(text, **options)) assert len(s) == len(exp) @@ -1131,13 +1128,23 @@ def test_correct_spaces() -> None: s = t.correct_spaces("Jón- sem var formaður — mótmælti málinu.") assert s == "Jón-sem var formaður—mótmælti málinu." 
s = t.correct_spaces("Það á að geyma mjólkina við 20 ± 3 °C") - assert s == "Það á að geyma mjólkina við 20±3° C" + assert s == "Það á að geyma mjólkina við 20±3 °C" s = t.correct_spaces("Við förum t.d. til Íslands o.s.frv.") assert s == "Við förum t.d. til Íslands o.s.frv." + s = t.correct_spaces("Við förum t. d. til Íslands o. s. frv.") + assert ( + s == "Við förum t. d. til Íslands o. s. frv." + ) # This shouldn't be corrected here s = t.correct_spaces("M.a. lögum við bil.") assert s == "M.a. lögum við bil." s = t.correct_spaces("HANN BORÐAR Þ.Á.M. EPLI.") assert s == "HANN BORÐAR Þ.Á.M. EPLI." + s = t.correct_spaces("Ég fór til Írlands 6.júní og þar var 17.4°C hiti eða 230.3K.") + assert s == "Ég fór til Írlands 6. júní og þar var 17.4 °C hiti eða 230.3 K." + s = t.correct_spaces( + "Þetta er setning.Þetta er önnur setning.Líka.En hvað með þetta?" + ) + assert s == "Þetta er setning. Þetta er önnur setning. Líka. En hvað með þetta?" def test_abbrev() -> None: @@ -2562,7 +2569,6 @@ def test_one_sent_per_line() -> None: if __name__ == "__main__": - test_single_tokens() test_sentences() test_correct_spaces() From 25649f611ba9a782b1586154a6b6a5f7e315c24b Mon Sep 17 00:00:00 2001 From: thorunna Date: Fri, 3 Nov 2023 09:23:32 +0000 Subject: [PATCH 5/9] added installation for regex module --- .github/workflows/python-package.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index aad426a..cf3b2d6 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -30,6 +30,9 @@ jobs: run: | python -m pip install --upgrade pip wheel setuptools python -m pip install -e ".[dev]" + - name: Install regex module + run: | + python -m pip install regex - name: Type check with mypy (only on Python 3.8) run: | if [ "${{ matrix.python-version }}" == "3.8" ]; then python -m pip install mypy; fi From 877df42e17160a7abcc1301bcd18369c9c1264b9 Mon Sep 17 00:00:00 2001 From: thorunna Date: Fri, 3 Nov 2023 09:26:45 +0000 Subject: [PATCH 6/9] reversed change --- .github/workflows/python-package.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index cf3b2d6..aad426a 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -30,9 +30,6 @@ jobs: run: | python -m pip install --upgrade pip wheel setuptools python -m pip install -e ".[dev]" - - name: Install regex module - run: | - python -m pip install regex - name: Type check with mypy (only on Python 3.8) run: | if [ "${{ matrix.python-version }}" == "3.8" ]; then python -m pip install mypy; fi From e84e4424d6d9a02bdde020b0236d7e705442a580 Mon Sep 17 00:00:00 2001 From: thorunna Date: Fri, 3 Nov 2023 11:28:41 +0000 Subject: [PATCH 7/9] went back to re module --- src/tokenizer/tokenizer.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/tokenizer/tokenizer.py b/src/tokenizer/tokenizer.py index d2c2401..8c14199 100644 --- a/src/tokenizer/tokenizer.py +++ b/src/tokenizer/tokenizer.py @@ -58,7 +58,6 @@ import datetime import re -import regex import unicodedata # type: ignore from collections import deque @@ -3169,7 +3168,8 @@ def valid_sent(sent: Optional[List[Tok]]) -> bool: # The following regex catches English numbers with a dot only r"|([\+\-\$€]?\d+\.\d+(?!\,\d))" # -1234.56 # The following regex catches Icelandic abbreviations, e.g. a.m.k., A.M.K., þ.e.a.s. 
- r"|(\p{L}+\.(?:\p{L}+\.)+)(?!\p{L}+\s)" + # r"|(\p{L}+\.(?:\p{L}+\.)+)(?!\p{L}+\s)" + r"|([a-záðéíóúýþæö]+\.(?:[a-záðéíóúýþæö]+\.)+)(?![a-záðéíóúýþæö]+\s)" # The following regex catches degree characters, i.e. °C, °F r"|(°[CF])" # Finally, space and punctuation @@ -3177,9 +3177,7 @@ def valid_sent(sent: Optional[List[Tok]]) -> bool: + "".join("\\" + c for c in PUNCTUATION) + r"])" ) -# The re module doesn't support \p{L}, which matches any letter in any language, -# but regex does. -RE_SPLIT = regex.compile(RE_SPLIT_STR) +RE_SPLIT = re.compile(RE_SPLIT_STR, re.IGNORECASE) def correct_spaces(s: str) -> str: From 8a6054347603b1b6abad84803bdf56e795a22c9b Mon Sep 17 00:00:00 2001 From: thorunna Date: Fri, 3 Nov 2023 11:40:46 +0000 Subject: [PATCH 8/9] old regex string, which was commented out, removed --- src/tokenizer/tokenizer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/tokenizer/tokenizer.py b/src/tokenizer/tokenizer.py index 8c14199..62295f5 100644 --- a/src/tokenizer/tokenizer.py +++ b/src/tokenizer/tokenizer.py @@ -3168,7 +3168,6 @@ def valid_sent(sent: Optional[List[Tok]]) -> bool: # The following regex catches English numbers with a dot only r"|([\+\-\$€]?\d+\.\d+(?!\,\d))" # -1234.56 # The following regex catches Icelandic abbreviations, e.g. a.m.k., A.M.K., þ.e.a.s. - # r"|(\p{L}+\.(?:\p{L}+\.)+)(?!\p{L}+\s)" r"|([a-záðéíóúýþæö]+\.(?:[a-záðéíóúýþæö]+\.)+)(?![a-záðéíóúýþæö]+\s)" # The following regex catches degree characters, i.e. °C, °F r"|(°[CF])" From 66ffce0404155b07f369ad5e60383d2a42a6bee4 Mon Sep 17 00:00:00 2001 From: thorunna Date: Fri, 3 Nov 2023 13:30:52 +0000 Subject: [PATCH 9/9] updated regex string --- src/tokenizer/tokenizer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/tokenizer/tokenizer.py b/src/tokenizer/tokenizer.py index 62295f5..c0ea7a3 100644 --- a/src/tokenizer/tokenizer.py +++ b/src/tokenizer/tokenizer.py @@ -3168,7 +3168,7 @@ def valid_sent(sent: Optional[List[Tok]]) -> bool: # The following regex catches English numbers with a dot only r"|([\+\-\$€]?\d+\.\d+(?!\,\d))" # -1234.56 # The following regex catches Icelandic abbreviations, e.g. a.m.k., A.M.K., þ.e.a.s. - r"|([a-záðéíóúýþæö]+\.(?:[a-záðéíóúýþæö]+\.)+)(?![a-záðéíóúýþæö]+\s)" + r"|([^\W\d_]+\.(?:[^\W\d_]+\.)+)(?![^\W\d_]+\s)" # The following regex catches degree characters, i.e. °C, °F r"|(°[CF])" # Finally, space and punctuation @@ -3176,7 +3176,7 @@ def valid_sent(sent: Optional[List[Tok]]) -> bool: + "".join("\\" + c for c in PUNCTUATION) + r"])" ) -RE_SPLIT = re.compile(RE_SPLIT_STR, re.IGNORECASE) +RE_SPLIT = re.compile(RE_SPLIT_STR) def correct_spaces(s: str) -> str:
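
Note on the end state of the series: patches 4 through 9 net out to two new alternatives in RE_SPLIT_STR used by correct_spaces(). One keeps dotted abbreviations such as "a.m.k." or "þ.e.a.s." together as a single token, and one keeps the degree sign attached in "°C"/"°F". The final form in patch 9 relies on the character class [^\W\d_], which with the stdlib re module's default Unicode handling of str patterns behaves roughly like \p{L}, so neither the third-party regex dependency from patch 4 nor the re.IGNORECASE flag from patch 7 is needed. The sketch below compiles just these two alternatives on their own to show what they match; the names ABBREV and DEGREE are local to this sketch, and the expected outputs follow the behavior exercised by the new test_correct_spaces() cases.

import re

# The two alternatives added to RE_SPLIT_STR, compiled in isolation.
# [^\W\d_] is a Unicode-aware letter class: a word character that is
# neither a digit nor an underscore.
ABBREV = re.compile(r"([^\W\d_]+\.(?:[^\W\d_]+\.)+)(?![^\W\d_]+\s)")
DEGREE = re.compile(r"(°[CF])")

print(ABBREV.findall("Við förum t.d. til Íslands o.s.frv."))
# ['t.d.', 'o.s.frv.'] - dotted abbreviations survive as single tokens
print(ABBREV.findall("HANN BORÐAR Þ.Á.M. EPLI."))
# ['Þ.Á.M.'] - upper-case Icelandic letters match without re.IGNORECASE
print(ABBREV.findall("Við förum t. d. til Íslands"))
# [] - already-spaced "t. d." is left to the ordinary punctuation handling
print(ABBREV.findall("önnur setning.Líka.En hvað með þetta?"))
# [] - the trailing lookahead stops missing-space sentence breaks from
#      being glued together as if they were one abbreviation
print(DEGREE.findall("Það á að geyma mjólkina við 20 ± 3 °C"))
# ['°C'] - the degree sign and unit stay together as one token

With the full RE_SPLIT in place, the updated tests show the resulting behavior of correct_spaces(): "Það á að geyma mjólkina við 20 ± 3 °C" is now corrected to "Það á að geyma mjólkina við 20±3 °C" rather than "20±3° C", strings such as "Við förum t.d. til Íslands o.s.frv." pass through unchanged, and run-together sentences like "Þetta er setning.Þetta er önnur setning." still get the missing spaces inserted after the periods.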