From b26b89ef36ad0729f5e24a961c10eda945956ff1 Mon Sep 17 00:00:00 2001
From: Sveinbjorn Thordarson
Date: Thu, 22 Aug 2024 20:14:14 +0000
Subject: [PATCH] Modern typing annotation: Set, List, Dict -> set, list, dict, etc.

---
 src/tokenizer/abbrev.py      | 28 +++++++++----------
 src/tokenizer/definitions.py | 13 ++++-----
 src/tokenizer/main.py        | 12 ++++----
 src/tokenizer/tokenizer.py   | 53 ++++++++++++++++--------------------
 4 files changed, 48 insertions(+), 58 deletions(-)

diff --git a/src/tokenizer/abbrev.py b/src/tokenizer/abbrev.py
index a57c954..010cc29 100644
--- a/src/tokenizer/abbrev.py
+++ b/src/tokenizer/abbrev.py
@@ -33,7 +33,7 @@
 """
 
-from typing import Generic, Iterator, Optional, Set, List, Dict, TypeVar
+from typing import Generic, Iterator, Optional, TypeVar
 
 from threading import Lock
 from collections import defaultdict, OrderedDict
 
@@ -56,7 +56,7 @@ class OrderedSet(Generic[_T]):
     if a standard Python set() was used."""
 
     def __init__(self) -> None:
-        self._dict: Dict[_T, None] = OrderedDict()
+        self._dict: dict[_T, None] = OrderedDict()
 
     def add(self, item: _T) -> None:
         """Add an item at the end of the ordered set"""
@@ -75,29 +75,29 @@ class Abbreviations:
     initialized from the config file"""
 
     # Dictionary of abbreviations and their meanings
-    DICT: Dict[str, OrderedSet[BIN_Tuple]] = defaultdict(OrderedSet)
+    DICT: dict[str, OrderedSet[BIN_Tuple]] = defaultdict(OrderedSet)
     # Wrong versions of abbreviations
-    WRONGDICT: Dict[str, OrderedSet[BIN_Tuple]] = defaultdict(OrderedSet)
+    WRONGDICT: dict[str, OrderedSet[BIN_Tuple]] = defaultdict(OrderedSet)
     # All abbreviation meanings
-    MEANINGS: Set[str] = set()
+    MEANINGS: set[str] = set()
     # Single-word abbreviations, i.e. those with only one dot at the end
-    SINGLES: Set[str] = set()
+    SINGLES: set[str] = set()
     # Set of abbreviations without periods, e.g. "td", "osfrv"
-    WRONGSINGLES: Set[str] = set()
+    WRONGSINGLES: set[str] = set()
     # Potential sentence finishers, i.e. those with a dot at the end,
     # marked with an asterisk in the config file
-    FINISHERS: Set[str] = set()
+    FINISHERS: set[str] = set()
     # Abbreviations that should not be seen as such at the end of sentences,
     # marked with an exclamation mark in the config file
-    NOT_FINISHERS: Set[str] = set()
+    NOT_FINISHERS: set[str] = set()
     # Abbreviations that should not be seen as such at the end of sentences, but
     # are allowed in front of person names; marked with a hat ^ in the config file
-    NAME_FINISHERS: Set[str] = set()
+    NAME_FINISHERS: set[str] = set()
     # Wrong versions of abbreviations with possible corrections
     # wrong version : [correction1, correction2, ...]
-    WRONGDOTS: Dict[str, List[str]] = defaultdict(list)
+    WRONGDOTS: dict[str, list[str]] = defaultdict(list)
     # Word forms that should never be interpreted as abbreviations
-    NOT_ABBREVIATIONS: Set[str] = set()
+    NOT_ABBREVIATIONS: set[str] = set()
 
     # Ensure that only one thread initializes the abbreviations
     _lock = Lock()
@@ -208,7 +208,7 @@ def add(abbrev: str, meaning: str, gender: str, fl: Optional[str] = None) -> Non
             i1 = indices[0]
             i2 = indices[1]
             i3 = indices[2]
-            wabbrevs: List[str] = []
+            wabbrevs: list[str] = []
             # 1 and 2 removed
             wabbrevs.append(abbrev[:i1] + abbrev[i1 + 1 : i2] + abbrev[i2 + 1 :])
             # 1 and 3 removed
@@ -257,7 +257,7 @@ def has_abbreviation(meaning: str) -> bool:
         return meaning in Abbreviations.MEANINGS
 
     @staticmethod
-    def get_meaning(abbrev: str) -> Optional[List[BIN_Tuple]]:
+    def get_meaning(abbrev: str) -> Optional[list[BIN_Tuple]]:
         """Look up meaning(s) of abbreviation, if available."""
         m = Abbreviations.DICT.get(abbrev)
         if not m:
diff --git a/src/tokenizer/definitions.py b/src/tokenizer/definitions.py
index 812aeb7..b327939 100644
--- a/src/tokenizer/definitions.py
+++ b/src/tokenizer/definitions.py
@@ -29,13 +29,10 @@
 """
 
 from typing import (
-    Dict,
-    FrozenSet,
     Mapping,
     Tuple,
     Union,
     Callable,
-    List,
     Sequence,
     Optional,
     NamedTuple,
@@ -47,13 +44,13 @@
 
 BeginTuple = Tuple[int, Optional[int]]
 PunctuationTuple = Tuple[int, str]
-NumberTuple = Tuple[float, Optional[List[str]], Optional[List[str]]]
+NumberTuple = Tuple[float, Optional[list[str]], Optional[list[str]]]
 DateTimeTuple = Tuple[int, int, int]
 MeasurementTuple = Tuple[str, float]
 TimeStampTuple = Tuple[int, int, int, int, int, int]
-AmountTuple = Tuple[float, str, Optional[List[str]], Optional[List[str]]]
+AmountTuple = Tuple[float, str, Optional[list[str]], Optional[list[str]]]
 TelnoTuple = Tuple[str, str]
-CurrencyTuple = Tuple[str, Optional[List[str]], Optional[List[str]]]
+CurrencyTuple = Tuple[str, Optional[list[str]], Optional[list[str]]]
 
 
 class BIN_Tuple(NamedTuple):
@@ -434,7 +431,7 @@ class PersonNameTuple(NamedTuple):
 SINGLECHAR_FRACTIONS = "↉⅒⅑⅛⅐⅙⅕¼⅓½⅖⅔⅜⅗¾⅘⅝⅚⅞"
 
 # Derived unit : (base SI unit, conversion factor/function)
-SI_UNITS: Dict[str, Tuple[str, Union[float, Callable[[float], float]]]] = {
+SI_UNITS: dict[str, Tuple[str, Union[float, Callable[[float], float]]]] = {
     # Distance
     "m": ("m", 1.0),
     "mm": ("m", 1.0e-3),
@@ -538,7 +535,7 @@ class PersonNameTuple(NamedTuple):
         unit + r"(?!\w)" if unit[-1].isalpha() else unit
     )
 
-SI_UNITS_SET: FrozenSet[str] = frozenset(SI_UNITS.keys())
+SI_UNITS_SET: frozenset[str] = frozenset(SI_UNITS.keys())
 SI_UNITS_REGEX_STRING = r"|".join(
     map(
         # If the unit ends with a letter, don't allow the next character
diff --git a/src/tokenizer/main.py b/src/tokenizer/main.py
index b6a94eb..aec191d 100755
--- a/src/tokenizer/main.py
+++ b/src/tokenizer/main.py
@@ -35,7 +35,7 @@
 """
 
-from typing import TextIO, Dict, Iterator, List, Callable, Any, Tuple, Union, cast
+from typing import TextIO, Iterator, Callable, Any, Tuple, Union, cast
 
 import sys
 import argparse
 
@@ -158,14 +158,14 @@ def main() -> None:
     """Main function, called when the tokenize command is invoked"""
 
     args = parser.parse_args()
-    options: Dict[str, bool] = dict()
+    options: dict[str, bool] = dict()
 
     def quote(s: str) -> str:
         """Return the string s within double quotes, and with any contained
         backslashes and double quotes escaped with a backslash"""
         return '"' + s.replace("\\", "\\\\").replace('"', '\\"') + '"'
 
-    def spanquote(l: List[int]) -> str:
+    def spanquote(l: list[int]) -> str:
         """Return the list l as a string within double quotes"""
         return '"' + "-".join(str(x) for x in l) + '"'
 
@@ -180,7 +180,7 @@ def val(t: Tok, quote_word: bool = False) -> Any:
             return None
         if t.kind == TOK.WORD:
             # Get the full expansion of an abbreviation
-            mm = cast(List[BIN_Tuple], t.val)
+            mm = cast(list[BIN_Tuple], t.val)
             if quote_word:
                 # Return a |-delimited list of possible meanings,
                 # joined into a single string
@@ -254,7 +254,7 @@ def val(t: Tok, quote_word: bool = False) -> Any:
     # Configure our JSON dump function
     json_dumps = partial(json.dumps, ensure_ascii=False, separators=(",", ":"))
 
-    curr_sent: List[str] = []
+    curr_sent: list[str] = []
     tsep = "" if args.original else " "  # token separator
     for t in tokenize(gen(args.infile), **options):
         if args.csv:
@@ -275,7 +275,7 @@ def val(t: Tok, quote_word: bool = False) -> Any:
                 print('0,"","","",""', file=args.outfile)
         elif args.json:
             # Output the tokens in JSON format, one line per token
-            d: Dict[str, Union[str, List[int]]] = dict(k=TOK.descr[t.kind])
+            d: dict[str, Union[str, list[int]]] = dict(k=TOK.descr[t.kind])
             if t.txt is not None:
                 d["t"] = t.txt
             v = val(t)
diff --git a/src/tokenizer/tokenizer.py b/src/tokenizer/tokenizer.py
index 1a6dfc8..35bee09 100644
--- a/src/tokenizer/tokenizer.py
+++ b/src/tokenizer/tokenizer.py
@@ -42,10 +42,8 @@
     Any,
     Callable,
     Deque,
-    FrozenSet,
     Iterable,
     Iterator,
-    List,
     Mapping,
     Match,
     Optional,
@@ -77,7 +75,6 @@
 
 
 class Tok:
-
     """Information about a single token"""
 
     def __init__(
@@ -86,7 +83,7 @@ def __init__(
         txt: Optional[str],
         val: ValType,
         original: Optional[str] = None,
-        origin_spans: Optional[List[int]] = None,
+        origin_spans: Optional[list[int]] = None,
     ) -> None:
         # Type of token
         self.kind: int = kind
@@ -101,7 +98,7 @@ def __init__(
         # Each such integer index maps the corresponding character
         # (which may have substitutions) to its index in 'original'.
         # This is required to preserve 'original' correctly when splitting.
-        self.origin_spans: Optional[List[int]] = origin_spans
+        self.origin_spans: Optional[list[int]] = origin_spans
 
     @classmethod
     def from_txt(cls: Type[_T], txt: str) -> _T:
@@ -312,7 +309,7 @@ def concatenate(
 
         self_origin_spans = self.origin_spans or []
         other_origin_spans = other.origin_spans or []
-        separator_origin_spans: List[int] = (
+        separator_origin_spans: list[int] = (
            [len(self_original)] * len(separator) if len(other_origin_spans) > 0 else []
         )
         new_origin_spans = (
@@ -373,7 +370,6 @@ def quoted_string_repr(obj: Any) -> str:
 
 
 class TOK:
-
     """
     The TOK class contains constants that define token types and
     constructors for creating token instances.
@@ -647,8 +643,8 @@ def Email(t: Union[Tok, str]) -> Tok:
     def Number(
         t: Union[Tok, str],
         n: float,
-        cases: Optional[List[str]] = None,
-        genders: Optional[List[str]] = None,
+        cases: Optional[list[str]] = None,
+        genders: Optional[list[str]] = None,
     ) -> Tok:
         # The cases parameter is a list of possible cases for this number
         # (if it was originally stated in words)
@@ -670,8 +666,8 @@ def NumberWithLetter(t: Union[Tok, str], n: int, c: str) -> Tok:
     def Currency(
         t: Union[Tok, str],
         iso: str,
-        cases: Optional[List[str]] = None,
-        genders: Optional[List[str]] = None,
+        cases: Optional[list[str]] = None,
+        genders: Optional[list[str]] = None,
     ) -> Tok:
         # The cases parameter is a list of possible cases for this currency name
         # (if it was originally stated in words, i.e. not abbreviated)
@@ -686,8 +682,8 @@ def Amount(
         t: Union[Tok, str],
         iso: str,
         n: float,
-        cases: Optional[List[str]] = None,
-        genders: Optional[List[str]] = None,
+        cases: Optional[list[str]] = None,
+        genders: Optional[list[str]] = None,
     ) -> Tok:
         # The cases parameter is a list of possible cases for this amount
         # (if it was originally stated in words)
@@ -701,8 +697,8 @@ def Amount(
     def Percent(
         t: Union[Tok, str],
         n: float,
-        cases: Optional[List[str]] = None,
-        genders: Optional[List[str]] = None,
+        cases: Optional[list[str]] = None,
+        genders: Optional[list[str]] = None,
     ) -> Tok:
         if isinstance(t, str):
             return Tok(TOK.PERCENT, t, (n, cases, genders))
@@ -1559,7 +1555,7 @@ def generate_raw_tokens(
 
 def could_be_end_of_sentence(
     next_token: Tok,
-    test_set: FrozenSet[int] = TOK.TEXT,
+    test_set: frozenset[int] = TOK.TEXT,
     multiplier: bool = False,
 ) -> bool:
     """Return True if next_token could be ending the current sentence or
@@ -1578,7 +1574,6 @@ def could_be_end_of_sentence(
 
 
 class LetterParser:
-
     """Parses a sequence of alphabetic characters
     off the front of a raw token"""
 
@@ -1663,7 +1658,6 @@ def parse(self) -> Iterable[Tok]:
 
 
 class NumberParser:
-
     """Parses a sequence of digits off the front of a raw token"""
 
     def __init__(
@@ -1724,7 +1718,6 @@ def parse(self) -> Iterable[Tok]:
 
 
 class PunctuationParser:
-
     """Parses a sequence of punctuation off the front of a raw token"""
 
     def __init__(self) -> None:
@@ -2108,7 +2101,7 @@ def is_abbr_with_period(txt: str) -> bool:
             return txt not in Abbreviations.DICT
         return False
 
-    def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]:
+    def lookup(abbrev: str) -> Optional[list[BIN_Tuple]]:
         """Look up an abbreviation, both in original case and in lower case,
         and return either None if not found or a meaning list having one entry"""
         m = Abbreviations.DICT.get(abbrev)
@@ -2647,7 +2640,7 @@ def parse_phrases_1(token_stream: Iterator[Tok]) -> Iterator[Tok]:
                 if abbrev in Abbreviations.FINISHERS:
                     token = TOK.Word(
                         token.concatenate(next_token),
-                        cast(Optional[List[BIN_Tuple]], token.val),
+                        cast(Optional[list[BIN_Tuple]], token.val),
                     )
                     next_token = next(token_stream)
 
@@ -2975,7 +2968,7 @@ def parse_phrases_2(
         # Check for composites:
         # 'stjórnskipunar- og eftirlitsnefnd'
         # 'dómsmála-, viðskipta- og iðnaðarráðherra'
-        tq: List[Tok] = []
+        tq: list[Tok] = []
         while token.kind == TOK.WORD and next_token.punctuation == COMPOSITE_HYPHEN:
             # Accumulate the prefix in tq
             tq.append(token)
@@ -3081,7 +3074,7 @@ def split_into_sentences(
         to_text = lambda t: t.original or t.txt
     else:
         to_text = lambda t: t.txt
-    curr_sent: List[str] = []
+    curr_sent: list[str] = []
     for t in tokenize_without_annotation(text_or_gen, **options):
         if t.kind in TOK.END:
             # End of sentence/paragraph
@@ -3111,14 +3104,14 @@ def mark_paragraphs(txt: str) -> str:
     return "[[" + "]][[".join(t for t in txt.split("\n") if t) + "]]"
 
 
-def paragraphs(tokens: Iterable[Tok]) -> Iterator[List[Tuple[int, List[Tok]]]]:
+def paragraphs(tokens: Iterable[Tok]) -> Iterator[list[Tuple[int, list[Tok]]]]:
     """Generator yielding paragraphs from token iterable.
     Each paragraph is a list of sentence tuples. Sentence tuples consist of the
     index of the first token of the sentence (the TOK.S_BEGIN token) and a list
     of the tokens within the sentence, not including the starting TOK.S_BEGIN or
     the terminating TOK.S_END tokens."""
 
-    def valid_sent(sent: Optional[List[Tok]]) -> bool:
+    def valid_sent(sent: Optional[list[Tok]]) -> bool:
         """Return True if the token list in sent is a proper
         sentence that we want to process further"""
         if not sent:
@@ -3126,9 +3119,9 @@ def valid_sent(sent: Optional[List[Tok]]) -> bool:
         # A sentence with only punctuation is not valid
         return any(t[0] != TOK.PUNCTUATION for t in sent)
 
-    sent: List[Tok] = []  # Current sentence
+    sent: list[Tok] = []  # Current sentence
     sent_begin = 0
-    current_p: List[Tuple[int, List[Tok]]] = []  # Current paragraph
+    current_p: list[Tuple[int, list[Tok]]] = []  # Current paragraph
 
     for ix, t in enumerate(tokens):
         t0 = t[0]
@@ -3184,7 +3177,7 @@ def correct_spaces(s: str) -> str:
     with correct spacing between tokens. NOTE that this function uses a
     quick-and-dirty approach which may not handle all edge cases!"""
 
-    r: List[str] = []
+    r: list[str] = []
     last = TP_NONE
     double_quote_count = 0
     for w in RE_SPLIT.split(s):
@@ -3244,7 +3237,7 @@ def detokenize(tokens: Iterable[Tok], normalize: bool = False) -> str:
     to a correctly spaced string. If normalize is True,
     punctuation is normalized before assembling the string."""
     to_text: Callable[[Tok], str] = normalized_text if normalize else lambda t: t.txt
-    r: List[str] = []
+    r: list[str] = []
     last = TP_NONE
     double_quote_count = 0
     for t in tokens:
@@ -3278,7 +3271,7 @@ def detokenize(tokens: Iterable[Tok], normalize: bool = False) -> str:
 
 def calculate_indexes(
     tokens: Iterable[Tok], last_is_end: bool = False
-) -> Tuple[List[int], List[int]]:
+) -> Tuple[list[int], list[int]]:
     """Calculate character and byte indexes for a token stream.
     The indexes are the start positions of each token in the original text
     that was tokenized.