More type modernization
sveinbjornt committed Aug 22, 2024
1 parent b26b89e commit d8a3351
Showing 3 changed files with 23 additions and 24 deletions.
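In short, the diff replaces the typing-module aliases (typing.Tuple, typing.List) with the built-in generics tuple and list (PEP 585). A minimal before/after sketch, illustrative rather than lifted from the diff, and assuming the project now targets Python 3.9+ where built-in generics are valid at runtime:

    # Before: generic aliases imported from the typing module
    from typing import Optional, Tuple
    BeginTuple = Tuple[int, Optional[int]]

    # After: built-in generics (PEP 585); only Optional still comes from typing
    from typing import Optional
    BeginTuple = tuple[int, Optional[int]]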
23 changes: 11 additions & 12 deletions src/tokenizer/definitions.py
@@ -30,7 +30,6 @@
 
 from typing import (
     Mapping,
-    Tuple,
     Union,
     Callable,
     Sequence,
@@ -42,15 +41,15 @@
 import re
 
 
-BeginTuple = Tuple[int, Optional[int]]
-PunctuationTuple = Tuple[int, str]
-NumberTuple = Tuple[float, Optional[list[str]], Optional[list[str]]]
-DateTimeTuple = Tuple[int, int, int]
-MeasurementTuple = Tuple[str, float]
-TimeStampTuple = Tuple[int, int, int, int, int, int]
-AmountTuple = Tuple[float, str, Optional[list[str]], Optional[list[str]]]
-TelnoTuple = Tuple[str, str]
-CurrencyTuple = Tuple[str, Optional[list[str]], Optional[list[str]]]
+BeginTuple = tuple[int, Optional[int]]
+PunctuationTuple = tuple[int, str]
+NumberTuple = tuple[float, Optional[list[str]], Optional[list[str]]]
+DateTimeTuple = tuple[int, int, int]
+MeasurementTuple = tuple[str, float]
+TimeStampTuple = tuple[int, int, int, int, int, int]
+AmountTuple = tuple[float, str, Optional[list[str]], Optional[list[str]]]
+TelnoTuple = tuple[str, str]
+CurrencyTuple = tuple[str, Optional[list[str]], Optional[list[str]]]
 
 
 class BIN_Tuple(NamedTuple):
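Runtime behaviour is unchanged by the rename; a small hypothetical check (not part of the commit), again assuming Python 3.9+:

    # The alias is simply the built-in tuple, parameterized, so values
    # are constructed as plain tuples.
    from typing import Optional

    NumberTuple = tuple[float, Optional[list[str]], Optional[list[str]]]

    n: NumberTuple = (1984.0, None, None)
    assert isinstance(n, tuple)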
@@ -339,7 +338,7 @@ class PersonNameTuple(NamedTuple):
 # }
 
 # Time of day expressions spelled out
-CLOCK_NUMBERS: Mapping[str, Tuple[int, int, int]] = {
+CLOCK_NUMBERS: Mapping[str, tuple[int, int, int]] = {
     "eitt": (1, 0, 0),
     "tvö": (2, 0, 0),
     "þrjú": (3, 0, 0),
@@ -431,7 +430,7 @@ class PersonNameTuple(NamedTuple):
 SINGLECHAR_FRACTIONS = "↉⅒⅑⅛⅐⅙⅕¼⅓½⅖⅔⅜⅗¾⅘⅝⅚⅞"
 
 # Derived unit : (base SI unit, conversion factor/function)
-SI_UNITS: dict[str, Tuple[str, Union[float, Callable[[float], float]]]] = {
+SI_UNITS: dict[str, tuple[str, Union[float, Callable[[float], float]]]] = {
     # Distance
     "m": ("m", 1.0),
     "mm": ("m", 1.0e-3),
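The annotation spells out the value shape: each derived unit maps to its base SI unit plus either a numeric factor or a conversion function. A hedged sketch of how such a table could be consumed (illustrative only; to_base_unit is not a function in the repository):

    def to_base_unit(unit: str, value: float) -> tuple[str, float]:
        """Convert a value in a derived unit to its base SI unit (sketch)."""
        base, factor = SI_UNITS[unit]
        if callable(factor):
            # conversions that are not a simple multiplication
            return base, factor(value)
        return base, value * factor

    # With the entries shown above, to_base_unit("mm", 250.0) gives ("m", 0.25).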
6 changes: 3 additions & 3 deletions src/tokenizer/tokenizer.py
@@ -3104,7 +3104,7 @@ def mark_paragraphs(txt: str) -> str:
     return "[[" + "]][[".join(t for t in txt.split("\n") if t) + "]]"
 
 
-def paragraphs(tokens: Iterable[Tok]) -> Iterator[list[Tuple[int, list[Tok]]]]:
+def paragraphs(tokens: Iterable[Tok]) -> Iterator[list[tuple[int, list[Tok]]]]:
     """Generator yielding paragraphs from token iterable. Each paragraph is a list
     of sentence tuples. Sentence tuples consist of the index of the first token
     of the sentence (the TOK.S_BEGIN token) and a list of the tokens within the
@@ -3121,7 +3121,7 @@ def valid_sent(sent: Optional[list[Tok]]) -> bool:
 
     sent: list[Tok] = []  # Current sentence
     sent_begin = 0
-    current_p: list[Tuple[int, list[Tok]]] = []  # Current paragraph
+    current_p: list[tuple[int, list[Tok]]] = []  # Current paragraph
 
     for ix, t in enumerate(tokens):
         t0 = t[0]
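Going by the docstring, a hypothetical way to consume paragraphs(), assuming mark_paragraphs and paragraphs are exported at package level under the tests' `import tokenizer as t` convention:

    text = t.mark_paragraphs("Fyrsta málsgrein.\nÖnnur málsgrein.")
    for pg in t.paragraphs(t.tokenize(text)):
        for first_index, sentence_tokens in pg:
            # first_index is the position of the sentence's TOK.S_BEGIN token;
            # sentence_tokens holds the tokens of that sentence
            print(first_index, len(sentence_tokens))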
@@ -3271,7 +3271,7 @@ def detokenize(tokens: Iterable[Tok], normalize: bool = False) -> str:
 
 def calculate_indexes(
     tokens: Iterable[Tok], last_is_end: bool = False
-) -> Tuple[list[int], list[int]]:
+) -> tuple[list[int], list[int]]:
     """Calculate character and byte indexes for a token stream.
     The indexes are the start positions of each token in the original
     text that was tokenized.
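A hedged sketch of the two returned lists, based on the docstring (package-level access via `t` is an assumption):

    toks = list(t.tokenize("Árið 2024 var gott."))
    char_indexes, byte_indexes = t.calculate_indexes(toks)
    # Both lists give the start position of each token in the original text;
    # char_indexes counts Unicode characters while byte_indexes counts UTF-8
    # bytes, so the two drift apart after the two-byte character "Á".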
18 changes: 9 additions & 9 deletions test/test_tokenizer.py
@@ -31,18 +31,18 @@
 """
 
-from typing import Any, Iterable, Iterator, List, Tuple, Union, cast
+from typing import Any, Iterable, Iterator, Union, cast
 
 import tokenizer as t
 from tokenizer.definitions import BIN_Tuple, ValType
 
 TOK = t.TOK
 Tok = t.Tok
 
-TestCase = Union[Tuple[str, int], Tuple[str, int, ValType], Tuple[str, List[Tok]]]
+TestCase = Union[tuple[str, int], tuple[str, int, ValType], tuple[str, list[Tok]]]
 
 
-def strip_originals(tokens: List[Tok]) -> List[Tok]:
+def strip_originals(tokens: list[Tok]) -> list[Tok]:
     """Remove origin tracking info from a list of tokens.
     This is useful for simplifying tests where we don't care about tracking
     origins.
@@ -57,7 +57,7 @@ def strip_originals(tokens: List[Tok]) -> List[Tok]:
     return tokens
 
 
-def get_text_and_norm(orig: str) -> Tuple[str, str]:
+def get_text_and_norm(orig: str) -> tuple[str, str]:
     toklist = list(t.tokenize(orig))
     return t.text_from_tokens(toklist), t.normalized_text_from_tokens(toklist)
 
@@ -563,12 +563,12 @@ def test_single_tokens() -> None:
 def run_test(test_cases: Iterable[TestCase], **options: Any) -> None:
     for test_case in test_cases:
         if len(test_case) == 3:
-            txt, kind, val = cast(Tuple[str, int, ValType], test_case)
+            txt, kind, val = cast(tuple[str, int, ValType], test_case)
             c = [Tok(kind, txt, val)]
         elif isinstance(test_case[1], list):
-            txt, c = cast(Tuple[str, List[Tok]], test_case)
+            txt, c = cast(tuple[str, list[Tok]], test_case)
         else:
-            txt, kind = cast(Tuple[str, int], test_case)
+            txt, kind = cast(tuple[str, int], test_case)
             c = [Tok(kind, txt, None)]
         l = list(t.tokenize(txt, **options))
         assert len(l) == len(c) + 2, repr(l)
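For reference, the TestCase union admits three shapes; hypothetical examples follow (the exact val payloads are assumptions, not taken from the test file):

    cases: list[TestCase] = [
        ("halló", TOK.WORD),                   # (text, expected kind)
        ("07:30", TOK.TIME, (7, 30, 0)),       # (text, kind, expected val) -- val shape assumed
        ("þó", [Tok(TOK.WORD, "þó", None)]),   # (text, expected token list)
    ]
    # run_test(cases) would then tokenize each text and compare the result
    # against the stated expectation.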
@@ -593,8 +593,8 @@ def run_test(test_cases: Iterable[TestCase], **options: Any) -> None:
             if check.kind == TOK.WORD:
                 # Test set equivalence, since the order of word meanings
                 # is not deterministic
-                assert set(cast(List[BIN_Tuple], tok.val) or []) == set(
-                    cast(List[BIN_Tuple], check.val) or []
+                assert set(cast(list[BIN_Tuple], tok.val) or []) == set(
+                    cast(list[BIN_Tuple], check.val) or []
                 ), (repr(tok.val) + " != " + repr(check.val))
             else:
                 assert tok.val == check.val, (
