diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index e432efc..15df45f 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -15,10 +15,11 @@ jobs: strategy: matrix: os: [ubuntu-latest] - python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12", "pypy-3.9", "pypy-3.10"] + python-version: ["3.9", "3.10", "3.11", "3.12", "3.13.0-rc.1", "pypy-3.9", "pypy-3.10"] steps: - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v5 with: @@ -29,10 +30,10 @@ jobs: python -m pip install --upgrade pip wheel setuptools python -m pip install -e ".[dev]" - - name: Type check with mypy (only on Python 3.8) + - name: Type check with mypy (only on oldest supported Python version) run: | - if [ "${{ matrix.python-version }}" == "3.8" ]; then python -m pip install mypy; fi - if [ "${{ matrix.python-version }}" == "3.8" ]; then mypy --python-version=3.8 src/tokenizer; fi + if [ "${{ matrix.python-version }}" == "3.9" ]; then python -m pip install mypy; fi + if [ "${{ matrix.python-version }}" == "3.9" ]; then mypy --python-version=3.9 src/tokenizer; fi - name: Test with pytest run: | diff --git a/LICENSE.txt b/LICENSE.txt index 6eebeb7..a3fd327 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,6 +1,6 @@ MIT License -Copyright (C) 2023 Miðeind ehf. +Copyright (C) 2016-2024 Miðeind ehf. Original author: Vilhjálmur Þorsteinsson Permission is hereby granted, free of charge, to any person obtaining a copy diff --git a/MANIFEST.in b/MANIFEST.in index 04cc9cf..0c93fb0 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,3 +1,4 @@ graft src prune src/tokenizer/__pycache__ prune src/tokenizer/.mypy_cache +prune src/tokenizer/.DS_Store \ No newline at end of file diff --git a/README.rst b/README.rst index 316697e..50bb5ff 100644 --- a/README.rst +++ b/README.rst @@ -12,7 +12,7 @@ Tokenization is a necessary first step in many natural language processing tasks, such as word counting, parsing, spell checking, corpus generation, and statistical analysis of text. -**Tokenizer** is a compact pure-Python (>= 3.8) executable +**Tokenizer** is a compact pure-Python (>=3.9) executable program and module for tokenizing Icelandic text. It converts input text to streams of *tokens*, where each token is a separate word, punctuation sign, number/amount, date, e-mail, URL/URI, etc. It also segments the token stream @@ -809,6 +809,7 @@ can be found in the file ``test/toktest_normal_gold_expected.txt``. Changelog --------- +* Version 3.4.5: Compatibility with Python 3.13. Now requires Python 3.9 or later. * Version 3.4.4: Better handling of abbreviations * Version 3.4.3: Various minor fixes. Now requires Python 3.8 or later. * Version 3.4.2: Abbreviations and phrases added, ``META_BEGIN`` token added. 
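The README hunk above describes the package as a pure-Python (>= 3.9) module that converts input text into a stream of typed tokens and segments it into sentences. A minimal usage sketch of that API, pieced together from calls that appear elsewhere in this diff (tokenize, TOK.descr, tok.kind, tok.txt in main.py and the test files); the example sentence is borrowed from test/test_detokenize.py, and the exact tokens a real run prints may differ:

    # Sketch: iterate over the token stream and print each token's kind and text.
    from tokenizer import TOK, tokenize

    for tok in tokenize("Páll veiddi 74 cm. lax í Norðurá þann 1.3."):
        # TOK.descr maps the numeric token kind to a readable name; main.py
        # uses the same mapping for its CSV/JSON output.
        print(TOK.descr[tok.kind], tok.txt or "")

Sentence boundaries arrive in the same stream as S_BEGIN/S_END tokens, which is what split_into_sentences() further down builds on.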
diff --git a/pyproject.toml b/pyproject.toml index 5bd7107..8e3d464 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,33 +1,33 @@ [project] name = "tokenizer" -version = "3.4.4" +version = "3.4.5" description = "A tokenizer for Icelandic text" authors = [{ name = "Miðeind ehf.", email = "mideind@mideind.is" }] readme = { file = "README.rst", content-type = "text/x-rst" } -license = { file = "LICENSE.txt" } -# For classifier list see: https://pypi.org/pypi?%3Aaction=list_classifiers +license = { text = "MIT" } classifiers = [ "Development Status :: 5 - Production/Stable", "Intended Audience :: Developers", "License :: OSI Approved :: MIT License", "Operating System :: Unix", "Operating System :: POSIX", + "Operating System :: MacOS", "Operating System :: Microsoft :: Windows", "Natural Language :: Icelandic", "Programming Language :: Python", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", "Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy", "Topic :: Software Development :: Libraries :: Python Modules", "Topic :: Utilities", "Topic :: Text Processing :: Linguistic", ] -requires-python = ">=3.8" +requires-python = ">=3.9" [project.urls] Repository = "https://github.com/mideind/Tokenizer" @@ -51,17 +51,17 @@ where = ["src"] [tool.pytest.ini_options] filterwarnings = [ # Ignore deprecation warnings in libraries, their problem not ours - "ignore::DeprecationWarning", + # "ignore::DeprecationWarning", ] [tool.ruff] -line-length = 120 +line-length = 88 [tool.black] -line-length = 120 +line-length = 88 [tool.isort] # This forces these imports to placed at the top known_future_library = ["__future__", "typing", "typing_extensions"] profile = "black" -line_length = 120 +line_length = 88 diff --git a/src/tokenizer/__init__.py b/src/tokenizer/__init__.py index 51fba02..d57468b 100644 --- a/src/tokenizer/__init__.py +++ b/src/tokenizer/__init__.py @@ -1,6 +1,6 @@ """ - Copyright(C) 2022 Miðeind ehf. + Copyright(C) 2016-2024 Miðeind ehf. Original author: Vilhjálmur Þorsteinsson This software is licensed under the MIT License: @@ -63,9 +63,8 @@ from .abbrev import Abbreviations, ConfigError __author__ = "Miðeind ehf." -__copyright__ = "(C) 2023 Miðeind ehf." -__version__ = importlib.metadata.version("tokenizer") - +__copyright__ = "(C) 2016-2024 Miðeind ehf." +__version__ = importlib.metadata.version(__name__) __all__ = ( "__author__", diff --git a/src/tokenizer/abbrev.py b/src/tokenizer/abbrev.py index a08ce5b..606f10c 100644 --- a/src/tokenizer/abbrev.py +++ b/src/tokenizer/abbrev.py @@ -2,7 +2,7 @@ Abbreviations module for tokenization of Icelandic text - Copyright (C) 2022 Miðeind ehf. + Copyright (C) 2016-2024 Miðeind ehf. 
Original author: Vilhjálmur Þorsteinsson This software is licensed under the MIT License: @@ -33,17 +33,16 @@ """ -from typing import Generic, Iterator, Optional, Set, List, Dict, TypeVar +from typing import Generic, Iterator, Optional, TypeVar from threading import Lock from collections import defaultdict, OrderedDict -from importlib.resources import open_text +import importlib.resources as importlib_resources from .definitions import BIN_Tuple class ConfigError(Exception): - pass @@ -51,17 +50,16 @@ class ConfigError(Exception): class OrderedSet(Generic[_T]): - - """ Shim class to provide an ordered set API on top - of an OrderedDict. This is necessary to make abbreviation - lookups predictable and repeatable, which they would not be - if a standard Python set() was used. """ + """Shim class to provide an ordered set API on top + of an OrderedDict. This is necessary to make abbreviation + lookups predictable and repeatable, which they would not be + if a standard Python set() was used.""" def __init__(self) -> None: - self._dict: Dict[_T, None] = OrderedDict() + self._dict: dict[_T, None] = OrderedDict() def add(self, item: _T) -> None: - """ Add an item at the end of the ordered set """ + """Add an item at the end of the ordered set""" if item not in self._dict: self._dict[item] = None @@ -73,42 +71,41 @@ def __iter__(self) -> Iterator[_T]: class Abbreviations: - - """ Wrapper around dictionary of abbreviations, - initialized from the config file """ + """Wrapper around dictionary of abbreviations, + initialized from the config file""" # Dictionary of abbreviations and their meanings - DICT: Dict[str, OrderedSet[BIN_Tuple]] = defaultdict(OrderedSet) + DICT: dict[str, OrderedSet[BIN_Tuple]] = defaultdict(OrderedSet) # Wrong versions of abbreviations - WRONGDICT: Dict[str, OrderedSet[BIN_Tuple]] = defaultdict(OrderedSet) + WRONGDICT: dict[str, OrderedSet[BIN_Tuple]] = defaultdict(OrderedSet) # All abbreviation meanings - MEANINGS: Set[str] = set() + MEANINGS: set[str] = set() # Single-word abbreviations, i.e. those with only one dot at the end - SINGLES: Set[str] = set() + SINGLES: set[str] = set() # Set of abbreviations without periods, e.g. "td", "osfrv" - WRONGSINGLES: Set[str] = set() + WRONGSINGLES: set[str] = set() # Potential sentence finishers, i.e. those with a dot at the end, # marked with an asterisk in the config file - FINISHERS: Set[str] = set() + FINISHERS: set[str] = set() # Abbreviations that should not be seen as such at the end of sentences, # marked with an exclamation mark in the config file - NOT_FINISHERS: Set[str] = set() + NOT_FINISHERS: set[str] = set() # Abbreviations that should not be seen as such at the end of sentences, but # are allowed in front of person names; marked with a hat ^ in the config file - NAME_FINISHERS: Set[str] = set() + NAME_FINISHERS: set[str] = set() # Wrong versions of abbreviations with possible corrections # wrong version : [correction1, correction2, ...] - WRONGDOTS: Dict[str, List[str]] = defaultdict(list) + WRONGDOTS: dict[str, list[str]] = defaultdict(list) # Word forms that should never be interpreted as abbreviations - NOT_ABBREVIATIONS: Set[str] = set() + NOT_ABBREVIATIONS: set[str] = set() # Ensure that only one thread initializes the abbreviations _lock = Lock() @staticmethod def add(abbrev: str, meaning: str, gender: str, fl: Optional[str] = None) -> None: - """ Add an abbreviation to the dictionary. - Called from the config file handler. """ + """Add an abbreviation to the dictionary. 
+ Called from the config file handler.""" # Check for sentence finishers finisher = False not_finisher = False @@ -152,7 +149,14 @@ def add(abbrev: str, meaning: str, gender: str, fl: Optional[str] = None) -> Non # Append the abbreviation and its meaning in tuple form # Multiple meanings are supported for each abbreviation Abbreviations.DICT[abbrev].add( - BIN_Tuple(meaning, 0, gender, "skst" if fl is None else fl, abbrev, "-",) + BIN_Tuple( + meaning, + 0, + gender, + "skst" if fl is None else fl, + abbrev, + "-", + ) ) Abbreviations.MEANINGS.add(meaning) # Adding wrong versions of abbreviations @@ -169,7 +173,14 @@ def add(abbrev: str, meaning: str, gender: str, fl: Optional[str] = None) -> Non # as abbreviations, even though they are listed as such # in the form 'Í.' and 'Á.' for use within person names Abbreviations.WRONGDICT[wabbrev].add( - BIN_Tuple(meaning, 0, gender, "skst" if fl is None else fl, wabbrev, "-",) + BIN_Tuple( + meaning, + 0, + gender, + "skst" if fl is None else fl, + wabbrev, + "-", + ) ) elif "." in abbrev: @@ -182,7 +193,14 @@ def add(abbrev: str, meaning: str, gender: str, fl: Optional[str] = None) -> Non wabbrev = abbrev[:i] + abbrev[i + 1 :] Abbreviations.WRONGDOTS[wabbrev].append(abbrev) Abbreviations.WRONGDICT[wabbrev].add( - BIN_Tuple(meaning, 0, gender, "skst" if fl is None else fl, wabbrev, "-",) + BIN_Tuple( + meaning, + 0, + gender, + "skst" if fl is None else fl, + wabbrev, + "-", + ) ) if len(indices) > 2: # 3 or 4 dots currently in vocabulary @@ -190,7 +208,7 @@ def add(abbrev: str, meaning: str, gender: str, fl: Optional[str] = None) -> Non i1 = indices[0] i2 = indices[1] i3 = indices[2] - wabbrevs: List[str] = [] + wabbrevs: list[str] = [] # 1 and 2 removed wabbrevs.append(abbrev[:i1] + abbrev[i1 + 1 : i2] + abbrev[i2 + 1 :]) # 1 and 3 removed @@ -214,7 +232,14 @@ def add(abbrev: str, meaning: str, gender: str, fl: Optional[str] = None) -> Non Abbreviations.WRONGSINGLES.add(wabbrev) Abbreviations.WRONGDOTS[wabbrev].append(abbrev) Abbreviations.WRONGDICT[wabbrev].add( - BIN_Tuple(meaning, 0, gender, "skst" if fl is None else fl, wabbrev, "-",) + BIN_Tuple( + meaning, + 0, + gender, + "skst" if fl is None else fl, + wabbrev, + "-", + ) ) if finisher: Abbreviations.FINISHERS.add(abbrev) @@ -232,8 +257,8 @@ def has_abbreviation(meaning: str) -> bool: return meaning in Abbreviations.MEANINGS @staticmethod - def get_meaning(abbrev: str) -> Optional[List[BIN_Tuple]]: - """ Lookup meaning(s) of abbreviation, if available. 
""" + def get_meaning(abbrev: str) -> Optional[list[BIN_Tuple]]: + """Look up meaning(s) of abbreviation, if available.""" m = Abbreviations.DICT.get(abbrev) if not m: m = Abbreviations.WRONGDICT.get(abbrev) @@ -241,7 +266,7 @@ def get_meaning(abbrev: str) -> Optional[List[BIN_Tuple]]: @staticmethod def _handle_abbreviations(s: str) -> None: - """ Handle abbreviations in the settings section """ + """Handle abbreviations in the settings section""" # Format: abbrev[*] = "meaning" gender (kk|kvk|hk) # An asterisk after an abbreviation ending with a period # indicates that the abbreviation may finish a sentence @@ -272,22 +297,25 @@ def _handle_abbreviations(s: str) -> None: @staticmethod def _handle_not_abbreviations(s: str) -> None: - """ Handle not_abbreviations in the settings section """ + """Handle not_abbreviations in the settings section""" if len(s) < 3 or s[0] != '"' or s[-1] != '"': raise ConfigError("not_abbreviations should be enclosed in double quotes") Abbreviations.NOT_ABBREVIATIONS.add(s[1:-1]) @staticmethod def initialize(): - """ Read the abbreviations config file """ + """Read the abbreviations config file""" with Abbreviations._lock: if len(Abbreviations.DICT): # Already initialized return section = None - config = open_text(package="tokenizer", resource="Abbrev.conf", encoding="utf-8") - for s in config: + + p = importlib_resources.files("tokenizer").joinpath("Abbrev.conf") + config = p.read_text(encoding="utf-8") + + for s in config.split("\n"): # Ignore comments ix = s.find("#") if ix >= 0: diff --git a/src/tokenizer/definitions.py b/src/tokenizer/definitions.py index 9811192..12a82d3 100644 --- a/src/tokenizer/definitions.py +++ b/src/tokenizer/definitions.py @@ -2,7 +2,7 @@ Definitions used for tokenization of Icelandic text - Copyright (C) 2022 Miðeind ehf. + Copyright (C) 2016-2024 Miðeind ehf. 
Original author: Vilhjálmur Þorsteinsson This software is licensed under the MIT License: @@ -29,13 +29,9 @@ """ from typing import ( - Dict, - FrozenSet, Mapping, - Tuple, Union, Callable, - List, Sequence, Optional, NamedTuple, @@ -45,15 +41,15 @@ import re -BeginTuple = Tuple[int, Optional[int]] -PunctuationTuple = Tuple[int, str] -NumberTuple = Tuple[float, Optional[List[str]], Optional[List[str]]] -DateTimeTuple = Tuple[int, int, int] -MeasurementTuple = Tuple[str, float] -TimeStampTuple = Tuple[int, int, int, int, int, int] -AmountTuple = Tuple[float, str, Optional[List[str]], Optional[List[str]]] -TelnoTuple = Tuple[str, str] -CurrencyTuple = Tuple[str, Optional[List[str]], Optional[List[str]]] +BeginTuple = tuple[int, Optional[int]] +PunctuationTuple = tuple[int, str] +NumberTuple = tuple[float, Optional[list[str]], Optional[list[str]]] +DateTimeTuple = tuple[int, int, int] +MeasurementTuple = tuple[str, float] +TimeStampTuple = tuple[int, int, int, int, int, int] +AmountTuple = tuple[float, str, Optional[list[str]], Optional[list[str]]] +TelnoTuple = tuple[str, str] +CurrencyTuple = tuple[str, Optional[list[str]], Optional[list[str]]] class BIN_Tuple(NamedTuple): @@ -342,7 +338,7 @@ class PersonNameTuple(NamedTuple): # } # Time of day expressions spelled out -CLOCK_NUMBERS: Mapping[str, Tuple[int, int, int]] = { +CLOCK_NUMBERS: Mapping[str, tuple[int, int, int]] = { "eitt": (1, 0, 0), "tvö": (2, 0, 0), "þrjú": (3, 0, 0), @@ -434,7 +430,7 @@ class PersonNameTuple(NamedTuple): SINGLECHAR_FRACTIONS = "↉⅒⅑⅛⅐⅙⅕¼⅓½⅖⅔⅜⅗¾⅘⅝⅚⅞" # Derived unit : (base SI unit, conversion factor/function) -SI_UNITS: Dict[str, Tuple[str, Union[float, Callable[[float], float]]]] = { +SI_UNITS: dict[str, tuple[str, Union[float, Callable[[float], float]]]] = { # Distance "m": ("m", 1.0), "mm": ("m", 1.0e-3), @@ -534,11 +530,11 @@ class PersonNameTuple(NamedTuple): "N": "Norður", } -_unit_lambda: Callable[[str], str] = ( - lambda unit: unit + r"(?!\w)" if unit[-1].isalpha() else unit +_unit_lambda: Callable[[str], str] = lambda unit: ( + unit + r"(?!\w)" if unit[-1].isalpha() else unit ) -SI_UNITS_SET: FrozenSet[str] = frozenset(SI_UNITS.keys()) +SI_UNITS_SET: frozenset[str] = frozenset(SI_UNITS.keys()) SI_UNITS_REGEX_STRING = r"|".join( map( # If the unit ends with a letter, don't allow the next character @@ -721,9 +717,8 @@ def roman_to_int(s: str) -> int: # Króna amount strings allowed before a number, e.g. "kr. 9.900" ISK_AMOUNT_PRECEDING = frozenset(("kr.", "kr", "krónur")) -# URL prefixes. Note that this list should not contain www since -# www.something.com is a domain token, not a URL token. -URL_PREFIXES = ( +# URI scheme prefixes +URI_PREFIXES = ( "http://", "https://", "file://", @@ -739,6 +734,12 @@ def roman_to_int(s: str) -> int: "telnet://", "udp://", "vnc://", + "irc://", + "nntp://", + "wss://", + "ws://", + "xmpp://", + "mtqp://", ) TOP_LEVEL_DOMAINS = frozenset( diff --git a/src/tokenizer/main.py b/src/tokenizer/main.py index 95e0e63..55d74c1 100755 --- a/src/tokenizer/main.py +++ b/src/tokenizer/main.py @@ -3,7 +3,7 @@ Tokenizer for Icelandic text - Copyright (C) 2022 Miðeind ehf. + Copyright (C) 2016-2024 Miðeind ehf. 
Original author: Vilhjálmur Þorsteinsson This software is licensed under the MIT License: @@ -35,7 +35,7 @@ """ -from typing import TextIO, Dict, Iterator, List, Callable, Any, Tuple, Union, cast +from typing import TextIO, Iterator, Callable, Any, Union, cast import sys import argparse @@ -71,8 +71,12 @@ group = parser.add_mutually_exclusive_group() -group.add_argument("--csv", help="Output one token per line in CSV format", action="store_true") -group.add_argument("--json", help="Output one token per line in JSON format", action="store_true") +group.add_argument( + "--csv", help="Output one token per line in CSV format", action="store_true" +) +group.add_argument( + "--json", help="Output one token per line in JSON format", action="store_true" +) parser.add_argument( "-s", @@ -92,7 +96,10 @@ "-p", "--coalesce_percent", action="store_true", - help=("Numbers combined into one token with percentage word forms " "(prósent/prósentustig/hundraðshlutar)"), + help=( + "Numbers combined into one token with percentage word forms " + "(prósent/prósentustig/hundraðshlutar)" + ), ) parser.add_argument( @@ -127,7 +134,10 @@ "-c", "--convert_numbers", action="store_true", - help=("English-style decimal points and thousands separators " "in numbers changed to Icelandic style"), + help=( + "English-style decimal points and thousands separators " + "in numbers changed to Icelandic style" + ), ) parser.add_argument( @@ -148,14 +158,14 @@ def main() -> None: """Main function, called when the tokenize command is invoked""" args = parser.parse_args() - options: Dict[str, bool] = dict() + options: dict[str, bool] = dict() def quote(s: str) -> str: """Return the string s within double quotes, and with any contained backslashes and double quotes escaped with a backslash""" return '"' + s.replace("\\", "\\\\").replace('"', '\\"') + '"' - def spanquote(l: List[int]) -> str: + def spanquote(l: list[int]) -> str: """Return the list l as a string within double quotes""" return '"' + "-".join(str(x) for x in l) + '"' @@ -170,7 +180,7 @@ def val(t: Tok, quote_word: bool = False) -> Any: return None if t.kind == TOK.WORD: # Get the full expansion of an abbreviation - mm = cast(List[BIN_Tuple], t.val) + mm = cast(list[BIN_Tuple], t.val) if quote_word: # Return a |-delimited list of possible meanings, # joined into a single string @@ -203,7 +213,7 @@ def val(t: Tok, quote_word: bool = False) -> Any: TOK.MEASUREMENT, }: # Return a |-delimited list of numbers - vv = cast(Tuple[Any, ...], t.val) + vv = cast(tuple[Any, ...], t.val) return quote("|".join(str(v) for v in vv)) if quote_word and isinstance(t.val, str): return quote(t.val) @@ -244,7 +254,7 @@ def val(t: Tok, quote_word: bool = False) -> Any: # Configure our JSON dump function json_dumps = partial(json.dumps, ensure_ascii=False, separators=(",", ":")) - curr_sent: List[str] = [] + curr_sent: list[str] = [] tsep = "" if args.original else " " # token separator for t in tokenize(gen(args.infile), **options): if args.csv: @@ -265,7 +275,7 @@ def val(t: Tok, quote_word: bool = False) -> Any: print('0,"","","",""', file=args.outfile) elif args.json: # Output the tokens in JSON format, one line per token - d: Dict[str, Union[str, List[int]]] = dict(k=TOK.descr[t.kind]) + d: dict[str, Union[str, list[int]]] = dict(k=TOK.descr[t.kind]) if t.txt is not None: d["t"] = t.txt v = val(t) diff --git a/src/tokenizer/tokenizer.py b/src/tokenizer/tokenizer.py index 1a6dfc8..2e7be72 100644 --- a/src/tokenizer/tokenizer.py +++ b/src/tokenizer/tokenizer.py @@ -2,7 +2,7 @@ Tokenizer 
for Icelandic text - Copyright (C) 2022 Miðeind ehf. + Copyright (C) 2016-2024 Miðeind ehf. Original author: Vilhjálmur Þorsteinsson This software is licensed under the MIT License: @@ -42,14 +42,11 @@ Any, Callable, Deque, - FrozenSet, Iterable, Iterator, - List, Mapping, Match, Optional, - Tuple, Type, TypeVar, Union, @@ -77,7 +74,6 @@ class Tok: - """Information about a single token""" def __init__( @@ -86,7 +82,7 @@ def __init__( txt: Optional[str], val: ValType, original: Optional[str] = None, - origin_spans: Optional[List[int]] = None, + origin_spans: Optional[list[int]] = None, ) -> None: # Type of token self.kind: int = kind @@ -101,7 +97,7 @@ def __init__( # Each such integer index maps the corresponding character # (which may have substitutions) to its index in 'original'. # This is required to preserve 'original' correctly when splitting. - self.origin_spans: Optional[List[int]] = origin_spans + self.origin_spans: Optional[list[int]] = origin_spans @classmethod def from_txt(cls: Type[_T], txt: str) -> _T: @@ -184,7 +180,7 @@ def person_names(self) -> PersonNameList: return [] return cast(PersonNameList, self.val) or [] - def split(self, pos: int) -> Tuple["Tok", "Tok"]: + def split(self, pos: int) -> tuple["Tok", "Tok"]: """Split this token into two at 'pos'. The first token returned will have 'pos' characters and the second one will have the rest. @@ -227,7 +223,7 @@ def split(self, pos: int) -> Tuple["Tok", "Tok"]: return l, r - def substitute(self, span: Tuple[int, int], new: str) -> None: + def substitute(self, span: tuple[int, int], new: str) -> None: """Substitute a span with a single or empty character 'new'.""" self.txt = self.txt[: span[0]] + new + self.txt[span[1] :] if self.origin_spans is not None: @@ -236,7 +232,7 @@ def substitute(self, span: Tuple[int, int], new: str) -> None: self.origin_spans[: span[0] + len(new)] + self.origin_spans[span[1] :] ) - def substitute_longer(self, span: Tuple[int, int], new: str) -> None: + def substitute_longer(self, span: tuple[int, int], new: str) -> None: """Substitute a span with a potentially longer string""" # This tracks origin differently from the regular @@ -312,7 +308,7 @@ def concatenate( self_origin_spans = self.origin_spans or [] other_origin_spans = other.origin_spans or [] - separator_origin_spans: List[int] = ( + separator_origin_spans: list[int] = ( [len(self_original)] * len(separator) if len(other_origin_spans) > 0 else [] ) new_origin_spans = ( @@ -324,7 +320,7 @@ def concatenate( return Tok(new_kind, new_txt, new_val, new_original, new_origin_spans) @property - def as_tuple(self) -> Tuple[Any, ...]: + def as_tuple(self) -> tuple[Any, ...]: """Return the contents of this token as a generic tuple, suitable e.g. for serialization""" return (self.kind, self.txt, self.val) @@ -373,7 +369,6 @@ def quoted_string_repr(obj: Any) -> str: class TOK: - """ The TOK class contains constants that define token types and constructors for creating token instances. 
@@ -647,8 +642,8 @@ def Email(t: Union[Tok, str]) -> Tok: def Number( t: Union[Tok, str], n: float, - cases: Optional[List[str]] = None, - genders: Optional[List[str]] = None, + cases: Optional[list[str]] = None, + genders: Optional[list[str]] = None, ) -> Tok: # The cases parameter is a list of possible cases for this number # (if it was originally stated in words) @@ -670,8 +665,8 @@ def NumberWithLetter(t: Union[Tok, str], n: int, c: str) -> Tok: def Currency( t: Union[Tok, str], iso: str, - cases: Optional[List[str]] = None, - genders: Optional[List[str]] = None, + cases: Optional[list[str]] = None, + genders: Optional[list[str]] = None, ) -> Tok: # The cases parameter is a list of possible cases for this currency name # (if it was originally stated in words, i.e. not abbreviated) @@ -686,8 +681,8 @@ def Amount( t: Union[Tok, str], iso: str, n: float, - cases: Optional[List[str]] = None, - genders: Optional[List[str]] = None, + cases: Optional[list[str]] = None, + genders: Optional[list[str]] = None, ) -> Tok: # The cases parameter is a list of possible cases for this amount # (if it was originally stated in words) @@ -701,8 +696,8 @@ def Amount( def Percent( t: Union[Tok, str], n: float, - cases: Optional[List[str]] = None, - genders: Optional[List[str]] = None, + cases: Optional[list[str]] = None, + genders: Optional[list[str]] = None, ) -> Tok: if isinstance(t, str): return Tok(TOK.PERCENT, t, (n, cases, genders)) @@ -954,7 +949,7 @@ def person_names(self, i: int = 0) -> Optional[PersonNameList]: t = self[i] return t.person_names if t else None - def as_tuple(self, i: int = 0) -> Optional[Tuple[Any, ...]]: + def as_tuple(self, i: int = 0) -> Optional[tuple[Any, ...]]: """Return token.as_tuple for token at index i.""" t = self[i] return t.as_tuple if t else None @@ -968,7 +963,7 @@ def could_be_end_of_sentence(self, i: int = 0, *args: Any) -> bool: def normalized_text(token: Tok) -> str: """Returns token text after normalizing punctuation""" return ( - cast(Tuple[int, str], token.val)[1] + cast(tuple[int, str], token.val)[1] if token.kind == TOK.PUNCTUATION else token.txt ) @@ -995,7 +990,7 @@ def is_valid_date(y: int, m: int, d: int) -> bool: return False -def parse_digits(tok: Tok, convert_numbers: bool) -> Tuple[Tok, Tok]: +def parse_digits(tok: Tok, convert_numbers: bool) -> tuple[Tok, Tok]: """Parse a raw token starting with a digit""" w = tok.txt s: Optional[Match[str]] = re.match(r"\d{1,2}:\d\d:\d\d,\d\d(?!\d)", w) @@ -1338,7 +1333,7 @@ def parse_digits(tok: Tok, convert_numbers: bool) -> Tuple[Tok, Tok]: ) -def html_escape(match: Match[str]) -> Tuple[Tuple[int, int], str]: +def html_escape(match: Match[str]) -> tuple[tuple[int, int], str]: """Regex substitution function for HTML escape codes""" g = match.group(4) if g is not None: @@ -1401,7 +1396,7 @@ def generate_rough_tokens_from_tok(tok: Tok) -> Iterator[Tok]: # This function further splits those tokens into multiple tokens. # Rough tokens are tokens that are separated by white space, i.e. 
the regex (\\s*).""" - def shift_span(span: Tuple[int, int], pos: int): + def shift_span(span: tuple[int, int], pos: int): """Shift a span by a given amount""" return (span[SPAN_START] + pos, span[SPAN_END] + pos) @@ -1559,7 +1554,7 @@ def generate_raw_tokens( def could_be_end_of_sentence( next_token: Tok, - test_set: FrozenSet[int] = TOK.TEXT, + test_set: frozenset[int] = TOK.TEXT, multiplier: bool = False, ) -> bool: """Return True if next_token could be ending the current sentence or @@ -1578,7 +1573,6 @@ def could_be_end_of_sentence( class LetterParser: - """Parses a sequence of alphabetic characters off the front of a raw token""" @@ -1663,7 +1657,6 @@ def parse(self) -> Iterable[Tok]: class NumberParser: - """Parses a sequence of digits off the front of a raw token""" def __init__( @@ -1724,7 +1717,6 @@ def parse(self) -> Iterable[Tok]: class PunctuationParser: - """Parses a sequence of punctuation off the front of a raw token""" def __init__(self) -> None: @@ -1856,7 +1848,7 @@ def parse_mixed( ate = True rtxt = rt.txt - if rtxt and rtxt.startswith(URL_PREFIXES): + if rtxt and rtxt.startswith(URI_PREFIXES): # Handle URL: cut RIGHT_PUNCTUATION characters off its end, # even though many of them are actually allowed according to # the IETF RFC @@ -1931,7 +1923,7 @@ def parse_mixed( # Check for currency abbreviations immediately followed by a number if len(rt.txt) > 3 and rt.txt[0:3] in CURRENCY_ABBREV and rt.txt[3].isdigit(): - # XXX: This feels a little hacky + # TODO: This feels a little hacky temp_tok = Tok(TOK.RAW, rt.txt[3:], None) digit_tok, _ = parse_digits(temp_tok, convert_numbers) if digit_tok.kind == TOK.NUMBER: @@ -2108,7 +2100,7 @@ def is_abbr_with_period(txt: str) -> bool: return txt not in Abbreviations.DICT return False - def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]: + def lookup(abbrev: str) -> Optional[list[BIN_Tuple]]: """Look up an abbreviation, both in original case and in lower case, and return either None if not found or a meaning list having one entry""" m = Abbreviations.DICT.get(abbrev) @@ -2145,7 +2137,7 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]: and not token_stream.could_be_end_of_sentence() ): # This is something like 'Ég fæddist 25.9. í Svarfaðardal.' 
- y, m, d = cast(Tuple[int, int, int], token.val) + y, m, d = cast(tuple[int, int, int], token.val) token = TOK.Daterel(token.concatenate(next_token), y, m, d) next_token = next(token_stream) @@ -2647,7 +2639,7 @@ def parse_phrases_1(token_stream: Iterator[Tok]) -> Iterator[Tok]: if abbrev in Abbreviations.FINISHERS: token = TOK.Word( token.concatenate(next_token), - cast(Optional[List[BIN_Tuple]], token.val), + cast(Optional[list[BIN_Tuple]], token.val), ) next_token = next(token_stream) @@ -2975,7 +2967,7 @@ def parse_phrases_2( # Check for composites: # 'stjórnskipunar- og eftirlitsnefnd' # 'dómsmála-, viðskipta- og iðnaðarráðherra' - tq: List[Tok] = [] + tq: list[Tok] = [] while token.kind == TOK.WORD and next_token.punctuation == COMPOSITE_HYPHEN: # Accumulate the prefix in tq tq.append(token) @@ -3081,7 +3073,7 @@ def split_into_sentences( to_text = lambda t: t.original or t.txt else: to_text = lambda t: t.txt - curr_sent: List[str] = [] + curr_sent: list[str] = [] for t in tokenize_without_annotation(text_or_gen, **options): if t.kind in TOK.END: # End of sentence/paragraph @@ -3111,14 +3103,14 @@ def mark_paragraphs(txt: str) -> str: return "[[" + "]][[".join(t for t in txt.split("\n") if t) + "]]" -def paragraphs(tokens: Iterable[Tok]) -> Iterator[List[Tuple[int, List[Tok]]]]: +def paragraphs(tokens: Iterable[Tok]) -> Iterator[list[tuple[int, list[Tok]]]]: """Generator yielding paragraphs from token iterable. Each paragraph is a list of sentence tuples. Sentence tuples consist of the index of the first token of the sentence (the TOK.S_BEGIN token) and a list of the tokens within the sentence, not including the starting TOK.S_BEGIN or the terminating TOK.S_END tokens.""" - def valid_sent(sent: Optional[List[Tok]]) -> bool: + def valid_sent(sent: Optional[list[Tok]]) -> bool: """Return True if the token list in sent is a proper sentence that we want to process further""" if not sent: @@ -3126,9 +3118,9 @@ def valid_sent(sent: Optional[List[Tok]]) -> bool: # A sentence with only punctuation is not valid return any(t[0] != TOK.PUNCTUATION for t in sent) - sent: List[Tok] = [] # Current sentence + sent: list[Tok] = [] # Current sentence sent_begin = 0 - current_p: List[Tuple[int, List[Tok]]] = [] # Current paragraph + current_p: list[tuple[int, list[Tok]]] = [] # Current paragraph for ix, t in enumerate(tokens): t0 = t[0] @@ -3184,7 +3176,7 @@ def correct_spaces(s: str) -> str: with correct spacing between tokens. NOTE that this function uses a quick-and-dirty approach which may not handle all edge cases!""" - r: List[str] = [] + r: list[str] = [] last = TP_NONE double_quote_count = 0 for w in RE_SPLIT.split(s): @@ -3244,7 +3236,7 @@ def detokenize(tokens: Iterable[Tok], normalize: bool = False) -> str: to a correctly spaced string. If normalize is True, punctuation is normalized before assembling the string.""" to_text: Callable[[Tok], str] = normalized_text if normalize else lambda t: t.txt - r: List[str] = [] + r: list[str] = [] last = TP_NONE double_quote_count = 0 for t in tokens: @@ -3278,7 +3270,7 @@ def detokenize(tokens: Iterable[Tok], normalize: bool = False) -> str: def calculate_indexes( tokens: Iterable[Tok], last_is_end: bool = False -) -> Tuple[List[int], List[int]]: +) -> tuple[list[int], list[int]]: """Calculate character and byte indexes for a token stream. The indexes are the start positions of each token in the original text that was tokenized. 
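Among the source changes above, the abbrev.py hunk appears to carry the actual Python 3.13 compatibility fix: the legacy importlib.resources.open_text() call is replaced by the files()/joinpath()/read_text() API, which is available from Python 3.9 onwards and unchanged in 3.13. A standalone sketch of that pattern (read_abbrev_lines is a hypothetical wrapper, not part of the library), assuming the installed tokenizer package ships Abbrev.conf as a package resource:

    # Sketch of the resource-loading pattern adopted in Abbreviations.initialize():
    # locate a text file inside the installed package and read it as one string.
    import importlib.resources as importlib_resources

    def read_abbrev_lines() -> list[str]:
        # files() returns a Traversable for the package directory; joinpath()
        # points at the resource and read_text() loads its full contents.
        p = importlib_resources.files("tokenizer").joinpath("Abbrev.conf")
        return p.read_text(encoding="utf-8").split("\n")

    for line in read_abbrev_lines():
        pass  # each line is then parsed for abbreviation entries, as in abbrev.py

The same move to a Python 3.9 floor is what permits the blanket swap of typing.Dict/List/Tuple/Set/FrozenSet annotations for the built-in dict/list/tuple/set/frozenset generics throughout these files, since PEP 585 built-in generics require 3.9.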
diff --git a/test/test_detokenize.py b/test/test_detokenize.py index db5f413..09545e1 100644 --- a/test/test_detokenize.py +++ b/test/test_detokenize.py @@ -6,7 +6,7 @@ Tests for Tokenizer module - Copyright (C) 2022 by Miðeind ehf. + Copyright (C) 2016-2024 by Miðeind ehf. Original author: Vilhjálmur Þorsteinsson This software is licensed under the MIT License: @@ -37,7 +37,7 @@ def test_detokenize() -> None: - options = { "normalize": True } + options = {"normalize": True} def should_be_equal(s: str) -> None: toklist = t.tokenize(s, **options) @@ -58,19 +58,18 @@ def should_be(s1: str, s2: str) -> None: should_be_equal("Páll veiddi 74 cm. lax í Norðurá þann 1.3.") should_be( - "Páll var með \"netfangið\" palli@einn.i.heiminum.is.", - "Páll var með „netfangið“ palli@einn.i.heiminum.is." + 'Páll var með "netfangið" palli@einn.i.heiminum.is.', + "Páll var með „netfangið“ palli@einn.i.heiminum.is.", ) # !!! BUG - #should_be( + # should_be( # "Páll var með \"netfangið\", þ.e.a.s. (\"þetta\").", # "Páll var með „netfangið“, þ.e.a.s. („þetta“).", - #) + # ) - options = { "normalize": False } + options = {"normalize": False} should_be_equal("Páll var með „netfangið“, þ.e.a.s. („þetta“).") - should_be_equal("Páll var með \"netfangið\" palli@einn.i.heiminum.is.") - should_be_equal("Páll var með \"netfangið\", þ.e.a.s. (\"þetta\").") - + should_be_equal('Páll var með "netfangið" palli@einn.i.heiminum.is.') + should_be_equal('Páll var með "netfangið", þ.e.a.s. ("þetta").') diff --git a/test/test_index_calculation.py b/test/test_index_calculation.py index 0b59e35..60a81a8 100644 --- a/test/test_index_calculation.py +++ b/test/test_index_calculation.py @@ -6,7 +6,7 @@ Tests for Tokenizer module - Copyright (C) 2022 by Miðeind ehf. + Copyright (C) 2016-2024 by Miðeind ehf. This software is licensed under the MIT License: @@ -169,7 +169,6 @@ def test_small_difficult_cases() -> None: assert char_indexes == [0, 2, 4] assert byte_indexes == [0, 2, 4] - # Two byte characters for x in ["þ", "æ", "á"]: s = x @@ -230,12 +229,11 @@ def test_small_difficult_cases() -> None: assert char_indexes == [0, 2, 4] assert byte_indexes == [0, 3, 6] - # Two character characters # These strings contain two unicode code points that are rendered as one letter. # They are counted as two characters in python. # In addition the accent and umlaut characters are two bytes. 
- for x in ["a"+ACCENT, "o"+UMLAUT]: + for x in ["a" + ACCENT, "o" + UMLAUT]: s = x toks = tokenizer.parse_tokens([s]) char_indexes, byte_indexes = tokenizer.calculate_indexes(toks) @@ -288,11 +286,11 @@ def test_small_difficult_cases() -> None: # example chars: # " a´ a´" # 012345 - # ^ ^ + # ^ ^ # example bytes: # " a´_ a´_" # 01234567 - # ^ ^ + # ^ ^ toks = tokenizer.parse_tokens([s]) char_indexes, byte_indexes = tokenizer.calculate_indexes(toks) assert char_indexes == [0, 3] @@ -302,7 +300,6 @@ def test_small_difficult_cases() -> None: assert char_indexes == [0, 3, 6] assert byte_indexes == [0, 4, 8] - # The em-dash is 3 bytes for x in [EM_DASH]: s = x @@ -361,7 +358,7 @@ def test_small_difficult_cases() -> None: # example bytes: # " a__ a__" # 01234567 - # ^ ^ + # ^ ^ toks = tokenizer.parse_tokens([s]) char_indexes, byte_indexes = tokenizer.calculate_indexes(toks) assert char_indexes == [0, 2] @@ -379,25 +376,181 @@ def test_larger_case() -> None: # x x x xx x toks = tokenizer.parse_tokens([s]) char_indexes, byte_indexes = tokenizer.calculate_indexes(toks) - assert char_indexes == [0, 5, 13, 16, 18, 25, 30, 33, 36, 40, 45, 50, 53, 61, 66, 72] - assert byte_indexes == [0, 6, 14, 17, 20, 27, 32, 35, 38, 43, 50, 55, 58, 66, 72, 78] + assert char_indexes == [ + 0, + 5, + 13, + 16, + 18, + 25, + 30, + 33, + 36, + 40, + 45, + 50, + 53, + 61, + 66, + 72, + ] + assert byte_indexes == [ + 0, + 6, + 14, + 17, + 20, + 27, + 32, + 35, + 38, + 43, + 50, + 55, + 58, + 66, + 72, + 78, + ] toks = tokenizer.parse_tokens([s]) char_indexes, byte_indexes = tokenizer.calculate_indexes(toks, last_is_end=True) - assert char_indexes == [0, 5, 13, 16, 18, 25, 30, 33, 36, 40, 45, 50, 53, 61, 66, 72, 73] - assert byte_indexes == [0, 6, 14, 17, 20, 27, 32, 35, 38, 43, 50, 55, 58, 66, 72, 78, 79] + assert char_indexes == [ + 0, + 5, + 13, + 16, + 18, + 25, + 30, + 33, + 36, + 40, + 45, + 50, + 53, + 61, + 66, + 72, + 73, + ] + assert byte_indexes == [ + 0, + 6, + 14, + 17, + 20, + 27, + 32, + 35, + 38, + 43, + 50, + 55, + 58, + 66, + 72, + 78, + 79, + ] def test_iterator_cases() -> None: - s = ["Þessi ", "setning ", "er ", "í ", "lengra ", "lagi ", "og ", "er ", "með ", "bæði ", "eins ", "og ", "tveggja ", "bæta ", "stafi."] + s = [ + "Þessi ", + "setning ", + "er ", + "í ", + "lengra ", + "lagi ", + "og ", + "er ", + "með ", + "bæði ", + "eins ", + "og ", + "tveggja ", + "bæta ", + "stafi.", + ] # (char and byte indexes in a similar test above) toks = tokenizer.parse_tokens(s) char_indexes, byte_indexes = tokenizer.calculate_indexes(toks) - assert char_indexes == [0, 5, 13, 16, 18, 25, 30, 33, 36, 40, 45, 50, 53, 61, 66, 72] - assert byte_indexes == [0, 6, 14, 17, 20, 27, 32, 35, 38, 43, 50, 55, 58, 66, 72, 78] + assert char_indexes == [ + 0, + 5, + 13, + 16, + 18, + 25, + 30, + 33, + 36, + 40, + 45, + 50, + 53, + 61, + 66, + 72, + ] + assert byte_indexes == [ + 0, + 6, + 14, + 17, + 20, + 27, + 32, + 35, + 38, + 43, + 50, + 55, + 58, + 66, + 72, + 78, + ] toks = tokenizer.parse_tokens(s) char_indexes, byte_indexes = tokenizer.calculate_indexes(toks, last_is_end=True) - assert char_indexes == [0, 5, 13, 16, 18, 25, 30, 33, 36, 40, 45, 50, 53, 61, 66, 72, 73] - assert byte_indexes == [0, 6, 14, 17, 20, 27, 32, 35, 38, 43, 50, 55, 58, 66, 72, 78, 79] + assert char_indexes == [ + 0, + 5, + 13, + 16, + 18, + 25, + 30, + 33, + 36, + 40, + 45, + 50, + 53, + 61, + 66, + 72, + 73, + ] + assert byte_indexes == [ + 0, + 6, + 14, + 17, + 20, + 27, + 32, + 35, + 38, + 43, + 50, + 55, + 58, + 66, + 72, + 78, + 79, + ] s = 
["Stutt setning.", "", "Önnur setning."] # 01234567890123 45678901234567 @@ -493,11 +646,15 @@ def test_lengthening_substitutions() -> None: # ^ ^ ^ ^ ^ # x x # ! lengthening happens here (3ji->þriðji) - toks = tokenizer.parse_tokens(s, handle_kludgy_ordinals=tokenizer.KLUDGY_ORDINALS_MODIFY) + toks = tokenizer.parse_tokens( + s, handle_kludgy_ordinals=tokenizer.KLUDGY_ORDINALS_MODIFY + ) char_indexes, byte_indexes = tokenizer.calculate_indexes(toks) assert char_indexes == [0, 5, 8, 12, 21] assert byte_indexes == [0, 6, 9, 13, 23] - toks = tokenizer.parse_tokens(s, handle_kludgy_ordinals=tokenizer.KLUDGY_ORDINALS_MODIFY) + toks = tokenizer.parse_tokens( + s, handle_kludgy_ordinals=tokenizer.KLUDGY_ORDINALS_MODIFY + ) char_indexes, byte_indexes = tokenizer.calculate_indexes(toks, last_is_end=True) assert char_indexes == [0, 5, 8, 12, 21, 22] assert byte_indexes == [0, 6, 9, 13, 23, 24] diff --git a/test/test_tokenizer.py b/test/test_tokenizer.py index 01116fa..193cfef 100755 --- a/test/test_tokenizer.py +++ b/test/test_tokenizer.py @@ -5,7 +5,7 @@ Tests for Tokenizer module - Copyright (C) 2022 by Miðeind ehf. + Copyright (C) 2016-2024 by Miðeind ehf. Original author: Vilhjálmur Þorsteinsson This software is licensed under the MIT License: @@ -31,7 +31,7 @@ """ -from typing import Any, Iterable, Iterator, List, Tuple, Union, cast +from typing import Any, Iterable, Iterator, Union, cast import tokenizer as t from tokenizer.definitions import BIN_Tuple, ValType @@ -39,14 +39,14 @@ TOK = t.TOK Tok = t.Tok -TestCase = Union[Tuple[str, int], Tuple[str, int, ValType], Tuple[str, List[Tok]]] +TestCase = Union[tuple[str, int], tuple[str, int, ValType], tuple[str, list[Tok]]] -def strip_originals(tokens: List[Tok]) -> List[Tok]: +def strip_originals(tokens: list[Tok]) -> list[Tok]: """Remove origin tracking info from a list of tokens. This is useful for simplifying tests where we don't care about tracking origins. - XXX: This could be removed if we get a feature to disable origin + TODO: This could be removed if we get a feature to disable origin tracking during tokenization. 
""" @@ -57,7 +57,7 @@ def strip_originals(tokens: List[Tok]) -> List[Tok]: return tokens -def get_text_and_norm(orig: str) -> Tuple[str, str]: +def get_text_and_norm(orig: str) -> tuple[str, str]: toklist = list(t.tokenize(orig)) return t.text_from_tokens(toklist), t.normalized_text_from_tokens(toklist) @@ -563,12 +563,12 @@ def test_single_tokens() -> None: def run_test(test_cases: Iterable[TestCase], **options: Any) -> None: for test_case in test_cases: if len(test_case) == 3: - txt, kind, val = cast(Tuple[str, int, ValType], test_case) + txt, kind, val = cast(tuple[str, int, ValType], test_case) c = [Tok(kind, txt, val)] elif isinstance(test_case[1], list): - txt, c = cast(Tuple[str, List[Tok]], test_case) + txt, c = cast(tuple[str, list[Tok]], test_case) else: - txt, kind = cast(Tuple[str, int], test_case) + txt, kind = cast(tuple[str, int], test_case) c = [Tok(kind, txt, None)] l = list(t.tokenize(txt, **options)) assert len(l) == len(c) + 2, repr(l) @@ -593,8 +593,8 @@ def run_test(test_cases: Iterable[TestCase], **options: Any) -> None: if check.kind == TOK.WORD: # Test set equivalence, since the order of word meanings # is not deterministic - assert set(cast(List[BIN_Tuple], tok.val) or []) == set( - cast(List[BIN_Tuple], check.val) or [] + assert set(cast(list[BIN_Tuple], tok.val) or []) == set( + cast(list[BIN_Tuple], check.val) or [] ), (repr(tok.val) + " != " + repr(check.val)) else: assert tok.val == check.val, ( diff --git a/test/test_tokenizer_tok.py b/test/test_tokenizer_tok.py index 3ee1f46..c0caa7e 100644 --- a/test/test_tokenizer_tok.py +++ b/test/test_tokenizer_tok.py @@ -3,7 +3,7 @@ Tests for Tokenizer module - Copyright (C) 2022 by Miðeind ehf. + Copyright (C) 2016-2024 by Miðeind ehf. This software is licensed under the MIT License: