From b123fbcee97120d67e41d02febf2fb03da76476d Mon Sep 17 00:00:00 2001 From: thorunna Date: Thu, 2 Nov 2023 13:42:47 +0000 Subject: [PATCH 1/9] added handling for abbreviations --- src/tokenizer/tokenizer.py | 234 +++++++++---------------------------- 1 file changed, 53 insertions(+), 181 deletions(-) diff --git a/src/tokenizer/tokenizer.py b/src/tokenizer/tokenizer.py index 1d38321..73f5fa1 100644 --- a/src/tokenizer/tokenizer.py +++ b/src/tokenizer/tokenizer.py @@ -232,9 +232,7 @@ def substitute(self, span: Tuple[int, int], new: str) -> None: self.txt = self.txt[: span[0]] + new + self.txt[span[1] :] if self.origin_spans is not None: # Remove origin entries that correspond to characters that are gone. - self.origin_spans = ( - self.origin_spans[: span[0] + len(new)] + self.origin_spans[span[1] :] - ) + self.origin_spans = self.origin_spans[: span[0] + len(new)] + self.origin_spans[span[1] :] def substitute_longer(self, span: Tuple[int, int], new: str) -> None: """Substitute a span with a potentially longer string""" @@ -312,13 +310,9 @@ def concatenate( self_origin_spans = self.origin_spans or [] other_origin_spans = other.origin_spans or [] - separator_origin_spans: List[int] = ( - [len(self_original)] * len(separator) if len(other_origin_spans) > 0 else [] - ) + separator_origin_spans: List[int] = [len(self_original)] * len(separator) if len(other_origin_spans) > 0 else [] new_origin_spans = ( - self_origin_spans - + separator_origin_spans - + [i + len(self_original) for i in other_origin_spans] + self_origin_spans + separator_origin_spans + [i + len(self_original) for i in other_origin_spans] ) return Tok(new_kind, new_txt, new_val, new_original, new_origin_spans) @@ -343,9 +337,7 @@ def __getitem__(self, i: int) -> Union[int, str, ValType]: def equal(self, other: "Tok") -> bool: """Equality of content between two tokens, i.e. 
ignoring the 'original' and 'origin_spans' attributes""" - return ( - self.kind == other.kind and self.txt == other.txt and self.val == other.val - ) + return self.kind == other.kind and self.txt == other.txt and self.val == other.val def __eq__(self, o: Any) -> bool: """Full equality between two Tok instances""" @@ -587,9 +579,7 @@ def Daterel(t: Union[Tok, str], y: int, m: int, d: int) -> Tok: return t @staticmethod - def Timestamp( - t: Union[Tok, str], y: int, mo: int, d: int, h: int, m: int, s: int - ) -> Tok: + def Timestamp(t: Union[Tok, str], y: int, mo: int, d: int, h: int, m: int, s: int) -> Tok: if isinstance(t, str): return Tok(TOK.TIMESTAMP, t, (y, mo, d, h, m, s)) t.kind = TOK.TIMESTAMP @@ -597,9 +587,7 @@ def Timestamp( return t @staticmethod - def Timestampabs( - t: Union[Tok, str], y: int, mo: int, d: int, h: int, m: int, s: int - ) -> Tok: + def Timestampabs(t: Union[Tok, str], y: int, mo: int, d: int, h: int, m: int, s: int) -> Tok: if isinstance(t, str): return Tok(TOK.TIMESTAMPABS, t, (y, mo, d, h, m, s)) t.kind = TOK.TIMESTAMPABS @@ -607,9 +595,7 @@ def Timestampabs( return t @staticmethod - def Timestamprel( - t: Union[Tok, str], y: int, mo: int, d: int, h: int, m: int, s: int - ) -> Tok: + def Timestamprel(t: Union[Tok, str], y: int, mo: int, d: int, h: int, m: int, s: int) -> Tok: if isinstance(t, str): return Tok(TOK.TIMESTAMPREL, t, (y, mo, d, h, m, s)) t.kind = TOK.TIMESTAMPREL @@ -967,11 +953,7 @@ def could_be_end_of_sentence(self, i: int = 0, *args: Any) -> bool: def normalized_text(token: Tok) -> str: """Returns token text after normalizing punctuation""" - return ( - cast(Tuple[int, str], token.val)[1] - if token.kind == TOK.PUNCTUATION - else token.txt - ) + return cast(Tuple[int, str], token.val)[1] if token.kind == TOK.PUNCTUATION else token.txt def text_from_tokens(tokens: Iterable[Tok]) -> str: @@ -1234,11 +1216,7 @@ def parse_digits(tok: Tok, convert_numbers: bool) -> Tuple[Tok, Tok]: p = g.split("/") m = int(p[1]) d = int(p[0]) - if ( - p[0][0] != "0" - and p[1][0] != "0" - and ((d <= 5 and m <= 6) or (d == 1 and m <= 10)) - ): + if p[0][0] != "0" and p[1][0] != "0" and ((d <= 5 and m <= 6) or (d == 1 and m <= 10)): # This is probably a fraction, not a date # (1/2, 1/3, 1/4, 1/5, 1/6, 2/3, 2/5, 5/6 etc.) # Return a number @@ -1359,9 +1337,7 @@ def unicode_replacement(token: Tok) -> Tok: total_reduction = 0 for m in UNICODE_REGEX.finditer(token.txt): span, new_letter = m.span(), UNICODE_REPLACEMENTS[m.group(0)] - token.substitute( - (span[0] - total_reduction, span[1] - total_reduction), new_letter - ) + token.substitute((span[0] - total_reduction, span[1] - total_reduction), new_letter) total_reduction += span[1] - span[0] - len(new_letter) return token @@ -1371,9 +1347,7 @@ def html_replacement(token: Tok) -> Tok: total_reduction = 0 for m in HTML_ESCAPE_REGEX.finditer(token.txt): span, new_letter = html_escape(m) - token.substitute( - (span[0] - total_reduction, span[1] - total_reduction), new_letter - ) + token.substitute((span[0] - total_reduction, span[1] - total_reduction), new_letter) total_reduction += span[1] - span[0] - len(new_letter) return token @@ -1415,12 +1389,8 @@ def shift_span(span: Tuple[int, int], pos: int): assert match is not None # Since the match indexes the text of the original token, # we need to shift the indices so that they match the current token. 
- shifted_all_group_span = shift_span( - match.span(ROUGH_TOKEN_REGEX_ENTIRE_MATCH), -pos - ) - shifted_white_space_span = shift_span( - match.span(ROUGH_TOKEN_REGEX_WHITE_SPACE_GROUP), -pos - ) + shifted_all_group_span = shift_span(match.span(ROUGH_TOKEN_REGEX_ENTIRE_MATCH), -pos) + shifted_white_space_span = shift_span(match.span(ROUGH_TOKEN_REGEX_WHITE_SPACE_GROUP), -pos) # Then we split the current token using the shifted spans small_tok, tok = tok.split(shifted_all_group_span[SPAN_END]) # Remove whitespace characters from the start of the token @@ -1453,7 +1423,6 @@ def generate_raw_tokens( big_text: str for big_text in text_or_gen: - if not one_sent_per_line and not big_text: # An explicit empty string in the input always # causes a sentence split @@ -1592,12 +1561,7 @@ def parse(self) -> Iterable[Tok]: lw = len(rt.txt) i = 1 while i < lw and ( - rt.txt[i].isalpha() - or ( - rt.txt[i] in PUNCT_INSIDE_WORD - and i + 1 < lw - and rt.txt[i + 1].isalpha() - ) + rt.txt[i].isalpha() or (rt.txt[i] in PUNCT_INSIDE_WORD and i + 1 < lw and rt.txt[i + 1].isalpha()) ): # We allow dots to occur inside words in the case of # abbreviations; also apostrophes are allowed within @@ -1667,9 +1631,7 @@ class NumberParser: """Parses a sequence of digits off the front of a raw token""" - def __init__( - self, rt: Tok, handle_kludgy_ordinals: int, convert_numbers: bool - ) -> None: + def __init__(self, rt: Tok, handle_kludgy_ordinals: int, convert_numbers: bool) -> None: self.rt = rt self.handle_kludgy_ordinals = handle_kludgy_ordinals self.convert_numbers = convert_numbers @@ -1690,10 +1652,7 @@ def parse(self) -> Iterable[Tok]: # '1sti' -> 'fyrsti', '3ji' -> 'þriðji', etc. key_tok.substitute_longer((0, len(key)), val) yield TOK.Word(key_tok) - elif ( - handle_kludgy_ordinals == KLUDGY_ORDINALS_TRANSLATE - and key in ORDINAL_NUMBERS - ): + elif handle_kludgy_ordinals == KLUDGY_ORDINALS_TRANSLATE and key in ORDINAL_NUMBERS: # Convert word-form ordinals into ordinal tokens, # i.e. '1sti' -> TOK.Ordinal('1sti', 1), # but leave other kludgy constructs ('2ja') @@ -1822,16 +1781,13 @@ def parse(self, rt: Tok) -> Iterable[Tok]: self.ate = ate -def parse_mixed( - rt: Tok, handle_kludgy_ordinals: int, convert_numbers: bool -) -> Iterable[Tok]: +def parse_mixed(rt: Tok, handle_kludgy_ordinals: int, convert_numbers: bool) -> Iterable[Tok]: """Parse a mixed raw token string, from the token rt""" # Initialize a singleton parser for punctuation pp = PunctuationParser() while rt.txt: - # Handle punctuation yield from pp.parse(rt) rt, ate = pp.rt, pp.ate @@ -1910,8 +1866,7 @@ def parse_mixed( # Numbers or other stuff starting with a digit # (eventually prefixed by a '+' or '-') if rtxt and ( - rtxt[0] in DIGITS_PREFIX - or (rtxt[0] in SIGN_PREFIX and len(rtxt) >= 2 and rtxt[1] in DIGITS_PREFIX) + rtxt[0] in DIGITS_PREFIX or (rtxt[0] in SIGN_PREFIX and len(rtxt) >= 2 and rtxt[1] in DIGITS_PREFIX) ): np = NumberParser(rt, handle_kludgy_ordinals, convert_numbers) yield from np.parse() @@ -1977,9 +1932,7 @@ def parse_tokens(txt: Union[str, Iterable[str]], **options: Any) -> Iterator[Tok # The default behavior for kludgy ordinals is to pass them # through as word tokens - handle_kludgy_ordinals: int = options.get( - "handle_kludgy_ordinals", KLUDGY_ORDINALS_PASS_THROUGH - ) + handle_kludgy_ordinals: int = options.get("handle_kludgy_ordinals", KLUDGY_ORDINALS_PASS_THROUGH) # This code proceeds roughly as follows: # 1) The text is split into raw tokens on whitespace boundaries. 
@@ -2002,9 +1955,7 @@ def parse_tokens(txt: Union[str, Iterable[str]], **options: Any) -> Iterator[Tok rtxt: str = "" - for rt in generate_raw_tokens( - txt, replace_composite_glyphs, replace_html_escapes, one_sent_per_line - ): + for rt in generate_raw_tokens(txt, replace_composite_glyphs, replace_html_escapes, one_sent_per_line): # rt: raw token if rt.kind in {TOK.S_SPLIT, TOK.P_BEGIN, TOK.P_END}: @@ -2134,18 +2085,13 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]: and (next_token.kind == TOK.NUMBER or next_token.kind == TOK.YEAR) ): currabbr = CURRENCY_SYMBOLS[token.txt] - token = TOK.Amount( - token.concatenate(next_token), currabbr, next_token.number - ) + token = TOK.Amount(token.concatenate(next_token), currabbr, next_token.number) next_token = next(token_stream) # Special case for a DATEREL token of the form "25.10.", # i.e. with a trailing period: It can end a sentence if token.kind == TOK.DATEREL and "." in token.txt: - if ( - next_token.txt == "." - and not token_stream.could_be_end_of_sentence() - ): + if next_token.txt == "." and not token_stream.could_be_end_of_sentence(): # This is something like 'Ég fæddist 25.9. í Svarfaðardal.' y, m, d = cast(Tuple[int, int, int], token.val) token = TOK.Daterel(token.concatenate(next_token), y, m, d) @@ -2154,11 +2100,7 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]: # Coalesce abbreviations ending with a period into a single # abbreviation token if next_token.punctuation == ".": - if ( - token.kind == TOK.WORD - and token.txt[-1] != "." - and is_abbr_with_period(token.txt) - ): + if token.kind == TOK.WORD and token.txt[-1] != "." and is_abbr_with_period(token.txt): # Abbreviation ending with period: make a special token for it # and advance the input stream follow_token = next(token_stream) @@ -2181,9 +2123,7 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]: # þær þarf að vera hægt að sameina í þessa flóknari tóka en viljum # geta merkt það sem villu. Ætti líklega að setja í sérlista, # WRONG_MONTHS, og sérif-lykkju og setja inn villu í tókann. - finish = could_be_end_of_sentence( - follow_token, test_set, abbrev in NUMBER_ABBREV - ) + finish = could_be_end_of_sentence(follow_token, test_set, abbrev in NUMBER_ABBREV) if finish: # Potentially at the end of a sentence if abbrev in Abbreviations.FINISHERS: @@ -2195,10 +2135,7 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]: yield token # Set token to the period token = next_token - elif ( - abbrev in Abbreviations.NOT_FINISHERS - or abbrev.lower() in Abbreviations.NOT_FINISHERS - ): + elif abbrev in Abbreviations.NOT_FINISHERS or abbrev.lower() in Abbreviations.NOT_FINISHERS: # This is a potential abbreviation that we don't interpret # as such if it's at the end of a sentence # ('dags.', 'próf.', 'mín.'). Note that this also @@ -2207,9 +2144,7 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]: token = next_token else: # Substitute the abbreviation and eat the period - token = TOK.Word( - token.concatenate(next_token), lookup(abbrev) - ) + token = TOK.Word(token.concatenate(next_token), lookup(abbrev)) else: # 'Regular' abbreviation in the middle of a sentence: # Eat the period and yield the abbreviation as a single token @@ -2245,9 +2180,7 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]: next_token = next(token_stream) # Coalesce 'klukkan/kl. 
átta/hálfátta' into a time - elif ( - next_token.kind == TOK.WORD and next_token.txt.lower() in CLOCK_NUMBERS - ): + elif next_token.kind == TOK.WORD and next_token.txt.lower() in CLOCK_NUMBERS: if token.kind == TOK.WORD and token.txt.lower() in CLOCK_ABBREVS: # Match: coalesce and step to next token next_txt = next_token.txt.lower() @@ -2309,9 +2242,7 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]: sign = next_token.txt # Store promille as one-tenth of a percentage factor = 1.0 if sign == "%" else 0.1 - token = TOK.Percent( - token.concatenate(next_token), token.number * factor - ) + token = TOK.Percent(token.concatenate(next_token), token.number * factor) next_token = next(token_stream) # Coalesce ordinals (1. = first, 2. = second...) into a single token @@ -2337,20 +2268,13 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]: ): # OK: replace the number/Roman numeral and the period # with an ordinal token - num = ( - token.integer - if token.kind == TOK.NUMBER - else roman_to_int(token.txt) - ) + num = token.integer if token.kind == TOK.NUMBER else roman_to_int(token.txt) token = TOK.Ordinal(token.concatenate(next_token), num) # Continue with the following word next_token = next(token_stream) # Convert "1920 mm" or "30 °C" to a single measurement token - if ( - token.kind == TOK.NUMBER or token.kind == TOK.YEAR - ) and next_token.txt in SI_UNITS: - + if (token.kind == TOK.NUMBER or token.kind == TOK.YEAR) and next_token.txt in SI_UNITS: value = token.number orig_unit = next_token.txt unit: str @@ -2363,9 +2287,7 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]: assert isinstance(factor_func, float) value *= factor_func if unit in ("%", "‰"): - token = TOK.Percent( - token.concatenate(next_token, separator=" "), value - ) + token = TOK.Percent(token.concatenate(next_token, separator=" "), value) else: token = TOK.Measurement( token.concatenate(next_token, separator=" "), @@ -2463,11 +2385,7 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]: next_token = next(token_stream) # Cases such as 19 $, 199.99 $ - if ( - token.kind == TOK.NUMBER - and next_token.kind == TOK.PUNCTUATION - and next_token.txt in CURRENCY_SYMBOLS - ): + if token.kind == TOK.NUMBER and next_token.kind == TOK.PUNCTUATION and next_token.txt in CURRENCY_SYMBOLS: token = TOK.Amount( token.concatenate(next_token, separator=" "), CURRENCY_SYMBOLS.get(next_token.txt, ""), @@ -2504,7 +2422,6 @@ def parse_sentences(token_stream: Iterator[Tok]) -> Iterator[Tok]: tok_end_sentence = TOK.End_Sentence() try: - # Maintain a one-token lookahead token = next(token_stream) while True: @@ -2528,9 +2445,7 @@ def parse_sentences(token_stream: Iterator[Tok]) -> Iterator[Tok]: _skip_me.substitute((0, len(_skip_me.txt)), "") token = cast(Tok, None) # 3. 
attach them to the front of the next token - token = _skip_me.concatenate( - next(token_stream), metadata_from_other=True - ) + token = _skip_me.concatenate(next(token_stream), metadata_from_other=True) continue elif token.kind == TOK.X_END: assert not in_sentence @@ -2551,10 +2466,7 @@ def parse_sentences(token_stream: Iterator[Tok]) -> Iterator[Tok]: # This token starts a new sentence yield tok_begin_sentence in_sentence = True - if ( - token.punctuation in PUNCT_INDIRECT_SPEECH - and next_token.punctuation in DQUOTES - ): + if token.punctuation in PUNCT_INDIRECT_SPEECH and next_token.punctuation in DQUOTES: yield token token = next_token next_token = next(token_stream) @@ -2570,15 +2482,10 @@ def parse_sentences(token_stream: Iterator[Tok]) -> Iterator[Tok]: in_sentence = False if token.punctuation in END_OF_SENTENCE and not ( token.punctuation == "…" - and not could_be_end_of_sentence( - next_token - ) # Excluding sentences with ellipsis in the middle + and not could_be_end_of_sentence(next_token) # Excluding sentences with ellipsis in the middle ): # Combining punctuation ('??!!!') - while ( - token.punctuation in PUNCT_COMBINATIONS - and next_token.punctuation in PUNCT_COMBINATIONS - ): + while token.punctuation in PUNCT_COMBINATIONS and next_token.punctuation in PUNCT_COMBINATIONS: # The normalized form comes from the first token except with "…?" v = token.punctuation if v == "…" and next_token.punctuation == "?": @@ -2641,7 +2548,6 @@ def parse_phrases_1(token_stream: Iterator[Tok]) -> Iterator[Tok]: token = cast(Tok, None) try: - # Maintain a one-token lookahead token = next(token_stream) while True: @@ -2672,9 +2578,7 @@ def parse_phrases_1(token_stream: Iterator[Tok]) -> Iterator[Tok]: token = TOK.Year(token.concatenate(next_token), nval) next_token = next(token_stream) # Check for [number | ordinal] [month name] - if ( - token.kind == TOK.ORDINAL or token.kind == TOK.NUMBER - ) and next_token.kind == TOK.WORD: + if (token.kind == TOK.ORDINAL or token.kind == TOK.NUMBER) and next_token.kind == TOK.WORD: if next_token.txt == "gr.": # Corner case: If we have an ordinal followed by # the abbreviation "gr.", we assume that the only @@ -2700,7 +2604,6 @@ def parse_phrases_1(token_stream: Iterator[Tok]) -> Iterator[Tok]: # Check for [date] [year] if token.kind == TOK.DATE and next_token.kind == TOK.YEAR: - dt = cast(DateTimeTuple, token.val) if not dt[0]: # No year yet: add it @@ -2730,11 +2633,7 @@ def parse_phrases_1(token_stream: Iterator[Tok]) -> Iterator[Tok]: # Eat the time token next_token = next(token_stream) - if ( - token.kind == TOK.NUMBER - and next_token.kind == TOK.TELNO - and token.txt in COUNTRY_CODES - ): + if token.kind == TOK.NUMBER and next_token.kind == TOK.TELNO and token.txt in COUNTRY_CODES: # Check for country code in front of telephone number token = TOK.Telno( token.concatenate(next_token, separator=" "), @@ -2760,7 +2659,6 @@ def parse_date_and_time(token_stream: Iterator[Tok]) -> Iterator[Tok]: token = cast(Tok, None) try: - # Maintain a one-token lookahead token = next(token_stream) @@ -2768,9 +2666,7 @@ def parse_date_and_time(token_stream: Iterator[Tok]) -> Iterator[Tok]: next_token = next(token_stream) # DATEABS and DATEREL made # Check for [number | ordinal] [month name] - if ( - token.kind == TOK.ORDINAL or token.kind == TOK.NUMBER - ) and next_token.kind == TOK.WORD: + if (token.kind == TOK.ORDINAL or token.kind == TOK.NUMBER) and next_token.kind == TOK.WORD: month = month_for_token(next_token, True) if month is not None: token = TOK.Date( @@ 
-2783,9 +2679,7 @@ def parse_date_and_time(token_stream: Iterator[Tok]) -> Iterator[Tok]: next_token = next(token_stream) # Check for [DATE] [year] - if token.kind == TOK.DATE and ( - next_token.kind == TOK.NUMBER or next_token.kind == TOK.YEAR - ): + if token.kind == TOK.DATE and (next_token.kind == TOK.NUMBER or next_token.kind == TOK.YEAR): dt = cast(DateTimeTuple, token.val) if not dt[0]: # No year yet: add it @@ -2805,9 +2699,7 @@ def parse_date_and_time(token_stream: Iterator[Tok]) -> Iterator[Tok]: next_token = next(token_stream) # Check for [month name] [year|YEAR] - if token.kind == TOK.WORD and ( - next_token.kind == TOK.NUMBER or next_token.kind == TOK.YEAR - ): + if token.kind == TOK.WORD and (next_token.kind == TOK.NUMBER or next_token.kind == TOK.YEAR): month = month_for_token(token) if month is not None: year = next_token.integer @@ -2913,28 +2805,22 @@ def parse_date_and_time(token_stream: Iterator[Tok]) -> Iterator[Tok]: yield token -def parse_phrases_2( - token_stream: Iterator[Tok], coalesce_percent: bool = False -) -> Iterator[Tok]: +def parse_phrases_2(token_stream: Iterator[Tok], coalesce_percent: bool = False) -> Iterator[Tok]: """Handle numbers, amounts and composite words.""" token = cast(Tok, None) try: - # Maintain a one-token lookahead token = next(token_stream) while True: - next_token = next(token_stream) # Logic for numbers and fractions that are partially or entirely # written out in words # Check for [CURRENCY] [number] (e.g. kr. 9.900 or USD 50) - if next_token.kind == TOK.NUMBER and ( - token.txt in ISK_AMOUNT_PRECEDING or token.txt in CURRENCY_ABBREV - ): + if next_token.kind == TOK.NUMBER and (token.txt in ISK_AMOUNT_PRECEDING or token.txt in CURRENCY_ABBREV): curr = "ISK" if token.txt in ISK_AMOUNT_PRECEDING else token.txt token = TOK.Amount( token.concatenate(next_token, separator=" "), @@ -2945,7 +2831,6 @@ def parse_phrases_2( # Check for [number] [ISK_AMOUNT|CURRENCY|PERCENTAGE] elif token.kind == TOK.NUMBER and next_token.kind == TOK.WORD: - if next_token.txt in AMOUNT_ABBREV: # Abbreviations for ISK amounts # For abbreviations, we do not know the case, @@ -3020,9 +2905,7 @@ def parse_phrases_2( # part of the composition, so it can be an unknown word. _acc = tq[0] for t in tq[1:] + [token, next_token]: - _acc = _acc.concatenate( - t, separator=" ", metadata_from_other=True - ) + _acc = _acc.concatenate(t, separator=" ", metadata_from_other=True) _acc.substitute_all(" -", "-") _acc.substitute_all(" ,", ",") token = _acc @@ -3068,17 +2951,13 @@ def tokenize(text_or_gen: Union[str, Iterable[str]], **options: Any) -> Iterator return (t for t in token_stream if t.kind != TOK.X_END) -def tokenize_without_annotation( - text_or_gen: Union[str, Iterable[str]], **options: Any -) -> Iterator[Tok]: +def tokenize_without_annotation(text_or_gen: Union[str, Iterable[str]], **options: Any) -> Iterator[Tok]: """Tokenize without the last pass which can be done more thoroughly if BÍN annotation is available, for instance in GreynirPackage.""" return tokenize(text_or_gen, with_annotation=False, **options) -def split_into_sentences( - text_or_gen: Union[str, Iterable[str]], **options: Any -) -> Iterator[str]: +def split_into_sentences(text_or_gen: Union[str, Iterable[str]], **options: Any) -> Iterator[str]: """Shallow tokenization of the input text, which can be either a text string or a generator of lines of text (such as a file). 
This function returns a generator of strings, where each string @@ -3122,7 +3001,6 @@ def mark_paragraphs(txt: str) -> str: def paragraphs(tokens: Iterable[Tok]) -> Iterator[List[Tuple[int, List[Tok]]]]: - """Generator yielding paragraphs from token iterable. Each paragraph is a list of sentence tuples. Sentence tuples consist of the index of the first token of the sentence (the TOK.S_BEGIN token) and a list of the tokens within the @@ -3170,7 +3048,7 @@ def valid_sent(sent: Optional[List[Tok]]) -> bool: RE_SPLIT_STR = ( - # The following regex catches Icelandic numbers with dots and a comma + # The following regex catches Icelandic numbers with dots r"([\+\-\$€]?\d{1,3}(?:\.\d\d\d)+\,\d+)" # +123.456,789 # The following regex catches English numbers with commas and a dot r"|([\+\-\$€]?\d{1,3}(?:\,\d\d\d)+\.\d+)" # +123,456.789 @@ -3178,6 +3056,8 @@ def valid_sent(sent: Optional[List[Tok]]) -> bool: r"|([\+\-\$€]?\d+\,\d+(?!\.\d))" # -1234,56 # The following regex catches English numbers with a dot only r"|([\+\-\$€]?\d+\.\d+(?!\,\d))" # -1234.56 + # The following regex catches Icelandic abbreviations, e.g. a.m.k., A.M.K., þ.e.a.s. + r"|([a-záðéíóúýþæöA-ZÁÐÉÍÓÚÝÞÆÖ]+\.(?:[a-záðéíóúýþæöA-ZÁÐÉÍÓÚÝÞÆÖ]+\.)+)" # Finally, space and punctuation r"|([~\s" + "".join("\\" + c for c in PUNCTUATION) @@ -3217,12 +3097,8 @@ def correct_spaces(s: str) -> str: this = TP_CENTER else: this = TP_WORD - if ( - (w == "og" or w == "eða") - and len(r) >= 2 - and r[-1] == "-" - and r[-2].lstrip().isalpha() - ): + # print("this: ", this) + if (w == "og" or w == "eða") and len(r) >= 2 and r[-1] == "-" and r[-2].lstrip().isalpha(): # Special case for compounds such as "fjármála- og efnahagsráðuneytið" # and "Iðnaðar-, ferðamála- og atvinnuráðuneytið": # detach the hyphen from "og"/"eða" @@ -3283,9 +3159,7 @@ def detokenize(tokens: Iterable[Tok], normalize: bool = False) -> str: return "".join(r) -def calculate_indexes( - tokens: Iterable[Tok], last_is_end: bool = False -) -> Tuple[List[int], List[int]]: +def calculate_indexes(tokens: Iterable[Tok], last_is_end: bool = False) -> Tuple[List[int], List[int]]: """Calculate character and byte indexes for a token stream. The indexes are the start positions of each token in the original text that was tokenized. @@ -3307,9 +3181,7 @@ def byte_len(string: str) -> int: if t.txt: # Origin tracking failed for this token. # TODO: Can we do something better here? Or guarantee that it doesn't happen? - raise ValueError( - f"Origin tracking failed at {t.txt} near index {char_indexes[-1]}" - ) + raise ValueError(f"Origin tracking failed at {t.txt} near index {char_indexes[-1]}") else: # This is some marker token that has no text pass From 15d54adc5e4251f791962a5afb5ba57a232bd4c2 Mon Sep 17 00:00:00 2001 From: thorunna Date: Thu, 2 Nov 2023 14:23:14 +0000 Subject: [PATCH 2/9] reversed formatting --- src/tokenizer/tokenizer.py | 232 ++++++++++++++++++++++++++++--------- 1 file changed, 180 insertions(+), 52 deletions(-) diff --git a/src/tokenizer/tokenizer.py b/src/tokenizer/tokenizer.py index 73f5fa1..8ec12ad 100644 --- a/src/tokenizer/tokenizer.py +++ b/src/tokenizer/tokenizer.py @@ -232,7 +232,9 @@ def substitute(self, span: Tuple[int, int], new: str) -> None: self.txt = self.txt[: span[0]] + new + self.txt[span[1] :] if self.origin_spans is not None: # Remove origin entries that correspond to characters that are gone. 
- self.origin_spans = self.origin_spans[: span[0] + len(new)] + self.origin_spans[span[1] :] + self.origin_spans = ( + self.origin_spans[: span[0] + len(new)] + self.origin_spans[span[1] :] + ) def substitute_longer(self, span: Tuple[int, int], new: str) -> None: """Substitute a span with a potentially longer string""" @@ -310,9 +312,13 @@ def concatenate( self_origin_spans = self.origin_spans or [] other_origin_spans = other.origin_spans or [] - separator_origin_spans: List[int] = [len(self_original)] * len(separator) if len(other_origin_spans) > 0 else [] + separator_origin_spans: List[int] = ( + [len(self_original)] * len(separator) if len(other_origin_spans) > 0 else [] + ) new_origin_spans = ( - self_origin_spans + separator_origin_spans + [i + len(self_original) for i in other_origin_spans] + self_origin_spans + + separator_origin_spans + + [i + len(self_original) for i in other_origin_spans] ) return Tok(new_kind, new_txt, new_val, new_original, new_origin_spans) @@ -337,7 +343,9 @@ def __getitem__(self, i: int) -> Union[int, str, ValType]: def equal(self, other: "Tok") -> bool: """Equality of content between two tokens, i.e. ignoring the 'original' and 'origin_spans' attributes""" - return self.kind == other.kind and self.txt == other.txt and self.val == other.val + return ( + self.kind == other.kind and self.txt == other.txt and self.val == other.val + ) def __eq__(self, o: Any) -> bool: """Full equality between two Tok instances""" @@ -579,7 +587,9 @@ def Daterel(t: Union[Tok, str], y: int, m: int, d: int) -> Tok: return t @staticmethod - def Timestamp(t: Union[Tok, str], y: int, mo: int, d: int, h: int, m: int, s: int) -> Tok: + def Timestamp( + t: Union[Tok, str], y: int, mo: int, d: int, h: int, m: int, s: int + ) -> Tok: if isinstance(t, str): return Tok(TOK.TIMESTAMP, t, (y, mo, d, h, m, s)) t.kind = TOK.TIMESTAMP @@ -587,7 +597,9 @@ def Timestamp(t: Union[Tok, str], y: int, mo: int, d: int, h: int, m: int, s: in return t @staticmethod - def Timestampabs(t: Union[Tok, str], y: int, mo: int, d: int, h: int, m: int, s: int) -> Tok: + def Timestampabs( + t: Union[Tok, str], y: int, mo: int, d: int, h: int, m: int, s: int + ) -> Tok: if isinstance(t, str): return Tok(TOK.TIMESTAMPABS, t, (y, mo, d, h, m, s)) t.kind = TOK.TIMESTAMPABS @@ -595,7 +607,9 @@ def Timestampabs(t: Union[Tok, str], y: int, mo: int, d: int, h: int, m: int, s: return t @staticmethod - def Timestamprel(t: Union[Tok, str], y: int, mo: int, d: int, h: int, m: int, s: int) -> Tok: + def Timestamprel( + t: Union[Tok, str], y: int, mo: int, d: int, h: int, m: int, s: int + ) -> Tok: if isinstance(t, str): return Tok(TOK.TIMESTAMPREL, t, (y, mo, d, h, m, s)) t.kind = TOK.TIMESTAMPREL @@ -953,7 +967,11 @@ def could_be_end_of_sentence(self, i: int = 0, *args: Any) -> bool: def normalized_text(token: Tok) -> str: """Returns token text after normalizing punctuation""" - return cast(Tuple[int, str], token.val)[1] if token.kind == TOK.PUNCTUATION else token.txt + return ( + cast(Tuple[int, str], token.val)[1] + if token.kind == TOK.PUNCTUATION + else token.txt + ) def text_from_tokens(tokens: Iterable[Tok]) -> str: @@ -1216,7 +1234,11 @@ def parse_digits(tok: Tok, convert_numbers: bool) -> Tuple[Tok, Tok]: p = g.split("/") m = int(p[1]) d = int(p[0]) - if p[0][0] != "0" and p[1][0] != "0" and ((d <= 5 and m <= 6) or (d == 1 and m <= 10)): + if ( + p[0][0] != "0" + and p[1][0] != "0" + and ((d <= 5 and m <= 6) or (d == 1 and m <= 10)) + ): # This is probably a fraction, not a date # (1/2, 1/3, 1/4, 1/5, 1/6, 2/3, 2/5, 
5/6 etc.) # Return a number @@ -1337,7 +1359,9 @@ def unicode_replacement(token: Tok) -> Tok: total_reduction = 0 for m in UNICODE_REGEX.finditer(token.txt): span, new_letter = m.span(), UNICODE_REPLACEMENTS[m.group(0)] - token.substitute((span[0] - total_reduction, span[1] - total_reduction), new_letter) + token.substitute( + (span[0] - total_reduction, span[1] - total_reduction), new_letter + ) total_reduction += span[1] - span[0] - len(new_letter) return token @@ -1347,7 +1371,9 @@ def html_replacement(token: Tok) -> Tok: total_reduction = 0 for m in HTML_ESCAPE_REGEX.finditer(token.txt): span, new_letter = html_escape(m) - token.substitute((span[0] - total_reduction, span[1] - total_reduction), new_letter) + token.substitute( + (span[0] - total_reduction, span[1] - total_reduction), new_letter + ) total_reduction += span[1] - span[0] - len(new_letter) return token @@ -1389,8 +1415,12 @@ def shift_span(span: Tuple[int, int], pos: int): assert match is not None # Since the match indexes the text of the original token, # we need to shift the indices so that they match the current token. - shifted_all_group_span = shift_span(match.span(ROUGH_TOKEN_REGEX_ENTIRE_MATCH), -pos) - shifted_white_space_span = shift_span(match.span(ROUGH_TOKEN_REGEX_WHITE_SPACE_GROUP), -pos) + shifted_all_group_span = shift_span( + match.span(ROUGH_TOKEN_REGEX_ENTIRE_MATCH), -pos + ) + shifted_white_space_span = shift_span( + match.span(ROUGH_TOKEN_REGEX_WHITE_SPACE_GROUP), -pos + ) # Then we split the current token using the shifted spans small_tok, tok = tok.split(shifted_all_group_span[SPAN_END]) # Remove whitespace characters from the start of the token @@ -1423,6 +1453,7 @@ def generate_raw_tokens( big_text: str for big_text in text_or_gen: + if not one_sent_per_line and not big_text: # An explicit empty string in the input always # causes a sentence split @@ -1561,7 +1592,12 @@ def parse(self) -> Iterable[Tok]: lw = len(rt.txt) i = 1 while i < lw and ( - rt.txt[i].isalpha() or (rt.txt[i] in PUNCT_INSIDE_WORD and i + 1 < lw and rt.txt[i + 1].isalpha()) + rt.txt[i].isalpha() + or ( + rt.txt[i] in PUNCT_INSIDE_WORD + and i + 1 < lw + and rt.txt[i + 1].isalpha() + ) ): # We allow dots to occur inside words in the case of # abbreviations; also apostrophes are allowed within @@ -1631,7 +1667,9 @@ class NumberParser: """Parses a sequence of digits off the front of a raw token""" - def __init__(self, rt: Tok, handle_kludgy_ordinals: int, convert_numbers: bool) -> None: + def __init__( + self, rt: Tok, handle_kludgy_ordinals: int, convert_numbers: bool + ) -> None: self.rt = rt self.handle_kludgy_ordinals = handle_kludgy_ordinals self.convert_numbers = convert_numbers @@ -1652,7 +1690,10 @@ def parse(self) -> Iterable[Tok]: # '1sti' -> 'fyrsti', '3ji' -> 'þriðji', etc. key_tok.substitute_longer((0, len(key)), val) yield TOK.Word(key_tok) - elif handle_kludgy_ordinals == KLUDGY_ORDINALS_TRANSLATE and key in ORDINAL_NUMBERS: + elif ( + handle_kludgy_ordinals == KLUDGY_ORDINALS_TRANSLATE + and key in ORDINAL_NUMBERS + ): # Convert word-form ordinals into ordinal tokens, # i.e. 
'1sti' -> TOK.Ordinal('1sti', 1), # but leave other kludgy constructs ('2ja') @@ -1780,8 +1821,9 @@ def parse(self, rt: Tok) -> Iterable[Tok]: self.rt = rt self.ate = ate - -def parse_mixed(rt: Tok, handle_kludgy_ordinals: int, convert_numbers: bool) -> Iterable[Tok]: +def parse_mixed( + rt: Tok, handle_kludgy_ordinals: int, convert_numbers: bool +) -> Iterable[Tok]: """Parse a mixed raw token string, from the token rt""" # Initialize a singleton parser for punctuation @@ -1866,7 +1908,8 @@ def parse_mixed(rt: Tok, handle_kludgy_ordinals: int, convert_numbers: bool) -> # Numbers or other stuff starting with a digit # (eventually prefixed by a '+' or '-') if rtxt and ( - rtxt[0] in DIGITS_PREFIX or (rtxt[0] in SIGN_PREFIX and len(rtxt) >= 2 and rtxt[1] in DIGITS_PREFIX) + rtxt[0] in DIGITS_PREFIX + or (rtxt[0] in SIGN_PREFIX and len(rtxt) >= 2 and rtxt[1] in DIGITS_PREFIX) ): np = NumberParser(rt, handle_kludgy_ordinals, convert_numbers) yield from np.parse() @@ -1932,7 +1975,9 @@ def parse_tokens(txt: Union[str, Iterable[str]], **options: Any) -> Iterator[Tok # The default behavior for kludgy ordinals is to pass them # through as word tokens - handle_kludgy_ordinals: int = options.get("handle_kludgy_ordinals", KLUDGY_ORDINALS_PASS_THROUGH) + handle_kludgy_ordinals: int = options.get( + "handle_kludgy_ordinals", KLUDGY_ORDINALS_PASS_THROUGH + ) # This code proceeds roughly as follows: # 1) The text is split into raw tokens on whitespace boundaries. @@ -1955,7 +2000,9 @@ def parse_tokens(txt: Union[str, Iterable[str]], **options: Any) -> Iterator[Tok rtxt: str = "" - for rt in generate_raw_tokens(txt, replace_composite_glyphs, replace_html_escapes, one_sent_per_line): + for rt in generate_raw_tokens( + txt, replace_composite_glyphs, replace_html_escapes, one_sent_per_line + ): # rt: raw token if rt.kind in {TOK.S_SPLIT, TOK.P_BEGIN, TOK.P_END}: @@ -2085,13 +2132,18 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]: and (next_token.kind == TOK.NUMBER or next_token.kind == TOK.YEAR) ): currabbr = CURRENCY_SYMBOLS[token.txt] - token = TOK.Amount(token.concatenate(next_token), currabbr, next_token.number) + token = TOK.Amount( + token.concatenate(next_token), currabbr, next_token.number + ) next_token = next(token_stream) # Special case for a DATEREL token of the form "25.10.", # i.e. with a trailing period: It can end a sentence if token.kind == TOK.DATEREL and "." in token.txt: - if next_token.txt == "." and not token_stream.could_be_end_of_sentence(): + if ( + next_token.txt == "." + and not token_stream.could_be_end_of_sentence() + ): # This is something like 'Ég fæddist 25.9. í Svarfaðardal.' y, m, d = cast(Tuple[int, int, int], token.val) token = TOK.Daterel(token.concatenate(next_token), y, m, d) @@ -2100,7 +2152,11 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]: # Coalesce abbreviations ending with a period into a single # abbreviation token if next_token.punctuation == ".": - if token.kind == TOK.WORD and token.txt[-1] != "." and is_abbr_with_period(token.txt): + if ( + token.kind == TOK.WORD + and token.txt[-1] != "." + and is_abbr_with_period(token.txt) + ): # Abbreviation ending with period: make a special token for it # and advance the input stream follow_token = next(token_stream) @@ -2123,7 +2179,9 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]: # þær þarf að vera hægt að sameina í þessa flóknari tóka en viljum # geta merkt það sem villu. Ætti líklega að setja í sérlista, # WRONG_MONTHS, og sérif-lykkju og setja inn villu í tókann. 
- finish = could_be_end_of_sentence(follow_token, test_set, abbrev in NUMBER_ABBREV) + finish = could_be_end_of_sentence( + follow_token, test_set, abbrev in NUMBER_ABBREV + ) if finish: # Potentially at the end of a sentence if abbrev in Abbreviations.FINISHERS: @@ -2135,7 +2193,10 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]: yield token # Set token to the period token = next_token - elif abbrev in Abbreviations.NOT_FINISHERS or abbrev.lower() in Abbreviations.NOT_FINISHERS: + elif ( + abbrev in Abbreviations.NOT_FINISHERS + or abbrev.lower() in Abbreviations.NOT_FINISHERS + ): # This is a potential abbreviation that we don't interpret # as such if it's at the end of a sentence # ('dags.', 'próf.', 'mín.'). Note that this also @@ -2144,7 +2205,9 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]: token = next_token else: # Substitute the abbreviation and eat the period - token = TOK.Word(token.concatenate(next_token), lookup(abbrev)) + token = TOK.Word( + token.concatenate(next_token), lookup(abbrev) + ) else: # 'Regular' abbreviation in the middle of a sentence: # Eat the period and yield the abbreviation as a single token @@ -2180,7 +2243,9 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]: next_token = next(token_stream) # Coalesce 'klukkan/kl. átta/hálfátta' into a time - elif next_token.kind == TOK.WORD and next_token.txt.lower() in CLOCK_NUMBERS: + elif ( + next_token.kind == TOK.WORD and next_token.txt.lower() in CLOCK_NUMBERS + ): if token.kind == TOK.WORD and token.txt.lower() in CLOCK_ABBREVS: # Match: coalesce and step to next token next_txt = next_token.txt.lower() @@ -2242,7 +2307,9 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]: sign = next_token.txt # Store promille as one-tenth of a percentage factor = 1.0 if sign == "%" else 0.1 - token = TOK.Percent(token.concatenate(next_token), token.number * factor) + token = TOK.Percent( + token.concatenate(next_token), token.number * factor + ) next_token = next(token_stream) # Coalesce ordinals (1. = first, 2. = second...) 
into a single token @@ -2268,13 +2335,20 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]: ): # OK: replace the number/Roman numeral and the period # with an ordinal token - num = token.integer if token.kind == TOK.NUMBER else roman_to_int(token.txt) + num = ( + token.integer + if token.kind == TOK.NUMBER + else roman_to_int(token.txt) + ) token = TOK.Ordinal(token.concatenate(next_token), num) # Continue with the following word next_token = next(token_stream) # Convert "1920 mm" or "30 °C" to a single measurement token - if (token.kind == TOK.NUMBER or token.kind == TOK.YEAR) and next_token.txt in SI_UNITS: + if ( + token.kind == TOK.NUMBER or token.kind == TOK.YEAR + ) and next_token.txt in SI_UNITS: + value = token.number orig_unit = next_token.txt unit: str @@ -2287,7 +2361,9 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]: assert isinstance(factor_func, float) value *= factor_func if unit in ("%", "‰"): - token = TOK.Percent(token.concatenate(next_token, separator=" "), value) + token = TOK.Percent( + token.concatenate(next_token, separator=" "), value + ) else: token = TOK.Measurement( token.concatenate(next_token, separator=" "), @@ -2385,7 +2461,11 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]: next_token = next(token_stream) # Cases such as 19 $, 199.99 $ - if token.kind == TOK.NUMBER and next_token.kind == TOK.PUNCTUATION and next_token.txt in CURRENCY_SYMBOLS: + if ( + token.kind == TOK.NUMBER + and next_token.kind == TOK.PUNCTUATION + and next_token.txt in CURRENCY_SYMBOLS + ): token = TOK.Amount( token.concatenate(next_token, separator=" "), CURRENCY_SYMBOLS.get(next_token.txt, ""), @@ -2422,6 +2502,7 @@ def parse_sentences(token_stream: Iterator[Tok]) -> Iterator[Tok]: tok_end_sentence = TOK.End_Sentence() try: + # Maintain a one-token lookahead token = next(token_stream) while True: @@ -2445,7 +2526,9 @@ def parse_sentences(token_stream: Iterator[Tok]) -> Iterator[Tok]: _skip_me.substitute((0, len(_skip_me.txt)), "") token = cast(Tok, None) # 3. attach them to the front of the next token - token = _skip_me.concatenate(next(token_stream), metadata_from_other=True) + token = _skip_me.concatenate( + next(token_stream), metadata_from_other=True + ) continue elif token.kind == TOK.X_END: assert not in_sentence @@ -2466,7 +2549,10 @@ def parse_sentences(token_stream: Iterator[Tok]) -> Iterator[Tok]: # This token starts a new sentence yield tok_begin_sentence in_sentence = True - if token.punctuation in PUNCT_INDIRECT_SPEECH and next_token.punctuation in DQUOTES: + if ( + token.punctuation in PUNCT_INDIRECT_SPEECH + and next_token.punctuation in DQUOTES + ): yield token token = next_token next_token = next(token_stream) @@ -2482,10 +2568,15 @@ def parse_sentences(token_stream: Iterator[Tok]) -> Iterator[Tok]: in_sentence = False if token.punctuation in END_OF_SENTENCE and not ( token.punctuation == "…" - and not could_be_end_of_sentence(next_token) # Excluding sentences with ellipsis in the middle + and not could_be_end_of_sentence( + next_token + ) # Excluding sentences with ellipsis in the middle ): # Combining punctuation ('??!!!') - while token.punctuation in PUNCT_COMBINATIONS and next_token.punctuation in PUNCT_COMBINATIONS: + while ( + token.punctuation in PUNCT_COMBINATIONS + and next_token.punctuation in PUNCT_COMBINATIONS + ): # The normalized form comes from the first token except with "…?" 
v = token.punctuation if v == "…" and next_token.punctuation == "?": @@ -2548,6 +2639,7 @@ def parse_phrases_1(token_stream: Iterator[Tok]) -> Iterator[Tok]: token = cast(Tok, None) try: + # Maintain a one-token lookahead token = next(token_stream) while True: @@ -2578,7 +2670,9 @@ def parse_phrases_1(token_stream: Iterator[Tok]) -> Iterator[Tok]: token = TOK.Year(token.concatenate(next_token), nval) next_token = next(token_stream) # Check for [number | ordinal] [month name] - if (token.kind == TOK.ORDINAL or token.kind == TOK.NUMBER) and next_token.kind == TOK.WORD: + if ( + token.kind == TOK.ORDINAL or token.kind == TOK.NUMBER + ) and next_token.kind == TOK.WORD: if next_token.txt == "gr.": # Corner case: If we have an ordinal followed by # the abbreviation "gr.", we assume that the only @@ -2604,6 +2698,7 @@ def parse_phrases_1(token_stream: Iterator[Tok]) -> Iterator[Tok]: # Check for [date] [year] if token.kind == TOK.DATE and next_token.kind == TOK.YEAR: + dt = cast(DateTimeTuple, token.val) if not dt[0]: # No year yet: add it @@ -2633,7 +2728,11 @@ def parse_phrases_1(token_stream: Iterator[Tok]) -> Iterator[Tok]: # Eat the time token next_token = next(token_stream) - if token.kind == TOK.NUMBER and next_token.kind == TOK.TELNO and token.txt in COUNTRY_CODES: + if ( + token.kind == TOK.NUMBER + and next_token.kind == TOK.TELNO + and token.txt in COUNTRY_CODES + ): # Check for country code in front of telephone number token = TOK.Telno( token.concatenate(next_token, separator=" "), @@ -2659,6 +2758,7 @@ def parse_date_and_time(token_stream: Iterator[Tok]) -> Iterator[Tok]: token = cast(Tok, None) try: + # Maintain a one-token lookahead token = next(token_stream) @@ -2666,7 +2766,9 @@ def parse_date_and_time(token_stream: Iterator[Tok]) -> Iterator[Tok]: next_token = next(token_stream) # DATEABS and DATEREL made # Check for [number | ordinal] [month name] - if (token.kind == TOK.ORDINAL or token.kind == TOK.NUMBER) and next_token.kind == TOK.WORD: + if ( + token.kind == TOK.ORDINAL or token.kind == TOK.NUMBER + ) and next_token.kind == TOK.WORD: month = month_for_token(next_token, True) if month is not None: token = TOK.Date( @@ -2679,7 +2781,9 @@ def parse_date_and_time(token_stream: Iterator[Tok]) -> Iterator[Tok]: next_token = next(token_stream) # Check for [DATE] [year] - if token.kind == TOK.DATE and (next_token.kind == TOK.NUMBER or next_token.kind == TOK.YEAR): + if token.kind == TOK.DATE and ( + next_token.kind == TOK.NUMBER or next_token.kind == TOK.YEAR + ): dt = cast(DateTimeTuple, token.val) if not dt[0]: # No year yet: add it @@ -2699,7 +2803,9 @@ def parse_date_and_time(token_stream: Iterator[Tok]) -> Iterator[Tok]: next_token = next(token_stream) # Check for [month name] [year|YEAR] - if token.kind == TOK.WORD and (next_token.kind == TOK.NUMBER or next_token.kind == TOK.YEAR): + if token.kind == TOK.WORD and ( + next_token.kind == TOK.NUMBER or next_token.kind == TOK.YEAR + ): month = month_for_token(token) if month is not None: year = next_token.integer @@ -2805,22 +2911,28 @@ def parse_date_and_time(token_stream: Iterator[Tok]) -> Iterator[Tok]: yield token -def parse_phrases_2(token_stream: Iterator[Tok], coalesce_percent: bool = False) -> Iterator[Tok]: +def parse_phrases_2( + token_stream: Iterator[Tok], coalesce_percent: bool = False +) -> Iterator[Tok]: """Handle numbers, amounts and composite words.""" token = cast(Tok, None) try: + # Maintain a one-token lookahead token = next(token_stream) while True: + next_token = next(token_stream) # Logic for numbers and 
fractions that are partially or entirely # written out in words # Check for [CURRENCY] [number] (e.g. kr. 9.900 or USD 50) - if next_token.kind == TOK.NUMBER and (token.txt in ISK_AMOUNT_PRECEDING or token.txt in CURRENCY_ABBREV): + if next_token.kind == TOK.NUMBER and ( + token.txt in ISK_AMOUNT_PRECEDING or token.txt in CURRENCY_ABBREV + ): curr = "ISK" if token.txt in ISK_AMOUNT_PRECEDING else token.txt token = TOK.Amount( token.concatenate(next_token, separator=" "), @@ -2831,6 +2943,7 @@ def parse_phrases_2(token_stream: Iterator[Tok], coalesce_percent: bool = False) # Check for [number] [ISK_AMOUNT|CURRENCY|PERCENTAGE] elif token.kind == TOK.NUMBER and next_token.kind == TOK.WORD: + if next_token.txt in AMOUNT_ABBREV: # Abbreviations for ISK amounts # For abbreviations, we do not know the case, @@ -2905,7 +3018,9 @@ def parse_phrases_2(token_stream: Iterator[Tok], coalesce_percent: bool = False) # part of the composition, so it can be an unknown word. _acc = tq[0] for t in tq[1:] + [token, next_token]: - _acc = _acc.concatenate(t, separator=" ", metadata_from_other=True) + _acc = _acc.concatenate( + t, separator=" ", metadata_from_other=True + ) _acc.substitute_all(" -", "-") _acc.substitute_all(" ,", ",") token = _acc @@ -2951,13 +3066,17 @@ def tokenize(text_or_gen: Union[str, Iterable[str]], **options: Any) -> Iterator return (t for t in token_stream if t.kind != TOK.X_END) -def tokenize_without_annotation(text_or_gen: Union[str, Iterable[str]], **options: Any) -> Iterator[Tok]: +def tokenize_without_annotation( + text_or_gen: Union[str, Iterable[str]], **options: Any +) -> Iterator[Tok]: """Tokenize without the last pass which can be done more thoroughly if BÍN annotation is available, for instance in GreynirPackage.""" return tokenize(text_or_gen, with_annotation=False, **options) -def split_into_sentences(text_or_gen: Union[str, Iterable[str]], **options: Any) -> Iterator[str]: +def split_into_sentences( + text_or_gen: Union[str, Iterable[str]], **options: Any +) -> Iterator[str]: """Shallow tokenization of the input text, which can be either a text string or a generator of lines of text (such as a file). This function returns a generator of strings, where each string @@ -3001,6 +3120,7 @@ def mark_paragraphs(txt: str) -> str: def paragraphs(tokens: Iterable[Tok]) -> Iterator[List[Tuple[int, List[Tok]]]]: + """Generator yielding paragraphs from token iterable. Each paragraph is a list of sentence tuples. 
Sentence tuples consist of the index of the first token of the sentence (the TOK.S_BEGIN token) and a list of the tokens within the @@ -3048,7 +3168,7 @@ def valid_sent(sent: Optional[List[Tok]]) -> bool: RE_SPLIT_STR = ( - # The following regex catches Icelandic numbers with dots + # The following regex catches Icelandic numbers with dots and a comma r"([\+\-\$€]?\d{1,3}(?:\.\d\d\d)+\,\d+)" # +123.456,789 # The following regex catches English numbers with commas and a dot r"|([\+\-\$€]?\d{1,3}(?:\,\d\d\d)+\.\d+)" # +123,456.789 @@ -3097,8 +3217,12 @@ def correct_spaces(s: str) -> str: this = TP_CENTER else: this = TP_WORD - # print("this: ", this) - if (w == "og" or w == "eða") and len(r) >= 2 and r[-1] == "-" and r[-2].lstrip().isalpha(): + if ( + (w == "og" or w == "eða") + and len(r) >= 2 + and r[-1] == "-" + and r[-2].lstrip().isalpha() + ): # Special case for compounds such as "fjármála- og efnahagsráðuneytið" # and "Iðnaðar-, ferðamála- og atvinnuráðuneytið": # detach the hyphen from "og"/"eða" @@ -3159,7 +3283,9 @@ def detokenize(tokens: Iterable[Tok], normalize: bool = False) -> str: return "".join(r) -def calculate_indexes(tokens: Iterable[Tok], last_is_end: bool = False) -> Tuple[List[int], List[int]]: +def calculate_indexes( + tokens: Iterable[Tok], last_is_end: bool = False +) -> Tuple[List[int], List[int]]: """Calculate character and byte indexes for a token stream. The indexes are the start positions of each token in the original text that was tokenized. @@ -3181,7 +3307,9 @@ def byte_len(string: str) -> int: if t.txt: # Origin tracking failed for this token. # TODO: Can we do something better here? Or guarantee that it doesn't happen? - raise ValueError(f"Origin tracking failed at {t.txt} near index {char_indexes[-1]}") + raise ValueError( + f"Origin tracking failed at {t.txt} near index {char_indexes[-1]}" + ) else: # This is some marker token that has no text pass From 7098dd3764621007bef788a31549eadccce3ae2d Mon Sep 17 00:00:00 2001 From: thorunna Date: Thu, 2 Nov 2023 14:27:19 +0000 Subject: [PATCH 3/9] added test cases for abbreviations --- test/test_tokenizer.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test/test_tokenizer.py b/test/test_tokenizer.py index dbf9822..6802a97 100755 --- a/test/test_tokenizer.py +++ b/test/test_tokenizer.py @@ -1132,6 +1132,12 @@ def test_correct_spaces() -> None: assert s == "Jón-sem var formaður—mótmælti málinu." s = t.correct_spaces("Það á að geyma mjólkina við 20 ± 3 °C") assert s == "Það á að geyma mjólkina við 20±3° C" + s = t.correct_spaces("Við förum t.d. til Íslands o.s.frv.") + assert s == "Við förum t.d. til Íslands o.s.frv." + s = t.correct_spaces("M.a. lögum við bil.") + assert s == "M.a. lögum við bil." + s = t.correct_spaces("HANN BORÐAR Þ.Á.M. EPLI.") + assert s == "HANN BORÐAR Þ.Á.M. EPLI." 
def test_abbrev() -> None: From 7696a2785804125803be1430ff9822e2f9b823f5 Mon Sep 17 00:00:00 2001 From: thorunna Date: Thu, 2 Nov 2023 15:54:21 +0000 Subject: [PATCH 4/9] improved handling for abbreviations and degree symbols --- src/tokenizer/tokenizer.py | 48 +++++++++++++++++--------------------- test/test_tokenizer.py | 16 +++++++++---- 2 files changed, 33 insertions(+), 31 deletions(-) diff --git a/src/tokenizer/tokenizer.py b/src/tokenizer/tokenizer.py index 8ec12ad..d2c2401 100644 --- a/src/tokenizer/tokenizer.py +++ b/src/tokenizer/tokenizer.py @@ -58,6 +58,7 @@ import datetime import re +import regex import unicodedata # type: ignore from collections import deque @@ -316,8 +317,8 @@ def concatenate( [len(self_original)] * len(separator) if len(other_origin_spans) > 0 else [] ) new_origin_spans = ( - self_origin_spans - + separator_origin_spans + self_origin_spans + + separator_origin_spans + [i + len(self_original) for i in other_origin_spans] ) @@ -1453,7 +1454,6 @@ def generate_raw_tokens( big_text: str for big_text in text_or_gen: - if not one_sent_per_line and not big_text: # An explicit empty string in the input always # causes a sentence split @@ -1821,6 +1821,7 @@ def parse(self, rt: Tok) -> Iterable[Tok]: self.rt = rt self.ate = ate + def parse_mixed( rt: Tok, handle_kludgy_ordinals: int, convert_numbers: bool ) -> Iterable[Tok]: @@ -2141,7 +2142,7 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]: # i.e. with a trailing period: It can end a sentence if token.kind == TOK.DATEREL and "." in token.txt: if ( - next_token.txt == "." + next_token.txt == "." and not token_stream.could_be_end_of_sentence() ): # This is something like 'Ég fæddist 25.9. í Svarfaðardal.' @@ -2153,8 +2154,8 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]: # abbreviation token if next_token.punctuation == ".": if ( - token.kind == TOK.WORD - and token.txt[-1] != "." + token.kind == TOK.WORD + and token.txt[-1] != "." 
and is_abbr_with_period(token.txt) ): # Abbreviation ending with period: make a special token for it @@ -2194,7 +2195,7 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]: # Set token to the period token = next_token elif ( - abbrev in Abbreviations.NOT_FINISHERS + abbrev in Abbreviations.NOT_FINISHERS or abbrev.lower() in Abbreviations.NOT_FINISHERS ): # This is a potential abbreviation that we don't interpret @@ -2336,8 +2337,8 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]: # OK: replace the number/Roman numeral and the period # with an ordinal token num = ( - token.integer - if token.kind == TOK.NUMBER + token.integer + if token.kind == TOK.NUMBER else roman_to_int(token.txt) ) token = TOK.Ordinal(token.concatenate(next_token), num) @@ -2348,7 +2349,6 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]: if ( token.kind == TOK.NUMBER or token.kind == TOK.YEAR ) and next_token.txt in SI_UNITS: - value = token.number orig_unit = next_token.txt unit: str @@ -2462,8 +2462,8 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]: # Cases such as 19 $, 199.99 $ if ( - token.kind == TOK.NUMBER - and next_token.kind == TOK.PUNCTUATION + token.kind == TOK.NUMBER + and next_token.kind == TOK.PUNCTUATION and next_token.txt in CURRENCY_SYMBOLS ): token = TOK.Amount( @@ -2502,7 +2502,6 @@ def parse_sentences(token_stream: Iterator[Tok]) -> Iterator[Tok]: tok_end_sentence = TOK.End_Sentence() try: - # Maintain a one-token lookahead token = next(token_stream) while True: @@ -2550,7 +2549,7 @@ def parse_sentences(token_stream: Iterator[Tok]) -> Iterator[Tok]: yield tok_begin_sentence in_sentence = True if ( - token.punctuation in PUNCT_INDIRECT_SPEECH + token.punctuation in PUNCT_INDIRECT_SPEECH and next_token.punctuation in DQUOTES ): yield token @@ -2574,7 +2573,7 @@ def parse_sentences(token_stream: Iterator[Tok]) -> Iterator[Tok]: ): # Combining punctuation ('??!!!') while ( - token.punctuation in PUNCT_COMBINATIONS + token.punctuation in PUNCT_COMBINATIONS and next_token.punctuation in PUNCT_COMBINATIONS ): # The normalized form comes from the first token except with "…?" 
@@ -2639,7 +2638,6 @@ def parse_phrases_1(token_stream: Iterator[Tok]) -> Iterator[Tok]: token = cast(Tok, None) try: - # Maintain a one-token lookahead token = next(token_stream) while True: @@ -2698,7 +2696,6 @@ def parse_phrases_1(token_stream: Iterator[Tok]) -> Iterator[Tok]: # Check for [date] [year] if token.kind == TOK.DATE and next_token.kind == TOK.YEAR: - dt = cast(DateTimeTuple, token.val) if not dt[0]: # No year yet: add it @@ -2729,8 +2726,8 @@ def parse_phrases_1(token_stream: Iterator[Tok]) -> Iterator[Tok]: next_token = next(token_stream) if ( - token.kind == TOK.NUMBER - and next_token.kind == TOK.TELNO + token.kind == TOK.NUMBER + and next_token.kind == TOK.TELNO and token.txt in COUNTRY_CODES ): # Check for country code in front of telephone number @@ -2758,7 +2755,6 @@ def parse_date_and_time(token_stream: Iterator[Tok]) -> Iterator[Tok]: token = cast(Tok, None) try: - # Maintain a one-token lookahead token = next(token_stream) @@ -2918,12 +2914,10 @@ def parse_phrases_2( token = cast(Tok, None) try: - # Maintain a one-token lookahead token = next(token_stream) while True: - next_token = next(token_stream) # Logic for numbers and fractions that are partially or entirely @@ -2943,7 +2937,6 @@ def parse_phrases_2( # Check for [number] [ISK_AMOUNT|CURRENCY|PERCENTAGE] elif token.kind == TOK.NUMBER and next_token.kind == TOK.WORD: - if next_token.txt in AMOUNT_ABBREV: # Abbreviations for ISK amounts # For abbreviations, we do not know the case, @@ -3120,7 +3113,6 @@ def mark_paragraphs(txt: str) -> str: def paragraphs(tokens: Iterable[Tok]) -> Iterator[List[Tuple[int, List[Tok]]]]: - """Generator yielding paragraphs from token iterable. Each paragraph is a list of sentence tuples. Sentence tuples consist of the index of the first token of the sentence (the TOK.S_BEGIN token) and a list of the tokens within the @@ -3177,13 +3169,17 @@ def valid_sent(sent: Optional[List[Tok]]) -> bool: # The following regex catches English numbers with a dot only r"|([\+\-\$€]?\d+\.\d+(?!\,\d))" # -1234.56 # The following regex catches Icelandic abbreviations, e.g. a.m.k., A.M.K., þ.e.a.s. - r"|([a-záðéíóúýþæöA-ZÁÐÉÍÓÚÝÞÆÖ]+\.(?:[a-záðéíóúýþæöA-ZÁÐÉÍÓÚÝÞÆÖ]+\.)+)" + r"|(\p{L}+\.(?:\p{L}+\.)+)(?!\p{L}+\s)" + # The following regex catches degree characters, i.e. °C, °F + r"|(°[CF])" # Finally, space and punctuation r"|([~\s" + "".join("\\" + c for c in PUNCTUATION) + r"])" ) -RE_SPLIT = re.compile(RE_SPLIT_STR) +# The re module doesn't support \p{L}, which matches any letter in any language, +# but regex does. +RE_SPLIT = regex.compile(RE_SPLIT_STR) def correct_spaces(s: str) -> str: diff --git a/test/test_tokenizer.py b/test/test_tokenizer.py index 6802a97..01116fa 100755 --- a/test/test_tokenizer.py +++ b/test/test_tokenizer.py @@ -63,7 +63,6 @@ def get_text_and_norm(orig: str) -> Tuple[str, str]: def test_single_tokens() -> None: - TEST_CASES = [ (".", TOK.PUNCTUATION), (",", TOK.PUNCTUATION), @@ -616,7 +615,6 @@ def run_test(test_cases: Iterable[TestCase], **options: Any) -> None: def test_sentences() -> None: - KIND = { "B": TOK.S_BEGIN, "E": TOK.S_END, @@ -646,7 +644,6 @@ def test_sentences() -> None: } def test_sentence(text: str, expected: str, **options: Any) -> None: - exp = expected.split() s = list(t.tokenize(text, **options)) assert len(s) == len(exp) @@ -1131,13 +1128,23 @@ def test_correct_spaces() -> None: s = t.correct_spaces("Jón- sem var formaður — mótmælti málinu.") assert s == "Jón-sem var formaður—mótmælti málinu." 
s = t.correct_spaces("Það á að geyma mjólkina við 20 ± 3 °C") - assert s == "Það á að geyma mjólkina við 20±3° C" + assert s == "Það á að geyma mjólkina við 20±3 °C" s = t.correct_spaces("Við förum t.d. til Íslands o.s.frv.") assert s == "Við förum t.d. til Íslands o.s.frv." + s = t.correct_spaces("Við förum t. d. til Íslands o. s. frv.") + assert ( + s == "Við förum t. d. til Íslands o. s. frv." + ) # This shouldn't be corrected here s = t.correct_spaces("M.a. lögum við bil.") assert s == "M.a. lögum við bil." s = t.correct_spaces("HANN BORÐAR Þ.Á.M. EPLI.") assert s == "HANN BORÐAR Þ.Á.M. EPLI." + s = t.correct_spaces("Ég fór til Írlands 6.júní og þar var 17.4°C hiti eða 230.3K.") + assert s == "Ég fór til Írlands 6. júní og þar var 17.4 °C hiti eða 230.3 K." + s = t.correct_spaces( + "Þetta er setning.Þetta er önnur setning.Líka.En hvað með þetta?" + ) + assert s == "Þetta er setning. Þetta er önnur setning. Líka. En hvað með þetta?" def test_abbrev() -> None: @@ -2562,7 +2569,6 @@ def test_one_sent_per_line() -> None: if __name__ == "__main__": - test_single_tokens() test_sentences() test_correct_spaces() From 25649f611ba9a782b1586154a6b6a5f7e315c24b Mon Sep 17 00:00:00 2001 From: thorunna Date: Fri, 3 Nov 2023 09:23:32 +0000 Subject: [PATCH 5/9] added installation for regex module --- .github/workflows/python-package.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index aad426a..cf3b2d6 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -30,6 +30,9 @@ jobs: run: | python -m pip install --upgrade pip wheel setuptools python -m pip install -e ".[dev]" + - name: Install regex module + run: | + python -m pip install regex - name: Type check with mypy (only on Python 3.8) run: | if [ "${{ matrix.python-version }}" == "3.8" ]; then python -m pip install mypy; fi From 877df42e17160a7abcc1301bcd18369c9c1264b9 Mon Sep 17 00:00:00 2001 From: thorunna Date: Fri, 3 Nov 2023 09:26:45 +0000 Subject: [PATCH 6/9] reversed change --- .github/workflows/python-package.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index cf3b2d6..aad426a 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -30,9 +30,6 @@ jobs: run: | python -m pip install --upgrade pip wheel setuptools python -m pip install -e ".[dev]" - - name: Install regex module - run: | - python -m pip install regex - name: Type check with mypy (only on Python 3.8) run: | if [ "${{ matrix.python-version }}" == "3.8" ]; then python -m pip install mypy; fi From e84e4424d6d9a02bdde020b0236d7e705442a580 Mon Sep 17 00:00:00 2001 From: thorunna Date: Fri, 3 Nov 2023 11:28:41 +0000 Subject: [PATCH 7/9] went back to re module --- src/tokenizer/tokenizer.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/tokenizer/tokenizer.py b/src/tokenizer/tokenizer.py index d2c2401..8c14199 100644 --- a/src/tokenizer/tokenizer.py +++ b/src/tokenizer/tokenizer.py @@ -58,7 +58,6 @@ import datetime import re -import regex import unicodedata # type: ignore from collections import deque @@ -3169,7 +3168,8 @@ def valid_sent(sent: Optional[List[Tok]]) -> bool: # The following regex catches English numbers with a dot only r"|([\+\-\$€]?\d+\.\d+(?!\,\d))" # -1234.56 # The following regex catches Icelandic abbreviations, e.g. a.m.k., A.M.K., þ.e.a.s. 
- r"|(\p{L}+\.(?:\p{L}+\.)+)(?!\p{L}+\s)" + # r"|(\p{L}+\.(?:\p{L}+\.)+)(?!\p{L}+\s)" + r"|([a-záðéíóúýþæö]+\.(?:[a-záðéíóúýþæö]+\.)+)(?![a-záðéíóúýþæö]+\s)" # The following regex catches degree characters, i.e. °C, °F r"|(°[CF])" # Finally, space and punctuation @@ -3177,9 +3177,7 @@ def valid_sent(sent: Optional[List[Tok]]) -> bool: + "".join("\\" + c for c in PUNCTUATION) + r"])" ) -# The re module doesn't support \p{L}, which matches any letter in any language, -# but regex does. -RE_SPLIT = regex.compile(RE_SPLIT_STR) +RE_SPLIT = re.compile(RE_SPLIT_STR, re.IGNORECASE) def correct_spaces(s: str) -> str: From 8a6054347603b1b6abad84803bdf56e795a22c9b Mon Sep 17 00:00:00 2001 From: thorunna Date: Fri, 3 Nov 2023 11:40:46 +0000 Subject: [PATCH 8/9] old regex string, which was commented out, removed --- src/tokenizer/tokenizer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/tokenizer/tokenizer.py b/src/tokenizer/tokenizer.py index 8c14199..62295f5 100644 --- a/src/tokenizer/tokenizer.py +++ b/src/tokenizer/tokenizer.py @@ -3168,7 +3168,6 @@ def valid_sent(sent: Optional[List[Tok]]) -> bool: # The following regex catches English numbers with a dot only r"|([\+\-\$€]?\d+\.\d+(?!\,\d))" # -1234.56 # The following regex catches Icelandic abbreviations, e.g. a.m.k., A.M.K., þ.e.a.s. - # r"|(\p{L}+\.(?:\p{L}+\.)+)(?!\p{L}+\s)" r"|([a-záðéíóúýþæö]+\.(?:[a-záðéíóúýþæö]+\.)+)(?![a-záðéíóúýþæö]+\s)" # The following regex catches degree characters, i.e. °C, °F r"|(°[CF])" From 66ffce0404155b07f369ad5e60383d2a42a6bee4 Mon Sep 17 00:00:00 2001 From: thorunna Date: Fri, 3 Nov 2023 13:30:52 +0000 Subject: [PATCH 9/9] updated regex string --- src/tokenizer/tokenizer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/tokenizer/tokenizer.py b/src/tokenizer/tokenizer.py index 62295f5..c0ea7a3 100644 --- a/src/tokenizer/tokenizer.py +++ b/src/tokenizer/tokenizer.py @@ -3168,7 +3168,7 @@ def valid_sent(sent: Optional[List[Tok]]) -> bool: # The following regex catches English numbers with a dot only r"|([\+\-\$€]?\d+\.\d+(?!\,\d))" # -1234.56 # The following regex catches Icelandic abbreviations, e.g. a.m.k., A.M.K., þ.e.a.s. - r"|([a-záðéíóúýþæö]+\.(?:[a-záðéíóúýþæö]+\.)+)(?![a-záðéíóúýþæö]+\s)" + r"|([^\W\d_]+\.(?:[^\W\d_]+\.)+)(?![^\W\d_]+\s)" # The following regex catches degree characters, i.e. °C, °F r"|(°[CF])" # Finally, space and punctuation @@ -3176,7 +3176,7 @@ def valid_sent(sent: Optional[List[Tok]]) -> bool: + "".join("\\" + c for c in PUNCTUATION) + r"])" ) -RE_SPLIT = re.compile(RE_SPLIT_STR, re.IGNORECASE) +RE_SPLIT = re.compile(RE_SPLIT_STR) def correct_spaces(s: str) -> str:
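
Note on the end state of the series: patches 4 through 9 net out to two new alternatives in RE_SPLIT_STR used by correct_spaces(). One keeps dotted abbreviations such as "a.m.k." or "þ.e.a.s." together as a single token, and one keeps the degree sign attached in "°C"/"°F". The final form in patch 9 relies on the character class [^\W\d_], which with the stdlib re module's default Unicode handling of str patterns behaves roughly like \p{L}, so neither the third-party regex dependency from patch 4 nor the re.IGNORECASE flag from patch 7 is needed. The sketch below compiles just these two alternatives on their own to show what they match; the names ABBREV and DEGREE are local to this sketch, and the expected outputs follow the behavior exercised by the new test_correct_spaces() cases.

import re

# The two alternatives added to RE_SPLIT_STR, compiled in isolation.
# [^\W\d_] is a Unicode-aware letter class: a word character that is
# neither a digit nor an underscore.
ABBREV = re.compile(r"([^\W\d_]+\.(?:[^\W\d_]+\.)+)(?![^\W\d_]+\s)")
DEGREE = re.compile(r"(°[CF])")

print(ABBREV.findall("Við förum t.d. til Íslands o.s.frv."))
# ['t.d.', 'o.s.frv.'] - dotted abbreviations survive as single tokens
print(ABBREV.findall("HANN BORÐAR Þ.Á.M. EPLI."))
# ['Þ.Á.M.'] - upper-case Icelandic letters match without re.IGNORECASE
print(ABBREV.findall("Við förum t. d. til Íslands"))
# [] - already-spaced "t. d." is left to the ordinary punctuation handling
print(ABBREV.findall("önnur setning.Líka.En hvað með þetta?"))
# [] - the trailing lookahead stops missing-space sentence breaks from
#      being glued together as if they were one abbreviation
print(DEGREE.findall("Það á að geyma mjólkina við 20 ± 3 °C"))
# ['°C'] - the degree sign and unit stay together as one token

With the full RE_SPLIT in place, the updated tests show the resulting behavior of correct_spaces(): "Það á að geyma mjólkina við 20 ± 3 °C" is now corrected to "Það á að geyma mjólkina við 20±3 °C" rather than "20±3° C", strings such as "Við förum t.d. til Íslands o.s.frv." pass through unchanged, and run-together sentences like "Þetta er setning.Þetta er önnur setning." still get the missing spaces inserted after the periods.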