
Commit be8ee4d
Merge branch 'master' of https://github.com/mideind/Tokenizer
sveinbjornt committed May 12, 2024
2 parents c0db5e1 + 43a931e commit be8ee4d
Showing 2 changed files with 22 additions and 17 deletions.
17 changes: 5 additions & 12 deletions src/tokenizer/tokenizer.py
@@ -1453,7 +1453,6 @@ def generate_raw_tokens(
     big_text: str

     for big_text in text_or_gen:
-
         if not one_sent_per_line and not big_text:
             # An explicit empty string in the input always
             # causes a sentence split
@@ -1831,7 +1830,6 @@ def parse_mixed(
     pp = PunctuationParser()

     while rt.txt:
-
         # Handle punctuation
         yield from pp.parse(rt)
         rt, ate = pp.rt, pp.ate
@@ -2350,7 +2348,6 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]:
         if (
             token.kind == TOK.NUMBER or token.kind == TOK.YEAR
         ) and next_token.txt in SI_UNITS:
-
             value = token.number
             orig_unit = next_token.txt
             unit: str
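The effect of this branch can be observed through the library's public API. A hedged sketch (the sample sentence is an illustration, not from the commit; TOK.descr maps token kinds to their names):

import tokenizer as t

# A number followed by an SI unit such as "km" should surface as a
# single measurement token rather than as two separate tokens.
for tok in t.tokenize("Vegalengdin er 20 km."):
    print(t.TOK.descr[tok.kind], tok.txt)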
@@ -2499,12 +2496,11 @@ def parse_sentences(token_stream: Iterator[Tok]) -> Iterator[Tok]:
     exclamation marks, etc.)"""

     in_sentence = False
-    token = cast(Tok, None)
+    token: Optional[Tok] = None
     tok_begin_sentence = TOK.Begin_Sentence()
     tok_end_sentence = TOK.End_Sentence()

     try:
-
         # Maintain a one-token lookahead
         token = next(token_stream)
         while True:
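The try-block above introduces the one-token-lookahead loop that recurs in parse_phrases_1, parse_date_and_time, and parse_phrases_2 below. A minimal standalone sketch of the pattern, with illustrative names rather than the library's actual code:

from typing import Iterator, TypeVar

T = TypeVar("T")

def with_lookahead(stream: Iterator[T]) -> Iterator[T]:
    # Keep one token of lookahead so each token can be inspected
    # (and possibly merged) together with its successor.
    try:
        token = next(stream)
    except StopIteration:
        return
    for next_token in stream:
        # ... a real parser may merge token with next_token here ...
        yield token
        token = next_token
    # Flush the final token once the stream is exhausted
    yield token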
@@ -2641,7 +2637,6 @@ def parse_phrases_1(token_stream: Iterator[Tok]) -> Iterator[Tok]:

     token = cast(Tok, None)
     try:
-
         # Maintain a one-token lookahead
         token = next(token_stream)
         while True:
@@ -2700,7 +2695,6 @@ def parse_phrases_1(token_stream: Iterator[Tok]) -> Iterator[Tok]:

             # Check for [date] [year]
             if token.kind == TOK.DATE and next_token.kind == TOK.YEAR:
-
                 dt = cast(DateTimeTuple, token.val)
                 if not dt[0]:
                     # No year yet: add it
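A hedged end-to-end check of the [date] [year] merge (the sample sentence and expected behaviour are inferred from the code above, not stated in the commit):

import tokenizer as t

# The date "1. júní" and the following year should be merged into a
# single date token carrying the full (y, m, d) value.
for tok in t.tokenize("Fundurinn var haldinn 1. júní 2024."):
    print(t.TOK.descr[tok.kind], tok.txt, tok.val)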
@@ -2760,7 +2754,6 @@ def parse_date_and_time(token_stream: Iterator[Tok]) -> Iterator[Tok]:

     token = cast(Tok, None)
     try:
-
         # Maintain a one-token lookahead
         token = next(token_stream)

@@ -2920,12 +2913,10 @@ def parse_phrases_2(

     token = cast(Tok, None)
     try:
-
         # Maintain a one-token lookahead
         token = next(token_stream)

         while True:
-
             next_token = next(token_stream)

             # Logic for numbers and fractions that are partially or entirely
@@ -2945,7 +2936,6 @@

             # Check for [number] [ISK_AMOUNT|CURRENCY|PERCENTAGE]
             elif token.kind == TOK.NUMBER and next_token.kind == TOK.WORD:
-
                 if next_token.txt in AMOUNT_ABBREV:
                     # Abbreviations for ISK amounts
                     # For abbreviations, we do not know the case,
@@ -3122,7 +3112,6 @@ def mark_paragraphs(txt: str) -> str:


 def paragraphs(tokens: Iterable[Tok]) -> Iterator[List[Tuple[int, List[Tok]]]]:
-
     """Generator yielding paragraphs from token iterable. Each paragraph is a list
     of sentence tuples. Sentence tuples consist of the index of the first token
     of the sentence (the TOK.S_BEGIN token) and a list of the tokens within the
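A usage sketch of this generator, assuming paragraphs, tokenize, and mark_paragraphs are importable from the package as the definitions above suggest:

from tokenizer import mark_paragraphs, paragraphs, tokenize

# mark_paragraphs() inserts paragraph markers into newline-separated text;
# paragraphs() then groups the token stream by those markers.
text = mark_paragraphs("Fyrsta efnisgrein.\nÖnnur efnisgrein.")
for pg in paragraphs(tokenize(text)):
    for ix, sent in pg:
        # ix is the index of the sentence's TOK.S_BEGIN token
        print(ix, " ".join(tok.txt for tok in sent if tok.txt))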
@@ -3178,6 +3167,10 @@ def valid_sent(sent: Optional[List[Tok]]) -> bool:
     r"|([\+\-\$€]?\d+\,\d+(?!\.\d))" # -1234,56
     # The following regex catches English numbers with a dot only
     r"|([\+\-\$€]?\d+\.\d+(?!\,\d))" # -1234.56
+    # The following regex catches Icelandic abbreviations, e.g. a.m.k., A.M.K., þ.e.a.s.
+    r"|([^\W\d_]+\.(?:[^\W\d_]+\.)+)(?![^\W\d_]+\s)"
+    # The following regex catches degree characters, i.e. °C, °F
+    r"|(°[CF])"
     # Finally, space and punctuation
     r"|([~\s"
     + "".join("\\" + c for c in PUNCTUATION)
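The two new branches can be sanity-checked in isolation. A sketch (the pattern name and sample strings are assumptions, not part of the module):

import re

# The two branches added above, compiled on their own
NEW_BRANCHES = re.compile(
    # Multi-part Icelandic abbreviations, e.g. a.m.k., Þ.Á.M., þ.e.a.s.
    r"([^\W\d_]+\.(?:[^\W\d_]+\.)+)(?![^\W\d_]+\s)"
    # Degree characters followed by C or F, i.e. °C, °F
    r"|(°[CF])"
)

for s in ("a.m.k.", "Þ.Á.M.", "20 °C"):
    m = NEW_BRANCHES.search(s)
    print(s, "->", m.group(0) if m else None)
# a.m.k. -> a.m.k.; Þ.Á.M. -> Þ.Á.M.; 20 °C -> °C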
22 changes: 17 additions & 5 deletions test/test_tokenizer.py
@@ -63,7 +63,6 @@ def get_text_and_norm(orig: str) -> Tuple[str, str]:


 def test_single_tokens() -> None:
-
     TEST_CASES = [
         (".", TOK.PUNCTUATION),
         (",", TOK.PUNCTUATION),
@@ -616,7 +615,6 @@ def run_test(test_cases: Iterable[TestCase], **options: Any) -> None:


 def test_sentences() -> None:
-
     KIND = {
         "B": TOK.S_BEGIN,
         "E": TOK.S_END,
@@ -646,7 +644,6 @@ def test_sentences() -> None:
     }

     def test_sentence(text: str, expected: str, **options: Any) -> None:
-
         exp = expected.split()
         s = list(t.tokenize(text, **options))
         assert len(s) == len(exp)
@@ -1131,7 +1128,23 @@ def test_correct_spaces() -> None:
     s = t.correct_spaces("Jón- sem var formaður — mótmælti málinu.")
     assert s == "Jón-sem var formaður—mótmælti málinu."
     s = t.correct_spaces("Það á að geyma mjólkina við 20 ± 3 °C")
-    assert s == "Það á að geyma mjólkina við 20±3° C"
+    assert s == "Það á að geyma mjólkina við 20±3 °C"
+    s = t.correct_spaces("Við förum t.d. til Íslands o.s.frv.")
+    assert s == "Við förum t.d. til Íslands o.s.frv."
+    s = t.correct_spaces("Við förum t. d. til Íslands o. s. frv.")
+    assert (
+        s == "Við förum t. d. til Íslands o. s. frv."
+    ) # This shouldn't be corrected here
+    s = t.correct_spaces("M.a. lögum við bil.")
+    assert s == "M.a. lögum við bil."
+    s = t.correct_spaces("HANN BORÐAR Þ.Á.M. EPLI.")
+    assert s == "HANN BORÐAR Þ.Á.M. EPLI."
+    s = t.correct_spaces("Ég fór til Írlands 6.júní og þar var 17.4°C hiti eða 230.3K.")
+    assert s == "Ég fór til Írlands 6. júní og þar var 17.4 °C hiti eða 230.3 K."
+    s = t.correct_spaces(
+        "Þetta er setning.Þetta er önnur setning.Líka.En hvað með þetta?"
+    )
+    assert s == "Þetta er setning. Þetta er önnur setning. Líka. En hvað með þetta?"


 def test_abbrev() -> None:
@@ -2556,7 +2569,6 @@ def test_one_sent_per_line() -> None:


 if __name__ == "__main__":
-
     test_single_tokens()
     test_sentences()
     test_correct_spaces()
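The new expectations can be reproduced against this commit through the library's public correct_spaces function:

import tokenizer as t

print(t.correct_spaces("Ég fór til Írlands 6.júní og þar var 17.4°C hiti eða 230.3K."))
# Ég fór til Írlands 6. júní og þar var 17.4 °C hiti eða 230.3 K.
print(t.correct_spaces("Þetta er setning.Þetta er önnur setning.Líka.En hvað með þetta?"))
# Þetta er setning. Þetta er önnur setning. Líka. En hvað með þetta?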
