Fix unnatural tokenizations if possible
KlaudiaTH committed Nov 8, 2023
1 parent b05c53d commit a69dc1e
Showing 1 changed file with 14 additions and 0 deletions.
14 changes: 14 additions & 0 deletions lm_eval/base.py
@@ -180,6 +180,20 @@ def loglikelihood(self, requests):

 continuation_enc = self.tok_encode(continuation)
 # continuation_enc = self.tok_encode(continuation, is_continuation=True)
+context_continuation_enc = self.tok_encode(context + continuation)
+
+if context_enc + continuation_enc != context_continuation_enc:
+    if context_continuation_enc[: len(context_enc)] == context_enc:
+        # continuation_enc is incorrect and context_enc is correct
+        continuation_enc = context_continuation_enc[len(context_enc) :]
+    elif context_continuation_enc[-len(continuation_enc) :] == continuation_enc:
+        # continuation_enc is correct and context_enc is incorrect
+        context_enc = context_continuation_enc[: -len(continuation_enc)]
+    else:
+        # Both are incorrect
+        print(
+            f"WARNING: Unnatural tokenization of concatenated context ...{repr(context[-20:])} and continuation {repr(continuation)}"
+        )
 
 new_reqs.append(((context, continuation), context_enc, continuation_enc))
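
Read outside the diff, the added branches act as a small realignment routine: when the separately encoded context and continuation disagree with the encoding of their concatenation, the concatenated encoding's split is trusted as long as either the context prefix or the continuation suffix still lines up; otherwise only a warning is printed. The sketch below restates this as a standalone helper for illustration. It is hypothetical: lm_eval inlines the logic in `loglikelihood` rather than defining a `realign` function, and the token ids are invented.

```python
# Hypothetical restatement of the realignment added in this commit.
# lm_eval inlines this in loglikelihood(); the token ids below are invented.

def realign(context_enc, continuation_enc, context_continuation_enc):
    """Prefer the concatenated encoding's split when the separate encodings drift."""
    if context_enc + continuation_enc == context_continuation_enc:
        return context_enc, continuation_enc  # encodings already agree
    if context_continuation_enc[: len(context_enc)] == context_enc:
        # context_enc still matches as a prefix -> recover the continuation from the tail
        return context_enc, context_continuation_enc[len(context_enc):]
    if context_continuation_enc[-len(continuation_enc):] == continuation_enc:
        # continuation_enc still matches as a suffix -> recover the context from the head
        return context_continuation_enc[: -len(continuation_enc)], continuation_enc
    # Neither boundary lines up; the commit only warns and keeps the originals.
    print("WARNING: Unnatural tokenization of concatenated context and continuation")
    return context_enc, continuation_enc

# Continuation drifted (e.g. a leading-space merge) but the context prefix is intact:
print(realign([17, 4], [99, 3], [17, 4, 55, 3]))  # -> ([17, 4], [55, 3])

# A merge crossed the boundary entirely, so nothing can be recovered:
print(realign([17, 4], [99], [8000]))             # -> warning, then ([17, 4], [99])
```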

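Whether a given tokenizer actually produces this drift depends on its merge rules and whitespace handling, so it is worth probing empirically. A minimal check with Hugging Face transformers (assuming the library is installed; the "gpt2" checkpoint is only an example):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")  # example checkpoint; any causal-LM tokenizer works

context, continuation = "Answer:", " yes"
context_enc = tok.encode(context, add_special_tokens=False)
continuation_enc = tok.encode(continuation, add_special_tokens=False)
context_continuation_enc = tok.encode(context + continuation, add_special_tokens=False)

# False means the separate encodings drifted from the concatenated one,
# which is exactly the situation this commit tries to repair.
print(context_enc + continuation_enc == context_continuation_enc)
```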
