diff --git a/finetune/encoding/input_encoder.py b/finetune/encoding/input_encoder.py index eddb6266..86c901fa 100644 --- a/finetune/encoding/input_encoder.py +++ b/finetune/encoding/input_encoder.py @@ -320,7 +320,7 @@ def tokenize_context(context, encoded_output, config): ) if ( context_by_char_loc[current_char_loc][2] - and token.strip().strip("Â") not in context_by_char_loc[current_char_loc][2] + and token.strip().strip("Âĉ") not in context_by_char_loc[current_char_loc][2] ): warnings.warn( "subtoken: {} has matched up with the context for: {}".format(