[CodeLlamaTokenizerFast] Fix `set_infilling_processor` to properly reset (#26041)

* fix `set_infilling_processor` to properly reset

* Add docstring!

* fixups

* more details in the documentation about the tokenization

* style
This commit is contained in:
Arthur 2023-09-08 16:03:09 -04:00 committed by GitHub
parent d53606031f
commit 09b2de6eb7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -256,6 +256,16 @@ class CodeLlamaTokenizerFast(PreTrainedTokenizerFast):
self.update_post_processor()
def set_infilling_processor(self, reset, suffix_first=False, add_special_tokens=True):
"""
Updates the normalizer to make sure the prompt format for `infilling` is respected. The infilling format is the
following: if suffix_first
" <PRE> <SUF>{suf} <MID> {pre}"
else:
" <PRE> {pre} <SUF>{suf} <MID>"
If `reset` is set to `True`, the `normalizer` and `post_processor` are reset to their "normal" behaviour, which
is to add a prefix space for the normalizer, and add a `bos_token` to the input text for the `post_processor`.
"""
if reset:
self._tokenizer.normalizer = normalizers.Sequence(
[
@ -264,6 +274,7 @@ class CodeLlamaTokenizerFast(PreTrainedTokenizerFast):
]
)
self.update_post_processor()
return
self._tokenizer.normalizer = normalizers.Replace(pattern=" ", content="")
pair = [self.bos_token] if self.add_bos_token and add_special_tokens else []