Mirror of https://github.com/huggingface/transformers.git (synced 2025-08-02 19:21:31 +06:00)
[CodeLlamaTokenizerFast] Fix `set_infilling_processor` to properly reset (#26041)
* fix `set_infilling_processor` to properly reset
* Add docstring!
* fixups
* more details in the documentation about the tokenization
* style
parent d53606031f
commit 09b2de6eb7
@@ -256,6 +256,16 @@ class CodeLlamaTokenizerFast(PreTrainedTokenizerFast):
         self.update_post_processor()
 
     def set_infilling_processor(self, reset, suffix_first=False, add_special_tokens=True):
+        """
+        Updates the normalizer to make sure the prompt format for `infilling` is respected. The infilling format is the
+        following: if suffix_first
+            " <PRE> <SUF>{suf} <MID> {pre}"
+        else:
+            " <PRE> {pre} <SUF>{suf} <MID>"
+
+        If `reset` is set to `True`, the `normalizer` and `post_processor` are reset to their "normal" behaviour, which
+        is to add a prefix space for the normalizer, and add a `bos_token` to the input text for the `post_processor`.
+        """
         if reset:
             self._tokenizer.normalizer = normalizers.Sequence(
                 [
@@ -264,6 +274,7 @@ class CodeLlamaTokenizerFast(PreTrainedTokenizerFast):
                 ]
             )
             self.update_post_processor()
+            return
 
         self._tokenizer.normalizer = normalizers.Replace(pattern=" ", content="▁")
         pair = [self.bos_token] if self.add_bos_token and add_special_tokens else []
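To illustrate the behaviour this fix restores, a minimal sketch follows. The checkpoint name and the `<FILL_ME>` prompt are illustrative assumptions, not part of this diff, and the tokenizer normally calls `set_infilling_processor` internally during encoding.

from transformers import CodeLlamaTokenizerFast

# Illustrative checkpoint; any Code Llama checkpoint with the default fill token should behave the same.
tok = CodeLlamaTokenizerFast.from_pretrained("codellama/CodeLlama-7b-hf")

# A prompt containing the fill token triggers the infilling format
# " <PRE> {pre} <SUF>{suf} <MID>" described in the new docstring.
prompt = 'def remove_non_ascii(s: str) -> str:\n    """<FILL_ME>\n    return result\n'
infill_ids = tok(prompt)["input_ids"]

# With this fix, resetting the processor restores the "normal" behaviour
# (prefix space in the normalizer, bos_token in the post-processor).
tok.set_infilling_processor(reset=True)
plain_ids = tok("def hello():")["input_ids"]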