[CodeLlamaTokenizerFast] Fix `set_infilling_processor` to properly reset (#26041)

* fix `set_infilling_processor` to properly reset * Add docstring! * fixups * more details in the documentation about the tokenization * style
2025-08-02 19:21:31 +06:00 · 2023-09-08 16:03:09 -04:00 · 2023-09-08 16:03:09 -04:00 · 09b2de6eb7
commit 09b2de6eb7
parent d53606031f
1 changed files with 11 additions and 0 deletions
--- a/src/transformers/models/code_llama/tokenization_code_llama_fast.py
+++ b/src/transformers/models/code_llama/tokenization_code_llama_fast.py
@ -256,6 +256,16 @@ class CodeLlamaTokenizerFast(PreTrainedTokenizerFast):
        self.update_post_processor()

    def set_infilling_processor(self, reset, suffix_first=False, add_special_tokens=True):
+        """
+        Updates the normalizer to make sure the prompt format for `infilling` is respected. The infilling format is the
+        following: if suffix_first
+            " <PRE> <SUF>{suf} <MID> {pre}"
+        else:
+            " <PRE> {pre} <SUF>{suf} <MID>"
+
+        If `reset` is set to `True`, the `normalizer` and `post_processor` are reset to their "normal" behaviour, which
+        is to add a prefix space for the normalizer, and add a `bos_token` to the input text for the `post_processor`.
+        """
        if reset:
            self._tokenizer.normalizer = normalizers.Sequence(
                [
@ -264,6 +274,7 @@ class CodeLlamaTokenizerFast(PreTrainedTokenizerFast):
                ]
            )
            self.update_post_processor()
+            return

        self._tokenizer.normalizer = normalizers.Replace(pattern=" ", content="▁")
        pair = [self.bos_token] if self.add_bos_token and add_special_tokens else []