Update the arguments `add_prefix_space` and `trim_offsets` in `backend_tokenizer.post_processor` of `RobertaTokenizerFast` (#14752)

* add tests
* change post-processor, pre-tokenizer and decoder (can't update decoder)
* update test (remove decoder which doesn't depend on trim and add_prefix)
* just update the post_processor
* fix change
* `trim_offsets` has no influence on `pre_tokenizer`
* remove a test that needs some input from the `tokenizers` lib maintainers
* format
* add new test offsets roberta
* polish comments
Parent: ec3567fe20
Commit: c94c1b8967
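For context (not part of the commit message): `add_prefix_space` controls whether a space is prepended to the input before byte-level BPE so that the first word is treated like any other word, and `trim_offsets` controls whether the post-processing step strips whitespace from the returned offset spans. A minimal sketch of the user-visible effect, assuming the `roberta-base` checkpoint (the checkpoint name and the example text are illustrative, not taken from this commit):

from transformers import RobertaTokenizerFast

# Same input, two settings of trim_offsets; the offsets of the second word differ.
text = "hello hello"
for trim_offsets in (True, False):
    tokenizer = RobertaTokenizerFast.from_pretrained(
        "roberta-base", add_prefix_space=True, trim_offsets=trim_offsets
    )
    encoding = tokenizer(text, return_offsets_mapping=True, add_special_tokens=False)
    print(trim_offsets, encoding.offset_mapping)
    # Expected, following the new test below: [(0, 5), (6, 11)] with trim_offsets=True,
    # [(0, 5), (5, 11)] with trim_offsets=False.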
@@ -13,9 +13,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Fast Tokenization classes for RoBERTa."""

import json
from typing import List, Optional

from tokenizers import processors

from ...tokenization_utils_base import AddedToken
from ...utils import logging
from ..gpt2.tokenization_gpt2_fast import GPT2TokenizerFast
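The new `processors` import is what the `__init__` change below relies on: the serialized post-processor state carries a `"type"` field naming a class in `tokenizers.processors`, and that class is looked up to rebuild the component. A small sketch of the lookup, assuming a RoBERTa-style post-processor (the class name and token ids here are assumptions, not taken from this diff):

from tokenizers import processors

# For RoBERTa-style tokenizers the serialized state typically has "type": "RobertaProcessing";
# looking the name up on the `processors` module returns the class itself.
component_class = getattr(processors, "RobertaProcessing")

# `sep` and `cls` are (token, id) tuples; 2 and 0 are the usual roberta-base ids.
post_processor = component_class(
    sep=("</s>", 2), cls=("<s>", 0), trim_offsets=True, add_prefix_space=False
)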
@@ -162,6 +164,7 @@ class RobertaTokenizerFast(GPT2TokenizerFast):
        pad_token="<pad>",
        mask_token="<mask>",
        add_prefix_space=False,
        trim_offsets=True,
        **kwargs
    ):
        super().__init__(
@@ -177,9 +180,37 @@ class RobertaTokenizerFast(GPT2TokenizerFast):
            pad_token=pad_token,
            mask_token=mask_token,
            add_prefix_space=add_prefix_space,
            trim_offsets=trim_offsets,
            **kwargs,
        )

        # the pre_tokenizer is already updated in the GPT2TokenizerFast `__init__`
        tokenizer_component = "post_processor"
        tokenizer_component_instance = getattr(self.backend_tokenizer, tokenizer_component, None)
        if tokenizer_component_instance:
            state = json.loads(tokenizer_component_instance.__getstate__())

            # The lists 'sep' and 'cls' must be cast to tuples for the object `post_processor_class`
            if "sep" in state:
                state["sep"] = tuple(state["sep"])
            if "cls" in state:
                state["cls"] = tuple(state["cls"])

            changes_to_apply = False

            if state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
                state["add_prefix_space"] = add_prefix_space
                changes_to_apply = True

            if state.get("trim_offsets", trim_offsets) != trim_offsets:
                state["trim_offsets"] = trim_offsets
                changes_to_apply = True

            if changes_to_apply:
                component_class = getattr(processors, state.pop("type"))
                new_value = component_class(**state)
                setattr(self.backend_tokenizer, tokenizer_component, new_value)

    @property
    def mask_token(self) -> str:
        """
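The same read, modify, rebuild pattern can be applied to an already constructed fast tokenizer, outside of `__init__`. A minimal sketch, assuming the `roberta-base` checkpoint and toggling `trim_offsets` (both are illustrative assumptions, not part of this commit):

import json

from tokenizers import processors
from transformers import RobertaTokenizerFast

tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")

# Read the serialized state of the backend post-processor ...
state = json.loads(tokenizer.backend_tokenizer.post_processor.__getstate__())

# ... cast the special-token entries back to tuples, as the processor constructor expects ...
for key in ("sep", "cls"):
    if key in state:
        state[key] = tuple(state[key])

# ... change the desired flag and rebuild the component from its "type" field.
state["trim_offsets"] = False
component_class = getattr(processors, state.pop("type"))
tokenizer.backend_tokenizer.post_processor = component_class(**state)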
@@ -14,6 +14,7 @@
# limitations under the License.


import itertools
import json
import os
import unittest
@@ -196,3 +197,107 @@ class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
        self.assertSequenceEqual(
            tokens_r_str, ["<s>", "A", ",", "<mask>", "ĠAllen", "N", "LP", "Ġsentence", ".", "</s>"]
        )

    def test_change_add_prefix_space_and_trim_offsets_args(self):
        for trim_offsets, add_prefix_space in itertools.product([True, False], repeat=2):
            tokenizer_r = self.rust_tokenizer_class.from_pretrained(
                self.tmpdirname, use_fast=True, add_prefix_space=add_prefix_space, trim_offsets=trim_offsets
            )

            pre_tokenizer_state = json.loads(tokenizer_r.backend_tokenizer.pre_tokenizer.__getstate__())
            post_processor_state = json.loads(tokenizer_r.backend_tokenizer.post_processor.__getstate__())

            self.assertEqual(pre_tokenizer_state["add_prefix_space"], add_prefix_space)

            self.assertEqual(post_processor_state["add_prefix_space"], add_prefix_space)
            self.assertEqual(post_processor_state["trim_offsets"], trim_offsets)

    def test_offsets_mapping_with_different_add_prefix_space_and_trim_space_arguments(self):
        # Test that the offsets are correctly adapted to the arguments `add_prefix_space` and
        # `trim_offsets`
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                text_of_1_token = "hello"  # `hello` is a token in the vocabulary of `pretrained_name`
                text = f"{text_of_1_token} {text_of_1_token}"

                tokenizer_r = self.rust_tokenizer_class.from_pretrained(
                    pretrained_name, use_fast=True, add_prefix_space=True, trim_offsets=True
                )
                encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
                self.assertEqual(encoding.offset_mapping[0], (0, len(text_of_1_token)))
                self.assertEqual(
                    encoding.offset_mapping[1],
                    (len(text_of_1_token) + 1, len(text_of_1_token) + 1 + len(text_of_1_token)),
                )

                tokenizer_r = self.rust_tokenizer_class.from_pretrained(
                    pretrained_name, use_fast=True, add_prefix_space=False, trim_offsets=True
                )
                encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
                self.assertEqual(encoding.offset_mapping[0], (0, len(text_of_1_token)))
                self.assertEqual(
                    encoding.offset_mapping[1],
                    (len(text_of_1_token) + 1, len(text_of_1_token) + 1 + len(text_of_1_token)),
                )

                tokenizer_r = self.rust_tokenizer_class.from_pretrained(
                    pretrained_name, use_fast=True, add_prefix_space=True, trim_offsets=False
                )
                encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
                self.assertEqual(encoding.offset_mapping[0], (0, len(text_of_1_token)))
                self.assertEqual(
                    encoding.offset_mapping[1],
                    (len(text_of_1_token), len(text_of_1_token) + 1 + len(text_of_1_token)),
                )

                tokenizer_r = self.rust_tokenizer_class.from_pretrained(
                    pretrained_name, use_fast=True, add_prefix_space=False, trim_offsets=False
                )
                encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
                self.assertEqual(encoding.offset_mapping[0], (0, len(text_of_1_token)))
                self.assertEqual(
                    encoding.offset_mapping[1],
                    (len(text_of_1_token), len(text_of_1_token) + 1 + len(text_of_1_token)),
                )

                text = f" {text}"

                # tokenizer_r = self.rust_tokenizer_class.from_pretrained(
                #     pretrained_name, use_fast=True, add_prefix_space=True, trim_offsets=True
                # )
                # encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
                # self.assertEqual(encoding.offset_mapping[0], (1, 1 + len(text_of_1_token)))
                # self.assertEqual(
                #     encoding.offset_mapping[1],
                #     (1 + len(text_of_1_token) + 1, 1 + len(text_of_1_token) + 1 + len(text_of_1_token)),
                # )

                tokenizer_r = self.rust_tokenizer_class.from_pretrained(
                    pretrained_name, use_fast=True, add_prefix_space=False, trim_offsets=True
                )
                encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
                self.assertEqual(encoding.offset_mapping[0], (1, 1 + len(text_of_1_token)))
                self.assertEqual(
                    encoding.offset_mapping[1],
                    (1 + len(text_of_1_token) + 1, 1 + len(text_of_1_token) + 1 + len(text_of_1_token)),
                )

                tokenizer_r = self.rust_tokenizer_class.from_pretrained(
                    pretrained_name, use_fast=True, add_prefix_space=True, trim_offsets=False
                )
                encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
                self.assertEqual(encoding.offset_mapping[0], (0, 1 + len(text_of_1_token)))
                self.assertEqual(
                    encoding.offset_mapping[1],
                    (1 + len(text_of_1_token), 1 + len(text_of_1_token) + 1 + len(text_of_1_token)),
                )

                tokenizer_r = self.rust_tokenizer_class.from_pretrained(
                    pretrained_name, use_fast=True, add_prefix_space=False, trim_offsets=False
                )
                encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
                self.assertEqual(encoding.offset_mapping[0], (0, 1 + len(text_of_1_token)))
                self.assertEqual(
                    encoding.offset_mapping[1],
                    (1 + len(text_of_1_token), 1 + len(text_of_1_token) + 1 + len(text_of_1_token)),
                )