mirror of
https://github.com/huggingface/transformers.git
synced 2025-07-31 02:02:21 +06:00
Fix fast tokenization problems (#13930)
* Fix albert mask token tokenization. * Ensure special tokens are sanitized. * Style * Fix * Apply suggestions from code review
This commit is contained in:
parent
5c153079e2
commit
ea163d0948
@ -142,8 +142,13 @@ class AlbertTokenizer(PreTrainedTokenizer):
|
||||
sp_model_kwargs: Optional[Dict[str, Any]] = None,
|
||||
**kwargs
|
||||
) -> None:
|
||||
# The mask token behaves like a normal word, i.e. it includes the space before it
|
||||
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
|
||||
# The mask token behaves like a normal word, i.e. it includes the space before it. Because it
|
||||
# appears in the raw text, it must be matched against the non-normalized sentence.
|
||||
mask_token = (
|
||||
AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False)
|
||||
if isinstance(mask_token, str)
|
||||
else mask_token
|
||||
)
|
||||
|
||||
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
|
||||
|
||||
|
@ -135,8 +135,13 @@ class AlbertTokenizerFast(PreTrainedTokenizerFast):
|
||||
mask_token="[MASK]",
|
||||
**kwargs
|
||||
):
|
||||
# The mask token behaves like a normal word, i.e. it includes the space before it
|
||||
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
|
||||
# The mask token behaves like a normal word, i.e. it includes the space before it. Because it
|
||||
# appears in the raw text, it must be matched against the non-normalized sentence.
|
||||
mask_token = (
|
||||
AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False)
|
||||
if isinstance(mask_token, str)
|
||||
else mask_token
|
||||
)
|
||||
|
||||
super().__init__(
|
||||
vocab_file,
|
||||
|
Loading…
Reference in New Issue
Block a user