Fix mask token handling (#14364)

* Fix mask token handling

* Revert "Fix mask token handling"

This reverts commit daaa3f5291.

* Fix FNet mask token tokenization
This commit is contained in:
Li-Huai (Allan) Lin 2021-12-02 03:16:52 +08:00 committed by GitHub
parent 4df7d05a87
commit 934e2799da
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 14 additions and 4 deletions

View File

@ -113,8 +113,13 @@ class FNetTokenizer(PreTrainedTokenizer):
sp_model_kwargs: Optional[Dict[str, Any]] = None,
**kwargs
) -> None:
# Mask token behave like a normal word, i.e. include the space before it
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
# Mask token behave like a normal word, i.e. include the space before it and
# is included in the raw text, there should be a match in a non-normalized sentence.
mask_token = (
AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False)
if isinstance(mask_token, str)
else mask_token
)
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

View File

@ -107,8 +107,13 @@ class FNetTokenizerFast(PreTrainedTokenizerFast):
mask_token="[MASK]",
**kwargs
):
# Mask token behave like a normal word, i.e. include the space before it
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
# Mask token behave like a normal word, i.e. include the space before it and
# is included in the raw text, there should be a match in a non-normalized sentence.
mask_token = (
AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False)
if isinstance(mask_token, str)
else mask_token
)
super().__init__(
vocab_file,