Mirror of https://github.com/huggingface/transformers.git
Fix docstring for BertTokenizerFast (#6185)
- remove duplicate doc-entry for tokenize_chinese_chars
- add doc for strip_accents and wordpieces_prefix
This commit is contained in:
parent d8dbf3b75d
commit 82a0e2b67e
@@ -577,10 +577,6 @@ class BertTokenizerFast(PreTrainedTokenizerFast):
         mask_token (:obj:`string`, `optional`, defaults to "[MASK]"):
             The token used for masking values. This is the token used when training this model with masked language
             modeling. This is the token which the model will try to predict.
-        tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Whether to tokenize Chinese characters.
-            This should likely be deactivated for Japanese:
-            see: https://github.com/huggingface/transformers/issues/328
         clean_text (:obj:`bool`, `optional`, defaults to :obj:`True`):
             Whether to clean the text before tokenization by removing any control characters and
             replacing all whitespaces by the classic one.
@@ -588,6 +584,11 @@ class BertTokenizerFast(PreTrainedTokenizerFast):
             Whether to tokenize Chinese characters.
             This should likely be deactivated for Japanese:
             see: https://github.com/huggingface/transformers/issues/328
+        strip_accents: (:obj:`bool`, `optional`, defaults to :obj:`None`):
+            Whether to strip all accents. If this option is not specified (ie == None),
+            then it will be determined by the value for `lowercase` (as in the original Bert).
+        wordpieces_prefix: (:obj:`string`, `optional`, defaults to "##"):
+            The prefix for subwords.
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
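For context, a minimal usage sketch of the arguments touched by this change. It assumes a transformers release from around the time of this commit (v3.x), where BertTokenizerFast accepts these keyword arguments directly; the checkpoint name and the printed output are illustrative.

# Minimal sketch: exercising the arguments documented above.
# Assumes a transformers release from around this commit (v3.x),
# where BertTokenizerFast forwards these kwargs to its backing tokenizer.
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained(
    "bert-base-uncased",           # illustrative checkpoint
    tokenize_chinese_chars=True,   # set to False for Japanese (see issue #328)
    strip_accents=None,            # None: follow the lowercasing setting,
                                   # as in the original BERT
    wordpieces_prefix="##",        # prefix marking non-initial subword pieces
)

# Subword pieces other than the first carry the "##" prefix:
print(tokenizer.tokenize("tokenization"))  # e.g. ['token', '##ization']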