mirror of
https://github.com/huggingface/transformers.git
synced 2025-07-31 02:02:21 +06:00
Update WhisperTokenizer Doc: Timestamps and Previous Tokens Behaviour (#33390)
* added doc explaining behaviour regarding token timestamps and previous tokens * copied changes to faster tokenizer --------- Co-authored-by: Bruno Hays <bruno.hays@illuin.tech>
This commit is contained in:
parent
6ed2b10942
commit
dfee4f2362
@ -673,13 +673,15 @@ class WhisperTokenizer(PreTrainedTokenizer):
|
||||
token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):
|
||||
List of tokenized input ids. Can be obtained using the `__call__` method.
|
||||
skip_special_tokens (`bool`, *optional*, defaults to `False`):
|
||||
Whether or not to remove special tokens in the decoding.
|
||||
Whether or not to remove special tokens in the decoding. Will remove the previous tokens (pre-prompt)
|
||||
if present.
|
||||
clean_up_tokenization_spaces (`bool`, *optional*):
|
||||
Whether or not to clean up the tokenization spaces. If `None`, will default to
|
||||
`self.clean_up_tokenization_spaces` (available in the `tokenizer_config`).
|
||||
output_offsets (`bool`, *optional*, defaults to `False`):
|
||||
Whether or not to output the offsets of the tokens. This should only be set if the model predicted
|
||||
timestamps.
|
||||
timestamps. If there are previous tokens (pre-prompt) to decode, they will only appear in the decoded
|
||||
text if they contain timestamp tokens.
|
||||
time_precision (`float`, *optional*, defaults to 0.02):
|
||||
The time ratio to convert from token to time.
|
||||
decode_with_timestamps (`bool`, *optional*, defaults to `False`):
|
||||
|
@ -319,13 +319,15 @@ class WhisperTokenizerFast(PreTrainedTokenizerFast):
|
||||
token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):
|
||||
List of tokenized input ids. Can be obtained using the `__call__` method.
|
||||
skip_special_tokens (`bool`, *optional*, defaults to `False`):
|
||||
Whether or not to remove special tokens in the decoding.
|
||||
Whether or not to remove special tokens in the decoding. Will remove the previous tokens (pre-prompt)
|
||||
if present.
|
||||
clean_up_tokenization_spaces (`bool`, *optional*):
|
||||
Whether or not to clean up the tokenization spaces. If `None`, will default to
|
||||
`self.clean_up_tokenization_spaces` (available in the `tokenizer_config`).
|
||||
output_offsets (`bool`, *optional*, defaults to `False`):
|
||||
Whether or not to output the offsets of the tokens. This should only be set if the model predicted
|
||||
timestamps.
|
||||
timestamps. If there are previous tokens (pre-prompt) to decode, they will only appear in the decoded
|
||||
text if they contain timestamp tokens.
|
||||
time_precision (`float`, *optional*, defaults to 0.02):
|
||||
The time ratio to convert from token to time.
|
||||
decode_with_timestamps (`bool`, *optional*, defaults to `False`):
|
||||
|
Loading…
Reference in New Issue
Block a user