mirror of
https://github.com/huggingface/transformers.git
synced 2025-07-31 02:02:21 +06:00
chore: remove repetitive words (#30174)
Signed-off-by: hugehope <cmm7@sina.cn>
This commit is contained in:
parent
e50be9a058
commit
58b170cdb1
@ -608,7 +608,7 @@ class CanineAttention(nn.Module):
|
||||
chunk_end = min(from_seq_length, chunk_start + self.attend_from_chunk_width)
|
||||
from_chunks.append((chunk_start, chunk_end))
|
||||
|
||||
# Determine the chunks (windows) that will will attend *to*.
|
||||
# Determine the chunks (windows) that will attend *to*.
|
||||
to_chunks = []
|
||||
if self.first_position_attends_to_all:
|
||||
to_chunks.append((0, to_seq_length))
|
||||
|
@ -67,7 +67,7 @@ class MambaConfig(PretrainedConfig):
|
||||
residual_in_fp32 (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not residuals should be in `float32`. If set to `False` residuals will keep the same `dtype` as the rest of the model
|
||||
time_step_rank (`Union[int,str]`, *optional*, defaults to `"auto"`):
|
||||
Rank of the the discretization projection matrix. `"auto"` means that it will default to `math.ceil(self.hidden_size / 16)`
|
||||
Rank of the discretization projection matrix. `"auto"` means that it will default to `math.ceil(self.hidden_size / 16)`
|
||||
time_step_scale (`float`, *optional*, defaults to 1.0):
|
||||
Scale used to scale `dt_proj.bias`.
|
||||
time_step_min (`float`, *optional*, defaults to 0.001):
|
||||
|
@ -41,7 +41,7 @@ class RwkvConfig(PretrainedConfig):
|
||||
Vocabulary size of the RWKV model. Defines the number of different tokens that can be represented by the
|
||||
`inputs_ids` passed when calling [`RwkvModel`].
|
||||
context_length (`int`, *optional*, defaults to 1024):
|
||||
The maximum sequence length that this model can be be used with in a single forward (using it in RNN mode
|
||||
The maximum sequence length that this model can be used with in a single forward (using it in RNN mode
|
||||
lets use any sequence length).
|
||||
hidden_size (`int`, *optional*, defaults to 4096):
|
||||
Dimensionality of the embeddings and hidden states.
|
||||
|
@ -273,7 +273,7 @@ def get_polynomial_decay_schedule_with_warmup(
|
||||
|
||||
lr_init = optimizer.defaults["lr"]
|
||||
if not (lr_init > lr_end):
|
||||
raise ValueError(f"lr_end ({lr_end}) must be be smaller than initial lr ({lr_init})")
|
||||
raise ValueError(f"lr_end ({lr_end}) must be smaller than initial lr ({lr_init})")
|
||||
|
||||
lr_lambda = partial(
|
||||
_get_polynomial_decay_schedule_with_warmup_lr_lambda,
|
||||
|
Loading…
Reference in New Issue
Block a user