Mirror of https://github.com/huggingface/transformers.git, synced 2025-07-31 02:02:21 +06:00
Fix some typos. (#17560)
* Fix some typos. Signed-off-by: Yulv-git <yulvchi@qq.com>
* Fix typo. Signed-off-by: Yulv-git <yulvchi@qq.com>
* make fixup.
This commit is contained in:
parent ad28ca291b
commit 95113d1365
@@ -289,7 +289,7 @@ from huggingface_hub import notebook_login
 notebook_login()
 ```
 
-You can then push to to your own namespace (or an organization you are a member of) like this:
+You can then push to your own namespace (or an organization you are a member of) like this:
 
 ```py
 resnet50d.push_to_hub("custom-resnet50d")
@@ -37,7 +37,7 @@ predicted token ids.
 
 The feature extractor depends on `torchaudio` and the tokenizer depends on `sentencepiece` so be sure to
 install those packages before running the examples. You could either install those as extra speech dependencies with
-`pip install transformers"[speech, sentencepiece]"` or install the packages seperately with `pip install torchaudio sentencepiece`. Also `torchaudio` requires the development version of the [libsndfile](http://www.mega-nerd.com/libsndfile/) package which can be installed via a system package manager. On Ubuntu it can
+`pip install transformers"[speech, sentencepiece]"` or install the packages separately with `pip install torchaudio sentencepiece`. Also `torchaudio` requires the development version of the [libsndfile](http://www.mega-nerd.com/libsndfile/) package which can be installed via a system package manager. On Ubuntu it can
 be installed as follows: `apt install libsndfile1-dev`
 
 
@@ -1226,7 +1226,7 @@ This whole process would have been much easier if we only could set something li
 experimental step, and let it fail without impacting the overall status of PRs. But as mentioned earlier CircleCI and
 Github Actions don't support it at the moment.
 
-You can vote for this feature and see where it is at at these CI-specific threads:
+You can vote for this feature and see where it is at these CI-specific threads:
 
 - [Github Actions:](https://github.com/actions/toolkit/issues/399)
 - [CircleCI:](https://ideas.circleci.com/ideas/CCI-I-344)
@@ -140,7 +140,7 @@ class TokenClassificationTask:
 # it easier for the model to learn the concept of sequences.
 #
 # For classification tasks, the first vector (corresponding to [CLS]) is
-# used as as the "sentence vector". Note that this only makes sense because
+# used as the "sentence vector". Note that this only makes sense because
 # the entire model is fine-tuned.
 tokens += [sep_token]
 label_ids += [pad_token_label_id]
@@ -43,7 +43,7 @@ A good metric to observe during training is the gradient norm which should ideal
 
 When training a model on large datasets it is recommended to run the data preprocessing
 in a first run in a **non-distributed** mode via `--preprocessing_only` so that
-when running the model in **distributed** mode in a second step the preprocessed data
+when running the model in **distributed** mode in a second step the preprocessed data
 can easily be loaded on each distributed device.
 
 ---
@@ -91,7 +91,7 @@ python scripts/initialize_model.py \
 --model_name codeparrot \
 --push_to_hub True
 ```
-This will initialize a new model with the architecture and configuration of `gpt2-large` and use the tokenizer to appropriately size the input embeddings. Finally, the initilaized model is pushed the the hub.
+This will initialize a new model with the architecture and configuration of `gpt2-large` and use the tokenizer to appropriately size the input embeddings. Finally, the initilaized model is pushed the hub.
 
 We can either pass the name of a text dataset or a pretokenized dataset which speeds up training a bit.
 Now that the tokenizer and model are also ready we can start training the model. The main training script is built with `accelerate` to scale across a wide range of platforms and infrastructure scales. We train two models with [110M](https://huggingface.co/lvwerra/codeparrot-small/) and [1.5B](https://huggingface.co/lvwerra/codeparrot/) parameters for 25-30B tokens on a 16xA100 (40GB) machine which takes 1 day and 1 week, respectively.
@@ -43,7 +43,7 @@ if __name__ == "__main__":
 with open(args.data_file, "rb") as fp:
 data = pickle.load(fp)
 
-logger.info("Counting occurences for MLM.")
+logger.info("Counting occurrences for MLM.")
 counter = Counter()
 for tk_ids in data:
 counter.update(tk_ids)
@@ -49,7 +49,7 @@ At the end of the community week, each team should submit a demo of their projec
 
 - **23.06.** Official announcement of the community week. Make sure to sign-up in [this google form](https://forms.gle/tVGPhjKXyEsSgUcs8).
 - **23.06. - 30.06.** Participants will be added to an internal Slack channel. Project ideas can be proposed here and groups of 3-5 are formed. Read this document for more information.
-- **30.06.** Release of all relevant training scripts in JAX/Flax as well as other documents on how to set up a TPU, how to use the training scripts, how to submit a demo, tips & tricks for JAX/Flax, tips & tricks for efficient use of the hub.
+- **30.06.** Release of all relevant training scripts in JAX/Flax as well as other documents on how to set up a TPU, how to use the training scripts, how to submit a demo, tips & tricks for JAX/Flax, tips & tricks for efficient use of the hub.
 - **30.06. - 2.07.** Talks about JAX/Flax, TPU, Transformers, Computer Vision & NLP will be held.
 - **7.07.** Start of the community week! Access to TPUv3-8 will be given to each team.
 - **7.07. - 14.07.** The Hugging Face & JAX/Flax & Cloud team will be available for any questions, problems the teams might run into.
@@ -106,7 +106,7 @@ def main():
 return start_logits, end_logits, jnp.argmax(pooled_logits, axis=-1)
 
 def evaluate(example):
-# encode question and context so that they are seperated by a tokenizer.sep_token and cut at max_length
+# encode question and context so that they are separated by a tokenizer.sep_token and cut at max_length
 inputs = tokenizer(
 example["question"],
 example["context"],
@@ -22,7 +22,7 @@ the JAX/Flax backend and the [`pjit`](https://jax.readthedocs.io/en/latest/jax.e
 > Note: The example is experimental and might have bugs. Also currently it only supports single V3-8.
 
 The `partition.py` file defines the `PyTree` of `ParitionSpec` for the GPTNeo model which describes how the model will be sharded.
-The actual sharding is auto-matically handled by `pjit`. The weights are sharded accross all local devices.
+The actual sharding is auto-matically handled by `pjit`. The weights are sharded across all local devices.
 To adapt the script for other models, we need to also change the `ParitionSpec` accordingly.
 
 TODO: Add more explantion.
@@ -78,7 +78,7 @@ class FlaxBeamSearchOutput(ModelOutput):
 sequences (`jnp.ndarray` of shape `(batch_size, max_length)`):
 The generated sequences.
 scores (`jnp.ndarray` of shape `(batch_size,)`):
-The scores (log probabilites) of the generated sequences.
+The scores (log probabilities) of the generated sequences.
 """
 
 sequences: jnp.ndarray = None
@@ -277,7 +277,7 @@ class PushToHubCallback(Callback):
 for instance `"user_name/model"`, which allows you to push to an organization you are a member of with
 `"organization_name/model"`.
 
-Will default to to the name of `output_dir`.
+Will default to the name of `output_dir`.
 hub_token (`str`, *optional*):
 The token to use to push the model to the Hub. Will default to the token in the cache folder obtained with
 `huggingface-cli login`.
@@ -1267,7 +1267,7 @@ class TFBartForConditionalGeneration(TFBartPretrainedModel, TFCausalLanguageMode
 super().__init__(config, *inputs, **kwargs)
 self.model = TFBartMainLayer(config, load_weight_prefix=load_weight_prefix, name="model")
 self.use_cache = config.use_cache
-# final_bias_logits is registered as a buffer in pytorch, so not trainable for the the sake of consistency.
+# final_bias_logits is registered as a buffer in pytorch, so not trainable for the sake of consistency.
 self.final_logits_bias = self.add_weight(
 name="final_logits_bias", shape=[1, config.vocab_size], initializer="zeros", trainable=False
 )
@@ -1253,7 +1253,7 @@ class TFBlenderbotForConditionalGeneration(TFBlenderbotPreTrainedModel, TFCausal
 super().__init__(config, *inputs, **kwargs)
 self.model = TFBlenderbotMainLayer(config, name="model")
 self.use_cache = config.use_cache
-# final_bias_logits is registered as a buffer in pytorch, so not trainable for the the sake of consistency.
+# final_bias_logits is registered as a buffer in pytorch, so not trainable for the sake of consistency.
 self.final_logits_bias = self.add_weight(
 name="final_logits_bias", shape=[1, config.vocab_size], initializer="zeros", trainable=False
 )
@@ -1240,7 +1240,7 @@ class TFBlenderbotSmallForConditionalGeneration(TFBlenderbotSmallPreTrainedModel
 super().__init__(config, *inputs, **kwargs)
 self.model = TFBlenderbotSmallMainLayer(config, name="model")
 self.use_cache = config.use_cache
-# final_bias_logits is registered as a buffer in pytorch, so not trainable for the the sake of consistency.
+# final_bias_logits is registered as a buffer in pytorch, so not trainable for the sake of consistency.
 self.final_logits_bias = self.add_weight(
 name="final_logits_bias", shape=[1, config.vocab_size], initializer="zeros", trainable=False
 )
@@ -184,7 +184,7 @@ def _compute_mask_indices(
 )
 spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
 
-# add offset to the starting indexes so that that indexes now create a span
+# add offset to the starting indexes so that indexes now create a span
 offsets = np.arange(mask_length)[None, None, :]
 offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
 batch_size, max_num_masked_span * mask_length
@@ -2054,7 +2054,7 @@ class DetrLoss(nn.Module):
 # Retrieve the matching between the outputs of the last layer and the targets
 indices = self.matcher(outputs_without_aux, targets)
 
-# Compute the average number of target boxes accross all nodes, for normalization purposes
+# Compute the average number of target boxes across all nodes, for normalization purposes
 num_boxes = sum(len(t["class_labels"]) for t in targets)
 num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device)
 # (Niels): comment out function below, distributed training to be added
@@ -212,7 +212,7 @@ class TFElectraSelfOutput(tf.keras.layers.Layer):
 return hidden_states
 
 
-# Copied from from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->Electra
+# Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->Electra
 class TFElectraAttention(tf.keras.layers.Layer):
 def __init__(self, config: ElectraConfig, **kwargs):
 super().__init__(**kwargs)
@@ -83,7 +83,7 @@ class FlavaImageConfig(PretrainedConfig):
 >>> # Initializing a FlavaImageModel with style configuration
 >>> configuration = FlavaImageConfig()
 
->>> # Initializing a FlavaImageModel model from the style configuration
+>>> # Initializing a FlavaImageModel model from the style configuration
 >>> model = FlavaImageModel(configuration)
 
 >>> # Accessing the model configuration
@@ -212,7 +212,7 @@ class FlavaTextConfig(PretrainedConfig):
 >>> # Initializing a FlavaTextModel with style configuration
 >>> configuration = FlavaTextConfig()
 
->>> # Initializing a FlavaTextConfig from the style configuration
+>>> # Initializing a FlavaTextConfig from the style configuration
 >>> model = FlavaTextModel(configuration)
 
 >>> # Accessing the model configuration
@@ -321,7 +321,7 @@ class FlavaMultimodalConfig(PretrainedConfig):
 >>> # Initializing a FlavaMultimodalModel with style configuration
 >>> configuration = FlavaMultimodalConfig()
 
->>> # Initializing a FlavaMultimodalModel model from the style configuration
+>>> # Initializing a FlavaMultimodalModel model from the style configuration
 >>> model = FlavaMultimodalModel(configuration)
 
 >>> # Accessing the model configuration
@@ -82,10 +82,10 @@ class HubertConfig(PretrainedConfig):
 feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers.
 conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
 A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length
-of *conv_stride* defines the number of convolutional layers and has to match the the length of *conv_dim*.
+of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
 conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`):
 A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The
-length of *conv_kernel* defines the number of convolutional layers and has to match the the length of
+length of *conv_kernel* defines the number of convolutional layers and has to match the length of
 *conv_dim*.
 conv_bias (`bool`, *optional*, defaults to `False`):
 Whether the 1D convolutional layers have a bias.
@@ -174,7 +174,7 @@ def _compute_mask_indices(
 )
 spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
 
-# add offset to the starting indexes so that that indexes now create a span
+# add offset to the starting indexes so that indexes now create a span
 offsets = np.arange(mask_length)[None, None, :]
 offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
 batch_size, max_num_masked_span * mask_length
@@ -203,7 +203,7 @@ def _compute_mask_indices(
 Computes random mask spans for a given shape
 
 Args:
-shape: the the shape for which to compute masks.
+shape: the shape for which to compute masks.
 should be of size 2 where first element is batch size and 2nd is timesteps
 attention_mask: optional padding mask of the same size as shape, which will prevent masking padded elements
 mask_prob:
@@ -2330,7 +2330,7 @@ class TFLEDForConditionalGeneration(TFLEDPreTrainedModel):
 super().__init__(config, *inputs, **kwargs)
 self.led = TFLEDMainLayer(config, name="led")
 self.use_cache = config.use_cache
-# final_bias_logits is registered as a buffer in pytorch, so not trainable for the the sake of consistency.
+# final_bias_logits is registered as a buffer in pytorch, so not trainable for the sake of consistency.
 self.final_logits_bias = self.add_weight(
 name="final_logits_bias", shape=[1, config.vocab_size], initializer="zeros", trainable=False
 )
@@ -1110,7 +1110,7 @@ class LxmertForPreTraining(LxmertPreTrainedModel):
 
 def get_qa_logit_layer(self) -> nn.Module:
 """
-Returns the the linear layer that produces question answering logits.
+Returns the linear layer that produces question answering logits.
 
 Returns:
 `nn.Module`: A torch module mapping the question answering prediction hidden states or `None` if LXMERT
@@ -1341,7 +1341,7 @@ class LxmertForQuestionAnswering(LxmertPreTrainedModel):
 
 def get_qa_logit_layer(self) -> nn.Module:
 """
-Returns the the linear layer that produces question answering logits
+Returns the linear layer that produces question answering logits
 
 Returns:
 `nn.Module`: A torch module mapping the question answering prediction hidden states. `None`: A NoneType
@@ -1283,7 +1283,7 @@ class TFMarianMTModel(TFMarianPreTrainedModel, TFCausalLanguageModelingLoss):
 super().__init__(config, *inputs, **kwargs)
 self.model = TFMarianMainLayer(config, name="model")
 self.use_cache = config.use_cache
-# final_bias_logits is registered as a buffer in pytorch, so not trainable for the the sake of consistency.
+# final_bias_logits is registered as a buffer in pytorch, so not trainable for the sake of consistency.
 self.final_logits_bias = self.add_weight(
 name="final_logits_bias", shape=[1, config.vocab_size], initializer="zeros", trainable=False
 )
@@ -1912,7 +1912,7 @@ class MaskFormerLoss(nn.Module):
 
 def get_num_masks(self, class_labels: torch.Tensor, device: torch.device) -> torch.Tensor:
 """
-Computes the average number of target masks accross the batch, for normalization purposes.
+Computes the average number of target masks across the batch, for normalization purposes.
 """
 num_masks = sum([len(classes) for classes in class_labels])
 num_masks_pt = torch.as_tensor([num_masks], dtype=torch.float, device=device)
@@ -1280,7 +1280,7 @@ class TFMBartForConditionalGeneration(TFMBartPreTrainedModel, TFCausalLanguageMo
 super().__init__(config, *inputs, **kwargs)
 self.model = TFMBartMainLayer(config, name="model")
 self.use_cache = config.use_cache
-# final_bias_logits is registered as a buffer in pytorch, so not trainable for the the sake of consistency.
+# final_bias_logits is registered as a buffer in pytorch, so not trainable for the sake of consistency.
 self.final_logits_bias = self.add_weight(
 name="final_logits_bias", shape=[1, config.vocab_size], initializer="zeros", trainable=False
 )
@@ -1292,7 +1292,7 @@ class TFPegasusForConditionalGeneration(TFPegasusPreTrainedModel, TFCausalLangua
 super().__init__(config, *inputs, **kwargs)
 self.model = TFPegasusMainLayer(config, name="model")
 self.use_cache = config.use_cache
-# final_bias_logits is registered as a buffer in pytorch, so not trainable for the the sake of consistency.
+# final_bias_logits is registered as a buffer in pytorch, so not trainable for the sake of consistency.
 self.final_logits_bias = self.add_weight(
 name="final_logits_bias", shape=[1, config.vocab_size], initializer="zeros", trainable=False
 )
@@ -28,7 +28,7 @@ RAG_CONFIG_DOC = r"""
 title_sep (`str`, *optional*, defaults to `" / "`):
 Separator inserted between the title and the text of the retrieved document when calling [`RagRetriever`].
 doc_sep (`str`, *optional*, defaults to `" // "`):
-Separator inserted between the the text of the retrieved document and the original input when calling
+Separator inserted between the text of the retrieved document and the original input when calling
 [`RagRetriever`].
 n_docs (`int`, *optional*, defaults to 5):
 Number of documents to retrieve.
@@ -81,10 +81,10 @@ class SEWConfig(PretrainedConfig):
 feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers.
 conv_stride (`Tuple[int]` or `List[int]`, *optional*, defaults to `(5, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1)`):
 A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length
-of *conv_stride* defines the number of convolutional layers and has to match the the length of *conv_dim*.
+of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
 conv_kernel (`Tuple[int]` or `List[int]`, *optional*, defaults to `(10, 3, 1, 3, 1, 3, 1, 3, 1, 2, 1, 2, 1)`):
 A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The
-length of *conv_kernel* defines the number of convolutional layers and has to match the the length of
+length of *conv_kernel* defines the number of convolutional layers and has to match the length of
 *conv_dim*.
 conv_bias (`bool`, *optional*, defaults to `False`):
 Whether the 1D convolutional layers have a bias.
@@ -174,7 +174,7 @@ def _compute_mask_indices(
 )
 spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
 
-# add offset to the starting indexes so that that indexes now create a span
+# add offset to the starting indexes so that indexes now create a span
 offsets = np.arange(mask_length)[None, None, :]
 offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
 batch_size, max_num_masked_span * mask_length
@@ -99,10 +99,10 @@ class SEWDConfig(PretrainedConfig):
 feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers.
 conv_stride (`Tuple[int]` or `List[int]`, *optional*, defaults to `(5, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1)`):
 A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length
-of *conv_stride* defines the number of convolutional layers and has to match the the length of *conv_dim*.
+of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
 conv_kernel (`Tuple[int]` or `List[int]`, *optional*, defaults to `(10, 3, 1, 3, 1, 3, 1, 3, 1, 2, 1, 2, 1)`):
 A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The
-length of *conv_kernel* defines the number of convolutional layers and has to match the the length of
+length of *conv_kernel* defines the number of convolutional layers and has to match the length of
 *conv_dim*.
 conv_bias (`bool`, *optional*, defaults to `False`):
 Whether the 1D convolutional layers have a bias.
@@ -175,7 +175,7 @@ def _compute_mask_indices(
 )
 spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
 
-# add offset to the starting indexes so that that indexes now create a span
+# add offset to the starting indexes so that indexes now create a span
 offsets = np.arange(mask_length)[None, None, :]
 offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
 batch_size, max_num_masked_span * mask_length
@@ -85,10 +85,10 @@ class UniSpeechConfig(PretrainedConfig):
 feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers.
 conv_stride (`Tuple[int]` or `List[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
 A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length
-of *conv_stride* defines the number of convolutional layers and has to match the the length of *conv_dim*.
+of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
 conv_kernel (`Tuple[int]` or `List[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`):
 A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The
-length of *conv_kernel* defines the number of convolutional layers and has to match the the length of
+length of *conv_kernel* defines the number of convolutional layers and has to match the length of
 *conv_dim*.
 conv_bias (`bool`, *optional*, defaults to `False`):
 Whether the 1D convolutional layers have a bias.
@@ -210,7 +210,7 @@ def _compute_mask_indices(
 )
 spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
 
-# add offset to the starting indexes so that that indexes now create a span
+# add offset to the starting indexes so that indexes now create a span
 offsets = np.arange(mask_length)[None, None, :]
 offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
 batch_size, max_num_masked_span * mask_length
@@ -86,10 +86,10 @@ class UniSpeechSatConfig(PretrainedConfig):
 feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers.
 conv_stride (`Tuple[int]` or `List[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
 A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length
-of *conv_stride* defines the number of convolutional layers and has to match the the length of *conv_dim*.
+of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
 conv_kernel (`Tuple[int]` or `List[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`):
 A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The
-length of *conv_kernel* defines the number of convolutional layers and has to match the the length of
+length of *conv_kernel* defines the number of convolutional layers and has to match the length of
 *conv_dim*.
 conv_bias (`bool`, *optional*, defaults to `False`):
 Whether the 1D convolutional layers have a bias.
@@ -224,7 +224,7 @@ def _compute_mask_indices(
 )
 spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
 
-# add offset to the starting indexes so that that indexes now create a span
+# add offset to the starting indexes so that indexes now create a span
 offsets = np.arange(mask_length)[None, None, :]
 offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
 batch_size, max_num_masked_span * mask_length
@@ -120,7 +120,7 @@ def _compute_mask_indices(
 CPU as part of the preprocessing during training.
 
 Args:
-shape: the the shape for which to compute masks.
+shape: the shape for which to compute masks.
 should be of size 2 where first element is batch size and 2nd is timesteps
 mask_prob:
 probability for each token to be chosen as start of the span to be masked. this will be multiplied by
@@ -244,7 +244,7 @@ def _compute_mask_indices(
 Computes random mask spans for a given shape
 
 Args:
-shape: the the shape for which to compute masks.
+shape: the shape for which to compute masks.
 should be of size 2 where first element is batch size and 2nd is timesteps
 attention_mask: optional padding mask of the same size as shape, which will prevent masking padded elements
 mask_prob:
@@ -234,7 +234,7 @@ def _compute_mask_indices(
 )
 spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
 
-# add offset to the starting indexes so that that indexes now create a span
+# add offset to the starting indexes so that indexes now create a span
 offsets = np.arange(mask_length)[None, None, :]
 offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
 batch_size, max_num_masked_span * mask_length
@@ -231,7 +231,7 @@ def _compute_mask_indices(
 )
 spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
 
-# add offset to the starting indexes so that that indexes now create a span
+# add offset to the starting indexes so that indexes now create a span
 offsets = np.arange(mask_length)[None, None, :]
 offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
 batch_size, max_num_masked_span * mask_length
@@ -82,10 +82,10 @@ class WavLMConfig(PretrainedConfig):
 feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers.
 conv_stride (`Tuple[int]` or `List[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
 A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length
-of *conv_stride* defines the number of convolutional layers and has to match the the length of *conv_dim*.
+of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
 conv_kernel (`Tuple[int]` or `List[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`):
 A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The
-length of *conv_kernel* defines the number of convolutional layers and has to match the the length of
+length of *conv_kernel* defines the number of convolutional layers and has to match the length of
 *conv_dim*.
 conv_bias (`bool`, *optional*, defaults to `False`):
 Whether the 1D convolutional layers have a bias.
@@ -183,7 +183,7 @@ def _compute_mask_indices(
 )
 spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
 
-# add offset to the starting indexes so that that indexes now create a span
+# add offset to the starting indexes so that indexes now create a span
 offsets = np.arange(mask_length)[None, None, :]
 offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
 batch_size, max_num_masked_span * mask_length
@@ -1069,7 +1069,7 @@ class YolosLoss(nn.Module):
 # Retrieve the matching between the outputs of the last layer and the targets
 indices = self.matcher(outputs_without_aux, targets)
 
-# Compute the average number of target boxes accross all nodes, for normalization purposes
+# Compute the average number of target boxes across all nodes, for normalization purposes
 num_boxes = sum(len(t["class_labels"]) for t in targets)
 num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device)
 # (Niels): comment out function below, distributed training to be added
@@ -487,7 +487,7 @@ class OnnxConfigWithPast(OnnxConfig, ABC):
 
 def fill_with_past_key_values_(self, inputs_or_outputs: Mapping[str, Mapping[int, str]], direction: str):
 """
-Fill the input_or_ouputs mapping with past_key_values dynamic axes considering.
+Fill the input_or_outputs mapping with past_key_values dynamic axes considering.
 
 Args:
 inputs_or_outputs: The mapping to fill.
@@ -412,8 +412,8 @@ class TrainingArguments:
 down the training and evaluation speed.
 push_to_hub (`bool`, *optional*, defaults to `False`):
 Whether or not to push the model to the Hub every time the model is saved. If this is activated,
-`output_dir` will begin a git directory synced with the the repo (determined by `hub_model_id`) and the
-content will be pushed each time a save is triggered (depending on your `save_strategy`). Calling
+`output_dir` will begin a git directory synced with the repo (determined by `hub_model_id`) and the content
+will be pushed each time a save is triggered (depending on your `save_strategy`). Calling
 [`~Trainer.save_model`] will also trigger a push.
 
 <Tip warning={true}>
@@ -434,7 +434,7 @@ class TrainingArguments:
 `"organization_name/model"`. Will default to `user_name/output_dir_name` with *output_dir_name* being the
 name of `output_dir`.
 
-Will default to to the name of `output_dir`.
+Will default to the name of `output_dir`.
 hub_strategy (`str` or [`~trainer_utils.HubStrategy`], *optional*, defaults to `"every_save"`):
 Defines the scope of what is pushed to the Hub and when. Possible values are:
 
@@ -990,7 +990,7 @@ tokenizer.
 For [camelcase name of model], the tokenizer files can be found here:
 - [To be filled out by mentor]
 
-and having implemented the 🤗Transformers' version of the tokenizer can be loaded as follows:
+and having implemented the 🤗 Transformers' version of the tokenizer can be loaded as follows:
 
 [To be filled out by mentor]
 
@@ -2821,7 +2821,7 @@ class TF{{cookiecutter.camelcase_modelname}}ForConditionalGeneration(TF{{cookiec
 self.model = TF{{cookiecutter.camelcase_modelname}}MainLayer(config, name="model")
 self.model._set_save_spec(inputs=self.serving.input_signature)
 self.use_cache = config.use_cache
-# final_bias_logits is registered as a buffer in pytorch, so not trainable for the the sake of consistency.
+# final_bias_logits is registered as a buffer in pytorch, so not trainable for the sake of consistency.
 self.final_logits_bias = self.add_weight(
 name="final_logits_bias", shape=[1, config.vocab_size], initializer="zeros", trainable=False
 )
@@ -2183,7 +2183,9 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
 sequence = tokenizer(seq_0, boxes=boxes_0, add_special_tokens=False)
 total_length = len(sequence["input_ids"])
 
-self.assertGreater(total_length, 4, "Issue with the testing sequence, please update it it's too short")
+self.assertGreater(
+total_length, 4, "Issue with the testing sequence, please update it, it's too short"
+)
 
 # Test with max model input length
 model_max_length = tokenizer.model_max_length
@@ -2193,7 +2195,9 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
 sequence1 = tokenizer(seq_1, boxes=boxes_1, add_special_tokens=False)
 total_length1 = len(sequence1["input_ids"])
 self.assertGreater(
-total_length1, model_max_length, "Issue with the testing sequence, please update it it's too short"
+total_length1,
+model_max_length,
+"Issue with the testing sequence, please update it, it's too short",
 )
 
 # Simple
@@ -2097,7 +2097,9 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
 sequence = tokenizer(seq_0, boxes=boxes_0, add_special_tokens=False)
 total_length = len(sequence["input_ids"])
 
-self.assertGreater(total_length, 4, "Issue with the testing sequence, please update it it's too short")
+self.assertGreater(
+total_length, 4, "Issue with the testing sequence, please update it, it's too short"
+)
 
 # Test with max model input length
 model_max_length = tokenizer.model_max_length
@@ -2107,7 +2109,9 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
 sequence1 = tokenizer(seq_1, boxes=boxes_1, add_special_tokens=False)
 total_length1 = len(sequence1["input_ids"])
 self.assertGreater(
-total_length1, model_max_length, "Issue with the testing sequence, please update it it's too short"
+total_length1,
+model_max_length,
+"Issue with the testing sequence, please update it, it's too short",
 )
 
 # Simple
@@ -281,7 +281,7 @@ class TFViTMAEModelTest(TFModelTesterMixin, unittest.TestCase):
 super().check_pt_tf_models(tf_model, pt_model, tf_inputs_dict)
 
 # overwrite from common since TFViTMAEForPretraining outputs loss along with
-# logits and mask indices. loss and mask indicies are not suitable for integration
+# logits and mask indices. loss and mask indices are not suitable for integration
 # with other keras modules.
 def test_compile_tf_model(self):
 config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
@@ -278,7 +278,7 @@ class TokenClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTest
 NER_MODEL = "dbmdz/bert-large-cased-finetuned-conll03-english"
 model = AutoModelForTokenClassification.from_pretrained(NER_MODEL)
 tokenizer = AutoTokenizer.from_pretrained(NER_MODEL, use_fast=True)
-sentence = """Enzo works at the the UN"""
+sentence = """Enzo works at the UN"""
 token_classifier = pipeline("ner", model=model, tokenizer=tokenizer)
 output = token_classifier(sentence)
 self.assertEqual(
@@ -990,7 +990,9 @@ class TokenizerTesterMixin:
 sequence = tokenizer.encode(seq_0, add_special_tokens=False)
 total_length = len(sequence)
 
-self.assertGreater(total_length, 4, "Issue with the testing sequence, please update it it's too short")
+self.assertGreater(
+total_length, 4, "Issue with the testing sequence, please update it, it's too short"
+)
 
 # Test with max model input length
 model_max_length = tokenizer.model_max_length
@@ -1000,7 +1002,9 @@ class TokenizerTesterMixin:
 sequence1 = tokenizer(seq_1, add_special_tokens=False)
 total_length1 = len(sequence1["input_ids"])
 self.assertGreater(
-total_length1, model_max_length, "Issue with the testing sequence, please update it it's too short"
+total_length1,
+model_max_length,
+"Issue with the testing sequence, please update it, it's too short",
 )
 
 # Simple
@@ -53,7 +53,7 @@ def _find_text_in_file(filename, start_prompt, end_prompt):
 return "".join(lines[start_index:end_index]), start_index, end_index, lines
 
 
-# Add here suffixes that are used to identify models, seperated by |
+# Add here suffixes that are used to identify models, separated by |
 ALLOWED_MODEL_SUFFIXES = "Model|Encoder|Decoder|ForConditionalGeneration"
 # Regexes that match TF/Flax/PT model names.
 _re_tf_models = re.compile(r"TF(.*)(?:Model|Encoder|Decoder|ForConditionalGeneration)")