Fix typos in strings and comments (#37799)

co63oc, 2025-04-28 18:39:11 +08:00 (committed by GitHub)
parent f466603963
commit d5fa7d2d19
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
136 changed files with 202 additions and 202 deletions

View File

@ -1146,9 +1146,9 @@ def stft(frames: np.array, windowing_function: np.array, fft_window_size: Option
tutorial]https://download.ni.com/evaluation/pxi/Understanding%20FFTs%20and%20Windowing.pdf
fft_window_size (`int`, *optional*):
Size of the window om which the Fourier transform is applied. This controls the frequency resolution of the
spectrogram. 400 means that the fourrier transform is computed on windows of 400 samples. The number of
spectrogram. 400 means that the fourier transform is computed on windows of 400 samples. The number of
frequency bins (`nb_frequency_bins`) used to divide the window into equal strips is equal to
`(1+fft_window_size)//2`. An increase of the fft_window_size slows the calculus time proportionnally.
`(1+fft_window_size)//2`. An increase of the fft_window_size slows the calculus time proportionally.
Example:
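As a rough illustration of the relationship this docstring describes, here is a minimal numpy sketch (illustrative values only, not the library's `stft` implementation): it applies a windowing function to a single 400-sample frame and counts the frequency bins with the formula quoted above.

    import numpy as np

    fft_window_size = 400
    nb_frequency_bins = (1 + fft_window_size) // 2   # 200 equal frequency strips, as described
    frame = np.random.randn(fft_window_size)         # one 400-sample frame of audio
    windowed = frame * np.hanning(fft_window_size)   # reduce discontinuities at the frame edges
    spectrum = np.abs(np.fft.rfft(windowed))         # magnitude of each frequency bin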

View File

@ -850,7 +850,7 @@ class ConstrainedBeamSearchScorer(BeamScorer):
beam_hyp.add(final_tokens, final_score, beam_indices=beam_index, generated_len=generated_len)
ids_collect.append(beam_id)
# due to overly complex constraints or other factors, sometimes we can't gaurantee a successful
# due to overly complex constraints or other factors, sometimes we can't guarantee a successful
# generation. In these cases we simply return the highest scoring outputs.
if len(ids_collect) < self.num_beam_hyps_to_keep:
for beam_id in range(self.num_beams):

View File

@ -192,7 +192,7 @@ class GenerationConfig(PushToHubMixin):
our [cache documentation](https://huggingface.co/docs/transformers/en/kv_cache) for further information.
cache_config (`CacheConfig` or `dict`, *optional*, default to `None`):
Arguments used in the key-value cache class can be passed in `cache_config`. Can be passed as a `Dict` and
it will be converted to its repsective `CacheConfig` internally.
it will be converted to its respective `CacheConfig` internally.
Otherwise can be passed as a `CacheConfig` class matching the indicated `cache_implementation`.
return_legacy_cache (`bool`, *optional*, default to `True`):
Whether to return the legacy or new format of the cache when `DynamicCache` is used by default.
@ -235,7 +235,7 @@ class GenerationConfig(PushToHubMixin):
The parameter for repetition penalty. 1.0 means no penalty. See [this
paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
encoder_repetition_penalty (`float`, *optional*, defaults to 1.0):
The paramater for encoder_repetition_penalty. An exponential penalty on sequences that are not in the
The parameter for encoder_repetition_penalty. An exponential penalty on sequences that are not in the
original input. 1.0 means no penalty.
length_penalty (`float`, *optional*, defaults to 1.0):
Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to
@ -385,7 +385,7 @@ class GenerationConfig(PushToHubMixin):
inference.
disable_compile (`bool`, *optional*):
Whether to disable the automatic compilation of the forward pass. Automatic compilation happens when
specific criteria are met, including using a compileable cache. Please open an issue if you find the
specific criteria are met, including using a compilable cache. Please open an issue if you find the
need to use this flag.
> Wild card
@ -710,7 +710,7 @@ class GenerationConfig(PushToHubMixin):
UserWarning,
)
# 3. detect incorrect paramaterization specific to advanced beam modes
# 3. detect incorrect parameterization specific to advanced beam modes
else:
# constrained beam search
if self.constraints is not None or self.force_words_ids is not None:

View File

@ -271,7 +271,7 @@ class FlaxMinLengthLogitsProcessor(FlaxLogitsProcessor):
class FlaxSuppressTokensAtBeginLogitsProcessor(FlaxLogitsProcessor):
r"""
[`FlaxLogitsProcessor`] supressing a list of tokens as soon as the `generate` function starts generating using
[`FlaxLogitsProcessor`] suppressing a list of tokens as soon as the `generate` function starts generating using
`begin_index` tokens. This should ensure that the tokens defined by `begin_suppress_tokens` are not sampled at the
beginning of the generation.

View File

@ -543,7 +543,7 @@ class TopKLogitsWarper(LogitsProcessor):
class MinPLogitsWarper(LogitsProcessor):
"""
[`LogitsProcessor`] that performs min-p, i.e. keeps all tokens that are above a minimum probability, scaled by the
probability of the most likely token. As a result, the filter becomes more agressive in the presence of
probability of the most likely token. As a result, the filter becomes more aggressive in the presence of
high-probability tokens, which is a sign of a confident output that we shouldn't deviate from.
Often used together with [`TemperatureLogitsWarper`]. Used as an alternative to [`TopPLogitsWarper`] and
@ -738,7 +738,7 @@ class EpsilonLogitsWarper(LogitsProcessor):
>>> # With epsilon sampling, the output gets restricted to high-probability tokens. Note that this is similar to
>>> # Top P sampling, which restricts tokens based on their cumulative probability.
>>> # Pro tip: The paper recomends using `epsilon_cutoff` values between 3e-4 and 9e-4
>>> # Pro tip: The paper recommends using `epsilon_cutoff` values between 3e-4 and 9e-4
>>> outputs = model.generate(**inputs, do_sample=True, epsilon_cutoff=0.1)
>>> print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
A sequence: 1, 2, 3, 4, 5, 6, 7, 8, 9
@ -819,7 +819,7 @@ class EtaLogitsWarper(LogitsProcessor):
>>> # With eta sampling, the output gets restricted to high-probability tokens. You can see it as a dynamic form of
>>> # epsilon sampling that adapts its cutoff probability based on the entropy (high entropy = lower cutoff).
>>> # Pro tip: The paper recomends using `eta_cutoff` values between 3e-4 to 4e-3
>>> # Pro tip: The paper recommends using `eta_cutoff` values between 3e-4 to 4e-3
>>> outputs = model.generate(**inputs, do_sample=True, eta_cutoff=0.1)
>>> print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
A sequence: 1, 2, 3, 4, 5, 6, 7, 8, 9
@ -1348,7 +1348,7 @@ class PrefixConstrainedLogitsProcessor(LogitsProcessor):
>>> print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
Alice and Bob are friends
>>> # We can contrain it with `prefix_allowed_tokens_fn` to force a certain behavior based on a prefix.
>>> # We can constrain it with `prefix_allowed_tokens_fn` to force a certain behavior based on a prefix.
>>> # For instance, we can force an entire entity to be generated when its beginning is detected.
>>> entity = tokenizer(" Bob Marley", return_tensors="pt").input_ids[0] # 3 tokens
>>> def prefix_allowed_tokens_fn(batch_id, input_ids):
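The excerpt stops where the constraint function begins; a plausible completion of such a function is sketched below (the names `entity` and `tokenizer` are taken from the surrounding example, and the exact body in the library docstring may differ). Once the first token of " Bob Marley" appears, only the next token of the entity is allowed; otherwise every token is allowed.

    def prefix_allowed_tokens_fn(batch_id, input_ids):
        # force the remaining entity tokens once its first token has been generated
        if input_ids[-1] == entity[0]:
            return [entity[1].item()]
        if len(input_ids) >= 2 and input_ids[-2] == entity[0] and input_ids[-1] == entity[1]:
            return [entity[2].item()]
        return list(range(tokenizer.vocab_size))  # no match: allow the full vocabulary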
@ -1791,7 +1791,7 @@ class LogitNormalization(LogitsProcessor):
class SuppressTokensAtBeginLogitsProcessor(LogitsProcessor):
r"""
[`SuppressTokensAtBeginLogitsProcessor`] supresses a list of tokens as soon as the `generate` function starts
[`SuppressTokensAtBeginLogitsProcessor`] suppresses a list of tokens as soon as the `generate` function starts
generating using `begin_index` tokens. This should ensure that the tokens defined by `begin_suppress_tokens` are
not generated at the beginning. Originally created for
[Whisper](https://huggingface.co/docs/transformers/model_doc/whisper).
@ -2642,7 +2642,7 @@ class SynthIDTextWatermarkLogitsProcessor(LogitsProcessor):
We assume that the scores are in the log space.
Args:
scores (`torch.FloatTensor`): Scores (batch_size, vocab_size).
g_values (`torch.FloatTensor`): G valus (batch_size, vocab_size, depth).
g_values (`torch.FloatTensor`): G values (batch_size, vocab_size, depth).
Returns:
Updated scores (batch_size, vocab_size).
@ -2668,7 +2668,7 @@ class SynthIDTextWatermarkLogitsProcessor(LogitsProcessor):
if self.debug_mode:
scores = torch.ones_like(scores)
# Currently indices is just a arange to compute watermarking on the desnse logits.
# Currently indices is just a arange to compute watermarking on the dense logits.
all_indices = torch.stack([torch.arange(vocab_size, device=self.device) for _ in range(batch_size)])
if self.state is None:

View File

@ -343,7 +343,7 @@ class TFNoBadWordsLogitsProcessor(TFLogitsProcessor):
)
def _match_found():
# Finaly, runs the actual comparison. Can only be called if the previous comparisons do not yield
# Finally, runs the actual comparison. Can only be called if the previous comparisons do not yield
# an answer (otherwise we get indexing exceptions)
compare_len = self.bad_word_seqs_len[bad_word_seq_number] - 1
return tf.cond(

View File

@ -962,7 +962,7 @@ class TFGenerationMixin:
raise ValueError(
"Beam search decoding cannot return more sequences than it has beams. Please set num_beams >="
f" num_return_sequences, got {generation_config.num_beams} and"
f" {generation_config.num_return_sequences} (respectivelly)"
f" {generation_config.num_return_sequences} (respectively)"
)
# 11. broadcast inputs to the desired number of beams
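For context, the check above enforces that beam search cannot return more sequences than it has beams; with illustrative values (a loaded `model` and tokenized `inputs` are assumed):

    outputs = model.generate(**inputs, num_beams=4, num_return_sequences=2)  # valid: 2 <= 4
    # num_return_sequences=8 with num_beams=4 would trigger the ValueError shown above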
@ -994,7 +994,7 @@ class TFGenerationMixin:
raise ValueError(
"Beam search decoding cannot return more sequences than it has beams. Please set num_beams >="
f" num_return_sequences, got {generation_config.num_beams} and"
f" {generation_config.num_return_sequences} (respectivelly)"
f" {generation_config.num_return_sequences} (respectively)"
)
# 11. prepare logits warper
@ -1626,7 +1626,7 @@ class TFGenerationMixin:
)
use_cache = model_kwargs.pop("use_cache", self.generation_config.use_cache)
use_xla = not tf.executing_eagerly()
# TODO (Joao): fix cache format or find programatic way to detect cache index
# TODO (Joao): fix cache format or find programmatic way to detect cache index
# GPT2 and other models has a slightly different cache structure, with a different batch axis
model_name = str(self.decoder) if "EncoderDecoder" in str(self) else str(self)
cache_batch_axis = 1 if any(model_prefix in model_name for model_prefix in ("TFGPT2", "TFCTRL")) else 0
@ -1910,7 +1910,7 @@ class TFGenerationMixin:
)
use_cache = model_kwargs.pop("use_cache", self.generation_config.use_cache)
use_xla = not tf.executing_eagerly()
# TODO (Joao): fix cache format or find programatic way to detect cache index
# TODO (Joao): fix cache format or find programmatic way to detect cache index
# GPT2 and other models has a slightly different cache structure, with a different batch axis
model_name = str(self.decoder) if "EncoderDecoder" in str(self) else str(self)
cache_batch_axis = 1 if any(model_prefix in model_name for model_prefix in ("TFGPT2", "TFCTRL")) else 0
@ -2253,7 +2253,7 @@ class TFGenerationMixin:
use_cache = model_kwargs.pop("use_cache", self.generation_config.use_cache)
use_xla = not tf.executing_eagerly()
# TODO (Joao): fix cache format or find programatic way to detect cache index
# TODO (Joao): fix cache format or find programmatic way to detect cache index
# GPT2 and other models has a slightly different cache structure, with a different batch axis
model_name = str(self.decoder) if "EncoderDecoder" in str(self) else str(self)
cache_batch_axis = 1 if any(model_prefix in model_name for model_prefix in ("TFGPT2", "TFCTRL")) else 0
@ -2788,7 +2788,7 @@ class TFGenerationMixin:
model_kwargs.pop("use_cache", None)
use_xla = not tf.executing_eagerly()
# TODO (Joao): fix cache format or find programatic way to detect cache index
# TODO (Joao): fix cache format or find programmatic way to detect cache index
# GPT2 and other models has a slightly different cache structure, with a different batch axis
model_name = str(self.decoder) if "EncoderDecoder" in str(self) else str(self)
cache_batch_axis = 1 if any(model_prefix in model_name for model_prefix in ("TFGPT2", "TFCTRL")) else 0

View File

@ -362,7 +362,7 @@ class GenerationMixin:
inherit from `GenerationMixin` to benefit from all generation-related automation in our codebase;
- `BarkModel` has a custom `generate` method and one of its inner models calls `GenerationMixin.generate`.
However, its `generate` does not share the same interface as `GenerationMixin.generate`. In this case,
`BarkModel` shoud NOT inherit from `GenerationMixin`, as it breaks the `generate` interface.
`BarkModel` should NOT inherit from `GenerationMixin`, as it breaks the `generate` interface.
The class exposes [`~generation.GenerationMixin.generate`], which can be used for:
- *greedy decoding* if `num_beams=1` and `do_sample=False`
@ -392,7 +392,7 @@ class GenerationMixin:
- Exception 1: when passing input_embeds, input_ids may be missing entries
- Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
- Exception 3: with synced GPUs cache_position may go out of bounds, but we only want dummy token in that case.
- Excpetion 4: If input_embeds are passed then slice it through `cache_position`, to keep only the unprocessed tokens and
- Exception 4: If input_embeds are passed then slice it through `cache_position`, to keep only the unprocessed tokens and
generate the first token for each sequence. Later use the generated Input ids for continuation.
The current implementation does not rely on ``self`` and could be
@ -967,7 +967,7 @@ class GenerationMixin:
assistant_model=assistant_model,
assistant_prune_lm_head=True, # prune LM head of assistant model
)
# Since we prune the LM head, we cannot use the repetition penalty on the assistant model due to mismaches between token ids and logits index
# Since we prune the LM head, we cannot use the repetition penalty on the assistant model due to mismatches between token ids and logits index
assistant_model.generation_config.repetition_penalty = None
candidate_generator = UniversalSpeculativeDecodingGenerator(
input_ids=input_ids,

View File

@ -171,7 +171,7 @@ def find_tied_parameters(model: "nn.Module", **kwargs):
```
"""
# get ALL model parameters and thier names
# get ALL model parameters and their names
all_named_parameters = dict(model.named_parameters(remove_duplicate=False))
# get ONLY unique named parameters,
@ -187,7 +187,7 @@ def find_tied_parameters(model: "nn.Module", **kwargs):
for tied_param_name in tied_param_names:
tied_param = all_named_parameters[tied_param_name]
for param_name, param in no_duplicate_named_parameters.items():
# compare if parameters are the same, if so, group thier names together
# compare if parameters are the same, if so, group their names together
if param is tied_param:
if param_name not in tied_param_groups:
tied_param_groups[param_name] = []
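A small self-contained sketch of the identity-based grouping these comments describe, using a toy module with one tied weight (this is not the library function itself):

    import torch.nn as nn

    class Toy(nn.Module):
        def __init__(self):
            super().__init__()
            self.emb = nn.Embedding(10, 4)
            self.head = nn.Linear(4, 10, bias=False)
            self.head.weight = self.emb.weight  # tie the two parameters

    toy = Toy()
    all_named = dict(toy.named_parameters(remove_duplicate=False))   # every name, duplicates kept
    unique = dict(toy.named_parameters())                            # duplicates removed
    tied_names = [name for name in all_named if name not in unique]  # e.g. ['head.weight']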

View File

@ -329,7 +329,7 @@ class TorchExportableModuleWithStaticCache(torch.nn.Module):
This util function is designed to test exported models by simulating the generation process.
It processes the input prompt tokens sequentially (no parallel prefill).
This generate function is not intended to replace the original `generate` method, and the support
for leveraging the original `generate` is potentially planed!
for leveraging the original `generate` is potentially planned!
Args:
exported_program (`torch.export.ExportedProgram`): The exported program generated via `torch.export`.
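To illustrate what "processes the input prompt tokens sequentially (no parallel prefill)" means, a schematic greedy decode loop is sketched below; `step_fn` is a hypothetical stand-in for a single forward call into the exported program and is not part of the library API.

    import torch

    def sequential_greedy_decode(step_fn, prompt_ids, max_new_tokens, eos_id):
        # step_fn(token_id, position) -> logits over the vocabulary for the next position
        tokens = list(prompt_ids)
        logits = None
        for pos, tok in enumerate(tokens):      # feed the prompt one token at a time
            logits = step_fn(tok, pos)
        for _ in range(max_new_tokens):
            next_tok = int(torch.argmax(logits))
            tokens.append(next_tok)
            if next_tok == eos_id:
                break
            logits = step_fn(next_tok, len(tokens) - 1)
        return tokens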

View File

@ -28,7 +28,7 @@ def autoname_modules(model):
module.name = name
# Get the linear_tag from a modul name. For example: model.layers.31.self_attn.k_proj -> self_attn.k_proj
# Get the linear_tag from a module name. For example: model.layers.31.self_attn.k_proj -> self_attn.k_proj
def name_to_linear_tag(name):
return ".".join([n for n in name.split(".") if ((n not in ["model", "layers"]) and (not n.isnumeric()))])
@ -86,9 +86,9 @@ def prepare_for_hqq_linear(model, quantization_config=None, modules_to_not_conve
"""
Prepares nn.Linear layers for HQQ quantization.
Since each layer type can have separate quantization parameters, we need to do the following:
1- tag each module with its neme via autoname_modules()
1- tag each module with its name via autoname_modules()
2- Extract linear_tags (e.g. ['self_attn.q_proj', ...])
3- Map quantization parameters as a dictionary linear_tag -> quant_params as HQQLinear exepects it, this is referred to as patch_params
3- Map quantization parameters as a dictionary linear_tag -> quant_params as HQQLinear expects it, this is referred to as patch_params
"""
modules_to_not_convert = [] if modules_to_not_convert is None else modules_to_not_convert

View File

@ -160,7 +160,7 @@ def distribute_module(
output_fn=None,
) -> nn.Module:
"""
Copy pasted from torch's function but we remove the communications (partitionning)
Copy pasted from torch's function but we remove the communications (partitioning)
as well as buffer registering that is similarly not efficient.
"""
if len(module._forward_pre_hooks) == 0:
@ -225,7 +225,7 @@ class GatherParallel(TensorParallelLayer):
@staticmethod
def _prepare_output_fn(output_layouts, use_local_output, mod, outputs, device_mesh):
# this op cannot be asynch, otherwise it completely breaks the outputs of models
# this op cannot be async, otherwise it completely breaks the outputs of models
torch.distributed.all_reduce(outputs[0], op=torch.distributed.ReduceOp.SUM, async_op=False)
return outputs

View File

@ -343,7 +343,7 @@ class HungarianMatcher(nn.Module):
# Compute the classification cost. Contrary to the loss, we don't use the NLL,
# but approximate it in 1 - proba[target class].
# The 1 is a constant that doesn't change the matching, it can be ommitted.
# The 1 is a constant that doesn't change the matching, it can be omitted.
class_cost = -out_prob[:, target_ids]
# Compute the L1 cost between boxes
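A quick numeric check of why the constant can be dropped: adding 1 to every entry of a cost matrix does not change the optimal assignment (made-up probabilities; `linear_sum_assignment` is the scipy solver typically used by these matchers):

    import numpy as np
    from scipy.optimize import linear_sum_assignment

    proba = np.array([[0.7, 0.2, 0.1],
                      [0.1, 0.8, 0.1]])              # 2 predictions x 3 target classes
    _, cols_full = linear_sum_assignment(1 - proba)  # cost = 1 - proba[target class]
    _, cols_drop = linear_sum_assignment(-proba)     # constant term omitted
    assert (cols_full == cols_drop).all()            # same matching either way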

View File

@ -99,7 +99,7 @@ class RTDetrHungarianMatcher(nn.Module):
target_bbox = torch.cat([v["boxes"] for v in targets])
# Compute the classification cost. Contrary to the loss, we don't use the NLL,
# but approximate it in 1 - proba[target class].
# The 1 is a constant that doesn't change the matching, it can be ommitted.
# The 1 is a constant that doesn't change the matching, it can be omitted.
if self.use_focal_loss:
out_prob = F.sigmoid(outputs["logits"].flatten(0, 1))
out_prob = out_prob[:, target_ids]

View File

@ -593,7 +593,7 @@ class AlignVisionBlock(nn.Module):
class AlignVisionEncoder(nn.Module):
r"""
Forward propogates the embeddings through each vision encoder (EfficientNet) block.
Forward propagates the embeddings through each vision encoder (EfficientNet) block.
Args:
config ([`AlignVisionConfig`]):

View File

@ -36,7 +36,7 @@ class AlignProcessorKwargs(ProcessingKwargs, total=False):
class AlignProcessor(ProcessorMixin):
r"""
Constructs an ALIGN processor which wraps [`EfficientNetImageProcessor`] and
[`BertTokenizer`]/[`BertTokenizerFast`] into a single processor that interits both the image processor and
[`BertTokenizer`]/[`BertTokenizerFast`] into a single processor that inherits both the image processor and
tokenizer functionalities. See the [`~AlignProcessor.__call__`] and [`~OwlViTProcessor.decode`] for more
information.
The preferred way of passing kwargs is as a dictionary per modality, see usage example below.

View File

@ -1936,7 +1936,7 @@ class AutoformerForPrediction(AutoformerPreTrainedModel):
params = None
if future_values is not None:
# outputs.last_hidden_state and trend
# loc is 4rd last and scale is 3rd last output
# loc is 4th last and scale is 3rd last output
params = self.output_params(outputs[0] + outputs[1])
distribution = self.output_distribution(params, loc=outputs[-3], scale=outputs[-2])

View File

@ -164,7 +164,7 @@ def convert_checkpoint_to_pytorch(tf_checkpoint_path: str, config_path: str, pyt
new_model = BertForMaskedLM.from_pretrained(pytorch_dump_path)
print(new_model.eval())
print("Model conversion was done sucessfully!")
print("Model conversion was done successfully!")
if __name__ == "__main__":

View File

@ -235,7 +235,7 @@ class Blip2Config(PretrainedConfig):
num_query_tokens (`int`, *optional*, defaults to 32):
The number of query tokens passed through the Transformer.
image_text_hidden_size (`int`, *optional*, defaults to 256):
Dimentionality of the hidden state of the image-text fusion layer.
Dimensionality of the hidden state of the image-text fusion layer.
image_token_index (`int`, *optional*):
Token index of special image token.

View File

@ -899,7 +899,7 @@ class BloomForCausalLM(BloomPreTrainedModel, GenerationMixin):
use_cache=True,
**kwargs,
):
# Overwriten because of the fixed-shape attention mask creation
# Overwritten because of the fixed-shape attention mask creation
# If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
# Exception 1: when passing input_embeds, input_ids may be missing entries

View File

@ -49,17 +49,17 @@ class ClapFeatureExtractor(SequenceFeatureExtractor):
The sampling rate at which the audio files should be digitalized expressed in hertz (Hz). This only serves
to warn users if the audio fed to the feature extractor does not have the same sampling rate.
hop_length (`int`,*optional*, defaults to 480):
Length of the overlaping windows for the STFT used to obtain the Mel Spectrogram. The audio will be split
Length of the overlapping windows for the STFT used to obtain the Mel Spectrogram. The audio will be split
in smaller `frames` with a step of `hop_length` between each frame.
max_length_s (`int`, *optional*, defaults to 10):
The maximum input length of the model in seconds. This is used to pad the audio.
fft_window_size (`int`, *optional*, defaults to 1024):
Size of the window (in samples) on which the Fourier transform is applied. This controls the frequency
resolution of the spectrogram. 400 means that the fourrier transform is computed on windows of 400 samples.
resolution of the spectrogram. 400 means that the fourier transform is computed on windows of 400 samples.
padding_value (`float`, *optional*, defaults to 0.0):
Padding value used to pad the audio. Should correspond to silences.
return_attention_mask (`bool`, *optional*, defaults to `False`):
Whether or not the model should return the attention masks coresponding to the input.
Whether or not the model should return the attention masks corresponding to the input.
frequency_min (`float`, *optional*, defaults to 0):
The lowest frequency of interest. The STFT will not be computed for values below this.
frequency_max (`float`, *optional*, defaults to 14000):
@ -141,7 +141,7 @@ class ClapFeatureExtractor(SequenceFeatureExtractor):
Serializes this instance to a Python dictionary.
Returns:
`Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance, excpet for the
`Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance, except for the
mel filter banks, which do not need to be saved or printed as they are too long.
"""
output = copy.deepcopy(self.__dict__)

View File

@ -1067,7 +1067,7 @@ CLAP_TEXT_INPUTS_DOCSTRING = r"""
CLAP_AUDIO_INPUTS_DOCSTRING = r"""
Args:
input_features (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Input audio features. This should be returnes by the [`ClapFeatureExtractor`] class that you can also
Input audio features. This should be returned by the [`ClapFeatureExtractor`] class that you can also
retrieve from [`AutoFeatureExtractor`]. See [`ClapFeatureExtractor.__call__`] for details.
is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*):
Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance
@ -1105,7 +1105,7 @@ CLAP_INPUTS_DOCSTRING = r"""
[What are position IDs?](../glossary#position-ids)
input_features (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Input audio features. This should be returnes by the [`ClapFeatureExtractor`] class that you can also
Input audio features. This should be returned by the [`ClapFeatureExtractor`] class that you can also
retrieve from [`AutoFeatureExtractor`]. See [`ClapFeatureExtractor.__call__`] for details.
return_loss (`bool`, *optional*):
Whether or not to return the contrastive loss.

View File

@ -127,7 +127,7 @@ class CodeGenTokenizerFast(PreTrainedTokenizerFast):
if kwargs.pop("add_bos_token", False):
model_id = kwargs.pop("name_or_path", "")
raise ValueError(
"Currenty GPT2's fast tokenizer does NOT support adding a BOS token. "
"Currently GPT2's fast tokenizer does NOT support adding a BOS token. "
"Instead you should use GPT2's slow tokenizer class `CodeGenTokenizer` as follows: \n"
f"`CodeGenTokenizer.from_pretrained('{model_id}')`\nor\n"
f"`AutoTokenizer.from_pretrained('{model_id}', use_fast=False)`\n"

View File

@ -277,7 +277,7 @@ def final():
def convert_cvt_checkpoint(cvt_model, image_size, cvt_file_name, pytorch_dump_folder):
"""
Fucntion to convert the microsoft cvt checkpoint to huggingface checkpoint
Function to convert the microsoft cvt checkpoint to huggingface checkpoint
"""
img_labels_file = "imagenet-1k-id2label.json"
num_labels = 1000

View File

@ -58,7 +58,7 @@ ORIGINAL_TO_CONVERTED_KEY_MAPPING = {
# activation function weight
r"transformer\.encoder\.layers\.(\d+)\.activation\.weight": r"encoder.layers.\1.activation_fn.weight",
#########################################################################################################################################
# decoder layers: 2 times output projection, 2 feedforward neural networks and 3 layernorms + activiation function weight
# decoder layers: 2 times output projection, 2 feedforward neural networks and 3 layernorms + activation function weight
r"transformer\.decoder\.layers\.(\d+)\.self_attn\.out_proj\.(bias|weight)": r"decoder.layers.\1.self_attn.self_attn.output_proj.\2",
r"transformer\.decoder\.layers\.(\d+)\.cross_attn\.out_proj\.(bias|weight)": r"decoder.layers.\1.cross_attn.cross_attn.output_proj.\2",
# FFNs
@ -144,7 +144,7 @@ def write_model(model_name, pretrained_model_weights_path, pytorch_dump_folder_p
config.label2id = {v: k for k, v in id2label.items()}
# load original model from local path
loaded = torch.load(pretrained_model_weights_path, map_location=torch.device("cpu"), weights_only=True)["model"]
# Renaming the original model state dictionary to HF compatibile
# Renaming the original model state dictionary to HF compatible
all_keys = list(loaded.keys())
new_keys = convert_old_keys_to_new_keys(all_keys)
state_dict = {}

View File

@ -1297,7 +1297,7 @@ class JukeboxConditionalAutoregressive(nn.Module):
):
"""
Autoregressive model on either lyric tokens or music tokens, or both. The attention pattern should be properly
set fro each configuration.
set for each configuration.
Args:
config (`JukeboxPriorConfig`):

View File

@ -142,7 +142,7 @@ def merge_patches(patches: torch.Tensor, batch_size: int, padding: int) -> torch
return patches
if n_patches_per_batch < 4:
# for each batch, atleast 4 small patches are required to
# for each batch, at least 4 small patches are required to
# recreate a large square patch from merging them and later padding is applied
# 3 x (8x8) patches becomes 1 x ( 8x8 ) patch (extra patch ignored, no padding)
# 4 x (8x8) patches becomes 1 x (16x16) patch (padding later)

View File

@ -118,7 +118,7 @@ class Embeddings(nn.Module):
# Setting the position-ids to the registered buffer in constructor, it helps
# when tracing the model without passing position-ids, solves
# isues similar to issue #5664
# issues similar to issue #5664
if hasattr(self, "position_ids"):
position_ids = self.position_ids[:, :seq_length]
else:

View File

@ -72,7 +72,7 @@ class DonutImageProcessor(BaseImageProcessor):
Whether to align the long axis of the image with the long axis of `size` by rotating by 90 degrees.
do_pad (`bool`, *optional*, defaults to `True`):
Whether to pad the image. If `random_padding` is set to `True` in `preprocess`, each image is padded with a
random amont of padding on each size, up to the largest image size in the batch. Otherwise, all images are
random amount of padding on each size, up to the largest image size in the batch. Otherwise, all images are
padded to the largest image size in the batch.
do_rescale (`bool`, *optional*, defaults to `True`):
Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
@ -349,7 +349,7 @@ class DonutImageProcessor(BaseImageProcessor):
Whether to align the long axis of the image with the long axis of `size` by rotating by 90 degrees.
do_pad (`bool`, *optional*, defaults to `self.do_pad`):
Whether to pad the image. If `random_padding` is set to `True`, each image is padded with a random
amont of padding on each size, up to the largest image size in the batch. Otherwise, all images are
amount of padding on each size, up to the largest image size in the batch. Otherwise, all images are
padded to the largest image size in the batch.
random_padding (`bool`, *optional*, defaults to `self.random_padding`):
Whether to use random padding when padding the image. If `True`, each image in the batch with be padded

View File

@ -142,7 +142,7 @@ class FastSpeech2ConformerConfig(PretrainedConfig):
speaker id embedding layer.
num_languages (`int`, *optional*):
Number of languages. If set to > 1, assume that the language ids will be provided as the input and use the
languge id embedding layer.
language id embedding layer.
speaker_embed_dim (`int`, *optional*):
Speaker embedding dimension. If set to > 0, assume that speaker_embedding will be provided as the input.
is_encoder_decoder (`bool`, *optional*, defaults to `True`):

View File

@ -391,7 +391,7 @@ class FastSpeech2ConformerVariancePredictor(nn.Module):
dropout_rate=0.5,
):
"""
Initilize variance predictor module.
Initialize variance predictor module.
Args:
input_dim (`int`): Input dimension.

View File

@ -948,7 +948,7 @@ class FlaubertModel(FlaubertPreTrainedModel):
# Setting the position-ids to the registered buffer in constructor, it helps
# when tracing the model without passing position-ids, solves
# isues similar to issue #5664
# issues similar to issue #5664
if position_ids is None:
if hasattr(self, "position_ids"):
position_ids = self.position_ids[:, :slen]

View File

@ -360,7 +360,7 @@ class FocalNetModulation(nn.Module):
x = self.projection_in(hidden_state).permute(0, 3, 1, 2).contiguous()
q, ctx, gates = torch.split(x, (num_channels, num_channels, self.focal_level + 1), 1)
# context aggreation
# context aggregation
ctx_all = 0
for level in range(self.focal_level):
ctx = self.focal_layers[level](ctx)
@ -379,7 +379,7 @@ class FocalNetModulation(nn.Module):
if self.use_post_layernorm_in_modulation:
x_out = self.layernorm(x_out)
# post linear porjection
# post linear projection
x_out = self.projection_out(x_out)
x_out = self.projection_dropout(x_out)
return x_out
@ -415,7 +415,7 @@ class FocalNetLayer(nn.Module):
dim (`int`):
Number of input channels.
input_resolution (`Tuple[int]`):
Input resulotion.
Input resolution.
drop_path (`float`, *optional*, defaults to 0.0):
Stochastic depth rate.
"""

View File

@ -244,7 +244,7 @@ def _tokenize_prompts_with_image_and_batch(
- pad all the sequences to this length so we can convert them into a 3D tensor.
"""
# If not tool use, tranform the coordinates while tokenizing
# If not tool use, transform the coordinates while tokenizing
if scale_factors is not None:
transformed_prompt_tokens = []
for prompt_seq, scale_factor_seq in zip(prompts, scale_factors):

View File

@ -96,7 +96,7 @@ class Gemma3TextConfig(PretrainedConfig):
Scaling factor when applying tanh softcapping on the attention scores.
cache_implementation (`str`, *optional*, defaults to `"hybrid"`): the cache type to be used with `generate`.
rope_scaling (`Dict`, *optional*):
Dictionary containing the scaling configuration for the RoPE embeddings used in gloabl attention. NOTE: if you apply new rope type
Dictionary containing the scaling configuration for the RoPE embeddings used in global attention. NOTE: if you apply new rope type
and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
accordingly.
Expected contents:

View File

@ -140,7 +140,7 @@ class Gemma3ImageProcessor(BaseImageProcessor):
):
"""
Pan and Scan and image, by cropping into smaller images when the aspect ratio exceeds
minumum allowed ratio.
minimum allowed ratio.
Args:
image (`np.ndarray`):

View File

@ -108,7 +108,7 @@ class Gemma3ImageProcessorFast(BaseImageProcessorFast):
):
"""
Pan and Scan an image, by cropping into smaller images when the aspect ratio exceeds
minumum allowed ratio.
minimum allowed ratio.
Args:
image (`torch.Tensor`):

View File

@ -1270,7 +1270,7 @@ class Gemma3ForConditionalGeneration(Gemma3PreTrainedModel, GenerationMixin):
is_training = token_type_ids is not None and labels is not None
# Replace image id woth PAD if the image token if OOV, to avoid index-errors
# Replace image id with PAD if the image token if OOV, to avoid index-errors
if input_ids is not None and self.config.image_token_id >= self.vocab_size:
special_image_mask = input_ids == self.config.image_token_id
llm_input_ids = input_ids.clone()

View File

@ -128,7 +128,7 @@ class Gemma3TextConfig(Gemma2Config):
Scaling factor when applying tanh softcapping on the attention scores.
cache_implementation (`str`, *optional*, defaults to `"hybrid"`): the cache type to be used with `generate`.
rope_scaling (`Dict`, *optional*):
Dictionary containing the scaling configuration for the RoPE embeddings used in gloabl attention. NOTE: if you apply new rope type
Dictionary containing the scaling configuration for the RoPE embeddings used in global attention. NOTE: if you apply new rope type
and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
accordingly.
Expected contents:
@ -926,7 +926,7 @@ class Gemma3ForConditionalGeneration(PaliGemmaForConditionalGeneration):
is_training = token_type_ids is not None and labels is not None
# Replace image id woth PAD if the image token if OOV, to avoid index-errors
# Replace image id with PAD if the image token if OOV, to avoid index-errors
if input_ids is not None and self.config.image_token_id >= self.vocab_size:
special_image_mask = input_ids == self.config.image_token_id
llm_input_ids = input_ids.clone()

View File

@ -1495,7 +1495,7 @@ class GitForCausalLM(GitPreTrainedModel, GenerationMixin):
>>> processor = AutoProcessor.from_pretrained("microsoft/git-base-vatex")
>>> model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-vatex")
>>> # set seed for reproducability
>>> # set seed for reproducibility
>>> np.random.seed(45)

View File

@ -199,7 +199,7 @@ class GPTNeoXConfig(PretrainedConfig):
if self.hidden_size % self.num_attention_heads != 0:
raise ValueError(
"The hidden size is not divisble by the number of attention heads! Make sure to update them!"
"The hidden size is not divisible by the number of attention heads! Make sure to update them!"
)

View File

@ -402,7 +402,7 @@ def convert_grounding_dino_checkpoint(args):
"grounding-dino-tiny": "https://huggingface.co/ShilongLiu/GroundingDino/resolve/main/groundingdino_swint_ogc.pth",
"grounding-dino-base": "https://huggingface.co/ShilongLiu/GroundingDino/resolve/main/groundingdino_swinb_cogcoor.pth",
}
# Define default GroundingDino configuation
# Define default GroundingDino configuration
config = get_grounding_dino_config(model_name)
# Load original checkpoint

View File

@ -1850,7 +1850,7 @@ class GroundingDinoDecoder(GroundingDinoPreTrainedModel):
# In original implementation they apply layer norm before outputting intermediate hidden states
# Though that's not through between layers so the layers use as input the output of the previous layer
# withtout layer norm
# without layer norm
if output_hidden_states:
all_hidden_states += (self.layer_norm(hidden_states),)

View File

@ -1425,7 +1425,7 @@ HUBERT_INPUTS_DOCSTRING = r"""
@add_start_docstrings(
"The bare TFHubert Model transformer outputing raw hidden-states without any specific head on top.",
"The bare TFHubert Model transformer outputting raw hidden-states without any specific head on top.",
HUBERT_START_DOCSTRING,
)
class TFHubertModel(TFHubertPreTrainedModel):

View File

@ -74,8 +74,8 @@ class IBertConfig(PretrainedConfig):
quant_mode (`bool`, *optional*, defaults to `False`):
Whether to quantize the model or not.
force_dequant (`str`, *optional*, defaults to `"none"`):
Force dequantize specific nonlinear layer. Dequatized layers are then executed with full precision.
`"none"`, `"gelu"`, `"softmax"`, `"layernorm"` and `"nonlinear"` are supported. As deafult, it is set as
Force dequantize specific nonlinear layer. Dequantized layers are then executed with full precision.
`"none"`, `"gelu"`, `"softmax"`, `"layernorm"` and `"nonlinear"` are supported. As default, it is set as
`"none"`, which does not dequantize any layers. Please specify `"gelu"`, `"softmax"`, or `"layernorm"` to
dequantize GELU, Softmax, or LayerNorm, respectively. `"nonlinear"` will dequantize all nonlinear layers,
i.e., GELU, Softmax, and LayerNorm.

View File

@ -276,7 +276,7 @@ class InternVLProcessor(ProcessorMixin):
Args:
metadata (`VideoMetadata`):
`VideoMetadata` object containing metadat about the video, such as "total_num_frames" or "fps".
`VideoMetadata` object containing metadata about the video, such as "total_num_frames" or "fps".
num_frames (`int`, *optional*):
Number of frames to sample uniformly. If None, all frames are sampled.
initial_shift (`bool`, `float` or `int`, defaults to `0`):

View File

@ -246,7 +246,7 @@ class LevitAttentionSubsample(nn.Module):
self.out_dim_keys_values = attention_ratio * key_dim * num_attention_heads + key_dim * num_attention_heads
self.out_dim_projection = attention_ratio * key_dim * num_attention_heads
self.resolution_out = resolution_out
# resolution_in is the intial resolution, resoloution_out is final resolution after downsampling
# resolution_in is the initial resolution, resolution_out is final resolution after downsampling
self.keys_values = MLPLayerWithBN(input_dim, self.out_dim_keys_values)
self.queries_subsample = LevitSubsample(stride, resolution_in)
self.queries = MLPLayerWithBN(input_dim, key_dim * num_attention_heads)
@ -370,7 +370,7 @@ class LevitStage(nn.Module):
self.layers = []
self.config = config
self.resolution_in = resolution_in
# resolution_in is the intial resolution, resolution_out is final resolution after downsampling
# resolution_in is the initial resolution, resolution_out is final resolution after downsampling
for _ in range(depths):
self.layers.append(
LevitResidualLayer(

View File

@ -55,7 +55,7 @@ if is_torchvision_available():
def get_factors(dividend: int) -> Set[int]:
"""
Calculate all factors of a given number, i.e. a dividor that leaves
Calculate all factors of a given number, i.e. a divisor that leaves
no remainder. For example, if dividend=12, it will return {1, 2, 3, 4, 6, 12}.
Args:
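A minimal sketch matching the behavior the docstring above describes (the library's own implementation may differ):

    def get_factors_sketch(dividend: int) -> set:
        # every divisor of `dividend` that leaves no remainder
        return {d for d in range(1, dividend + 1) if dividend % d == 0}

    get_factors_sketch(12)  # {1, 2, 3, 4, 6, 12}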

View File

@ -60,7 +60,7 @@ class LlavaNextVideoImageProcessor(BaseImageProcessor):
image_grid_pinpoints (`List` *optional*, defaults to `[[672, 336], [336, 672], [672, 672], [336, 1008], [1008, 336]]`):
A list of possible resolutions to use for processing high resolution images. The best resolution is selected
based on the original size of the image. Can be overridden by `image_grid_pinpoints` in the `preprocess`
method. Not used for processinf videos.
method. Not used for processing videos.
resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method.
do_center_crop (`bool`, *optional*, defaults to `True`):

View File

@ -405,7 +405,7 @@ class Mask2FormerHungarianMatcher(nn.Module):
"""
super().__init__()
if cost_class == 0 and cost_mask == 0 and cost_dice == 0:
raise ValueError("All costs cant be 0")
raise ValueError("All costs can't be 0")
self.num_points = num_points
self.cost_class = cost_class

View File

@ -829,7 +829,7 @@ class MaskFormerHungarianMatcher(nn.Module):
"""
super().__init__()
if cost_class == 0 and cost_mask == 0 and cost_dice == 0:
raise ValueError("All costs cant be 0")
raise ValueError("All costs can't be 0")
self.cost_class = cost_class
self.cost_mask = cost_mask
self.cost_dice = cost_dice

View File

@ -98,7 +98,7 @@ def add_megatron_checkpoint_args(parser):
default=128,
help=(
"Pad the vocab size to be divisible by this value. "
"This is added for computational efficieny reasons. "
"This is added for computational efficiency reasons. "
"Only used when converting a Transformers checkpoint to a Megatron checkpoint."
),
)
@ -235,7 +235,7 @@ def transformers_to_megatron_fix_query_key_value_ordering(
param, checkpoint_version, num_splits, num_heads, hidden_size
):
"""
Permutes layout of param tensor to the one compatible with respective NVIDIA Megatron-LM chekpoint versions. Input
Permutes layout of param tensor to the one compatible with respective NVIDIA Megatron-LM checkpoint versions. Input
is [num_splits * num_heads * hidden_size, :] and output is [num_heads * hidden_size * num_splits, :] for version
1.0 and [num_heads * num_splits * hidden_size, :] for version 2.0 and later. If param is the weight tensor of the
self-attention block, the param needs to be already transposed before calling this function.
@ -348,7 +348,7 @@ def convert_checkpoint_from_megatron_to_transformers(args):
raise ValueError(
"Megatron-LM checkpoint does not contain arguments. This utility only supports Megatron-LM checkpoints"
" containing all the megatron arguments. This is because it loads all config related to model"
" architecture, the tensor and pipeline model parallel size from the checkpoint insead of user having to"
" architecture, the tensor and pipeline model parallel size from the checkpoint instead of user having to"
" manually specify all the details. Please save Megatron-LM checkpoint along with all the megatron"
" arguments to use this utility."
)

View File

@ -1601,7 +1601,7 @@ class MusicgenForCausalLM(MusicgenPreTrainedModel, GenerationMixin):
# 7. determine generation mode
generation_mode = generation_config.get_generation_mode()
# 8. prepare batched CFG externally (to enable coexistance with the unbatched CFG)
# 8. prepare batched CFG externally (to enable coexistence with the unbatched CFG)
if generation_config.guidance_scale is not None and generation_config.guidance_scale > 1:
logits_processor.append(ClassifierFreeGuidanceLogitsProcessor(generation_config.guidance_scale))
generation_config.guidance_scale = None
@ -2617,7 +2617,7 @@ class MusicgenForConditionalGeneration(PreTrainedModel, GenerationMixin):
# 7. determine generation mode
generation_mode = generation_config.get_generation_mode()
# 8. prepare batched CFG externally (to enable coexistance with the unbatched CFG)
# 8. prepare batched CFG externally (to enable coexistence with the unbatched CFG)
if generation_config.guidance_scale is not None and generation_config.guidance_scale > 1:
logits_processor.append(ClassifierFreeGuidanceLogitsProcessor(generation_config.guidance_scale))
generation_config.guidance_scale = None

View File

@ -54,7 +54,7 @@ class MusicgenMelodyFeatureExtractor(SequenceFeatureExtractor):
sampling_rate (`int`, *optional*, defaults to 32000):
The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
hop_length (`int`, *optional*, defaults to 4096):
Length of the overlaping windows for the STFT used to obtain the Mel Frequency coefficients.
Length of the overlapping windows for the STFT used to obtain the Mel Frequency coefficients.
chunk_length (`int`, *optional*, defaults to 30):
The maximum number of chunks of `sampling_rate` samples used to trim and pad longer or shorter audio
sequences.

View File

@ -92,7 +92,7 @@ class MusicgenMelodyOutputWithPast(ModelOutput):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
Sequence of conditional hidden-states representing the concatenation of the projeted text encoder output and the projeted audio encoder output.
Sequence of conditional hidden-states representing the concatenation of the projected text encoder output and the projected audio encoder output.
Used as a conditional signal.
"""
@ -757,8 +757,8 @@ MUSICGEN_MELODY_INPUTS_DOCSTRING = r"""
don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
`decoder_input_ids` of shape `(batch_size, sequence_length)`.
encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
Sequence of conditional hidden-states representing the concatenation of the projeted text encoder output and the projeted audio encoder output.
Used as a conditional signal and will thus be concatenated to the projeted `decoder_input_ids`.
Sequence of conditional hidden-states representing the concatenation of the projected text encoder output and the projected audio encoder output.
Used as a conditional signal and will thus be concatenated to the projected `decoder_input_ids`.
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
@ -818,7 +818,7 @@ MUSICGEN_MELODY_DECODER_INPUTS_DOCSTRING = r"""
[What are attention masks?](../glossary#attention-mask)
encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states representing the concatenation of the text encoder output and the processed audio encoder output.
Used as a conditional signal and will thus be concatenated to the projeted `decoder_input_ids`.
Used as a conditional signal and will thus be concatenated to the projected `decoder_input_ids`.
encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
Mask to avoid performing attention on conditional hidden states. Mask values
selected in `[0, 1]`:
@ -1522,7 +1522,7 @@ class MusicgenMelodyForCausalLM(MusicgenMelodyPreTrainedModel, GenerationMixin):
# 7. determine generation mode
generation_mode = generation_config.get_generation_mode()
# 8. prepare batched CFG externally (to enable coexistance with the unbatched CFG)
# 8. prepare batched CFG externally (to enable coexistence with the unbatched CFG)
if generation_config.guidance_scale is not None and generation_config.guidance_scale > 1:
logits_processor.append(ClassifierFreeGuidanceLogitsProcessor(generation_config.guidance_scale))
generation_config.guidance_scale = None
@ -2478,7 +2478,7 @@ class MusicgenMelodyForConditionalGeneration(PreTrainedModel, GenerationMixin):
# 7. determine generation mode
generation_mode = generation_config.get_generation_mode()
# 8. prepare batched CFG externally (to enable coexistance with the unbatched CFG)
# 8. prepare batched CFG externally (to enable coexistence with the unbatched CFG)
if generation_config.guidance_scale is not None and generation_config.guidance_scale > 1:
logits_processor.append(ClassifierFreeGuidanceLogitsProcessor(generation_config.guidance_scale))
generation_config.guidance_scale = None

View File

@ -425,7 +425,7 @@ class NllbMoeSparseMLP(nn.Module):
r"""
The goal of this forward pass is to have the same number of operation as the equivalent `NllbMoeDenseActDense`
(mlp) layer. This means that all of the hidden states should be processed at most twice ( since we are using a
top_2 gating mecanism). This means that we keep the complexity to O(batch_size x sequence_length x hidden_dim)
top_2 gating mechanism). This means that we keep the complexity to O(batch_size x sequence_length x hidden_dim)
instead of O(num_experts x batch_size x sequence_length x hidden_dim).
1- Get the `router_probs` from the `router`. The shape of the `router_mask` is `(batch_size X sequence_length,

View File

@ -376,7 +376,7 @@ class NougatTokenizerFast(PreTrainedTokenizerFast):
contains everything needed to load the tokenizer.
clean_up_tokenization_spaces (`str`, *optional*, defaults to `False`):
Wether to cleanup spaces after decoding, cleanup consists in removing potential artifacts like extra
Whether to cleanup spaces after decoding, cleanup consists in removing potential artifacts like extra
spaces.
unk_token (`str`, *optional*, defaults to `"<unk>"`):

View File

@ -268,7 +268,7 @@ def convert_omdet_turbo_checkpoint(args):
"https://huggingface.co/omlab/OmDet-Turbo_tiny_SWIN_T/resolve/main/ViT-B-16.pt",
],
}
# Define default OmDetTurbo configuation
# Define default OmDetTurbo configuration
config = get_omdet_turbo_config(model_name, use_timm_backbone)
# Load original checkpoint

View File

@ -471,7 +471,7 @@ class PaliGemmaForConditionalGeneration(PaliGemmaPreTrainedModel, GenerationMixi
is_training = token_type_ids is not None and labels is not None
# Replace image id woth PAD if the image token if OOV, to avoid index-errors
# Replace image id with PAD if the image token if OOV, to avoid index-errors
if input_ids is not None and self.config.image_token_id >= self.vocab_size:
special_image_mask = input_ids == self.config.image_token_id
llm_input_ids = input_ids.clone()

View File

@ -1807,7 +1807,7 @@ class PatchTSMixerForTimeSeriesClassificationOutput(ModelOutput):
Args:
prediction_outputs (`torch.FloatTensor` of shape `(batch_size, num_labels)`):
Prediction output from the classfication head.
Prediction output from the classification head.
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_input_channels, num_patches, d_model)`):
Backbone embeddings before passing through the head.
hidden_states (`tuple(torch.FloatTensor)`, *optional*):

View File

@ -1487,7 +1487,7 @@ FLAX_PEGASUS_CONDITIONAL_GENERATION_DOCSTRING = """
Summarization example:
```pyton
```python
>>> from transformers import AutoTokenizer, FlaxPegasusForConditionalGeneration
>>> model = FlaxPegasusForConditionalGeneration.from_pretrained('google/pegasus-large')

View File

@ -127,7 +127,7 @@ def get_resize_output_image_size(
ratio = max(height / max_height, width / max_width)
if ratio > 1:
# Orgiginal implementation uses `round` which utilises bankers rounding, which can lead to surprising results
# Original implementation uses `round` which utilises bankers rounding, which can lead to surprising results
# Here we use floor to ensure the image is always smaller than the given "longest_edge"
height = int(math.floor(height / ratio))
width = int(math.floor(width / ratio))

View File

@ -35,7 +35,7 @@ logger = logging.get_logger(__name__)
def create_rename_keys(config):
rename_keys = []
for i in range(config.num_encoder_blocks):
# Remane embedings' paramters
# Rename embeddings' parameters
rename_keys.append((f"pos_embed{i + 1}", f"pvt.encoder.patch_embeddings.{i}.position_embeddings"))
rename_keys.append((f"patch_embed{i + 1}.proj.weight", f"pvt.encoder.patch_embeddings.{i}.projection.weight"))

View File

@ -1037,7 +1037,7 @@ REMBERT_INPUTS_DOCSTRING = r"""
@add_start_docstrings(
"The bare RemBERT Model transformer outputing raw hidden-states without any specific head on top.",
"The bare RemBERT Model transformer outputting raw hidden-states without any specific head on top.",
REMBERT_START_DOCSTRING,
)
class TFRemBertModel(TFRemBertPreTrainedModel):

View File

@ -911,7 +911,7 @@ ROFORMER_INPUTS_DOCSTRING = r"""
@add_start_docstrings(
"The bare RoFormer Model transformer outputing raw hidden-states without any specific head on top.",
"The bare RoFormer Model transformer outputting raw hidden-states without any specific head on top.",
ROFORMER_START_DOCSTRING,
)
class TFRoFormerModel(TFRoFormerPreTrainedModel):

View File

@ -2171,7 +2171,7 @@ class SeamlessM4TTextToUnitForConditionalGeneration(SeamlessM4TPreTrainedModel,
config: SeamlessM4TConfig,
embed_tokens_decoder: Optional[nn.Embedding] = None,
):
# update config - used principaly for bos_token_id etc.
# update config - used principality for bos_token_id etc.
config = copy.deepcopy(config)
for param, val in config.to_dict().items():
if param.startswith("t2u_"):

View File

@ -184,7 +184,7 @@ SEAMLESS_M4T_V2_MULTIMODAL_INPUTS_DOCSTRING = r"""
[What are input IDs?](../glossary#input-ids)
input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`):
Input audio features. This should be returnes by the [`SeamlessM4TFeatureExtractor`] class or the
Input audio features. This should be returned by the [`SeamlessM4TFeatureExtractor`] class or the
[`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details.
"""
@ -202,7 +202,7 @@ M4T_TEXT_INPUTS_DOCSTRING = r"""
M4T_SPEECH_INPUTS_DOCSTRING = r"""
Args:
input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`):
Input audio features. This should be returnes by the [`SeamlessM4TFeatureExtractor`] class or the
Input audio features. This should be returned by the [`SeamlessM4TFeatureExtractor`] class or the
[`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details.
"""
@ -2461,7 +2461,7 @@ class SeamlessM4Tv2TextToUnitForConditionalGeneration(SeamlessM4Tv2PreTrainedMod
config: SeamlessM4Tv2Config,
embed_tokens_decoder: Optional[nn.Embedding] = None,
):
# update config - used principaly for bos_token_id etc.
# update config - used principality for bos_token_id etc.
config = copy.deepcopy(config)
for param, val in config.to_dict().items():
if param.startswith("t2u_"):
@ -4035,7 +4035,7 @@ class SeamlessM4Tv2ForSpeechToSpeech(SeamlessM4Tv2PreTrainedModel, GenerationMix
Args:
input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`):
Input audio features. This should be returnes by the [`SeamlessM4TFeatureExtractor`] class or the
Input audio features. This should be returned by the [`SeamlessM4TFeatureExtractor`] class or the
[`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details.
return_intermediate_token_ids (`bool`, *optional*):
If `True`, also returns the intermediate generated text and unit tokens. Set to `True` if you also want
@ -4485,7 +4485,7 @@ class SeamlessM4Tv2Model(SeamlessM4Tv2PreTrainedModel, GenerationMixin):
[What are input IDs?](../glossary#input-ids)
input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`, *optional*):
Input audio features. This should be returnes by the [`SeamlessM4TFeatureExtractor`] class or the
Input audio features. This should be returned by the [`SeamlessM4TFeatureExtractor`] class or the
[`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details.
return_intermediate_token_ids (`bool`, *optional*):
If `True`, also returns the intermediate generated text and unit tokens. Set to `True` if you also want

View File

@ -114,7 +114,7 @@ def convert_seggpt_checkpoint(args):
verify_logits = args.verify_logits
push_to_hub = args.push_to_hub
# Define default GroundingDINO configuation
# Define default GroundingDINO configuration
config = SegGptConfig()
# Load original checkpoint

View File

@ -62,7 +62,7 @@ class SegGptEncoderOutput(ModelOutput):
intermediate_hidden_states (`Tuple[torch.FloatTensor]`, *optional*, returned when `config.intermediate_hidden_state_indices` is set):
Tuple of `torch.FloatTensor` of shape `(batch_size, patch_height, patch_width, hidden_size)`.
Each element in the Tuple corresponds to the output of the layer specified in `config.intermediate_hidden_state_indices`.
Additionaly, each feature passes through a LayerNorm.
Additionally, each feature passes through a LayerNorm.
"""
last_hidden_state: torch.FloatTensor

View File

@ -1979,10 +1979,10 @@ SPEECHT5_BASE_START_DOCSTRING = r"""
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
encoder ([`SpeechT5EncoderWithSpeechPrenet`] or [`SpeechT5EncoderWithTextPrenet`] or `None`):
The Transformer encoder module that applies the appropiate speech or text encoder prenet. If `None`,
The Transformer encoder module that applies the appropriate speech or text encoder prenet. If `None`,
[`SpeechT5EncoderWithoutPrenet`] will be used and the `input_values` are assumed to be hidden states.
decoder ([`SpeechT5DecoderWithSpeechPrenet`] or [`SpeechT5DecoderWithTextPrenet`] or `None`):
The Transformer decoder module that applies the appropiate speech or text decoder prenet. If `None`,
The Transformer decoder module that applies the appropriate speech or text decoder prenet. If `None`,
[`SpeechT5DecoderWithoutPrenet`] will be used and the `decoder_input_values` are assumed to be hidden
states.
"""


@ -175,7 +175,7 @@ def make_state_dict(converted_params, is_encoder_only: bool):
def load_t5x_weights_in_t5(model, config, t5x_checkpoint_path, is_encoder_only):
"""Replaces the params in model witht the T5X converted params."""
"""Replaces the params in model with the T5X converted params."""
variables = checkpoints.load_t5x_checkpoint(t5x_checkpoint_path)
converted = convert_t5x_to_pytorch(
variables,


@ -2344,11 +2344,11 @@ def _calculate_expected_result(
if avg_approximation == AverageApproximationFunction.RATIO:
average_result = sum_result / (count_result + EPSILON_ZERO_DIVISION)
elif avg_approximation == AverageApproximationFunction.FIRST_ORDER:
# The sum of all probabilities exept that correspond to other cells
# The sum of all probabilities except those that correspond to other cells
ex = tf.reduce_sum(scaled_probability_per_cell, axis=1, keepdims=True) - scaled_probability_per_cell + 1
average_result = tf.reduce_sum(numeric_values_masked * scaled_probability_per_cell / ex, axis=1)
elif avg_approximation == AverageApproximationFunction.SECOND_ORDER:
# The sum of all probabilities exept that correspond to other cells
# The sum of all probabilities except those that correspond to other cells
ex = tf.reduce_sum(scaled_probability_per_cell, axis=1, keepdims=True) - scaled_probability_per_cell + 1
pointwise_var = scaled_probability_per_cell * (1 - scaled_probability_per_cell)
var = tf.reduce_sum(pointwise_var, axis=1, keepdims=True) - pointwise_var
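The first-order branch above is easier to follow with concrete numbers. A NumPy sketch of the same computation (the toy inputs are made up; the variable names mirror the TF code):

import numpy as np

# Toy per-cell selection probabilities and (masked) numeric cell values.
scaled_probability_per_cell = np.array([[0.9, 0.8, 0.1]])
numeric_values_masked = np.array([[2.0, 4.0, 10.0]])

# ex_i = (sum_j p_j) - p_i + 1: the expected cell count with cell i forced in.
ex = scaled_probability_per_cell.sum(axis=1, keepdims=True) - scaled_probability_per_cell + 1
# First-order approximation of the expected average over the selected cells.
average_result = (numeric_values_masked * scaled_probability_per_cell / ex).sum(axis=1)
print(average_result)  # ~[2.92]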


@ -2359,7 +2359,7 @@ _ORDINAL_WORDS = [
"second",
"third",
"fourth",
"fith",
"fifth",
"sixth",
"seventh",
"eighth",


@ -1364,7 +1364,7 @@ class UdopStack(UdopPreTrainedModel):
if inputs_embeds is None:
if self.embed_tokens is None:
raise ValueError("You have to intialize the model with valid token embeddings")
raise ValueError("You have to initialize the model with valid token embeddings")
inputs_embeds = self.embed_tokens(input_ids)
if pixel_values is not None:


@ -200,7 +200,7 @@ def make_state_dict(converted_params, is_encoder_only: bool):
def load_t5x_weights_in_t5(model, config, t5x_checkpoint_path, is_encoder_only, scalable_attention):
"""Replaces the params in model witht the T5X converted params."""
"""Replaces the params in model with the T5X converted params."""
variables = checkpoints.load_t5x_checkpoint(t5x_checkpoint_path)
converted = convert_t5x_to_pytorch(
variables, num_layers=config.num_layers, is_encoder_only=is_encoder_only, scalable_attention=scalable_attention


@ -164,7 +164,7 @@ class UniSpeechConfig(PretrainedConfig):
eos_token_id (`int`, *optional*, defaults to 2):
The id of the "end-of-sequence" token.
replace_prob (`float`, *optional*, defaults to 0.5):
Propability that transformer feature is replaced by quantized feature for pretraining.
Probability that the transformer feature is replaced by the quantized feature for pretraining.
Example:


@ -56,7 +56,7 @@ def get_kernel_predictor_key_mapping(config: UnivNetConfig, old_prefix: str = ""
def get_key_mapping(config: UnivNetConfig):
mapping = {}
# NOTE: inital conv layer keys are the same
# NOTE: initial conv layer keys are the same
# LVC Residual blocks
for i in range(len(config.resblock_stride_sizes)):


@ -64,7 +64,7 @@ class UnivNetFeatureExtractor(SequenceFeatureExtractor):
The number of FFT components to use. If `None`, this is determined using
`transformers.audio_utils.optimal_fft_length`.
max_length_s (`int`, *optional*, defaults to 10):
The maximum input lenght of the model in seconds. This is used to pad the audio.
The maximum input length of the model in seconds. This is used to pad the audio.
fmin (`float`, *optional*, defaults to 0.0):
Minimum mel frequency in Hz.
fmax (`float`, *optional*):


@ -39,7 +39,7 @@ class ViltConfig(PretrainedConfig):
The vocabulary size of the `token_type_ids` passed when calling [`ViltModel`]. This is used when encoding
text.
modality_type_vocab_size (`int`, *optional*, defaults to 2):
The vocabulary size of the modalities passed when calling [`ViltModel`]. This is used after concatening the
The vocabulary size of the modalities passed when calling [`ViltModel`]. This is used after concatenating the
embeddings of the text and image modalities.
max_position_embeddings (`int`, *optional*, defaults to 40):
The maximum sequence length that this model might ever be used with.


@ -139,7 +139,7 @@ class ViltEmbeddings(nn.Module):
x_mask = x_mask.flatten(1)
if max_image_length < 0 or max_image_length is None or not isinstance(max_image_length, int):
# suppose aug is 800 x 1333, then, maximum effective res is 800 x 1333 (if one side gets bigger, the other will be constrained and be shrinked)
# suppose aug is 800 x 1333, then, maximum effective res is 800 x 1333 (if one side gets bigger, the other will be constrained and be shrunk)
# (800 // self.patch_size) * (1333 // self.patch_size) is the maximum number of patches that single image can get.
# if self.patch_size = 32, 25 * 41 = 1025
# if res is 384 x 640, 12 * 20 = 240
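The patch-count arithmetic in these comments can be checked directly (numbers taken from the comments above; `patch_size = 32` as stated):

patch_size = 32
print((800 // patch_size) * (1333 // patch_size))  # 25 * 41 = 1025
print((384 // patch_size) * (640 // patch_size))   # 12 * 20 = 240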


@ -85,7 +85,7 @@ class VisionEncoderDecoderConfig(PretrainedConfig):
super().__init__(**kwargs)
if "encoder" not in kwargs or "decoder" not in kwargs:
raise ValueError(
f"A configuraton of type {self.model_type} cannot be instantiated because "
f"A configuration of type {self.model_type} cannot be instantiated because "
f"not both `encoder` and `decoder` sub-configurations are passed, but only {kwargs}"
)


@ -1496,7 +1496,7 @@ WAV2VEC2_INPUTS_DOCSTRING = r"""
@add_start_docstrings(
"The bare TFWav2Vec2 Model transformer outputing raw hidden-states without any specific head on top.",
"The bare TFWav2Vec2 Model transformer outputting raw hidden-states without any specific head on top.",
WAV2VEC2_START_DOCSTRING,
)
class TFWav2Vec2Model(TFWav2Vec2PreTrainedModel):


@ -101,7 +101,7 @@ class WavLMConfig(PretrainedConfig):
[SpecAugment: A Simple Data Augmentation Method for Automatic Speech
Recognition](https://arxiv.org/abs/1904.08779).
mask_time_prob (`float`, *optional*, defaults to 0.05):
Propability of each feature vector along the time axis to be chosen as the start of the vector span to be
Probability of each feature vector along the time axis to be chosen as the start of the vector span to be
masked. Approximately `mask_time_prob * sequence_length // mask_time_length` feature vectors will be masked
along the time axis. This is only relevant if `apply_spec_augment is True`.
mask_time_length (`int`, *optional*, defaults to 10):
@ -111,7 +111,7 @@ class WavLMConfig(PretrainedConfig):
irrespective of `mask_feature_prob`. Only relevant if ''mask_time_prob*len(time_axis)/mask_time_length <
mask_time_min_masks''
mask_feature_prob (`float`, *optional*, defaults to 0.0):
Propability of each feature vector along the feature axis to be chosen as the start of the vector span to
Probability of each feature vector along the feature axis to be chosen as the start of the vector span to
be masked. Approximately `mask_feature_prob * hidden_size // mask_feature_length` feature vectors will be masked
along the feature axis. This is only relevant if `apply_spec_augment is True`.
mask_feature_length (`int`, *optional*, defaults to 10):


@ -474,7 +474,7 @@ class ZoeDepthImageProcessor(BaseImageProcessor):
outputs_flipped ([`ZoeDepthDepthEstimatorOutput`], *optional*):
Raw outputs of the model from flipped input (averaged out in the end).
do_remove_padding (`bool`, *optional*):
By default ZoeDepth addes padding equal to `int((height / 2) * 3)` (and similarly for width) to fix the
By default ZoeDepth adds padding equal to `int((height / 2) * 3)` (and similarly for width) to fix the
boundary artifacts in the output depth map, so we need to remove this padding during post_processing. The
parameter exists here in case the user changed the image preprocessing to not include padding.


@ -37,7 +37,7 @@ class AwqQuantizer(HfQuantizer):
4-bit quantization for Activation-aware Weight Quantization (AWQ) (https://arxiv.org/abs/2306.00978)
"""
# AWQ requires data callibration - we support only inference
# AWQ requires data calibration - we support only inference
requires_calibration = True
required_packages = ["awq", "accelerate"]


@ -69,7 +69,7 @@ class CompressedTensorsHfQuantizer(HfQuantizer):
return missing_keys
# We expect some keys to be missing for
# compresed models
# compressed models
# This is fine as the weights are reconstructed by ModelCompressor
# in _process_model_after_weight_loading


@ -1728,14 +1728,14 @@ Please note that you may need to restart your runtime after installation.
# docstyle-ignore
LIBROSA_IMPORT_ERROR = """
{0} requires thes librosa library. But that was not found in your environment. You can install them with pip:
{0} requires the librosa library. But that was not found in your environment. You can install it with pip:
`pip install librosa`
Please note that you may need to restart your runtime after installation.
"""
# docstyle-ignore
PRETTY_MIDI_IMPORT_ERROR = """
{0} requires thes pretty_midi library. But that was not found in your environment. You can install them with pip:
{0} requires the pretty_midi library. But that was not found in your environment. You can install it with pip:
`pip install pretty_midi`
Please note that you may need to restart your runtime after installation.
"""


@ -1120,7 +1120,7 @@ class VptqLayerConfig(QuantizationConfigMixin):
group_size (`int`, *optional*, defaults to `-1`): depends on out-features
indices_as_float (`bool`, *optional*, defaults to `False`): for Finetuning
is_indice_packed (`bool`, *optional*, defaults to `True`): should always be True
num_centroids (`list`, *optional*, defaults to `[-1, -1]`): centriod numbers of clusters
num_centroids (`list`, *optional*, defaults to `[-1, -1]`): centroid numbers of clusters
num_res_centroids (`list`, *optional*, defaults to `[-1, -1]`): ditto for residual
outlier_size (`int`, *optional*, defaults to `1`): outliers
vector_lens (`list`, *optional*, defaults to `[-1, -1]`): centroid vector length in quantization


@ -146,7 +146,7 @@ class BertGenerationTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_tokenization_base_hard_symbols(self):
symbols = (
'This is a very long text with a lot of weird characters, such as: . , ~ ? ( ) " [ ] ! : - . Also we will'
" add words that should not exsist and be tokenized to <unk>, such as saoneuhaoesuth"
" add words that should not exist and be tokenized to <unk>, such as saoneuhaoesuth"
)
original_tokenizer_encodings = [
871,


@ -170,7 +170,7 @@ class BigBirdTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_tokenization_base_hard_symbols(self):
symbols = (
'This is a very long text with a lot of weird characters, such as: . , ~ ? ( ) " [ ] ! : - . Also we will'
" add words that should not exsist and be tokenized to <unk>, such as saoneuhaoesuth"
" add words that should not exist and be tokenized to <unk>, such as saoneuhaoesuth"
)
original_tokenizer_encodings = [65, 871, 419, 358, 946, 991, 2521, 452, 358, 1357, 387, 7751, 3536, 112, 985, 456, 126, 865, 938, 5400, 5734, 458, 1368, 467, 786, 2462, 5246, 1159, 633, 865, 4519, 457, 582, 852, 2557, 427, 916, 508, 405, 34324, 497, 391, 408, 11342, 1244, 385, 100, 938, 985, 456, 574, 362, 12597, 3200, 3129, 1172, 66] # fmt: skip
self.assertListEqual(original_tokenizer_encodings, self.big_tokenizer.encode(symbols))


@ -438,7 +438,7 @@ class BridgeTowerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestC
if self.has_attentions:
self.assertIsNotNone(attentions.grad)
# override as the `logit_scale` parameter initilization is different for BRIDGE TOWER
# override as the `logit_scale` parameter initialization is different for BRIDGE TOWER
def test_initialization(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()


@ -55,7 +55,7 @@ class BridgeTowerProcessorTest(ProcessorTesterMixin, unittest.TestCase):
def tearDownClass(cls):
shutil.rmtree(cls.tmpdirname, ignore_errors=True)
# Some kwargs tests are overriden from common tests to handle shortest_edge
# Some kwargs tests are overridden from common tests to handle shortest_edge
# and size_divisor behaviour
@require_torch


@ -924,7 +924,7 @@ class FlavaModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
def test_model_get_set_embeddings(self):
pass
# override as the `logit_scale` parameter initilization is different for FLAVA
# override as the `logit_scale` parameter initialization is different for FLAVA
def test_initialization(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
@ -933,7 +933,7 @@ class FlavaModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
model = model_class(config=configs_no_init)
for name, param in model.named_parameters():
if param.requires_grad:
# check if `logit_scale` is initilized as per the original implementation
# check if `logit_scale` is initialized as per the original implementation
if name == "logit_scale" or name == "flava.logit_scale":
self.assertAlmostEqual(
param.data.item(),


@ -137,7 +137,7 @@ class Gemma3ModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase
@unittest.skip(
reason="HybridCache can't be gathered because it is not iterable. Adding a simple iter and dumping `distributed_iterator`"
" as in Dynamic Cache doesnt work. NOTE: @gante all cache objects would need better compatibility with multi gpu setting"
" as in Dynamic Cache doesn't work. NOTE: @gante all cache objects would need better compatibility with multi gpu setting"
)
def test_multi_gpu_data_parallel_forward(self):
pass
@ -275,7 +275,7 @@ class Gemma3Vision2TextModelTest(ModelTesterMixin, GenerationTesterMixin, unitte
@unittest.skip(
reason="HybridCache can't be gathered because it is not iterable. Adding a simple iter and dumping `distributed_iterator`"
" as in Dynamic Cache doesnt work. NOTE: @gante all cache objects would need better compatibility with multi gpu setting"
" as in Dynamic Cache doesn't work. NOTE: @gante all cache objects would need better compatibility with multi gpu setting"
)
def test_multi_gpu_data_parallel_forward(self):
pass


@ -88,7 +88,7 @@ class Gemma3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
image = self.prepare_image_inputs()
# If text has no image tokens, iamge should be `None`
# If text has no image tokens, image should be `None`
with self.assertRaises(ValueError):
_ = processor(text=text_no_image, images=image, return_tensors="np")


@ -478,8 +478,8 @@ class GPTNeoModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMix
# the last 2 tokens are masked, and should have 0 attn_probs
self.assertTrue(torch.all(attn_probs[:, :, -mask_tokens:, -mask_tokens:] == 0))
# in loacal attention each token can only attend to the previous window_size tokens (including itself)
# here window_size is 4, so a token at index 5 can only attend to indcies [2, 3, 4, 5]
# in local attention each token can only attend to the previous window_size tokens (including itself)
# here window_size is 4, so a token at index 5 can only attend to indices [2, 3, 4, 5]
# and the attn_probs should be 0 for token [0, 1]
self.assertTrue(torch.all(attn_probs[:, :, 5, 2:6] != 0))
self.assertTrue(torch.all(attn_probs[:, :, 5, :2] == 0))
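As a sanity check on the window arithmetic, here is a small sketch that builds the same kind of sliding-window causal mask (the sequence length is arbitrary; `window_size = 4` as in the comment):

import torch

seq_len, window_size = 8, 4
i = torch.arange(seq_len).unsqueeze(1)  # query positions
j = torch.arange(seq_len).unsqueeze(0)  # key positions
# Token i may attend to j when j <= i (causal) and i - j < window_size (local).
mask = (j <= i) & (i - j < window_size)
print(mask[5].nonzero().flatten().tolist())  # [2, 3, 4, 5]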


@ -769,7 +769,7 @@ class GroundingDinoModelIntegrationTests(unittest.TestCase):
encoding1 = processor(images=image, text=text1, return_tensors="pt").to(torch_device)
encoding2 = processor(images=image, text=text2, return_tensors="pt").to(torch_device)
# If we batch the text and cross attention masking is working, the batched result should be equal to
# The singe text result
# the single text result
encoding_batched = processor(
images=[image] * len(text_batched), text=text_batched, padding="longest", return_tensors="pt"
).to(torch_device)


@ -658,7 +658,7 @@ class InstructBlipForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, Gene
def test_sdpa_can_dispatch_composite_models(self):
"""
Tests if composite models dispatch correctly on SDPA/eager when requested so when loading the model.
This tests only by looking at layer names, as usually SDPA layers are calles "SDPAAttention".
This tests only by looking at layer names, as usually SDPA layers are called "SDPAAttention".
In contrast to the above test, this one checks if "config._attn_implementation" is a dict after the model
is loaded, because we manually replicate requested attn implementation on each sub-config when loading.
See https://github.com/huggingface/transformers/pull/32238 for more info


@ -56,7 +56,7 @@ class LlavaConfigTest(unittest.TestCase):
def test_arbitrary_reload(self):
"""
Simple test for reloading arbirarily composed subconfigs
Simple test for reloading arbitrarily composed subconfigs
"""
default_values = LlavaConfig().to_diff_dict()
default_values["vision_config"]["model_type"] = "pixtral"


@ -553,8 +553,8 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
# image = Image.open(requests.get(url, stream=True).raw)
inputs = processor(text=PROMPT, images=IMG_URLS, return_tensors="pt").to(model.device)
generate_ids = model.generate(**inputs, max_new_tokens=500)
ouptut = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
print(ouptut)
output = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
print(output)
# fmt: off
EXPECTED_GENERATION = """
@ -573,7 +573,7 @@ These descriptions provide a detailed overview of the content and atmosphere of
"""
# fmt: on
# check that both inputs are handled correctly and generate the same output
self.assertEqual(ouptut, EXPECTED_GENERATION)
self.assertEqual(output, EXPECTED_GENERATION)
@slow
@require_bitsandbytes

Some files were not shown because too many files have changed in this diff.