Mirror of https://github.com/huggingface/transformers.git (synced 2025-07-03 12:50:06 +06:00)
Fix typos in strings and comments (#37799)
This commit is contained in: parent f466603963, commit d5fa7d2d19
@@ -1146,9 +1146,9 @@ def stft(frames: np.array, windowing_function: np.array, fft_window_size: Option
 tutorial]https://download.ni.com/evaluation/pxi/Understanding%20FFTs%20and%20Windowing.pdf
 fft_window_size (`int`, *optional*):
 Size of the window om which the Fourier transform is applied. This controls the frequency resolution of the
-spectrogram. 400 means that the fourrier transform is computed on windows of 400 samples. The number of
+spectrogram. 400 means that the fourier transform is computed on windows of 400 samples. The number of
 frequency bins (`nb_frequency_bins`) used to divide the window into equal strips is equal to
-`(1+fft_window_size)//2`. An increase of the fft_window_size slows the calculus time proportionnally.
+`(1+fft_window_size)//2`. An increase of the fft_window_size slows the calculus time proportionally.

 Example:

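A minimal sketch (not part of the diff; assuming NumPy) of how the `fft_window_size` described above relates to the number of frequency bins and to the FFT cost:

    import numpy as np

    fft_window_size = 400
    frame = np.random.randn(fft_window_size)
    window = np.hanning(fft_window_size)          # any windowing function

    spectrum = np.fft.rfft(frame * window)
    print(spectrum.shape)                         # (201,) == fft_window_size // 2 + 1 bins
    print((1 + fft_window_size) // 2)             # 200, the figure quoted in the docstring

Doubling `fft_window_size` doubles the window length the FFT runs over, which is the "slows the calculus time proportionally" remark in the hunk.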
@@ -850,7 +850,7 @@ class ConstrainedBeamSearchScorer(BeamScorer):
 beam_hyp.add(final_tokens, final_score, beam_indices=beam_index, generated_len=generated_len)
 ids_collect.append(beam_id)

-# due to overly complex constraints or other factors, sometimes we can't gaurantee a successful
+# due to overly complex constraints or other factors, sometimes we can't guarantee a successful
 # generation. In these cases we simply return the highest scoring outputs.
 if len(ids_collect) < self.num_beam_hyps_to_keep:
 for beam_id in range(self.num_beams):
@@ -192,7 +192,7 @@ class GenerationConfig(PushToHubMixin):
 our [cache documentation](https://huggingface.co/docs/transformers/en/kv_cache) for further information.
 cache_config (`CacheConfig` or `dict`, *optional*, default to `None`):
 Arguments used in the key-value cache class can be passed in `cache_config`. Can be passed as a `Dict` and
-it will be converted to its repsective `CacheConfig` internally.
+it will be converted to its respective `CacheConfig` internally.
 Otherwise can be passed as a `CacheConfig` class matching the indicated `cache_implementation`.
 return_legacy_cache (`bool`, *optional*, default to `True`):
 Whether to return the legacy or new format of the cache when `DynamicCache` is used by default.
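A hedged sketch of the `cache_config` behaviour described in the hunk above: a plain dict is accepted and converted to the matching `CacheConfig` class internally (the exact keys below follow the quantized-cache documentation and should be treated as an assumption):

    from transformers import GenerationConfig

    generation_config = GenerationConfig(
        cache_implementation="quantized",
        cache_config={"backend": "quanto", "nbits": 4},  # dict -> QuantizedCacheConfig internally
    )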
@@ -235,7 +235,7 @@ class GenerationConfig(PushToHubMixin):
 The parameter for repetition penalty. 1.0 means no penalty. See [this
 paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
 encoder_repetition_penalty (`float`, *optional*, defaults to 1.0):
-The paramater for encoder_repetition_penalty. An exponential penalty on sequences that are not in the
+The parameter for encoder_repetition_penalty. An exponential penalty on sequences that are not in the
 original input. 1.0 means no penalty.
 length_penalty (`float`, *optional*, defaults to 1.0):
 Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to
@@ -385,7 +385,7 @@ class GenerationConfig(PushToHubMixin):
 inference.
 disable_compile (`bool`, *optional*):
 Whether to disable the automatic compilation of the forward pass. Automatic compilation happens when
-specific criteria are met, including using a compileable cache. Please open an issue if you find the
+specific criteria are met, including using a compilable cache. Please open an issue if you find the
 need to use this flag.

 > Wild card
@@ -710,7 +710,7 @@ class GenerationConfig(PushToHubMixin):
 UserWarning,
 )

-# 3. detect incorrect paramaterization specific to advanced beam modes
+# 3. detect incorrect parameterization specific to advanced beam modes
 else:
 # constrained beam search
 if self.constraints is not None or self.force_words_ids is not None:
@@ -271,7 +271,7 @@ class FlaxMinLengthLogitsProcessor(FlaxLogitsProcessor):

 class FlaxSuppressTokensAtBeginLogitsProcessor(FlaxLogitsProcessor):
 r"""
-[`FlaxLogitsProcessor`] supressing a list of tokens as soon as the `generate` function starts generating using
+[`FlaxLogitsProcessor`] suppressing a list of tokens as soon as the `generate` function starts generating using
 `begin_index` tokens. This should ensure that the tokens defined by `begin_suppress_tokens` are not sampled at the
 beginning of the generation.

@@ -543,7 +543,7 @@ class TopKLogitsWarper(LogitsProcessor):
 class MinPLogitsWarper(LogitsProcessor):
 """
 [`LogitsProcessor`] that performs min-p, i.e. keeps all tokens that are above a minimum probability, scaled by the
-probability of the most likely token. As a result, the filter becomes more agressive in the presence of
+probability of the most likely token. As a result, the filter becomes more aggressive in the presence of
 high-probability tokens, which is a sign of a confident output that we shouldn't deviate from.

 Often used together with [`TemperatureLogitsWarper`]. Used as an alternative to [`TopPLogitsWarper`] and
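A minimal sketch (not the library implementation) of the min-p rule the docstring above describes, assuming PyTorch:

    import torch

    def min_p_filter(logits: torch.Tensor, min_p: float = 0.1) -> torch.Tensor:
        # Keep tokens whose probability is at least min_p * p(most likely token);
        # the cutoff therefore tightens when the model is confident.
        probs = torch.softmax(logits, dim=-1)
        threshold = min_p * probs.max(dim=-1, keepdim=True).values
        return logits.masked_fill(probs < threshold, float("-inf"))

    filtered = min_p_filter(torch.randn(1, 32), min_p=0.1)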
@@ -738,7 +738,7 @@ class EpsilonLogitsWarper(LogitsProcessor):

 >>> # With epsilon sampling, the output gets restricted to high-probability tokens. Note that this is similar to
 >>> # Top P sampling, which restricts tokens based on their cumulative probability.
->>> # Pro tip: The paper recomends using `epsilon_cutoff` values between 3e-4 and 9e-4
+>>> # Pro tip: The paper recommends using `epsilon_cutoff` values between 3e-4 and 9e-4
 >>> outputs = model.generate(**inputs, do_sample=True, epsilon_cutoff=0.1)
 >>> print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
 A sequence: 1, 2, 3, 4, 5, 6, 7, 8, 9
@@ -819,7 +819,7 @@ class EtaLogitsWarper(LogitsProcessor):

 >>> # With eta sampling, the output gets restricted to high-probability tokens. You can see it as a dynamic form of
 >>> # epsilon sampling that adapts its cutoff probability based on the entropy (high entropy = lower cutoff).
->>> # Pro tip: The paper recomends using `eta_cutoff` values between 3e-4 to 4e-3
+>>> # Pro tip: The paper recommends using `eta_cutoff` values between 3e-4 to 4e-3
 >>> outputs = model.generate(**inputs, do_sample=True, eta_cutoff=0.1)
 >>> print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
 A sequence: 1, 2, 3, 4, 5, 6, 7, 8, 9
@@ -1348,7 +1348,7 @@ class PrefixConstrainedLogitsProcessor(LogitsProcessor):
 >>> print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
 Alice and Bob are friends

->>> # We can contrain it with `prefix_allowed_tokens_fn` to force a certain behavior based on a prefix.
+>>> # We can constrain it with `prefix_allowed_tokens_fn` to force a certain behavior based on a prefix.
 >>> # For instance, we can force an entire entity to be generated when its beginning is detected.
 >>> entity = tokenizer(" Bob Marley", return_tensors="pt").input_ids[0] # 3 tokens
 >>> def prefix_allowed_tokens_fn(batch_id, input_ids):
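The hunk above cuts off at the function definition; a plausible continuation of such a constraint function (illustrative only, reusing the `entity`, `tokenizer`, `model` and `inputs` names from the surrounding docstring example) looks like:

    def prefix_allowed_tokens_fn(batch_id, input_ids):
        # Force " Bob Marley" to be completed once " Bob" has been generated.
        if input_ids[-1] == entity[0]:
            return [entity[1].item()]
        elif input_ids[-2] == entity[0] and input_ids[-1] == entity[1]:
            return [entity[2].item()]
        return list(range(tokenizer.vocab_size))  # otherwise leave generation unconstrained

    outputs = model.generate(**inputs, prefix_allowed_tokens_fn=prefix_allowed_tokens_fn)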
@@ -1791,7 +1791,7 @@ class LogitNormalization(LogitsProcessor):

 class SuppressTokensAtBeginLogitsProcessor(LogitsProcessor):
 r"""
-[`SuppressTokensAtBeginLogitsProcessor`] supresses a list of tokens as soon as the `generate` function starts
+[`SuppressTokensAtBeginLogitsProcessor`] suppresses a list of tokens as soon as the `generate` function starts
 generating using `begin_index` tokens. This should ensure that the tokens defined by `begin_suppress_tokens` are
 not generated at the beginning. Originally created for
 [Whisper](https://huggingface.co/docs/transformers/model_doc/whisper).
@@ -2642,7 +2642,7 @@ class SynthIDTextWatermarkLogitsProcessor(LogitsProcessor):
 We assume that the scores are in the log space.
 Args:
 scores (`torch.FloatTensor`): Scores (batch_size, vocab_size).
-g_values (`torch.FloatTensor`): G valus (batch_size, vocab_size, depth).
+g_values (`torch.FloatTensor`): G values (batch_size, vocab_size, depth).

 Returns:
 Updated scores (batch_size, vocab_size).
@@ -2668,7 +2668,7 @@ class SynthIDTextWatermarkLogitsProcessor(LogitsProcessor):
 if self.debug_mode:
 scores = torch.ones_like(scores)

-# Currently indices is just a arange to compute watermarking on the desnse logits.
+# Currently indices is just a arange to compute watermarking on the dense logits.
 all_indices = torch.stack([torch.arange(vocab_size, device=self.device) for _ in range(batch_size)])

 if self.state is None:
@@ -343,7 +343,7 @@ class TFNoBadWordsLogitsProcessor(TFLogitsProcessor):
 )

 def _match_found():
-# Finaly, runs the actual comparison. Can only be called if the previous comparisons do not yield
+# Finally, runs the actual comparison. Can only be called if the previous comparisons do not yield
 # an answer (otherwise we get indexing exceptions)
 compare_len = self.bad_word_seqs_len[bad_word_seq_number] - 1
 return tf.cond(
@@ -962,7 +962,7 @@ class TFGenerationMixin:
 raise ValueError(
 "Beam search decoding cannot return more sequences than it has beams. Please set num_beams >="
 f" num_return_sequences, got {generation_config.num_beams} and"
-f" {generation_config.num_return_sequences} (respectivelly)"
+f" {generation_config.num_return_sequences} (respectively)"
 )

 # 11. broadcast inputs to the desired number of beams
@@ -994,7 +994,7 @@ class TFGenerationMixin:
 raise ValueError(
 "Beam search decoding cannot return more sequences than it has beams. Please set num_beams >="
 f" num_return_sequences, got {generation_config.num_beams} and"
-f" {generation_config.num_return_sequences} (respectivelly)"
+f" {generation_config.num_return_sequences} (respectively)"
 )

 # 11. prepare logits warper
@@ -1626,7 +1626,7 @@ class TFGenerationMixin:
 )
 use_cache = model_kwargs.pop("use_cache", self.generation_config.use_cache)
 use_xla = not tf.executing_eagerly()
-# TODO (Joao): fix cache format or find programatic way to detect cache index
+# TODO (Joao): fix cache format or find programmatic way to detect cache index
 # GPT2 and other models has a slightly different cache structure, with a different batch axis
 model_name = str(self.decoder) if "EncoderDecoder" in str(self) else str(self)
 cache_batch_axis = 1 if any(model_prefix in model_name for model_prefix in ("TFGPT2", "TFCTRL")) else 0
@@ -1910,7 +1910,7 @@ class TFGenerationMixin:
 )
 use_cache = model_kwargs.pop("use_cache", self.generation_config.use_cache)
 use_xla = not tf.executing_eagerly()
-# TODO (Joao): fix cache format or find programatic way to detect cache index
+# TODO (Joao): fix cache format or find programmatic way to detect cache index
 # GPT2 and other models has a slightly different cache structure, with a different batch axis
 model_name = str(self.decoder) if "EncoderDecoder" in str(self) else str(self)
 cache_batch_axis = 1 if any(model_prefix in model_name for model_prefix in ("TFGPT2", "TFCTRL")) else 0
@@ -2253,7 +2253,7 @@ class TFGenerationMixin:

 use_cache = model_kwargs.pop("use_cache", self.generation_config.use_cache)
 use_xla = not tf.executing_eagerly()
-# TODO (Joao): fix cache format or find programatic way to detect cache index
+# TODO (Joao): fix cache format or find programmatic way to detect cache index
 # GPT2 and other models has a slightly different cache structure, with a different batch axis
 model_name = str(self.decoder) if "EncoderDecoder" in str(self) else str(self)
 cache_batch_axis = 1 if any(model_prefix in model_name for model_prefix in ("TFGPT2", "TFCTRL")) else 0
@@ -2788,7 +2788,7 @@ class TFGenerationMixin:
 model_kwargs.pop("use_cache", None)

 use_xla = not tf.executing_eagerly()
-# TODO (Joao): fix cache format or find programatic way to detect cache index
+# TODO (Joao): fix cache format or find programmatic way to detect cache index
 # GPT2 and other models has a slightly different cache structure, with a different batch axis
 model_name = str(self.decoder) if "EncoderDecoder" in str(self) else str(self)
 cache_batch_axis = 1 if any(model_prefix in model_name for model_prefix in ("TFGPT2", "TFCTRL")) else 0
@@ -362,7 +362,7 @@ class GenerationMixin:
 inherit from `GenerationMixin` to benefit from all generation-related automation in our codebase;
 - `BarkModel` has a custom `generate` method and one of its inner models calls `GenerationMixin.generate`.
 However, its `generate` does not share the same interface as `GenerationMixin.generate`. In this case,
-`BarkModel` shoud NOT inherit from `GenerationMixin`, as it breaks the `generate` interface.
+`BarkModel` should NOT inherit from `GenerationMixin`, as it breaks the `generate` interface.

 The class exposes [`~generation.GenerationMixin.generate`], which can be used for:
 - *greedy decoding* if `num_beams=1` and `do_sample=False`
@@ -392,7 +392,7 @@ class GenerationMixin:
 - Exception 1: when passing input_embeds, input_ids may be missing entries
 - Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
 - Exception 3: with synced GPUs cache_position may go out of bounds, but we only want dummy token in that case.
-- Excpetion 4: If input_embeds are passed then slice it through `cache_position`, to keep only the unprocessed tokens and
+- Exception 4: If input_embeds are passed then slice it through `cache_position`, to keep only the unprocessed tokens and
 generate the first token for each sequence. Later use the generated Input ids for continuation.

 The current implementation does not rely on ``self`` and could be
@@ -967,7 +967,7 @@ class GenerationMixin:
 assistant_model=assistant_model,
 assistant_prune_lm_head=True, # prune LM head of assistant model
 )
-# Since we prune the LM head, we cannot use the repetition penalty on the assistant model due to mismaches between token ids and logits index
+# Since we prune the LM head, we cannot use the repetition penalty on the assistant model due to mismatches between token ids and logits index
 assistant_model.generation_config.repetition_penalty = None
 candidate_generator = UniversalSpeculativeDecodingGenerator(
 input_ids=input_ids,
@@ -171,7 +171,7 @@ def find_tied_parameters(model: "nn.Module", **kwargs):
 ```
 """

-# get ALL model parameters and thier names
+# get ALL model parameters and their names
 all_named_parameters = dict(model.named_parameters(remove_duplicate=False))

 # get ONLY unique named parameters,
@@ -187,7 +187,7 @@ def find_tied_parameters(model: "nn.Module", **kwargs):
 for tied_param_name in tied_param_names:
 tied_param = all_named_parameters[tied_param_name]
 for param_name, param in no_duplicate_named_parameters.items():
-# compare if parameters are the same, if so, group thier names together
+# compare if parameters are the same, if so, group their names together
 if param is tied_param:
 if param_name not in tied_param_groups:
 tied_param_groups[param_name] = []
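A hedged, self-contained sketch of the tying detection the two hunks above describe: parameters that share storage are grouped by identity, which is what the `param is tied_param` comparison does.

    import torch
    from torch import nn

    model = nn.Linear(4, 4)
    model.weight2 = model.weight                       # artificially tie a parameter for the demo

    all_named = dict(model.named_parameters(remove_duplicate=False))
    unique_named = dict(model.named_parameters())      # duplicates removed
    tied_names = set(all_named) - set(unique_named)

    tied_param_groups = {}
    for tied_name in tied_names:
        tied_param = all_named[tied_name]
        for name, param in unique_named.items():
            if param is tied_param:                    # same tensor object => tied weights
                tied_param_groups.setdefault(name, []).append(tied_name)
    print(tied_param_groups)                           # {'weight': ['weight2']}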
@@ -329,7 +329,7 @@ class TorchExportableModuleWithStaticCache(torch.nn.Module):
 This util function is designed to test exported models by simulating the generation process.
 It processes the input prompt tokens sequentially (no parallel prefill).
 This generate function is not intended to replace the original `generate` method, and the support
-for leveraging the original `generate` is potentially planed!
+for leveraging the original `generate` is potentially planned!

 Args:
 exported_program (`torch.export.ExportedProgram`): The exported program generated via `torch.export`.
@@ -28,7 +28,7 @@ def autoname_modules(model):
 module.name = name


-# Get the linear_tag from a modul name. For example: model.layers.31.self_attn.k_proj -> self_attn.k_proj
+# Get the linear_tag from a module name. For example: model.layers.31.self_attn.k_proj -> self_attn.k_proj
 def name_to_linear_tag(name):
 return ".".join([n for n in name.split(".") if ((n not in ["model", "layers"]) and (not n.isnumeric()))])

@@ -86,9 +86,9 @@ def prepare_for_hqq_linear(model, quantization_config=None, modules_to_not_conve
 """
 Prepares nn.Linear layers for HQQ quantization.
 Since each layer type can have separate quantization parameters, we need to do the following:
-1- tag each module with its neme via autoname_modules()
+1- tag each module with its name via autoname_modules()
 2- Extract linear_tags (e.g. ['self_attn.q_proj', ...])
-3- Map quantization parameters as a dictionary linear_tag -> quant_params as HQQLinear exepects it, this is referred to as patch_params
+3- Map quantization parameters as a dictionary linear_tag -> quant_params as HQQLinear expects it, this is referred to as patch_params
 """

 modules_to_not_convert = [] if modules_to_not_convert is None else modules_to_not_convert
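A quick usage note on `name_to_linear_tag` as quoted above: it strips the "model"/"layers" prefixes and numeric indices so that layers of the same type share one tag (and therefore one set of quantization parameters):

    print(name_to_linear_tag("model.layers.31.self_attn.k_proj"))  # -> "self_attn.k_proj"
    print(name_to_linear_tag("model.layers.0.mlp.gate_proj"))      # -> "mlp.gate_proj"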
@@ -160,7 +160,7 @@ def distribute_module(
 output_fn=None,
 ) -> nn.Module:
 """
-Copy pasted from torch's function but we remove the communications (partitionning)
+Copy pasted from torch's function but we remove the communications (partitioning)
 as well as buffer registering that is similarly not efficient.
 """
 if len(module._forward_pre_hooks) == 0:
@@ -225,7 +225,7 @@ class GatherParallel(TensorParallelLayer):

 @staticmethod
 def _prepare_output_fn(output_layouts, use_local_output, mod, outputs, device_mesh):
-# this op cannot be asynch, otherwise it completely breaks the outputs of models
+# this op cannot be async, otherwise it completely breaks the outputs of models
 torch.distributed.all_reduce(outputs[0], op=torch.distributed.ReduceOp.SUM, async_op=False)
 return outputs

@@ -343,7 +343,7 @@ class HungarianMatcher(nn.Module):

 # Compute the classification cost. Contrary to the loss, we don't use the NLL,
 # but approximate it in 1 - proba[target class].
-# The 1 is a constant that doesn't change the matching, it can be ommitted.
+# The 1 is a constant that doesn't change the matching, it can be omitted.
 class_cost = -out_prob[:, target_ids]

 # Compute the L1 cost between boxes
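A small sanity check of the comment above (illustrative, assuming SciPy): using -p instead of 1 - p shifts every entry of the cost matrix by a constant, so the optimal assignment is unchanged.

    import numpy as np
    from scipy.optimize import linear_sum_assignment

    probs = np.random.rand(3, 3)                 # probs[i, j]: prob that query i has target class j
    rows_a, cols_a = linear_sum_assignment(1 - probs)
    rows_b, cols_b = linear_sum_assignment(-probs)
    assert (cols_a == cols_b).all()              # same Hungarian matching either way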
@@ -99,7 +99,7 @@ class RTDetrHungarianMatcher(nn.Module):
 target_bbox = torch.cat([v["boxes"] for v in targets])
 # Compute the classification cost. Contrary to the loss, we don't use the NLL,
 # but approximate it in 1 - proba[target class].
-# The 1 is a constant that doesn't change the matching, it can be ommitted.
+# The 1 is a constant that doesn't change the matching, it can be omitted.
 if self.use_focal_loss:
 out_prob = F.sigmoid(outputs["logits"].flatten(0, 1))
 out_prob = out_prob[:, target_ids]
@@ -593,7 +593,7 @@ class AlignVisionBlock(nn.Module):

 class AlignVisionEncoder(nn.Module):
 r"""
-Forward propogates the embeddings through each vision encoder (EfficientNet) block.
+Forward propagates the embeddings through each vision encoder (EfficientNet) block.

 Args:
 config ([`AlignVisionConfig`]):
@@ -36,7 +36,7 @@ class AlignProcessorKwargs(ProcessingKwargs, total=False):
 class AlignProcessor(ProcessorMixin):
 r"""
 Constructs an ALIGN processor which wraps [`EfficientNetImageProcessor`] and
-[`BertTokenizer`]/[`BertTokenizerFast`] into a single processor that interits both the image processor and
+[`BertTokenizer`]/[`BertTokenizerFast`] into a single processor that inherits both the image processor and
 tokenizer functionalities. See the [`~AlignProcessor.__call__`] and [`~OwlViTProcessor.decode`] for more
 information.
 The preferred way of passing kwargs is as a dictionary per modality, see usage example below.
@@ -1936,7 +1936,7 @@ class AutoformerForPrediction(AutoformerPreTrainedModel):
 params = None
 if future_values is not None:
 # outputs.last_hidden_state and trend
-# loc is 4rd last and scale is 3rd last output
+# loc is 4th last and scale is 3rd last output
 params = self.output_params(outputs[0] + outputs[1])
 distribution = self.output_distribution(params, loc=outputs[-3], scale=outputs[-2])

@@ -164,7 +164,7 @@ def convert_checkpoint_to_pytorch(tf_checkpoint_path: str, config_path: str, pyt
 new_model = BertForMaskedLM.from_pretrained(pytorch_dump_path)
 print(new_model.eval())

-print("Model conversion was done sucessfully!")
+print("Model conversion was done successfully!")


 if __name__ == "__main__":
@@ -235,7 +235,7 @@ class Blip2Config(PretrainedConfig):
 num_query_tokens (`int`, *optional*, defaults to 32):
 The number of query tokens passed through the Transformer.
 image_text_hidden_size (`int`, *optional*, defaults to 256):
-Dimentionality of the hidden state of the image-text fusion layer.
+Dimensionality of the hidden state of the image-text fusion layer.

 image_token_index (`int`, *optional*):
 Token index of special image token.
@@ -899,7 +899,7 @@ class BloomForCausalLM(BloomPreTrainedModel, GenerationMixin):
 use_cache=True,
 **kwargs,
 ):
-# Overwriten because of the fixed-shape attention mask creation
+# Overwritten because of the fixed-shape attention mask creation

 # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
 # Exception 1: when passing input_embeds, input_ids may be missing entries
@@ -49,17 +49,17 @@ class ClapFeatureExtractor(SequenceFeatureExtractor):
 The sampling rate at which the audio files should be digitalized expressed in hertz (Hz). This only serves
 to warn users if the audio fed to the feature extractor does not have the same sampling rate.
 hop_length (`int`,*optional*, defaults to 480):
-Length of the overlaping windows for the STFT used to obtain the Mel Spectrogram. The audio will be split
+Length of the overlapping windows for the STFT used to obtain the Mel Spectrogram. The audio will be split
 in smaller `frames` with a step of `hop_length` between each frame.
 max_length_s (`int`, *optional*, defaults to 10):
 The maximum input length of the model in seconds. This is used to pad the audio.
 fft_window_size (`int`, *optional*, defaults to 1024):
 Size of the window (in samples) on which the Fourier transform is applied. This controls the frequency
-resolution of the spectrogram. 400 means that the fourrier transform is computed on windows of 400 samples.
+resolution of the spectrogram. 400 means that the fourier transform is computed on windows of 400 samples.
 padding_value (`float`, *optional*, defaults to 0.0):
 Padding value used to pad the audio. Should correspond to silences.
 return_attention_mask (`bool`, *optional*, defaults to `False`):
-Whether or not the model should return the attention masks coresponding to the input.
+Whether or not the model should return the attention masks corresponding to the input.
 frequency_min (`float`, *optional*, defaults to 0):
 The lowest frequency of interest. The STFT will not be computed for values below this.
 frequency_max (`float`, *optional*, defaults to 14000):
@@ -141,7 +141,7 @@ class ClapFeatureExtractor(SequenceFeatureExtractor):
 Serializes this instance to a Python dictionary.

 Returns:
-`Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance, excpet for the
+`Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance, except for the
 mel filter banks, which do not need to be saved or printed as they are too long.
 """
 output = copy.deepcopy(self.__dict__)
@@ -1067,7 +1067,7 @@ CLAP_TEXT_INPUTS_DOCSTRING = r"""
 CLAP_AUDIO_INPUTS_DOCSTRING = r"""
 Args:
 input_features (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
-Input audio features. This should be returnes by the [`ClapFeatureExtractor`] class that you can also
+Input audio features. This should be returned by the [`ClapFeatureExtractor`] class that you can also
 retrieve from [`AutoFeatureExtractor`]. See [`ClapFeatureExtractor.__call__`] for details.
 is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*):
 Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance
@@ -1105,7 +1105,7 @@ CLAP_INPUTS_DOCSTRING = r"""

 [What are position IDs?](../glossary#position-ids)
 input_features (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
-Input audio features. This should be returnes by the [`ClapFeatureExtractor`] class that you can also
+Input audio features. This should be returned by the [`ClapFeatureExtractor`] class that you can also
 retrieve from [`AutoFeatureExtractor`]. See [`ClapFeatureExtractor.__call__`] for details.
 return_loss (`bool`, *optional*):
 Whether or not to return the contrastive loss.
@@ -127,7 +127,7 @@ class CodeGenTokenizerFast(PreTrainedTokenizerFast):
 if kwargs.pop("add_bos_token", False):
 model_id = kwargs.pop("name_or_path", "")
 raise ValueError(
-"Currenty GPT2's fast tokenizer does NOT support adding a BOS token. "
+"Currently GPT2's fast tokenizer does NOT support adding a BOS token. "
 "Instead you should use GPT2's slow tokenizer class `CodeGenTokenizer` as follows: \n"
 f"`CodeGenTokenizer.from_pretrained('{model_id}')`\nor\n"
 f"`AutoTokenizer.from_pretrained('{model_id}', use_fast=False)`\n"
@@ -277,7 +277,7 @@ def final():

 def convert_cvt_checkpoint(cvt_model, image_size, cvt_file_name, pytorch_dump_folder):
 """
-Fucntion to convert the microsoft cvt checkpoint to huggingface checkpoint
+Function to convert the microsoft cvt checkpoint to huggingface checkpoint
 """
 img_labels_file = "imagenet-1k-id2label.json"
 num_labels = 1000
@@ -58,7 +58,7 @@ ORIGINAL_TO_CONVERTED_KEY_MAPPING = {
 # activation function weight
 r"transformer\.encoder\.layers\.(\d+)\.activation\.weight": r"encoder.layers.\1.activation_fn.weight",
 #########################################################################################################################################
-# decoder layers: 2 times output projection, 2 feedforward neural networks and 3 layernorms + activiation function weight
+# decoder layers: 2 times output projection, 2 feedforward neural networks and 3 layernorms + activation function weight
 r"transformer\.decoder\.layers\.(\d+)\.self_attn\.out_proj\.(bias|weight)": r"decoder.layers.\1.self_attn.self_attn.output_proj.\2",
 r"transformer\.decoder\.layers\.(\d+)\.cross_attn\.out_proj\.(bias|weight)": r"decoder.layers.\1.cross_attn.cross_attn.output_proj.\2",
 # FFNs
@@ -144,7 +144,7 @@ def write_model(model_name, pretrained_model_weights_path, pytorch_dump_folder_p
 config.label2id = {v: k for k, v in id2label.items()}
 # load original model from local path
 loaded = torch.load(pretrained_model_weights_path, map_location=torch.device("cpu"), weights_only=True)["model"]
-# Renaming the original model state dictionary to HF compatibile
+# Renaming the original model state dictionary to HF compatible
 all_keys = list(loaded.keys())
 new_keys = convert_old_keys_to_new_keys(all_keys)
 state_dict = {}
@@ -1297,7 +1297,7 @@ class JukeboxConditionalAutoregressive(nn.Module):
 ):
 """
 Autoregressive model on either lyric tokens or music tokens, or both. The attention pattern should be properly
-set fro each configuration.
+set for each configuration.

 Args:
 config (`JukeboxPriorConfig`):
@@ -142,7 +142,7 @@ def merge_patches(patches: torch.Tensor, batch_size: int, padding: int) -> torch
 return patches

 if n_patches_per_batch < 4:
-# for each batch, atleast 4 small patches are required to
+# for each batch, at least 4 small patches are required to
 # recreate a large square patch from merging them and later padding is applied
 # 3 x (8x8) patches becomes 1 x ( 8x8 ) patch (extra patch ignored, no padding)
 # 4 x (8x8) patches becomes 1 x (16x16) patch (padding later)
@@ -118,7 +118,7 @@ class Embeddings(nn.Module):

 # Setting the position-ids to the registered buffer in constructor, it helps
 # when tracing the model without passing position-ids, solves
-# isues similar to issue #5664
+# issues similar to issue #5664
 if hasattr(self, "position_ids"):
 position_ids = self.position_ids[:, :seq_length]
 else:
@@ -72,7 +72,7 @@ class DonutImageProcessor(BaseImageProcessor):
 Whether to align the long axis of the image with the long axis of `size` by rotating by 90 degrees.
 do_pad (`bool`, *optional*, defaults to `True`):
 Whether to pad the image. If `random_padding` is set to `True` in `preprocess`, each image is padded with a
-random amont of padding on each size, up to the largest image size in the batch. Otherwise, all images are
+random amount of padding on each size, up to the largest image size in the batch. Otherwise, all images are
 padded to the largest image size in the batch.
 do_rescale (`bool`, *optional*, defaults to `True`):
 Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
@@ -349,7 +349,7 @@ class DonutImageProcessor(BaseImageProcessor):
 Whether to align the long axis of the image with the long axis of `size` by rotating by 90 degrees.
 do_pad (`bool`, *optional*, defaults to `self.do_pad`):
 Whether to pad the image. If `random_padding` is set to `True`, each image is padded with a random
-amont of padding on each size, up to the largest image size in the batch. Otherwise, all images are
+amount of padding on each size, up to the largest image size in the batch. Otherwise, all images are
 padded to the largest image size in the batch.
 random_padding (`bool`, *optional*, defaults to `self.random_padding`):
 Whether to use random padding when padding the image. If `True`, each image in the batch with be padded
@@ -142,7 +142,7 @@ class FastSpeech2ConformerConfig(PretrainedConfig):
 speaker id embedding layer.
 num_languages (`int`, *optional*):
 Number of languages. If set to > 1, assume that the language ids will be provided as the input and use the
-languge id embedding layer.
+language id embedding layer.
 speaker_embed_dim (`int`, *optional*):
 Speaker embedding dimension. If set to > 0, assume that speaker_embedding will be provided as the input.
 is_encoder_decoder (`bool`, *optional*, defaults to `True`):
@@ -391,7 +391,7 @@ class FastSpeech2ConformerVariancePredictor(nn.Module):
 dropout_rate=0.5,
 ):
 """
-Initilize variance predictor module.
+Initialize variance predictor module.

 Args:
 input_dim (`int`): Input dimension.
@@ -948,7 +948,7 @@ class FlaubertModel(FlaubertPreTrainedModel):

 # Setting the position-ids to the registered buffer in constructor, it helps
 # when tracing the model without passing position-ids, solves
-# isues similar to issue #5664
+# issues similar to issue #5664
 if position_ids is None:
 if hasattr(self, "position_ids"):
 position_ids = self.position_ids[:, :slen]
@@ -360,7 +360,7 @@ class FocalNetModulation(nn.Module):
 x = self.projection_in(hidden_state).permute(0, 3, 1, 2).contiguous()
 q, ctx, gates = torch.split(x, (num_channels, num_channels, self.focal_level + 1), 1)

-# context aggreation
+# context aggregation
 ctx_all = 0
 for level in range(self.focal_level):
 ctx = self.focal_layers[level](ctx)
@@ -379,7 +379,7 @@ class FocalNetModulation(nn.Module):
 if self.use_post_layernorm_in_modulation:
 x_out = self.layernorm(x_out)

-# post linear porjection
+# post linear projection
 x_out = self.projection_out(x_out)
 x_out = self.projection_dropout(x_out)
 return x_out
@@ -415,7 +415,7 @@ class FocalNetLayer(nn.Module):
 dim (`int`):
 Number of input channels.
 input_resolution (`Tuple[int]`):
-Input resulotion.
+Input resolution.
 drop_path (`float`, *optional*, defaults to 0.0):
 Stochastic depth rate.
 """
@@ -244,7 +244,7 @@ def _tokenize_prompts_with_image_and_batch(
 - pad all the sequences to this length so we can convert them into a 3D tensor.
 """

-# If not tool use, tranform the coordinates while tokenizing
+# If not tool use, transform the coordinates while tokenizing
 if scale_factors is not None:
 transformed_prompt_tokens = []
 for prompt_seq, scale_factor_seq in zip(prompts, scale_factors):
@@ -96,7 +96,7 @@ class Gemma3TextConfig(PretrainedConfig):
 Scaling factor when applying tanh softcapping on the attention scores.
 cache_implementation (`str`, *optional*, defaults to `"hybrid"`): the cache type to be used with `generate`.
 rope_scaling (`Dict`, *optional*):
-Dictionary containing the scaling configuration for the RoPE embeddings used in gloabl attention. NOTE: if you apply new rope type
+Dictionary containing the scaling configuration for the RoPE embeddings used in global attention. NOTE: if you apply new rope type
 and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
 accordingly.
 Expected contents:
@@ -140,7 +140,7 @@ class Gemma3ImageProcessor(BaseImageProcessor):
 ):
 """
 Pan and Scan and image, by cropping into smaller images when the aspect ratio exceeds
-minumum allowed ratio.
+minimum allowed ratio.

 Args:
 image (`np.ndarray`):
@@ -108,7 +108,7 @@ class Gemma3ImageProcessorFast(BaseImageProcessorFast):
 ):
 """
 Pan and Scan an image, by cropping into smaller images when the aspect ratio exceeds
-minumum allowed ratio.
+minimum allowed ratio.

 Args:
 image (`torch.Tensor`):
@@ -1270,7 +1270,7 @@ class Gemma3ForConditionalGeneration(Gemma3PreTrainedModel, GenerationMixin):

 is_training = token_type_ids is not None and labels is not None

-# Replace image id woth PAD if the image token if OOV, to avoid index-errors
+# Replace image id with PAD if the image token if OOV, to avoid index-errors
 if input_ids is not None and self.config.image_token_id >= self.vocab_size:
 special_image_mask = input_ids == self.config.image_token_id
 llm_input_ids = input_ids.clone()
@@ -128,7 +128,7 @@ class Gemma3TextConfig(Gemma2Config):
 Scaling factor when applying tanh softcapping on the attention scores.
 cache_implementation (`str`, *optional*, defaults to `"hybrid"`): the cache type to be used with `generate`.
 rope_scaling (`Dict`, *optional*):
-Dictionary containing the scaling configuration for the RoPE embeddings used in gloabl attention. NOTE: if you apply new rope type
+Dictionary containing the scaling configuration for the RoPE embeddings used in global attention. NOTE: if you apply new rope type
 and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
 accordingly.
 Expected contents:
@@ -926,7 +926,7 @@ class Gemma3ForConditionalGeneration(PaliGemmaForConditionalGeneration):

 is_training = token_type_ids is not None and labels is not None

-# Replace image id woth PAD if the image token if OOV, to avoid index-errors
+# Replace image id with PAD if the image token if OOV, to avoid index-errors
 if input_ids is not None and self.config.image_token_id >= self.vocab_size:
 special_image_mask = input_ids == self.config.image_token_id
 llm_input_ids = input_ids.clone()
@@ -1495,7 +1495,7 @@ class GitForCausalLM(GitPreTrainedModel, GenerationMixin):
 >>> processor = AutoProcessor.from_pretrained("microsoft/git-base-vatex")
 >>> model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-vatex")

->>> # set seed for reproducability
+>>> # set seed for reproducibility
 >>> np.random.seed(45)


@@ -199,7 +199,7 @@ class GPTNeoXConfig(PretrainedConfig):

 if self.hidden_size % self.num_attention_heads != 0:
 raise ValueError(
-"The hidden size is not divisble by the number of attention heads! Make sure to update them!"
+"The hidden size is not divisible by the number of attention heads! Make sure to update them!"
 )

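An illustrative check mirroring the validation above: the per-head dimension only makes sense when `hidden_size` divides evenly across the attention heads.

    hidden_size, num_attention_heads = 768, 12
    assert hidden_size % num_attention_heads == 0
    head_dim = hidden_size // num_attention_heads   # 64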
@@ -402,7 +402,7 @@ def convert_grounding_dino_checkpoint(args):
 "grounding-dino-tiny": "https://huggingface.co/ShilongLiu/GroundingDino/resolve/main/groundingdino_swint_ogc.pth",
 "grounding-dino-base": "https://huggingface.co/ShilongLiu/GroundingDino/resolve/main/groundingdino_swinb_cogcoor.pth",
 }
-# Define default GroundingDino configuation
+# Define default GroundingDino configuration
 config = get_grounding_dino_config(model_name)

 # Load original checkpoint
@@ -1850,7 +1850,7 @@ class GroundingDinoDecoder(GroundingDinoPreTrainedModel):

 # In original implementation they apply layer norm before outputting intermediate hidden states
 # Though that's not through between layers so the layers use as input the output of the previous layer
-# withtout layer norm
+# without layer norm
 if output_hidden_states:
 all_hidden_states += (self.layer_norm(hidden_states),)

@@ -1425,7 +1425,7 @@ HUBERT_INPUTS_DOCSTRING = r"""


 @add_start_docstrings(
-"The bare TFHubert Model transformer outputing raw hidden-states without any specific head on top.",
+"The bare TFHubert Model transformer outputting raw hidden-states without any specific head on top.",
 HUBERT_START_DOCSTRING,
 )
 class TFHubertModel(TFHubertPreTrainedModel):
@@ -74,8 +74,8 @@ class IBertConfig(PretrainedConfig):
 quant_mode (`bool`, *optional*, defaults to `False`):
 Whether to quantize the model or not.
 force_dequant (`str`, *optional*, defaults to `"none"`):
-Force dequantize specific nonlinear layer. Dequatized layers are then executed with full precision.
-`"none"`, `"gelu"`, `"softmax"`, `"layernorm"` and `"nonlinear"` are supported. As deafult, it is set as
+Force dequantize specific nonlinear layer. Dequantized layers are then executed with full precision.
+`"none"`, `"gelu"`, `"softmax"`, `"layernorm"` and `"nonlinear"` are supported. As default, it is set as
 `"none"`, which does not dequantize any layers. Please specify `"gelu"`, `"softmax"`, or `"layernorm"` to
 dequantize GELU, Softmax, or LayerNorm, respectively. `"nonlinear"` will dequantize all nonlinear layers,
 i.e., GELU, Softmax, and LayerNorm.
@@ -276,7 +276,7 @@ class InternVLProcessor(ProcessorMixin):

 Args:
 metadata (`VideoMetadata`):
-`VideoMetadata` object containing metadat about the video, such as "total_num_frames" or "fps".
+`VideoMetadata` object containing metadata about the video, such as "total_num_frames" or "fps".
 num_frames (`int`, *optional*):
 Number of frames to sample uniformly. If None, all frames are sampled.
 initial_shift (`bool`, `float` or `int`, defaults to `0`):
@@ -246,7 +246,7 @@ class LevitAttentionSubsample(nn.Module):
 self.out_dim_keys_values = attention_ratio * key_dim * num_attention_heads + key_dim * num_attention_heads
 self.out_dim_projection = attention_ratio * key_dim * num_attention_heads
 self.resolution_out = resolution_out
-# resolution_in is the intial resolution, resoloution_out is final resolution after downsampling
+# resolution_in is the initial resolution, resolution_out is final resolution after downsampling
 self.keys_values = MLPLayerWithBN(input_dim, self.out_dim_keys_values)
 self.queries_subsample = LevitSubsample(stride, resolution_in)
 self.queries = MLPLayerWithBN(input_dim, key_dim * num_attention_heads)
@@ -370,7 +370,7 @@ class LevitStage(nn.Module):
 self.layers = []
 self.config = config
 self.resolution_in = resolution_in
-# resolution_in is the intial resolution, resolution_out is final resolution after downsampling
+# resolution_in is the initial resolution, resolution_out is final resolution after downsampling
 for _ in range(depths):
 self.layers.append(
 LevitResidualLayer(
@@ -55,7 +55,7 @@ if is_torchvision_available():

 def get_factors(dividend: int) -> Set[int]:
 """
-Calculate all factors of a given number, i.e. a dividor that leaves
+Calculate all factors of a given number, i.e. a divisor that leaves
 no remainder. For example, if dividend=12, it will return {1, 2, 3, 4, 6, 12}.

 Args:
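A hedged, self-contained sketch of a factor helper matching the docstring above (the actual implementation in the file may differ):

    from typing import Set

    def get_factors(dividend: int) -> Set[int]:
        # Collect every divisor that leaves no remainder, e.g. 12 -> {1, 2, 3, 4, 6, 12}.
        factors = set()
        for i in range(1, int(dividend**0.5) + 1):
            if dividend % i == 0:
                factors.update({i, dividend // i})
        return factors

    assert get_factors(12) == {1, 2, 3, 4, 6, 12}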
@@ -60,7 +60,7 @@ class LlavaNextVideoImageProcessor(BaseImageProcessor):
 image_grid_pinpoints (`List` *optional*, defaults to `[[672, 336], [336, 672], [672, 672], [336, 1008], [1008, 336]]`):
 A list of possible resolutions to use for processing high resolution images. The best resolution is selected
 based on the original size of the image. Can be overridden by `image_grid_pinpoints` in the `preprocess`
-method. Not used for processinf videos.
+method. Not used for processing videos.
 resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
 Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method.
 do_center_crop (`bool`, *optional*, defaults to `True`):
@@ -405,7 +405,7 @@ class Mask2FormerHungarianMatcher(nn.Module):
 """
 super().__init__()
 if cost_class == 0 and cost_mask == 0 and cost_dice == 0:
-raise ValueError("All costs cant be 0")
+raise ValueError("All costs can't be 0")

 self.num_points = num_points
 self.cost_class = cost_class
@@ -829,7 +829,7 @@ class MaskFormerHungarianMatcher(nn.Module):
 """
 super().__init__()
 if cost_class == 0 and cost_mask == 0 and cost_dice == 0:
-raise ValueError("All costs cant be 0")
+raise ValueError("All costs can't be 0")
 self.cost_class = cost_class
 self.cost_mask = cost_mask
 self.cost_dice = cost_dice
@@ -98,7 +98,7 @@ def add_megatron_checkpoint_args(parser):
 default=128,
 help=(
 "Pad the vocab size to be divisible by this value. "
-"This is added for computational efficieny reasons. "
+"This is added for computational efficiency reasons. "
 "Only used when converting a Transformers checkpoint to a Megatron checkpoint."
 ),
 )
@@ -235,7 +235,7 @@ def transformers_to_megatron_fix_query_key_value_ordering(
 param, checkpoint_version, num_splits, num_heads, hidden_size
 ):
 """
-Permutes layout of param tensor to the one compatible with respective NVIDIA Megatron-LM chekpoint versions. Input
+Permutes layout of param tensor to the one compatible with respective NVIDIA Megatron-LM checkpoint versions. Input
 is [num_splits * num_heads * hidden_size, :] and output is [num_heads * hidden_size * num_splits, :] for version
 1.0 and [num_heads * num_splits * hidden_size, :] for version 2.0 and later. If param is the weight tensor of the
 self-attention block, the param needs to be already transposed before calling this function.
@@ -348,7 +348,7 @@ def convert_checkpoint_from_megatron_to_transformers(args):
 raise ValueError(
 "Megatron-LM checkpoint does not contain arguments. This utility only supports Megatron-LM checkpoints"
 " containing all the megatron arguments. This is because it loads all config related to model"
-" architecture, the tensor and pipeline model parallel size from the checkpoint insead of user having to"
+" architecture, the tensor and pipeline model parallel size from the checkpoint instead of user having to"
 " manually specify all the details. Please save Megatron-LM checkpoint along with all the megatron"
 " arguments to use this utility."
 )
@@ -1601,7 +1601,7 @@ class MusicgenForCausalLM(MusicgenPreTrainedModel, GenerationMixin):
 # 7. determine generation mode
 generation_mode = generation_config.get_generation_mode()

-# 8. prepare batched CFG externally (to enable coexistance with the unbatched CFG)
+# 8. prepare batched CFG externally (to enable coexistence with the unbatched CFG)
 if generation_config.guidance_scale is not None and generation_config.guidance_scale > 1:
 logits_processor.append(ClassifierFreeGuidanceLogitsProcessor(generation_config.guidance_scale))
 generation_config.guidance_scale = None
@@ -2617,7 +2617,7 @@ class MusicgenForConditionalGeneration(PreTrainedModel, GenerationMixin):
 # 7. determine generation mode
 generation_mode = generation_config.get_generation_mode()

-# 8. prepare batched CFG externally (to enable coexistance with the unbatched CFG)
+# 8. prepare batched CFG externally (to enable coexistence with the unbatched CFG)
 if generation_config.guidance_scale is not None and generation_config.guidance_scale > 1:
 logits_processor.append(ClassifierFreeGuidanceLogitsProcessor(generation_config.guidance_scale))
 generation_config.guidance_scale = None
@@ -54,7 +54,7 @@ class MusicgenMelodyFeatureExtractor(SequenceFeatureExtractor):
 sampling_rate (`int`, *optional*, defaults to 32000):
 The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
 hop_length (`int`, *optional*, defaults to 4096):
-Length of the overlaping windows for the STFT used to obtain the Mel Frequency coefficients.
+Length of the overlapping windows for the STFT used to obtain the Mel Frequency coefficients.
 chunk_length (`int`, *optional*, defaults to 30):
 The maximum number of chunks of `sampling_rate` samples used to trim and pad longer or shorter audio
 sequences.
@@ -92,7 +92,7 @@ class MusicgenMelodyOutputWithPast(ModelOutput):
 Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
 heads.
 encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
-Sequence of conditional hidden-states representing the concatenation of the projeted text encoder output and the projeted audio encoder output.
+Sequence of conditional hidden-states representing the concatenation of the projected text encoder output and the projected audio encoder output.
 Used as a conditional signal.
 """

@@ -757,8 +757,8 @@ MUSICGEN_MELODY_INPUTS_DOCSTRING = r"""
 don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
 `decoder_input_ids` of shape `(batch_size, sequence_length)`.
 encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
-Sequence of conditional hidden-states representing the concatenation of the projeted text encoder output and the projeted audio encoder output.
-Used as a conditional signal and will thus be concatenated to the projeted `decoder_input_ids`.
+Sequence of conditional hidden-states representing the concatenation of the projected text encoder output and the projected audio encoder output.
+Used as a conditional signal and will thus be concatenated to the projected `decoder_input_ids`.
 inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
 Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
 This is useful if you want more control over how to convert `input_ids` indices into associated vectors
@@ -818,7 +818,7 @@ MUSICGEN_MELODY_DECODER_INPUTS_DOCSTRING = r"""
 [What are attention masks?](../glossary#attention-mask)
 encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
 Sequence of hidden-states representing the concatenation of the text encoder output and the processed audio encoder output.
-Used as a conditional signal and will thus be concatenated to the projeted `decoder_input_ids`.
+Used as a conditional signal and will thus be concatenated to the projected `decoder_input_ids`.
 encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
 Mask to avoid performing attention on conditional hidden states. Mask values
 selected in `[0, 1]`:
@@ -1522,7 +1522,7 @@ class MusicgenMelodyForCausalLM(MusicgenMelodyPreTrainedModel, GenerationMixin):
 # 7. determine generation mode
 generation_mode = generation_config.get_generation_mode()

-# 8. prepare batched CFG externally (to enable coexistance with the unbatched CFG)
+# 8. prepare batched CFG externally (to enable coexistence with the unbatched CFG)
 if generation_config.guidance_scale is not None and generation_config.guidance_scale > 1:
 logits_processor.append(ClassifierFreeGuidanceLogitsProcessor(generation_config.guidance_scale))
 generation_config.guidance_scale = None
@@ -2478,7 +2478,7 @@ class MusicgenMelodyForConditionalGeneration(PreTrainedModel, GenerationMixin):
 # 7. determine generation mode
 generation_mode = generation_config.get_generation_mode()

-# 8. prepare batched CFG externally (to enable coexistance with the unbatched CFG)
+# 8. prepare batched CFG externally (to enable coexistence with the unbatched CFG)
 if generation_config.guidance_scale is not None and generation_config.guidance_scale > 1:
 logits_processor.append(ClassifierFreeGuidanceLogitsProcessor(generation_config.guidance_scale))
 generation_config.guidance_scale = None
@@ -425,7 +425,7 @@ class NllbMoeSparseMLP(nn.Module):
 r"""
 The goal of this forward pass is to have the same number of operation as the equivalent `NllbMoeDenseActDense`
 (mlp) layer. This means that all of the hidden states should be processed at most twice ( since we are using a
-top_2 gating mecanism). This means that we keep the complexity to O(batch_size x sequence_length x hidden_dim)
+top_2 gating mechanism). This means that we keep the complexity to O(batch_size x sequence_length x hidden_dim)
 instead of O(num_experts x batch_size x sequence_length x hidden_dim).

 1- Get the `router_probs` from the `router`. The shape of the `router_mask` is `(batch_size X sequence_length,
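A hedged sketch (not the NLLB-MoE implementation) of the top-2 gating idea described above: each token is routed to at most two experts, so the work stays proportional to the number of tokens rather than to num_experts.

    import torch

    tokens, hidden_dim, num_experts = 10, 8, 4
    hidden_states = torch.randn(tokens, hidden_dim)
    router_logits = torch.randn(tokens, num_experts)

    top2_probs, top2_experts = torch.softmax(router_logits, dim=-1).topk(2, dim=-1)
    router_mask = torch.zeros_like(router_logits).scatter(1, top2_experts, 1.0).bool()

    for expert_id in range(num_experts):
        token_idx = router_mask[:, expert_id].nonzero(as_tuple=True)[0]
        # each expert only sees its routed tokens, e.g.:
        # hidden_states[token_idx] = expert_ffn[expert_id](hidden_states[token_idx])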
@@ -376,7 +376,7 @@ class NougatTokenizerFast(PreTrainedTokenizerFast):
 contains everything needed to load the tokenizer.

 clean_up_tokenization_spaces (`str`, *optional*, defaults to `False`):
-Wether to cleanup spaces after decoding, cleanup consists in removing potential artifacts like extra
+Whether to cleanup spaces after decoding, cleanup consists in removing potential artifacts like extra
 spaces.

 unk_token (`str`, *optional*, defaults to `"<unk>"`):
@@ -268,7 +268,7 @@ def convert_omdet_turbo_checkpoint(args):
 "https://huggingface.co/omlab/OmDet-Turbo_tiny_SWIN_T/resolve/main/ViT-B-16.pt",
 ],
 }
-# Define default OmDetTurbo configuation
+# Define default OmDetTurbo configuration
 config = get_omdet_turbo_config(model_name, use_timm_backbone)

 # Load original checkpoint
@@ -471,7 +471,7 @@ class PaliGemmaForConditionalGeneration(PaliGemmaPreTrainedModel, GenerationMixi

 is_training = token_type_ids is not None and labels is not None

-# Replace image id woth PAD if the image token if OOV, to avoid index-errors
+# Replace image id with PAD if the image token if OOV, to avoid index-errors
 if input_ids is not None and self.config.image_token_id >= self.vocab_size:
 special_image_mask = input_ids == self.config.image_token_id
 llm_input_ids = input_ids.clone()
@@ -1807,7 +1807,7 @@ class PatchTSMixerForTimeSeriesClassificationOutput(ModelOutput):

 Args:
 prediction_outputs (`torch.FloatTensor` of shape `(batch_size, num_labels)`):
-Prediction output from the classfication head.
+Prediction output from the classification head.
 last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_input_channels, num_patches, d_model)`):
 Backbone embeddings before passing through the head.
 hidden_states (`tuple(torch.FloatTensor)`, *optional*):
@@ -1487,7 +1487,7 @@ FLAX_PEGASUS_CONDITIONAL_GENERATION_DOCSTRING = """

 Summarization example:

-```pyton
+```python
 >>> from transformers import AutoTokenizer, FlaxPegasusForConditionalGeneration

 >>> model = FlaxPegasusForConditionalGeneration.from_pretrained('google/pegasus-large')
@@ -127,7 +127,7 @@ def get_resize_output_image_size(
 ratio = max(height / max_height, width / max_width)

 if ratio > 1:
-# Orgiginal implementation uses `round` which utilises bankers rounding, which can lead to surprising results
+# Original implementation uses `round` which utilises bankers rounding, which can lead to surprising results
 # Here we use floor to ensure the image is always smaller than the given "longest_edge"
 height = int(math.floor(height / ratio))
 width = int(math.floor(width / ratio))
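A quick illustration of the banker's-rounding pitfall mentioned above, and why the code floors instead (Python's round() sends halves to the nearest even integer, while floor never overshoots the target edge):

    import math

    print(round(0.5), round(1.5), round(2.5))   # 0 2 2 -- halves round to the even neighbour
    print(math.floor(2.9), math.floor(2.5))     # 2 2 -- floor always rounds down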
@ -35,7 +35,7 @@ logger = logging.get_logger(__name__)
|
||||
def create_rename_keys(config):
|
||||
rename_keys = []
|
||||
for i in range(config.num_encoder_blocks):
|
||||
# Remane embedings' paramters
|
||||
# Rename embeddings' parameters
|
||||
rename_keys.append((f"pos_embed{i + 1}", f"pvt.encoder.patch_embeddings.{i}.position_embeddings"))
|
||||
|
||||
rename_keys.append((f"patch_embed{i + 1}.proj.weight", f"pvt.encoder.patch_embeddings.{i}.projection.weight"))
|
||||
|
@ -1037,7 +1037,7 @@ REMBERT_INPUTS_DOCSTRING = r"""


@add_start_docstrings(
"The bare RemBERT Model transformer outputing raw hidden-states without any specific head on top.",
"The bare RemBERT Model transformer outputting raw hidden-states without any specific head on top.",
REMBERT_START_DOCSTRING,
)
class TFRemBertModel(TFRemBertPreTrainedModel):

@ -911,7 +911,7 @@ ROFORMER_INPUTS_DOCSTRING = r"""


@add_start_docstrings(
"The bare RoFormer Model transformer outputing raw hidden-states without any specific head on top.",
"The bare RoFormer Model transformer outputting raw hidden-states without any specific head on top.",
ROFORMER_START_DOCSTRING,
)
class TFRoFormerModel(TFRoFormerPreTrainedModel):

@ -2171,7 +2171,7 @@ class SeamlessM4TTextToUnitForConditionalGeneration(SeamlessM4TPreTrainedModel,
config: SeamlessM4TConfig,
embed_tokens_decoder: Optional[nn.Embedding] = None,
):
# update config - used principaly for bos_token_id etc.
# update config - used principality for bos_token_id etc.
config = copy.deepcopy(config)
for param, val in config.to_dict().items():
if param.startswith("t2u_"):

@ -184,7 +184,7 @@ SEAMLESS_M4T_V2_MULTIMODAL_INPUTS_DOCSTRING = r"""

[What are input IDs?](../glossary#input-ids)
input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`):
Input audio features. This should be returnes by the [`SeamlessM4TFeatureExtractor`] class or the
Input audio features. This should be returned by the [`SeamlessM4TFeatureExtractor`] class or the
[`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details.
"""

@ -202,7 +202,7 @@ M4T_TEXT_INPUTS_DOCSTRING = r"""
M4T_SPEECH_INPUTS_DOCSTRING = r"""
Args:
input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`):
Input audio features. This should be returnes by the [`SeamlessM4TFeatureExtractor`] class or the
Input audio features. This should be returned by the [`SeamlessM4TFeatureExtractor`] class or the
[`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details.
"""

@ -2461,7 +2461,7 @@ class SeamlessM4Tv2TextToUnitForConditionalGeneration(SeamlessM4Tv2PreTrainedMod
config: SeamlessM4Tv2Config,
embed_tokens_decoder: Optional[nn.Embedding] = None,
):
# update config - used principaly for bos_token_id etc.
# update config - used principality for bos_token_id etc.
config = copy.deepcopy(config)
for param, val in config.to_dict().items():
if param.startswith("t2u_"):

@ -4035,7 +4035,7 @@ class SeamlessM4Tv2ForSpeechToSpeech(SeamlessM4Tv2PreTrainedModel, GenerationMix

Args:
input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`):
Input audio features. This should be returnes by the [`SeamlessM4TFeatureExtractor`] class or the
Input audio features. This should be returned by the [`SeamlessM4TFeatureExtractor`] class or the
[`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details.
return_intermediate_token_ids (`bool`, *optional*):
If `True`, also returns the intermediate generated text and unit tokens. Set to `True` if you also want

@ -4485,7 +4485,7 @@ class SeamlessM4Tv2Model(SeamlessM4Tv2PreTrainedModel, GenerationMixin):

[What are input IDs?](../glossary#input-ids)
input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`, *optional*):
Input audio features. This should be returnes by the [`SeamlessM4TFeatureExtractor`] class or the
Input audio features. This should be returned by the [`SeamlessM4TFeatureExtractor`] class or the
[`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details.
return_intermediate_token_ids (`bool`, *optional*):
If `True`, also returns the intermediate generated text and unit tokens. Set to `True` if you also want

@ -114,7 +114,7 @@ def convert_seggpt_checkpoint(args):
verify_logits = args.verify_logits
push_to_hub = args.push_to_hub

# Define default GroundingDINO configuation
# Define default GroundingDINO configuration
config = SegGptConfig()

# Load original checkpoint

@ -62,7 +62,7 @@ class SegGptEncoderOutput(ModelOutput):
intermediate_hidden_states (`Tuple[torch.FloatTensor]`, *optional*, returned when `config.intermediate_hidden_state_indices` is set):
Tuple of `torch.FloatTensor` of shape `(batch_size, patch_height, patch_width, hidden_size)`.
Each element in the Tuple corresponds to the output of the layer specified in `config.intermediate_hidden_state_indices`.
Additionaly, each feature passes through a LayerNorm.
Additionally, each feature passes through a LayerNorm.
"""

last_hidden_state: torch.FloatTensor

@ -1979,10 +1979,10 @@ SPEECHT5_BASE_START_DOCSTRING = r"""
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
encoder ([`SpeechT5EncoderWithSpeechPrenet`] or [`SpeechT5EncoderWithTextPrenet`] or `None`):
The Transformer encoder module that applies the appropiate speech or text encoder prenet. If `None`,
The Transformer encoder module that applies the appropriate speech or text encoder prenet. If `None`,
[`SpeechT5EncoderWithoutPrenet`] will be used and the `input_values` are assumed to be hidden states.
decoder ([`SpeechT5DecoderWithSpeechPrenet`] or [`SpeechT5DecoderWithTextPrenet`] or `None`):
The Transformer decoder module that applies the appropiate speech or text decoder prenet. If `None`,
The Transformer decoder module that applies the appropriate speech or text decoder prenet. If `None`,
[`SpeechT5DecoderWithoutPrenet`] will be used and the `decoder_input_values` are assumed to be hidden
states.
"""

@ -175,7 +175,7 @@ def make_state_dict(converted_params, is_encoder_only: bool):


def load_t5x_weights_in_t5(model, config, t5x_checkpoint_path, is_encoder_only):
"""Replaces the params in model witht the T5X converted params."""
"""Replaces the params in model with the T5X converted params."""
variables = checkpoints.load_t5x_checkpoint(t5x_checkpoint_path)
converted = convert_t5x_to_pytorch(
variables,

@ -2344,11 +2344,11 @@ def _calculate_expected_result(
if avg_approximation == AverageApproximationFunction.RATIO:
average_result = sum_result / (count_result + EPSILON_ZERO_DIVISION)
elif avg_approximation == AverageApproximationFunction.FIRST_ORDER:
# The sum of all probabilities exept that correspond to other cells
# The sum of all probabilities except that correspond to other cells
ex = tf.reduce_sum(scaled_probability_per_cell, axis=1, keepdims=True) - scaled_probability_per_cell + 1
average_result = tf.reduce_sum(numeric_values_masked * scaled_probability_per_cell / ex, axis=1)
elif avg_approximation == AverageApproximationFunction.SECOND_ORDER:
# The sum of all probabilities exept that correspond to other cells
# The sum of all probabilities except that correspond to other cells
ex = tf.reduce_sum(scaled_probability_per_cell, axis=1, keepdims=True) - scaled_probability_per_cell + 1
pointwise_var = scaled_probability_per_cell * (1 - scaled_probability_per_cell)
var = tf.reduce_sum(pointwise_var, axis=1, keepdims=True) - pointwise_var

@ -2359,7 +2359,7 @@ _ORDINAL_WORDS = [
"second",
"third",
"fourth",
"fith",
"fifth",
"sixth",
"seventh",
"eighth",

@ -1364,7 +1364,7 @@ class UdopStack(UdopPreTrainedModel):

if inputs_embeds is None:
if self.embed_tokens is None:
raise ValueError("You have to intialize the model with valid token embeddings")
raise ValueError("You have to initialize the model with valid token embeddings")
inputs_embeds = self.embed_tokens(input_ids)

if pixel_values is not None:

@ -200,7 +200,7 @@ def make_state_dict(converted_params, is_encoder_only: bool):


def load_t5x_weights_in_t5(model, config, t5x_checkpoint_path, is_encoder_only, scalable_attention):
"""Replaces the params in model witht the T5X converted params."""
"""Replaces the params in model with the T5X converted params."""
variables = checkpoints.load_t5x_checkpoint(t5x_checkpoint_path)
converted = convert_t5x_to_pytorch(
variables, num_layers=config.num_layers, is_encoder_only=is_encoder_only, scalable_attention=scalable_attention

@ -164,7 +164,7 @@ class UniSpeechConfig(PretrainedConfig):
eos_token_id (`int`, *optional*, defaults to 2):
The id of the "end-of-sequence" token.
replace_prob (`float`, *optional*, defaults to 0.5):
Propability that transformer feature is replaced by quantized feature for pretraining.
Probability that transformer feature is replaced by quantized feature for pretraining.

Example:

@ -56,7 +56,7 @@ def get_kernel_predictor_key_mapping(config: UnivNetConfig, old_prefix: str = ""
def get_key_mapping(config: UnivNetConfig):
mapping = {}

# NOTE: inital conv layer keys are the same
# NOTE: initial conv layer keys are the same

# LVC Residual blocks
for i in range(len(config.resblock_stride_sizes)):

@ -64,7 +64,7 @@ class UnivNetFeatureExtractor(SequenceFeatureExtractor):
The number of FFT components to use. If `None`, this is determined using
`transformers.audio_utils.optimal_fft_length`.
max_length_s (`int`, *optional*, defaults to 10):
The maximum input lenght of the model in seconds. This is used to pad the audio.
The maximum input length of the model in seconds. This is used to pad the audio.
fmin (`float`, *optional*, defaults to 0.0):
Minimum mel frequency in Hz.
fmax (`float`, *optional*):

@ -39,7 +39,7 @@ class ViltConfig(PretrainedConfig):
The vocabulary size of the `token_type_ids` passed when calling [`ViltModel`]. This is used when encoding
text.
modality_type_vocab_size (`int`, *optional*, defaults to 2):
The vocabulary size of the modalities passed when calling [`ViltModel`]. This is used after concatening the
The vocabulary size of the modalities passed when calling [`ViltModel`]. This is used after concatenating the
embeddings of the text and image modalities.
max_position_embeddings (`int`, *optional*, defaults to 40):
The maximum sequence length that this model might ever be used with.

@ -139,7 +139,7 @@ class ViltEmbeddings(nn.Module):
x_mask = x_mask.flatten(1)

if max_image_length < 0 or max_image_length is None or not isinstance(max_image_length, int):
# suppose aug is 800 x 1333, then, maximum effective res is 800 x 1333 (if one side gets bigger, the other will be constrained and be shrinked)
# suppose aug is 800 x 1333, then, maximum effective res is 800 x 1333 (if one side gets bigger, the other will be constrained and be shrunk)
# (800 // self.patch_size) * (1333 // self.patch_size) is the maximum number of patches that single image can get.
# if self.patch_size = 32, 25 * 41 = 1025
# if res is 384 x 640, 12 * 20 = 240

@ -85,7 +85,7 @@ class VisionEncoderDecoderConfig(PretrainedConfig):
super().__init__(**kwargs)
if "encoder" not in kwargs or "decoder" not in kwargs:
raise ValueError(
f"A configuraton of type {self.model_type} cannot be instantiated because "
f"A configuration of type {self.model_type} cannot be instantiated because "
f"not both `encoder` and `decoder` sub-configurations are passed, but only {kwargs}"
)

@ -1496,7 +1496,7 @@ WAV2VEC2_INPUTS_DOCSTRING = r"""


@add_start_docstrings(
"The bare TFWav2Vec2 Model transformer outputing raw hidden-states without any specific head on top.",
"The bare TFWav2Vec2 Model transformer outputting raw hidden-states without any specific head on top.",
WAV2VEC2_START_DOCSTRING,
)
class TFWav2Vec2Model(TFWav2Vec2PreTrainedModel):

@ -101,7 +101,7 @@ class WavLMConfig(PretrainedConfig):
[SpecAugment: A Simple Data Augmentation Method for Automatic Speech
Recognition](https://arxiv.org/abs/1904.08779).
mask_time_prob (`float`, *optional*, defaults to 0.05):
Propability of each feature vector along the time axis to be chosen as the start of the vector span to be
Probability of each feature vector along the time axis to be chosen as the start of the vector span to be
masked. Approximately `mask_time_prob * sequence_length // mask_time_length` feature vectors will be masked
along the time axis. This is only relevant if `apply_spec_augment is True`.
mask_time_length (`int`, *optional*, defaults to 10):

@ -111,7 +111,7 @@ class WavLMConfig(PretrainedConfig):
irrespectively of `mask_feature_prob`. Only relevant if ''mask_time_prob*len(time_axis)/mask_time_length <
mask_time_min_masks''
mask_feature_prob (`float`, *optional*, defaults to 0.0):
Propability of each feature vector along the feature axis to be chosen as the start of the vector span to
Probability of each feature vector along the feature axis to be chosen as the start of the vector span to
be masked. Approximately `mask_time_prob * hidden_size // mask_time_length` feature vectors will be masked
along the time axis. This is only relevant if `apply_spec_augment is True`.
mask_feature_length (`int`, *optional*, defaults to 10):

@ -474,7 +474,7 @@ class ZoeDepthImageProcessor(BaseImageProcessor):
outputs_flipped ([`ZoeDepthDepthEstimatorOutput`], *optional*):
Raw outputs of the model from flipped input (averaged out in the end).
do_remove_padding (`bool`, *optional*):
By default ZoeDepth addes padding equal to `int(√(height / 2) * 3)` (and similarly for width) to fix the
By default ZoeDepth adds padding equal to `int(√(height / 2) * 3)` (and similarly for width) to fix the
boundary artifacts in the output depth map, so we need remove this padding during post_processing. The
parameter exists here in case the user changed the image preprocessing to not include padding.

@ -37,7 +37,7 @@ class AwqQuantizer(HfQuantizer):
4-bit quantization for Activation-aware Weight Quantization(AWQ) (https://arxiv.org/abs/2306.00978)
"""

# AWQ requires data callibration - we support only inference
# AWQ requires data calibration - we support only inference
requires_calibration = True

required_packages = ["awq", "accelerate"]

@ -69,7 +69,7 @@ class CompressedTensorsHfQuantizer(HfQuantizer):
return missing_keys

# We expect some keys to be missing for
# compresed models
# compressed models
# This is fine as the weights are reconstructed by ModelCompressor
# in _process_model_after_weight_loading

@ -1728,14 +1728,14 @@ Please note that you may need to restart your runtime after installation.

# docstyle-ignore
LIBROSA_IMPORT_ERROR = """
{0} requires thes librosa library. But that was not found in your environment. You can install them with pip:
{0} requires the librosa library. But that was not found in your environment. You can install them with pip:
`pip install librosa`
Please note that you may need to restart your runtime after installation.
"""

# docstyle-ignore
PRETTY_MIDI_IMPORT_ERROR = """
{0} requires thes pretty_midi library. But that was not found in your environment. You can install them with pip:
{0} requires the pretty_midi library. But that was not found in your environment. You can install them with pip:
`pip install pretty_midi`
Please note that you may need to restart your runtime after installation.
"""

@ -1120,7 +1120,7 @@ class VptqLayerConfig(QuantizationConfigMixin):
group_size (`int`, *optional*, defaults to `-1`): depends on out-features
indices_as_float (`bool`, *optional*, defaults to `False`): for Finetuning
is_indice_packed (`bool`, *optional*, defaults to `True`): should always be True
num_centroids (`list`, *optional*, defaults to `[-1, -1]`): centriod numbers of clusters
num_centroids (`list`, *optional*, defaults to `[-1, -1]`): centroid numbers of clusters
num_res_centroids (`list`, *optional*, defaults to `[-1, -1]`): ditto for residual
outlier_size (`int`, *optional*, defaults to `1`): outliers
vector_lens (`list`, *optional*, defaults to `[-1, -1]`): centroid vector length in quantization

@ -146,7 +146,7 @@ class BertGenerationTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_tokenization_base_hard_symbols(self):
symbols = (
'This is a very long text with a lot of weird characters, such as: . , ~ ? ( ) " [ ] ! : - . Also we will'
" add words that should not exsist and be tokenized to <unk>, such as saoneuhaoesuth"
" add words that should not exist and be tokenized to <unk>, such as saoneuhaoesuth"
)
original_tokenizer_encodings = [
871,

@ -170,7 +170,7 @@ class BigBirdTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_tokenization_base_hard_symbols(self):
symbols = (
'This is a very long text with a lot of weird characters, such as: . , ~ ? ( ) " [ ] ! : - . Also we will'
" add words that should not exsist and be tokenized to <unk>, such as saoneuhaoesuth"
" add words that should not exist and be tokenized to <unk>, such as saoneuhaoesuth"
)
original_tokenizer_encodings = [65, 871, 419, 358, 946, 991, 2521, 452, 358, 1357, 387, 7751, 3536, 112, 985, 456, 126, 865, 938, 5400, 5734, 458, 1368, 467, 786, 2462, 5246, 1159, 633, 865, 4519, 457, 582, 852, 2557, 427, 916, 508, 405, 34324, 497, 391, 408, 11342, 1244, 385, 100, 938, 985, 456, 574, 362, 12597, 3200, 3129, 1172, 66] # fmt: skip
self.assertListEqual(original_tokenizer_encodings, self.big_tokenizer.encode(symbols))

@ -438,7 +438,7 @@ class BridgeTowerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestC
if self.has_attentions:
self.assertIsNotNone(attentions.grad)

# override as the `logit_scale` parameter initilization is different for BRIDGE TOWER
# override as the `logit_scale` parameter initialization is different for BRIDGE TOWER
def test_initialization(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

@ -55,7 +55,7 @@ class BridgeTowerProcessorTest(ProcessorTesterMixin, unittest.TestCase):
def tearDownClass(cls):
shutil.rmtree(cls.tmpdirname, ignore_errors=True)

# Some kwargs tests are overriden from common tests to handle shortest_edge
# Some kwargs tests are overridden from common tests to handle shortest_edge
# and size_divisor behaviour

@require_torch

@ -924,7 +924,7 @@ class FlavaModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
def test_model_get_set_embeddings(self):
pass

# override as the `logit_scale` parameter initilization is different for FLAVA
# override as the `logit_scale` parameter initialization is different for FLAVA
def test_initialization(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

@ -933,7 +933,7 @@ class FlavaModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
model = model_class(config=configs_no_init)
for name, param in model.named_parameters():
if param.requires_grad:
# check if `logit_scale` is initilized as per the original implementation
# check if `logit_scale` is initialized as per the original implementation
if name == "logit_scale" or name == "flava.logit_scale":
self.assertAlmostEqual(
param.data.item(),

@ -137,7 +137,7 @@ class Gemma3ModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase

@unittest.skip(
reason="HybridCache can't be gathered because it is not iterable. Adding a simple iter and dumping `distributed_iterator`"
" as in Dynamic Cache doesnt work. NOTE: @gante all cache objects would need better compatibility with multi gpu setting"
" as in Dynamic Cache doesn't work. NOTE: @gante all cache objects would need better compatibility with multi gpu setting"
)
def test_multi_gpu_data_parallel_forward(self):
pass

@ -275,7 +275,7 @@ class Gemma3Vision2TextModelTest(ModelTesterMixin, GenerationTesterMixin, unitte

@unittest.skip(
reason="HybridCache can't be gathered because it is not iterable. Adding a simple iter and dumping `distributed_iterator`"
" as in Dynamic Cache doesnt work. NOTE: @gante all cache objects would need better compatibility with multi gpu setting"
" as in Dynamic Cache doesn't work. NOTE: @gante all cache objects would need better compatibility with multi gpu setting"
)
def test_multi_gpu_data_parallel_forward(self):
pass

@ -88,7 +88,7 @@ class Gemma3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):

image = self.prepare_image_inputs()

# If text has no image tokens, iamge should be `None`
# If text has no image tokens, image should be `None`
with self.assertRaises(ValueError):
_ = processor(text=text_no_image, images=image, return_tensors="np")

@ -478,8 +478,8 @@ class GPTNeoModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMix
# the last 2 tokens are masked, and should have 0 attn_probs
self.assertTrue(torch.all(attn_probs[:, :, -mask_tokens:, -mask_tokens:] == 0))

# in loacal attention each token can only attend to the previous window_size tokens (including itself)
# here window_size is 4, so a token at index 5 can only attend to indcies [2, 3, 4, 5]
# in local attention each token can only attend to the previous window_size tokens (including itself)
# here window_size is 4, so a token at index 5 can only attend to indices [2, 3, 4, 5]
# and the attn_probs should be 0 for token [0, 1]
self.assertTrue(torch.all(attn_probs[:, :, 5, 2:6] != 0))
self.assertTrue(torch.all(attn_probs[:, :, 5, :2] == 0))

@ -769,7 +769,7 @@ class GroundingDinoModelIntegrationTests(unittest.TestCase):
encoding1 = processor(images=image, text=text1, return_tensors="pt").to(torch_device)
encoding2 = processor(images=image, text=text2, return_tensors="pt").to(torch_device)
# If we batch the text and cross attention masking is working the batched result should be equal to
# The singe text result
# The single text result
encoding_batched = processor(
images=[image] * len(text_batched), text=text_batched, padding="longest", return_tensors="pt"
).to(torch_device)

@ -658,7 +658,7 @@ class InstructBlipForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, Gene
def test_sdpa_can_dispatch_composite_models(self):
"""
Tests if composite models dispatch correctly on SDPA/eager when requested so when loading the model.
This tests only by looking at layer names, as usually SDPA layers are calles "SDPAAttention".
This tests only by looking at layer names, as usually SDPA layers are called "SDPAAttention".
In contrast to the above test, this one checks if the "config._attn_implamentation" is a dict after the model
is loaded, because we manually replicate requested attn implementation on each sub-config when loading.
See https://github.com/huggingface/transformers/pull/32238 for more info

@ -56,7 +56,7 @@ class LlavaConfigTest(unittest.TestCase):

def test_arbitrary_reload(self):
"""
Simple test for reloading arbirarily composed subconfigs
Simple test for reloading arbitrarily composed subconfigs
"""
default_values = LlavaConfig().to_diff_dict()
default_values["vision_config"]["model_type"] = "pixtral"

@ -553,8 +553,8 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
# image = Image.open(requests.get(url, stream=True).raw)
inputs = processor(text=PROMPT, images=IMG_URLS, return_tensors="pt").to(model.device)
generate_ids = model.generate(**inputs, max_new_tokens=500)
ouptut = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
print(ouptut)
output = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
print(output)

# fmt: off
EXPECTED_GENERATION = """

@ -573,7 +573,7 @@ These descriptions provide a detailed overview of the content and atmosphere of
"""
# fmt: on
# check that both inputs are handled correctly and generate the same output
self.assertEqual(ouptut, EXPECTED_GENERATION)
self.assertEqual(output, EXPECTED_GENERATION)

@slow
@require_bitsandbytes

Some files were not shown because too many files have changed in this diff.