Mirror of https://github.com/huggingface/transformers.git (synced 2025-07-19 20:48:22 +06:00)

Fix typos in strings and comments (#37799)

parent f466603963
commit d5fa7d2d19
@@ -1146,9 +1146,9 @@ def stft(frames: np.array, windowing_function: np.array, fft_window_size: Option
 tutorial]https://download.ni.com/evaluation/pxi/Understanding%20FFTs%20and%20Windowing.pdf
 fft_window_size (`int`, *optional*):
 Size of the window om which the Fourier transform is applied. This controls the frequency resolution of the
-spectrogram. 400 means that the fourrier transform is computed on windows of 400 samples. The number of
+spectrogram. 400 means that the fourier transform is computed on windows of 400 samples. The number of
 frequency bins (`nb_frequency_bins`) used to divide the window into equal strips is equal to
-`(1+fft_window_size)//2`. An increase of the fft_window_size slows the calculus time proportionnally.
+`(1+fft_window_size)//2`. An increase of the fft_window_size slows the calculus time proportionally.

 Example:

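The `(1+fft_window_size)//2` relationship in the docstring above is easy to sanity-check; a minimal sketch (illustrative only, not part of the patch):

    # Number of frequency bins implied by an FFT window size, per the docstring above.
    fft_window_size = 400
    nb_frequency_bins = (1 + fft_window_size) // 2
    print(nb_frequency_bins)  # 200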
@@ -850,7 +850,7 @@ class ConstrainedBeamSearchScorer(BeamScorer):
 beam_hyp.add(final_tokens, final_score, beam_indices=beam_index, generated_len=generated_len)
 ids_collect.append(beam_id)

-# due to overly complex constraints or other factors, sometimes we can't gaurantee a successful
+# due to overly complex constraints or other factors, sometimes we can't guarantee a successful
 # generation. In these cases we simply return the highest scoring outputs.
 if len(ids_collect) < self.num_beam_hyps_to_keep:
 for beam_id in range(self.num_beams):
@@ -192,7 +192,7 @@ class GenerationConfig(PushToHubMixin):
 our [cache documentation](https://huggingface.co/docs/transformers/en/kv_cache) for further information.
 cache_config (`CacheConfig` or `dict`, *optional*, default to `None`):
 Arguments used in the key-value cache class can be passed in `cache_config`. Can be passed as a `Dict` and
-it will be converted to its repsective `CacheConfig` internally.
+it will be converted to its respective `CacheConfig` internally.
 Otherwise can be passed as a `CacheConfig` class matching the indicated `cache_implementation`.
 return_legacy_cache (`bool`, *optional*, default to `True`):
 Whether to return the legacy or new format of the cache when `DynamicCache` is used by default.
@@ -235,7 +235,7 @@ class GenerationConfig(PushToHubMixin):
 The parameter for repetition penalty. 1.0 means no penalty. See [this
 paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
 encoder_repetition_penalty (`float`, *optional*, defaults to 1.0):
-The paramater for encoder_repetition_penalty. An exponential penalty on sequences that are not in the
+The parameter for encoder_repetition_penalty. An exponential penalty on sequences that are not in the
 original input. 1.0 means no penalty.
 length_penalty (`float`, *optional*, defaults to 1.0):
 Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to
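As context for the penalty parameters touched in the hunk above, a minimal sketch (illustrative values, not part of the patch) of how they are set on a `GenerationConfig`:

    from transformers import GenerationConfig

    generation_config = GenerationConfig(
        repetition_penalty=1.2,          # >1.0 discourages repeating previously generated tokens
        encoder_repetition_penalty=1.0,  # 1.0 means no penalty on tokens absent from the input
        length_penalty=1.0,              # exponent on sequence length during beam-based generation
    )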
@@ -385,7 +385,7 @@ class GenerationConfig(PushToHubMixin):
 inference.
 disable_compile (`bool`, *optional*):
 Whether to disable the automatic compilation of the forward pass. Automatic compilation happens when
-specific criteria are met, including using a compileable cache. Please open an issue if you find the
+specific criteria are met, including using a compilable cache. Please open an issue if you find the
 need to use this flag.

 > Wild card
@@ -710,7 +710,7 @@ class GenerationConfig(PushToHubMixin):
 UserWarning,
 )

-# 3. detect incorrect paramaterization specific to advanced beam modes
+# 3. detect incorrect parameterization specific to advanced beam modes
 else:
 # constrained beam search
 if self.constraints is not None or self.force_words_ids is not None:
@@ -271,7 +271,7 @@ class FlaxMinLengthLogitsProcessor(FlaxLogitsProcessor):

 class FlaxSuppressTokensAtBeginLogitsProcessor(FlaxLogitsProcessor):
 r"""
-[`FlaxLogitsProcessor`] supressing a list of tokens as soon as the `generate` function starts generating using
+[`FlaxLogitsProcessor`] suppressing a list of tokens as soon as the `generate` function starts generating using
 `begin_index` tokens. This should ensure that the tokens defined by `begin_suppress_tokens` are not sampled at the
 beginning of the generation.

@@ -543,7 +543,7 @@ class TopKLogitsWarper(LogitsProcessor):
 class MinPLogitsWarper(LogitsProcessor):
 """
 [`LogitsProcessor`] that performs min-p, i.e. keeps all tokens that are above a minimum probability, scaled by the
-probability of the most likely token. As a result, the filter becomes more agressive in the presence of
+probability of the most likely token. As a result, the filter becomes more aggressive in the presence of
 high-probability tokens, which is a sign of a confident output that we shouldn't deviate from.

 Often used together with [`TemperatureLogitsWarper`]. Used as an alternative to [`TopPLogitsWarper`] and
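The min-p behaviour described above is exposed through `generate`; a minimal sketch (the model choice and the `min_p` value are arbitrary, not part of the patch):

    from transformers import AutoModelForCausalLM, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    model = AutoModelForCausalLM.from_pretrained("gpt2")
    inputs = tokenizer("A sequence: 1, 2", return_tensors="pt")

    # Keep only tokens whose probability is at least min_p * p(most likely token).
    outputs = model.generate(**inputs, do_sample=True, min_p=0.1)
    print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])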
@@ -738,7 +738,7 @@ class EpsilonLogitsWarper(LogitsProcessor):

 >>> # With epsilon sampling, the output gets restricted to high-probability tokens. Note that this is similar to
 >>> # Top P sampling, which restricts tokens based on their cumulative probability.
->>> # Pro tip: The paper recomends using `epsilon_cutoff` values between 3e-4 and 9e-4
+>>> # Pro tip: The paper recommends using `epsilon_cutoff` values between 3e-4 and 9e-4
 >>> outputs = model.generate(**inputs, do_sample=True, epsilon_cutoff=0.1)
 >>> print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
 A sequence: 1, 2, 3, 4, 5, 6, 7, 8, 9
@@ -819,7 +819,7 @@ class EtaLogitsWarper(LogitsProcessor):

 >>> # With eta sampling, the output gets restricted to high-probability tokens. You can see it as a dynamic form of
 >>> # epsilon sampling that adapts its cutoff probability based on the entropy (high entropy = lower cutoff).
->>> # Pro tip: The paper recomends using `eta_cutoff` values between 3e-4 to 4e-3
+>>> # Pro tip: The paper recommends using `eta_cutoff` values between 3e-4 to 4e-3
 >>> outputs = model.generate(**inputs, do_sample=True, eta_cutoff=0.1)
 >>> print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
 A sequence: 1, 2, 3, 4, 5, 6, 7, 8, 9
@@ -1348,7 +1348,7 @@ class PrefixConstrainedLogitsProcessor(LogitsProcessor):
 >>> print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
 Alice and Bob are friends

->>> # We can contrain it with `prefix_allowed_tokens_fn` to force a certain behavior based on a prefix.
+>>> # We can constrain it with `prefix_allowed_tokens_fn` to force a certain behavior based on a prefix.
 >>> # For instance, we can force an entire entity to be generated when its beginning is detected.
 >>> entity = tokenizer(" Bob Marley", return_tensors="pt").input_ids[0] # 3 tokens
 >>> def prefix_allowed_tokens_fn(batch_id, input_ids):
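The doctest above is cut off at the definition of `prefix_allowed_tokens_fn`; the following is only a sketch of what such a callback typically looks like (an assumption for illustration, not the verbatim docstring), forcing the " Bob Marley" entity to be completed once its first token appears:

    def prefix_allowed_tokens_fn(batch_id, input_ids):
        # If the tail of the generated ids matches a prefix of `entity`, force the next entity token.
        for i in range(1, len(entity)):
            if input_ids[-i:].tolist() == entity[:i].tolist():
                return [entity[i].item()]
        return list(range(tokenizer.vocab_size))  # otherwise, allow any token

    outputs = model.generate(**inputs, prefix_allowed_tokens_fn=prefix_allowed_tokens_fn)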
@@ -1791,7 +1791,7 @@ class LogitNormalization(LogitsProcessor):

 class SuppressTokensAtBeginLogitsProcessor(LogitsProcessor):
 r"""
-[`SuppressTokensAtBeginLogitsProcessor`] supresses a list of tokens as soon as the `generate` function starts
+[`SuppressTokensAtBeginLogitsProcessor`] suppresses a list of tokens as soon as the `generate` function starts
 generating using `begin_index` tokens. This should ensure that the tokens defined by `begin_suppress_tokens` are
 not generated at the beginning. Originally created for
 [Whisper](https://huggingface.co/docs/transformers/model_doc/whisper).
@@ -2642,7 +2642,7 @@ class SynthIDTextWatermarkLogitsProcessor(LogitsProcessor):
 We assume that the scores are in the log space.
 Args:
 scores (`torch.FloatTensor`): Scores (batch_size, vocab_size).
-g_values (`torch.FloatTensor`): G valus (batch_size, vocab_size, depth).
+g_values (`torch.FloatTensor`): G values (batch_size, vocab_size, depth).

 Returns:
 Updated scores (batch_size, vocab_size).
@@ -2668,7 +2668,7 @@ class SynthIDTextWatermarkLogitsProcessor(LogitsProcessor):
 if self.debug_mode:
 scores = torch.ones_like(scores)

-# Currently indices is just a arange to compute watermarking on the desnse logits.
+# Currently indices is just a arange to compute watermarking on the dense logits.
 all_indices = torch.stack([torch.arange(vocab_size, device=self.device) for _ in range(batch_size)])

 if self.state is None:
@@ -343,7 +343,7 @@ class TFNoBadWordsLogitsProcessor(TFLogitsProcessor):
 )

 def _match_found():
-# Finaly, runs the actual comparison. Can only be called if the previous comparisons do not yield
+# Finally, runs the actual comparison. Can only be called if the previous comparisons do not yield
 # an answer (otherwise we get indexing exceptions)
 compare_len = self.bad_word_seqs_len[bad_word_seq_number] - 1
 return tf.cond(
@@ -962,7 +962,7 @@ class TFGenerationMixin:
 raise ValueError(
 "Beam search decoding cannot return more sequences than it has beams. Please set num_beams >="
 f" num_return_sequences, got {generation_config.num_beams} and"
-f" {generation_config.num_return_sequences} (respectivelly)"
+f" {generation_config.num_return_sequences} (respectively)"
 )

 # 11. broadcast inputs to the desired number of beams
@@ -994,7 +994,7 @@ class TFGenerationMixin:
 raise ValueError(
 "Beam search decoding cannot return more sequences than it has beams. Please set num_beams >="
 f" num_return_sequences, got {generation_config.num_beams} and"
-f" {generation_config.num_return_sequences} (respectivelly)"
+f" {generation_config.num_return_sequences} (respectively)"
 )

 # 11. prepare logits warper
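The `ValueError` corrected in the two hunks above enforces that beam search can only return as many sequences as it has beams; a minimal sketch of the constraint (written against the PyTorch `generate` API for brevity, with an arbitrary model and values, not part of the patch):

    from transformers import AutoModelForCausalLM, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    model = AutoModelForCausalLM.from_pretrained("gpt2")
    inputs = tokenizer("A sequence: 1, 2", return_tensors="pt")

    # Fine: num_return_sequences <= num_beams.
    outputs = model.generate(**inputs, num_beams=4, num_return_sequences=4)
    # num_return_sequences=8 with num_beams=4 would raise an error like the one quoted above.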
@@ -1626,7 +1626,7 @@ class TFGenerationMixin:
 )
 use_cache = model_kwargs.pop("use_cache", self.generation_config.use_cache)
 use_xla = not tf.executing_eagerly()
-# TODO (Joao): fix cache format or find programatic way to detect cache index
+# TODO (Joao): fix cache format or find programmatic way to detect cache index
 # GPT2 and other models has a slightly different cache structure, with a different batch axis
 model_name = str(self.decoder) if "EncoderDecoder" in str(self) else str(self)
 cache_batch_axis = 1 if any(model_prefix in model_name for model_prefix in ("TFGPT2", "TFCTRL")) else 0
@@ -1910,7 +1910,7 @@ class TFGenerationMixin:
 )
 use_cache = model_kwargs.pop("use_cache", self.generation_config.use_cache)
 use_xla = not tf.executing_eagerly()
-# TODO (Joao): fix cache format or find programatic way to detect cache index
+# TODO (Joao): fix cache format or find programmatic way to detect cache index
 # GPT2 and other models has a slightly different cache structure, with a different batch axis
 model_name = str(self.decoder) if "EncoderDecoder" in str(self) else str(self)
 cache_batch_axis = 1 if any(model_prefix in model_name for model_prefix in ("TFGPT2", "TFCTRL")) else 0
@@ -2253,7 +2253,7 @@ class TFGenerationMixin:

 use_cache = model_kwargs.pop("use_cache", self.generation_config.use_cache)
 use_xla = not tf.executing_eagerly()
-# TODO (Joao): fix cache format or find programatic way to detect cache index
+# TODO (Joao): fix cache format or find programmatic way to detect cache index
 # GPT2 and other models has a slightly different cache structure, with a different batch axis
 model_name = str(self.decoder) if "EncoderDecoder" in str(self) else str(self)
 cache_batch_axis = 1 if any(model_prefix in model_name for model_prefix in ("TFGPT2", "TFCTRL")) else 0
@@ -2788,7 +2788,7 @@ class TFGenerationMixin:
 model_kwargs.pop("use_cache", None)

 use_xla = not tf.executing_eagerly()
-# TODO (Joao): fix cache format or find programatic way to detect cache index
+# TODO (Joao): fix cache format or find programmatic way to detect cache index
 # GPT2 and other models has a slightly different cache structure, with a different batch axis
 model_name = str(self.decoder) if "EncoderDecoder" in str(self) else str(self)
 cache_batch_axis = 1 if any(model_prefix in model_name for model_prefix in ("TFGPT2", "TFCTRL")) else 0
@@ -362,7 +362,7 @@ class GenerationMixin:
 inherit from `GenerationMixin` to benefit from all generation-related automation in our codebase;
 - `BarkModel` has a custom `generate` method and one of its inner models calls `GenerationMixin.generate`.
 However, its `generate` does not share the same interface as `GenerationMixin.generate`. In this case,
-`BarkModel` shoud NOT inherit from `GenerationMixin`, as it breaks the `generate` interface.
+`BarkModel` should NOT inherit from `GenerationMixin`, as it breaks the `generate` interface.

 The class exposes [`~generation.GenerationMixin.generate`], which can be used for:
 - *greedy decoding* if `num_beams=1` and `do_sample=False`
@@ -392,7 +392,7 @@ class GenerationMixin:
 - Exception 1: when passing input_embeds, input_ids may be missing entries
 - Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
 - Exception 3: with synced GPUs cache_position may go out of bounds, but we only want dummy token in that case.
-- Excpetion 4: If input_embeds are passed then slice it through `cache_position`, to keep only the unprocessed tokens and
+- Exception 4: If input_embeds are passed then slice it through `cache_position`, to keep only the unprocessed tokens and
 generate the first token for each sequence. Later use the generated Input ids for continuation.

 The current implementation does not rely on ``self`` and could be
@@ -967,7 +967,7 @@ class GenerationMixin:
 assistant_model=assistant_model,
 assistant_prune_lm_head=True, # prune LM head of assistant model
 )
-# Since we prune the LM head, we cannot use the repetition penalty on the assistant model due to mismaches between token ids and logits index
+# Since we prune the LM head, we cannot use the repetition penalty on the assistant model due to mismatches between token ids and logits index
 assistant_model.generation_config.repetition_penalty = None
 candidate_generator = UniversalSpeculativeDecodingGenerator(
 input_ids=input_ids,
@@ -171,7 +171,7 @@ def find_tied_parameters(model: "nn.Module", **kwargs):
 ```
 """

-# get ALL model parameters and thier names
+# get ALL model parameters and their names
 all_named_parameters = dict(model.named_parameters(remove_duplicate=False))

 # get ONLY unique named parameters,
@@ -187,7 +187,7 @@ def find_tied_parameters(model: "nn.Module", **kwargs):
 for tied_param_name in tied_param_names:
 tied_param = all_named_parameters[tied_param_name]
 for param_name, param in no_duplicate_named_parameters.items():
-# compare if parameters are the same, if so, group thier names together
+# compare if parameters are the same, if so, group their names together
 if param is tied_param:
 if param_name not in tied_param_groups:
 tied_param_groups[param_name] = []
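For context on what the two `find_tied_parameters` hunks above operate on, a minimal sketch of a model with tied weights (the exact import path and return format of the helper are assumptions, not part of the patch):

    import torch.nn as nn

    class TiedModel(nn.Module):
        def __init__(self):
            super().__init__()
            self.embed = nn.Linear(16, 16, bias=False)
            self.head = nn.Linear(16, 16, bias=False)
            self.head.weight = self.embed.weight  # the two modules now share one parameter

    model = TiedModel()
    # find_tied_parameters(model) would group the names of the shared tensor,
    # e.g. something like [["embed.weight", "head.weight"]].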
@@ -329,7 +329,7 @@ class TorchExportableModuleWithStaticCache(torch.nn.Module):
 This util function is designed to test exported models by simulating the generation process.
 It processes the input prompt tokens sequentially (no parallel prefill).
 This generate function is not intended to replace the original `generate` method, and the support
-for leveraging the original `generate` is potentially planed!
+for leveraging the original `generate` is potentially planned!

 Args:
 exported_program (`torch.export.ExportedProgram`): The exported program generated via `torch.export`.
@@ -28,7 +28,7 @@ def autoname_modules(model):
 module.name = name


-# Get the linear_tag from a modul name. For example: model.layers.31.self_attn.k_proj -> self_attn.k_proj
+# Get the linear_tag from a module name. For example: model.layers.31.self_attn.k_proj -> self_attn.k_proj
 def name_to_linear_tag(name):
 return ".".join([n for n in name.split(".") if ((n not in ["model", "layers"]) and (not n.isnumeric()))])

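The `name_to_linear_tag` helper shown in the hunk above can be exercised on its own; a minimal sketch (the sample module name is simply the one from the comment above):

    def name_to_linear_tag(name):
        # Drop "model", "layers" and purely numeric path components, keep the rest.
        return ".".join([n for n in name.split(".") if ((n not in ["model", "layers"]) and (not n.isnumeric()))])

    print(name_to_linear_tag("model.layers.31.self_attn.k_proj"))  # -> self_attn.k_proj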
@@ -86,9 +86,9 @@ def prepare_for_hqq_linear(model, quantization_config=None, modules_to_not_conve
 """
 Prepares nn.Linear layers for HQQ quantization.
 Since each layer type can have separate quantization parameters, we need to do the following:
-1- tag each module with its neme via autoname_modules()
+1- tag each module with its name via autoname_modules()
 2- Extract linear_tags (e.g. ['self_attn.q_proj', ...])
-3- Map quantization parameters as a dictionary linear_tag -> quant_params as HQQLinear exepects it, this is referred to as patch_params
+3- Map quantization parameters as a dictionary linear_tag -> quant_params as HQQLinear expects it, this is referred to as patch_params
 """

 modules_to_not_convert = [] if modules_to_not_convert is None else modules_to_not_convert
@@ -160,7 +160,7 @@ def distribute_module(
 output_fn=None,
 ) -> nn.Module:
 """
-Copy pasted from torch's function but we remove the communications (partitionning)
+Copy pasted from torch's function but we remove the communications (partitioning)
 as well as buffer registering that is similarly not efficient.
 """
 if len(module._forward_pre_hooks) == 0:
@@ -225,7 +225,7 @@ class GatherParallel(TensorParallelLayer):

 @staticmethod
 def _prepare_output_fn(output_layouts, use_local_output, mod, outputs, device_mesh):
-# this op cannot be asynch, otherwise it completely breaks the outputs of models
+# this op cannot be async, otherwise it completely breaks the outputs of models
 torch.distributed.all_reduce(outputs[0], op=torch.distributed.ReduceOp.SUM, async_op=False)
 return outputs

@@ -343,7 +343,7 @@ class HungarianMatcher(nn.Module):

 # Compute the classification cost. Contrary to the loss, we don't use the NLL,
 # but approximate it in 1 - proba[target class].
-# The 1 is a constant that doesn't change the matching, it can be ommitted.
+# The 1 is a constant that doesn't change the matching, it can be omitted.
 class_cost = -out_prob[:, target_ids]

 # Compute the L1 cost between boxes
@@ -99,7 +99,7 @@ class RTDetrHungarianMatcher(nn.Module):
 target_bbox = torch.cat([v["boxes"] for v in targets])
 # Compute the classification cost. Contrary to the loss, we don't use the NLL,
 # but approximate it in 1 - proba[target class].
-# The 1 is a constant that doesn't change the matching, it can be ommitted.
+# The 1 is a constant that doesn't change the matching, it can be omitted.
 if self.use_focal_loss:
 out_prob = F.sigmoid(outputs["logits"].flatten(0, 1))
 out_prob = out_prob[:, target_ids]
@@ -593,7 +593,7 @@ class AlignVisionBlock(nn.Module):

 class AlignVisionEncoder(nn.Module):
 r"""
-Forward propogates the embeddings through each vision encoder (EfficientNet) block.
+Forward propagates the embeddings through each vision encoder (EfficientNet) block.

 Args:
 config ([`AlignVisionConfig`]):
@@ -36,7 +36,7 @@ class AlignProcessorKwargs(ProcessingKwargs, total=False):
 class AlignProcessor(ProcessorMixin):
 r"""
 Constructs an ALIGN processor which wraps [`EfficientNetImageProcessor`] and
-[`BertTokenizer`]/[`BertTokenizerFast`] into a single processor that interits both the image processor and
+[`BertTokenizer`]/[`BertTokenizerFast`] into a single processor that inherits both the image processor and
 tokenizer functionalities. See the [`~AlignProcessor.__call__`] and [`~OwlViTProcessor.decode`] for more
 information.
 The preferred way of passing kwargs is as a dictionary per modality, see usage example below.
@@ -1936,7 +1936,7 @@ class AutoformerForPrediction(AutoformerPreTrainedModel):
 params = None
 if future_values is not None:
 # outputs.last_hidden_state and trend
-# loc is 4rd last and scale is 3rd last output
+# loc is 4th last and scale is 3rd last output
 params = self.output_params(outputs[0] + outputs[1])
 distribution = self.output_distribution(params, loc=outputs[-3], scale=outputs[-2])

@@ -164,7 +164,7 @@ def convert_checkpoint_to_pytorch(tf_checkpoint_path: str, config_path: str, pyt
 new_model = BertForMaskedLM.from_pretrained(pytorch_dump_path)
 print(new_model.eval())

-print("Model conversion was done sucessfully!")
+print("Model conversion was done successfully!")


 if __name__ == "__main__":
@@ -235,7 +235,7 @@ class Blip2Config(PretrainedConfig):
 num_query_tokens (`int`, *optional*, defaults to 32):
 The number of query tokens passed through the Transformer.
 image_text_hidden_size (`int`, *optional*, defaults to 256):
-Dimentionality of the hidden state of the image-text fusion layer.
+Dimensionality of the hidden state of the image-text fusion layer.

 image_token_index (`int`, *optional*):
 Token index of special image token.
@@ -899,7 +899,7 @@ class BloomForCausalLM(BloomPreTrainedModel, GenerationMixin):
 use_cache=True,
 **kwargs,
 ):
-# Overwriten because of the fixed-shape attention mask creation
+# Overwritten because of the fixed-shape attention mask creation

 # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
 # Exception 1: when passing input_embeds, input_ids may be missing entries
@@ -49,17 +49,17 @@ class ClapFeatureExtractor(SequenceFeatureExtractor):
 The sampling rate at which the audio files should be digitalized expressed in hertz (Hz). This only serves
 to warn users if the audio fed to the feature extractor does not have the same sampling rate.
 hop_length (`int`,*optional*, defaults to 480):
-Length of the overlaping windows for the STFT used to obtain the Mel Spectrogram. The audio will be split
+Length of the overlapping windows for the STFT used to obtain the Mel Spectrogram. The audio will be split
 in smaller `frames` with a step of `hop_length` between each frame.
 max_length_s (`int`, *optional*, defaults to 10):
 The maximum input length of the model in seconds. This is used to pad the audio.
 fft_window_size (`int`, *optional*, defaults to 1024):
 Size of the window (in samples) on which the Fourier transform is applied. This controls the frequency
-resolution of the spectrogram. 400 means that the fourrier transform is computed on windows of 400 samples.
+resolution of the spectrogram. 400 means that the fourier transform is computed on windows of 400 samples.
 padding_value (`float`, *optional*, defaults to 0.0):
 Padding value used to pad the audio. Should correspond to silences.
 return_attention_mask (`bool`, *optional*, defaults to `False`):
-Whether or not the model should return the attention masks coresponding to the input.
+Whether or not the model should return the attention masks corresponding to the input.
 frequency_min (`float`, *optional*, defaults to 0):
 The lowest frequency of interest. The STFT will not be computed for values below this.
 frequency_max (`float`, *optional*, defaults to 14000):
@@ -141,7 +141,7 @@ class ClapFeatureExtractor(SequenceFeatureExtractor):
 Serializes this instance to a Python dictionary.

 Returns:
-`Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance, excpet for the
+`Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance, except for the
 mel filter banks, which do not need to be saved or printed as they are too long.
 """
 output = copy.deepcopy(self.__dict__)
@@ -1067,7 +1067,7 @@ CLAP_TEXT_INPUTS_DOCSTRING = r"""
 CLAP_AUDIO_INPUTS_DOCSTRING = r"""
 Args:
 input_features (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
-Input audio features. This should be returnes by the [`ClapFeatureExtractor`] class that you can also
+Input audio features. This should be returned by the [`ClapFeatureExtractor`] class that you can also
 retrieve from [`AutoFeatureExtractor`]. See [`ClapFeatureExtractor.__call__`] for details.
 is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*):
 Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance
@@ -1105,7 +1105,7 @@ CLAP_INPUTS_DOCSTRING = r"""

 [What are position IDs?](../glossary#position-ids)
 input_features (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
-Input audio features. This should be returnes by the [`ClapFeatureExtractor`] class that you can also
+Input audio features. This should be returned by the [`ClapFeatureExtractor`] class that you can also
 retrieve from [`AutoFeatureExtractor`]. See [`ClapFeatureExtractor.__call__`] for details.
 return_loss (`bool`, *optional*):
 Whether or not to return the contrastive loss.
@@ -127,7 +127,7 @@ class CodeGenTokenizerFast(PreTrainedTokenizerFast):
 if kwargs.pop("add_bos_token", False):
 model_id = kwargs.pop("name_or_path", "")
 raise ValueError(
-"Currenty GPT2's fast tokenizer does NOT support adding a BOS token. "
+"Currently GPT2's fast tokenizer does NOT support adding a BOS token. "
 "Instead you should use GPT2's slow tokenizer class `CodeGenTokenizer` as follows: \n"
 f"`CodeGenTokenizer.from_pretrained('{model_id}')`\nor\n"
 f"`AutoTokenizer.from_pretrained('{model_id}', use_fast=False)`\n"
@@ -277,7 +277,7 @@ def final():

 def convert_cvt_checkpoint(cvt_model, image_size, cvt_file_name, pytorch_dump_folder):
 """
-Fucntion to convert the microsoft cvt checkpoint to huggingface checkpoint
+Function to convert the microsoft cvt checkpoint to huggingface checkpoint
 """
 img_labels_file = "imagenet-1k-id2label.json"
 num_labels = 1000
@@ -58,7 +58,7 @@ ORIGINAL_TO_CONVERTED_KEY_MAPPING = {
 # activation function weight
 r"transformer\.encoder\.layers\.(\d+)\.activation\.weight": r"encoder.layers.\1.activation_fn.weight",
 #########################################################################################################################################
-# decoder layers: 2 times output projection, 2 feedforward neural networks and 3 layernorms + activiation function weight
+# decoder layers: 2 times output projection, 2 feedforward neural networks and 3 layernorms + activation function weight
 r"transformer\.decoder\.layers\.(\d+)\.self_attn\.out_proj\.(bias|weight)": r"decoder.layers.\1.self_attn.self_attn.output_proj.\2",
 r"transformer\.decoder\.layers\.(\d+)\.cross_attn\.out_proj\.(bias|weight)": r"decoder.layers.\1.cross_attn.cross_attn.output_proj.\2",
 # FFNs
@@ -144,7 +144,7 @@ def write_model(model_name, pretrained_model_weights_path, pytorch_dump_folder_p
 config.label2id = {v: k for k, v in id2label.items()}
 # load original model from local path
 loaded = torch.load(pretrained_model_weights_path, map_location=torch.device("cpu"), weights_only=True)["model"]
-# Renaming the original model state dictionary to HF compatibile
+# Renaming the original model state dictionary to HF compatible
 all_keys = list(loaded.keys())
 new_keys = convert_old_keys_to_new_keys(all_keys)
 state_dict = {}
@@ -1297,7 +1297,7 @@ class JukeboxConditionalAutoregressive(nn.Module):
 ):
 """
 Autoregressive model on either lyric tokens or music tokens, or both. The attention pattern should be properly
-set fro each configuration.
+set for each configuration.

 Args:
 config (`JukeboxPriorConfig`):
@@ -142,7 +142,7 @@ def merge_patches(patches: torch.Tensor, batch_size: int, padding: int) -> torch
 return patches

 if n_patches_per_batch < 4:
-# for each batch, atleast 4 small patches are required to
+# for each batch, at least 4 small patches are required to
 # recreate a large square patch from merging them and later padding is applied
 # 3 x (8x8) patches becomes 1 x ( 8x8 ) patch (extra patch ignored, no padding)
 # 4 x (8x8) patches becomes 1 x (16x16) patch (padding later)
@@ -118,7 +118,7 @@ class Embeddings(nn.Module):

 # Setting the position-ids to the registered buffer in constructor, it helps
 # when tracing the model without passing position-ids, solves
-# isues similar to issue #5664
+# issues similar to issue #5664
 if hasattr(self, "position_ids"):
 position_ids = self.position_ids[:, :seq_length]
 else:
@@ -72,7 +72,7 @@ class DonutImageProcessor(BaseImageProcessor):
 Whether to align the long axis of the image with the long axis of `size` by rotating by 90 degrees.
 do_pad (`bool`, *optional*, defaults to `True`):
 Whether to pad the image. If `random_padding` is set to `True` in `preprocess`, each image is padded with a
-random amont of padding on each size, up to the largest image size in the batch. Otherwise, all images are
+random amount of padding on each size, up to the largest image size in the batch. Otherwise, all images are
 padded to the largest image size in the batch.
 do_rescale (`bool`, *optional*, defaults to `True`):
 Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
@@ -349,7 +349,7 @@ class DonutImageProcessor(BaseImageProcessor):
 Whether to align the long axis of the image with the long axis of `size` by rotating by 90 degrees.
 do_pad (`bool`, *optional*, defaults to `self.do_pad`):
 Whether to pad the image. If `random_padding` is set to `True`, each image is padded with a random
-amont of padding on each size, up to the largest image size in the batch. Otherwise, all images are
+amount of padding on each size, up to the largest image size in the batch. Otherwise, all images are
 padded to the largest image size in the batch.
 random_padding (`bool`, *optional*, defaults to `self.random_padding`):
 Whether to use random padding when padding the image. If `True`, each image in the batch with be padded
@@ -142,7 +142,7 @@ class FastSpeech2ConformerConfig(PretrainedConfig):
 speaker id embedding layer.
 num_languages (`int`, *optional*):
 Number of languages. If set to > 1, assume that the language ids will be provided as the input and use the
-languge id embedding layer.
+language id embedding layer.
 speaker_embed_dim (`int`, *optional*):
 Speaker embedding dimension. If set to > 0, assume that speaker_embedding will be provided as the input.
 is_encoder_decoder (`bool`, *optional*, defaults to `True`):
@@ -391,7 +391,7 @@ class FastSpeech2ConformerVariancePredictor(nn.Module):
 dropout_rate=0.5,
 ):
 """
-Initilize variance predictor module.
+Initialize variance predictor module.

 Args:
 input_dim (`int`): Input dimension.
@@ -948,7 +948,7 @@ class FlaubertModel(FlaubertPreTrainedModel):

 # Setting the position-ids to the registered buffer in constructor, it helps
 # when tracing the model without passing position-ids, solves
-# isues similar to issue #5664
+# issues similar to issue #5664
 if position_ids is None:
 if hasattr(self, "position_ids"):
 position_ids = self.position_ids[:, :slen]
@@ -360,7 +360,7 @@ class FocalNetModulation(nn.Module):
 x = self.projection_in(hidden_state).permute(0, 3, 1, 2).contiguous()
 q, ctx, gates = torch.split(x, (num_channels, num_channels, self.focal_level + 1), 1)

-# context aggreation
+# context aggregation
 ctx_all = 0
 for level in range(self.focal_level):
 ctx = self.focal_layers[level](ctx)
@@ -379,7 +379,7 @@ class FocalNetModulation(nn.Module):
 if self.use_post_layernorm_in_modulation:
 x_out = self.layernorm(x_out)

-# post linear porjection
+# post linear projection
 x_out = self.projection_out(x_out)
 x_out = self.projection_dropout(x_out)
 return x_out
@@ -415,7 +415,7 @@ class FocalNetLayer(nn.Module):
 dim (`int`):
 Number of input channels.
 input_resolution (`Tuple[int]`):
-Input resulotion.
+Input resolution.
 drop_path (`float`, *optional*, defaults to 0.0):
 Stochastic depth rate.
 """
@@ -244,7 +244,7 @@ def _tokenize_prompts_with_image_and_batch(
 - pad all the sequences to this length so we can convert them into a 3D tensor.
 """

-# If not tool use, tranform the coordinates while tokenizing
+# If not tool use, transform the coordinates while tokenizing
 if scale_factors is not None:
 transformed_prompt_tokens = []
 for prompt_seq, scale_factor_seq in zip(prompts, scale_factors):
@@ -96,7 +96,7 @@ class Gemma3TextConfig(PretrainedConfig):
 Scaling factor when applying tanh softcapping on the attention scores.
 cache_implementation (`str`, *optional*, defaults to `"hybrid"`): the cache type to be used with `generate`.
 rope_scaling (`Dict`, *optional*):
-Dictionary containing the scaling configuration for the RoPE embeddings used in gloabl attention. NOTE: if you apply new rope type
+Dictionary containing the scaling configuration for the RoPE embeddings used in global attention. NOTE: if you apply new rope type
 and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
 accordingly.
 Expected contents:
@@ -140,7 +140,7 @@ class Gemma3ImageProcessor(BaseImageProcessor):
 ):
 """
 Pan and Scan and image, by cropping into smaller images when the aspect ratio exceeds
-minumum allowed ratio.
+minimum allowed ratio.

 Args:
 image (`np.ndarray`):
@@ -108,7 +108,7 @@ class Gemma3ImageProcessorFast(BaseImageProcessorFast):
 ):
 """
 Pan and Scan an image, by cropping into smaller images when the aspect ratio exceeds
-minumum allowed ratio.
+minimum allowed ratio.

 Args:
 image (`torch.Tensor`):
@@ -1270,7 +1270,7 @@ class Gemma3ForConditionalGeneration(Gemma3PreTrainedModel, GenerationMixin):

 is_training = token_type_ids is not None and labels is not None

-# Replace image id woth PAD if the image token if OOV, to avoid index-errors
+# Replace image id with PAD if the image token if OOV, to avoid index-errors
 if input_ids is not None and self.config.image_token_id >= self.vocab_size:
 special_image_mask = input_ids == self.config.image_token_id
 llm_input_ids = input_ids.clone()
@@ -128,7 +128,7 @@ class Gemma3TextConfig(Gemma2Config):
 Scaling factor when applying tanh softcapping on the attention scores.
 cache_implementation (`str`, *optional*, defaults to `"hybrid"`): the cache type to be used with `generate`.
 rope_scaling (`Dict`, *optional*):
-Dictionary containing the scaling configuration for the RoPE embeddings used in gloabl attention. NOTE: if you apply new rope type
+Dictionary containing the scaling configuration for the RoPE embeddings used in global attention. NOTE: if you apply new rope type
 and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
 accordingly.
 Expected contents:
@ -926,7 +926,7 @@ class Gemma3ForConditionalGeneration(PaliGemmaForConditionalGeneration):
|
|||||||
|
|
||||||
is_training = token_type_ids is not None and labels is not None
|
is_training = token_type_ids is not None and labels is not None
|
||||||
|
|
||||||
# Replace image id woth PAD if the image token if OOV, to avoid index-errors
|
# Replace image id with PAD if the image token if OOV, to avoid index-errors
|
||||||
if input_ids is not None and self.config.image_token_id >= self.vocab_size:
|
if input_ids is not None and self.config.image_token_id >= self.vocab_size:
|
||||||
special_image_mask = input_ids == self.config.image_token_id
|
special_image_mask = input_ids == self.config.image_token_id
|
||||||
llm_input_ids = input_ids.clone()
|
llm_input_ids = input_ids.clone()
|
||||||
|
@ -1495,7 +1495,7 @@ class GitForCausalLM(GitPreTrainedModel, GenerationMixin):
|
|||||||
>>> processor = AutoProcessor.from_pretrained("microsoft/git-base-vatex")
|
>>> processor = AutoProcessor.from_pretrained("microsoft/git-base-vatex")
|
||||||
>>> model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-vatex")
|
>>> model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-vatex")
|
||||||
|
|
||||||
>>> # set seed for reproducability
|
>>> # set seed for reproducibility
|
||||||
>>> np.random.seed(45)
|
>>> np.random.seed(45)
|
||||||
|
|
||||||
|
|
||||||
|
@ -199,7 +199,7 @@ class GPTNeoXConfig(PretrainedConfig):
|
|||||||
|
|
||||||
if self.hidden_size % self.num_attention_heads != 0:
|
if self.hidden_size % self.num_attention_heads != 0:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"The hidden size is not divisble by the number of attention heads! Make sure to update them!"
|
"The hidden size is not divisible by the number of attention heads! Make sure to update them!"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
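The check above exists because each attention head operates on `hidden_size // num_attention_heads` channels, so the division must be exact; a tiny sketch with illustrative numbers:

```python
# Illustrative numbers: the per-head dimension must be an integer, which is
# exactly what the ValueError above guards against.
hidden_size, num_attention_heads = 768, 12
assert hidden_size % num_attention_heads == 0, "hidden size must be divisible by the number of attention heads"
head_dim = hidden_size // num_attention_heads
print(head_dim)  # 64
```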
@ -402,7 +402,7 @@ def convert_grounding_dino_checkpoint(args):
|
|||||||
"grounding-dino-tiny": "https://huggingface.co/ShilongLiu/GroundingDino/resolve/main/groundingdino_swint_ogc.pth",
|
"grounding-dino-tiny": "https://huggingface.co/ShilongLiu/GroundingDino/resolve/main/groundingdino_swint_ogc.pth",
|
||||||
"grounding-dino-base": "https://huggingface.co/ShilongLiu/GroundingDino/resolve/main/groundingdino_swinb_cogcoor.pth",
|
"grounding-dino-base": "https://huggingface.co/ShilongLiu/GroundingDino/resolve/main/groundingdino_swinb_cogcoor.pth",
|
||||||
}
|
}
|
||||||
# Define default GroundingDino configuation
|
# Define default GroundingDino configuration
|
||||||
config = get_grounding_dino_config(model_name)
|
config = get_grounding_dino_config(model_name)
|
||||||
|
|
||||||
# Load original checkpoint
|
# Load original checkpoint
|
||||||
|
@ -1850,7 +1850,7 @@ class GroundingDinoDecoder(GroundingDinoPreTrainedModel):
|
|||||||
|
|
||||||
# In original implementation they apply layer norm before outputting intermediate hidden states
|
# In original implementation they apply layer norm before outputting intermediate hidden states
|
||||||
# Though that's not through between layers so the layers use as input the output of the previous layer
|
# Though that's not through between layers so the layers use as input the output of the previous layer
|
||||||
# withtout layer norm
|
# without layer norm
|
||||||
if output_hidden_states:
|
if output_hidden_states:
|
||||||
all_hidden_states += (self.layer_norm(hidden_states),)
|
all_hidden_states += (self.layer_norm(hidden_states),)
|
||||||
|
|
||||||
|
@ -1425,7 +1425,7 @@ HUBERT_INPUTS_DOCSTRING = r"""
|
|||||||
|
|
||||||
|
|
||||||
@add_start_docstrings(
|
@add_start_docstrings(
|
||||||
"The bare TFHubert Model transformer outputing raw hidden-states without any specific head on top.",
|
"The bare TFHubert Model transformer outputting raw hidden-states without any specific head on top.",
|
||||||
HUBERT_START_DOCSTRING,
|
HUBERT_START_DOCSTRING,
|
||||||
)
|
)
|
||||||
class TFHubertModel(TFHubertPreTrainedModel):
|
class TFHubertModel(TFHubertPreTrainedModel):
|
||||||
|
@ -74,8 +74,8 @@ class IBertConfig(PretrainedConfig):
|
|||||||
quant_mode (`bool`, *optional*, defaults to `False`):
|
quant_mode (`bool`, *optional*, defaults to `False`):
|
||||||
Whether to quantize the model or not.
|
Whether to quantize the model or not.
|
||||||
force_dequant (`str`, *optional*, defaults to `"none"`):
|
force_dequant (`str`, *optional*, defaults to `"none"`):
|
||||||
Force dequantize specific nonlinear layer. Dequatized layers are then executed with full precision.
|
Force dequantize specific nonlinear layer. Dequantized layers are then executed with full precision.
|
||||||
`"none"`, `"gelu"`, `"softmax"`, `"layernorm"` and `"nonlinear"` are supported. As deafult, it is set as
|
`"none"`, `"gelu"`, `"softmax"`, `"layernorm"` and `"nonlinear"` are supported. As default, it is set as
|
||||||
`"none"`, which does not dequantize any layers. Please specify `"gelu"`, `"softmax"`, or `"layernorm"` to
|
`"none"`, which does not dequantize any layers. Please specify `"gelu"`, `"softmax"`, or `"layernorm"` to
|
||||||
dequantize GELU, Softmax, or LayerNorm, respectively. `"nonlinear"` will dequantize all nonlinear layers,
|
dequantize GELU, Softmax, or LayerNorm, respectively. `"nonlinear"` will dequantize all nonlinear layers,
|
||||||
i.e., GELU, Softmax, and LayerNorm.
|
i.e., GELU, Softmax, and LayerNorm.
|
||||||
|
@ -276,7 +276,7 @@ class InternVLProcessor(ProcessorMixin):
|
|||||||
|
|
||||||
Args:
|
Args:
|
||||||
metadata (`VideoMetadata`):
|
metadata (`VideoMetadata`):
|
||||||
`VideoMetadata` object containing metadat about the video, such as "total_num_frames" or "fps".
|
`VideoMetadata` object containing metadata about the video, such as "total_num_frames" or "fps".
|
||||||
num_frames (`int`, *optional*):
|
num_frames (`int`, *optional*):
|
||||||
Number of frames to sample uniformly. If None, all frames are sampled.
|
Number of frames to sample uniformly. If None, all frames are sampled.
|
||||||
initial_shift (`bool`, `float` or `int`, defaults to `0`):
|
initial_shift (`bool`, `float` or `int`, defaults to `0`):
|
||||||
|
@ -246,7 +246,7 @@ class LevitAttentionSubsample(nn.Module):
|
|||||||
self.out_dim_keys_values = attention_ratio * key_dim * num_attention_heads + key_dim * num_attention_heads
|
self.out_dim_keys_values = attention_ratio * key_dim * num_attention_heads + key_dim * num_attention_heads
|
||||||
self.out_dim_projection = attention_ratio * key_dim * num_attention_heads
|
self.out_dim_projection = attention_ratio * key_dim * num_attention_heads
|
||||||
self.resolution_out = resolution_out
|
self.resolution_out = resolution_out
|
||||||
# resolution_in is the intial resolution, resoloution_out is final resolution after downsampling
|
# resolution_in is the initial resolution, resolution_out is final resolution after downsampling
|
||||||
self.keys_values = MLPLayerWithBN(input_dim, self.out_dim_keys_values)
|
self.keys_values = MLPLayerWithBN(input_dim, self.out_dim_keys_values)
|
||||||
self.queries_subsample = LevitSubsample(stride, resolution_in)
|
self.queries_subsample = LevitSubsample(stride, resolution_in)
|
||||||
self.queries = MLPLayerWithBN(input_dim, key_dim * num_attention_heads)
|
self.queries = MLPLayerWithBN(input_dim, key_dim * num_attention_heads)
|
||||||
@ -370,7 +370,7 @@ class LevitStage(nn.Module):
|
|||||||
self.layers = []
|
self.layers = []
|
||||||
self.config = config
|
self.config = config
|
||||||
self.resolution_in = resolution_in
|
self.resolution_in = resolution_in
|
||||||
# resolution_in is the intial resolution, resolution_out is final resolution after downsampling
|
# resolution_in is the initial resolution, resolution_out is final resolution after downsampling
|
||||||
for _ in range(depths):
|
for _ in range(depths):
|
||||||
self.layers.append(
|
self.layers.append(
|
||||||
LevitResidualLayer(
|
LevitResidualLayer(
|
||||||
|
@ -55,7 +55,7 @@ if is_torchvision_available():
|
|||||||
|
|
||||||
def get_factors(dividend: int) -> Set[int]:
|
def get_factors(dividend: int) -> Set[int]:
|
||||||
"""
|
"""
|
||||||
Calculate all factors of a given number, i.e. a dividor that leaves
|
Calculate all factors of a given number, i.e. a divisor that leaves
|
||||||
no remainder. For example, if dividend=12, it will return {1, 2, 3, 4, 6, 12}.
|
no remainder. For example, if dividend=12, it will return {1, 2, 3, 4, 6, 12}.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
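A straightforward sketch of the factor computation described in the docstring above (not necessarily the library's exact code):

```python
from typing import Set

def get_factors_sketch(dividend: int) -> Set[int]:
    """Return every divisor of `dividend` that leaves no remainder."""
    factors = set()
    for candidate in range(1, int(dividend**0.5) + 1):
        if dividend % candidate == 0:
            factors.add(candidate)
            factors.add(dividend // candidate)
    return factors

print(get_factors_sketch(12))  # {1, 2, 3, 4, 6, 12}
```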
@ -60,7 +60,7 @@ class LlavaNextVideoImageProcessor(BaseImageProcessor):
|
|||||||
image_grid_pinpoints (`List` *optional*, defaults to `[[672, 336], [336, 672], [672, 672], [336, 1008], [1008, 336]]`):
|
image_grid_pinpoints (`List` *optional*, defaults to `[[672, 336], [336, 672], [672, 672], [336, 1008], [1008, 336]]`):
|
||||||
A list of possible resolutions to use for processing high resolution images. The best resolution is selected
|
A list of possible resolutions to use for processing high resolution images. The best resolution is selected
|
||||||
based on the original size of the image. Can be overridden by `image_grid_pinpoints` in the `preprocess`
|
based on the original size of the image. Can be overridden by `image_grid_pinpoints` in the `preprocess`
|
||||||
method. Not used for processinf videos.
|
method. Not used for processing videos.
|
||||||
resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
|
resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
|
||||||
Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method.
|
Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method.
|
||||||
do_center_crop (`bool`, *optional*, defaults to `True`):
|
do_center_crop (`bool`, *optional*, defaults to `True`):
|
||||||
|
@ -405,7 +405,7 @@ class Mask2FormerHungarianMatcher(nn.Module):
|
|||||||
"""
|
"""
|
||||||
super().__init__()
|
super().__init__()
|
||||||
if cost_class == 0 and cost_mask == 0 and cost_dice == 0:
|
if cost_class == 0 and cost_mask == 0 and cost_dice == 0:
|
||||||
raise ValueError("All costs cant be 0")
|
raise ValueError("All costs can't be 0")
|
||||||
|
|
||||||
self.num_points = num_points
|
self.num_points = num_points
|
||||||
self.cost_class = cost_class
|
self.cost_class = cost_class
|
||||||
|
@ -829,7 +829,7 @@ class MaskFormerHungarianMatcher(nn.Module):
|
|||||||
"""
|
"""
|
||||||
super().__init__()
|
super().__init__()
|
||||||
if cost_class == 0 and cost_mask == 0 and cost_dice == 0:
|
if cost_class == 0 and cost_mask == 0 and cost_dice == 0:
|
||||||
raise ValueError("All costs cant be 0")
|
raise ValueError("All costs can't be 0")
|
||||||
self.cost_class = cost_class
|
self.cost_class = cost_class
|
||||||
self.cost_mask = cost_mask
|
self.cost_mask = cost_mask
|
||||||
self.cost_dice = cost_dice
|
self.cost_dice = cost_dice
|
||||||
|
@ -98,7 +98,7 @@ def add_megatron_checkpoint_args(parser):
|
|||||||
default=128,
|
default=128,
|
||||||
help=(
|
help=(
|
||||||
"Pad the vocab size to be divisible by this value. "
|
"Pad the vocab size to be divisible by this value. "
|
||||||
"This is added for computational efficieny reasons. "
|
"This is added for computational efficiency reasons. "
|
||||||
"Only used when converting a Transformers checkpoint to a Megatron checkpoint."
|
"Only used when converting a Transformers checkpoint to a Megatron checkpoint."
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
@ -235,7 +235,7 @@ def transformers_to_megatron_fix_query_key_value_ordering(
|
|||||||
param, checkpoint_version, num_splits, num_heads, hidden_size
|
param, checkpoint_version, num_splits, num_heads, hidden_size
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Permutes layout of param tensor to the one compatible with respective NVIDIA Megatron-LM chekpoint versions. Input
|
Permutes layout of param tensor to the one compatible with respective NVIDIA Megatron-LM checkpoint versions. Input
|
||||||
is [num_splits * num_heads * hidden_size, :] and output is [num_heads * hidden_size * num_splits, :] for version
|
is [num_splits * num_heads * hidden_size, :] and output is [num_heads * hidden_size * num_splits, :] for version
|
||||||
1.0 and [num_heads * num_splits * hidden_size, :] for version 2.0 and later. If param is the weight tensor of the
|
1.0 and [num_heads * num_splits * hidden_size, :] for version 2.0 and later. If param is the weight tensor of the
|
||||||
self-attention block, the param needs to be already transposed before calling this function.
|
self-attention block, the param needs to be already transposed before calling this function.
|
||||||
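For checkpoint version 2.0 and later, the permutation described above swaps the split axis and the head axis; a self-contained sketch with small illustrative sizes (the real conversion helper handles more cases):

```python
import torch

def to_megatron_qkv_ordering_v2(param, num_splits, num_heads, hidden_size):
    # [num_splits * num_heads * hidden_size, :] -> [num_heads * num_splits * hidden_size, :]
    input_shape = param.size()
    param = param.view((num_splits, num_heads, hidden_size) + input_shape[1:])
    param = param.transpose(0, 1).contiguous()  # swap the splits and heads axes
    return param.view(*input_shape)

qkv = torch.arange(3 * 2 * 4 * 5, dtype=torch.float32).reshape(3 * 2 * 4, 5)
print(to_megatron_qkv_ordering_v2(qkv, num_splits=3, num_heads=2, hidden_size=4).shape)
# torch.Size([24, 5]) -- same shape, rows regrouped head-major instead of split-major
```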
@ -348,7 +348,7 @@ def convert_checkpoint_from_megatron_to_transformers(args):
|
|||||||
raise ValueError(
|
raise ValueError(
|
||||||
"Megatron-LM checkpoint does not contain arguments. This utility only supports Megatron-LM checkpoints"
|
"Megatron-LM checkpoint does not contain arguments. This utility only supports Megatron-LM checkpoints"
|
||||||
" containing all the megatron arguments. This is because it loads all config related to model"
|
" containing all the megatron arguments. This is because it loads all config related to model"
|
||||||
" architecture, the tensor and pipeline model parallel size from the checkpoint insead of user having to"
|
" architecture, the tensor and pipeline model parallel size from the checkpoint instead of user having to"
|
||||||
" manually specify all the details. Please save Megatron-LM checkpoint along with all the megatron"
|
" manually specify all the details. Please save Megatron-LM checkpoint along with all the megatron"
|
||||||
" arguments to use this utility."
|
" arguments to use this utility."
|
||||||
)
|
)
|
||||||
|
@ -1601,7 +1601,7 @@ class MusicgenForCausalLM(MusicgenPreTrainedModel, GenerationMixin):
|
|||||||
# 7. determine generation mode
|
# 7. determine generation mode
|
||||||
generation_mode = generation_config.get_generation_mode()
|
generation_mode = generation_config.get_generation_mode()
|
||||||
|
|
||||||
# 8. prepare batched CFG externally (to enable coexistance with the unbatched CFG)
|
# 8. prepare batched CFG externally (to enable coexistence with the unbatched CFG)
|
||||||
if generation_config.guidance_scale is not None and generation_config.guidance_scale > 1:
|
if generation_config.guidance_scale is not None and generation_config.guidance_scale > 1:
|
||||||
logits_processor.append(ClassifierFreeGuidanceLogitsProcessor(generation_config.guidance_scale))
|
logits_processor.append(ClassifierFreeGuidanceLogitsProcessor(generation_config.guidance_scale))
|
||||||
generation_config.guidance_scale = None
|
generation_config.guidance_scale = None
|
||||||
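The classifier-free guidance enabled above (when `guidance_scale > 1`) blends conditional and unconditional logits; a minimal sketch of that blend, assuming the batch holds the conditional half first, with the actual logits processor possibly differing in details:

```python
import torch

def cfg_blend(scores: torch.Tensor, guidance_scale: float) -> torch.Tensor:
    # First half of the batch: conditional logits, second half: unconditional logits.
    cond, uncond = scores.chunk(2, dim=0)
    return uncond + guidance_scale * (cond - uncond)

scores = torch.randn(4, 10)  # 2 conditional rows + 2 unconditional rows
print(cfg_blend(scores, guidance_scale=3.0).shape)  # torch.Size([2, 10])
```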
@ -2617,7 +2617,7 @@ class MusicgenForConditionalGeneration(PreTrainedModel, GenerationMixin):
|
|||||||
# 7. determine generation mode
|
# 7. determine generation mode
|
||||||
generation_mode = generation_config.get_generation_mode()
|
generation_mode = generation_config.get_generation_mode()
|
||||||
|
|
||||||
# 8. prepare batched CFG externally (to enable coexistance with the unbatched CFG)
|
# 8. prepare batched CFG externally (to enable coexistence with the unbatched CFG)
|
||||||
if generation_config.guidance_scale is not None and generation_config.guidance_scale > 1:
|
if generation_config.guidance_scale is not None and generation_config.guidance_scale > 1:
|
||||||
logits_processor.append(ClassifierFreeGuidanceLogitsProcessor(generation_config.guidance_scale))
|
logits_processor.append(ClassifierFreeGuidanceLogitsProcessor(generation_config.guidance_scale))
|
||||||
generation_config.guidance_scale = None
|
generation_config.guidance_scale = None
|
||||||
|
@ -54,7 +54,7 @@ class MusicgenMelodyFeatureExtractor(SequenceFeatureExtractor):
|
|||||||
sampling_rate (`int`, *optional*, defaults to 32000):
|
sampling_rate (`int`, *optional*, defaults to 32000):
|
||||||
The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
|
The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
|
||||||
hop_length (`int`, *optional*, defaults to 4096):
|
hop_length (`int`, *optional*, defaults to 4096):
|
||||||
Length of the overlaping windows for the STFT used to obtain the Mel Frequency coefficients.
|
Length of the overlapping windows for the STFT used to obtain the Mel Frequency coefficients.
|
||||||
chunk_length (`int`, *optional*, defaults to 30):
|
chunk_length (`int`, *optional*, defaults to 30):
|
||||||
The maximum number of chunks of `sampling_rate` samples used to trim and pad longer or shorter audio
|
The maximum number of chunks of `sampling_rate` samples used to trim and pad longer or shorter audio
|
||||||
sequences.
|
sequences.
|
||||||
|
@ -92,7 +92,7 @@ class MusicgenMelodyOutputWithPast(ModelOutput):
|
|||||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||||
heads.
|
heads.
|
||||||
encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
|
encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
|
||||||
Sequence of conditional hidden-states representing the concatenation of the projeted text encoder output and the projeted audio encoder output.
|
Sequence of conditional hidden-states representing the concatenation of the projected text encoder output and the projected audio encoder output.
|
||||||
Used as a conditional signal.
|
Used as a conditional signal.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@ -757,8 +757,8 @@ MUSICGEN_MELODY_INPUTS_DOCSTRING = r"""
|
|||||||
don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
|
don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
|
||||||
`decoder_input_ids` of shape `(batch_size, sequence_length)`.
|
`decoder_input_ids` of shape `(batch_size, sequence_length)`.
|
||||||
encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
|
encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
|
||||||
Sequence of conditional hidden-states representing the concatenation of the projeted text encoder output and the projeted audio encoder output.
|
Sequence of conditional hidden-states representing the concatenation of the projected text encoder output and the projected audio encoder output.
|
||||||
Used as a conditional signal and will thus be concatenated to the projeted `decoder_input_ids`.
|
Used as a conditional signal and will thus be concatenated to the projected `decoder_input_ids`.
|
||||||
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
|
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
|
||||||
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
|
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
|
||||||
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
|
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
|
||||||
@ -818,7 +818,7 @@ MUSICGEN_MELODY_DECODER_INPUTS_DOCSTRING = r"""
|
|||||||
[What are attention masks?](../glossary#attention-mask)
|
[What are attention masks?](../glossary#attention-mask)
|
||||||
encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
|
encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
|
||||||
Sequence of hidden-states representing the concatenation of the text encoder output and the processed audio encoder output.
|
Sequence of hidden-states representing the concatenation of the text encoder output and the processed audio encoder output.
|
||||||
Used as a conditional signal and will thus be concatenated to the projeted `decoder_input_ids`.
|
Used as a conditional signal and will thus be concatenated to the projected `decoder_input_ids`.
|
||||||
encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
|
encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
|
||||||
Mask to avoid performing attention on conditional hidden states. Mask values
|
Mask to avoid performing attention on conditional hidden states. Mask values
|
||||||
selected in `[0, 1]`:
|
selected in `[0, 1]`:
|
||||||
@ -1522,7 +1522,7 @@ class MusicgenMelodyForCausalLM(MusicgenMelodyPreTrainedModel, GenerationMixin):
|
|||||||
# 7. determine generation mode
|
# 7. determine generation mode
|
||||||
generation_mode = generation_config.get_generation_mode()
|
generation_mode = generation_config.get_generation_mode()
|
||||||
|
|
||||||
# 8. prepare batched CFG externally (to enable coexistance with the unbatched CFG)
|
# 8. prepare batched CFG externally (to enable coexistence with the unbatched CFG)
|
||||||
if generation_config.guidance_scale is not None and generation_config.guidance_scale > 1:
|
if generation_config.guidance_scale is not None and generation_config.guidance_scale > 1:
|
||||||
logits_processor.append(ClassifierFreeGuidanceLogitsProcessor(generation_config.guidance_scale))
|
logits_processor.append(ClassifierFreeGuidanceLogitsProcessor(generation_config.guidance_scale))
|
||||||
generation_config.guidance_scale = None
|
generation_config.guidance_scale = None
|
||||||
@ -2478,7 +2478,7 @@ class MusicgenMelodyForConditionalGeneration(PreTrainedModel, GenerationMixin):
|
|||||||
# 7. determine generation mode
|
# 7. determine generation mode
|
||||||
generation_mode = generation_config.get_generation_mode()
|
generation_mode = generation_config.get_generation_mode()
|
||||||
|
|
||||||
# 8. prepare batched CFG externally (to enable coexistance with the unbatched CFG)
|
# 8. prepare batched CFG externally (to enable coexistence with the unbatched CFG)
|
||||||
if generation_config.guidance_scale is not None and generation_config.guidance_scale > 1:
|
if generation_config.guidance_scale is not None and generation_config.guidance_scale > 1:
|
||||||
logits_processor.append(ClassifierFreeGuidanceLogitsProcessor(generation_config.guidance_scale))
|
logits_processor.append(ClassifierFreeGuidanceLogitsProcessor(generation_config.guidance_scale))
|
||||||
generation_config.guidance_scale = None
|
generation_config.guidance_scale = None
|
||||||
|
@ -425,7 +425,7 @@ class NllbMoeSparseMLP(nn.Module):
|
|||||||
r"""
|
r"""
|
||||||
The goal of this forward pass is to have the same number of operation as the equivalent `NllbMoeDenseActDense`
|
The goal of this forward pass is to have the same number of operation as the equivalent `NllbMoeDenseActDense`
|
||||||
(mlp) layer. This means that all of the hidden states should be processed at most twice ( since we are using a
|
(mlp) layer. This means that all of the hidden states should be processed at most twice ( since we are using a
|
||||||
top_2 gating mecanism). This means that we keep the complexity to O(batch_size x sequence_length x hidden_dim)
|
top_2 gating mechanism). This means that we keep the complexity to O(batch_size x sequence_length x hidden_dim)
|
||||||
instead of O(num_experts x batch_size x sequence_length x hidden_dim).
|
instead of O(num_experts x batch_size x sequence_length x hidden_dim).
|
||||||
|
|
||||||
1- Get the `router_probs` from the `router`. The shape of the `router_mask` is `(batch_size X sequence_length,
|
1- Get the `router_probs` from the `router`. The shape of the `router_mask` is `(batch_size X sequence_length,
|
||||||
|
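The top-2 routing described above can be sketched as follows: each token is dispatched to at most two experts, so every hidden state is processed at most twice regardless of `num_experts`. Shapes and expert layers here are illustrative, not the NLLB-MoE implementation:

```python
import torch
import torch.nn as nn

num_experts, hidden_dim, num_tokens = 4, 8, 6
router = nn.Linear(hidden_dim, num_experts)
experts = nn.ModuleList([nn.Linear(hidden_dim, hidden_dim) for _ in range(num_experts)])

hidden_states = torch.randn(num_tokens, hidden_dim)
router_probs = router(hidden_states).softmax(dim=-1)
top2_probs, top2_experts = router_probs.topk(2, dim=-1)  # (num_tokens, 2)

output = torch.zeros_like(hidden_states)
for expert_idx, expert in enumerate(experts):
    for slot in range(2):  # each token contributes to at most two experts
        token_mask = top2_experts[:, slot] == expert_idx
        if token_mask.any():
            weight = top2_probs[token_mask, slot].unsqueeze(-1)
            output[token_mask] += weight * expert(hidden_states[token_mask])
print(output.shape)  # torch.Size([6, 8])
```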
@ -376,7 +376,7 @@ class NougatTokenizerFast(PreTrainedTokenizerFast):
|
|||||||
contains everything needed to load the tokenizer.
|
contains everything needed to load the tokenizer.
|
||||||
|
|
||||||
clean_up_tokenization_spaces (`str`, *optional*, defaults to `False`):
|
clean_up_tokenization_spaces (`str`, *optional*, defaults to `False`):
|
||||||
Wether to cleanup spaces after decoding, cleanup consists in removing potential artifacts like extra
|
Whether to cleanup spaces after decoding, cleanup consists in removing potential artifacts like extra
|
||||||
spaces.
|
spaces.
|
||||||
|
|
||||||
unk_token (`str`, *optional*, defaults to `"<unk>"`):
|
unk_token (`str`, *optional*, defaults to `"<unk>"`):
|
||||||
|
@ -268,7 +268,7 @@ def convert_omdet_turbo_checkpoint(args):
|
|||||||
"https://huggingface.co/omlab/OmDet-Turbo_tiny_SWIN_T/resolve/main/ViT-B-16.pt",
|
"https://huggingface.co/omlab/OmDet-Turbo_tiny_SWIN_T/resolve/main/ViT-B-16.pt",
|
||||||
],
|
],
|
||||||
}
|
}
|
||||||
# Define default OmDetTurbo configuation
|
# Define default OmDetTurbo configuration
|
||||||
config = get_omdet_turbo_config(model_name, use_timm_backbone)
|
config = get_omdet_turbo_config(model_name, use_timm_backbone)
|
||||||
|
|
||||||
# Load original checkpoint
|
# Load original checkpoint
|
||||||
|
@ -471,7 +471,7 @@ class PaliGemmaForConditionalGeneration(PaliGemmaPreTrainedModel, GenerationMixi
|
|||||||
|
|
||||||
is_training = token_type_ids is not None and labels is not None
|
is_training = token_type_ids is not None and labels is not None
|
||||||
|
|
||||||
# Replace image id woth PAD if the image token if OOV, to avoid index-errors
|
# Replace image id with PAD if the image token if OOV, to avoid index-errors
|
||||||
if input_ids is not None and self.config.image_token_id >= self.vocab_size:
|
if input_ids is not None and self.config.image_token_id >= self.vocab_size:
|
||||||
special_image_mask = input_ids == self.config.image_token_id
|
special_image_mask = input_ids == self.config.image_token_id
|
||||||
llm_input_ids = input_ids.clone()
|
llm_input_ids = input_ids.clone()
|
||||||
|
@ -1807,7 +1807,7 @@ class PatchTSMixerForTimeSeriesClassificationOutput(ModelOutput):
|
|||||||
|
|
||||||
Args:
|
Args:
|
||||||
prediction_outputs (`torch.FloatTensor` of shape `(batch_size, num_labels)`):
|
prediction_outputs (`torch.FloatTensor` of shape `(batch_size, num_labels)`):
|
||||||
Prediction output from the classfication head.
|
Prediction output from the classification head.
|
||||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_input_channels, num_patches, d_model)`):
|
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_input_channels, num_patches, d_model)`):
|
||||||
Backbone embeddings before passing through the head.
|
Backbone embeddings before passing through the head.
|
||||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*):
|
hidden_states (`tuple(torch.FloatTensor)`, *optional*):
|
||||||
|
@ -1487,7 +1487,7 @@ FLAX_PEGASUS_CONDITIONAL_GENERATION_DOCSTRING = """
|
|||||||
|
|
||||||
Summarization example:
|
Summarization example:
|
||||||
|
|
||||||
```pyton
|
```python
|
||||||
>>> from transformers import AutoTokenizer, FlaxPegasusForConditionalGeneration
|
>>> from transformers import AutoTokenizer, FlaxPegasusForConditionalGeneration
|
||||||
|
|
||||||
>>> model = FlaxPegasusForConditionalGeneration.from_pretrained('google/pegasus-large')
|
>>> model = FlaxPegasusForConditionalGeneration.from_pretrained('google/pegasus-large')
|
||||||
|
@ -127,7 +127,7 @@ def get_resize_output_image_size(
|
|||||||
ratio = max(height / max_height, width / max_width)
|
ratio = max(height / max_height, width / max_width)
|
||||||
|
|
||||||
if ratio > 1:
|
if ratio > 1:
|
||||||
# Orgiginal implementation uses `round` which utilises bankers rounding, which can lead to surprising results
|
# Original implementation uses `round` which utilises bankers rounding, which can lead to surprising results
|
||||||
# Here we use floor to ensure the image is always smaller than the given "longest_edge"
|
# Here we use floor to ensure the image is always smaller than the given "longest_edge"
|
||||||
height = int(math.floor(height / ratio))
|
height = int(math.floor(height / ratio))
|
||||||
width = int(math.floor(width / ratio))
|
width = int(math.floor(width / ratio))
|
||||||
|
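A small illustration of the rounding note above: Python's `round` uses banker's rounding (ties go to the even side), while `math.floor` never exceeds the target size:

```python
import math

print(round(2.5), round(3.5))            # 2 4  -> ties resolved to the even side
print(math.floor(2.5), math.floor(3.5))  # 2 3  -> always rounds down
```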
@ -35,7 +35,7 @@ logger = logging.get_logger(__name__)
|
|||||||
def create_rename_keys(config):
|
def create_rename_keys(config):
|
||||||
rename_keys = []
|
rename_keys = []
|
||||||
for i in range(config.num_encoder_blocks):
|
for i in range(config.num_encoder_blocks):
|
||||||
# Remane embedings' paramters
|
# Rename embeddings' parameters
|
||||||
rename_keys.append((f"pos_embed{i + 1}", f"pvt.encoder.patch_embeddings.{i}.position_embeddings"))
|
rename_keys.append((f"pos_embed{i + 1}", f"pvt.encoder.patch_embeddings.{i}.position_embeddings"))
|
||||||
|
|
||||||
rename_keys.append((f"patch_embed{i + 1}.proj.weight", f"pvt.encoder.patch_embeddings.{i}.projection.weight"))
|
rename_keys.append((f"patch_embed{i + 1}.proj.weight", f"pvt.encoder.patch_embeddings.{i}.projection.weight"))
|
||||||
|
@ -1037,7 +1037,7 @@ REMBERT_INPUTS_DOCSTRING = r"""
|
|||||||
|
|
||||||
|
|
||||||
@add_start_docstrings(
|
@add_start_docstrings(
|
||||||
"The bare RemBERT Model transformer outputing raw hidden-states without any specific head on top.",
|
"The bare RemBERT Model transformer outputting raw hidden-states without any specific head on top.",
|
||||||
REMBERT_START_DOCSTRING,
|
REMBERT_START_DOCSTRING,
|
||||||
)
|
)
|
||||||
class TFRemBertModel(TFRemBertPreTrainedModel):
|
class TFRemBertModel(TFRemBertPreTrainedModel):
|
||||||
|
@ -911,7 +911,7 @@ ROFORMER_INPUTS_DOCSTRING = r"""
|
|||||||
|
|
||||||
|
|
||||||
@add_start_docstrings(
|
@add_start_docstrings(
|
||||||
"The bare RoFormer Model transformer outputing raw hidden-states without any specific head on top.",
|
"The bare RoFormer Model transformer outputting raw hidden-states without any specific head on top.",
|
||||||
ROFORMER_START_DOCSTRING,
|
ROFORMER_START_DOCSTRING,
|
||||||
)
|
)
|
||||||
class TFRoFormerModel(TFRoFormerPreTrainedModel):
|
class TFRoFormerModel(TFRoFormerPreTrainedModel):
|
||||||
|
@ -2171,7 +2171,7 @@ class SeamlessM4TTextToUnitForConditionalGeneration(SeamlessM4TPreTrainedModel,
|
|||||||
config: SeamlessM4TConfig,
|
config: SeamlessM4TConfig,
|
||||||
embed_tokens_decoder: Optional[nn.Embedding] = None,
|
embed_tokens_decoder: Optional[nn.Embedding] = None,
|
||||||
):
|
):
|
||||||
# update config - used principaly for bos_token_id etc.
|
# update config - used principally for bos_token_id etc.
|
||||||
config = copy.deepcopy(config)
|
config = copy.deepcopy(config)
|
||||||
for param, val in config.to_dict().items():
|
for param, val in config.to_dict().items():
|
||||||
if param.startswith("t2u_"):
|
if param.startswith("t2u_"):
|
||||||
|
@ -184,7 +184,7 @@ SEAMLESS_M4T_V2_MULTIMODAL_INPUTS_DOCSTRING = r"""
|
|||||||
|
|
||||||
[What are input IDs?](../glossary#input-ids)
|
[What are input IDs?](../glossary#input-ids)
|
||||||
input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`):
|
input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`):
|
||||||
Input audio features. This should be returnes by the [`SeamlessM4TFeatureExtractor`] class or the
|
Input audio features. This should be returned by the [`SeamlessM4TFeatureExtractor`] class or the
|
||||||
[`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details.
|
[`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@ -202,7 +202,7 @@ M4T_TEXT_INPUTS_DOCSTRING = r"""
|
|||||||
M4T_SPEECH_INPUTS_DOCSTRING = r"""
|
M4T_SPEECH_INPUTS_DOCSTRING = r"""
|
||||||
Args:
|
Args:
|
||||||
input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`):
|
input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`):
|
||||||
Input audio features. This should be returnes by the [`SeamlessM4TFeatureExtractor`] class or the
|
Input audio features. This should be returned by the [`SeamlessM4TFeatureExtractor`] class or the
|
||||||
[`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details.
|
[`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@ -2461,7 +2461,7 @@ class SeamlessM4Tv2TextToUnitForConditionalGeneration(SeamlessM4Tv2PreTrainedMod
|
|||||||
config: SeamlessM4Tv2Config,
|
config: SeamlessM4Tv2Config,
|
||||||
embed_tokens_decoder: Optional[nn.Embedding] = None,
|
embed_tokens_decoder: Optional[nn.Embedding] = None,
|
||||||
):
|
):
|
||||||
# update config - used principaly for bos_token_id etc.
|
# update config - used principally for bos_token_id etc.
|
||||||
config = copy.deepcopy(config)
|
config = copy.deepcopy(config)
|
||||||
for param, val in config.to_dict().items():
|
for param, val in config.to_dict().items():
|
||||||
if param.startswith("t2u_"):
|
if param.startswith("t2u_"):
|
||||||
@ -4035,7 +4035,7 @@ class SeamlessM4Tv2ForSpeechToSpeech(SeamlessM4Tv2PreTrainedModel, GenerationMix
|
|||||||
|
|
||||||
Args:
|
Args:
|
||||||
input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`):
|
input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`):
|
||||||
Input audio features. This should be returnes by the [`SeamlessM4TFeatureExtractor`] class or the
|
Input audio features. This should be returned by the [`SeamlessM4TFeatureExtractor`] class or the
|
||||||
[`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details.
|
[`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details.
|
||||||
return_intermediate_token_ids (`bool`, *optional*):
|
return_intermediate_token_ids (`bool`, *optional*):
|
||||||
If `True`, also returns the intermediate generated text and unit tokens. Set to `True` if you also want
|
If `True`, also returns the intermediate generated text and unit tokens. Set to `True` if you also want
|
||||||
@ -4485,7 +4485,7 @@ class SeamlessM4Tv2Model(SeamlessM4Tv2PreTrainedModel, GenerationMixin):
|
|||||||
|
|
||||||
[What are input IDs?](../glossary#input-ids)
|
[What are input IDs?](../glossary#input-ids)
|
||||||
input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`, *optional*):
|
input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`, *optional*):
|
||||||
Input audio features. This should be returnes by the [`SeamlessM4TFeatureExtractor`] class or the
|
Input audio features. This should be returned by the [`SeamlessM4TFeatureExtractor`] class or the
|
||||||
[`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details.
|
[`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details.
|
||||||
return_intermediate_token_ids (`bool`, *optional*):
|
return_intermediate_token_ids (`bool`, *optional*):
|
||||||
If `True`, also returns the intermediate generated text and unit tokens. Set to `True` if you also want
|
If `True`, also returns the intermediate generated text and unit tokens. Set to `True` if you also want
|
||||||
|
@ -114,7 +114,7 @@ def convert_seggpt_checkpoint(args):
|
|||||||
verify_logits = args.verify_logits
|
verify_logits = args.verify_logits
|
||||||
push_to_hub = args.push_to_hub
|
push_to_hub = args.push_to_hub
|
||||||
|
|
||||||
# Define default GroundingDINO configuation
|
# Define default GroundingDINO configuration
|
||||||
config = SegGptConfig()
|
config = SegGptConfig()
|
||||||
|
|
||||||
# Load original checkpoint
|
# Load original checkpoint
|
||||||
|
@ -62,7 +62,7 @@ class SegGptEncoderOutput(ModelOutput):
|
|||||||
intermediate_hidden_states (`Tuple[torch.FloatTensor]`, *optional*, returned when `config.intermediate_hidden_state_indices` is set):
|
intermediate_hidden_states (`Tuple[torch.FloatTensor]`, *optional*, returned when `config.intermediate_hidden_state_indices` is set):
|
||||||
Tuple of `torch.FloatTensor` of shape `(batch_size, patch_height, patch_width, hidden_size)`.
|
Tuple of `torch.FloatTensor` of shape `(batch_size, patch_height, patch_width, hidden_size)`.
|
||||||
Each element in the Tuple corresponds to the output of the layer specified in `config.intermediate_hidden_state_indices`.
|
Each element in the Tuple corresponds to the output of the layer specified in `config.intermediate_hidden_state_indices`.
|
||||||
Additionaly, each feature passes through a LayerNorm.
|
Additionally, each feature passes through a LayerNorm.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
last_hidden_state: torch.FloatTensor
|
last_hidden_state: torch.FloatTensor
|
||||||
|
@ -1979,10 +1979,10 @@ SPEECHT5_BASE_START_DOCSTRING = r"""
|
|||||||
load the weights associated with the model, only the configuration. Check out the
|
load the weights associated with the model, only the configuration. Check out the
|
||||||
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
|
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
|
||||||
encoder ([`SpeechT5EncoderWithSpeechPrenet`] or [`SpeechT5EncoderWithTextPrenet`] or `None`):
|
encoder ([`SpeechT5EncoderWithSpeechPrenet`] or [`SpeechT5EncoderWithTextPrenet`] or `None`):
|
||||||
The Transformer encoder module that applies the appropiate speech or text encoder prenet. If `None`,
|
The Transformer encoder module that applies the appropriate speech or text encoder prenet. If `None`,
|
||||||
[`SpeechT5EncoderWithoutPrenet`] will be used and the `input_values` are assumed to be hidden states.
|
[`SpeechT5EncoderWithoutPrenet`] will be used and the `input_values` are assumed to be hidden states.
|
||||||
decoder ([`SpeechT5DecoderWithSpeechPrenet`] or [`SpeechT5DecoderWithTextPrenet`] or `None`):
|
decoder ([`SpeechT5DecoderWithSpeechPrenet`] or [`SpeechT5DecoderWithTextPrenet`] or `None`):
|
||||||
The Transformer decoder module that applies the appropiate speech or text decoder prenet. If `None`,
|
The Transformer decoder module that applies the appropriate speech or text decoder prenet. If `None`,
|
||||||
[`SpeechT5DecoderWithoutPrenet`] will be used and the `decoder_input_values` are assumed to be hidden
|
[`SpeechT5DecoderWithoutPrenet`] will be used and the `decoder_input_values` are assumed to be hidden
|
||||||
states.
|
states.
|
||||||
"""
|
"""
|
||||||
|
@ -175,7 +175,7 @@ def make_state_dict(converted_params, is_encoder_only: bool):
|
|||||||
|
|
||||||
|
|
||||||
def load_t5x_weights_in_t5(model, config, t5x_checkpoint_path, is_encoder_only):
|
def load_t5x_weights_in_t5(model, config, t5x_checkpoint_path, is_encoder_only):
|
||||||
"""Replaces the params in model witht the T5X converted params."""
|
"""Replaces the params in model with the T5X converted params."""
|
||||||
variables = checkpoints.load_t5x_checkpoint(t5x_checkpoint_path)
|
variables = checkpoints.load_t5x_checkpoint(t5x_checkpoint_path)
|
||||||
converted = convert_t5x_to_pytorch(
|
converted = convert_t5x_to_pytorch(
|
||||||
variables,
|
variables,
|
||||||
|
@ -2344,11 +2344,11 @@ def _calculate_expected_result(
|
|||||||
if avg_approximation == AverageApproximationFunction.RATIO:
|
if avg_approximation == AverageApproximationFunction.RATIO:
|
||||||
average_result = sum_result / (count_result + EPSILON_ZERO_DIVISION)
|
average_result = sum_result / (count_result + EPSILON_ZERO_DIVISION)
|
||||||
elif avg_approximation == AverageApproximationFunction.FIRST_ORDER:
|
elif avg_approximation == AverageApproximationFunction.FIRST_ORDER:
|
||||||
# The sum of all probabilities exept that correspond to other cells
|
# The sum of all probabilities except that correspond to other cells
|
||||||
ex = tf.reduce_sum(scaled_probability_per_cell, axis=1, keepdims=True) - scaled_probability_per_cell + 1
|
ex = tf.reduce_sum(scaled_probability_per_cell, axis=1, keepdims=True) - scaled_probability_per_cell + 1
|
||||||
average_result = tf.reduce_sum(numeric_values_masked * scaled_probability_per_cell / ex, axis=1)
|
average_result = tf.reduce_sum(numeric_values_masked * scaled_probability_per_cell / ex, axis=1)
|
||||||
elif avg_approximation == AverageApproximationFunction.SECOND_ORDER:
|
elif avg_approximation == AverageApproximationFunction.SECOND_ORDER:
|
||||||
# The sum of all probabilities exept that correspond to other cells
|
# The sum of all probabilities except that correspond to other cells
|
||||||
ex = tf.reduce_sum(scaled_probability_per_cell, axis=1, keepdims=True) - scaled_probability_per_cell + 1
|
ex = tf.reduce_sum(scaled_probability_per_cell, axis=1, keepdims=True) - scaled_probability_per_cell + 1
|
||||||
pointwise_var = scaled_probability_per_cell * (1 - scaled_probability_per_cell)
|
pointwise_var = scaled_probability_per_cell * (1 - scaled_probability_per_cell)
|
||||||
var = tf.reduce_sum(pointwise_var, axis=1, keepdims=True) - pointwise_var
|
var = tf.reduce_sum(pointwise_var, axis=1, keepdims=True) - pointwise_var
|
||||||
|
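The first-order approximation shown above is easier to read outside the TF graph; a NumPy rendering with illustrative values:

```python
import numpy as np

scaled_probability_per_cell = np.array([[0.9, 0.1, 0.5]])
numeric_values_masked = np.array([[10.0, 20.0, 30.0]])

# "sum of all probabilities except the current cell's", plus one
ex = scaled_probability_per_cell.sum(axis=1, keepdims=True) - scaled_probability_per_cell + 1
average_result = (numeric_values_masked * scaled_probability_per_cell / ex).sum(axis=1)
print(ex, average_result)
```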
@ -2359,7 +2359,7 @@ _ORDINAL_WORDS = [
|
|||||||
"second",
|
"second",
|
||||||
"third",
|
"third",
|
||||||
"fourth",
|
"fourth",
|
||||||
"fith",
|
"fifth",
|
||||||
"sixth",
|
"sixth",
|
||||||
"seventh",
|
"seventh",
|
||||||
"eighth",
|
"eighth",
|
||||||
|
@ -1364,7 +1364,7 @@ class UdopStack(UdopPreTrainedModel):
|
|||||||
|
|
||||||
if inputs_embeds is None:
|
if inputs_embeds is None:
|
||||||
if self.embed_tokens is None:
|
if self.embed_tokens is None:
|
||||||
raise ValueError("You have to intialize the model with valid token embeddings")
|
raise ValueError("You have to initialize the model with valid token embeddings")
|
||||||
inputs_embeds = self.embed_tokens(input_ids)
|
inputs_embeds = self.embed_tokens(input_ids)
|
||||||
|
|
||||||
if pixel_values is not None:
|
if pixel_values is not None:
|
||||||
|
@ -200,7 +200,7 @@ def make_state_dict(converted_params, is_encoder_only: bool):
|
|||||||
|
|
||||||
|
|
||||||
def load_t5x_weights_in_t5(model, config, t5x_checkpoint_path, is_encoder_only, scalable_attention):
|
def load_t5x_weights_in_t5(model, config, t5x_checkpoint_path, is_encoder_only, scalable_attention):
|
||||||
"""Replaces the params in model witht the T5X converted params."""
|
"""Replaces the params in model with the T5X converted params."""
|
||||||
variables = checkpoints.load_t5x_checkpoint(t5x_checkpoint_path)
|
variables = checkpoints.load_t5x_checkpoint(t5x_checkpoint_path)
|
||||||
converted = convert_t5x_to_pytorch(
|
converted = convert_t5x_to_pytorch(
|
||||||
variables, num_layers=config.num_layers, is_encoder_only=is_encoder_only, scalable_attention=scalable_attention
|
variables, num_layers=config.num_layers, is_encoder_only=is_encoder_only, scalable_attention=scalable_attention
|
||||||
|
@ -164,7 +164,7 @@ class UniSpeechConfig(PretrainedConfig):
|
|||||||
eos_token_id (`int`, *optional*, defaults to 2):
|
eos_token_id (`int`, *optional*, defaults to 2):
|
||||||
The id of the "end-of-sequence" token.
|
The id of the "end-of-sequence" token.
|
||||||
replace_prob (`float`, *optional*, defaults to 0.5):
|
replace_prob (`float`, *optional*, defaults to 0.5):
|
||||||
Propability that transformer feature is replaced by quantized feature for pretraining.
|
Probability that transformer feature is replaced by quantized feature for pretraining.
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
|
|
||||||
|
@ -56,7 +56,7 @@ def get_kernel_predictor_key_mapping(config: UnivNetConfig, old_prefix: str = ""
|
|||||||
def get_key_mapping(config: UnivNetConfig):
|
def get_key_mapping(config: UnivNetConfig):
|
||||||
mapping = {}
|
mapping = {}
|
||||||
|
|
||||||
# NOTE: inital conv layer keys are the same
|
# NOTE: initial conv layer keys are the same
|
||||||
|
|
||||||
# LVC Residual blocks
|
# LVC Residual blocks
|
||||||
for i in range(len(config.resblock_stride_sizes)):
|
for i in range(len(config.resblock_stride_sizes)):
|
||||||
|
@ -64,7 +64,7 @@ class UnivNetFeatureExtractor(SequenceFeatureExtractor):
|
|||||||
The number of FFT components to use. If `None`, this is determined using
|
The number of FFT components to use. If `None`, this is determined using
|
||||||
`transformers.audio_utils.optimal_fft_length`.
|
`transformers.audio_utils.optimal_fft_length`.
|
||||||
max_length_s (`int`, *optional*, defaults to 10):
|
max_length_s (`int`, *optional*, defaults to 10):
|
||||||
The maximum input lenght of the model in seconds. This is used to pad the audio.
|
The maximum input length of the model in seconds. This is used to pad the audio.
|
||||||
fmin (`float`, *optional*, defaults to 0.0):
|
fmin (`float`, *optional*, defaults to 0.0):
|
||||||
Minimum mel frequency in Hz.
|
Minimum mel frequency in Hz.
|
||||||
fmax (`float`, *optional*):
|
fmax (`float`, *optional*):
|
||||||
|
@ -39,7 +39,7 @@ class ViltConfig(PretrainedConfig):
|
|||||||
The vocabulary size of the `token_type_ids` passed when calling [`ViltModel`]. This is used when encoding
|
The vocabulary size of the `token_type_ids` passed when calling [`ViltModel`]. This is used when encoding
|
||||||
text.
|
text.
|
||||||
modality_type_vocab_size (`int`, *optional*, defaults to 2):
|
modality_type_vocab_size (`int`, *optional*, defaults to 2):
|
||||||
The vocabulary size of the modalities passed when calling [`ViltModel`]. This is used after concatening the
|
The vocabulary size of the modalities passed when calling [`ViltModel`]. This is used after concatenating the
|
||||||
embeddings of the text and image modalities.
|
embeddings of the text and image modalities.
|
||||||
max_position_embeddings (`int`, *optional*, defaults to 40):
|
max_position_embeddings (`int`, *optional*, defaults to 40):
|
||||||
The maximum sequence length that this model might ever be used with.
|
The maximum sequence length that this model might ever be used with.
|
||||||
|
@ -139,7 +139,7 @@ class ViltEmbeddings(nn.Module):
|
|||||||
x_mask = x_mask.flatten(1)
|
x_mask = x_mask.flatten(1)
|
||||||
|
|
||||||
if max_image_length < 0 or max_image_length is None or not isinstance(max_image_length, int):
|
if max_image_length < 0 or max_image_length is None or not isinstance(max_image_length, int):
|
||||||
# suppose aug is 800 x 1333, then, maximum effective res is 800 x 1333 (if one side gets bigger, the other will be constrained and be shrinked)
|
# suppose aug is 800 x 1333, then, maximum effective res is 800 x 1333 (if one side gets bigger, the other will be constrained and be shrunk)
|
||||||
# (800 // self.patch_size) * (1333 // self.patch_size) is the maximum number of patches that single image can get.
|
# (800 // self.patch_size) * (1333 // self.patch_size) is the maximum number of patches that single image can get.
|
||||||
# if self.patch_size = 32, 25 * 41 = 1025
|
# if self.patch_size = 32, 25 * 41 = 1025
|
||||||
# if res is 384 x 640, 12 * 20 = 240
|
# if res is 384 x 640, 12 * 20 = 240
|
||||||
|
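The patch-count arithmetic from the comment above, written out explicitly:

```python
patch_size = 32
for height, width in [(800, 1333), (384, 640)]:
    num_patches = (height // patch_size) * (width // patch_size)
    print(height, width, num_patches)  # 800 1333 1025, then 384 640 240
```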
@ -85,7 +85,7 @@ class VisionEncoderDecoderConfig(PretrainedConfig):
|
|||||||
super().__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
if "encoder" not in kwargs or "decoder" not in kwargs:
|
if "encoder" not in kwargs or "decoder" not in kwargs:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"A configuraton of type {self.model_type} cannot be instantiated because "
|
f"A configuration of type {self.model_type} cannot be instantiated because "
|
||||||
f"not both `encoder` and `decoder` sub-configurations are passed, but only {kwargs}"
|
f"not both `encoder` and `decoder` sub-configurations are passed, but only {kwargs}"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -1496,7 +1496,7 @@ WAV2VEC2_INPUTS_DOCSTRING = r"""
|
|||||||
|
|
||||||
|
|
||||||
@add_start_docstrings(
|
@add_start_docstrings(
|
||||||
"The bare TFWav2Vec2 Model transformer outputing raw hidden-states without any specific head on top.",
|
"The bare TFWav2Vec2 Model transformer outputting raw hidden-states without any specific head on top.",
|
||||||
WAV2VEC2_START_DOCSTRING,
|
WAV2VEC2_START_DOCSTRING,
|
||||||
)
|
)
|
||||||
class TFWav2Vec2Model(TFWav2Vec2PreTrainedModel):
|
class TFWav2Vec2Model(TFWav2Vec2PreTrainedModel):
|
||||||
|
@ -101,7 +101,7 @@ class WavLMConfig(PretrainedConfig):
|
|||||||
[SpecAugment: A Simple Data Augmentation Method for Automatic Speech
|
[SpecAugment: A Simple Data Augmentation Method for Automatic Speech
|
||||||
Recognition](https://arxiv.org/abs/1904.08779).
|
Recognition](https://arxiv.org/abs/1904.08779).
|
||||||
mask_time_prob (`float`, *optional*, defaults to 0.05):
|
mask_time_prob (`float`, *optional*, defaults to 0.05):
|
||||||
Propability of each feature vector along the time axis to be chosen as the start of the vector span to be
|
Probability of each feature vector along the time axis to be chosen as the start of the vector span to be
|
||||||
masked. Approximately `mask_time_prob * sequence_length // mask_time_length` feature vectors will be masked
|
masked. Approximately `mask_time_prob * sequence_length // mask_time_length` feature vectors will be masked
|
||||||
along the time axis. This is only relevant if `apply_spec_augment is True`.
|
along the time axis. This is only relevant if `apply_spec_augment is True`.
|
||||||
mask_time_length (`int`, *optional*, defaults to 10):
|
mask_time_length (`int`, *optional*, defaults to 10):
|
||||||
@ -111,7 +111,7 @@ class WavLMConfig(PretrainedConfig):
|
|||||||
irrespectively of `mask_feature_prob`. Only relevant if ''mask_time_prob*len(time_axis)/mask_time_length <
|
irrespectively of `mask_feature_prob`. Only relevant if ''mask_time_prob*len(time_axis)/mask_time_length <
|
||||||
mask_time_min_masks''
|
mask_time_min_masks''
|
||||||
mask_feature_prob (`float`, *optional*, defaults to 0.0):
|
mask_feature_prob (`float`, *optional*, defaults to 0.0):
|
||||||
Propability of each feature vector along the feature axis to be chosen as the start of the vector span to
|
Probability of each feature vector along the feature axis to be chosen as the start of the vector span to
|
||||||
be masked. Approximately `mask_time_prob * hidden_size // mask_time_length` feature vectors will be masked
|
be masked. Approximately `mask_time_prob * hidden_size // mask_time_length` feature vectors will be masked
|
||||||
along the time axis. This is only relevant if `apply_spec_augment is True`.
|
along the time axis. This is only relevant if `apply_spec_augment is True`.
|
||||||
mask_feature_length (`int`, *optional*, defaults to 10):
|
mask_feature_length (`int`, *optional*, defaults to 10):
|
||||||
|
@ -474,7 +474,7 @@ class ZoeDepthImageProcessor(BaseImageProcessor):
|
|||||||
outputs_flipped ([`ZoeDepthDepthEstimatorOutput`], *optional*):
|
outputs_flipped ([`ZoeDepthDepthEstimatorOutput`], *optional*):
|
||||||
Raw outputs of the model from flipped input (averaged out in the end).
|
Raw outputs of the model from flipped input (averaged out in the end).
|
||||||
do_remove_padding (`bool`, *optional*):
|
do_remove_padding (`bool`, *optional*):
|
||||||
By default ZoeDepth addes padding equal to `int(√(height / 2) * 3)` (and similarly for width) to fix the
|
By default ZoeDepth adds padding equal to `int(√(height / 2) * 3)` (and similarly for width) to fix the
|
||||||
boundary artifacts in the output depth map, so we need remove this padding during post_processing. The
|
boundary artifacts in the output depth map, so we need remove this padding during post_processing. The
|
||||||
parameter exists here in case the user changed the image preprocessing to not include padding.
|
parameter exists here in case the user changed the image preprocessing to not include padding.
|
||||||
|
|
||||||
|
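The padding formula quoted above, evaluated for an illustrative input size:

```python
import math

height, width = 480, 640
pad_h = int(math.sqrt(height / 2) * 3)
pad_w = int(math.sqrt(width / 2) * 3)
print(pad_h, pad_w)  # 46 53
```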
@ -37,7 +37,7 @@ class AwqQuantizer(HfQuantizer):
|
|||||||
4-bit quantization for Activation-aware Weight Quantization(AWQ) (https://arxiv.org/abs/2306.00978)
|
4-bit quantization for Activation-aware Weight Quantization(AWQ) (https://arxiv.org/abs/2306.00978)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# AWQ requires data callibration - we support only inference
|
# AWQ requires data calibration - we support only inference
|
||||||
requires_calibration = True
|
requires_calibration = True
|
||||||
|
|
||||||
required_packages = ["awq", "accelerate"]
|
required_packages = ["awq", "accelerate"]
|
||||||
|
@ -69,7 +69,7 @@ class CompressedTensorsHfQuantizer(HfQuantizer):
|
|||||||
return missing_keys
|
return missing_keys
|
||||||
|
|
||||||
# We expect some keys to be missing for
|
# We expect some keys to be missing for
|
||||||
# compresed models
|
# compressed models
|
||||||
# This is fine as the weights are reconstructed by ModelCompressor
|
# This is fine as the weights are reconstructed by ModelCompressor
|
||||||
# in _process_model_after_weight_loading
|
# in _process_model_after_weight_loading
|
||||||
|
|
||||||
|
@ -1728,14 +1728,14 @@ Please note that you may need to restart your runtime after installation.
|
|||||||
|
|
||||||
# docstyle-ignore
|
# docstyle-ignore
|
||||||
LIBROSA_IMPORT_ERROR = """
|
LIBROSA_IMPORT_ERROR = """
|
||||||
{0} requires thes librosa library. But that was not found in your environment. You can install them with pip:
|
{0} requires the librosa library. But that was not found in your environment. You can install them with pip:
|
||||||
`pip install librosa`
|
`pip install librosa`
|
||||||
Please note that you may need to restart your runtime after installation.
|
Please note that you may need to restart your runtime after installation.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# docstyle-ignore
|
# docstyle-ignore
|
||||||
PRETTY_MIDI_IMPORT_ERROR = """
|
PRETTY_MIDI_IMPORT_ERROR = """
|
||||||
{0} requires thes pretty_midi library. But that was not found in your environment. You can install them with pip:
|
{0} requires the pretty_midi library. But that was not found in your environment. You can install them with pip:
|
||||||
`pip install pretty_midi`
|
`pip install pretty_midi`
|
||||||
Please note that you may need to restart your runtime after installation.
|
Please note that you may need to restart your runtime after installation.
|
||||||
"""
|
"""
|
||||||
|
@ -1120,7 +1120,7 @@ class VptqLayerConfig(QuantizationConfigMixin):
|
|||||||
group_size (`int`, *optional*, defaults to `-1`): depends on out-features
|
group_size (`int`, *optional*, defaults to `-1`): depends on out-features
|
||||||
indices_as_float (`bool`, *optional*, defaults to `False`): for Finetuning
|
indices_as_float (`bool`, *optional*, defaults to `False`): for Finetuning
|
||||||
is_indice_packed (`bool`, *optional*, defaults to `True`): should always be True
|
is_indice_packed (`bool`, *optional*, defaults to `True`): should always be True
|
||||||
num_centroids (`list`, *optional*, defaults to `[-1, -1]`): centriod numbers of clusters
|
num_centroids (`list`, *optional*, defaults to `[-1, -1]`): centroid numbers of clusters
|
||||||
num_res_centroids (`list`, *optional*, defaults to `[-1, -1]`): ditto for residual
|
num_res_centroids (`list`, *optional*, defaults to `[-1, -1]`): ditto for residual
|
||||||
outlier_size (`int`, *optional*, defaults to `1`): outliers
|
outlier_size (`int`, *optional*, defaults to `1`): outliers
|
||||||
vector_lens (`list`, *optional*, defaults to `[-1, -1]`): centroid vector length in quantization
|
vector_lens (`list`, *optional*, defaults to `[-1, -1]`): centroid vector length in quantization
|
||||||
|
@ -146,7 +146,7 @@ class BertGenerationTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
def test_tokenization_base_hard_symbols(self):
|
def test_tokenization_base_hard_symbols(self):
|
||||||
symbols = (
|
symbols = (
|
||||||
'This is a very long text with a lot of weird characters, such as: . , ~ ? ( ) " [ ] ! : - . Also we will'
|
'This is a very long text with a lot of weird characters, such as: . , ~ ? ( ) " [ ] ! : - . Also we will'
|
||||||
" add words that should not exsist and be tokenized to <unk>, such as saoneuhaoesuth"
|
" add words that should not exist and be tokenized to <unk>, such as saoneuhaoesuth"
|
||||||
)
|
)
|
||||||
original_tokenizer_encodings = [
|
original_tokenizer_encodings = [
|
||||||
871,
|
871,
|
||||||
|
@@ -170,7 +170,7 @@ class BigBirdTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
 def test_tokenization_base_hard_symbols(self):
 symbols = (
 'This is a very long text with a lot of weird characters, such as: . , ~ ? ( ) " [ ] ! : - . Also we will'
-" add words that should not exsist and be tokenized to <unk>, such as saoneuhaoesuth"
+" add words that should not exist and be tokenized to <unk>, such as saoneuhaoesuth"
 )
 original_tokenizer_encodings = [65, 871, 419, 358, 946, 991, 2521, 452, 358, 1357, 387, 7751, 3536, 112, 985, 456, 126, 865, 938, 5400, 5734, 458, 1368, 467, 786, 2462, 5246, 1159, 633, 865, 4519, 457, 582, 852, 2557, 427, 916, 508, 405, 34324, 497, 391, 408, 11342, 1244, 385, 100, 938, 985, 456, 574, 362, 12597, 3200, 3129, 1172, 66] # fmt: skip
 self.assertListEqual(original_tokenizer_encodings, self.big_tokenizer.encode(symbols))
@@ -438,7 +438,7 @@ class BridgeTowerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestC
 if self.has_attentions:
 self.assertIsNotNone(attentions.grad)

-# override as the `logit_scale` parameter initilization is different for BRIDGE TOWER
+# override as the `logit_scale` parameter initialization is different for BRIDGE TOWER
 def test_initialization(self):
 config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

@@ -55,7 +55,7 @@ class BridgeTowerProcessorTest(ProcessorTesterMixin, unittest.TestCase):
 def tearDownClass(cls):
 shutil.rmtree(cls.tmpdirname, ignore_errors=True)

-# Some kwargs tests are overriden from common tests to handle shortest_edge
+# Some kwargs tests are overridden from common tests to handle shortest_edge
 # and size_divisor behaviour

 @require_torch
@@ -924,7 +924,7 @@ class FlavaModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
 def test_model_get_set_embeddings(self):
 pass

-# override as the `logit_scale` parameter initilization is different for FLAVA
+# override as the `logit_scale` parameter initialization is different for FLAVA
 def test_initialization(self):
 config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

@@ -933,7 +933,7 @@ class FlavaModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
 model = model_class(config=configs_no_init)
 for name, param in model.named_parameters():
 if param.requires_grad:
-# check if `logit_scale` is initilized as per the original implementation
+# check if `logit_scale` is initialized as per the original implementation
 if name == "logit_scale" or name == "flava.logit_scale":
 self.assertAlmostEqual(
 param.data.item(),
@@ -137,7 +137,7 @@ class Gemma3ModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase

 @unittest.skip(
 reason="HybridCache can't be gathered because it is not iterable. Adding a simple iter and dumping `distributed_iterator`"
-" as in Dynamic Cache doesnt work. NOTE: @gante all cache objects would need better compatibility with multi gpu setting"
+" as in Dynamic Cache doesn't work. NOTE: @gante all cache objects would need better compatibility with multi gpu setting"
 )
 def test_multi_gpu_data_parallel_forward(self):
 pass
@@ -275,7 +275,7 @@ class Gemma3Vision2TextModelTest(ModelTesterMixin, GenerationTesterMixin, unitte

 @unittest.skip(
 reason="HybridCache can't be gathered because it is not iterable. Adding a simple iter and dumping `distributed_iterator`"
-" as in Dynamic Cache doesnt work. NOTE: @gante all cache objects would need better compatibility with multi gpu setting"
+" as in Dynamic Cache doesn't work. NOTE: @gante all cache objects would need better compatibility with multi gpu setting"
 )
 def test_multi_gpu_data_parallel_forward(self):
 pass
@@ -88,7 +88,7 @@ class Gemma3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):

 image = self.prepare_image_inputs()

-# If text has no image tokens, iamge should be `None`
+# If text has no image tokens, image should be `None`
 with self.assertRaises(ValueError):
 _ = processor(text=text_no_image, images=image, return_tensors="np")

@@ -478,8 +478,8 @@ class GPTNeoModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMix
 # the last 2 tokens are masked, and should have 0 attn_probs
 self.assertTrue(torch.all(attn_probs[:, :, -mask_tokens:, -mask_tokens:] == 0))

-# in loacal attention each token can only attend to the previous window_size tokens (including itself)
-# here window_size is 4, so a token at index 5 can only attend to indcies [2, 3, 4, 5]
+# in local attention each token can only attend to the previous window_size tokens (including itself)
+# here window_size is 4, so a token at index 5 can only attend to indices [2, 3, 4, 5]
 # and the attn_probs should be 0 for token [0, 1]
 self.assertTrue(torch.all(attn_probs[:, :, 5, 2:6] != 0))
 self.assertTrue(torch.all(attn_probs[:, :, 5, :2] == 0))
@@ -769,7 +769,7 @@ class GroundingDinoModelIntegrationTests(unittest.TestCase):
 encoding1 = processor(images=image, text=text1, return_tensors="pt").to(torch_device)
 encoding2 = processor(images=image, text=text2, return_tensors="pt").to(torch_device)
 # If we batch the text and cross attention masking is working the batched result should be equal to
-# The singe text result
+# The single text result
 encoding_batched = processor(
 images=[image] * len(text_batched), text=text_batched, padding="longest", return_tensors="pt"
 ).to(torch_device)
@@ -658,7 +658,7 @@ class InstructBlipForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, Gene
 def test_sdpa_can_dispatch_composite_models(self):
 """
 Tests if composite models dispatch correctly on SDPA/eager when requested so when loading the model.
-This tests only by looking at layer names, as usually SDPA layers are calles "SDPAAttention".
+This tests only by looking at layer names, as usually SDPA layers are called "SDPAAttention".
 In contrast to the above test, this one checks if the "config._attn_implamentation" is a dict after the model
 is loaded, because we manually replicate requested attn implementation on each sub-config when loading.
 See https://github.com/huggingface/transformers/pull/32238 for more info
@@ -56,7 +56,7 @@ class LlavaConfigTest(unittest.TestCase):

 def test_arbitrary_reload(self):
 """
-Simple test for reloading arbirarily composed subconfigs
+Simple test for reloading arbitrarily composed subconfigs
 """
 default_values = LlavaConfig().to_diff_dict()
 default_values["vision_config"]["model_type"] = "pixtral"
@@ -553,8 +553,8 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
 # image = Image.open(requests.get(url, stream=True).raw)
 inputs = processor(text=PROMPT, images=IMG_URLS, return_tensors="pt").to(model.device)
 generate_ids = model.generate(**inputs, max_new_tokens=500)
-ouptut = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
-print(ouptut)
+output = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+print(output)

 # fmt: off
 EXPECTED_GENERATION = """
@@ -573,7 +573,7 @@ These descriptions provide a detailed overview of the content and atmosphere of
 """
 # fmt: on
 # check that both inputs are handled correctly and generate the same output
-self.assertEqual(ouptut, EXPECTED_GENERATION)
+self.assertEqual(output, EXPECTED_GENERATION)

 @slow
 @require_bitsandbytes
Some files were not shown because too many files have changed in this diff.