Mirror of https://github.com/huggingface/transformers.git (synced 2025-07-04 05:10:06 +06:00)
Update ruff to 0.11.2 (#36962)

* update
* update
* update

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>

commit c6814b4ee8 (parent bc1c90a755)
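Almost every hunk below is one of three mechanical rewrites that appear after moving from ruff 0.5.1 to 0.11.2: a dict comprehension that maps every key to the same value becomes dict.fromkeys, adjacent implicitly concatenated (f-)string literals are merged into one literal, and a multi-line assert is reformatted so the condition stays on the assert line and the message sits in trailing parentheses. The snippet below is only an illustration of those three patterns, not code from the commit; the variable names (linear_tags, image_size, attention_window) are borrowed from the hunks for familiarity, and which specific ruff rules drive each rewrite is an assumption.

# Illustrative sketch of the three rewrite patterns seen throughout this diff (not part of the commit).
linear_tags = ["q_proj", "k_proj", "v_proj"]

# 1) Dict comprehension with a constant value -> dict.fromkeys
old_params = {key: None for key in linear_tags}
new_params = dict.fromkeys(linear_tags)
assert old_params == new_params

# 2) Implicitly concatenated f-string literals -> one merged literal
height = width = 256
image_size = 224
old_msg = f"Input image size ({height}*{width}) doesn't match model" f" ({image_size}*{image_size})."
new_msg = f"Input image size ({height}*{width}) doesn't match model ({image_size}*{image_size})."
assert old_msg == new_msg

# 3) Long assert message: condition stays inline, message moves into trailing parentheses
attention_window = 512
assert attention_window % 2 == 0, (
    f"`attention_window` has to be an even value. Given {attention_window}"
)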
@@ -162,7 +162,7 @@ _deps = [
     "rhoknp>=1.1.0,<1.3.1",
     "rjieba",
     "rouge-score!=0.0.7,!=0.0.8,!=0.1,!=0.1.1",
-    "ruff==0.5.1",
+    "ruff==0.11.2",
     "sacrebleu>=1.4.12,<2.0.0",
     "sacremoses",
     "safetensors>=0.4.3",
@@ -167,9 +167,9 @@ class Tool:
             )
         for input_name, input_content in self.inputs.items():
             assert isinstance(input_content, dict), f"Input '{input_name}' should be a dictionary."
-            assert (
-                "type" in input_content and "description" in input_content
-            ), f"Input '{input_name}' should have keys 'type' and 'description', has only {list(input_content.keys())}."
+            assert "type" in input_content and "description" in input_content, (
+                f"Input '{input_name}' should have keys 'type' and 'description', has only {list(input_content.keys())}."
+            )
             if input_content["type"] not in authorized_types:
                 raise Exception(
                     f"Input '{input_name}': type '{input_content['type']}' is not an authorized value, should be one of {authorized_types}."
@@ -313,11 +313,9 @@ def add_fast_image_processor_to_doc(fast_image_processor_name: str, model_name:
         raise ValueError(f"No doc files found for {model_name}")

    base_doc_string = (
-        f"## {fast_image_processor_name[:-4]}\n\n" f"[[autodoc]] {fast_image_processor_name[:-4]}\n" " - preprocess"
-    )
-    fast_doc_string = (
-        f"## {fast_image_processor_name}\n\n" f"[[autodoc]] {fast_image_processor_name}\n" " - preprocess"
+        f"## {fast_image_processor_name[:-4]}\n\n[[autodoc]] {fast_image_processor_name[:-4]}\n - preprocess"
     )
+    fast_doc_string = f"## {fast_image_processor_name}\n\n[[autodoc]] {fast_image_processor_name}\n - preprocess"

    for doc_file in doc_files:
         with open(doc_file, "r", encoding="utf-8") as f:
@@ -385,7 +383,7 @@ def add_fast_image_processor_to_tests(fast_image_processor_name: str, model_name
     # add the fast image processor to the imports
     base_import_string = f" from transformers import {fast_image_processor_name[:-4]}"
     fast_import_string = (
-        " if is_torchvision_available():\n" f" from transformers import {fast_image_processor_name}"
+        f" if is_torchvision_available():\n from transformers import {fast_image_processor_name}"
     )
     if fast_import_string not in updated_content:
         updated_content = updated_content.replace(base_import_string, base_import_string + "\n\n" + fast_import_string)
@@ -546,17 +544,17 @@ def add_fast_image_processor_file(
         " # For an example of a fast image processor requiring more complex augmentations, see `LlavaNextImageProcessorFast`.\n\n"
         " # Default values should be checked against the slow image processor\n"
         " # None values left after checking can be removed\n"
-        f' resample = {default_args_dict.get("resample")}\n'
-        f' image_mean = {default_args_dict.get("image_mean")}\n'
-        f' image_std = {default_args_dict.get("image_std")}\n'
-        f' size = {default_args_dict.get("size")}\n'
-        f' default_to_square = {default_args_dict.get("default_to_square")}\n'
-        f' crop_size = {default_args_dict.get("crop_size")}\n'
-        f' do_resize = {default_args_dict.get("do_resize")}\n'
-        f' do_center_crop = {default_args_dict.get("do_center_crop")}\n'
-        f' do_rescale = {default_args_dict.get("do_rescale")}\n'
-        f' do_normalize = {default_args_dict.get("do_normalize")}\n'
-        f' do_convert_rgb = {default_args_dict.get("do_convert_rgb")}\n\n\n'
+        f" resample = {default_args_dict.get('resample')}\n"
+        f" image_mean = {default_args_dict.get('image_mean')}\n"
+        f" image_std = {default_args_dict.get('image_std')}\n"
+        f" size = {default_args_dict.get('size')}\n"
+        f" default_to_square = {default_args_dict.get('default_to_square')}\n"
+        f" crop_size = {default_args_dict.get('crop_size')}\n"
+        f" do_resize = {default_args_dict.get('do_resize')}\n"
+        f" do_center_crop = {default_args_dict.get('do_center_crop')}\n"
+        f" do_rescale = {default_args_dict.get('do_rescale')}\n"
+        f" do_normalize = {default_args_dict.get('do_normalize')}\n"
+        f" do_convert_rgb = {default_args_dict.get('do_convert_rgb')}\n\n\n"
         f'__all__ = ["{fast_image_processor_name}"]\n'
     )

@@ -189,7 +189,7 @@ def infer_shapes(nlp: Pipeline, framework: str) -> tuple[list[str], list[str], d
                 raise ValueError(f"Unable to infer tensor axes ({len(tensor.shape)})")
             else:
                 seq_axes = [dim for dim, shape in enumerate(tensor.shape) if shape == seq_len]
-                axes.update({dim: "sequence" for dim in seq_axes})
+                axes.update(dict.fromkeys(seq_axes, "sequence"))

        print(f"Found {'input' if is_input else 'output'} {name} with shape: {axes}")
         return axes
@@ -226,7 +226,7 @@ def squad_evaluate(examples, preds, no_answer_probs=None, no_answer_probability_
     no_answer_qids = [qas_id for qas_id, has_answer in qas_id_to_has_answer.items() if not has_answer]

    if no_answer_probs is None:
-        no_answer_probs = {k: 0.0 for k in preds}
+        no_answer_probs = dict.fromkeys(preds, 0.0)

    exact, f1 = get_raw_scores(examples, preds)

@@ -101,7 +101,7 @@ if is_tf_available():

        return tf.data.Dataset.from_generator(
             gen,
-            ({k: tf.int32 for k in input_names}, label_type),
+            (dict.fromkeys(input_names, tf.int32), label_type),
             ({k: tf.TensorShape([None]) for k in input_names}, tf.TensorShape([])),
         )

@@ -68,7 +68,7 @@ deps = {
     "rhoknp": "rhoknp>=1.1.0,<1.3.1",
     "rjieba": "rjieba",
     "rouge-score": "rouge-score!=0.0.7,!=0.0.8,!=0.1,!=0.1.1",
-    "ruff": "ruff==0.5.1",
+    "ruff": "ruff==0.11.2",
     "sacrebleu": "sacrebleu>=1.4.12,<2.0.0",
     "sacremoses": "sacremoses",
     "safetensors": "safetensors>=0.4.3",
@@ -2749,9 +2749,7 @@ class SynthIDTextWatermarkLogitsProcessor(LogitsProcessor):
                 ngram keys (batch_size, num_ngrams, depth).
         """
         if len(ngrams.shape) != 3:
-            raise ValueError(
-                "Ngrams should be of shape (batch_size, num_ngrams, ngram_len), but" f" is {ngrams.shape}"
-            )
+            raise ValueError(f"Ngrams should be of shape (batch_size, num_ngrams, ngram_len), but is {ngrams.shape}")
         if ngrams.shape[2] != self.ngram_len:
             raise ValueError(
                 "Ngrams should be of shape (batch_size, num_ngrams, ngram_len),"
@@ -2836,7 +2834,7 @@ class SynthIDTextWatermarkLogitsProcessor(LogitsProcessor):
     def _check_input_ids_shape(self, input_ids: torch.LongTensor):
         """Checks the shape of input ids."""
         if len(input_ids.shape) != 2:
-            raise ValueError("Input ids should be of shape (batch_size, input_len), but is" f" {input_ids.shape}")
+            raise ValueError(f"Input ids should be of shape (batch_size, input_len), but is {input_ids.shape}")

    def compute_g_values(self, input_ids: torch.LongTensor) -> torch.LongTensor:
         """
@@ -1678,7 +1678,7 @@ class GenerationMixin:
         if execution_device_map is None:
             return None
         elif len(execution_device_map) == 1 and "" in execution_device_map:
-            return {idx: execution_device_map[""] for idx in range(num_hidden_layers)}
+            return dict.fromkeys(range(num_hidden_layers), execution_device_map[""])
         layer_device_map = {}
         for layer in execution_device_map:
             for idx in range(num_hidden_layers):
@@ -106,11 +106,11 @@ def prepare_for_hqq_linear(model, quantization_config=None, modules_to_not_conve

    if any(key in linear_tags for key in quant_config.keys()):
         # If the user doesn't specify a key from get_linear_tags, the layer is not quantized via (key, None)
-        patch_params = {key: None for key in linear_tags}
+        patch_params = dict.fromkeys(linear_tags)
         patch_params.update(quant_config)
     else:
         # Same quant_config for all layers
-        patch_params = {k: quant_config for k in linear_tags}
+        patch_params = dict.fromkeys(linear_tags, quant_config)

    model, has_been_replaced = _prepare_for_hqq_linear(
         model, patch_params=patch_params, has_been_replaced=has_been_replaced
@@ -21,9 +21,9 @@ def tpu_spmd_dataloader(dataloader: DataLoader):
     if is_torch_xla_available():
         import torch_xla.distributed.parallel_loader as pl

-        assert isinstance(
-            dataloader, pl.MpDeviceLoader
-        ), "The dataloader must be a `torch_xla.distributed.parallel_loader.MpDeviceLoader`."
+        assert isinstance(dataloader, pl.MpDeviceLoader), (
+            "The dataloader must be a `torch_xla.distributed.parallel_loader.MpDeviceLoader`."
+        )

        # This is to support PyTorch/XLA FSDP via SPMD.
         # Here we shard the input data's 0th dim across the fsdp axis.
@@ -2509,9 +2509,9 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
             total_decoder_name="",
             total_encoder_name="",
         ):
-            assert isinstance(decoder_pointer, nn.Module) and isinstance(
-                encoder_pointer, nn.Module
-            ), f"{decoder_pointer} and {encoder_pointer} have to be of type nn.Module"
+            assert isinstance(decoder_pointer, nn.Module) and isinstance(encoder_pointer, nn.Module), (
+                f"{decoder_pointer} and {encoder_pointer} have to be of type nn.Module"
+            )
             if hasattr(decoder_pointer, "weight"):
                 assert hasattr(encoder_pointer, "weight")
                 encoder_pointer.weight = decoder_pointer.weight
@@ -2525,9 +2525,9 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
             encoder_modules = encoder_pointer._modules
             decoder_modules = decoder_pointer._modules
             if len(decoder_modules) > 0:
-                assert (
-                    len(encoder_modules) > 0
-                ), f"Encoder module {encoder_pointer} does not match decoder module {decoder_pointer}"
+                assert len(encoder_modules) > 0, (
+                    f"Encoder module {encoder_pointer} does not match decoder module {decoder_pointer}"
+                )

                all_encoder_weights = {module_name + "/" + sub_name for sub_name in encoder_modules.keys()}
                 encoder_layer_pos = 0
@@ -3571,7 +3571,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
                         f"Please upgrade accelerate with `pip install -U accelerate`"
                     )
                 # init state_dict for this shard
-                shard_state_dict = {name: "" for name in shard}
+                shard_state_dict = dict.fromkeys(shard, "")
                 for module_name in shard:
                     # skip to collect this weight again
                     if shard_state_dict.get(module_name) != "":
@@ -4814,7 +4814,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
     param_device_map = expand_device_map(device_map, checkpoint_keys)
     str_dtype = str(dtype).replace("torch.", "") if dtype is not None else "float32"
     if sharded_metadata is None:
-        weight_map = {p: checkpoint_files[0] for p in checkpoint_keys}
+        weight_map = dict.fromkeys(checkpoint_keys, checkpoint_files[0])
     else:
         folder = os.path.sep.join(checkpoint_files[0].split(os.path.sep)[:-1])
         # Fix the weight map keys according to the key mapping
@@ -5446,9 +5446,9 @@ class PoolerEndLogits(nn.Module):
         Returns:
             `torch.FloatTensor`: The end logits for SQuAD.
         """
-        assert (
-            start_states is not None or start_positions is not None
-        ), "One of start_states, start_positions should be not None"
+        assert start_states is not None or start_positions is not None, (
+            "One of start_states, start_positions should be not None"
+        )
         if start_positions is not None:
             slen, hsz = hidden_states.shape[-2:]
             start_positions = start_positions[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz)
@@ -5514,9 +5514,9 @@ class PoolerAnswerClass(nn.Module):
         """
         # No dependency on end_feature so that we can obtain one single `cls_logits` for each sample.
         hsz = hidden_states.shape[-1]
-        assert (
-            start_states is not None or start_positions is not None
-        ), "One of start_states, start_positions should be not None"
+        assert start_states is not None or start_positions is not None, (
+            "One of start_states, start_positions should be not None"
+        )
         if start_positions is not None:
             start_positions = start_positions[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz)
             start_states = hidden_states.gather(-2, start_positions).squeeze(-2) # shape (bsz, hsz)
@@ -1058,7 +1058,7 @@ class AltCLIPVisionEmbeddings(nn.Module):
         batch_size, _, height, width = pixel_values.shape
         if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size):
             raise ValueError(
-                f"Input image size ({height}*{width}) doesn't match model" f" ({self.image_size}*{self.image_size})."
+                f"Input image size ({height}*{width}) doesn't match model ({self.image_size}*{self.image_size})."
             )
         target_dtype = self.patch_embedding.weight.dtype
         patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid]
@@ -329,7 +329,7 @@ class BridgeTowerVisionEmbeddings(nn.Module):
         batch_size, _, height, width = pixel_values.shape
         if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size):
             raise ValueError(
-                f"Input image size ({height}*{width}) doesn't match model" f" ({self.image_size}*{self.image_size})."
+                f"Input image size ({height}*{width}) doesn't match model ({self.image_size}*{self.image_size})."
             )
         target_dtype = self.patch_embedding.weight.dtype
         patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid]
@@ -234,7 +234,7 @@ class ChineseCLIPVisionEmbeddings(nn.Module):
         batch_size, _, height, width = pixel_values.shape
         if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size):
             raise ValueError(
-                f"Input image size ({height}*{width}) doesn't match model" f" ({self.image_size}*{self.image_size})."
+                f"Input image size ({height}*{width}) doesn't match model ({self.image_size}*{self.image_size})."
             )
         target_dtype = self.patch_embedding.weight.dtype
         patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid]
@@ -242,7 +242,7 @@ class CLIPVisionEmbeddings(nn.Module):
         batch_size, _, height, width = pixel_values.shape
         if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size):
             raise ValueError(
-                f"Input image size ({height}*{width}) doesn't match model" f" ({self.image_size}*{self.image_size})."
+                f"Input image size ({height}*{width}) doesn't match model ({self.image_size}*{self.image_size})."
             )
         target_dtype = self.patch_embedding.weight.dtype
         patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid]
@@ -209,7 +209,7 @@ class CLIPSegVisionEmbeddings(nn.Module):
         batch_size, _, height, width = pixel_values.shape
         if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size):
             raise ValueError(
-                f"Input image size ({height}*{width}) doesn't match model" f" ({self.image_size}*{self.image_size})."
+                f"Input image size ({height}*{width}) doesn't match model ({self.image_size}*{self.image_size})."
             )
         patch_embeds = self.patch_embedding(pixel_values) # shape = [*, width, grid, grid]
         patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
@@ -144,7 +144,7 @@ class ClvpEncoderConfig(PretrainedConfig):
         # this is to make sure that we can load only text or speech configs from the nested ClvpConfig.
         if config_type not in cls.base_config_key:
             raise ValueError(
-                f"We can only load either 'text_config' or 'speech_config' but you are trying to load" f"{config_type}"
+                f"We can only load either 'text_config' or 'speech_config' but you are trying to load{config_type}"
             )

        # get the text config dict if we are loading from ClvpConfig
@@ -127,9 +127,9 @@ def convert_data2vec_checkpoint_to_pytorch(

        # self-attention output
         self_output: BertSelfOutput = layer.attention.output
-        assert (
-            self_output.dense.weight.shape == data2vec_layer.self_attn.out_proj.weight.shape
-        ), f"Shape for self_output.dense.weight should be {data2vec_layer.self_attn.out_proj.weight.shape}"
+        assert self_output.dense.weight.shape == data2vec_layer.self_attn.out_proj.weight.shape, (
+            f"Shape for self_output.dense.weight should be {data2vec_layer.self_attn.out_proj.weight.shape}"
+        )
         self_output.dense.weight = data2vec_layer.self_attn.out_proj.weight
         self_output.dense.bias = data2vec_layer.self_attn.out_proj.bias
         self_output.LayerNorm.weight = data2vec_layer.self_attn_layer_norm.weight
@@ -137,17 +137,17 @@ def convert_data2vec_checkpoint_to_pytorch(

        # intermediate
         intermediate: BertIntermediate = layer.intermediate
-        assert (
-            intermediate.dense.weight.shape == data2vec_layer.fc1.weight.shape
-        ), f"Shape for intermediate.dense.weight should be {data2vec_layer.fc1.weight.shape}"
+        assert intermediate.dense.weight.shape == data2vec_layer.fc1.weight.shape, (
+            f"Shape for intermediate.dense.weight should be {data2vec_layer.fc1.weight.shape}"
+        )
         intermediate.dense.weight = data2vec_layer.fc1.weight
         intermediate.dense.bias = data2vec_layer.fc1.bias

        # output
         bert_output: BertOutput = layer.output
-        assert (
-            bert_output.dense.weight.shape == data2vec_layer.fc2.weight.shape
-        ), f"Shape for bert_output.dense.weight should be {data2vec_layer.fc2.weight.shape}"
+        assert bert_output.dense.weight.shape == data2vec_layer.fc2.weight.shape, (
+            f"Shape for bert_output.dense.weight should be {data2vec_layer.fc2.weight.shape}"
+        )
         bert_output.dense.weight = data2vec_layer.fc2.weight
         bert_output.dense.bias = data2vec_layer.fc2.bias
         bert_output.LayerNorm.weight = data2vec_layer.final_layer_norm.weight
@@ -180,9 +180,9 @@ def convert_bort_checkpoint_to_pytorch(bort_checkpoint_path: str, pytorch_dump_f
         gluon_param = to_torch(params[gluon_param])
         shape_gluon = gluon_param.shape

-        assert (
-            shape_hf == shape_gluon
-        ), f"The gluon parameter {gluon_param} has shape {shape_gluon}, but expects shape {shape_hf} for Transformers"
+        assert shape_hf == shape_gluon, (
+            f"The gluon parameter {gluon_param} has shape {shape_gluon}, but expects shape {shape_hf} for Transformers"
+        )

        return gluon_param

@@ -427,7 +427,7 @@ class SubWordJapaneseTokenizer:
         )
         keisen = "─━│┃┄┅┆┇┈┉┊┋┌┍┎┏┐┑┒┓└┕┖┗┘┙┚┛├┝┞┟┠┡┢┣┤┥┦┧┨┩┪┫┬┭┮┯┰┱┲┳┴┵┶┷┸┹┺┻┼┽┾┿╀╁╂╃╄╅╆╇╈╉╊╋╌╍╎╏═║╒╓╔╕╖╗╘╙╚╛╜╝╞╟╠╡╢╣╤╥╦╧╨╩╪╫╬╭╮╯╰╱╲╳╴╵╶╷╸╹╺╻╼╽╾╿"
         blocks = "▀▁▂▃▄▅▆▇█▉▊▋▌▍▎▏▐░▒▓▔▕▖▗▘▙▚▛▜▝▞▟"
-        self.content_trans1 = str.maketrans({k: "<BLOCK>" for k in keisen + blocks})
+        self.content_trans1 = str.maketrans(dict.fromkeys(keisen + blocks, "<BLOCK>"))

    def __len__(self):
         return len(self.ids_to_tokens)
@@ -154,7 +154,7 @@ class OpenLlamaConfig(PretrainedConfig):

        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
             raise ValueError(
-                "`rope_scaling` must be a dictionary with two fields, `type` and `factor`, " f"got {self.rope_scaling}"
+                f"`rope_scaling` must be a dictionary with two fields, `type` and `factor`, got {self.rope_scaling}"
             )
         rope_scaling_type = self.rope_scaling.get("type", None)
         rope_scaling_factor = self.rope_scaling.get("factor", None)
@@ -139,9 +139,9 @@ def load_tf_weights_in_realm(model, config, tf_checkpoint_path):
         elif m_name == "kernel":
             array = np.transpose(array)
         try:
-            assert (
-                pointer.shape == array.shape
-            ), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched"
+            assert pointer.shape == array.shape, (
+                f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched"
+            )
         except AssertionError as e:
             e.args += (pointer.shape, array.shape)
             raise
@@ -579,7 +579,7 @@ class Speech2Text2Decoder(Speech2Text2PreTrainedModel):
         if self.gradient_checkpointing and self.training:
             if use_cache:
                 logger.warning_once(
-                    "`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache =" " False`..."
+                    "`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`..."
                 )
                 use_cache = False

@@ -1095,9 +1095,9 @@ class TFTransfoXLForSequenceClassification(TFTransfoXLPreTrainedModel, TFSequenc
             batch_size, sequence_length = shape_list(input_ids)[:2]
         else:
             batch_size, sequence_length = shape_list(inputs_embeds)[:2]
-        assert (
-            self.config.pad_token_id is not None or batch_size == 1
-        ), "Cannot handle batch sizes > 1 if no padding token is defined."
+        assert self.config.pad_token_id is not None or batch_size == 1, (
+            "Cannot handle batch sizes > 1 if no padding token is defined."
+        )

        if not tf.is_tensor(sequence_lengths):
             in_logits = logits[0:batch_size, sequence_lengths]
@@ -155,9 +155,9 @@ def load_tf_weights_in_transfo_xl(model, config, tf_path):
                 p_i.data = torch.from_numpy(arr_i)
         else:
             try:
-                assert (
-                    pointer.shape == array.shape
-                ), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched"
+                assert pointer.shape == array.shape, (
+                    f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched"
+                )
             except AssertionError as e:
                 e.args += (pointer.shape, array.shape)
                 raise
@@ -1238,9 +1238,9 @@ class TransfoXLForSequenceClassification(TransfoXLPreTrainedModel):
         else:
             batch_size, sequence_length = inputs_embeds.shape[:2]

-        assert (
-            self.config.pad_token_id is not None or batch_size == 1
-        ), "Cannot handle batch sizes > 1 if no padding token is defined."
+        assert self.config.pad_token_id is not None or batch_size == 1, (
+            "Cannot handle batch sizes > 1 if no padding token is defined."
+        )
         if self.config.pad_token_id is None:
             sequence_lengths = -1
         else:
@@ -588,9 +588,9 @@ class XLMProphetNetPositionalEmbeddings(nn.Embedding):
         super().__init__(config.max_position_embeddings, config.hidden_size, config.pad_token_id)

    def forward(self, inputs_shape, device, attention_mask=None, past_key_values=None, position_ids=None):
-        assert (position_ids is None) or (
-            self.padding_idx is None
-        ), "If position_ids is pre-computed then padding_idx should not be set."
+        assert (position_ids is None) or (self.padding_idx is None), (
+            "If position_ids is pre-computed then padding_idx should not be set."
+        )

        if position_ids is None:
             if past_key_values is not None:
@@ -784,9 +784,9 @@ class XLMProphetNetNgramSelfAttention(nn.Module):
         self.head_dim = config.hidden_size // self.num_attn_heads
         self.ngram = config.ngram

-        assert (
-            self.head_dim * self.num_attn_heads == config.hidden_size
-        ), "config.hidden_size must be divisible by num_attn_heads"
+        assert self.head_dim * self.num_attn_heads == config.hidden_size, (
+            "config.hidden_size must be divisible by num_attn_heads"
+        )
         # key, value, query projection
         self.key_proj = nn.Linear(config.hidden_size, config.hidden_size)
         self.value_proj = nn.Linear(config.hidden_size, config.hidden_size)
@@ -1041,9 +1041,9 @@ class XLMProphetNetNgramSelfAttention(nn.Module):

        if predict_relative_position_buckets is None:
             key_sequence_length = attn_weights.shape[-1]
-            assert (
-                position_ids[0][0] == key_sequence_length - 1
-            ), "`position_ids` are incorrect. They should be of the format 1 2 3 4 5 ... (key_sequence_length - 1)"
+            assert position_ids[0][0] == key_sequence_length - 1, (
+                "`position_ids` are incorrect. They should be of the format 1 2 3 4 5 ... (key_sequence_length - 1)"
+            )
             relative_positions = (
                 torch.arange(0, key_sequence_length)
                 .unsqueeze(0)
@@ -1313,9 +1313,9 @@ class XLMProphetNetEncoder(XLMProphetNetPreTrainedModel):

        # check if head_mask has a correct number of layers specified if desired
         if head_mask is not None:
-            assert head_mask.size()[0] == (
-                len(self.layers)
-            ), f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
+            assert head_mask.size()[0] == (len(self.layers)), (
+                f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
+            )
         for idx, encoder_layer in enumerate(self.layers):
             if output_hidden_states:
                 encoder_hidden_states = encoder_hidden_states + (hidden_states,)
@@ -1488,9 +1488,9 @@ class XLMProphetNetDecoder(XLMProphetNetPreTrainedModel):

        # prepare attention mask
         if past_key_values is not None:
-            assert (
-                hidden_states.size(1) == 1
-            ), "At the moment `use_cache` is only supported for `decoder_input_ids` of length 1"
+            assert hidden_states.size(1) == 1, (
+                "At the moment `use_cache` is only supported for `decoder_input_ids` of length 1"
+            )

            ngram_hidden_states = [
                 (ngram_embeddings[ngram - 1] + predicting_stream_pos_embed).repeat(batch_size, 1, 1)
@@ -114,7 +114,7 @@ class DepthProConfig(PretrainedConfig):
         # scaled_images_ratios is sorted
         if scaled_images_ratios != sorted(scaled_images_ratios):
             raise ValueError(
-                f"Values in scaled_images_ratios={scaled_images_ratios} " "should be sorted from low to high"
+                f"Values in scaled_images_ratios={scaled_images_ratios} should be sorted from low to high"
             )

        # scaled_images_ratios, scaled_images_overlap_ratios, scaled_images_feature_dims should be consistent
@@ -275,9 +275,9 @@ class FlaxTransformerBlock(nn.Module):
     dtype: jnp.dtype = jnp.float32 # the dtype of the computation

    def setup(self):
-        assert (
-            self.config.dim % self.config.n_heads == 0
-        ), f"Hidden size {self.config.dim} not dividable by number of heads {self.config.n_heads}"
+        assert self.config.dim % self.config.n_heads == 0, (
+            f"Hidden size {self.config.dim} not dividable by number of heads {self.config.n_heads}"
+        )

        self.attention = FlaxMultiHeadSelfAttention(self.config, dtype=self.dtype)
         self.sa_layer_norm = nn.LayerNorm(epsilon=1e-12, dtype=self.dtype)
@@ -269,9 +269,9 @@ class TFTransformerBlock(keras.layers.Layer):
         self.activation = config.activation
         self.output_attentions = config.output_attentions

-        assert (
-            config.dim % config.n_heads == 0
-        ), f"Hidden size {config.dim} not dividable by number of heads {config.n_heads}"
+        assert config.dim % config.n_heads == 0, (
+            f"Hidden size {config.dim} not dividable by number of heads {config.n_heads}"
+        )

        self.attention = TFMultiHeadSelfAttention(config, name="attention")
         self.sa_layer_norm = keras.layers.LayerNormalization(epsilon=1e-12, name="sa_layer_norm")
@@ -137,7 +137,7 @@ if __name__ == "__main__":
     dest_dir = f"converted-{src_file.name}" if args.dest is None else args.dest
     dest_dir = Path(dest_dir)
     assert src_file.exists()
-    assert (
-        args.type is not None
-    ), "Please specify the component type of the DPR model to convert: 'ctx_encoder', 'question_encoder' or 'reader'."
+    assert args.type is not None, (
+        "Please specify the component type of the DPR model to convert: 'ctx_encoder', 'question_encoder' or 'reader'."
+    )
     convert(args.type, src_file, dest_dir)
@@ -170,9 +170,9 @@ class CustomDPRReaderTokenizerMixin:
         texts = texts if not isinstance(texts, str) else [texts]
         n_passages = len(titles)
         questions = questions if not isinstance(questions, str) else [questions] * n_passages
-        assert len(titles) == len(
-            texts
-        ), f"There should be as many titles than texts but got {len(titles)} titles and {len(texts)} texts."
+        assert len(titles) == len(texts), (
+            f"There should be as many titles than texts but got {len(titles)} titles and {len(texts)} texts."
+        )
         encoded_question_and_titles = super().__call__(questions, titles, padding=False, truncation=False)["input_ids"]
         encoded_texts = super().__call__(texts, add_special_tokens=False, padding=False, truncation=False)["input_ids"]
         encoded_inputs = {
@@ -617,8 +617,7 @@ class EncodecModel(EncodecPreTrainedModel):
             bandwidth = self.config.target_bandwidths[0]
         if bandwidth not in self.config.target_bandwidths:
             raise ValueError(
-                f"This model doesn't support the bandwidth {bandwidth}. "
-                f"Select one of {self.config.target_bandwidths}."
+                f"This model doesn't support the bandwidth {bandwidth}. Select one of {self.config.target_bandwidths}."
             )

        _, channels, input_length = input_values.shape
@@ -399,13 +399,11 @@ def map_structure_with_atom_order(in_list: list, first_call: bool = True) -> lis


@functools.lru_cache(maxsize=None)
-def load_stereo_chemical_props() -> (
-    Tuple[
+def load_stereo_chemical_props() -> Tuple[
     Mapping[str, List[Bond]],
     Mapping[str, List[Bond]],
     Mapping[str, List[BondAngle]],
-    ]
-):
+]:
     """Load stereo_chemical_props.txt into a nice structure.

    Load literature values for bond lengths and bond angles and translate bond angles into the length of the opposite
@@ -539,9 +539,9 @@ class FSMTEncoder(nn.Module):
         all_attentions = () if output_attentions else None
         # check if head_mask has a correct number of layers specified if desired
         if head_mask is not None:
-            assert head_mask.size()[0] == (
-                len(self.layers)
-            ), f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
+            assert head_mask.size()[0] == (len(self.layers)), (
+                f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
+            )
         for idx, encoder_layer in enumerate(self.layers):
             if output_hidden_states:
                 x = x.transpose(0, 1) # T x B x C -> B x T x C
@@ -960,9 +960,9 @@ class Attention(nn.Module):
         attn_weights = nn.functional.softmax(attn_weights, dim=-1)

        if layer_head_mask is not None:
-            assert layer_head_mask.size() == (
-                self.num_heads,
-            ), f"Head mask for a single layer should be of size {(self.num_heads,)}, but is {layer_head_mask.size()}"
+            assert layer_head_mask.size() == (self.num_heads,), (
+                f"Head mask for a single layer should be of size {(self.num_heads,)}, but is {layer_head_mask.size()}"
+            )
             attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
             attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

@@ -113,9 +113,9 @@ class FunnelConfig(PretrainedConfig):
         self.vocab_size = vocab_size
         self.block_sizes = block_sizes
         self.block_repeats = [1] * len(block_sizes) if block_repeats is None else block_repeats
-        assert len(block_sizes) == len(
-            self.block_repeats
-        ), "`block_sizes` and `block_repeats` should have the same length."
+        assert len(block_sizes) == len(self.block_repeats), (
+            "`block_sizes` and `block_repeats` should have the same length."
+        )
         self.num_decoder_layers = num_decoder_layers
         self.d_model = d_model
         self.n_head = n_head
@@ -195,7 +195,7 @@ class FuyuConfig(PretrainedConfig):

        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
             raise ValueError(
-                "`rope_scaling` must be a dictionary with two fields, `type` and `factor`, " f"got {self.rope_scaling}"
+                f"`rope_scaling` must be a dictionary with two fields, `type` and `factor`, got {self.rope_scaling}"
             )
         rope_scaling_type = self.rope_scaling.get("type", None)
         rope_scaling_factor = self.rope_scaling.get("factor", None)
@@ -683,7 +683,7 @@ class GitVisionEmbeddings(nn.Module):
         batch_size, _, height, width = pixel_values.shape
         if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size):
             raise ValueError(
-                f"Input image size ({height}*{width}) doesn't match model" f" ({self.image_size}*{self.image_size})."
+                f"Input image size ({height}*{width}) doesn't match model ({self.image_size}*{self.image_size})."
             )
         target_dtype = self.patch_embedding.weight.dtype
         patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid]
@@ -250,7 +250,7 @@ class SubWordJapaneseTokenizer:
         )
         keisen = "─━│┃┄┅┆┇┈┉┊┋┌┍┎┏┐┑┒┓└┕┖┗┘┙┚┛├┝┞┟┠┡┢┣┤┥┦┧┨┩┪┫┬┭┮┯┰┱┲┳┴┵┶┷┸┹┺┻┼┽┾┿╀╁╂╃╄╅╆╇╈╉╊╋╌╍╎╏═║╒╓╔╕╖╗╘╙╚╛╜╝╞╟╠╡╢╣╤╥╦╧╨╩╪╫╬╭╮╯╰╱╲╳╴╵╶╷╸╹╺╻╼╽╾╿"
         blocks = "▀▁▂▃▄▅▆▇█▉▊▋▌▍▎▏▐░▒▓▔▕▖▗▘▙▚▛▜▝▞▟"
-        self.content_trans1 = str.maketrans({k: "<BLOCK>" for k in keisen + blocks})
+        self.content_trans1 = str.maketrans(dict.fromkeys(keisen + blocks, "<BLOCK>"))

    def __len__(self):
         return len(self.ids_to_tokens)
@@ -171,9 +171,9 @@ class QuantAct(nn.Module):
             x_min = x_act.data.min()
             x_max = x_act.data.max()

-            assert (
-                x_max.isnan().sum() == 0 and x_min.isnan().sum() == 0
-            ), "NaN detected when computing min/max of the activation"
+            assert x_max.isnan().sum() == 0 and x_min.isnan().sum() == 0, (
+                "NaN detected when computing min/max of the activation"
+            )

            # Initialization
             if self.x_min.min() > -1.1e-5 and self.x_max.max() < 1.1e-5:
@@ -451,7 +451,7 @@ class Kosmos2VisionEmbeddings(nn.Module):
         batch_size, _, height, width = pixel_values.shape
         if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size):
             raise ValueError(
-                f"Input image size ({height}*{width}) doesn't match model" f" ({self.image_size}*{self.image_size})."
+                f"Input image size ({height}*{width}) doesn't match model ({self.image_size}*{self.image_size})."
             )
         target_dtype = self.patch_embedding.weight.dtype
         patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid]
@@ -101,8 +101,7 @@ class LayoutXLMProcessor(ProcessorMixin):
         # verify input
         if self.image_processor.apply_ocr and (boxes is not None):
             raise ValueError(
-                "You cannot provide bounding boxes "
-                "if you initialized the image processor with apply_ocr set to True."
+                "You cannot provide bounding boxes if you initialized the image processor with apply_ocr set to True."
             )

        if self.image_processor.apply_ocr and (word_labels is not None):
@@ -130,12 +130,12 @@ class LEDEncoderSelfAttention(nn.Module):

        self.layer_id = layer_id
         attention_window = config.attention_window[self.layer_id]
-        assert (
-            attention_window % 2 == 0
-        ), f"`attention_window` for layer {self.layer_id} has to be an even value. Given {attention_window}"
-        assert (
-            attention_window > 0
-        ), f"`attention_window` for layer {self.layer_id} has to be positive. Given {attention_window}"
+        assert attention_window % 2 == 0, (
+            f"`attention_window` for layer {self.layer_id} has to be an even value. Given {attention_window}"
+        )
+        assert attention_window > 0, (
+            f"`attention_window` for layer {self.layer_id} has to be positive. Given {attention_window}"
+        )

        self.one_sided_attn_window_size = attention_window // 2

@@ -169,9 +169,9 @@ class LEDEncoderSelfAttention(nn.Module):
         value_vectors = self.value(hidden_states)

        seq_len, batch_size, embed_dim = hidden_states.size()
-        assert (
-            embed_dim == self.embed_dim
-        ), f"hidden_states should have embed_dim = {self.embed_dim}, but has {embed_dim}"
+        assert embed_dim == self.embed_dim, (
+            f"hidden_states should have embed_dim = {self.embed_dim}, but has {embed_dim}"
+        )

        # normalize query
         query_vectors /= math.sqrt(self.head_dim)
@@ -239,9 +239,9 @@ class LEDEncoderSelfAttention(nn.Module):
         ) # use fp32 for numerical stability

        if layer_head_mask is not None:
-            assert layer_head_mask.size() == (
-                self.num_heads,
-            ), f"Head mask for a single layer should be of size {(self.num_heads,)}, but is {layer_head_mask.size()}"
+            assert layer_head_mask.size() == (self.num_heads,), (
+                f"Head mask for a single layer should be of size {(self.num_heads,)}, but is {layer_head_mask.size()}"
+            )
             attn_probs = layer_head_mask.view(1, 1, -1, 1) * attn_probs

        # softmax sometimes inserts NaN if all positions are masked, replace them with 0
@@ -433,9 +433,9 @@ class LEDEncoderSelfAttention(nn.Module):
            overlap of size window_overlap
         """
         batch_size, seq_len, num_heads, head_dim = query.size()
-        assert (
-            seq_len % (window_overlap * 2) == 0
-        ), f"Sequence length should be multiple of {window_overlap * 2}. Given {seq_len}"
+        assert seq_len % (window_overlap * 2) == 0, (
+            f"Sequence length should be multiple of {window_overlap * 2}. Given {seq_len}"
+        )
         assert query.size() == key.size()

        chunks_count = torch.div(seq_len, window_overlap, rounding_mode="trunc") - 1
@@ -706,9 +706,9 @@ class LEDEncoderSelfAttention(nn.Module):

        # apply layer head masking
         if layer_head_mask is not None:
-            assert layer_head_mask.size() == (
-                self.num_heads,
-            ), f"Head mask for a single layer should be of size {(self.num_heads,)}, but is {layer_head_mask.size()}"
+            assert layer_head_mask.size() == (self.num_heads,), (
+                f"Head mask for a single layer should be of size {(self.num_heads,)}, but is {layer_head_mask.size()}"
+            )
             global_attn_probs_float = layer_head_mask.view(1, -1, 1, 1) * global_attn_probs_float.view(
                 batch_size, self.num_heads, max_num_global_attn_indices, seq_len
             )
@@ -182,12 +182,12 @@ class TFLEDEncoderSelfAttention(keras.layers.Layer):
         self.layer_id = layer_id
         attention_window = config.attention_window[self.layer_id]

-        assert (
-            attention_window % 2 == 0
-        ), f"`attention_window` for layer {self.layer_id} has to be an even value. Given {attention_window}"
-        assert (
-            attention_window > 0
-        ), f"`attention_window` for layer {self.layer_id} has to be positive. Given {attention_window}"
+        assert attention_window % 2 == 0, (
+            f"`attention_window` for layer {self.layer_id} has to be an even value. Given {attention_window}"
+        )
+        assert attention_window > 0, (
+            f"`attention_window` for layer {self.layer_id} has to be positive. Given {attention_window}"
+        )

        self.one_sided_attn_window_size = attention_window // 2

@@ -510,12 +510,12 @@ class LongformerSelfAttention(nn.Module):

        self.layer_id = layer_id
         attention_window = config.attention_window[self.layer_id]
-        assert (
-            attention_window % 2 == 0
-        ), f"`attention_window` for layer {self.layer_id} has to be an even value. Given {attention_window}"
-        assert (
-            attention_window > 0
-        ), f"`attention_window` for layer {self.layer_id} has to be positive. Given {attention_window}"
+        assert attention_window % 2 == 0, (
+            f"`attention_window` for layer {self.layer_id} has to be an even value. Given {attention_window}"
+        )
+        assert attention_window > 0, (
+            f"`attention_window` for layer {self.layer_id} has to be positive. Given {attention_window}"
+        )

        self.one_sided_attn_window_size = attention_window // 2

@@ -549,9 +549,9 @@ class LongformerSelfAttention(nn.Module):
         value_vectors = self.value(hidden_states)

        seq_len, batch_size, embed_dim = hidden_states.size()
-        assert (
-            embed_dim == self.embed_dim
-        ), f"hidden_states should have embed_dim = {self.embed_dim}, but has {embed_dim}"
+        assert embed_dim == self.embed_dim, (
+            f"hidden_states should have embed_dim = {self.embed_dim}, but has {embed_dim}"
+        )

        # normalize query
         query_vectors /= math.sqrt(self.head_dim)
@@ -619,9 +619,9 @@ class LongformerSelfAttention(nn.Module):
         ) # use fp32 for numerical stability

        if layer_head_mask is not None:
-            assert layer_head_mask.size() == (
-                self.num_heads,
-            ), f"Head mask for a single layer should be of size {(self.num_heads,)}, but is {layer_head_mask.size()}"
+            assert layer_head_mask.size() == (self.num_heads,), (
+                f"Head mask for a single layer should be of size {(self.num_heads,)}, but is {layer_head_mask.size()}"
+            )
             attn_probs = layer_head_mask.view(1, 1, -1, 1) * attn_probs

        # softmax sometimes inserts NaN if all positions are masked, replace them with 0
@@ -813,9 +813,9 @@ class LongformerSelfAttention(nn.Module):
            overlap of size window_overlap
         """
         batch_size, seq_len, num_heads, head_dim = query.size()
-        assert (
-            seq_len % (window_overlap * 2) == 0
-        ), f"Sequence length should be multiple of {window_overlap * 2}. Given {seq_len}"
+        assert seq_len % (window_overlap * 2) == 0, (
+            f"Sequence length should be multiple of {window_overlap * 2}. Given {seq_len}"
+        )
         assert query.size() == key.size()

        chunks_count = torch.div(seq_len, window_overlap, rounding_mode="trunc") - 1
@@ -1086,9 +1086,9 @@ class LongformerSelfAttention(nn.Module):

        # apply layer head masking
         if layer_head_mask is not None:
-            assert layer_head_mask.size() == (
-                self.num_heads,
-            ), f"Head mask for a single layer should be of size {(self.num_heads,)}, but is {layer_head_mask.size()}"
+            assert layer_head_mask.size() == (self.num_heads,), (
+                f"Head mask for a single layer should be of size {(self.num_heads,)}, but is {layer_head_mask.size()}"
+            )
             global_attn_probs_float = layer_head_mask.view(1, -1, 1, 1) * global_attn_probs_float.view(
|
||||||
batch_size, self.num_heads, max_num_global_attn_indices, seq_len
|
batch_size, self.num_heads, max_num_global_attn_indices, seq_len
|
||||||
)
|
)
|
||||||
@ -1287,9 +1287,9 @@ class LongformerEncoder(nn.Module):
|
|||||||
|
|
||||||
# check if head_mask has a correct number of layers specified if desired
|
# check if head_mask has a correct number of layers specified if desired
|
||||||
if head_mask is not None:
|
if head_mask is not None:
|
||||||
assert head_mask.size()[0] == (
|
assert head_mask.size()[0] == (len(self.layer)), (
|
||||||
len(self.layer)
|
f"The head_mask should be specified for {len(self.layer)} layers, but it is for {head_mask.size()[0]}."
|
||||||
), f"The head_mask should be specified for {len(self.layer)} layers, but it is for {head_mask.size()[0]}."
|
)
|
||||||
for idx, layer_module in enumerate(self.layer):
|
for idx, layer_module in enumerate(self.layer):
|
||||||
if output_hidden_states:
|
if output_hidden_states:
|
||||||
all_hidden_states = all_hidden_states + (hidden_states,)
|
all_hidden_states = all_hidden_states + (hidden_states,)
|
||||||
@ -1590,8 +1590,7 @@ class LongformerModel(LongformerPreTrainedModel):
|
|||||||
# this path should be recorded in the ONNX export, it is fine with padding_len == 0 as well
|
# this path should be recorded in the ONNX export, it is fine with padding_len == 0 as well
|
||||||
if padding_len > 0:
|
if padding_len > 0:
|
||||||
logger.warning_once(
|
logger.warning_once(
|
||||||
f"Input ids are automatically padded to be a multiple of "
|
f"Input ids are automatically padded to be a multiple of `config.attention_window`: {attention_window}"
|
||||||
f"`config.attention_window`: {attention_window}"
|
|
||||||
)
|
)
|
||||||
if input_ids is not None:
|
if input_ids is not None:
|
||||||
input_ids = nn.functional.pad(input_ids, (0, padding_len), value=pad_token_id)
|
input_ids = nn.functional.pad(input_ids, (0, padding_len), value=pad_token_id)
|
||||||
|
@ -746,12 +746,12 @@ class TFLongformerSelfAttention(keras.layers.Layer):
|
|||||||
self.layer_id = layer_id
|
self.layer_id = layer_id
|
||||||
attention_window = config.attention_window[self.layer_id]
|
attention_window = config.attention_window[self.layer_id]
|
||||||
|
|
||||||
assert (
|
assert attention_window % 2 == 0, (
|
||||||
attention_window % 2 == 0
|
f"`attention_window` for layer {self.layer_id} has to be an even value. Given {attention_window}"
|
||||||
), f"`attention_window` for layer {self.layer_id} has to be an even value. Given {attention_window}"
|
)
|
||||||
assert (
|
assert attention_window > 0, (
|
||||||
attention_window > 0
|
f"`attention_window` for layer {self.layer_id} has to be positive. Given {attention_window}"
|
||||||
), f"`attention_window` for layer {self.layer_id} has to be positive. Given {attention_window}"
|
)
|
||||||
|
|
||||||
self.one_sided_attn_window_size = attention_window // 2
|
self.one_sided_attn_window_size = attention_window // 2
|
||||||
|
|
||||||
|
@ -1294,7 +1294,7 @@ class M2M100Decoder(M2M100PreTrainedModel):
|
|||||||
if self.gradient_checkpointing and self.training:
|
if self.gradient_checkpointing and self.training:
|
||||||
if use_cache:
|
if use_cache:
|
||||||
logger.warning_once(
|
logger.warning_once(
|
||||||
"`use_cache=True` is incompatible with gradient checkpointing. Setting" " `use_cache=False`..."
|
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
|
||||||
)
|
)
|
||||||
use_cache = False
|
use_cache = False
|
||||||
|
|
||||||
|
@ -228,7 +228,7 @@ class TatoebaConverter:
|
|||||||
# combine with Tatoeba markdown
|
# combine with Tatoeba markdown
|
||||||
readme_url = f"{TATOEBA_MODELS_URL}/{model_dict['_name']}/README.md"
|
readme_url = f"{TATOEBA_MODELS_URL}/{model_dict['_name']}/README.md"
|
||||||
extra_markdown = f"""
|
extra_markdown = f"""
|
||||||
### {model_dict['_name']}
|
### {model_dict["_name"]}
|
||||||
|
|
||||||
* source language name: {self.tag2name[a3_src]}
|
* source language name: {self.tag2name[a3_src]}
|
||||||
* target language name: {self.tag2name[a3_tgt]}
|
* target language name: {self.tag2name[a3_tgt]}
|
||||||
@ -237,12 +237,12 @@ class TatoebaConverter:
|
|||||||
|
|
||||||
content = (
|
content = (
|
||||||
f"""
|
f"""
|
||||||
* model: {model_dict['modeltype']}
|
* model: {model_dict["modeltype"]}
|
||||||
* source language code{src_multilingual*'s'}: {', '.join(a2_src_tags)}
|
* source language code{src_multilingual * "s"}: {", ".join(a2_src_tags)}
|
||||||
* target language code{tgt_multilingual*'s'}: {', '.join(a2_tgt_tags)}
|
* target language code{tgt_multilingual * "s"}: {", ".join(a2_tgt_tags)}
|
||||||
* dataset: opus {backtranslated_data}
|
* dataset: opus {backtranslated_data}
|
||||||
* release date: {model_dict['release-date']}
|
* release date: {model_dict["release-date"]}
|
||||||
* pre-processing: {model_dict['pre-processing']}
|
* pre-processing: {model_dict["pre-processing"]}
|
||||||
"""
|
"""
|
||||||
+ multilingual_data
|
+ multilingual_data
|
||||||
+ tuned
|
+ tuned
|
||||||
|
@ -741,9 +741,9 @@ class MarianEncoder(MarianPreTrainedModel):
|
|||||||
|
|
||||||
# check if head_mask has a correct number of layers specified if desired
|
# check if head_mask has a correct number of layers specified if desired
|
||||||
if head_mask is not None:
|
if head_mask is not None:
|
||||||
assert head_mask.size()[0] == (
|
assert head_mask.size()[0] == (len(self.layers)), (
|
||||||
len(self.layers)
|
f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
|
||||||
), f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
|
)
|
||||||
for idx, encoder_layer in enumerate(self.layers):
|
for idx, encoder_layer in enumerate(self.layers):
|
||||||
if output_hidden_states:
|
if output_hidden_states:
|
||||||
encoder_states = encoder_states + (hidden_states,)
|
encoder_states = encoder_states + (hidden_states,)
|
||||||
|
@ -339,7 +339,7 @@ class MarianTokenizer(PreTrainedTokenizer):
|
|||||||
def __getstate__(self) -> Dict:
|
def __getstate__(self) -> Dict:
|
||||||
state = self.__dict__.copy()
|
state = self.__dict__.copy()
|
||||||
state.update(
|
state.update(
|
||||||
{k: None for k in ["spm_source", "spm_target", "current_spm", "punc_normalizer", "target_vocab_file"]}
|
dict.fromkeys(["spm_source", "spm_target", "current_spm", "punc_normalizer", "target_vocab_file"])
|
||||||
)
|
)
|
||||||
return state
|
return state
|
||||||
|
|
||||||
|
@ -863,9 +863,9 @@ def test(
|
|||||||
for original_model_feature, our_model_feature in zip(
|
for original_model_feature, our_model_feature in zip(
|
||||||
original_model_backbone_features.values(), our_model_output.encoder_hidden_states
|
original_model_backbone_features.values(), our_model_output.encoder_hidden_states
|
||||||
):
|
):
|
||||||
assert torch.allclose(
|
assert torch.allclose(original_model_feature, our_model_feature, atol=tolerance), (
|
||||||
original_model_feature, our_model_feature, atol=tolerance
|
"The backbone features are not the same."
|
||||||
), "The backbone features are not the same."
|
)
|
||||||
|
|
||||||
# Test pixel decoder
|
# Test pixel decoder
|
||||||
mask_features, _, multi_scale_features = original_model.sem_seg_head.pixel_decoder.forward_features(
|
mask_features, _, multi_scale_features = original_model.sem_seg_head.pixel_decoder.forward_features(
|
||||||
@ -875,9 +875,9 @@ def test(
|
|||||||
for original_model_feature, our_model_feature in zip(
|
for original_model_feature, our_model_feature in zip(
|
||||||
multi_scale_features, our_model_output.pixel_decoder_hidden_states
|
multi_scale_features, our_model_output.pixel_decoder_hidden_states
|
||||||
):
|
):
|
||||||
assert torch.allclose(
|
assert torch.allclose(original_model_feature, our_model_feature, atol=tolerance), (
|
||||||
original_model_feature, our_model_feature, atol=tolerance
|
"The pixel decoder feature are not the same"
|
||||||
), "The pixel decoder feature are not the same"
|
)
|
||||||
|
|
||||||
# Let's test the full model
|
# Let's test the full model
|
||||||
tr_complete = T.Compose(
|
tr_complete = T.Compose(
|
||||||
@ -894,12 +894,12 @@ def test(
|
|||||||
|
|
||||||
assert original_mask_logits.shape == our_mask_logits.shape, "Output masks shapes are not matching."
|
assert original_mask_logits.shape == our_mask_logits.shape, "Output masks shapes are not matching."
|
||||||
assert original_class_logits.shape == our_class_logits.shape, "Output class logits shapes are not matching."
|
assert original_class_logits.shape == our_class_logits.shape, "Output class logits shapes are not matching."
|
||||||
assert torch.allclose(
|
assert torch.allclose(original_class_logits, our_class_logits, atol=tolerance), (
|
||||||
original_class_logits, our_class_logits, atol=tolerance
|
"The class logits are not the same."
|
||||||
), "The class logits are not the same."
|
)
|
||||||
assert torch.allclose(
|
assert torch.allclose(original_mask_logits, our_mask_logits, atol=tolerance), (
|
||||||
original_mask_logits, our_mask_logits, atol=tolerance
|
"The predicted masks are not the same."
|
||||||
), "The predicted masks are not the same."
|
)
|
||||||
|
|
||||||
logger.info("✅ Test passed!")
|
logger.info("✅ Test passed!")
|
||||||
|
|
||||||
|
@ -581,9 +581,9 @@ def test(original_model, our_model: MaskFormerForInstanceSegmentation, image_pro
|
|||||||
for original_model_feature, our_model_feature in zip(
|
for original_model_feature, our_model_feature in zip(
|
||||||
original_model_backbone_features.values(), our_model_output.encoder_hidden_states
|
original_model_backbone_features.values(), our_model_output.encoder_hidden_states
|
||||||
):
|
):
|
||||||
assert torch.allclose(
|
assert torch.allclose(original_model_feature, our_model_feature, atol=1e-3), (
|
||||||
original_model_feature, our_model_feature, atol=1e-3
|
"The backbone features are not the same."
|
||||||
), "The backbone features are not the same."
|
)
|
||||||
|
|
||||||
original_model_pixel_out = original_model.sem_seg_head.pixel_decoder.forward_features(
|
original_model_pixel_out = original_model.sem_seg_head.pixel_decoder.forward_features(
|
||||||
original_model_backbone_features
|
original_model_backbone_features
|
||||||
@ -602,9 +602,9 @@ def test(original_model, our_model: MaskFormerForInstanceSegmentation, image_pro
|
|||||||
|
|
||||||
our_segmentation = image_processor.post_process_segmentation(our_model_out, target_size=(384, 384))
|
our_segmentation = image_processor.post_process_segmentation(our_model_out, target_size=(384, 384))
|
||||||
|
|
||||||
assert torch.allclose(
|
assert torch.allclose(original_segmentation, our_segmentation, atol=1e-3), (
|
||||||
original_segmentation, our_segmentation, atol=1e-3
|
"The segmentation image is not the same."
|
||||||
), "The segmentation image is not the same."
|
)
|
||||||
|
|
||||||
logger.info("✅ Test passed!")
|
logger.info("✅ Test passed!")
|
||||||
|
|
||||||
|
@ -144,9 +144,9 @@ def load_tf_weights_in_mobilebert(model, config, tf_checkpoint_path):
|
|||||||
elif m_name == "kernel":
|
elif m_name == "kernel":
|
||||||
array = np.transpose(array)
|
array = np.transpose(array)
|
||||||
try:
|
try:
|
||||||
assert (
|
assert pointer.shape == array.shape, (
|
||||||
pointer.shape == array.shape
|
f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched"
|
||||||
), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched"
|
)
|
||||||
except AssertionError as e:
|
except AssertionError as e:
|
||||||
e.args += (pointer.shape, array.shape)
|
e.args += (pointer.shape, array.shape)
|
||||||
raise
|
raise
|
||||||
|
@ -99,9 +99,9 @@ def get_mobilevitv2_config(task_name, orig_cfg_file):
|
|||||||
orig_config = load_orig_config_file(orig_cfg_file)
|
orig_config = load_orig_config_file(orig_cfg_file)
|
||||||
assert getattr(orig_config, "model.classification.name", -1) == "mobilevit_v2", "Invalid model"
|
assert getattr(orig_config, "model.classification.name", -1) == "mobilevit_v2", "Invalid model"
|
||||||
config.width_multiplier = getattr(orig_config, "model.classification.mitv2.width_multiplier", 1.0)
|
config.width_multiplier = getattr(orig_config, "model.classification.mitv2.width_multiplier", 1.0)
|
||||||
assert (
|
assert getattr(orig_config, "model.classification.mitv2.attn_norm_layer", -1) == "layer_norm_2d", (
|
||||||
getattr(orig_config, "model.classification.mitv2.attn_norm_layer", -1) == "layer_norm_2d"
|
"Norm layers other than layer_norm_2d is not supported"
|
||||||
), "Norm layers other than layer_norm_2d is not supported"
|
)
|
||||||
config.hidden_act = getattr(orig_config, "model.classification.activation.name", "swish")
|
config.hidden_act = getattr(orig_config, "model.classification.activation.name", "swish")
|
||||||
# config.image_size == getattr(orig_config, 'sampler.bs.crop_size_width', 256)
|
# config.image_size == getattr(orig_config, 'sampler.bs.crop_size_width', 256)
|
||||||
|
|
||||||
@ -184,7 +184,9 @@ def create_rename_keys(state_dict, base_model=False):
|
|||||||
)
|
)
|
||||||
|
|
||||||
if f"layer_{i}.1.conv_proj." in k:
|
if f"layer_{i}.1.conv_proj." in k:
|
||||||
k_new = k_new.replace(f"layer_{i}.1.conv_proj.", f"{model_prefix}encoder.layer.{i-1}.conv_projection.")
|
k_new = k_new.replace(
|
||||||
|
f"layer_{i}.1.conv_proj.", f"{model_prefix}encoder.layer.{i - 1}.conv_projection."
|
||||||
|
)
|
||||||
|
|
||||||
if "pre_norm_attn.0." in k:
|
if "pre_norm_attn.0." in k:
|
||||||
k_new = k_new.replace("pre_norm_attn.0.", "layernorm_before.")
|
k_new = k_new.replace("pre_norm_attn.0.", "layernorm_before.")
|
||||||
|
@ -56,7 +56,7 @@ def _read_h5_weights(group, current_key="", weights={}):
|
|||||||
def _convert_layer_names(name, gated_mlp=False):
|
def _convert_layer_names(name, gated_mlp=False):
|
||||||
name = re.sub(
|
name = re.sub(
|
||||||
r"layers\.functional(?:_(\d+))?\.layers",
|
r"layers\.functional(?:_(\d+))?\.layers",
|
||||||
lambda m: f'layers.{m.group(1) if m.group(1) else "0"}',
|
lambda m: f"layers.{m.group(1) if m.group(1) else '0'}",
|
||||||
name,
|
name,
|
||||||
count=1,
|
count=1,
|
||||||
)
|
)
|
||||||
|
@ -719,9 +719,9 @@ def load_tf_weights_in_mt5(model, config, tf_checkpoint_path):
|
|||||||
logger.info(f"Transposing numpy weight of shape {array.shape} for {name}")
|
logger.info(f"Transposing numpy weight of shape {array.shape} for {name}")
|
||||||
array = np.transpose(array)
|
array = np.transpose(array)
|
||||||
try:
|
try:
|
||||||
assert (
|
assert pointer.shape == array.shape, (
|
||||||
pointer.shape == array.shape
|
f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched"
|
||||||
), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched"
|
)
|
||||||
except AssertionError as e:
|
except AssertionError as e:
|
||||||
e.args += (pointer.shape, array.shape)
|
e.args += (pointer.shape, array.shape)
|
||||||
raise
|
raise
|
||||||
|
@ -65,13 +65,13 @@ def get_args():
|
|||||||
"--hf_input_path",
|
"--hf_input_path",
|
||||||
type=str,
|
type=str,
|
||||||
default=None,
|
default=None,
|
||||||
help="A HF model path, " "e.g. a folder containing https://huggingface.co/nvidia/Minitron-8B-Base",
|
help="A HF model path, e.g. a folder containing https://huggingface.co/nvidia/Minitron-8B-Base",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--hf_output_path",
|
"--hf_output_path",
|
||||||
type=str,
|
type=str,
|
||||||
default=None,
|
default=None,
|
||||||
help="Output HF model path, " "with the same format as above but user's own weights",
|
help="Output HF model path, with the same format as above but user's own weights",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--precision",
|
"--precision",
|
||||||
|
@ -91,7 +91,9 @@ def shard_on_the_fly(switch_checkpoint_path, dump_path, num_experts, dtype, weig
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Add the last block
|
# Add the last block
|
||||||
save_path = os.path.join(dump_path, weights_name.replace(".bin", f"-{len(sharded_state_dicts)+1:05d}-of-???.bin"))
|
save_path = os.path.join(
|
||||||
|
dump_path, weights_name.replace(".bin", f"-{len(sharded_state_dicts) + 1:05d}-of-???.bin")
|
||||||
|
)
|
||||||
shared_weights = torch.load(switch_checkpoint_path + "-shared.pt")["model"]
|
shared_weights = torch.load(switch_checkpoint_path + "-shared.pt")["model"]
|
||||||
remove_ignore_keys_(shared_weights)
|
remove_ignore_keys_(shared_weights)
|
||||||
shared_weights = rename_fairseq_keys(shared_weights, None)
|
shared_weights = rename_fairseq_keys(shared_weights, None)
|
||||||
|
@ -1352,7 +1352,7 @@ class NllbMoeDecoder(NllbMoePreTrainedModel):
|
|||||||
if self.gradient_checkpointing and self.training:
|
if self.gradient_checkpointing and self.training:
|
||||||
if use_cache:
|
if use_cache:
|
||||||
logger.warning_once(
|
logger.warning_once(
|
||||||
"`use_cache=True` is incompatible with gradient checkpointing. Setting" " `use_cache=False`..."
|
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
|
||||||
)
|
)
|
||||||
use_cache = False
|
use_cache = False
|
||||||
|
|
||||||
|
@ -5,7 +5,6 @@
|
|||||||
# modular_olmo2.py file directly. One of our CI enforces this.
|
# modular_olmo2.py file directly. One of our CI enforces this.
|
||||||
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
|
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
|
||||||
|
|
||||||
|
|
||||||
from ...configuration_utils import PretrainedConfig
|
from ...configuration_utils import PretrainedConfig
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
from typing import Callable, Optional, Tuple
|
from typing import Callable, Optional, Tuple
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
from torch import nn
|
import torch.nn as nn
|
||||||
|
|
||||||
from ...cache_utils import Cache
|
from ...cache_utils import Cache
|
||||||
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
|
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
|
||||||
|
@ -1010,9 +1010,9 @@ def test(
|
|||||||
for original_model_feature, our_model_feature in zip(
|
for original_model_feature, our_model_feature in zip(
|
||||||
original_model_backbone_features.values(), our_model_output.encoder_hidden_states
|
original_model_backbone_features.values(), our_model_output.encoder_hidden_states
|
||||||
):
|
):
|
||||||
assert torch.allclose(
|
assert torch.allclose(original_model_feature, our_model_feature, atol=3e-3), (
|
||||||
original_model_feature, our_model_feature, atol=3e-3
|
"The backbone features are not the same."
|
||||||
), "The backbone features are not the same."
|
)
|
||||||
mask_features, _, multi_scale_features, _, _ = original_model.sem_seg_head.pixel_decoder.forward_features(
|
mask_features, _, multi_scale_features, _, _ = original_model.sem_seg_head.pixel_decoder.forward_features(
|
||||||
original_model_backbone_features
|
original_model_backbone_features
|
||||||
)
|
)
|
||||||
@ -1025,9 +1025,9 @@ def test(
|
|||||||
for original_model_feature, our_model_feature in zip(
|
for original_model_feature, our_model_feature in zip(
|
||||||
original_pixel_decoder_features, our_model_output.pixel_decoder_hidden_states
|
original_pixel_decoder_features, our_model_output.pixel_decoder_hidden_states
|
||||||
):
|
):
|
||||||
assert torch.allclose(
|
assert torch.allclose(original_model_feature, our_model_feature, atol=3e-4), (
|
||||||
original_model_feature, our_model_feature, atol=3e-4
|
"The pixel decoder feature are not the same"
|
||||||
), "The pixel decoder feature are not the same"
|
)
|
||||||
|
|
||||||
tr_complete = T.Compose(
|
tr_complete = T.Compose(
|
||||||
[
|
[
|
||||||
@ -1049,9 +1049,9 @@ def test(
|
|||||||
|
|
||||||
our_segmentation = post_process_sem_seg_output(our_model_out, target_size=(640, 640))[0]
|
our_segmentation = post_process_sem_seg_output(our_model_out, target_size=(640, 640))[0]
|
||||||
|
|
||||||
assert torch.allclose(
|
assert torch.allclose(original_segmentation, our_segmentation, atol=1e-3), (
|
||||||
original_segmentation, our_segmentation, atol=1e-3
|
"The segmentation image is not the same."
|
||||||
), "The segmentation image is not the same."
|
)
|
||||||
|
|
||||||
logger.info("✅ Test passed!")
|
logger.info("✅ Test passed!")
|
||||||
|
|
||||||
|
@ -62,9 +62,9 @@ class TFAttention(keras.layers.Layer):
|
|||||||
|
|
||||||
n_state = nx # in Attention: n_state=768 (nx=n_embd)
|
n_state = nx # in Attention: n_state=768 (nx=n_embd)
|
||||||
# [switch nx => n_state from Block to Attention to keep identical to TF implementation]
|
# [switch nx => n_state from Block to Attention to keep identical to TF implementation]
|
||||||
assert (
|
assert n_state % config.n_head == 0, (
|
||||||
n_state % config.n_head == 0
|
f"Hidden dimension {n_state} not dividable by number of heads {config.n_head}"
|
||||||
), f"Hidden dimension {n_state} not dividable by number of heads {config.n_head}"
|
)
|
||||||
self.n_head = config.n_head
|
self.n_head = config.n_head
|
||||||
self.split_size = n_state
|
self.split_size = n_state
|
||||||
self.scale = scale
|
self.scale = scale
|
||||||
|
@ -173,7 +173,7 @@ def _preprocess_resize_output_shape(image, output_shape):
|
|||||||
# multichannel case: append shape of last axis
|
# multichannel case: append shape of last axis
|
||||||
output_shape = output_shape + (image.shape[-1],)
|
output_shape = output_shape + (image.shape[-1],)
|
||||||
elif output_ndim < image.ndim:
|
elif output_ndim < image.ndim:
|
||||||
raise ValueError("output_shape length cannot be smaller than the " "image number of dimensions")
|
raise ValueError("output_shape length cannot be smaller than the image number of dimensions")
|
||||||
|
|
||||||
return image, output_shape
|
return image, output_shape
|
||||||
|
|
||||||
@ -345,10 +345,10 @@ class Owlv2ImageProcessor(BaseImageProcessor):
|
|||||||
else:
|
else:
|
||||||
anti_aliasing_sigma = np.atleast_1d(anti_aliasing_sigma) * np.ones_like(factors)
|
anti_aliasing_sigma = np.atleast_1d(anti_aliasing_sigma) * np.ones_like(factors)
|
||||||
if np.any(anti_aliasing_sigma < 0):
|
if np.any(anti_aliasing_sigma < 0):
|
||||||
raise ValueError("Anti-aliasing standard deviation must be " "greater than or equal to zero")
|
raise ValueError("Anti-aliasing standard deviation must be greater than or equal to zero")
|
||||||
elif np.any((anti_aliasing_sigma > 0) & (factors <= 1)):
|
elif np.any((anti_aliasing_sigma > 0) & (factors <= 1)):
|
||||||
warnings.warn(
|
warnings.warn(
|
||||||
"Anti-aliasing standard deviation greater than zero but " "not down-sampling along all axes"
|
"Anti-aliasing standard deviation greater than zero but not down-sampling along all axes"
|
||||||
)
|
)
|
||||||
filtered = ndi.gaussian_filter(image, anti_aliasing_sigma, cval=cval, mode=ndi_mode)
|
filtered = ndi.gaussian_filter(image, anti_aliasing_sigma, cval=cval, mode=ndi_mode)
|
||||||
else:
|
else:
|
||||||
|
@ -118,9 +118,9 @@ def convert_prophetnet_checkpoint_to_pytorch(prophetnet_checkpoint_path: str, py
|
|||||||
is_key_init = True
|
is_key_init = True
|
||||||
break
|
break
|
||||||
elif attribute == "position_embeddings":
|
elif attribute == "position_embeddings":
|
||||||
assert (
|
assert model.position_embeddings.weight.shape[-1] == old_model.embed_positions.weight.shape[-1], (
|
||||||
model.position_embeddings.weight.shape[-1] == old_model.embed_positions.weight.shape[-1]
|
"Hidden size has to match"
|
||||||
), "Hidden size has to match"
|
)
|
||||||
assert model.position_embeddings.weight.shape[0] == 512, "We want 512 position_embeddings."
|
assert model.position_embeddings.weight.shape[0] == 512, "We want 512 position_embeddings."
|
||||||
model.position_embeddings.weight = nn.Parameter(old_model.embed_positions.weight[:512, :])
|
model.position_embeddings.weight = nn.Parameter(old_model.embed_positions.weight[:512, :])
|
||||||
is_key_init = True
|
is_key_init = True
|
||||||
|
@ -588,9 +588,9 @@ class ProphetNetPositionalEmbeddings(nn.Embedding):
|
|||||||
super().__init__(config.max_position_embeddings, config.hidden_size, config.pad_token_id)
|
super().__init__(config.max_position_embeddings, config.hidden_size, config.pad_token_id)
|
||||||
|
|
||||||
def forward(self, inputs_shape, device, attention_mask=None, past_key_values=None, position_ids=None):
|
def forward(self, inputs_shape, device, attention_mask=None, past_key_values=None, position_ids=None):
|
||||||
assert (position_ids is None) or (
|
assert (position_ids is None) or (self.padding_idx is None), (
|
||||||
self.padding_idx is None
|
"If position_ids is pre-computed then padding_idx should not be set."
|
||||||
), "If position_ids is pre-computed then padding_idx should not be set."
|
)
|
||||||
|
|
||||||
if position_ids is None:
|
if position_ids is None:
|
||||||
if past_key_values is not None:
|
if past_key_values is not None:
|
||||||
@ -784,9 +784,9 @@ class ProphetNetNgramSelfAttention(nn.Module):
|
|||||||
self.head_dim = config.hidden_size // self.num_attn_heads
|
self.head_dim = config.hidden_size // self.num_attn_heads
|
||||||
self.ngram = config.ngram
|
self.ngram = config.ngram
|
||||||
|
|
||||||
assert (
|
assert self.head_dim * self.num_attn_heads == config.hidden_size, (
|
||||||
self.head_dim * self.num_attn_heads == config.hidden_size
|
"config.hidden_size must be divisible by num_attn_heads"
|
||||||
), "config.hidden_size must be divisible by num_attn_heads"
|
)
|
||||||
# key, value, query projection
|
# key, value, query projection
|
||||||
self.key_proj = nn.Linear(config.hidden_size, config.hidden_size)
|
self.key_proj = nn.Linear(config.hidden_size, config.hidden_size)
|
||||||
self.value_proj = nn.Linear(config.hidden_size, config.hidden_size)
|
self.value_proj = nn.Linear(config.hidden_size, config.hidden_size)
|
||||||
@ -1041,9 +1041,9 @@ class ProphetNetNgramSelfAttention(nn.Module):
|
|||||||
|
|
||||||
if predict_relative_position_buckets is None:
|
if predict_relative_position_buckets is None:
|
||||||
key_sequence_length = attn_weights.shape[-1]
|
key_sequence_length = attn_weights.shape[-1]
|
||||||
assert (
|
assert position_ids[0][0] == key_sequence_length - 1, (
|
||||||
position_ids[0][0] == key_sequence_length - 1
|
"`position_ids` are incorrect. They should be of the format 1 2 3 4 5 ... (key_sequence_length - 1)"
|
||||||
), "`position_ids` are incorrect. They should be of the format 1 2 3 4 5 ... (key_sequence_length - 1)"
|
)
|
||||||
relative_positions = (
|
relative_positions = (
|
||||||
torch.arange(0, key_sequence_length)
|
torch.arange(0, key_sequence_length)
|
||||||
.unsqueeze(0)
|
.unsqueeze(0)
|
||||||
@ -1313,9 +1313,9 @@ class ProphetNetEncoder(ProphetNetPreTrainedModel):
|
|||||||
|
|
||||||
# check if head_mask has a correct number of layers specified if desired
|
# check if head_mask has a correct number of layers specified if desired
|
||||||
if head_mask is not None:
|
if head_mask is not None:
|
||||||
assert head_mask.size()[0] == (
|
assert head_mask.size()[0] == (len(self.layers)), (
|
||||||
len(self.layers)
|
f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
|
||||||
), f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
|
)
|
||||||
for idx, encoder_layer in enumerate(self.layers):
|
for idx, encoder_layer in enumerate(self.layers):
|
||||||
if output_hidden_states:
|
if output_hidden_states:
|
||||||
encoder_hidden_states = encoder_hidden_states + (hidden_states,)
|
encoder_hidden_states = encoder_hidden_states + (hidden_states,)
|
||||||
@ -1488,9 +1488,9 @@ class ProphetNetDecoder(ProphetNetPreTrainedModel):
|
|||||||
|
|
||||||
# prepare attention mask
|
# prepare attention mask
|
||||||
if past_key_values is not None:
|
if past_key_values is not None:
|
||||||
assert (
|
assert hidden_states.size(1) == 1, (
|
||||||
hidden_states.size(1) == 1
|
"At the moment `use_cache` is only supported for `decoder_input_ids` of length 1"
|
||||||
), "At the moment `use_cache` is only supported for `decoder_input_ids` of length 1"
|
)
|
||||||
|
|
||||||
ngram_hidden_states = [
|
ngram_hidden_states = [
|
||||||
(ngram_embeddings[ngram - 1] + predicting_stream_pos_embed).repeat(batch_size, 1, 1)
|
(ngram_embeddings[ngram - 1] + predicting_stream_pos_embed).repeat(batch_size, 1, 1)
|
||||||
|
@ -162,7 +162,7 @@ def convert_pvt_checkpoint(pvt_size, pvt_checkpoint, pytorch_dump_folder_path):
|
|||||||
elif pvt_size == "large":
|
elif pvt_size == "large":
|
||||||
config_path = "Zetatech/pvt-large-224"
|
config_path = "Zetatech/pvt-large-224"
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"Available model's size: 'tiny', 'small', 'medium', 'large', but " f"'{pvt_size}' was given")
|
raise ValueError(f"Available model's size: 'tiny', 'small', 'medium', 'large', but '{pvt_size}' was given")
|
||||||
config = PvtConfig(name_or_path=config_path)
|
config = PvtConfig(name_or_path=config_path)
|
||||||
# load original model from https://github.com/whai362/PVT
|
# load original model from https://github.com/whai362/PVT
|
||||||
state_dict = torch.load(pvt_checkpoint, map_location="cpu")
|
state_dict = torch.load(pvt_checkpoint, map_location="cpu")
|
||||||
@ -192,7 +192,7 @@ def convert_pvt_checkpoint(pvt_size, pvt_checkpoint, pytorch_dump_folder_path):
|
|||||||
elif pvt_size == "large":
|
elif pvt_size == "large":
|
||||||
expected_slice_logits = torch.tensor([0.3740, -0.7739, -0.4214])
|
expected_slice_logits = torch.tensor([0.3740, -0.7739, -0.4214])
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"Available model's size: 'tiny', 'small', 'medium', 'large', but " f"'{pvt_size}' was given")
|
raise ValueError(f"Available model's size: 'tiny', 'small', 'medium', 'large', but '{pvt_size}' was given")
|
||||||
|
|
||||||
assert torch.allclose(logits[0, :3], expected_slice_logits, atol=1e-4)
|
assert torch.allclose(logits[0, :3], expected_slice_logits, atol=1e-4)
|
||||||
|
|
||||||
|
@ -203,8 +203,7 @@ def convert_pvt_v2_checkpoint(pvt_v2_size, pvt_v2_checkpoint, pytorch_dump_folde
|
|||||||
config_path = "OpenGVLab/pvt_v2_b5"
|
config_path = "OpenGVLab/pvt_v2_b5"
|
||||||
else:
|
else:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"Available model sizes: 'b0', 'b1', 'b2', 'b2-linear', 'b3', 'b4', 'b5', but "
|
f"Available model sizes: 'b0', 'b1', 'b2', 'b2-linear', 'b3', 'b4', 'b5', but '{pvt_v2_size}' was given"
|
||||||
f"'{pvt_v2_size}' was given"
|
|
||||||
)
|
)
|
||||||
config = PvtV2Config.from_pretrained(config_path)
|
config = PvtV2Config.from_pretrained(config_path)
|
||||||
# load original model from https://github.com/whai362/PVT
|
# load original model from https://github.com/whai362/PVT
|
||||||
@ -248,9 +247,9 @@ def convert_pvt_v2_checkpoint(pvt_v2_size, pvt_v2_checkpoint, pytorch_dump_folde
|
|||||||
f"'{pvt_v2_size}' was given"
|
f"'{pvt_v2_size}' was given"
|
||||||
)
|
)
|
||||||
|
|
||||||
assert torch.allclose(
|
assert torch.allclose(logits[0, :3], expected_slice_logits, atol=1e-4), (
|
||||||
logits[0, :3], expected_slice_logits, atol=1e-4
|
"ImageNet weights not converted successfully."
|
||||||
), "ImageNet weights not converted successfully."
|
)
|
||||||
|
|
||||||
print("ImageNet weights verified, conversion successful.")
|
print("ImageNet weights verified, conversion successful.")
|
||||||
|
|
||||||
|
@ -623,9 +623,9 @@ class Qwen2AudioEncoder(Qwen2AudioPreTrainedModel):
|
|||||||
|
|
||||||
# check if head_mask has a correct number of layers specified if desired
|
# check if head_mask has a correct number of layers specified if desired
|
||||||
if head_mask is not None:
|
if head_mask is not None:
|
||||||
assert head_mask.size()[0] == (
|
assert head_mask.size()[0] == (len(self.layers)), (
|
||||||
len(self.layers)
|
f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
|
||||||
), f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
|
)
|
||||||
|
|
||||||
for idx, encoder_layer in enumerate(self.layers):
|
for idx, encoder_layer in enumerate(self.layers):
|
||||||
if output_hidden_states:
|
if output_hidden_states:
|
||||||
|
@ -494,9 +494,9 @@ class RagModel(RagPreTrainedModel):
|
|||||||
retriever: Optional[RagRetriever] = None, # or maybe just use a `set_retriever(...)` method
|
retriever: Optional[RagRetriever] = None, # or maybe just use a `set_retriever(...)` method
|
||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
assert config is not None or (
|
assert config is not None or (question_encoder is not None and generator is not None), (
|
||||||
question_encoder is not None and generator is not None
|
"Either a configuration or an question_encoder and a generator has to be provided."
|
||||||
), "Either a configuration or an question_encoder and a generator has to be provided."
|
)
|
||||||
|
|
||||||
if config is None:
|
if config is None:
|
||||||
config = RagConfig.from_question_encoder_generator_configs(
|
config = RagConfig.from_question_encoder_generator_configs(
|
||||||
@ -517,9 +517,9 @@ class RagModel(RagPreTrainedModel):
|
|||||||
|
|
||||||
self.retriever = retriever
|
self.retriever = retriever
|
||||||
if self.retriever is not None:
|
if self.retriever is not None:
|
||||||
assert isinstance(
|
assert isinstance(retriever, RagRetriever), (
|
||||||
retriever, RagRetriever
|
f"`self.retriever` is of type {type(self.retriever)}, but should be of type `RagRetriever`"
|
||||||
), f"`self.retriever` is of type {type(self.retriever)}, but should be of type `RagRetriever`"
|
)
|
||||||
self.retriever = retriever
|
self.retriever = retriever
|
||||||
|
|
||||||
self.question_encoder = question_encoder
|
self.question_encoder = question_encoder
|
||||||
@ -660,9 +660,9 @@ class RagModel(RagPreTrainedModel):
|
|||||||
" retriever using the `set_retriever(...)` function."
|
" retriever using the `set_retriever(...)` function."
|
||||||
)
|
)
|
||||||
|
|
||||||
assert (
|
assert doc_scores is not None, (
|
||||||
doc_scores is not None
|
"Make sure that `doc_scores` are passed when passing `encoder_outputs` to the forward function."
|
||||||
), "Make sure that `doc_scores` are passed when passing `encoder_outputs` to the forward function."
|
)
|
||||||
|
|
||||||
assert (doc_scores.shape[1] % n_docs) == 0, (
|
assert (doc_scores.shape[1] % n_docs) == 0, (
|
||||||
f" The first dimension of `context_input_ids` should be a multiple of `n_docs`={n_docs}, but is"
|
f" The first dimension of `context_input_ids` should be a multiple of `n_docs`={n_docs}, but is"
|
||||||
@ -740,9 +740,9 @@ class RagSequenceForGeneration(RagPreTrainedModel):
|
|||||||
retriever: Optional[RagRetriever] = None,
|
retriever: Optional[RagRetriever] = None,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
assert config is not None or (
|
assert config is not None or (question_encoder is not None and generator is not None), (
|
||||||
question_encoder is not None and generator is not None
|
"Either a configuration or an encoder and a generator has to be provided."
|
||||||
), "Either a configuration or an encoder and a generator has to be provided."
|
)
|
||||||
|
|
||||||
if config is None:
|
if config is None:
|
||||||
config = RagConfig.from_question_encoder_generator_configs(
|
config = RagConfig.from_question_encoder_generator_configs(
|
||||||
@ -973,9 +973,9 @@ class RagSequenceForGeneration(RagPreTrainedModel):
|
|||||||
)
|
)
|
||||||
num_beams = num_beams if num_beams is not None else self.config.num_beams
|
num_beams = num_beams if num_beams is not None else self.config.num_beams
|
||||||
|
|
||||||
assert (
|
assert input_ids is not None or context_input_ids is not None, (
|
||||||
input_ids is not None or context_input_ids is not None
|
" At least one of input_ids or context_input_ids must be given"
|
||||||
), " At least one of input_ids or context_input_ids must be given"
|
)
|
||||||
|
|
||||||
if self.retriever is not None and context_input_ids is None:
|
if self.retriever is not None and context_input_ids is None:
|
||||||
question_hidden_states = self.question_encoder(input_ids, attention_mask=attention_mask)[0]
|
question_hidden_states = self.question_encoder(input_ids, attention_mask=attention_mask)[0]
|
||||||
@ -1138,9 +1138,9 @@ class RagTokenForGeneration(RagPreTrainedModel):
|
|||||||
retriever: Optional[RagRetriever] = None,
|
retriever: Optional[RagRetriever] = None,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
assert config is not None or (
|
assert config is not None or (question_encoder is not None and generator is not None), (
|
||||||
question_encoder is not None and generator is not None
|
"Either a configuration or an encoder and a generator has to be provided."
|
||||||
), "Either a configuration or an encoder and a generator has to be provided."
|
)
|
||||||
|
|
||||||
if config is None:
|
if config is None:
|
||||||
config = RagConfig.from_question_encoder_generator_configs(
|
config = RagConfig.from_question_encoder_generator_configs(
|
||||||
|
@ -506,9 +506,9 @@ class TFRagModel(TFRagPreTrainedModel):
|
|||||||
load_weight_prefix: Optional[str] = None,
|
load_weight_prefix: Optional[str] = None,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
assert config is not None or (
|
assert config is not None or (question_encoder is not None and generator is not None), (
|
||||||
question_encoder is not None and generator is not None
|
"Either a configuration or an question_encoder and a generator has to be provided."
|
||||||
), "Either a configuration or an question_encoder and a generator has to be provided."
|
)
|
||||||
|
|
||||||
if config is None:
|
if config is None:
|
||||||
config = RagConfig.from_question_encoder_generator_configs(
|
config = RagConfig.from_question_encoder_generator_configs(
|
||||||
@ -533,9 +533,9 @@ class TFRagModel(TFRagPreTrainedModel):
|
|||||||
|
|
||||||
self.retriever = retriever
|
self.retriever = retriever
|
||||||
if self.retriever is not None:
|
if self.retriever is not None:
|
||||||
assert isinstance(
|
assert isinstance(retriever, RagRetriever), (
|
||||||
retriever, RagRetriever
|
f"`self.retriever` is of type {type(self.retriever)}, but should be of type `RagRetriever`"
|
||||||
), f"`self.retriever` is of type {type(self.retriever)}, but should be of type `RagRetriever`"
|
)
|
||||||
self.retriever = retriever
|
self.retriever = retriever
|
||||||
|
|
||||||
self.question_encoder = question_encoder
|
self.question_encoder = question_encoder
|
||||||
@ -589,9 +589,9 @@ class TFRagModel(TFRagPreTrainedModel):
|
|||||||
>>> input_ids = input_dict["input_ids"]
|
>>> input_ids = input_dict["input_ids"]
|
||||||
>>> outputs = model(input_ids)
|
>>> outputs = model(input_ids)
|
||||||
```"""
|
```"""
|
||||||
assert (
|
assert "decoder_cached_states" not in kwargs, (
|
||||||
"decoder_cached_states" not in kwargs
|
"Please use past_key_values to cache intermediate outputs"
|
||||||
), "Please use past_key_values to cache intermediate outputs" # from modeling_tf_bart.py
|
) # from modeling_tf_bart.py
|
||||||
|
|
||||||
# aliasing to minimize code changing
|
# aliasing to minimize code changing
|
||||||
n_docs = n_docs if n_docs is not None else self.config.n_docs
|
n_docs = n_docs if n_docs is not None else self.config.n_docs
|
||||||
@ -657,9 +657,9 @@ class TFRagModel(TFRagPreTrainedModel):
|
|||||||
" retriever using the `set_retriever(...)` function."
|
" retriever using the `set_retriever(...)` function."
|
||||||
)
|
)
|
||||||
|
|
||||||
assert (
|
assert doc_scores is not None, (
|
||||||
doc_scores is not None
|
"Make sure that `doc_scores` are passed when passing `encoder_outputs` to the forward function."
|
||||||
), "Make sure that `doc_scores` are passed when passing `encoder_outputs` to the forward function."
|
)
|
||||||
|
|
||||||
assert (doc_scores.shape[1] % n_docs) == 0, (
|
assert (doc_scores.shape[1] % n_docs) == 0, (
|
||||||
f" The first dimension of `context_input_ids` should be a multiple of `n_docs`={n_docs}, but is"
|
f" The first dimension of `context_input_ids` should be a multiple of `n_docs`={n_docs}, but is"
|
||||||
@ -747,9 +747,9 @@ class TFRagTokenForGeneration(TFRagPreTrainedModel, TFCausalLanguageModelingLoss
|
|||||||
retriever: Optional[RagRetriever] = None,
|
retriever: Optional[RagRetriever] = None,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
assert config is not None or (
|
assert config is not None or (question_encoder is not None and generator is not None), (
|
||||||
question_encoder is not None and generator is not None
|
"Either a configuration or an encoder and a generator has to be provided."
|
||||||
), "Either a configuration or an encoder and a generator has to be provided."
|
)
|
||||||
|
|
||||||
if config is None:
|
if config is None:
|
||||||
config = RagConfig.from_question_encoder_generator_configs(
|
config = RagConfig.from_question_encoder_generator_configs(
|
||||||
@ -939,9 +939,9 @@ class TFRagTokenForGeneration(TFRagPreTrainedModel, TFCausalLanguageModelingLoss
|
|||||||
>>> generated_string = tokenizer.batch_decode(generated, skip_special_tokens=True)
|
>>> generated_string = tokenizer.batch_decode(generated, skip_special_tokens=True)
|
||||||
```"""
|
```"""
|
||||||
|
|
||||||
assert (
|
assert "decoder_cached_states" not in kwargs, (
|
||||||
"decoder_cached_states" not in kwargs
|
"Please use past_key_values to cache intermediate outputs"
|
||||||
), "Please use past_key_values to cache intermediate outputs" # from modeling_tf_bart.py
|
) # from modeling_tf_bart.py
|
||||||
|
|
||||||
do_marginalize = do_marginalize if do_marginalize else self.config.do_marginalize
|
do_marginalize = do_marginalize if do_marginalize else self.config.do_marginalize
|
||||||
reduce_loss = reduce_loss if reduce_loss else self.config.reduce_loss
|
reduce_loss = reduce_loss if reduce_loss else self.config.reduce_loss
|
||||||
@ -1327,9 +1327,9 @@ class TFRagSequenceForGeneration(TFRagPreTrainedModel, TFCausalLanguageModelingL
|
|||||||
retriever: Optional[RagRetriever] = None,
|
retriever: Optional[RagRetriever] = None,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
assert config is not None or (
|
assert config is not None or (question_encoder is not None and generator is not None), (
|
||||||
question_encoder is not None and generator is not None
|
"Either a configuration or an encoder and a generator has to be provided."
|
||||||
), "Either a configuration or an encoder and a generator has to be provided."
|
)
|
||||||
|
|
||||||
if config is None:
|
if config is None:
|
||||||
config = RagConfig.from_question_encoder_generator_configs(
|
config = RagConfig.from_question_encoder_generator_configs(
|
||||||
@ -1454,9 +1454,9 @@ class TFRagSequenceForGeneration(TFRagPreTrainedModel, TFCausalLanguageModelingL
|
|||||||
>>> generated_string = tokenizer.batch_decode(generated, skip_special_tokens=True)
|
>>> generated_string = tokenizer.batch_decode(generated, skip_special_tokens=True)
|
||||||
```"""
|
```"""
|
||||||
|
|
||||||
assert (
|
assert "decoder_cached_states" not in kwargs, (
|
||||||
"decoder_cached_states" not in kwargs
|
"Please use past_key_values to cache intermediate outputs"
|
||||||
), "Please use past_key_values to cache intermediate outputs" # from modeling_tf_bart.py
|
) # from modeling_tf_bart.py
|
||||||
|
|
||||||
exclude_bos_score = exclude_bos_score if exclude_bos_score else self.config.exclude_bos_score
|
exclude_bos_score = exclude_bos_score if exclude_bos_score else self.config.exclude_bos_score
|
||||||
reduce_loss = reduce_loss if reduce_loss else self.config.reduce_loss
|
reduce_loss = reduce_loss if reduce_loss else self.config.reduce_loss
|
||||||
@ -1663,9 +1663,9 @@ class TFRagSequenceForGeneration(TFRagPreTrainedModel, TFCausalLanguageModelingL
|
|||||||
)
|
)
|
||||||
num_beams = num_beams if num_beams is not None else self.config.num_beams
|
num_beams = num_beams if num_beams is not None else self.config.num_beams
|
||||||
|
|
||||||
assert (
|
assert input_ids is not None or context_input_ids is not None, (
|
||||||
input_ids is not None or context_input_ids is not None
|
" At least one of input_ids or context_input_ids must be given"
|
||||||
), " At least one of input_ids or context_input_ids must be given"
|
)
|
||||||
|
|
||||||
if self.retriever is not None and context_input_ids is None:
|
if self.retriever is not None and context_input_ids is None:
|
||||||
question_hidden_states = self.question_encoder(input_ids, attention_mask=attention_mask)[0]
|
question_hidden_states = self.question_encoder(input_ids, attention_mask=attention_mask)[0]
|
||||||
|
@ -156,9 +156,9 @@ class LegacyIndex(Index):
|
|||||||
)
|
)
|
||||||
with open(resolved_meta_path, "rb") as metadata_file:
|
with open(resolved_meta_path, "rb") as metadata_file:
|
||||||
self.index_id_to_db_id = pickle.load(metadata_file)
|
self.index_id_to_db_id = pickle.load(metadata_file)
|
||||||
assert (
|
assert len(self.index_id_to_db_id) == self.index.ntotal, (
|
||||||
len(self.index_id_to_db_id) == self.index.ntotal
|
"Deserialized index_id_to_db_id should match faiss index size"
|
||||||
), "Deserialized index_id_to_db_id should match faiss index size"
|
)
|
||||||
|
|
||||||
def is_initialized(self):
|
def is_initialized(self):
|
||||||
return self._index_initialized
|
return self._index_initialized
|
||||||
|
@ -150,15 +150,15 @@ def set_model_weights_in_torch(weights, torch_model, hidden_size):
|
|||||||
position_embeddings = torch_model_reformer.embeddings.position_embeddings
|
position_embeddings = torch_model_reformer.embeddings.position_embeddings
|
||||||
for emb_idx in range(len(position_embeddings.weights)):
|
for emb_idx in range(len(position_embeddings.weights)):
|
||||||
emb_weights = np.asarray(weights[3][emb_idx][0])
|
emb_weights = np.asarray(weights[3][emb_idx][0])
|
||||||
assert (
|
assert position_embeddings.weights[emb_idx].shape == emb_weights.shape, (
|
||||||
position_embeddings.weights[emb_idx].shape == emb_weights.shape
|
f"{position_embeddings[emb_idx]} emb does not match"
|
||||||
), f"{position_embeddings[emb_idx]} emb does not match"
|
)
|
||||||
position_embeddings.weights[emb_idx] = nn.Parameter(torch.tensor(emb_weights))
|
position_embeddings.weights[emb_idx] = nn.Parameter(torch.tensor(emb_weights))
|
||||||
|
|
||||||
trax_layer_weights = weights[5]
|
trax_layer_weights = weights[5]
|
||||||
assert len(torch_model_reformer.encoder.layers) * 4 == len(
|
assert len(torch_model_reformer.encoder.layers) * 4 == len(trax_layer_weights), (
|
||||||
trax_layer_weights
|
"HF and trax model do not have the same number of layers"
|
||||||
), "HF and trax model do not have the same number of layers"
|
)
|
||||||
for layer_idx, layer in enumerate(torch_model_reformer.encoder.layers):
|
for layer_idx, layer in enumerate(torch_model_reformer.encoder.layers):
|
||||||
block_weights = trax_layer_weights[4 * layer_idx : 4 * (layer_idx + 1)]
|
block_weights = trax_layer_weights[4 * layer_idx : 4 * (layer_idx + 1)]
|
||||||
set_block_weights_in_torch(block_weights, layer, hidden_size)
|
set_block_weights_in_torch(block_weights, layer, hidden_size)
|
||||||
|
@ -446,12 +446,12 @@ class LSHSelfAttention(nn.Module, EfficientAttentionMixin):
|
|||||||
# free memory
|
# free memory
|
||||||
del hidden_states
|
del hidden_states
|
||||||
|
|
||||||
assert (
|
assert query_key_vectors.shape[-1] == self.attention_head_size, (
|
||||||
query_key_vectors.shape[-1] == self.attention_head_size
|
f"last dim of query_key_vectors is {query_key_vectors.shape[-1]} but should be {self.attention_head_size}."
|
||||||
), f"last dim of query_key_vectors is {query_key_vectors.shape[-1]} but should be {self.attention_head_size}."
|
)
|
||||||
assert (
|
assert value_vectors.shape[-1] == self.attention_head_size, (
|
||||||
value_vectors.shape[-1] == self.attention_head_size
|
f"last dim of value_vectors is {value_vectors.shape[-1]} but should be {self.attention_head_size}."
|
||||||
), f"last dim of value_vectors is {value_vectors.shape[-1]} but should be {self.attention_head_size}."
|
)
|
||||||
|
|
||||||
do_standard_self_attention = (sequence_length <= self.chunk_length) or (
|
do_standard_self_attention = (sequence_length <= self.chunk_length) or (
|
||||||
use_cache and past_buckets_states[1] is not None
|
use_cache and past_buckets_states[1] is not None
|
||||||
@ -470,9 +470,9 @@ class LSHSelfAttention(nn.Module, EfficientAttentionMixin):
|
|||||||
# make sure buckets has correct shape for LSH attention
|
# make sure buckets has correct shape for LSH attention
|
||||||
buckets = buckets.view(batch_size, self.num_attention_heads, num_hashes * sequence_length)
|
buckets = buckets.view(batch_size, self.num_attention_heads, num_hashes * sequence_length)
|
||||||
|
|
||||||
assert (
|
assert int(buckets.shape[-1]) == num_hashes * sequence_length, (
|
||||||
int(buckets.shape[-1]) == num_hashes * sequence_length
|
f"last dim of buckets is {buckets.shape[-1]}, but should be {num_hashes * sequence_length}"
|
||||||
), f"last dim of buckets is {buckets.shape[-1]}, but should be {num_hashes * sequence_length}"
|
)
|
||||||
|
|
||||||
sorted_bucket_idx, undo_sorted_bucket_idx = self._get_sorted_bucket_idx_and_undo_sorted_bucket_idx(
|
sorted_bucket_idx, undo_sorted_bucket_idx = self._get_sorted_bucket_idx_and_undo_sorted_bucket_idx(
|
||||||
sequence_length, buckets, num_hashes
|
sequence_length, buckets, num_hashes
|
||||||
@ -612,18 +612,18 @@ class LSHSelfAttention(nn.Module, EfficientAttentionMixin):
|
|||||||
# We sample a different random rotation for each round of hashing to
|
# We sample a different random rotation for each round of hashing to
|
||||||
# decrease the probability of hash misses.
|
# decrease the probability of hash misses.
|
||||||
if isinstance(self.num_buckets, int):
|
if isinstance(self.num_buckets, int):
|
||||||
assert (
|
assert self.num_buckets % 2 == 0, (
|
||||||
self.num_buckets % 2 == 0
|
f"There should be an even number of buckets, but `self.num_buckets`: {self.num_buckets}"
|
||||||
), f"There should be an even number of buckets, but `self.num_buckets`: {self.num_buckets}"
|
)
|
||||||
rotation_size = self.num_buckets
|
rotation_size = self.num_buckets
|
||||||
num_buckets = self.num_buckets
|
num_buckets = self.num_buckets
|
||||||
else:
|
else:
|
||||||
# Factorize the hash if self.num_buckets is a list or tuple
|
# Factorize the hash if self.num_buckets is a list or tuple
|
||||||
rotation_size, num_buckets = 0, 1
|
rotation_size, num_buckets = 0, 1
|
||||||
for bucket_factor in self.num_buckets:
|
for bucket_factor in self.num_buckets:
|
||||||
assert (
|
assert bucket_factor % 2 == 0, (
|
||||||
bucket_factor % 2 == 0
|
f"The number of buckets should be even, but `num_bucket`: {bucket_factor}"
|
||||||
), f"The number of buckets should be even, but `num_bucket`: {bucket_factor}"
|
)
|
||||||
rotation_size = rotation_size + bucket_factor
|
rotation_size = rotation_size + bucket_factor
|
||||||
num_buckets = num_buckets * bucket_factor
|
num_buckets = num_buckets * bucket_factor
|
||||||
|
|
||||||
@ -1090,15 +1090,15 @@ class LocalSelfAttention(nn.Module, EfficientAttentionMixin):
|
|||||||
key_vectors = self._split_hidden_size_dim(key_vectors, self.num_attention_heads, self.attention_head_size)
|
key_vectors = self._split_hidden_size_dim(key_vectors, self.num_attention_heads, self.attention_head_size)
|
||||||
value_vectors = self._split_hidden_size_dim(value_vectors, self.num_attention_heads, self.attention_head_size)
|
value_vectors = self._split_hidden_size_dim(value_vectors, self.num_attention_heads, self.attention_head_size)
|
||||||
|
|
||||||
assert (
|
assert query_vectors.shape[-1] == self.attention_head_size, (
|
||||||
query_vectors.shape[-1] == self.attention_head_size
|
f"last dim of query_key_vectors is {query_vectors.shape[-1]} but should be {self.attention_head_size}."
|
||||||
), f"last dim of query_key_vectors is {query_vectors.shape[-1]} but should be {self.attention_head_size}."
|
)
|
||||||
assert (
|
assert key_vectors.shape[-1] == self.attention_head_size, (
|
||||||
key_vectors.shape[-1] == self.attention_head_size
|
f"last dim of query_key_vectors is {key_vectors.shape[-1]} but should be {self.attention_head_size}."
|
||||||
), f"last dim of query_key_vectors is {key_vectors.shape[-1]} but should be {self.attention_head_size}."
|
)
|
||||||
assert (
|
assert value_vectors.shape[-1] == self.attention_head_size, (
|
||||||
value_vectors.shape[-1] == self.attention_head_size
|
f"last dim of query_key_vectors is {value_vectors.shape[-1]} but should be {self.attention_head_size}."
|
||||||
), f"last dim of query_key_vectors is {value_vectors.shape[-1]} but should be {self.attention_head_size}."
|
)
|
||||||
|
|
||||||
if self.chunk_length is None:
|
if self.chunk_length is None:
|
||||||
assert self.num_chunks_before == 0 and self.num_chunks_after == 0, (
|
assert self.num_chunks_before == 0 and self.num_chunks_after == 0, (
|
||||||
@@ -1976,9 +1976,9 @@ class ReformerModel(ReformerPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.config = config
-assert (
-    self.config.num_hidden_layers > 0
-), "`config.attn_layers` is empty. Select at least one attn layer form ['lsh', 'local']"
+assert self.config.num_hidden_layers > 0, (
+    "`config.attn_layers` is empty. Select at least one attn layer form ['lsh', 'local']"
+)

self.embeddings = ReformerEmbeddings(config)
self.encoder = ReformerEncoder(config)

@@ -2039,9 +2039,9 @@ class ReformerModel(ReformerPreTrainedModel):
else:
raise ValueError("You have to specify either input_ids or inputs_embeds")

-assert (
-    len(input_shape) == 2
-), f"`input_ids` have be of shape `[batch_size, sequence_length]`, but got shape: {input_shape}"
+assert len(input_shape) == 2, (
+    f"`input_ids` have be of shape `[batch_size, sequence_length]`, but got shape: {input_shape}"
+)

if past_buckets_states is not None:
assert not self.training, "`past_buckets_states` can only be used for inference, not for training`."
@@ -2869,7 +2869,7 @@ class SeamlessM4TForTextToText(SeamlessM4TPreTrainedModel, GenerationMixin):
if tgt_lang not in self.generation_config.text_decoder_lang_to_code_id:
raise ValueError(
f"""`tgt_lang={tgt_lang}` is not supported by this model. Please specify a `tgt_lang` in
-{', '.join(self.generation_config.text_decoder_lang_to_code_id.keys())}"""
+{", ".join(self.generation_config.text_decoder_lang_to_code_id.keys())}"""
)
# tgt_lang gets priority over decoder input ids
text_tgt_lang_id = self.generation_config.text_decoder_lang_to_code_id.get(tgt_lang)

@@ -3140,7 +3140,7 @@ class SeamlessM4TForSpeechToText(SeamlessM4TPreTrainedModel, GenerationMixin):
if tgt_lang not in self.generation_config.text_decoder_lang_to_code_id:
raise ValueError(
f"""`tgt_lang={tgt_lang}` is not supported by this model. Please specify a `tgt_lang` in
-{', '.join(self.generation_config.text_decoder_lang_to_code_id.keys())}"""
+{", ".join(self.generation_config.text_decoder_lang_to_code_id.keys())}"""
)
# tgt_lang gets priority over decoder input ids
text_tgt_lang_id = self.generation_config.text_decoder_lang_to_code_id.get(tgt_lang)

@@ -3407,7 +3407,7 @@ class SeamlessM4TForTextToSpeech(SeamlessM4TPreTrainedModel, GenerationMixin):
elif tgt_lang not in lang_code_to_id:
raise ValueError(
f"""`tgt_lang={tgt_lang}` is not supported by this model.
-Please specify a `tgt_lang` in {','.join(lang_code_to_id.keys())}. Note that SeamlessM4T supports
+Please specify a `tgt_lang` in {",".join(lang_code_to_id.keys())}. Note that SeamlessM4T supports
more languages for text translation than for speech synthesis."""
)

@@ -3736,7 +3736,7 @@ class SeamlessM4TForSpeechToSpeech(SeamlessM4TPreTrainedModel, GenerationMixin):
elif tgt_lang not in lang_code_to_id:
raise ValueError(
f"""`tgt_lang={tgt_lang}` is not supported by this model.
-Please specify a `tgt_lang` in {','.join(lang_code_to_id.keys())}. Note that SeamlessM4T supports
+Please specify a `tgt_lang` in {",".join(lang_code_to_id.keys())}. Note that SeamlessM4T supports
more languages for text translation than for speech synthesis."""
)

@@ -4151,7 +4151,7 @@ class SeamlessM4TModel(SeamlessM4TPreTrainedModel, GenerationMixin):
elif tgt_lang not in lang_code_to_id:
raise ValueError(
f"""`tgt_lang={tgt_lang}` is not supported by this model.
-Please specify a `tgt_lang` in {','.join(lang_code_to_id.keys())}. Note that SeamlessM4T supports
+Please specify a `tgt_lang` in {",".join(lang_code_to_id.keys())}. Note that SeamlessM4T supports
more languages for text translation than for speech synthesis."""
)
@@ -3149,7 +3149,7 @@ class SeamlessM4Tv2ForTextToText(SeamlessM4Tv2PreTrainedModel, GenerationMixin):
if tgt_lang not in self.generation_config.text_decoder_lang_to_code_id:
raise ValueError(
f"""`tgt_lang={tgt_lang}` is not supported by this model. Please specify a `tgt_lang` in
-{', '.join(self.generation_config.text_decoder_lang_to_code_id.keys())}"""
+{", ".join(self.generation_config.text_decoder_lang_to_code_id.keys())}"""
)
# tgt_lang gets priority over decoder input ids
text_tgt_lang_id = self.generation_config.text_decoder_lang_to_code_id.get(tgt_lang)

@@ -3430,7 +3430,7 @@ class SeamlessM4Tv2ForSpeechToText(SeamlessM4Tv2PreTrainedModel, GenerationMixin
if tgt_lang not in self.generation_config.text_decoder_lang_to_code_id:
raise ValueError(
f"""`tgt_lang={tgt_lang}` is not supported by this model. Please specify a `tgt_lang` in
-{', '.join(self.generation_config.text_decoder_lang_to_code_id.keys())}"""
+{", ".join(self.generation_config.text_decoder_lang_to_code_id.keys())}"""
)
# tgt_lang gets priority over decoder input ids
text_tgt_lang_id = self.generation_config.text_decoder_lang_to_code_id.get(tgt_lang)

@@ -3707,7 +3707,7 @@ class SeamlessM4Tv2ForTextToSpeech(SeamlessM4Tv2PreTrainedModel, GenerationMixin
elif tgt_lang not in lang_code_to_id:
raise ValueError(
f"""`tgt_lang={tgt_lang}` is not supported by this model.
-Please specify a `tgt_lang` in {','.join(lang_code_to_id.keys())}. Note that SeamlessM4Tv2 supports
+Please specify a `tgt_lang` in {",".join(lang_code_to_id.keys())}. Note that SeamlessM4Tv2 supports
more languages for text translation than for speech synthesis."""
)

@@ -4078,7 +4078,7 @@ class SeamlessM4Tv2ForSpeechToSpeech(SeamlessM4Tv2PreTrainedModel, GenerationMix
elif tgt_lang not in lang_code_to_id:
raise ValueError(
f"""`tgt_lang={tgt_lang}` is not supported by this model.
-Please specify a `tgt_lang` in {','.join(lang_code_to_id.keys())}. Note that SeamlessM4Tv2 supports
+Please specify a `tgt_lang` in {",".join(lang_code_to_id.keys())}. Note that SeamlessM4Tv2 supports
more languages for text translation than for speech synthesis."""
)

@@ -4539,7 +4539,7 @@ class SeamlessM4Tv2Model(SeamlessM4Tv2PreTrainedModel, GenerationMixin):
elif tgt_lang not in lang_code_to_id:
raise ValueError(
f"""`tgt_lang={tgt_lang}` is not supported by this model.
-Please specify a `tgt_lang` in {','.join(lang_code_to_id.keys())}. Note that SeamlessM4Tv2 supports
+Please specify a `tgt_lang` in {",".join(lang_code_to_id.keys())}. Note that SeamlessM4Tv2 supports
more languages for text translation than for speech synthesis."""
)
@@ -192,41 +192,41 @@ def load_adapter(full_name, value, adapter, unused_weights):
if "proj_ln" in full_name:
# has to be layer norm
if "bias" in name:
-assert (
-    value.shape == adapter.proj_layer_norm.bias.data.shape
-), f"{full_name} has size {value.shape}, but {adapter.proj_layer_norm.bias.data.shape} was found."
+assert value.shape == adapter.proj_layer_norm.bias.data.shape, (
+    f"{full_name} has size {value.shape}, but {adapter.proj_layer_norm.bias.data.shape} was found."
+)
adapter.proj_layer_norm.bias.data = value
logger.info(f"Adapter proj layer norm bias was initialized from {full_name}.")
if "weight" in name:
-assert (
-    value.shape == adapter.proj_layer_norm.weight.data.shape
-), f"{full_name} has size {value.shape}, but {adapter.proj_layer_norm.weight.data.shape} was found."
+assert value.shape == adapter.proj_layer_norm.weight.data.shape, (
+    f"{full_name} has size {value.shape}, but {adapter.proj_layer_norm.weight.data.shape} was found."
+)
adapter.proj_layer_norm.weight.data = value
else:
# has to be projection layer
if "bias" in name:
-assert (
-    value.shape == adapter.proj.bias.data.shape
-), f"{full_name} has size {value.shape}, but {adapter.proj.bias.data.shape} was found."
+assert value.shape == adapter.proj.bias.data.shape, (
+    f"{full_name} has size {value.shape}, but {adapter.proj.bias.data.shape} was found."
+)
adapter.proj.bias.data = value
logger.info(f"Adapter proj layer bias was initialized from {full_name}.")
if "weight" in name:
-assert (
-    value.shape == adapter.proj.weight.data.shape
-), f"{full_name} has size {value.shape}, but {adapter.proj.weight.data.shape} was found."
+assert value.shape == adapter.proj.weight.data.shape, (
+    f"{full_name} has size {value.shape}, but {adapter.proj.weight.data.shape} was found."
+)
adapter.proj.weight.data = value
logger.info(f"Adapter proj layer weight was initialized from {full_name}.")
elif isinstance(layer_id, int):
if "bias" in name:
-assert (
-    value.shape == adapter.layers[layer_id].conv.bias.data.shape
-), f"{full_name} has size {value.shape}, but {adapter.layers[layer_id].conv.bias.data.shape} was found."
+assert value.shape == adapter.layers[layer_id].conv.bias.data.shape, (
+    f"{full_name} has size {value.shape}, but {adapter.layers[layer_id].conv.bias.data.shape} was found."
+)
adapter.layers[layer_id].conv.bias.data = value
logger.info(f"Adapter layer {layer_id} bias was initialized from {full_name}.")
elif "weight" in name:
-assert (
-    value.shape == adapter.layers[layer_id].conv.weight.data.shape
-), f"{full_name} has size {value.shape}, but {adapter.layers[layer_id].conv.weight.data.shape} was found."
+assert value.shape == adapter.layers[layer_id].conv.weight.data.shape, (
+    f"{full_name} has size {value.shape}, but {adapter.layers[layer_id].conv.weight.data.shape} was found."
+)
adapter.layers[layer_id].conv.weight.data = value
logger.info(f"Adapter layer {layer_id} bias was initialized from {full_name}.")
else:
@@ -774,9 +774,9 @@ class Speech2TextEncoder(Speech2TextPreTrainedModel):

# check if head_mask has a correct number of layers specified if desired
if head_mask is not None:
-assert head_mask.size()[0] == (
-    len(self.layers)
-), f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
+assert head_mask.size()[0] == (len(self.layers)), (
+    f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
+)

for idx, encoder_layer in enumerate(self.layers):
if output_hidden_states:

@@ -224,9 +224,9 @@ def convert_swin2sr_checkpoint(checkpoint_url, pytorch_dump_folder_path, push_to
[[-0.5238, -0.5557, -0.6321], [-0.6016, -0.5903, -0.6391], [-0.6244, -0.6334, -0.6889]]
)

-assert (
-    outputs.reconstruction.shape == expected_shape
-), f"Shape of reconstruction should be {expected_shape}, but is {outputs.reconstruction.shape}"
+assert outputs.reconstruction.shape == expected_shape, (
+    f"Shape of reconstruction should be {expected_shape}, but is {outputs.reconstruction.shape}"
+)
assert torch.allclose(outputs.reconstruction[0, 0, :3, :3], expected_slice, atol=1e-3)
print("Looks ok!")

@@ -116,7 +116,9 @@ def shard_on_the_fly(switch_checkpoint_path, dump_path, max_shard_size, dtype, w
total_size += weight_size

# Add the last block
-save_path = os.path.join(dump_path, weights_name.replace(".bin", f"-{len(sharded_state_dicts)+1:05d}-of-???.bin"))
+save_path = os.path.join(
+    dump_path, weights_name.replace(".bin", f"-{len(sharded_state_dicts) + 1:05d}-of-???.bin")
+)
rename_and_save_block(current_block, save_path)
sharded_state_dicts.append(current_block.keys())
@@ -363,9 +363,9 @@ class TFT5Attention(keras.layers.Layer):
real_seq_length = seq_length

if past_key_value is not None:
-assert (
-    len(past_key_value) == 2
-), f"past_key_value should have 2 past states: keys and values. Got {len(past_key_value)} past states"
+assert len(past_key_value) == 2, (
+    f"past_key_value should have 2 past states: keys and values. Got {len(past_key_value)} past states"
+)
real_seq_length += shape_list(past_key_value[0])[2] if query_length is None else query_length

key_length = real_seq_length if key_value_states is None else shape_list(key_value_states)[1]

@@ -1284,9 +1284,9 @@ class TapasForQuestionAnswering(TapasPreTrainedModel):
aggregate_mask = None
else:
if float_answer is not None:
-assert (
-    labels.shape[0] == float_answer.shape[0]
-), "Make sure the answers are a FloatTensor of shape (batch_size,)"
+assert labels.shape[0] == float_answer.shape[0], (
+    "Make sure the answers are a FloatTensor of shape (batch_size,)"
+)
# <float32>[batch_size]
aggregate_mask = _calculate_aggregate_mask(
float_answer,

@@ -1336,9 +1336,9 @@ class TapasForQuestionAnswering(TapasPreTrainedModel):
if is_supervised:
# Note that `aggregate_mask` is None if the setting is supervised.
if aggregation_labels is not None:
-assert (
-    labels.shape[0] == aggregation_labels.shape[0]
-), "Make sure the aggregation labels are a LongTensor of shape (batch_size,)"
+assert labels.shape[0] == aggregation_labels.shape[0], (
+    "Make sure the aggregation labels are a LongTensor of shape (batch_size,)"
+)
per_example_additional_loss = _calculate_aggregation_loss(
logits_aggregation,
aggregate_mask,
@@ -1562,9 +1562,9 @@ class TFTapasForQuestionAnswering(TFTapasPreTrainedModel):
aggregate_mask = None
else:
if float_answer is not None:
-assert (
-    shape_list(labels)[0] == shape_list(float_answer)[0]
-), "Make sure the answers are a FloatTensor of shape (batch_size,)"
+assert shape_list(labels)[0] == shape_list(float_answer)[0], (
+    "Make sure the answers are a FloatTensor of shape (batch_size,)"
+)
# <float32>[batch_size]
aggregate_mask = _calculate_aggregate_mask(
float_answer,

@@ -1615,9 +1615,9 @@ class TFTapasForQuestionAnswering(TFTapasPreTrainedModel):
if is_supervised:
# Note that `aggregate_mask` is None if the setting is supervised.
if aggregation_labels is not None:
-assert (
-    shape_list(labels)[0] == shape_list(aggregation_labels)[0]
-), "Make sure the aggregation labels are a LongTensor of shape (batch_size,)"
+assert shape_list(labels)[0] == shape_list(aggregation_labels)[0], (
+    "Make sure the aggregation labels are a LongTensor of shape (batch_size,)"
+)
per_example_additional_loss = _calculate_aggregation_loss(
logits_aggregation,
aggregate_mask,
@@ -773,7 +773,7 @@ TVP_PROMPTER_CLASSES_MAPPING = {


@add_start_docstrings(
-"The bare Tvp Model transformer outputting BaseModelOutputWithPooling object without any specific head on" " top.",
+"The bare Tvp Model transformer outputting BaseModelOutputWithPooling object without any specific head on top.",
TVP_START_DOCSTRING,
)
class TvpModel(TvpPreTrainedModel):

@@ -407,8 +407,7 @@ class UdopPatchEmbeddings(nn.Module):
batch_size, num_channels, height, width = pixel_values.shape
if height != self.image_size[0] or width != self.image_size[1]:
raise ValueError(
-f"Input image size ({height}*{width}) doesn't match model"
-f" ({self.image_size[0]}*{self.image_size[1]})."
+f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})."
)
embeddings = self.proj(pixel_values)
embeddings = embeddings.flatten(2).transpose(1, 2)

@@ -84,9 +84,9 @@ def convert_visual_bert_checkpoint(checkpoint_path, pytorch_dump_folder_path):
Copy/paste/tweak model's weights to our VisualBERT structure.
"""

-assert (
-    checkpoint_path.split("/")[-1] in ACCEPTABLE_CHECKPOINTS
-), f"The checkpoint provided must be in {ACCEPTABLE_CHECKPOINTS}."
+assert checkpoint_path.split("/")[-1] in ACCEPTABLE_CHECKPOINTS, (
+    f"The checkpoint provided must be in {ACCEPTABLE_CHECKPOINTS}."
+)

# Get Config
if "pre" in checkpoint_path:
@@ -72,8 +72,7 @@ class VivitTubeletEmbeddings(nn.Module):
batch_size, num_frames, num_channels, height, width = pixel_values.shape
if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size):
raise ValueError(
-f"Image image size ({height}*{width}) doesn't match model"
-f" ({self.image_size[0]}*{self.image_size[1]})."
+f"Image image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})."
)

# permute to (batch_size, num_channels, num_frames, height, width)

@@ -1043,9 +1043,9 @@ class WhisperEncoder(WhisperPreTrainedModel):

# check if head_mask has a correct number of layers specified if desired
if head_mask is not None:
-assert head_mask.size()[0] == (
-    len(self.layers)
-), f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
+assert head_mask.size()[0] == (len(self.layers)), (
+    f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
+)

for idx, encoder_layer in enumerate(self.layers):
if output_hidden_states:

@@ -167,7 +167,7 @@ class XCLIPVisionEmbeddings(nn.Module):
batch_size, _, height, width = pixel_values.shape
if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size):
raise ValueError(
-f"Input image size ({height}*{width}) doesn't match model" f" ({self.image_size}*{self.image_size})."
+f"Input image size ({height}*{width}) doesn't match model ({self.image_size}*{self.image_size})."
)
target_dtype = self.patch_embedding.weight.dtype
patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid]
@@ -601,8 +601,7 @@ class XGLMModel(XGLMPreTrainedModel):
if self.gradient_checkpointing and self.training:
if use_cache:
logger.warning_once(
-"`use_cache = True` is incompatible with gradient checkpointing`. Setting `use_cache ="
-" False`..."
+"`use_cache = True` is incompatible with gradient checkpointing`. Setting `use_cache = False`..."
)
use_cache = False

@@ -164,15 +164,15 @@ def load_tf_weights_in_xlnet(model, config, tf_path):
array = np.transpose(array)
if isinstance(pointer, list):
# Here we will split the TF weights
-assert (
-    len(pointer) == array.shape[0]
-), f"Pointer length {len(pointer)} and array length {array.shape[0]} mismatched"
+assert len(pointer) == array.shape[0], (
+    f"Pointer length {len(pointer)} and array length {array.shape[0]} mismatched"
+)
for i, p_i in enumerate(pointer):
arr_i = array[i, ...]
try:
-assert (
-    p_i.shape == arr_i.shape
-), f"Pointer shape {p_i.shape} and array shape {arr_i.shape} mismatched"
+assert p_i.shape == arr_i.shape, (
+    f"Pointer shape {p_i.shape} and array shape {arr_i.shape} mismatched"
+)
except AssertionError as e:
e.args += (p_i.shape, arr_i.shape)
raise

@@ -180,9 +180,9 @@ def load_tf_weights_in_xlnet(model, config, tf_path):
p_i.data = torch.from_numpy(arr_i)
else:
try:
-assert (
-    pointer.shape == array.shape
-), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched"
+assert pointer.shape == array.shape, (
+    f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched"
+)
except AssertionError as e:
e.args += (pointer.shape, array.shape)
raise
@@ -203,9 +203,9 @@ class ZambaConfig(PretrainedConfig):

self.layers_block_type = self._layers_block_type(num_hidden_layers, attn_layer_period, attn_layer_offset)

-assert (
-    self.mamba_expand * self.hidden_size
-) % self.n_mamba_heads == 0, "`intermediate_size` should be divisible by `n_mamba_heads`."
+assert (self.mamba_expand * self.hidden_size) % self.n_mamba_heads == 0, (
+    "`intermediate_size` should be divisible by `n_mamba_heads`."
+)

super().__init__(
pad_token_id=pad_token_id,

@@ -339,7 +339,7 @@ class DocumentQuestionAnsweringPipeline(ChunkPipeline):
)

if self.model_type == ModelType.VisionEncoderDecoder:
-task_prompt = f'<s_docvqa><s_question>{input["question"]}</s_question><s_answer>'
+task_prompt = f"<s_docvqa><s_question>{input['question']}</s_question><s_answer>"
# Adapted from https://huggingface.co/spaces/nielsr/donut-docvqa/blob/main/app.py
encoding = {
"inputs": image_features["pixel_values"],

@@ -104,8 +104,7 @@ class FbgemmFp8HfQuantizer(HfQuantizer):
)
elif torch_dtype == torch.float16:
raise ValueError(
-"You cannot use FP8 with torch_dtype=torch.float16."
-"We recommend you passing torch_dtype=torch.bfloat16"
+"You cannot use FP8 with torch_dtype=torch.float16.We recommend you passing torch_dtype=torch.bfloat16"
)
return torch_dtype
@@ -257,8 +257,7 @@ class TorchAoHfQuantizer(HfQuantizer):
def is_serializable(self, safe_serialization=None) -> bool:
if safe_serialization:
logger.warning(
-"torchao quantized model does not support safe serialization, "
-"please set `safe_serialization` to False"
+"torchao quantized model does not support safe serialization, please set `safe_serialization` to False"
)
return False
_is_torchao_serializable = version.parse(importlib.metadata.version("huggingface_hub")) >= version.parse(

@@ -868,7 +868,7 @@ class SpecialTokensMixin:
def __init__(self, verbose=False, **kwargs):
self._pad_token_type_id = 0
self.verbose = verbose
-self._special_tokens_map = {attr: None for attr in self.SPECIAL_TOKENS_ATTRIBUTES}
+self._special_tokens_map = dict.fromkeys(self.SPECIAL_TOKENS_ATTRIBUTES)
self._special_tokens_map["additional_special_tokens"] = [] # for BC where it defaults to empty list

# We directly set the hidden value to allow initialization with special tokens

@@ -881,9 +881,9 @@ class SpecialTokensMixin:
if key in self.SPECIAL_TOKENS_ATTRIBUTES:
if key == "additional_special_tokens":
assert isinstance(value, (list, tuple)), f"Value {value} is not a list or tuple"
-assert all(
-    isinstance(t, (str, AddedToken)) for t in value
-), "One of the tokens is not a string or an AddedToken"
+assert all(isinstance(t, (str, AddedToken)) for t in value), (
+    "One of the tokens is not a string or an AddedToken"
+)
setattr(self, key, value)
elif isinstance(value, (str, AddedToken)):
setattr(self, key, value)
@@ -967,9 +967,9 @@ class SpecialTokensMixin:
logger.info(f"Assigning {value} to the {key} key of the tokenizer")

if key == "additional_special_tokens":
-assert isinstance(value, (list, tuple)) and all(
-    isinstance(t, (str, AddedToken)) for t in value
-), f"Tokens {value} for key {key} should all be str or AddedToken instances"
+assert isinstance(value, (list, tuple)) and all(isinstance(t, (str, AddedToken)) for t in value), (
+    f"Tokens {value} for key {key} should all be str or AddedToken instances"
+)

to_add = []
for token in value:

@@ -3379,9 +3379,9 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
return BatchEncoding(encoded_inputs, tensor_type=return_tensors)

batch_size = len(required_input)
-assert all(
-    len(v) == batch_size for v in encoded_inputs.values()
-), "Some items in the output dictionary have a different batch size than others."
+assert all(len(v) == batch_size for v in encoded_inputs.values()), (
+    "Some items in the output dictionary have a different batch size than others."
+)

if padding_strategy == PaddingStrategy.LONGEST:
max_length = max(len(inputs) for inputs in required_input)
@@ -749,12 +749,12 @@ class EarlyStoppingCallback(TrainerCallback, ExportableState):
"Using EarlyStoppingCallback without load_best_model_at_end=True. "
"Once training is finished, the best model will not be loaded automatically."
)
-assert (
-    args.metric_for_best_model is not None
-), "EarlyStoppingCallback requires metric_for_best_model to be defined"
+assert args.metric_for_best_model is not None, (
+    "EarlyStoppingCallback requires metric_for_best_model to be defined"
+)
-assert (
-    args.eval_strategy != IntervalStrategy.NO
-), "EarlyStoppingCallback requires IntervalStrategy of steps or epoch"
+assert args.eval_strategy != IntervalStrategy.NO, (
+    "EarlyStoppingCallback requires IntervalStrategy of steps or epoch"
+)

def on_evaluate(self, args, state, control, metrics, **kwargs):
metric_to_check = args.metric_for_best_model

@@ -121,9 +121,9 @@ def nested_concat(tensors, new_tensors, padding_index=-100):
nested list/tuples/dict of tensors.
"""
if not (isinstance(tensors, torch.Tensor) and isinstance(new_tensors, torch.Tensor)):
-assert (
-    type(tensors) is type(new_tensors)
-), f"Expected `tensors` and `new_tensors` to have the same type but found {type(tensors)} and {type(new_tensors)}."
+assert type(tensors) is type(new_tensors), (
+    f"Expected `tensors` and `new_tensors` to have the same type but found {type(tensors)} and {type(new_tensors)}."
+)
if isinstance(tensors, (list, tuple)):
return type(tensors)(nested_concat(t, n, padding_index=padding_index) for t, n in zip(tensors, new_tensors))
elif isinstance(tensors, torch.Tensor):
@@ -381,15 +381,15 @@ class SequentialDistributedSampler(Sampler):

# add extra samples to make it evenly divisible
indices += indices[: (self.total_size - len(indices))]
-assert (
-    len(indices) == self.total_size
-), f"Indices length {len(indices)} and total size {self.total_size} mismatched"
+assert len(indices) == self.total_size, (
+    f"Indices length {len(indices)} and total size {self.total_size} mismatched"
+)

# subsample
indices = indices[self.rank * self.num_samples : (self.rank + 1) * self.num_samples]
-assert (
-    len(indices) == self.num_samples
-), f"Indices length {len(indices)} and sample number {self.num_samples} mismatched"
+assert len(indices) == self.num_samples, (
+    f"Indices length {len(indices)} and sample number {self.num_samples} mismatched"
+)

return iter(indices)

@@ -506,9 +506,9 @@ class DistributedTensorGatherer:
if isinstance(arrays, (list, tuple)):
result = [self._nested_set_tensors(x, y) for x, y in zip(storage, arrays)]
return result[0][0], type(arrays)(r[1] for r in result)
-assert (
-    arrays.shape[0] % self.world_size == 0
-), f"Arrays passed should all have a first dimension multiple of {self.world_size}, found {arrays.shape[0]}."
+assert arrays.shape[0] % self.world_size == 0, (
+    f"Arrays passed should all have a first dimension multiple of {self.world_size}, found {arrays.shape[0]}."
+)

slice_len = arrays.shape[0] // self.world_size
for i in range(self.world_size):
@@ -412,7 +412,7 @@ def _compile_jinja_template(chat_template):

if version.parse(jinja2.__version__) < version.parse("3.1.0"):
raise ImportError(
-"apply_chat_template requires jinja2>=3.1.0 to be installed. Your version is " f"{jinja2.__version__}."
+f"apply_chat_template requires jinja2>=3.1.0 to be installed. Your version is {jinja2.__version__}."
)

def raise_exception(message):

@@ -513,7 +513,9 @@ def cached_files(
return None
# Now we raise for missing entries
revision_ = "main" if revision is None else revision
-msg = f"a file named {missing_entries[0]}" if len(missing_entries) == 1 else f"files named {*missing_entries,}"
+msg = (
+    f"a file named {missing_entries[0]}" if len(missing_entries) == 1 else f"files named {(*missing_entries,)}"
+)
raise EnvironmentError(
f"{path_or_repo_id} does not appear to have {msg}. Checkout 'https://huggingface.co/{path_or_repo_id}/tree/{revision_}'"
"for available files."
Some files were not shown because too many files have changed in this diff.
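The dominant pattern in the hunks above is the assert layout produced by the ruff 0.11.2 formatter: the condition stays on the `assert` line and only the failure message is wrapped in parentheses, instead of wrapping the condition and leaving the message after the closing parenthesis. A minimal sketch of the two layouts follows; the variable names are hypothetical and do not come from any file in this diff, and both forms are equivalent at runtime since the parentheses only group the expressions.

# Illustrative sketch only: the same check in the old and new formatter layouts.
batch_size, world_size = 8, 4

# Layout under ruff 0.5.1: condition wrapped in parentheses, message after the closing paren.
assert (
    batch_size % world_size == 0
), f"batch_size {batch_size} must be divisible by world_size {world_size}"

# Layout after reformatting with ruff 0.11.2: condition on the assert line, message grouped in parentheses.
assert batch_size % world_size == 0, (
    f"batch_size {batch_size} must be divisible by world_size {world_size}"
)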