Early labels validation (#31240)
* Move label validation checks - fail early
* Remove some formatting changes - add back labels change wav2vec2
This commit is contained in:
parent
03ea160937
commit
54659048a2
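Every file touched below follows the same shape: the `labels` sanity check that used to sit inside the `if labels is not None:` loss branch (i.e. after the backbone forward pass) is moved to the top of `forward`/`call`, so an invalid `labels` argument fails before any compute is spent. A minimal sketch of that fail-early pattern, assuming a hypothetical `ToyModelForCTC`/`ToyConfig` that stand in for the real classes (not code taken from this diff):

```python
# Illustrative sketch only: ToyConfig, ToyModelForCTC and the toy encoder are
# hypothetical stand-ins, not transformers code from this commit.
import torch
from torch import nn


class ToyConfig:
    vocab_size = 32
    hidden_size = 16


class ToyModelForCTC(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.encoder = nn.Linear(config.hidden_size, config.hidden_size)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size)

    def forward(self, input_values, labels=None):
        # After this commit: validate labels up front, before the (expensive) encoder runs.
        if labels is not None and labels.max() >= self.config.vocab_size:
            raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")

        hidden_states = self.encoder(input_values)
        logits = self.lm_head(hidden_states)

        loss = None
        if labels is not None:
            # Before this commit the vocab-size check lived here, so it only fired
            # after the whole forward pass had already been computed.
            loss = nn.functional.cross_entropy(logits.view(-1, self.config.vocab_size), labels.view(-1))
        return (loss, logits) if loss is not None else (logits,)


# Usage: an out-of-range label now fails immediately, without running the encoder.
model = ToyModelForCTC(ToyConfig())
inputs = torch.randn(2, 4, ToyConfig.hidden_size)
bad_labels = torch.full((2, 4), ToyConfig.vocab_size)  # >= vocab_size, rejected early
try:
    model(inputs, labels=bad_labels)
except ValueError as err:
    print(err)
```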
@@ -763,6 +763,12 @@ class BarkCausalModel(BarkPreTrainedModel):
         use_cache = use_cache if use_cache is not None else self.config.use_cache
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
+        loss = None
+        if labels is not None:
+            raise NotImplementedError(
+                "Training is not implemented yet for Bark - ensure you do not pass `labels` to the model."
+            )
+
         # Verify if input_embeds already exists
         # then compute embeddings.
         if input_ids is not None and input_embeds is not None:
@@ -870,12 +876,6 @@ class BarkCausalModel(BarkPreTrainedModel):
 
         logits = self.lm_head(hidden_states)
 
-        loss = None
-        if labels is not None:
-            raise NotImplementedError(
-                "Training is not implemented yet for Bark - ensure you do not pass `labels` to the model."
-            )
-
         if not return_dict:
             return tuple(
                 v for v in [None, logits, present_key_values, all_hidden_states, all_self_attentions] if v is not None
@@ -1393,6 +1393,10 @@ class BarkFineModel(BarkPreTrainedModel):
         )
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
+        loss = None
+        if labels is not None:
+            raise NotImplementedError("Training is not implemented yet")
+
         if codebook_idx == 0:
             raise ValueError("Cannot predict 0th codebook - 0th codebook should be predicted by the coarse model")
 
@@ -1470,10 +1474,6 @@ class BarkFineModel(BarkPreTrainedModel):
 
         logits = self.lm_heads[codebook_idx - self.config.n_codes_given](hidden_states)
 
-        loss = None
-        if labels is not None:
-            raise NotImplementedError("Training is not implemented yet")
-
         if not return_dict:
             return tuple(v for v in [None, logits, all_hidden_states, all_self_attentions] if v is not None)
 
@@ -1247,6 +1247,9 @@ class BeitForSemanticSegmentation(BeitPreTrainedModel):
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         )
 
+        if labels is not None and self.config.num_labels == 1:
+            raise ValueError("The number of labels should be greater than one")
+
         outputs = self.beit(
             pixel_values,
             head_mask=head_mask,
@@ -1279,10 +1282,7 @@ class BeitForSemanticSegmentation(BeitPreTrainedModel):
 
         loss = None
         if labels is not None:
-            if self.config.num_labels == 1:
-                raise ValueError("The number of labels should be greater than one")
-            else:
-                loss = self.compute_loss(logits, auxiliary_logits, labels)
+            loss = self.compute_loss(logits, auxiliary_logits, labels)
 
         if not return_dict:
             if output_hidden_states:
@@ -1372,9 +1372,11 @@ class Data2VecAudioForCTC(Data2VecAudioPreTrainedModel):
             All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
             config.vocab_size - 1]`.
         """
-
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
+        if labels is not None and labels.max() >= self.config.vocab_size:
+            raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
+
         outputs = self.data2vec_audio(
             input_values,
             attention_mask=attention_mask,
@@ -1390,9 +1392,6 @@ class Data2VecAudioForCTC(Data2VecAudioPreTrainedModel):
 
         loss = None
         if labels is not None:
-            if labels.max() >= self.config.vocab_size:
-                raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
-
             # retrieve loss input_lengths from attention_mask
             attention_mask = (
                 attention_mask if attention_mask is not None else torch.ones_like(input_values, dtype=torch.long)
@@ -1173,6 +1173,9 @@ class Data2VecVisionForSemanticSegmentation(Data2VecVisionPreTrainedModel):
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         )
 
+        if labels is not None and self.config.num_labels == 1:
+            raise ValueError("The number of labels should be greater than one")
+
         outputs = self.data2vec_vision(
             pixel_values,
             head_mask=head_mask,
@@ -1205,10 +1208,7 @@ class Data2VecVisionForSemanticSegmentation(Data2VecVisionPreTrainedModel):
 
         loss = None
         if labels is not None:
-            if self.config.num_labels == 1:
-                raise ValueError("The number of labels should be greater than one")
-            else:
-                loss = self.compute_loss(logits, auxiliary_logits, labels)
+            loss = self.compute_loss(logits, auxiliary_logits, labels)
 
         if not return_dict:
             if output_hidden_states:
@@ -1633,6 +1633,9 @@ class TFData2VecVisionForSemanticSegmentation(TFData2VecVisionPreTrainedModel):
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         )
 
+        if labels is not None and self.config.num_labels == 1:
+            raise ValueError("The number of labels should be greater than one")
+
         outputs = self.data2vec_vision(
             pixel_values,
             head_mask=head_mask,
@@ -1672,10 +1675,7 @@ class TFData2VecVisionForSemanticSegmentation(TFData2VecVisionPreTrainedModel):
 
         loss = None
         if labels is not None:
-            if self.config.num_labels == 1:
-                raise ValueError("The number of labels should be greater than one")
-            else:
-                loss = self.compute_loss(logits, auxiliary_logits, labels)
+            loss = self.compute_loss(logits, auxiliary_logits, labels)
 
         if not return_dict:
             if output_hidden_states:
@@ -732,6 +732,8 @@ class MCTCTForCTC(MCTCTPreTrainedModel):
             All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
             config.vocab_size - 1]`.
         """
+        if labels is not None and labels.max() >= self.config.vocab_size:
+            raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
 
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
         outputs = self.mctct(
@@ -749,9 +751,6 @@ class MCTCTForCTC(MCTCTPreTrainedModel):
 
         loss = None
         if labels is not None:
-            if labels.max() >= self.config.vocab_size:
-                raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
-
             # retrieve loss input_lengths from attention_mask
             attention_mask = (
                 attention_mask
@@ -1440,9 +1440,13 @@ class RealmKnowledgeAugEncoder(RealmPreTrainedModel):
         >>> outputs = model(**inputs)
         >>> logits = outputs.logits
         ```"""
-
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
+        if labels is not None and relevance_score is None:
+            raise ValueError(
+                "You have to specify `relevance_score` when `labels` is specified in order to compute loss."
+            )
+
         (flattened_input_ids, flattened_attention_mask, flattened_token_type_ids) = self._flatten_inputs(
             input_ids, attention_mask, token_type_ids
         )
@@ -1468,11 +1472,6 @@ class RealmKnowledgeAugEncoder(RealmPreTrainedModel):
 
         masked_lm_loss = None
         if labels is not None:
-            if candidate_score is None:
-                raise ValueError(
-                    "You have to specify `relevance_score` when `labels` is specified in order to compute loss."
-                )
-
             batch_size, seq_length = labels.size()
 
             if mlm_mask is None:
@@ -424,6 +424,10 @@ class DepthAnythingForDepthEstimation(DepthAnythingPreTrainedModel):
         >>> formatted = (output * 255 / np.max(output)).astype("uint8")
         >>> depth = Image.fromarray(formatted)
         ```"""
+        loss = None
+        if labels is not None:
+            raise NotImplementedError("Training is not implemented yet")
+
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -444,10 +448,6 @@ class DepthAnythingForDepthEstimation(DepthAnythingPreTrainedModel):
 
         predicted_depth = self.head(hidden_states, patch_height, patch_width)
 
-        loss = None
-        if labels is not None:
-            raise NotImplementedError("Training is not implemented yet")
-
         if not return_dict:
             if output_hidden_states:
                 output = (predicted_depth,) + outputs[1:]
@@ -1136,6 +1136,10 @@ class DPTForDepthEstimation(DPTPreTrainedModel):
         >>> formatted = (output * 255 / np.max(output)).astype("uint8")
         >>> depth = Image.fromarray(formatted)
         ```"""
+        loss = None
+        if labels is not None:
+            raise NotImplementedError("Training is not implemented yet")
+
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -1183,10 +1187,6 @@ class DPTForDepthEstimation(DPTPreTrainedModel):
 
         predicted_depth = self.head(hidden_states)
 
-        loss = None
-        if labels is not None:
-            raise NotImplementedError("Training is not implemented yet")
-
         if not return_dict:
             if output_hidden_states:
                 output = (predicted_depth,) + outputs[1:]
@@ -1308,6 +1308,9 @@ class DPTForSemanticSegmentation(DPTPreTrainedModel):
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         )
 
+        if labels is not None and self.config.num_labels == 1:
+            raise ValueError("The number of labels should be greater than one")
+
         outputs = self.dpt(
             pixel_values,
             head_mask=head_mask,
@@ -1342,22 +1345,19 @@ class DPTForSemanticSegmentation(DPTPreTrainedModel):
 
         loss = None
         if labels is not None:
-            if self.config.num_labels == 1:
-                raise ValueError("The number of labels should be greater than one")
-            else:
-                # upsample logits to the images' original size
-                upsampled_logits = nn.functional.interpolate(
-                    logits, size=labels.shape[-2:], mode="bilinear", align_corners=False
-                )
-                if auxiliary_logits is not None:
-                    upsampled_auxiliary_logits = nn.functional.interpolate(
-                        auxiliary_logits, size=labels.shape[-2:], mode="bilinear", align_corners=False
-                    )
-                # compute weighted loss
-                loss_fct = CrossEntropyLoss(ignore_index=self.config.semantic_loss_ignore_index)
-                main_loss = loss_fct(upsampled_logits, labels)
-                auxiliary_loss = loss_fct(upsampled_auxiliary_logits, labels)
-                loss = main_loss + self.config.auxiliary_loss_weight * auxiliary_loss
+            # upsample logits to the images' original size
+            upsampled_logits = nn.functional.interpolate(
+                logits, size=labels.shape[-2:], mode="bilinear", align_corners=False
+            )
+            if auxiliary_logits is not None:
+                upsampled_auxiliary_logits = nn.functional.interpolate(
+                    auxiliary_logits, size=labels.shape[-2:], mode="bilinear", align_corners=False
+                )
+            # compute weighted loss
+            loss_fct = CrossEntropyLoss(ignore_index=self.config.semantic_loss_ignore_index)
+            main_loss = loss_fct(upsampled_logits, labels)
+            auxiliary_loss = loss_fct(upsampled_auxiliary_logits, labels)
+            loss = main_loss + self.config.auxiliary_loss_weight * auxiliary_loss
 
         if not return_dict:
             if output_hidden_states:
@@ -921,6 +921,8 @@ class TFGPTJForSequenceClassification(TFGPTJPreTrainedModel, TFSequenceClassificationLoss):
             config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
             `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
         """
+        if labels is not None and self.config.pad_token_id is None and input_ids.shape[0] != 1:
+            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
 
         transformer_outputs = self.transformer(
             input_ids=input_ids,
@@ -963,9 +965,6 @@ class TFGPTJForSequenceClassification(TFGPTJPreTrainedModel, TFSequenceClassificationLoss):
         loss = None
 
         if labels is not None:
-            if self.config.pad_token_id is None and logits_shape[0] != 1:
-                raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
-
             if not tf.is_tensor(sequence_lengths):
                 in_logits = logits[0 : logits_shape[0], sequence_lengths]
 
@@ -1574,9 +1574,11 @@ class HubertForCTC(HubertPreTrainedModel):
             All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
             config.vocab_size - 1]`.
         """
-
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
+        if labels is not None and labels.max() >= self.config.vocab_size:
+            raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
+
         outputs = self.hubert(
             input_values,
             attention_mask=attention_mask,
@@ -1592,9 +1594,6 @@ class HubertForCTC(HubertPreTrainedModel):
 
         loss = None
         if labels is not None:
-            if labels.max() >= self.config.vocab_size:
-                raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
-
             # retrieve loss input_lengths from attention_mask
             attention_mask = (
                 attention_mask if attention_mask is not None else torch.ones_like(input_values, dtype=torch.long)
@@ -1600,6 +1600,8 @@ class TFHubertForCTC(TFHubertPreTrainedModel):
 
         >>> loss = model(input_values, labels=labels).loss
         ```"""
+        if labels is not None and tf.reduce_max(labels) >= self.config.vocab_size:
+            raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
 
         outputs = self.hubert(
             input_values=input_values,
@@ -1619,9 +1621,6 @@ class TFHubertForCTC(TFHubertPreTrainedModel):
         logits = self.lm_head(hidden_states)
 
         if labels is not None:
-            if tf.reduce_max(labels) >= self.config.vocab_size:
-                raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
-
             attention_mask = (
                 attention_mask if attention_mask is not None else tf.ones_like(input_values, dtype=tf.float32)
             )
@@ -822,6 +822,9 @@ class MobileNetV2ForSemanticSegmentation(MobileNetV2PreTrainedModel):
         )
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
+        if labels is not None and self.config.num_labels == 1:
+            raise ValueError("The number of labels should be greater than one")
+
         outputs = self.mobilenet_v2(
             pixel_values,
             output_hidden_states=True,  # we need the intermediate hidden states
@@ -834,15 +837,12 @@ class MobileNetV2ForSemanticSegmentation(MobileNetV2PreTrainedModel):
 
         loss = None
         if labels is not None:
-            if self.config.num_labels == 1:
-                raise ValueError("The number of labels should be greater than one")
-            else:
-                # upsample logits to the images' original size
-                upsampled_logits = nn.functional.interpolate(
-                    logits, size=labels.shape[-2:], mode="bilinear", align_corners=False
-                )
-                loss_fct = CrossEntropyLoss(ignore_index=self.config.semantic_loss_ignore_index)
-                loss = loss_fct(upsampled_logits, labels)
+            # upsample logits to the images' original size
+            upsampled_logits = nn.functional.interpolate(
+                logits, size=labels.shape[-2:], mode="bilinear", align_corners=False
+            )
+            loss_fct = CrossEntropyLoss(ignore_index=self.config.semantic_loss_ignore_index)
+            loss = loss_fct(upsampled_logits, labels)
 
         if not return_dict:
             if output_hidden_states:
@@ -1026,6 +1026,9 @@ class MobileViTForSemanticSegmentation(MobileViTPreTrainedModel):
         )
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
+        if labels is not None and self.config.num_labels == 1:
+            raise ValueError("The number of labels should be greater than one")
+
         outputs = self.mobilevit(
             pixel_values,
             output_hidden_states=True,  # we need the intermediate hidden states
@@ -1038,15 +1041,12 @@ class MobileViTForSemanticSegmentation(MobileViTPreTrainedModel):
 
         loss = None
         if labels is not None:
-            if self.config.num_labels == 1:
-                raise ValueError("The number of labels should be greater than one")
-            else:
-                # upsample logits to the images' original size
-                upsampled_logits = nn.functional.interpolate(
-                    logits, size=labels.shape[-2:], mode="bilinear", align_corners=False
-                )
-                loss_fct = CrossEntropyLoss(ignore_index=self.config.semantic_loss_ignore_index)
-                loss = loss_fct(upsampled_logits, labels)
+            # upsample logits to the images' original size
+            upsampled_logits = nn.functional.interpolate(
+                logits, size=labels.shape[-2:], mode="bilinear", align_corners=False
+            )
+            loss_fct = CrossEntropyLoss(ignore_index=self.config.semantic_loss_ignore_index)
+            loss = loss_fct(upsampled_logits, labels)
 
         if not return_dict:
             if output_hidden_states:
@@ -1323,6 +1323,9 @@ class TFMobileViTForSemanticSegmentation(TFMobileViTPreTrainedModel):
         )
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
+        if labels is not None and not self.config.num_labels > 1:
+            raise ValueError("The number of labels should be greater than one")
+
         outputs = self.mobilevit(
             pixel_values,
             output_hidden_states=True,  # we need the intermediate hidden states
@@ -1336,10 +1339,7 @@ class TFMobileViTForSemanticSegmentation(TFMobileViTPreTrainedModel):
 
         loss = None
         if labels is not None:
-            if not self.config.num_labels > 1:
-                raise ValueError("The number of labels should be greater than one")
-            else:
-                loss = self.hf_compute_loss(logits=logits, labels=labels)
+            loss = self.hf_compute_loss(logits=logits, labels=labels)
 
         # make logits of shape (batch_size, num_labels, height, width) to
         # keep them consistent across APIs
@@ -990,6 +990,9 @@ class MobileViTV2ForSemanticSegmentation(MobileViTV2PreTrainedModel):
         )
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
+        if labels is not None and self.config.num_labels == 1:
+            raise ValueError("The number of labels should be greater than one")
+
         outputs = self.mobilevitv2(
             pixel_values,
             output_hidden_states=True,  # we need the intermediate hidden states
@@ -1002,15 +1005,12 @@ class MobileViTV2ForSemanticSegmentation(MobileViTV2PreTrainedModel):
 
         loss = None
         if labels is not None:
-            if self.config.num_labels == 1:
-                raise ValueError("The number of labels should be greater than one")
-            else:
-                # upsample logits to the images' original size
-                upsampled_logits = nn.functional.interpolate(
-                    logits, size=labels.shape[-2:], mode="bilinear", align_corners=False
-                )
-                loss_fct = CrossEntropyLoss(ignore_index=self.config.semantic_loss_ignore_index)
-                loss = loss_fct(upsampled_logits, labels)
+            # upsample logits to the images' original size
+            upsampled_logits = nn.functional.interpolate(
+                logits, size=labels.shape[-2:], mode="bilinear", align_corners=False
+            )
+            loss_fct = CrossEntropyLoss(ignore_index=self.config.semantic_loss_ignore_index)
+            loss = loss_fct(upsampled_logits, labels)
 
         if not return_dict:
             if output_hidden_states:
@@ -1740,6 +1740,10 @@ class PerceiverForOpticalFlow(PerceiverPreTrainedModel):
         ```"""
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
+        loss = None
+        if labels is not None:
+            raise NotImplementedError("Optical flow training is not yet supported")
+
         outputs = self.perceiver(
             inputs=inputs,
             attention_mask=attention_mask,
@@ -1750,10 +1754,6 @@ class PerceiverForOpticalFlow(PerceiverPreTrainedModel):
         )
         logits = outputs.logits if return_dict else outputs[0]
 
-        loss = None
-        if labels is not None:
-            raise NotImplementedError("Optical flow training is not yet supported")
-
         if not return_dict:
             output = (logits,) + outputs[2:]
             return ((loss,) + output) if loss is not None else output
@@ -1974,6 +1974,10 @@ class PerceiverForMultimodalAutoencoding(PerceiverPreTrainedModel):
         ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
+        loss = None
+        if labels is not None:
+            raise NotImplementedError("Multimodal autoencoding training is not yet supported")
+
         outputs = self.perceiver(
             inputs=inputs,
             attention_mask=attention_mask,
@@ -1985,10 +1989,6 @@ class PerceiverForMultimodalAutoencoding(PerceiverPreTrainedModel):
         )
         logits = outputs.logits if return_dict else outputs[0]
 
-        loss = None
-        if labels is not None:
-            raise NotImplementedError("Multimodal autoencoding training is not yet supported")
-
         if not return_dict:
             output = (logits,) + outputs[2:]
             return ((loss,) + output) if loss is not None else output
@@ -784,6 +784,9 @@ class SegformerForSemanticSegmentation(SegformerPreTrainedModel):
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         )
 
+        if labels is not None and self.config.num_labels < 1:
+            raise ValueError(f"Number of labels should be >=0: {self.config.num_labels}")
+
         outputs = self.segformer(
             pixel_values,
             output_attentions=output_attentions,
@@ -809,8 +812,6 @@ class SegformerForSemanticSegmentation(SegformerPreTrainedModel):
                 loss_fct = BCEWithLogitsLoss(reduction="none")
                 loss = loss_fct(upsampled_logits.squeeze(1), labels.float())
                 loss = (loss * valid_mask).mean()
-            else:
-                raise ValueError(f"Number of labels should be >=0: {self.config.num_labels}")
 
         if not return_dict:
             if output_hidden_states:
@@ -988,6 +988,9 @@ class TFSegformerForSemanticSegmentation(TFSegformerPreTrainedModel):
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         )
 
+        if labels is not None and not self.config.num_labels > 1:
+            raise ValueError("The number of labels should be greater than one")
+
         outputs = self.segformer(
             pixel_values,
             output_attentions=output_attentions,
@@ -1001,10 +1004,7 @@ class TFSegformerForSemanticSegmentation(TFSegformerPreTrainedModel):
 
         loss = None
         if labels is not None:
-            if not self.config.num_labels > 1:
-                raise ValueError("The number of labels should be greater than one")
-            else:
-                loss = self.hf_compute_loss(logits=logits, labels=labels)
+            loss = self.hf_compute_loss(logits=logits, labels=labels)
 
         # make logits of shape (batch_size, num_labels, height, width) to
         # keep them consistent across APIs
@@ -1418,9 +1418,11 @@ class SEWForCTC(SEWPreTrainedModel):
             All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
             config.vocab_size - 1]`.
         """
-
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
+        if labels is not None and labels.max() >= self.config.vocab_size:
+            raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
+
         outputs = self.sew(
             input_values,
             attention_mask=attention_mask,
@@ -1436,9 +1438,6 @@ class SEWForCTC(SEWPreTrainedModel):
 
         loss = None
         if labels is not None:
-            if labels.max() >= self.config.vocab_size:
-                raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
-
             # retrieve loss input_lengths from attention_mask
             attention_mask = (
                 attention_mask if attention_mask is not None else torch.ones_like(input_values, dtype=torch.long)
@@ -1575,9 +1575,11 @@ class SEWDForCTC(SEWDPreTrainedModel):
             All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
             config.vocab_size - 1]`.
         """
-
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
+        if labels is not None and labels.max() >= self.config.vocab_size:
+            raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
+
         outputs = self.sew_d(
             input_values,
             attention_mask=attention_mask,
@@ -1593,9 +1595,6 @@ class SEWDForCTC(SEWDPreTrainedModel):
 
         loss = None
         if labels is not None:
-            if labels.max() >= self.config.vocab_size:
-                raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
-
             # retrieve loss input_lengths from attention_mask
             attention_mask = (
                 attention_mask if attention_mask is not None else torch.ones_like(input_values, dtype=torch.long)
@@ -1128,6 +1128,10 @@ class Swin2SRForImageSuperResolution(Swin2SRPreTrainedModel):
         ```"""
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
+        loss = None
+        if labels is not None:
+            raise NotImplementedError("Training is not supported at the moment")
+
         height, width = pixel_values.shape[2:]
 
         if self.config.upsampler == "pixelshuffle_aux":
@@ -1159,10 +1163,6 @@ class Swin2SRForImageSuperResolution(Swin2SRPreTrainedModel):
         reconstruction = reconstruction / self.swin2sr.img_range + self.swin2sr.mean
         reconstruction = reconstruction[:, :, : height * self.upscale, : width * self.upscale]
 
-        loss = None
-        if labels is not None:
-            raise NotImplementedError("Training is not supported at the moment")
-
         if not return_dict:
             output = (reconstruction,) + outputs[1:]
             return ((loss,) + output) if loss is not None else output
@@ -1824,9 +1824,11 @@ class UniSpeechForCTC(UniSpeechPreTrainedModel):
             All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
             config.vocab_size - 1]`.
         """
-
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
+        if labels is not None and labels.max() >= self.config.vocab_size:
+            raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
+
         outputs = self.unispeech(
             input_values,
             attention_mask=attention_mask,
@@ -1842,9 +1844,6 @@ class UniSpeechForCTC(UniSpeechPreTrainedModel):
 
         loss = None
         if labels is not None:
-            if labels.max() >= self.config.vocab_size:
-                raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
-
             # retrieve loss input_lengths from attention_mask
             attention_mask = (
                 attention_mask if attention_mask is not None else torch.ones_like(input_values, dtype=torch.long)
@@ -1834,9 +1834,11 @@ class UniSpeechSatForCTC(UniSpeechSatPreTrainedModel):
             All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
             config.vocab_size - 1]`.
         """
-
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
+        if labels is not None and labels.max() >= self.config.vocab_size:
+            raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
+
         outputs = self.unispeech_sat(
             input_values,
             attention_mask=attention_mask,
@@ -1852,9 +1854,6 @@ class UniSpeechSatForCTC(UniSpeechSatPreTrainedModel):
 
         loss = None
         if labels is not None:
-            if labels.max() >= self.config.vocab_size:
-                raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
-
             # retrieve loss input_lengths from attention_mask
             attention_mask = (
                 attention_mask if attention_mask is not None else torch.ones_like(input_values, dtype=torch.long)
@@ -392,6 +392,8 @@ class UperNetForSemanticSegmentation(UperNetPreTrainedModel):
         >>> list(logits.shape)
         [1, 150, 512, 512]
         ```"""
+        if labels is not None and self.config.num_labels == 1:
+            raise ValueError("The number of labels should be greater than one")
 
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
         output_hidden_states = (
@@ -416,15 +418,12 @@ class UperNetForSemanticSegmentation(UperNetPreTrainedModel):
 
         loss = None
         if labels is not None:
-            if self.config.num_labels == 1:
-                raise ValueError("The number of labels should be greater than one")
-            else:
-                # compute weighted loss
-                loss_fct = CrossEntropyLoss(ignore_index=self.config.loss_ignore_index)
-                loss = loss_fct(logits, labels)
-                if auxiliary_logits is not None:
-                    auxiliary_loss = loss_fct(auxiliary_logits, labels)
-                    loss += self.config.auxiliary_loss_weight * auxiliary_loss
+            # compute weighted loss
+            loss_fct = CrossEntropyLoss(ignore_index=self.config.loss_ignore_index)
+            loss = loss_fct(logits, labels)
+            if auxiliary_logits is not None:
+                auxiliary_loss = loss_fct(auxiliary_logits, labels)
+                loss += self.config.auxiliary_loss_weight * auxiliary_loss
 
         if not return_dict:
             if output_hidden_states:
@@ -1226,6 +1226,10 @@ class ViltForImageAndTextRetrieval(ViltPreTrainedModel):
         ```"""
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
+        loss = None
+        if labels is not None:
+            raise NotImplementedError("Training is not yet supported.")
+
         outputs = self.vilt(
             input_ids,
             attention_mask=attention_mask,
@@ -1244,12 +1248,6 @@ class ViltForImageAndTextRetrieval(ViltPreTrainedModel):
 
         logits = self.rank_output(pooler_output)
 
-        loss = None
-        if labels is not None:
-            # move labels to correct device to enable PP
-            labels = labels.to(logits.device)
-            raise NotImplementedError("Training is not yet supported.")
-
         if not return_dict:
             output = (logits,) + outputs[2:]
             return ((loss,) + output) if loss is not None else output
@@ -939,6 +939,14 @@ class VisualBertForPreTraining(VisualBertPreTrainedModel):
         ```"""
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
+        if labels is not None:
+            total_size = attention_mask.size(-1) + visual_attention_mask.size(-1)
+            if labels.size(-1) != total_size:
+                raise ValueError(
+                    "The labels provided should have same sequence length as total attention mask. "
+                    f"Found labels with sequence length {labels.size(-1)}, expected {total_size}."
+                )
+
         outputs = self.visual_bert(
             input_ids,
             attention_mask=attention_mask,
@@ -960,26 +968,12 @@ class VisualBertForPreTraining(VisualBertPreTrainedModel):
 
         total_loss = None
         if labels is not None and sentence_image_labels is not None:
-            total_size = attention_mask.size(-1) + visual_attention_mask.size(-1)
-            if labels.size(-1) != total_size:
-                raise ValueError(
-                    "The labels provided should have same sequence length as total attention mask. "
-                    f"Found labels with sequence length {labels.size(-1)}, expected {total_size}."
-                )
-
             loss_fct = CrossEntropyLoss()
             masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
             sentence_image_loss = loss_fct(seq_relationship_score.view(-1, 2), sentence_image_labels.view(-1))
             total_loss = masked_lm_loss + sentence_image_loss
 
-        if labels is not None and sentence_image_labels is None:
-            total_size = attention_mask.size(-1) + visual_attention_mask.size(-1)
-            if labels.size(-1) != total_size:
-                raise ValueError(
-                    "The labels provided should have same sequence length as total attention mask. "
-                    f"Found labels with sequence length {labels.size(-1)}, expected {total_size}."
-                )
-
+        elif labels is not None:
             loss_fct = CrossEntropyLoss()
             total_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
 
@@ -310,6 +310,10 @@ class VitMatteForImageMatting(VitMattePreTrainedModel):
         )
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
 
+        loss = None
+        if labels is not None:
+            raise NotImplementedError("Training is not yet supported")
+
         outputs = self.backbone.forward_with_filtered_kwargs(
             pixel_values, output_hidden_states=output_hidden_states, output_attentions=output_attentions
         )
@@ -317,10 +321,6 @@ class VitMatteForImageMatting(VitMattePreTrainedModel):
         features = outputs.feature_maps[-1]
         alphas = self.decoder(features, pixel_values)
 
-        loss = None
-        if labels is not None:
-            raise NotImplementedError("Training is not yet supported")
-
         if not return_dict:
             output = (alphas,) + outputs[1:]
             return ((loss,) + output) if loss is not None else output
@@ -1394,6 +1394,9 @@ class VitsModel(VitsPreTrainedModel):
         )
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
+        if labels is not None:
+            raise NotImplementedError("Training of VITS is not supported yet.")
+
         if attention_mask is not None:
             input_padding_mask = attention_mask.unsqueeze(-1).float()
         else:
@@ -1408,9 +1411,6 @@ class VitsModel(VitsPreTrainedModel):
         else:
             speaker_embeddings = None
 
-        if labels is not None:
-            raise NotImplementedError("Training of VITS is not supported yet.")
-
         text_encoder_output = self.text_encoder(
             input_ids=input_ids,
             padding_mask=input_padding_mask,
@@ -1671,6 +1671,8 @@ class TFWav2Vec2ForCTC(TFWav2Vec2PreTrainedModel):
 
         >>> loss = model(input_values, labels=labels).loss
         ```"""
+        if labels is not None and tf.reduce_max(labels) >= self.config.vocab_size:
+            raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
 
         outputs = self.wav2vec2(
             input_values=input_values,
@@ -1690,9 +1692,6 @@ class TFWav2Vec2ForCTC(TFWav2Vec2PreTrainedModel):
         logits = self.lm_head(hidden_states)
 
         if labels is not None:
-            if tf.reduce_max(labels) >= self.config.vocab_size:
-                raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
-
             attention_mask = (
                 attention_mask if attention_mask is not None else tf.ones_like(input_values, dtype=tf.float32)
             )
@@ -2327,9 +2327,11 @@ class Wav2Vec2ForCTC(Wav2Vec2PreTrainedModel):
             All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
             config.vocab_size - 1]`.
         """
-
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
+        if labels is not None and labels.max() >= self.config.vocab_size:
+            raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
+
         outputs = self.wav2vec2(
             input_values,
             attention_mask=attention_mask,
@@ -2345,9 +2347,6 @@ class Wav2Vec2ForCTC(Wav2Vec2PreTrainedModel):
 
         loss = None
         if labels is not None:
-            if labels.max() >= self.config.vocab_size:
-                raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
-
             # retrieve loss input_lengths from attention_mask
             attention_mask = (
                 attention_mask if attention_mask is not None else torch.ones_like(input_values, dtype=torch.long)
@@ -1219,6 +1219,8 @@ class Wav2Vec2BertForCTC(Wav2Vec2BertPreTrainedModel):
             All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
             config.vocab_size - 1]`.
         """
+        if labels is not None and labels.max() >= self.config.vocab_size:
+            raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
 
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
@@ -1237,9 +1239,6 @@ class Wav2Vec2BertForCTC(Wav2Vec2BertPreTrainedModel):
 
         loss = None
         if labels is not None:
-            if labels.max() >= self.config.vocab_size:
-                raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
-
             # retrieve loss input_lengths from attention_mask
             attention_mask = (
                 attention_mask
@@ -1645,9 +1645,11 @@ class Wav2Vec2ConformerForCTC(Wav2Vec2ConformerPreTrainedModel):
             All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
             config.vocab_size - 1]`.
         """
-
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
+        if labels is not None and labels.max() >= self.config.vocab_size:
+            raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
+
         outputs = self.wav2vec2_conformer(
             input_values,
             attention_mask=attention_mask,
@@ -1663,9 +1665,6 @@ class Wav2Vec2ConformerForCTC(Wav2Vec2ConformerPreTrainedModel):
 
         loss = None
         if labels is not None:
-            if labels.max() >= self.config.vocab_size:
-                raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
-
             # retrieve loss input_lengths from attention_mask
             attention_mask = (
                 attention_mask if attention_mask is not None else torch.ones_like(input_values, dtype=torch.long)
@@ -1349,9 +1349,11 @@ class WavLMForCTC(WavLMPreTrainedModel):
             All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
             config.vocab_size - 1]`.
         """
-
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
+        if labels is not None and labels.max() >= self.config.vocab_size:
+            raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
+
         outputs = self.wavlm(
             input_values,
             attention_mask=attention_mask,
@@ -1367,9 +1369,6 @@ class WavLMForCTC(WavLMPreTrainedModel):
 
         loss = None
         if labels is not None:
-            if labels.max() >= self.config.vocab_size:
-                raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
-
             # retrieve loss input_lengths from attention_mask
             attention_mask = (
                 attention_mask if attention_mask is not None else torch.ones_like(input_values, dtype=torch.long)