moved labels to the same device as logits for BLOOM, GPT Neo, GPT NeoX, RoBERTa and ViT models (#22663)

Move labels to the same device as the logits before computing the loss, so that models dispatched across several devices (model parallelism, e.g. via device_map="auto") no longer fail with a device-mismatch error in the loss computation. The same fix also lands in CamemBERT, XLM-RoBERTa, RoBERTa-PreLayerNorm and ViT Hybrid, whose heads mirror the RoBERTa/ViT implementations.
Author: Arun Brahma, 2023-04-08 02:34:54 +05:30 (committed by GitHub)
parent 6db23af50c
commit 656e869a45
9 changed files with 52 additions and 0 deletions
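
For context, a minimal sketch of the scenario this commit fixes, assuming a machine with at least two GPUs (the checkpoint name is illustrative): with device_map="auto", the layer that produces the logits can end up on a different GPU than the one the labels were passed on, and the loss computation then raises a device-mismatch RuntimeError unless the labels are first moved to logits.device.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

name = "bigscience/bloom-560m"  # illustrative; any sharded causal LM shows the issue
tokenizer = AutoTokenizer.from_pretrained(name)
# device_map="auto" dispatches the model's layers across the available devices.
model = AutoModelForCausalLM.from_pretrained(name, device_map="auto")

inputs = tokenizer("Hello world", return_tensors="pt").to("cuda:0")
# labels arrive on cuda:0 while lm_logits may live on another GPU; with this
# commit the model moves labels to lm_logits.device before computing the loss.
outputs = model(**inputs, labels=inputs["input_ids"])
print(outputs.loss)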

src/transformers/models/bloom/modeling_bloom.py

@@ -927,6 +927,8 @@ class BloomForCausalLM(BloomPreTrainedModel):
         loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(lm_logits.device)
             # Shift so that tokens < n predict n
             shift_logits = lm_logits[..., :-1, :].contiguous()
             shift_labels = labels[..., 1:].contiguous()
@@ -1194,6 +1196,8 @@ class BloomForTokenClassification(BloomPreTrainedModel):
         loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(logits.device)
             batch_size, seq_length = labels.shape
             loss_fct = CrossEntropyLoss()
             loss = loss_fct(
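
The causal-LM hunk above feeds the relocated labels straight into the usual next-token shift. A short sketch of that loss computation, reusing the names from the hunk (the flattening is assumed to match the surrounding code):

from torch.nn import CrossEntropyLoss

# Position i predicts token i + 1, so drop the last logit and the first label.
shift_logits = lm_logits[..., :-1, :].contiguous()  # (batch, seq - 1, vocab)
shift_labels = labels[..., 1:].contiguous()         # (batch, seq - 1)
batch_size, seq_length, vocab_size = shift_logits.shape
loss_fct = CrossEntropyLoss()
loss = loss_fct(
    shift_logits.view(batch_size * seq_length, vocab_size),
    shift_labels.view(batch_size * seq_length),
)

Because labels was already moved to lm_logits.device, both arguments to the loss now live on the same GPU.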

src/transformers/models/camembert/modeling_camembert.py

@@ -1015,6 +1015,8 @@ class CamembertForMaskedLM(CamembertPreTrainedModel):
         masked_lm_loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(prediction_scores.device)
             loss_fct = CrossEntropyLoss()
             masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
@@ -1097,6 +1099,8 @@ class CamembertForSequenceClassification(CamembertPreTrainedModel):
         loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(logits.device)
             if self.config.problem_type is None:
                 if self.num_labels == 1:
                     self.config.problem_type = "regression"
@@ -1210,6 +1214,8 @@ class CamembertForMultipleChoice(CamembertPreTrainedModel):
         loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(reshaped_logits.device)
             loss_fct = CrossEntropyLoss()
             loss = loss_fct(reshaped_logits, labels)
@@ -1297,6 +1303,8 @@ class CamembertForTokenClassification(CamembertPreTrainedModel):
         loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(logits.device)
             loss_fct = CrossEntropyLoss()
             loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
@@ -1534,6 +1542,8 @@ class CamembertForCausalLM(CamembertPreTrainedModel):
         lm_loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(prediction_scores.device)
             # we are doing next-token prediction; shift prediction scores and input ids by one
             shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
             labels = labels[:, 1:].contiguous()

src/transformers/models/gpt_neo/modeling_gpt_neo.py

@@ -757,6 +757,8 @@ class GPTNeoForCausalLM(GPTNeoPreTrainedModel):
         loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(lm_logits.device)
             # Compute loss in fp32 to match with mesh-tf version
             # https://github.com/EleutherAI/gpt-neo/blob/89ce74164da2fb16179106f54e2269b5da8db333/models/gpt2/gpt2.py#L179
             lm_logits = lm_logits.to(torch.float32)
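
GPT Neo additionally upcasts the logits before the shift. A hedged sketch of the lines that follow this hunk (the cast back to the hidden-state dtype is assumed to match the surrounding code):

import torch
from torch.nn import CrossEntropyLoss

# Loss is computed in fp32 for parity with the original mesh-tf implementation.
lm_logits = lm_logits.to(torch.float32)
shift_logits = lm_logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
loss_fct = CrossEntropyLoss()
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
# Cast back so the returned logits and loss keep the model's compute dtype.
lm_logits = lm_logits.to(hidden_states.dtype)
loss = loss.to(hidden_states.dtype)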

src/transformers/models/gpt_neox/modeling_gpt_neox.py

@@ -677,6 +677,8 @@ class GPTNeoXForCausalLM(GPTNeoXPreTrainedModel):
         lm_loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(lm_logits.device)
             # we are doing next-token prediction; shift prediction scores and input ids by one
             shift_logits = lm_logits[:, :-1, :].contiguous()
             labels = labels[:, 1:].contiguous()

src/transformers/models/roberta/modeling_roberta.py

@@ -993,6 +993,8 @@ class RobertaForCausalLM(RobertaPreTrainedModel):
         lm_loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(prediction_scores.device)
             # we are doing next-token prediction; shift prediction scores and input ids by one
             shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
             labels = labels[:, 1:].contiguous()
@@ -1113,6 +1115,8 @@ class RobertaForMaskedLM(RobertaPreTrainedModel):
         masked_lm_loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(prediction_scores.device)
             loss_fct = CrossEntropyLoss()
             masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
@@ -1225,6 +1229,8 @@ class RobertaForSequenceClassification(RobertaPreTrainedModel):
         loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(logits.device)
             if self.config.problem_type is None:
                 if self.num_labels == 1:
                     self.config.problem_type = "regression"
@@ -1335,6 +1341,8 @@ class RobertaForMultipleChoice(RobertaPreTrainedModel):
         loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(reshaped_logits.device)
             loss_fct = CrossEntropyLoss()
             loss = loss_fct(reshaped_logits, labels)
@@ -1421,6 +1429,8 @@ class RobertaForTokenClassification(RobertaPreTrainedModel):
         loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(logits.device)
             loss_fct = CrossEntropyLoss()
             loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
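
The multiple-choice hunks are the one place where the target device comes from reshaped_logits rather than logits. A sketch of where that tensor comes from, assuming the standard multiple-choice head layout (variable names taken from the hunk, the rest is illustrative method context):

from torch.nn import CrossEntropyLoss

# Inputs arrive as (batch, num_choices, seq_len) and are flattened to
# (batch * num_choices, seq_len) before the encoder; the classifier then emits
# one score per choice, which is folded back to (batch, num_choices).
num_choices = input_ids.shape[1]
logits = self.classifier(pooled_output)         # (batch * num_choices, 1)
reshaped_logits = logits.view(-1, num_choices)  # (batch, num_choices)
labels = labels.to(reshaped_logits.device)      # the fix from this commit
loss = CrossEntropyLoss()(reshaped_logits, labels)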

src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py

@@ -1000,6 +1000,8 @@ class RobertaPreLayerNormForCausalLM(RobertaPreLayerNormPreTrainedModel):
         lm_loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(prediction_scores.device)
             # we are doing next-token prediction; shift prediction scores and input ids by one
             shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
             labels = labels[:, 1:].contiguous()
@@ -1124,6 +1126,8 @@ class RobertaPreLayerNormForMaskedLM(RobertaPreLayerNormPreTrainedModel):
         masked_lm_loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(prediction_scores.device)
             loss_fct = CrossEntropyLoss()
             masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
@@ -1236,6 +1240,8 @@ class RobertaPreLayerNormForSequenceClassification(RobertaPreLayerNormPreTrainedModel):
         loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(logits.device)
             if self.config.problem_type is None:
                 if self.num_labels == 1:
                     self.config.problem_type = "regression"
@@ -1349,6 +1355,8 @@ class RobertaPreLayerNormForMultipleChoice(RobertaPreLayerNormPreTrainedModel):
         loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(reshaped_logits.device)
             loss_fct = CrossEntropyLoss()
             loss = loss_fct(reshaped_logits, labels)
@@ -1434,6 +1442,8 @@ class RobertaPreLayerNormForTokenClassification(RobertaPreLayerNormPreTrainedModel):
         loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(logits.device)
             loss_fct = CrossEntropyLoss()
             loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

src/transformers/models/vit/modeling_vit.py

@@ -809,6 +809,8 @@ class ViTForImageClassification(ViTPreTrainedModel):
         loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(logits.device)
             if self.config.problem_type is None:
                 if self.num_labels == 1:
                     self.config.problem_type = "regression"
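
The classification hunks cut off just as problem_type is being inferred. For completeness, a sketch of the full dispatch these heads run after that inference step (this is the library's standard three-way split, reproduced here from memory and assuming the same method context):

from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

if self.config.problem_type == "regression":
    loss_fct = MSELoss()
    if self.num_labels == 1:
        loss = loss_fct(logits.squeeze(), labels.squeeze())
    else:
        loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
    loss_fct = CrossEntropyLoss()
    loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
    loss_fct = BCEWithLogitsLoss()
    loss = loss_fct(logits, labels)

Moving labels to logits.device up front means every branch of this dispatch is device-safe.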

src/transformers/models/vit_hybrid/modeling_vit_hybrid.py

@@ -702,6 +702,8 @@ class ViTHybridForImageClassification(ViTHybridPreTrainedModel):
         loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(logits.device)
             if self.config.problem_type is None:
                 if self.num_labels == 1:
                     self.config.problem_type = "regression"

src/transformers/models/xlm_roberta/modeling_xlm_roberta.py

@@ -997,6 +997,8 @@ class XLMRobertaForCausalLM(XLMRobertaPreTrainedModel):
         lm_loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(prediction_scores.device)
             # we are doing next-token prediction; shift prediction scores and input ids by one
             shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
             labels = labels[:, 1:].contiguous()
@@ -1121,6 +1123,8 @@ class XLMRobertaForMaskedLM(XLMRobertaPreTrainedModel):
         masked_lm_loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(prediction_scores.device)
             loss_fct = CrossEntropyLoss()
             masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
@@ -1235,6 +1239,8 @@ class XLMRobertaForSequenceClassification(XLMRobertaPreTrainedModel):
         loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(logits.device)
             if self.config.problem_type is None:
                 if self.num_labels == 1:
                     self.config.problem_type = "regression"
@@ -1348,6 +1354,8 @@ class XLMRobertaForMultipleChoice(XLMRobertaPreTrainedModel):
         loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(reshaped_logits.device)
             loss_fct = CrossEntropyLoss()
             loss = loss_fct(reshaped_logits, labels)
@@ -1435,6 +1443,8 @@ class XLMRobertaForTokenClassification(XLMRobertaPreTrainedModel):
         loss = None
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(logits.device)
             loss_fct = CrossEntropyLoss()
             loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
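
Finally, a quick end-to-end check of the fix (a hypothetical snippet, not part of this commit: it needs at least two GPUs, accelerate installed, and a checkpoint the dispatcher can shard; names are illustrative):

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = AutoModelForSequenceClassification.from_pretrained("roberta-base", device_map="auto")

batch = tokenizer(["good movie", "bad movie"], padding=True, return_tensors="pt").to("cuda:0")
labels = torch.tensor([1, 0], device="cuda:0")
out = model(**batch, labels=labels)
# Before this commit the loss could raise a device-mismatch RuntimeError when
# the classifier head landed on a GPU other than cuda:0.
assert out.loss is not None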