diff --git a/examples/contrib/run_openai_gpt.py b/examples/contrib/run_openai_gpt.py
index 136e25821f1..cb17012ba51 100644
--- a/examples/contrib/run_openai_gpt.py
+++ b/examples/contrib/run_openai_gpt.py
@@ -81,7 +81,7 @@ def pre_process_datasets(encoded_datasets, input_len, cap_length, start_token, d
         n_batch = len(dataset)
         input_ids = np.zeros((n_batch, 2, input_len), dtype=np.int64)
         mc_token_ids = np.zeros((n_batch, 2), dtype=np.int64)
-        lm_labels = np.full((n_batch, 2, input_len), fill_value=-100, dtype=np.int64)
+        lm_labels = np.full((n_batch, 2, input_len), fill_value=-1, dtype=np.int64)
         mc_labels = np.zeros((n_batch,), dtype=np.int64)
         for i, (story, cont1, cont2, mc_label), in enumerate(dataset):
             with_cont1 = [start_token] + story[:cap_length] + [delimiter_token] + cont1[:cap_length] + [clf_token]
diff --git a/examples/distillation/distiller.py b/examples/distillation/distiller.py
index 53669623b6f..92ac33a503e 100644
--- a/examples/distillation/distiller.py
+++ b/examples/distillation/distiller.py
@@ -109,7 +109,7 @@ class Distiller:
         self.last_log = 0

         self.ce_loss_fct = nn.KLDivLoss(reduction="batchmean")
-        self.lm_loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
+        self.lm_loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
         if self.alpha_mse > 0.0:
             self.mse_loss_fct = nn.MSELoss(reduction="sum")
         if self.alpha_cos > 0.0:
@@ -200,7 +200,7 @@ class Distiller:
         -------
         token_ids: `torch.tensor(bs, seq_length)` - The token ids after the modifications for MLM.
         attn_mask: `torch.tensor(bs, seq_length)` - The attention mask for the self-attention.
-        mlm_labels: `torch.tensor(bs, seq_length)` - The masked languge modeling labels. There is a -100 where there is nothing to predict.
+        mlm_labels: `torch.tensor(bs, seq_length)` - The masked language modeling labels. There is a -1 where there is nothing to predict.
         """
         token_ids, lengths = batch
         token_ids, lengths = self.round_batch(x=token_ids, lengths=lengths)
@@ -244,7 +244,7 @@ class Distiller:
         )
         token_ids = token_ids.masked_scatter(pred_mask, _token_ids)

-        mlm_labels[~pred_mask] = -100  # previously `mlm_labels[1-pred_mask] = -1`, cf pytorch 1.2.0 compatibility
+        mlm_labels[~pred_mask] = -1  # previously `mlm_labels[1-pred_mask] = -1`, cf pytorch 1.2.0 compatibility

         # sanity checks
         assert 0 <= token_ids.min() <= token_ids.max() < self.vocab_size
@@ -265,7 +265,7 @@ class Distiller:
         -------
         token_ids: `torch.tensor(bs, seq_length)` - The token ids after the modifications for MLM.
         attn_mask: `torch.tensor(bs, seq_length)` - The attention mask for the self-attention.
-        clm_labels: `torch.tensor(bs, seq_length)` - The causal languge modeling labels. There is a -100 where there is nothing to predict.
+        clm_labels: `torch.tensor(bs, seq_length)` - The causal language modeling labels. There is a -1 where there is nothing to predict.
         """
         token_ids, lengths = batch
         token_ids, lengths = self.round_batch(x=token_ids, lengths=lengths)
@@ -273,7 +273,7 @@ class Distiller:

         attn_mask = torch.arange(token_ids.size(1), dtype=torch.long, device=lengths.device) < lengths[:, None]
         clm_labels = token_ids.new(token_ids.size()).copy_(token_ids)
-        clm_labels[~attn_mask] = -100  # previously `clm_labels[1-attn_mask] = -1`, cf pytorch 1.2.0 compatibility
+        clm_labels[~attn_mask] = -1  # previously `clm_labels[1-attn_mask] = -1`, cf pytorch 1.2.0 compatibility

         # sanity checks
         assert 0 <= token_ids.min() <= token_ids.max() < self.vocab_size
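The hunks above only behave correctly if the sentinel written into the labels matches the loss's ignore_index: PyTorch's nn.CrossEntropyLoss ignores -100 by default, which is why switching the labels to -1 also requires passing ignore_index=-1 explicitly, as in the distiller hunk above. A minimal sketch in plain PyTorch (not taken from the patch) of how positions labelled -1 drop out of the loss:

import torch
import torch.nn as nn

vocab_size = 10
logits = torch.randn(2, 5, vocab_size)          # (batch, seq_len, vocab)
labels = torch.randint(0, vocab_size, (2, 5))   # gold token ids
labels[:, 3:] = -1                              # e.g. padded positions carry the ignore value

loss_fct = nn.CrossEntropyLoss(ignore_index=-1)  # must match the -1 used in the labels
loss = loss_fct(logits.view(-1, vocab_size), labels.view(-1))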
""" token_ids, lengths = batch token_ids, lengths = self.round_batch(x=token_ids, lengths=lengths) @@ -273,7 +273,7 @@ class Distiller: attn_mask = torch.arange(token_ids.size(1), dtype=torch.long, device=lengths.device) < lengths[:, None] clm_labels = token_ids.new(token_ids.size()).copy_(token_ids) - clm_labels[~attn_mask] = -100 # previously `clm_labels[1-attn_mask] = -1`, cf pytorch 1.2.0 compatibility + clm_labels[~attn_mask] = -1 # previously `clm_labels[1-attn_mask] = -1`, cf pytorch 1.2.0 compatibility # sanity checks assert 0 <= token_ids.min() <= token_ids.max() < self.vocab_size diff --git a/examples/run_lm_finetuning.py b/examples/run_lm_finetuning.py index 663881649d8..c37af5ef484 100644 --- a/examples/run_lm_finetuning.py +++ b/examples/run_lm_finetuning.py @@ -206,7 +206,7 @@ def mask_tokens(inputs: torch.Tensor, tokenizer: PreTrainedTokenizer, args) -> T padding_mask = labels.eq(tokenizer.pad_token_id) probability_matrix.masked_fill_(padding_mask, value=0.0) masked_indices = torch.bernoulli(probability_matrix).bool() - labels[~masked_indices] = -100 # We only compute loss on masked tokens + labels[~masked_indices] = -1 # We only compute loss on masked tokens # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK]) indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices diff --git a/src/transformers/modeling_albert.py b/src/transformers/modeling_albert.py index d2a5d4878e5..ac973d7622b 100644 --- a/src/transformers/modeling_albert.py +++ b/src/transformers/modeling_albert.py @@ -632,8 +632,8 @@ class AlbertForMaskedLM(AlbertPreTrainedModel): r""" masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the masked language modeling loss. - Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) - Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with + Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) + Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` Returns: diff --git a/src/transformers/modeling_bert.py b/src/transformers/modeling_bert.py index 62e752b89cd..3f4047d63b1 100644 --- a/src/transformers/modeling_bert.py +++ b/src/transformers/modeling_bert.py @@ -846,8 +846,8 @@ class BertForPreTraining(BertPreTrainedModel): r""" masked_lm_labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`): Labels for computing the masked language modeling loss. - Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) - Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels + Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) + Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` next_sentence_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`): Labels for computing the next sequence prediction (classification) loss. 
diff --git a/src/transformers/modeling_ctrl.py b/src/transformers/modeling_ctrl.py
index 40e076a4982..10b4499144a 100644
--- a/src/transformers/modeling_ctrl.py
+++ b/src/transformers/modeling_ctrl.py
@@ -479,8 +479,8 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
         labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
             Labels for language modeling.
             Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
-            Indices are selected in ``[-100, 0, ..., config.vocab_size]``
-            All labels set to ``-100`` are ignored (masked), the loss is only
+            Indices are selected in ``[-1, 0, ..., config.vocab_size]``
+            All labels set to ``-1`` are ignored (masked), the loss is only
             computed for labels in ``[0, ..., config.vocab_size]``

     Return:
diff --git a/src/transformers/modeling_distilbert.py b/src/transformers/modeling_distilbert.py
index bbccdcddd7d..0698c302e9a 100644
--- a/src/transformers/modeling_distilbert.py
+++ b/src/transformers/modeling_distilbert.py
@@ -517,8 +517,8 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel):
         r"""
         masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
             Labels for computing the masked language modeling loss.
-            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
-            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
+            Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
+            Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
             in ``[0, ..., config.vocab_size]``

     Returns:
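Several docstrings in this patch note that the labels are shifted inside the model: the logit at position t is scored against the token at position t+1, and any position labelled -1 is skipped. A minimal sketch of that shift in plain PyTorch (mirroring the T5 hunk later in this patch rather than any one model's exact code):

import torch
import torch.nn as nn

vocab_size, bs, seq_len = 100, 2, 6
lm_logits = torch.randn(bs, seq_len, vocab_size)
labels = torch.randint(0, vocab_size, (bs, seq_len))
labels[:, -2:] = -1                                 # e.g. padding at the end of each sequence

shift_logits = lm_logits[..., :-1, :].contiguous()  # predictions for positions 0 .. T-2
shift_labels = labels[..., 1:].contiguous()         # targets are the next tokens 1 .. T-1
loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
loss = loss_fct(shift_logits.view(-1, vocab_size), shift_labels.view(-1))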
diff --git a/src/transformers/modeling_gpt2.py b/src/transformers/modeling_gpt2.py
index 2426ef43527..11452790739 100644
--- a/src/transformers/modeling_gpt2.py
+++ b/src/transformers/modeling_gpt2.py
@@ -547,8 +547,8 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
         labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
             Labels for language modeling.
             Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
-            Indices are selected in ``[-100, 0, ..., config.vocab_size]``
-            All labels set to ``-100`` are ignored (masked), the loss is only
+            Indices are selected in ``[-1, 0, ..., config.vocab_size]``
+            All labels set to ``-1`` are ignored (masked), the loss is only
             computed for labels in ``[0, ..., config.vocab_size]``

     Return:
@@ -655,7 +655,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
             Labels for language modeling.
             Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
             Indices are selected in ``[-1, 0, ..., config.vocab_size]``
-            All labels set to ``-100`` are ignored (masked), the loss is only
+            All labels set to ``-1`` are ignored (masked), the loss is only
             computed for labels in ``[0, ..., config.vocab_size]``
         mc_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size)`, `optional`, defaults to :obj:`None`)
             Labels for computing the multiple choice classification loss.
diff --git a/src/transformers/modeling_openai.py b/src/transformers/modeling_openai.py
index 70abd5a1dc5..e17ee3b7796 100644
--- a/src/transformers/modeling_openai.py
+++ b/src/transformers/modeling_openai.py
@@ -516,8 +516,8 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
         labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
             Labels for language modeling.
             Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
-            Indices are selected in ``[-100, 0, ..., config.vocab_size]``
-            All labels set to ``-100`` are ignored (masked), the loss is only
+            Indices are selected in ``[-1, 0, ..., config.vocab_size]``
+            All labels set to ``-1`` are ignored (masked), the loss is only
             computed for labels in ``[0, ..., config.vocab_size]``

     Return:
@@ -621,7 +621,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
             Labels for language modeling.
             Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
             Indices are selected in ``[-1, 0, ..., config.vocab_size]``
-            All labels set to ``-100`` are ignored (masked), the loss is only
+            All labels set to ``-1`` are ignored (masked), the loss is only
             computed for labels in ``[0, ..., config.vocab_size]``
         mc_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size)`, `optional`, defaults to :obj:`None`)
             Labels for computing the multiple choice classification loss.
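For the double-heads models above, lm_labels is typically built the way pre_process_datasets does at the top of this patch: pre-fill the array with -1 so unused positions are ignored, then copy the real token ids over the valid span. A small sketch with made-up sizes (the names are illustrative, not the script's):

import numpy as np

n_batch, n_choices, input_len = 4, 2, 12
lm_labels = np.full((n_batch, n_choices, input_len), fill_value=-1, dtype=np.int64)

example = [5, 8, 2, 7, 9]                  # a tokenised story + continuation
lm_labels[0, 0, : len(example)] = example  # only these positions contribute to the LM loss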
diff --git a/src/transformers/modeling_roberta.py b/src/transformers/modeling_roberta.py
index 50de77b85c1..cd2d668afbd 100644
--- a/src/transformers/modeling_roberta.py
+++ b/src/transformers/modeling_roberta.py
@@ -200,8 +200,8 @@ class RobertaForMaskedLM(BertPreTrainedModel):
         r"""
         masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
             Labels for computing the masked language modeling loss.
-            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
-            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
+            Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
+            Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
             in ``[0, ..., config.vocab_size]``

     Returns:
diff --git a/src/transformers/modeling_t5.py b/src/transformers/modeling_t5.py
index 405ebe56674..4a033ae7b28 100644
--- a/src/transformers/modeling_t5.py
+++ b/src/transformers/modeling_t5.py
@@ -802,8 +802,8 @@ class T5WithLMHeadModel(T5PreTrainedModel):
         r"""
         **lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
             Labels for computing the masked language modeling loss.
-            Indices should either be in ``[0, ..., config.vocab_size]`` or -100 (see ``input_ids`` docstring).
-            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
+            Indices should either be in ``[0, ..., config.vocab_size]`` or -1 (see ``input_ids`` docstring).
+            Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
             in ``[0, ..., config.vocab_size]``.

     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -906,7 +906,7 @@ class T5WithLMHeadModel(T5PreTrainedModel):
         if lm_labels is not None:
             shift_logits = lm_logits[..., :-1, :].contiguous()
             shift_labels = lm_labels[..., 1:].contiguous()
-            loss_fct = CrossEntropyLoss(ignore_index=-100)
+            loss_fct = CrossEntropyLoss(ignore_index=-1)
             loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
             decoder_outputs = (
                 loss,
diff --git a/src/transformers/modeling_transfo_xl.py b/src/transformers/modeling_transfo_xl.py
index 05bb5f7e3eb..57ef7592d6b 100644
--- a/src/transformers/modeling_transfo_xl.py
+++ b/src/transformers/modeling_transfo_xl.py
@@ -858,8 +858,8 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
         labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
             Labels for language modeling.
             Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
-            Indices are selected in ``[-100, 0, ..., config.vocab_size]``
-            All labels set to ``-100`` are ignored (masked), the loss is only
+            Indices are selected in ``[-1, 0, ..., config.vocab_size]``
+            All labels set to ``-1`` are ignored (masked), the loss is only
             computed for labels in ``[0, ..., config.vocab_size]``

     Return:
diff --git a/src/transformers/modeling_xlm.py b/src/transformers/modeling_xlm.py
index 9ba5540f9c8..9f75a5c9471 100644
--- a/src/transformers/modeling_xlm.py
+++ b/src/transformers/modeling_xlm.py
@@ -667,8 +667,8 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
         labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
             Labels for language modeling.
             Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
-            Indices are selected in ``[-100, 0, ..., config.vocab_size]``
-            All labels set to ``-100`` are ignored (masked), the loss is only
+            Indices are selected in ``[-1, 0, ..., config.vocab_size]``
+            All labels set to ``-1`` are ignored (masked), the loss is only
             computed for labels in ``[0, ..., config.vocab_size]``

     Return:
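Since this patch changes the sentinel from -100 to -1, label tensors produced under the old convention have to be relabelled before they reach the losses above. A hypothetical helper (not part of the patch) that performs the conversion:

import torch

def convert_ignore_index(labels: torch.Tensor, old: int = -100, new: int = -1) -> torch.Tensor:
    converted = labels.clone()
    converted[converted == old] = new  # remap the ignored positions to the new sentinel
    return converted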
diff --git a/src/transformers/modeling_xlnet.py b/src/transformers/modeling_xlnet.py
index 2720c848914..5c2bd3c31ed 100644
--- a/src/transformers/modeling_xlnet.py
+++ b/src/transformers/modeling_xlnet.py
@@ -993,8 +993,8 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
         labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
             Labels for language modeling.
             Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
-            Indices are selected in ``[-100, 0, ..., config.vocab_size]``
-            All labels set to ``-100`` are ignored (masked), the loss is only
+            Indices are selected in ``[-1, 0, ..., config.vocab_size]``
+            All labels set to ``-1`` are ignored (masked), the loss is only
             computed for labels in ``[0, ..., config.vocab_size]``

     Return:
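A quick way to check that -1 positions are truly excluded from every loss touched by this patch: with the default reduction='mean', CrossEntropyLoss averages only over non-ignored targets, so filtering them out by hand gives the same value. A small self-contained check, assuming plain PyTorch:

import torch
import torch.nn as nn

vocab_size = 50
logits = torch.randn(3, 7, vocab_size)
labels = torch.randint(0, vocab_size, (3, 7))
labels[:, 5:] = -1

full = nn.CrossEntropyLoss(ignore_index=-1)(logits.view(-1, vocab_size), labels.view(-1))

keep = labels.view(-1) != -1  # keep only the positions with real targets
manual = nn.CrossEntropyLoss()(logits.view(-1, vocab_size)[keep], labels.view(-1)[keep])
assert torch.allclose(full, manual)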