From 0a632f076d6b275690176b79c64c5559e1240b05 Mon Sep 17 00:00:00 2001
From: Ekagra Ranjan
Date: Wed, 7 Sep 2022 00:50:12 +0530
Subject: [PATCH] Fix incorrect size of input for 1st strided window length in
 `Perplexity of fixed-length models` (#18906)

* update the PPL for stride 512

* fix 1st strided window size

* linting

* fix typo

* styling
---
 docs/source/en/perplexity.mdx | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/docs/source/en/perplexity.mdx b/docs/source/en/perplexity.mdx
index 3706a40091c..01f861c99c5 100644
--- a/docs/source/en/perplexity.mdx
+++ b/docs/source/en/perplexity.mdx
@@ -101,22 +101,32 @@ from tqdm import tqdm
 
 max_length = model.config.n_positions
 stride = 512
+seq_len = encodings.input_ids.size(1)
 
 nlls = []
-for i in tqdm(range(0, encodings.input_ids.size(1), stride)):
-    begin_loc = max(i + stride - max_length, 0)
-    end_loc = min(i + stride, encodings.input_ids.size(1))
-    trg_len = end_loc - i  # may be different from stride on last loop
+prev_end_loc = 0
+for begin_loc in tqdm(range(0, seq_len, stride)):
+    end_loc = min(begin_loc + max_length, seq_len)
+    trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
     input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
     target_ids = input_ids.clone()
    target_ids[:, :-trg_len] = -100
 
     with torch.no_grad():
         outputs = model(input_ids, labels=target_ids)
-        neg_log_likelihood = outputs[0] * trg_len
+
+        # loss is calculated using CrossEntropyLoss which averages over input tokens.
+        # Multiply it with trg_len to get the summation instead of average.
+        # We will take average over all the tokens to get the true average
+        # in the last step of this example.
+        neg_log_likelihood = outputs.loss * trg_len
 
     nlls.append(neg_log_likelihood)
 
+    prev_end_loc = end_loc
+    if end_loc == seq_len:
+        break
+
 ppl = torch.exp(torch.stack(nlls).sum() / end_loc)
 ```
 
@@ -126,5 +136,5 @@ and the better the reported perplexity will typically be.
 
 When we run the above with `stride = 1024`, i.e. no overlap, the resulting PPL is `19.64`, which is about the same
 as the `19.93` reported in the GPT-2 paper. By using `stride = 512` and thereby employing our striding window
-strategy, this jumps down to `16.53`. This is not only a more favorable score, but is calculated in a way that is
+strategy, this jumps down to `16.44`. This is not only a more favorable score, but is calculated in a way that is
 closer to the true autoregressive decomposition of a sequence likelihood.
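
Why the first strided window size was wrong: in the old loop the first iteration covers only `stride` tokens rather than the full `max_length` context. The sketch below is not part of the patch; it simply contrasts the window boundaries produced by the old and new loops, using illustrative values for `seq_len`, `max_length`, and `stride`.

```python
# Illustrative comparison of old vs. new window boundaries (toy values, not part of the patch).
max_length = 1024
stride = 512
seq_len = 2000

# Old loop: the first window spans only `stride` tokens (512), not `max_length`.
old_windows = []
for i in range(0, seq_len, stride):
    begin_loc = max(i + stride - max_length, 0)
    end_loc = min(i + stride, seq_len)
    old_windows.append((begin_loc, end_loc))

# New loop: every window spans up to `max_length` tokens, and only the
# `trg_len` tokens not covered by the previous window are scored.
new_windows = []
prev_end_loc = 0
for begin_loc in range(0, seq_len, stride):
    end_loc = min(begin_loc + max_length, seq_len)
    trg_len = end_loc - prev_end_loc
    new_windows.append((begin_loc, end_loc, trg_len))
    prev_end_loc = end_loc
    if end_loc == seq_len:
        break

print(old_windows[0])  # (0, 512)        -> first window is only 512 tokens long
print(new_windows[0])  # (0, 1024, 1024) -> first window uses the full context
```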
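The new in-line comment about `CrossEntropyLoss` can also be checked numerically: the model returns the *mean* NLL over a window's scored tokens, so multiplying by `trg_len` restores the per-window sum, and dividing the grand total by the number of scored tokens gives the true per-token average before exponentiation. A minimal sketch with made-up per-token NLL values (not part of the patch):

```python
import torch

# Toy per-token negative log-likelihoods for two windows (assumed values).
window_nlls = [torch.tensor([2.0, 3.0, 4.0]), torch.tensor([1.0, 5.0])]

# What the model returns per window: the *mean* NLL over that window's
# scored tokens (CrossEntropyLoss default reduction).
mean_losses = [w.mean() for w in window_nlls]  # [3.0, 3.0]
trg_lens = [w.numel() for w in window_nlls]    # [3, 2]

# Multiplying each mean by its trg_len restores the per-window *sum* ...
nlls = [loss * n for loss, n in zip(mean_losses, trg_lens)]

# ... so dividing the grand total by all scored tokens gives the true average,
# identical to averaging over every token at once.
total_tokens = sum(trg_lens)
avg_nll = torch.stack(nlls).sum() / total_tokens
assert torch.isclose(avg_nll, torch.cat(window_nlls).mean())

ppl = torch.exp(avg_nll)  # perplexity = exp(average NLL per token)
```

In the patch itself the divisor is `end_loc`, which equals `seq_len` once the loop breaks, so it plays the role of `total_tokens` in this sketch.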