From 60005f464d2069801a2cf26dc0f011da8ed639b0 Mon Sep 17 00:00:00 2001 From: jeonsworld <37530102+jeonsworld@users.noreply.github.com> Date: Sat, 30 Mar 2019 14:50:17 +0900 Subject: [PATCH] Update pregenerate_training_data.py If randint returns rand_end (its upper bound is inclusive), searchsorted returns a sampled_doc_index equal to current_idx, so the current document itself gets sampled. Example: cumsum_max = {int64} 30 doc_cumsum = {ndarray} [ 5 7 11 19 30] doc_lengths = {list} : [5, 2, 4, 8, 11] If current_idx = 1, then rand_start = 7 and rand_end = 35, so sentence_index = randint(7, 35) % cumsum_max. If randint returns 35, sentence_index becomes 5. If sentence_index is 5, np.searchsorted returns 1, which is equal to current_idx. --- examples/lm_finetuning/pregenerate_training_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/lm_finetuning/pregenerate_training_data.py b/examples/lm_finetuning/pregenerate_training_data.py index 498ab22333a..8cc28d2e784 100644 --- a/examples/lm_finetuning/pregenerate_training_data.py +++ b/examples/lm_finetuning/pregenerate_training_data.py @@ -49,7 +49,7 @@ class DocumentDatabase: self._precalculate_doc_weights() rand_start = self.doc_cumsum[current_idx] rand_end = rand_start + self.cumsum_max - self.doc_lengths[current_idx] - sentence_index = randint(rand_start, rand_end) % self.cumsum_max + sentence_index = randint(rand_start, rand_end-1) % self.cumsum_max sampled_doc_index = np.searchsorted(self.doc_cumsum, sentence_index, side='right') else: # If we don't use sentence weighting, then every doc has an equal chance to be chosen