Update pipelines.py

Modify the QA pipeline to consider all features of each example before generating the top-k answers.
The current pipeline takes only one SquadExample, one SquadFeature, one start-logit list, and one end-logit list to retrieve the answer. This is incorrect, because a single SquadExample can produce multiple SquadFeatures.
This commit is contained in:
Rishabh Manoj 2020-01-08 21:12:34 +05:30 committed by GitHub
parent 16ce15ed4b
commit f26a353057
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -705,9 +705,16 @@ class QuestionAnsweringPipeline(Pipeline):
# Convert inputs to features
examples = self._args_parser(*texts, **kwargs)
features = squad_convert_examples_to_features(
examples, self.tokenizer, kwargs["max_seq_len"], kwargs["doc_stride"], kwargs["max_question_len"], False
)
features_list = [ squad_convert_examples_to_features(
[example],
self.tokenizer,
kwargs["max_seq_len"],
kwargs["doc_stride"],
kwargs["max_question_len"],
False
) for example in examples ]
all_answers = []
for features, example in zip(features_list, examples):
fw_args = self.inputs_for_model([f.__dict__ for f in features])
# Manage tensor allocation on correct device
@ -724,7 +731,7 @@ class QuestionAnsweringPipeline(Pipeline):
start, end = start.cpu().numpy(), end.cpu().numpy()
answers = []
for (example, feature, start_, end_) in zip(examples, features, start, end):
for (feature, start_, end_) in zip(features, start, end):
# Normalize logits and spans to retrieve the answer
start_ = np.exp(start_) / np.sum(np.exp(start_))
end_ = np.exp(end_) / np.sum(np.exp(end_))
@ -751,9 +758,12 @@ class QuestionAnsweringPipeline(Pipeline):
}
for s, e, score in zip(starts, ends, scores)
]
if len(answers) == 1:
return answers[0]
return answers
answers = sorted(answers, key = lambda x:x['score'], reverse=True)[:kwargs["topk"]]
all_answers+=answers
if len(all_answers) == 1:
return all_answers[0]
return all_answers
def decode(self, start: np.ndarray, end: np.ndarray, topk: int, max_answer_len: int) -> Tuple:
"""