transformers/tests/test_pipelines_conversational.py
Nicolas Patry aeb18b9224
Adding new encoder_no_repeat_ngram_size to generate. (#9984)
Adding new `encoder_no_repeat_ngram_size` to `generate`.

Blenderbot results seemed off compared to the original ParlAI script:
`https://parl.ai/projects/recipes/`. Notably, the model seemed to repeat
a lot of what was said during the conversation.

The actual problem was that ParlAI's `no_repeat_ngram_size` applies
to the `encoder_input_ids`, whereas HF's `no_repeat_ngram_size` applies
to the previously generated ids (within the decoder). Blenderbot keeps
the conversation history in the `encoder` part, which explains why HF's
implementation produced the repetitions.
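
For reference, here is a minimal usage sketch of the new argument (the model
name is taken from the tests below; the exact output depends on the checkpoint):

```python
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/blenderbot-400M-distill")

# Block any 3-gram of the reply that already appears in the encoder input
# (i.e. in the conversation history).
inputs = tokenizer("Lasagne is my favorite Italian dish. Do you like lasagne?", return_tensors="pt")
reply_ids = model.generate(**inputs, encoder_no_repeat_ngram_size=3)
print(tokenizer.batch_decode(reply_ids, skip_special_tokens=True))
```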

This fix focuses on blenderbot (*not* blenderbot-small) and adds tests
for both, because their configurations are quite different.

This change includes:

- Adding a new EncoderNoRepeatLogitProcessor (a minimal sketch of the idea follows this list).
- Adding 1 new arg to `generate` (`encoder_no_repeat_ngram_size`)
- Adding 1 new config parameter `encoder_no_repeat_ngram_size`.
- Adding 2 tests: one for the pipeline (high level, with inputs that
exhibited the repeat behavior) and one low-level test for
EncoderNoRepeatLogitProcessor.
- Factoring NoRepeatLogitProcessor so that its logic can be reused.
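
For clarity, a rough sketch of what the new processor does (illustrative only,
not the actual implementation; the helper names below are made up):

```python
import torch


def encoder_ngrams(encoder_input_ids, ngram_size):
    # Map each (ngram_size - 1)-token prefix seen in the encoder input to the
    # set of tokens that followed it.
    tokens = encoder_input_ids.tolist()
    ngrams = {}
    for i in range(len(tokens) - ngram_size + 1):
        prefix = tuple(tokens[i : i + ngram_size - 1])
        ngrams.setdefault(prefix, set()).add(tokens[i + ngram_size - 1])
    return ngrams


def block_encoder_ngrams(scores, encoder_input_ids, generated_ids, ngram_size):
    # Ban every token that would complete an n-gram already present in the
    # encoder input by setting its score to -inf.
    ngrams = encoder_ngrams(encoder_input_ids, ngram_size)
    prefix = tuple(generated_ids.tolist()[-(ngram_size - 1) :])
    for token in ngrams.get(prefix, set()):
        scores[token] = -float("inf")
    return scores


# With encoder input ids [1, 2, 3] and ngram_size=2, a generated prefix ending
# in 2 may not be followed by 3.
scores = block_encoder_ngrams(torch.zeros(10), torch.tensor([1, 2, 3]), torch.tensor([5, 2]), 2)
assert scores[3] == -float("inf")
```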

Further work:

- The Blenderbot conversational pipeline still does not behave correctly,
as the way input is prepared within the pipeline is still incorrect
(follow-up PR).
- Blenderbot allows the bot to have personas, which is done by
prepending "your persona: XXXX" to the input; this could also be
explored in a follow-up PR.

@patrickvonplaten
@LysandreJik

* Update src/transformers/generation_logits_process.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Update src/transformers/generation_utils.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Update src/transformers/generation_utils.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Update src/transformers/configuration_utils.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Doc quality.

* Fixing test.

* Last fixes.

* Fixing to account for batch_size.

* Update src/transformers/configuration_utils.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/generation_utils.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2021-02-04 15:00:18 +01:00


# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Conversation,
    ConversationalPipeline,
    is_torch_available,
    pipeline,
)
from transformers.testing_utils import is_pipeline_test, require_torch, slow, torch_device

from .test_pipelines_common import MonoInputPipelineCommonMixin


if is_torch_available():
    import torch

    from transformers.models.gpt2 import GPT2Config, GPT2LMHeadModel

DEFAULT_DEVICE_NUM = -1 if torch_device == "cpu" else 0


@is_pipeline_test
class SimpleConversationPipelineTests(unittest.TestCase):
    def get_pipeline(self):
        # When
        config = GPT2Config(
            vocab_size=263,
            n_ctx=128,
            max_length=128,
            n_embd=64,
            n_layer=1,
            n_head=8,
            bos_token_id=256,
            eos_token_id=257,
        )
        model = GPT2LMHeadModel(config)
        # Force model output to be L
        V, D = model.lm_head.weight.shape
        bias = torch.zeros(V, requires_grad=True)
        weight = torch.zeros((V, D), requires_grad=True)
        bias[76] = 1
        model.lm_head.bias = torch.nn.Parameter(bias)
        model.lm_head.weight = torch.nn.Parameter(weight)

        # # Created with:
        # import tempfile
        # from tokenizers import Tokenizer, models
        # from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
        # vocab = [(chr(i), i) for i in range(256)]
        # tokenizer = Tokenizer(models.Unigram(vocab))
        # with tempfile.NamedTemporaryFile() as f:
        #     tokenizer.save(f.name)
        #     real_tokenizer = PreTrainedTokenizerFast(tokenizer_file=f.name, eos_token="<eos>", bos_token="<bos>")
        # real_tokenizer._tokenizer.save("dummy.json")
        # Special tokens are automatically added at load time.
        tokenizer = AutoTokenizer.from_pretrained("Narsil/small_conversational_test")
        conversation_agent = pipeline(
            task="conversational", device=DEFAULT_DEVICE_NUM, model=model, tokenizer=tokenizer
        )
        return conversation_agent

    @require_torch
    def test_integration_torch_conversation(self):
        conversation_agent = self.get_pipeline()
        conversation_1 = Conversation("Going to the movies tonight - any suggestions?")
        conversation_2 = Conversation("What's the last book you have read?")

        self.assertEqual(len(conversation_1.past_user_inputs), 0)
        self.assertEqual(len(conversation_2.past_user_inputs), 0)

        with self.assertLogs("transformers", level="WARNING") as log:
            result = conversation_agent([conversation_1, conversation_2], max_length=48)
            self.assertEqual(len(log.output), 2)
            self.assertIn("You might consider trimming the early phase of the conversation", log.output[0])
            self.assertIn("Setting `pad_token_id`", log.output[1])

        # Two conversations in one pass
        self.assertEqual(result, [conversation_1, conversation_2])
        self.assertEqual(
            result,
            [
                Conversation(
                    None,
                    past_user_inputs=["Going to the movies tonight - any suggestions?"],
                    generated_responses=["L"],
                ),
                Conversation(
                    None, past_user_inputs=["What's the last book you have read?"], generated_responses=["L"]
                ),
            ],
        )

        # One conversation with history
        conversation_2.add_user_input("Why do you recommend it?")
        with self.assertLogs("transformers", level="WARNING") as log:
            result = conversation_agent(conversation_2, max_length=64)
            self.assertEqual(len(log.output), 3)
            self.assertIn("Cutting history off because it's too long", log.output[0])
            self.assertIn("You might consider trimming the early phase of the conversation", log.output[1])
            self.assertIn("Setting `pad_token_id`", log.output[2])

        self.assertEqual(result, conversation_2)
        self.assertEqual(
            result,
            Conversation(
                None,
                past_user_inputs=["What's the last book you have read?", "Why do you recommend it?"],
                generated_responses=["L", "L"],
            ),
        )

    @require_torch
    def test_history_cache(self):
        conversation_agent = self.get_pipeline()
        conversation = Conversation(
            "Why do you recommend it?",
            past_user_inputs=["What's the last book you have read?"],
            generated_responses=["b"],
        )
        with self.assertLogs("transformers", level="WARNING") as log:
            _ = conversation_agent(conversation, max_length=64)
            self.assertEqual(len(log.output), 3)
            self.assertIn("Cutting history off because it's too long (63 > 32) for underlying model", log.output[0])
            self.assertIn("63 is bigger than 0.9 * max_length: 64", log.output[1])
            self.assertIn("Setting `pad_token_id`", log.output[2])
        self.assertEqual(conversation._index, 1)
        self.assertEqual(
            conversation._history,
            [
                87,
                104,
                97,
                116,
                39,
                115,
                32,
                116,
                104,
                101,
                32,
                108,
                97,
                115,
                116,
                32,
                98,
                111,
                111,
                107,
                32,
                121,
                111,
                117,
                32,
                104,
                97,
                118,
                101,
                32,
                114,
                101,
                97,
                100,
                63,
                259,  # EOS
                98,  # b
                259,  # EOS
            ],
        )


class ConversationalPipelineTests(MonoInputPipelineCommonMixin, unittest.TestCase):
    pipeline_task = "conversational"
    small_models = []  # Models tested without the @slow decorator
    large_models = ["microsoft/DialoGPT-medium"]  # Models tested with the @slow decorator
    invalid_inputs = ["Hi there!", Conversation()]

    def _test_pipeline(
        self, nlp
    ):  # override the default test method to check that the output is a `Conversation` object
        self.assertIsNotNone(nlp)

        # We need to recreate conversation for successive tests to pass as
        # Conversation objects get *consumed* by the pipeline
        conversation = Conversation("Hi there!")
        mono_result = nlp(conversation)
        self.assertIsInstance(mono_result, Conversation)

        conversations = [Conversation("Hi there!"), Conversation("How are you?")]
        multi_result = nlp(conversations)
        self.assertIsInstance(multi_result, list)
        self.assertIsInstance(multi_result[0], Conversation)
        # Conversation have been consumed and are not valid anymore
        # Inactive conversations passed to the pipeline raise a ValueError
        self.assertRaises(ValueError, nlp, conversation)
        self.assertRaises(ValueError, nlp, conversations)

        for bad_input in self.invalid_inputs:
            self.assertRaises(Exception, nlp, bad_input)
        self.assertRaises(Exception, nlp, self.invalid_inputs)

    @require_torch
    @slow
    def test_integration_torch_conversation(self):
        # When
        nlp = pipeline(task="conversational", device=DEFAULT_DEVICE_NUM)
        conversation_1 = Conversation("Going to the movies tonight - any suggestions?")
        conversation_2 = Conversation("What's the last book you have read?")
        # Then
        self.assertEqual(len(conversation_1.past_user_inputs), 0)
        self.assertEqual(len(conversation_2.past_user_inputs), 0)
        # When
        result = nlp([conversation_1, conversation_2], do_sample=False, max_length=1000)
        # Then
        self.assertEqual(result, [conversation_1, conversation_2])
        self.assertEqual(len(result[0].past_user_inputs), 1)
        self.assertEqual(len(result[1].past_user_inputs), 1)
        self.assertEqual(len(result[0].generated_responses), 1)
        self.assertEqual(len(result[1].generated_responses), 1)
        self.assertEqual(result[0].past_user_inputs[0], "Going to the movies tonight - any suggestions?")
        self.assertEqual(result[0].generated_responses[0], "The Big Lebowski")
        self.assertEqual(result[1].past_user_inputs[0], "What's the last book you have read?")
        self.assertEqual(result[1].generated_responses[0], "The Last Question")
        # When
        conversation_2.add_user_input("Why do you recommend it?")
        result = nlp(conversation_2, do_sample=False, max_length=1000)
        # Then
        self.assertEqual(result, conversation_2)
        self.assertEqual(len(result.past_user_inputs), 2)
        self.assertEqual(len(result.generated_responses), 2)
        self.assertEqual(result.past_user_inputs[1], "Why do you recommend it?")
        self.assertEqual(result.generated_responses[1], "It's a good book.")

    @require_torch
    @slow
    def test_integration_torch_conversation_truncated_history(self):
        # When
        nlp = pipeline(task="conversational", min_length_for_response=24, device=DEFAULT_DEVICE_NUM)
        conversation_1 = Conversation("Going to the movies tonight - any suggestions?")
        # Then
        self.assertEqual(len(conversation_1.past_user_inputs), 0)
        # When
        result = nlp(conversation_1, do_sample=False, max_length=36)
        # Then
        self.assertEqual(result, conversation_1)
        self.assertEqual(len(result.past_user_inputs), 1)
        self.assertEqual(len(result.generated_responses), 1)
        self.assertEqual(result.past_user_inputs[0], "Going to the movies tonight - any suggestions?")
        self.assertEqual(result.generated_responses[0], "The Big Lebowski")
        # When
        conversation_1.add_user_input("Is it an action movie?")
        result = nlp(conversation_1, do_sample=False, max_length=36)
        # Then
        self.assertEqual(result, conversation_1)
        self.assertEqual(len(result.past_user_inputs), 2)
        self.assertEqual(len(result.generated_responses), 2)
        self.assertEqual(result.past_user_inputs[1], "Is it an action movie?")
        self.assertEqual(result.generated_responses[1], "It's a comedy.")

    @require_torch
    @slow
    def test_integration_torch_conversation_blenderbot_400M(self):
        tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill")
        model = AutoModelForSeq2SeqLM.from_pretrained("facebook/blenderbot-400M-distill")
        nlp = ConversationalPipeline(model=model, tokenizer=tokenizer)

        conversation_1 = Conversation("hello")
        result = nlp(
            conversation_1,
        )
        self.assertEqual(
            result.generated_responses[0],
            # ParlAI implementation output, we have a different one, but it's our
            # second best, you can check by using num_return_sequences=10
            # " Hello! How are you? I'm just getting ready to go to work, how about you?",
            " Hello! How are you doing today? I just got back from a walk with my dog.",
        )

        conversation_1 = Conversation(" Lasagne hello")
        result = nlp(conversation_1, encoder_no_repeat_ngram_size=3)
        self.assertEqual(
            result.generated_responses[0],
            " Lasagne is my favorite Italian dish. Do you like lasagne?",
        )

        conversation_1 = Conversation(
            "Lasagne hello Lasagne is my favorite Italian dish. Do you like lasagne? I like lasagne."
        )
        result = nlp(
            conversation_1,
            encoder_no_repeat_ngram_size=3,
        )
        self.assertEqual(
            result.generated_responses[0],
            # ParlAI implementation output, we have a different one, but it's our
            # second best, you can check by using num_return_sequences=10
            # " Hello! How are you? I'm just getting ready to go to work, how about you?",
            " Lasagne is a traditional Italian dish consisting of a yeasted flatbread typically topped with tomato sauce and cheese.",
        )

    @require_torch
    @slow
    def test_integration_torch_conversation_encoder_decoder(self):
        # When
        tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot_small-90M")
        model = AutoModelForSeq2SeqLM.from_pretrained("facebook/blenderbot_small-90M")
        nlp = ConversationalPipeline(model=model, tokenizer=tokenizer, device=DEFAULT_DEVICE_NUM)

        conversation_1 = Conversation("My name is Sarah and I live in London")
        conversation_2 = Conversation("Going to the movies tonight, What movie would you recommend? ")
        # Then
        self.assertEqual(len(conversation_1.past_user_inputs), 0)
        self.assertEqual(len(conversation_2.past_user_inputs), 0)
        # When
        result = nlp([conversation_1, conversation_2], do_sample=False, max_length=1000)
        # Then
        self.assertEqual(result, [conversation_1, conversation_2])
        self.assertEqual(len(result[0].past_user_inputs), 1)
        self.assertEqual(len(result[1].past_user_inputs), 1)
        self.assertEqual(len(result[0].generated_responses), 1)
        self.assertEqual(len(result[1].generated_responses), 1)
        self.assertEqual(result[0].past_user_inputs[0], "My name is Sarah and I live in London")
        self.assertEqual(
            result[0].generated_responses[0],
            "hi sarah, i live in london as well. do you have any plans for the weekend?",
        )
        self.assertEqual(
            result[1].past_user_inputs[0], "Going to the movies tonight, What movie would you recommend? "
        )
        self.assertEqual(
            result[1].generated_responses[0], "i don't know... i'm not really sure. what movie are you going to see?"
        )
        # When
        conversation_1.add_user_input("Not yet, what about you?")
        conversation_2.add_user_input("What's your name?")
        result = nlp([conversation_1, conversation_2], do_sample=False, max_length=1000)
        # Then
        self.assertEqual(result, [conversation_1, conversation_2])
        self.assertEqual(len(result[0].past_user_inputs), 2)
        self.assertEqual(len(result[1].past_user_inputs), 2)
        self.assertEqual(len(result[0].generated_responses), 2)
        self.assertEqual(len(result[1].generated_responses), 2)
        self.assertEqual(result[0].past_user_inputs[1], "Not yet, what about you?")
        self.assertEqual(result[0].generated_responses[1], "i don't have any plans yet. i'm not sure what to do yet.")
        self.assertEqual(result[1].past_user_inputs[1], "What's your name?")
        self.assertEqual(result[1].generated_responses[1], "i don't have a name, but i'm going to see a horror movie.")