mirror of
https://github.com/huggingface/transformers.git
synced 2025-07-14 18:18:24 +06:00

* WIP refactoring pipeline tests - switching to fast tokenizers * fix dialog pipeline and fill-mask * refactoring pipeline tests backbone * make large tests slow * fix tests (tf Bart inactive for now) * fix doc... * clean up for merge * fixing tests - remove bart from summarization until there is TF * fix quality and RAG * Add new translation pipeline tests - fix JAX tests * only slow for dialog * Fixing the missing TF-BART imports in modeling_tf_auto * spin out pipeline tests in separate CI job * adding pipeline test to CI YAML * add slow pipeline tests * speed up tf and pt join test to avoid redoing all the standalone pt and tf tests * Update src/transformers/tokenization_utils_base.py Co-authored-by: Sam Shleifer <sshleifer@gmail.com> * Update src/transformers/pipelines.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/pipelines.py Co-authored-by: Lysandre Debut <lysandre@huggingface.co> * Update src/transformers/testing_utils.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * add require_torch and require_tf in is_pt_tf_cross_test Co-authored-by: Sam Shleifer <sshleifer@gmail.com> Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
141 lines
5.2 KiB
Python
141 lines
5.2 KiB
Python
import unittest
|
|
|
|
from transformers import pipeline
|
|
from transformers.testing_utils import require_tf, require_torch, slow
|
|
|
|
from .test_pipelines_common import MonoInputPipelineCommonMixin
|
|
|
|
|
|
# Expected pipeline output for the two sentences in `valid_inputs`
# ("My name is <mask>" and "The largest city in France is <mask>"): one inner
# list per input sentence, holding the two best predictions (the slow tests
# build the pipeline with topk=2).
# The slow tests in this file pass expected_check_keys=["sequence"], so only
# the "sequence" field is named as a checked key there; the other fields are
# kept for reference.
EXPECTED_FILL_MASK_RESULT = [
    [
        {"sequence": "<s>My name is John</s>", "score": 0.00782308354973793, "token": 610, "token_str": "ĠJohn"},
        {"sequence": "<s>My name is Chris</s>", "score": 0.007475061342120171, "token": 1573, "token_str": "ĠChris"},
    ],
    [
        {"sequence": "<s>The largest city in France is Paris</s>", "score": 0.3185044229030609, "token": 2201},
        {"sequence": "<s>The largest city in France is Lyon</s>", "score": 0.21112334728240967, "token": 12790},
    ],
]
|
|
|
|
# Expected pipeline output for "My name is <mask>" when predictions are
# restricted with targets=[" Patrick", " Clara"]: a single inner list (one
# input sentence) with one entry per target.
EXPECTED_FILL_MASK_TARGET_RESULT = [
    [
        {
            "sequence": "<s>My name is Patrick</s>",
            "score": 0.004992353264242411,
            "token": 3499,
            "token_str": "ĠPatrick",
        },
        {
            "sequence": "<s>My name is Clara</s>",
            "score": 0.00019297805556561798,
            "token": 13606,
            "token_str": "ĠClara",
        },
    ]
]
|
|
|
|
|
|
class FillMaskPipelineTests(MonoInputPipelineCommonMixin, unittest.TestCase):
    """Tests for the ``fill-mask`` pipeline with PyTorch and TensorFlow models.

    The class attributes configure the checks inherited from
    ``MonoInputPipelineCommonMixin`` (presumably read by the mixin; see
    ``test_pipelines_common`` to confirm). The methods defined here add
    fill-mask-specific checks: the ``targets`` argument on the small models,
    and the exact predicted sequences on the large model.
    """

    pipeline_task = "fill-mask"
    pipeline_loading_kwargs = {"topk": 2}
    small_models = ["sshleifer/tiny-distilroberta-base"]  # Models tested without the @slow decorator
    large_models = ["distilroberta-base"]  # Models tested with the @slow decorator
    mandatory_keys = {"sequence", "score", "token"}
    valid_inputs = [
        "My name is <mask>",
        "The largest city in France is <mask>",
    ]
    # Each entry needs its own trailing comma: without it, Python joins adjacent
    # string literals into one string and the list silently ends up with a
    # single, different input.
    invalid_inputs = [
        "This is <mask> <mask>",  # More than 1 mask_token in the input is not supported
        "This is",  # No mask_token is not supported
    ]
    expected_check_keys = ["sequence"]

    def _check_fill_mask_with_targets(self, framework):
        """Check the ``targets`` argument on every small model.

        Args:
            framework: Value forwarded to ``pipeline(framework=...)``; the
                callers in this class pass ``"pt"`` or ``"tf"``.

        For each list of valid targets the pipeline must return a list with
        one entry per target. Empty or blank targets must raise ``ValueError``.
        """
        valid_inputs = ["My name is <mask>"]
        valid_targets = [[" Teven", " Patrick", " Clara"], [" Sam"]]
        invalid_targets = [[], [""], ""]
        for model_name in self.small_models:
            nlp = pipeline(task="fill-mask", model=model_name, tokenizer=model_name, framework=framework)
            for targets in valid_targets:
                outputs = nlp(valid_inputs, targets=targets)
                self.assertIsInstance(outputs, list)
                self.assertEqual(len(outputs), len(targets))
            for targets in invalid_targets:
                self.assertRaises(ValueError, nlp, valid_inputs, targets=targets)

    def _check_fill_mask_results(self, framework):
        """Compare large-model predictions against the recorded expectations.

        Args:
            framework: Value forwarded to ``pipeline(framework=...)``; the
                callers in this class pass ``"pt"`` or ``"tf"``.

        Runs the shared ``_test_mono_column_pipeline`` check twice per model:
        once on both input sentences without targets, and once on the first
        sentence only with predictions restricted to ``valid_targets``.
        """
        mandatory_keys = {"sequence", "score", "token"}
        valid_inputs = [
            "My name is <mask>",
            "The largest city in France is <mask>",
        ]
        valid_targets = [" Patrick", " Clara"]
        for model_name in self.large_models:
            nlp = pipeline(
                task="fill-mask",
                model=model_name,
                tokenizer=model_name,
                framework=framework,
                topk=2,
            )
            self._test_mono_column_pipeline(
                nlp,
                valid_inputs,
                mandatory_keys,
                expected_multi_result=EXPECTED_FILL_MASK_RESULT,
                expected_check_keys=["sequence"],
            )
            # The target expectations were recorded for the first sentence only.
            self._test_mono_column_pipeline(
                nlp,
                valid_inputs[:1],
                mandatory_keys,
                expected_multi_result=EXPECTED_FILL_MASK_TARGET_RESULT,
                expected_check_keys=["sequence"],
                targets=valid_targets,
            )

    @require_torch
    def test_torch_fill_mask_with_targets(self):
        """Check ``targets`` handling with the PyTorch small models."""
        self._check_fill_mask_with_targets("pt")

    @require_tf
    def test_tf_fill_mask_with_targets(self):
        """Check ``targets`` handling with the TensorFlow small models."""
        self._check_fill_mask_with_targets("tf")

    @require_torch
    @slow
    def test_torch_fill_mask_results(self):
        """Check the PyTorch large-model predictions (slow)."""
        self._check_fill_mask_results("pt")

    @require_tf
    @slow
    def test_tf_fill_mask_results(self):
        """Check the TensorFlow large-model predictions (slow)."""
        self._check_fill_mask_results("tf")
|