diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py
index 1bd5b651db9..817c5208d07 100755
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -138,16 +138,16 @@ TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION = [
     (
         # test name for the test runner
         f"{dtype}_pad_{padding_side}{'' if use_attention_mask else '_no_attn_mask'}"
-        f"{'_output_attn' if output_attentions else ''}{'_sdpa_kernels' if enable_kernels else ''}",
+        f"{'_sdpa_kernels' if enable_kernels else ''}",
         # parameterization
-        *(dtype, padding_side, use_attention_mask, output_attentions, enable_kernels),
+        *(dtype, padding_side, use_attention_mask, False, enable_kernels),
     )
     for dtype in ("fp16", "fp32", "bf16")
     for padding_side in ("left", "right")
     for use_attention_mask in (True, False)
-    for output_attentions in (True, False)
     for enable_kernels in (True, False)
-]
+    # Extra test case: `output_attentions=True` has special attention mask handling and sdpa reverts to eager
+] + [("fp32_pad_left_output_attentions", "fp32", "left", True, True, False)]


 def _config_zero_init(config):
@@ -3618,7 +3618,7 @@ class ModelTesterMixin:
             ("cuda", False, torch.bfloat16): 1e-2,
             ("cuda", False, torch.float16): 5e-3,
             ("cuda", True, torch.float32): 1e-4,
-            ("cuda", True, torch.bfloat16): 3e-2,
+            ("cuda", True, torch.bfloat16): 3e-2,  # (different from others)
             ("cuda", True, torch.float16): 5e-3,
         }
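
For context on how the first hunk is consumed: each tuple in `TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION` expands into one test case, with the leading string used as the test-name suffix. A minimal sketch of the consuming side, assuming the usual `parameterized.expand` pattern (the class name and exact signature here are illustrative, not copied from `ModelTesterMixin`):

```python
import unittest

from parameterized import parameterized


class SdpaInferenceTest(unittest.TestCase):
    @parameterized.expand(TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION)
    def test_eager_matches_sdpa_inference(
        self, name, dtype, padding_side, use_attention_mask, output_attentions, enable_kernels
    ):
        # After this patch, `output_attentions` is False across the whole
        # cartesian product; only the single appended case sets it to True.
        ...
```

This halves the number of generated cases while keeping one hand-picked `output_attentions=True` case alive.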
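The appended case deserves its comment: `torch.nn.functional.scaled_dot_product_attention` never materializes the softmax(QKᵀ) weights, so a model asked to return them has to take the unfused eager path, which also applies the attention mask differently. A rough sketch of that split, assuming a standard additive mask; this is illustrative, not the library's actual dispatch code:

```python
import math

import torch
import torch.nn.functional as F


def attention(q, k, v, attn_mask=None, output_attentions=False):
    """Illustrative eager-vs-SDPA split that the test exercises."""
    if not output_attentions:
        # Fused kernel: fast, but the attention weights are never exposed.
        out = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask)
        return out, None
    # Eager fallback: materializes the weights so they can be returned.
    scores = q @ k.transpose(-2, -1) / math.sqrt(q.size(-1))
    if attn_mask is not None:
        scores = scores + attn_mask  # additive mask: large negative on pad
    weights = scores.softmax(dim=-1)
    return weights @ v, weights
```

The two branches agree only up to kernel- and dtype-dependent error, which is what the tolerance table touched in the second hunk accounts for.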