Enabling imageGPT auto feature extractor. (#16871)

* Enabling `imageGPT` auto feature extractor.

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>

* Small updates.

* Update after rebase to use `input_ids` instead of `pixel_values`.

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
Nicolas Patry 2022-05-24 12:30:46 +02:00 committed by GitHub
parent 31ee80d556
commit d980929803
3 changed files with 18 additions and 2 deletions

src/transformers/models/auto/feature_extraction_auto.py

@@ -50,6 +50,7 @@ FEATURE_EXTRACTOR_MAPPING_NAMES = OrderedDict(
         ("flava", "FlavaFeatureExtractor"),
         ("glpn", "GLPNFeatureExtractor"),
         ("hubert", "Wav2Vec2FeatureExtractor"),
+        ("imagegpt", "ImageGPTFeatureExtractor"),
         ("layoutlmv2", "LayoutLMv2FeatureExtractor"),
         ("layoutlmv3", "LayoutLMv3FeatureExtractor"),
         ("maskformer", "MaskFormerFeatureExtractor"),

src/transformers/pipelines/base.py

@@ -75,14 +75,19 @@ def _pad(items, key, padding_value, padding_side):
         # Others include `attention_mask` etc...
         shape = items[0][key].shape
         dim = len(shape)
-        if dim == 4:
+        if key == "pixel_values":
             # This is probable image so padding shouldn't be necessary
             # B, C, H, W
             return torch.cat([item[key] for item in items], dim=0)
         max_length = max(item[key].shape[1] for item in items)
+        min_length = min(item[key].shape[1] for item in items)
         dtype = items[0][key].dtype
         if dim == 2:
+            if max_length == min_length:
+                # Bypass for `ImageGPT` which doesn't provide a padding value, yet
+                # we can consistently pad since the size should be matching
+                return torch.cat([item[key] for item in items], dim=0)
             tensor = torch.zeros((batch_size, max_length), dtype=dtype) + padding_value
         elif dim == 3:
             tensor = torch.zeros((batch_size, max_length, shape[-1]), dtype=dtype) + padding_value
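
The `max_length == min_length` bypass can be illustrated standalone. A minimal sketch with toy tensors (not the pipeline code itself): ImageGPT's feature extractor produces fixed-length `input_ids`, one id per pixel, so every item in a batch has the same length and can be concatenated without any padding value:

```python
import torch

# Four items, each a (1, 1024) tensor of color-cluster ids; 1024 = 32 * 32 pixels.
items = [{"input_ids": torch.randint(0, 513, (1, 1024))} for _ in range(4)]

max_length = max(item["input_ids"].shape[1] for item in items)
min_length = min(item["input_ids"].shape[1] for item in items)
if max_length == min_length:
    # All sequences already share one length: stack along the batch dimension.
    batch = torch.cat([item["input_ids"] for item in items], dim=0)
    print(batch.shape)  # torch.Size([4, 1024])
```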
@@ -146,7 +151,11 @@ def pad_collate_fn(tokenizer, feature_extractor):
         padded = {}
         for key in keys:
             if key in {"input_ids"}:
-                _padding_value = t_padding_value
+                # ImageGPT uses a feature extractor
+                if feature_extractor is not None:
+                    _padding_value = f_padding_value
+                else:
+                    _padding_value = t_padding_value
             elif key in {"input_values", "pixel_values", "input_features"}:
                 _padding_value = f_padding_value
             elif key in {"p_mask", "special_tokens_mask"}:

tests/models/imagegpt/test_modeling_imagegpt.py

@@ -171,6 +171,12 @@ class ImageGPTModelTester:
             reorder_and_upcast_attn=reorder_and_upcast_attn,
         )
 
+    def get_pipeline_config(self):
+        config = self.get_config()
+        config.vocab_size = 513
+        config.max_position_embeddings = 1024
+        return config
+
     def prepare_config_and_inputs_for_decoder(self):
         (
             config,
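
The `get_pipeline_config` override gives pipeline tests a config that matches what the feature extractor emits: 512 color clusters plus one start-of-sequence token yield the 513-entry vocabulary, and 32 * 32 pixels per image yield 1024 positions. A minimal sketch of the equivalent construction:

```python
from transformers import ImageGPTConfig

config = ImageGPTConfig()
config.vocab_size = 513                 # 512 color clusters + 1 SOS token
config.max_position_embeddings = 1024   # 32 * 32 pixels per image
print(config.vocab_size, config.max_position_embeddings)  # 513 1024
```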