[Audio Processor] Only pass sr to feat extractor (#20022)

* [Audio Processor] Only pass sr to feat extractor
* move out of if/else
* copy to other processors
2025-07-31 02:02:21 +06:00 · 2022-11-08 08:59:03 +00:00 · 2022-11-08 08:59:03 +00:00 · 3e39fd09a9
commit 3e39fd09a9
parent fb1c8db78a
6 changed files with 12 additions and 6 deletions
--- a/src/transformers/models/mctct/processing_mctct.py
+++ b/src/transformers/models/mctct/processing_mctct.py
@@ -58,6 +58,7 @@ class MCTCTProcessor(ProcessorMixin):
            audio = kwargs.pop("raw_speech")
        else:
            audio = kwargs.pop("audio", None)
+        sampling_rate = kwargs.pop("sampling_rate", None)
        text = kwargs.pop("text", None)
        if len(args) > 0:
            audio = args[0]
@@ -67,7 +68,7 @@ class MCTCTProcessor(ProcessorMixin):
            raise ValueError("You need to specify either an `audio` or `text` input to process.")

        if audio is not None:
-            inputs = self.feature_extractor(audio, *args, **kwargs)
+            inputs = self.feature_extractor(audio, *args, sampling_rate=sampling_rate, **kwargs)
        if text is not None:
            encodings = self.tokenizer(text, **kwargs)

--- a/src/transformers/models/speech_to_text/processing_speech_to_text.py
+++ b/src/transformers/models/speech_to_text/processing_speech_to_text.py
@@ -61,6 +61,7 @@ class Speech2TextProcessor(ProcessorMixin):
            audio = kwargs.pop("raw_speech")
        else:
            audio = kwargs.pop("audio", None)
+        sampling_rate = kwargs.pop("sampling_rate", None)
        text = kwargs.pop("text", None)
        if len(args) > 0:
            audio = args[0]
@@ -70,7 +71,7 @@ class Speech2TextProcessor(ProcessorMixin):
            raise ValueError("You need to specify either an `audio` or `text` input to process.")

        if audio is not None:
-            inputs = self.feature_extractor(audio, *args, **kwargs)
+            inputs = self.feature_extractor(audio, *args, sampling_rate=sampling_rate, **kwargs)
        if text is not None:
            encodings = self.tokenizer(text, **kwargs)

--- a/src/transformers/models/speech_to_text_2/processing_speech_to_text_2.py
+++ b/src/transformers/models/speech_to_text_2/processing_speech_to_text_2.py
@@ -60,6 +60,7 @@ class Speech2Text2Processor(ProcessorMixin):
            audio = kwargs.pop("raw_speech")
        else:
            audio = kwargs.pop("audio", None)
+        sampling_rate = kwargs.pop("sampling_rate", None)
        text = kwargs.pop("text", None)
        if len(args) > 0:
            audio = args[0]
@@ -69,7 +70,7 @@ class Speech2Text2Processor(ProcessorMixin):
            raise ValueError("You need to specify either an `audio` or `text` input to process.")

        if audio is not None:
-            inputs = self.feature_extractor(audio, *args, **kwargs)
+            inputs = self.feature_extractor(audio, *args, sampling_rate=sampling_rate, **kwargs)
        if text is not None:
            encodings = self.tokenizer(text, **kwargs)

--- a/src/transformers/models/wav2vec2/processing_wav2vec2.py
+++ b/src/transformers/models/wav2vec2/processing_wav2vec2.py
@@ -80,6 +80,7 @@ class Wav2Vec2Processor(ProcessorMixin):
            audio = kwargs.pop("raw_speech")
        else:
            audio = kwargs.pop("audio", None)
+        sampling_rate = kwargs.pop("sampling_rate", None)
        text = kwargs.pop("text", None)
        if len(args) > 0:
            audio = args[0]
@@ -89,7 +90,7 @@ class Wav2Vec2Processor(ProcessorMixin):
            raise ValueError("You need to specify either an `audio` or `text` input to process.")

        if audio is not None:
-            inputs = self.feature_extractor(audio, *args, **kwargs)
+            inputs = self.feature_extractor(audio, *args, sampling_rate=sampling_rate, **kwargs)
        if text is not None:
            encodings = self.tokenizer(text, **kwargs)

--- a/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py
+++ b/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py
@@ -228,6 +228,7 @@ class Wav2Vec2ProcessorWithLM(ProcessorMixin):
            audio = kwargs.pop("raw_speech")
        else:
            audio = kwargs.pop("audio", None)
+        sampling_rate = kwargs.pop("sampling_rate", None)
        text = kwargs.pop("text", None)
        if len(args) > 0:
            audio = args[0]
@@ -237,7 +238,7 @@ class Wav2Vec2ProcessorWithLM(ProcessorMixin):
            raise ValueError("You need to specify either an `audio` or `text` input to process.")

        if audio is not None:
-            inputs = self.feature_extractor(audio, *args, **kwargs)
+            inputs = self.feature_extractor(audio, *args, sampling_rate=sampling_rate, **kwargs)
        if text is not None:
            encodings = self.tokenizer(text, **kwargs)

--- a/src/transformers/models/whisper/processing_whisper.py
+++ b/src/transformers/models/whisper/processing_whisper.py
@@ -85,6 +85,7 @@ class WhisperProcessor(ProcessorMixin):
            return self.current_processor(*args, **kwargs)

        audio = kwargs.pop("audio", None)
+        sampling_rate = kwargs.pop("sampling_rate", None)
        text = kwargs.pop("text", None)
        if len(args) > 0:
            audio = args[0]
@@ -94,7 +95,7 @@ class WhisperProcessor(ProcessorMixin):
            raise ValueError("You need to specify either an `audio` or `text` input to process.")

        if audio is not None:
-            inputs = self.feature_extractor(audio, *args, **kwargs)
+            inputs = self.feature_extractor(audio, *args, sampling_rate=sampling_rate, **kwargs)
        if text is not None:
            encodings = self.tokenizer(text, **kwargs)