From 497346d07ec39da3a7f38a7e0a67a4906c141ea3 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 18 Jan 2022 15:36:22 +0100 Subject: [PATCH] [ASR pipeline] correct with lm pipeline (#15200) * [ASR pipeline] correct with lm pipeline * improve error --- setup.py | 2 +- src/transformers/dependency_versions_table.py | 2 +- src/transformers/feature_extraction_utils.py | 5 +++-- src/transformers/pipelines/__init__.py | 8 ++++---- tests/test_feature_extraction_auto.py | 8 +++++++- ..._pipelines_automatic_speech_recognition.py | 20 +++++++++++++++++++ 6 files changed, 36 insertions(+), 9 deletions(-) diff --git a/setup.py b/setup.py index a1ae7d732c9..0959b244b5e 100644 --- a/setup.py +++ b/setup.py @@ -152,7 +152,7 @@ _deps = [ "tokenizers>=0.10.1,!=0.11.3", "torch>=1.0", "torchaudio", - "pyctcdecode>=0.2.0", + "pyctcdecode>=0.3.0", "tqdm>=4.27", "unidic>=1.0.2", "unidic_lite>=1.0.7", diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index c2397ed5198..20f8e966ecf 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -62,7 +62,7 @@ deps = { "tokenizers": "tokenizers>=0.10.1,!=0.11.3", "torch": "torch>=1.0", "torchaudio": "torchaudio", - "pyctcdecode": "pyctcdecode>=0.2.0", + "pyctcdecode": "pyctcdecode>=0.3.0", "tqdm": "tqdm>=4.27", "unidic": "unidic>=1.0.2", "unidic_lite": "unidic_lite>=1.0.7", diff --git a/src/transformers/feature_extraction_utils.py b/src/transformers/feature_extraction_utils.py index b6a2ea965d1..ce22fb3d407 100644 --- a/src/transformers/feature_extraction_utils.py +++ b/src/transformers/feature_extraction_utils.py @@ -489,8 +489,9 @@ class FeatureExtractionMixin: # make sure private name "_processor_class" is correctly # saved as "processor_class" - if dictionary.get("_processor_class", None) is not None: - dictionary["processor_class"] = dictionary.pop("_processor_class") + _processor_class = dictionary.pop("_processor_class", None) + if _processor_class is not None: + dictionary["processor_class"] = _processor_class return json.dumps(dictionary, indent=2, sort_keys=True) + "\n" diff --git a/src/transformers/pipelines/__init__.py b/src/transformers/pipelines/__init__.py index 77259462f9c..fab5ccb0085 100755 --- a/src/transformers/pipelines/__init__.py +++ b/src/transformers/pipelines/__init__.py @@ -4,6 +4,7 @@ import io import json +import os # coding=utf-8 # Copyright 2018 The HuggingFace Inc. team. @@ -617,17 +618,16 @@ def pipeline( and isinstance(model_name, str) ): try: + import kenlm # to trigger `ImportError` if not installed from pyctcdecode import BeamSearchDecoderCTC language_model_glob = os.path.join(BeamSearchDecoderCTC._LANGUAGE_MODEL_SERIALIZED_DIRECTORY, "*") alphabet_filename = BeamSearchDecoderCTC._ALPHABET_SERIALIZED_FILENAME allow_regex = [language_model_glob, alphabet_filename] - decoder = BeamSearchDecoderCTC.load_from_hf_hub( - pretrained_model_name_or_path, allow_regex=allow_regex - ) + decoder = BeamSearchDecoderCTC.load_from_hf_hub(model_name, allow_regex=allow_regex) kwargs["decoder"] = decoder - except Exception as e: + except ImportError as e: logger.warning( "Could not load the `decoder` for {model_name}. Defaulting to raw CTC. Try to install `pyctcdecode` and `kenlm`: (`pip install pyctcdecode`, `pip install https://github.com/kpu/kenlm/archive/master.zip`): Error: {e}" ) diff --git a/tests/test_feature_extraction_auto.py b/tests/test_feature_extraction_auto.py index 5b219e0d513..d49244118e3 100644 --- a/tests/test_feature_extraction_auto.py +++ b/tests/test_feature_extraction_auto.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import json import os import tempfile import unittest @@ -42,8 +43,9 @@ class AutoFeatureExtractorTest(unittest.TestCase): # remove feature_extractor_type to make sure config.json alone is enough to load feature processor locally config_dict = AutoFeatureExtractor.from_pretrained(SAMPLE_FEATURE_EXTRACTION_CONFIG_DIR).to_dict() + config_dict.pop("feature_extractor_type") - config = Wav2Vec2FeatureExtractor(config_dict) + config = Wav2Vec2FeatureExtractor(**config_dict) # save in new folder model_config.save_pretrained(tmpdirname) @@ -51,6 +53,10 @@ class AutoFeatureExtractorTest(unittest.TestCase): config = AutoFeatureExtractor.from_pretrained(tmpdirname) + # make sure private variable is not incorrectly saved + dict_as_saved = json.loads(config.to_json_string()) + self.assertTrue("_processor_class" not in dict_as_saved) + self.assertIsInstance(config, Wav2Vec2FeatureExtractor) def test_feature_extractor_from_local_file(self): diff --git a/tests/test_pipelines_automatic_speech_recognition.py b/tests/test_pipelines_automatic_speech_recognition.py index 262aea5beba..c64f6b69dc9 100644 --- a/tests/test_pipelines_automatic_speech_recognition.py +++ b/tests/test_pipelines_automatic_speech_recognition.py @@ -295,6 +295,26 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase, metaclass=Pipel self.assertEqual(output, [{"text": ANY(str)}]) self.assertEqual(output[0]["text"][:6], "ZBT ZC") + @require_torch + @require_pyctcdecode + def test_with_lm_fast(self): + speech_recognizer = pipeline( + task="automatic-speech-recognition", + model="hf-internal-testing/processor_with_lm", + framework="pt", + ) + self.assertEqual(speech_recognizer.type, "ctc_with_lm") + + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") + audio = ds[40]["audio"]["array"] + + n_repeats = 2 + audio_tiled = np.tile(audio, n_repeats) + output = speech_recognizer([audio_tiled], batch_size=2) + + self.assertEqual(output, [{"text": ANY(str)}]) + self.assertEqual(output[0]["text"][:6], "