mirror of
https://github.com/huggingface/transformers.git
synced 2025-07-29 17:22:25 +06:00

* first raw version of the bark integration * working code on small models with single run * add converting script from suno weights 2 hf * many changes * correct past_kv output * working implementation for inference * update the converting script according to the architecture changes * add a working end-to-end inference code * remove some comments and make small changes * remove unecessary comment * add docstrings and ensure no unecessary intermediary output during audio generation * remove done TODOs * make style + add config docstrings * modification for batch inference support on the whole model * add details to .generation_audio method * add copyright * convert EncodecModel from original library to transformers implementation * add two class in order to facilitate model and sub-models loading from the hub * add support of loading the whole model * add BarkProcessor * correct modeling according to processor output * Add proper __init__ and auto support * Add up-to-date copyright/license message * add relative import instead of absolute * cleaner head_dim computation * small comment removal or changes * more verbose LayerNorm init method * specify eps for clearer comprehension * more verbose variable naming in the MLP module * remove unecessary BarkBlock parameter * clearer code in the forward pass of the BarkBlock * remove _initialize_modules method for cleaner code * Remove unnecessary methods from sub-models * move code to remove unnecessary function * rename a variable for clarity and change an assert * move code and change variable name for clarity * remove unnecessary asserts * correct small bug * correct a comment * change variable names for clarity * remove asserts * change import from absolute to relative * correct small error due to comma missing + correct import * Add attribute Bark config * add first version of tests * update attention_map * add tie_weights and resize_token_embeddings for fineModel * correct getting attention_mask in 
generate_text_semantic * remove Bark inference trick * leave more choices in barkProcessor * remove _no_split_modules * fixe error in forward of block and introduce clearer notations * correct converting script with last changes * make style + add draft bark.mdx * correct BarkModelTest::test_generate_text_semantic * add Bark in main README * add dummy_pt_objects for Bark * add missing models in the main init * correct test_decoder_model_past_with_large_inputs * disable torchscript test * change docstring of BarkProcessor * Add test_processor_bark * make style * correct copyrights * add bark.mdx + make style, quality and consistency * Apply suggestions from code review Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com> * Remove unnecessary test method * simply logic of a test * Only check first ids for slow audio generation * split full end-to-end generation tests * remove unneccessary comment * change submodel names for clearer naming * remove ModuleDict from modeling_bark * combine two if statements * ensure that an edge misued won't happen * modify variable name * move code snippet to the right place (coarse instead of semantic) * change BarkSemanticModule -> BarkSemanticModel * align BarkProcessor with transformers paradigm * correct BarkProcessor tests with last commit changes * change _validate_voice_preset to an instance method instead of a class method * tie_weights already called with post_init * add codec_model config to configuration * update bark modeling tests with recent BarkProcessor changes * remove SubModelPretrainedModel + change speakers embeddings prompt type in BarkModel * change absolute imports to relative * remove TODO * change docstrings * add examples to docs and docstrings * make style * uses BatchFeature in BarkProcessor insteads of dict * continue improving docstrings and docs + make style * correct docstrings examples * more comprehensible speaker_embeddings load/Save * rename speaker_embeddings_dict -> 
speaker_embeddings * correct bark.mdx + add bark to documentation_tests * correct docstrings configuration_bark * integrate last nit suggestions * integrate BarkGeneration configs * make style * remove bark tests from documentation_tests.txt because timeout - tested manually * add proper generation config initialization * small bark.mdx documentation changes * rename bark.mdx -> bark.md * add torch.no_grad behind BarkModel.generate_audio() * replace assert by ValueError in convert_suno_to_hf.py * integrate a series of short comments from reviewer * move SemanticLogitsProcessors and remove .detach() from Bark docs and docstrings * actually remove SemanticLogitsProcessor from modeling_bark.oy * BarkProcessor returns a single output instead of tuple + correct docstrings * make style + correct bug * add initializer_range to BarkConfig + correct slow modeling tests * add .clone() to history_prompt.coarse_prompt to avoid modifying input array * Making sure no extra "`" are present * remove extra characters in modeling_bark.py * Correct output if history_prompt is None * remove TODOs * remove ravel comment * completing generation_configuration_bark.py docstrings * change docstrings - number of audio codebooks instead of Encodec codebooks * change 'bias' docstrings in configuration_bark.py * format code * rename BarkModel.generate_audio -> BarkModel.generate_speech * modify AutoConfig instead of EncodecConfig in BarkConfig * correct AutoConfig wrong init * refactor BarkModel and sub-models generate_coarse, generate_fine, generate_text_semantic * remove SemanticLogitsProcessor and replace it with SuppressTokensLogitsProcessor * move nb_codebook related config arguments to BarkFineConfig * rename bark.mdx -> bark.md * correcting BarkModelConfig from_pretrained + remove keys_to_ignore * correct bark.md with correct hub path * correct code bug in bark.md * correct list tokens_to_suppress * modify Processor to load nested speaker embeddings in a safer way * correct batch 
sampling in BarkFineModel.generate_fine * Apply suggestions from code review Small docstrings correction and code improvements Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * give more details about num_layers in docstrings * correct indentation mistake * correct submodelconfig order of docstring variables * put audio models in alphabetical order in utils/check_repo.my * remove useless line from test_modeling_bark.py * makes BarkCoarseModelTest inherits from (ModelTesterMixin, GenerationTesterMixin, unittest.TestCase) instead of BarkSemanticModelTest * make a Tester class for each sub-model instead of inheriting * add test_resize_embeddings=True for Bark sub-models * add Copied from transformers.models.gpt_neo.modeling_gpt_neo.GPTNeoSelfAttention._split_heads * remove 'Copied fom Bark' comment * remove unneccessary comment * change np.min -> min in modeling_bark.py * refactored all custom layers to have Bark prefix * add attention_mask as an argument of generate_text_semantic * refactor sub-models start docstrings to have more precise config class definition * move _tied_weights_keys overriding * add docstrings to generate_xxx in modeling_bark.py * add loading whole BarkModel to convert_suno_to_hf * refactor attribute and variable names * make style convert_suno * update bark checkpoints * remove never entered if statement * move bark_modeling docstrings after BarkPretrainedModel class definition * refactor modeling_bark.py: kv -> key_values * small nits - code refactoring and removing unecessary lines from _init_weights * nits - replace inplace method by variable assigning * remove *optional* when necessary * remove some lines in generate_speech * add default value for optional parameter * Refactor preprocess_histories_before_coarse -> preprocess_histories Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * correct usage after refactoring * refactor Bark's generate_xxx -> generate and modify docstrings and 
tests accordingly * update docstrings python in configuration_bark.py * add bark files in utils/documentation_test.txt * correct docstrings python snippet * add the ability to use parameters in the form of e.g coarse_temperature * add semantic_max_new_tokens in python snippet in docstrings for quicker generation * Reformate sub-models kwargs in BakModel.generate Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * correct kwargs in BarkModel.generate * correct attention_mask kwarg in BarkModel.generate * add tests for sub-models args in BarkModel.generate and correct BarkFineModel.test_generate_fp16 * enrich BarkModel.generate docstrings with a description of how to use the kwargs --------- Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com> Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
128 lines
4.6 KiB
Python
128 lines
4.6 KiB
Python
# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
|
|
|
|
import os
|
|
import shutil
|
|
import tempfile
|
|
import unittest
|
|
|
|
import numpy as np
|
|
|
|
from transformers import AutoTokenizer, BarkProcessor
|
|
from transformers.testing_utils import require_torch, slow
|
|
|
|
|
|
@require_torch
class BarkProcessorTest(unittest.TestCase):
    """Tests for BarkProcessor: save/load round-trips, voice presets and text tokenization."""

    def setUp(self):
        # Scratch directory for everything the tests write to disk; removed again in tearDown.
        self.tmpdirname = tempfile.mkdtemp()

        self.checkpoint = "ylacombe/bark-small"
        self.input_string = "This is a test string"
        self.voice_preset = "en_speaker_1"
        self.speaker_embeddings_dict_path = "speaker_embeddings_path.json"
        self.speaker_embeddings_directory = "speaker_embeddings"

    def get_tokenizer(self, **kwargs):
        """Load the tokenizer of `self.checkpoint`, forwarding any extra keyword arguments."""
        return AutoTokenizer.from_pretrained(self.checkpoint, **kwargs)

    def tearDown(self):
        shutil.rmtree(self.tmpdirname)

    def test_save_load_pretrained_default(self):
        # A processor wrapping only a tokenizer must expose the same vocabulary
        # after being written to disk and loaded back.
        reference_tokenizer = self.get_tokenizer()

        BarkProcessor(tokenizer=reference_tokenizer).save_pretrained(self.tmpdirname)
        reloaded = BarkProcessor.from_pretrained(self.tmpdirname)

        self.assertEqual(reloaded.tokenizer.get_vocab(), reference_tokenizer.get_vocab())

    @slow
    def test_save_load_pretrained_additional_features(self):
        special_tokens = {"bos_token": "(BOS)", "eos_token": "(EOS)"}

        # Pull the processor together with its speaker-embeddings index from the
        # checkpoint, then write both into the scratch directory.
        original = BarkProcessor.from_pretrained(
            pretrained_processor_name_or_path=self.checkpoint,
            speaker_embeddings_dict_path=self.speaker_embeddings_dict_path,
        )
        original.save_pretrained(
            self.tmpdirname,
            speaker_embeddings_dict_path=self.speaker_embeddings_dict_path,
            speaker_embeddings_directory=self.speaker_embeddings_directory,
        )

        # Reference tokenizer loaded directly with the overridden special tokens.
        expected_tokenizer = self.get_tokenizer(**special_tokens)

        # Reload from disk with the same overrides; the speaker-embeddings path
        # is passed positionally here, exactly as the saved layout is addressed.
        reloaded = BarkProcessor.from_pretrained(
            self.tmpdirname,
            self.speaker_embeddings_dict_path,
            **special_tokens,
        )

        self.assertEqual(reloaded.tokenizer.get_vocab(), expected_tokenizer.get_vocab())

    def test_speaker_embeddings(self):
        processor = BarkProcessor.from_pretrained(
            pretrained_processor_name_or_path=self.checkpoint,
            speaker_embeddings_dict_path=self.speaker_embeddings_dict_path,
        )

        seq_len = 35
        nb_codebooks_coarse = 2
        nb_codebooks_total = 8

        voice_preset = {
            "semantic_prompt": np.ones(seq_len),
            "coarse_prompt": np.ones((nb_codebooks_coarse, seq_len)),
            "fine_prompt": np.ones((nb_codebooks_total, seq_len)),
        }

        def assert_preset_preserved(preset_argument):
            # Run the processor with the given preset and compare every array of
            # the reference preset against the returned "history_prompt" entry.
            history_prompt = processor(text=self.input_string, voice_preset=preset_argument)["history_prompt"]
            for name, expected in voice_preset.items():
                self.assertListEqual(expected.tolist(), history_prompt.get(name, np.array([])).tolist())

        # test providing already loaded voice_preset
        assert_preset_preserved(voice_preset)

        # test loading voice preset from npz file
        npz_path = os.path.join(self.tmpdirname, "file.npz")
        np.savez(npz_path, **voice_preset)
        assert_preset_preserved(npz_path)

        # test loading voice preset from the hub
        # (the result is not inspected; the call itself is the check)
        processor(text=self.input_string, voice_preset=self.voice_preset)

    def test_tokenizer(self):
        tokenizer = self.get_tokenizer()
        processor = BarkProcessor(tokenizer=tokenizer)

        processor_output = processor(text=self.input_string)

        # Settings used for the direct tokenizer call the processor output is
        # compared against (presumably mirroring the processor's own defaults).
        tokenizer_kwargs = {
            "padding": "max_length",
            "max_length": 256,
            "add_special_tokens": False,
            "return_attention_mask": True,
            "return_token_type_ids": False,
        }
        tokenizer_output = tokenizer(self.input_string, **tokenizer_kwargs)

        for key in tokenizer_output.keys():
            expected = tokenizer_output[key]
            self.assertListEqual(expected, processor_output[key].squeeze().tolist())
|