Update Bark generation configs and tests (#25409)
* update bark generation configs for more coherent parameters
* make style
* update bark hub repo
Parent: cf84738d2e
Commit: 704bf595eb
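Summary of the change: the default sampling parameters of Bark's sub-model generation configs are aligned with the usual `GenerationConfig` conventions (`temperature=1.0` and `do_sample=False` instead of the model-specific 0.7/0.5 and `True`), `BarkFineModel` now treats `temperature=1.0` the same as `None` (greedy argmax), and the doc examples and tests move from the `ylacombe/*` staging checkpoints to the official `suno/bark` and `suno/bark-small` hub repos. A minimal usage sketch under the new defaults (checkpoint names come from the diff below; the `scipy` write-out is illustrative):

```python
# Minimal sketch of Bark generation after this change. Sampling is now opt-in:
# the configs default to do_sample=False and temperature=1.0 (greedy decoding).
import scipy.io.wavfile
from transformers import AutoProcessor, BarkModel

processor = AutoProcessor.from_pretrained("suno/bark-small")
model = BarkModel.from_pretrained("suno/bark-small")

inputs = processor("Hello, my dog is cute", voice_preset="v2/en_speaker_6")

# Opt back into old-style sampling explicitly; with no kwargs this is greedy.
audio = model.generate(**inputs, do_sample=True, temperature=0.7)

scipy.io.wavfile.write(
    "bark_out.wav",
    rate=model.generation_config.sample_rate,  # 24 kHz for Bark checkpoints
    data=audio.cpu().numpy().squeeze(),
)
```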
@@ -36,8 +36,8 @@ class BarkSemanticGenerationConfig(GenerationConfig):
         return_dict_in_generate=False,
         output_hidden_states=False,
         output_attentions=False,
-        temperature=0.7,
-        do_sample=True,
+        temperature=1.0,
+        do_sample=False,
         text_encoding_offset=10_048,
         text_pad_token=129_595,
         semantic_infer_token=129_599,
@@ -70,9 +70,9 @@ class BarkSemanticGenerationConfig(GenerationConfig):
         output_attentions (`bool`, *optional*, defaults to `False`):
             Whether or not to return the attentions tensors of all attention layers. See `attentions` under
             returned tensors for more details.
-        temperature (`float`, *optional*, defaults to 0.7):
+        temperature (`float`, *optional*, defaults to 1.0):
             The value used to modulate the next token probabilities.
-        do_sample (`bool`, *optional*, defaults to `True`):
+        do_sample (`bool`, *optional*, defaults to `False`):
             Whether or not to use sampling ; use greedy decoding otherwise.
         text_encoding_offset (`int`, *optional*, defaults to 10_048):
             Text encoding offset.
@@ -119,8 +119,8 @@ class BarkCoarseGenerationConfig(GenerationConfig):
         return_dict_in_generate=False,
         output_hidden_states=False,
         output_attentions=False,
-        temperature=0.7,
-        do_sample=True,
+        temperature=1.0,
+        do_sample=False,
         coarse_semantic_pad_token=12_048,
         coarse_rate_hz=75,
         n_coarse_codebooks=2,
@@ -150,9 +150,9 @@ class BarkCoarseGenerationConfig(GenerationConfig):
         output_attentions (`bool`, *optional*, defaults to `False`):
             Whether or not to return the attentions tensors of all attention layers. See `attentions` under
             returned tensors for more details.
-        temperature (`float`, *optional*, defaults to 0.7):
+        temperature (`float`, *optional*, defaults to 1.0):
             The value used to modulate the next token probabilities.
-        do_sample (`bool`, *optional*, defaults to `True`):
+        do_sample (`bool`, *optional*, defaults to `False`):
             Whether or not to use sampling ; use greedy decoding otherwise.
         coarse_semantic_pad_token (`int`, *optional*, defaults to 12_048):
             Coarse semantic pad token.
@@ -194,7 +194,7 @@ class BarkFineGenerationConfig(GenerationConfig):

     def __init__(
         self,
-        temperature=0.5,
+        temperature=1.0,
         max_fine_history_length=512,
         max_fine_input_length=1024,
         n_fine_codebooks=8,
@@ -209,7 +209,7 @@ class BarkFineGenerationConfig(GenerationConfig):
     documentation from [`GenerationConfig`] for more information.

     Args:
-        temperature (`float`, *optional*, defaults to 0.5):
+        temperature (`float`, *optional*):
             The value used to modulate the next token probabilities.
         max_fine_history_length (`int`, *optional*, defaults to 512):
             Max length of the fine history vector.
@@ -224,6 +224,13 @@ class BarkFineGenerationConfig(GenerationConfig):
         self.max_fine_input_length = max_fine_input_length
         self.n_fine_codebooks = n_fine_codebooks

+    def validate(self):
+        """
+        Overrides GenerationConfig.validate because BarkFineGenerationConfig don't use any parameters outside
+        temperature.
+        """
+        pass
+

 class BarkGenerationConfig(GenerationConfig):
     model_type = "bark"
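The `validate` override added above is worth a note: `BarkFineGenerationConfig` only carries `temperature`, so the generic `GenerationConfig.validate` checks (which inspect flags the fine config never sets) are skipped. A small sketch of the intended effect; the import path is an assumption inferred from the class names in this diff:

```python
# Hypothetical check that the override behaves as documented: validate() is a
# no-op, so a temperature-only config passes without GenerationConfig warnings.
from transformers.models.bark.generation_configuration_bark import BarkFineGenerationConfig

config = BarkFineGenerationConfig(temperature=1.0)
config.validate()  # overridden to `pass`; no consistency checks run
assert config.temperature == 1.0
```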
@@ -1336,7 +1336,7 @@ class BarkFineModel(BarkPreTrainedModel):
             input_buffer = fine_input[:, start_idx : start_idx + max_fine_input_length, :]
             for n_inner in range(n_coarse, fine_generation_config.n_fine_codebooks):
                 logits = self.forward(n_inner, input_buffer).logits
-                if temperature is None:
+                if temperature is None or temperature == 1.0:
                     relevant_logits = logits[:, rel_start_fill_idx:, :codebook_size]
                     codebook_preds = torch.argmax(relevant_logits, -1)
                 else:
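With the new default of `temperature=1.0`, the hunk above makes the fine acoustics model fall into the greedy branch by default; only an explicit temperature other than 1.0 triggers sampling. A standalone sketch of the two paths (the sampling branch is assumed from the usual softmax/multinomial pattern, since the hunk cuts off at `else:`):

```python
# Sketch of the per-codebook decode step in BarkFineModel.generate. `logits` has
# shape (batch, seq_len, vocab); only positions past rel_start_fill_idx and the
# first codebook_size classes are scored. The sampling branch is an assumption
# (standard temperature sampling), not a verbatim copy of the source.
import torch
import torch.nn.functional as F


def decode_codebook(logits, rel_start_fill_idx, codebook_size, temperature=None):
    relevant_logits = logits[:, rel_start_fill_idx:, :codebook_size]
    if temperature is None or temperature == 1.0:
        # new behaviour: temperature=1.0 now short-circuits to greedy argmax,
        # matching the temperature=None path
        return torch.argmax(relevant_logits, -1)
    # any other temperature: scale the logits and sample one entry per position
    probs = F.softmax(relevant_logits / temperature, dim=-1)
    batch, steps, vocab = probs.shape
    preds = torch.multinomial(probs.reshape(-1, vocab), num_samples=1)
    return preds.reshape(batch, steps)
```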
@@ -1499,8 +1499,8 @@ class BarkModel(BarkPreTrainedModel):
        ```python
        >>> from transformers import AutoProcessor, BarkModel

-       >>> processor = AutoProcessor.from_pretrained("ylacombe/bark-small")
-       >>> model = BarkModel.from_pretrained("ylacombe/bark-small")
+       >>> processor = AutoProcessor.from_pretrained("suno/bark-small")
+       >>> model = BarkModel.from_pretrained("suno/bark-small")

        >>> # To add a voice preset, you can pass `voice_preset` to `BarkProcessor.__call__(...)`
        >>> voice_preset = "v2/en_speaker_6"
@@ -894,11 +894,11 @@ class BarkFineModelTest(ModelTesterMixin, unittest.TestCase):
 class BarkModelIntegrationTests(unittest.TestCase):
     @cached_property
     def model(self):
-        return BarkModel.from_pretrained("ylacombe/bark-large").to(torch_device)
+        return BarkModel.from_pretrained("suno/bark").to(torch_device)

     @cached_property
     def processor(self):
-        return BarkProcessor.from_pretrained("ylacombe/bark-large")
+        return BarkProcessor.from_pretrained("suno/bark")

     @cached_property
     def inputs(self):
@@ -937,6 +937,7 @@ class BarkModelIntegrationTests(unittest.TestCase):
         output_ids = self.model.semantic.generate(
             **input_ids,
             do_sample=False,
+            temperature=1.0,
             semantic_generation_config=self.semantic_generation_config,
         )

@@ -957,6 +958,7 @@ class BarkModelIntegrationTests(unittest.TestCase):
         output_ids = self.model.semantic.generate(
             **input_ids,
             do_sample=False,
+            temperature=1.0,
             semantic_generation_config=self.semantic_generation_config,
         )

@@ -964,6 +966,7 @@ class BarkModelIntegrationTests(unittest.TestCase):
             output_ids,
             history_prompt=history_prompt,
             do_sample=False,
+            temperature=1.0,
             semantic_generation_config=self.semantic_generation_config,
             coarse_generation_config=self.coarse_generation_config,
             codebook_size=self.model.generation_config.codebook_size,
@@ -994,6 +997,7 @@ class BarkModelIntegrationTests(unittest.TestCase):
         output_ids = self.model.semantic.generate(
             **input_ids,
             do_sample=False,
+            temperature=1.0,
             semantic_generation_config=self.semantic_generation_config,
         )

@@ -1001,6 +1005,7 @@ class BarkModelIntegrationTests(unittest.TestCase):
             output_ids,
             history_prompt=history_prompt,
             do_sample=False,
+            temperature=1.0,
             semantic_generation_config=self.semantic_generation_config,
             coarse_generation_config=self.coarse_generation_config,
             codebook_size=self.model.generation_config.codebook_size,
@@ -1040,9 +1045,16 @@ class BarkModelIntegrationTests(unittest.TestCase):
         input_ids = self.inputs

         with torch.no_grad():
-            self.model.generate(**input_ids, do_sample=False, coarse_do_sample=True, coarse_temperature=0.7)
             self.model.generate(
-                **input_ids, do_sample=False, coarse_do_sample=True, coarse_temperature=0.7, fine_temperature=0.3
+                **input_ids, do_sample=False, temperature=1.0, coarse_do_sample=True, coarse_temperature=0.7
+            )
+            self.model.generate(
+                **input_ids,
+                do_sample=False,
+                temperature=1.0,
+                coarse_do_sample=True,
+                coarse_temperature=0.7,
+                fine_temperature=0.3,
             )
             self.model.generate(
                 **input_ids,
@@ -1061,7 +1073,7 @@ class BarkModelIntegrationTests(unittest.TestCase):

         with torch.no_grad():
             # standard generation
-            output_with_no_offload = self.model.generate(**input_ids, do_sample=False, fine_temperature=None)
+            output_with_no_offload = self.model.generate(**input_ids, do_sample=False, temperature=1.0)

             torch.cuda.empty_cache()

@@ -1088,7 +1100,7 @@ class BarkModelIntegrationTests(unittest.TestCase):
         self.assertTrue(hasattr(self.model.semantic, "_hf_hook"))

         # output with cpu offload
-        output_with_offload = self.model.generate(**input_ids, do_sample=False, fine_temperature=None)
+        output_with_offload = self.model.generate(**input_ids, do_sample=False, temperature=1.0)

         # checks if same output
         self.assertListEqual(output_with_no_offload.tolist(), output_with_offload.tolist())
@@ -26,7 +26,7 @@ from transformers.testing_utils import require_torch, slow
 @require_torch
 class BarkProcessorTest(unittest.TestCase):
     def setUp(self):
-        self.checkpoint = "ylacombe/bark-small"
+        self.checkpoint = "suno/bark-small"
         self.tmpdirname = tempfile.mkdtemp()
         self.voice_preset = "en_speaker_1"
         self.input_string = "This is a test string"
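The updated integration tests also document the per-sub-model override scheme: prefixed kwargs such as `coarse_temperature` and `fine_temperature` target a single sub-model, while unprefixed kwargs act as the global settings (the exact routing lives in `BarkModel.generate`). A usage sketch mirroring the tested combinations, substituting the smaller `suno/bark-small` for the tests' `suno/bark`:

```python
# Sketch mirroring the updated integration tests: greedy decoding overall, with
# sampling selectively enabled per sub-model via prefixed kwargs. Checkpoint and
# kwarg names are taken from the diff above.
from transformers import AutoProcessor, BarkModel

processor = AutoProcessor.from_pretrained("suno/bark-small")
model = BarkModel.from_pretrained("suno/bark-small")
inputs = processor("This is a test string")

# coarse stage samples at temperature 0.7; everything else decodes greedily
model.generate(**inputs, do_sample=False, temperature=1.0, coarse_do_sample=True, coarse_temperature=0.7)

# additionally temper the fine stage
model.generate(
    **inputs,
    do_sample=False,
    temperature=1.0,
    coarse_do_sample=True,
    coarse_temperature=0.7,
    fine_temperature=0.3,
)
```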