From 81642d2b51de9d5e5aee1768abdc744d90f7f52d Mon Sep 17 00:00:00 2001 From: fxmarty <9808326+fxmarty@users.noreply.github.com> Date: Wed, 3 Apr 2024 17:11:01 +0800 Subject: [PATCH] Make EncodecModel.decode ONNX exportable (#29913) * fix encodec onnx export for musicgen * simplification * fix quality * better style --- .../models/encodec/modeling_encodec.py | 36 +++++++++++-------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/src/transformers/models/encodec/modeling_encodec.py b/src/transformers/models/encodec/modeling_encodec.py index bd56661b198..5a299b601b4 100644 --- a/src/transformers/models/encodec/modeling_encodec.py +++ b/src/transformers/models/encodec/modeling_encodec.py @@ -111,14 +111,27 @@ class EncodecConv1d(nn.Module): elif self.norm_type == "time_group_norm": self.norm = nn.GroupNorm(1, out_channels) - @staticmethod + kernel_size = self.conv.kernel_size[0] + stride = torch.tensor(self.conv.stride[0], dtype=torch.int64) + dilation = self.conv.dilation[0] + + # Effective kernel size with dilations. + kernel_size = torch.tensor((kernel_size - 1) * dilation + 1, dtype=torch.int64) + + self.register_buffer("stride", stride, persistent=False) + self.register_buffer("kernel_size", kernel_size, persistent=False) + self.register_buffer("padding_total", torch.tensor(kernel_size - stride, dtype=torch.int64), persistent=False) + def _get_extra_padding_for_conv1d( - hidden_states: torch.Tensor, kernel_size: int, stride: int, padding_total: int = 0 - ) -> int: + self, + hidden_states: torch.Tensor, + ) -> torch.Tensor: """See `pad_for_conv1d`.""" length = hidden_states.shape[-1] - n_frames = (length - kernel_size + padding_total) / stride + 1 - ideal_length = (math.ceil(n_frames) - 1) * stride + (kernel_size - padding_total) + n_frames = (length - self.kernel_size + self.padding_total) / self.stride + 1 + n_frames = torch.ceil(n_frames).to(torch.int64) - 1 + ideal_length = n_frames * self.stride + self.kernel_size - self.padding_total + return ideal_length - length @staticmethod @@ -141,20 +154,15 @@ class EncodecConv1d(nn.Module): return padded[..., :end] def forward(self, hidden_states): - kernel_size = self.conv.kernel_size[0] - stride = self.conv.stride[0] - dilation = self.conv.dilation[0] - kernel_size = (kernel_size - 1) * dilation + 1 # effective kernel size with dilations - padding_total = kernel_size - stride - extra_padding = self._get_extra_padding_for_conv1d(hidden_states, kernel_size, stride, padding_total) + extra_padding = self._get_extra_padding_for_conv1d(hidden_states) if self.causal: # Left padding for causal - hidden_states = self._pad1d(hidden_states, (padding_total, extra_padding), mode=self.pad_mode) + hidden_states = self._pad1d(hidden_states, (self.padding_total, extra_padding), mode=self.pad_mode) else: # Asymmetric padding required for odd strides - padding_right = padding_total // 2 - padding_left = padding_total - padding_right + padding_right = self.padding_total // 2 + padding_left = self.padding_total - padding_right hidden_states = self._pad1d( hidden_states, (padding_left, padding_right + extra_padding), mode=self.pad_mode )