mirror of
https://github.com/huggingface/transformers.git
synced 2025-07-15 02:28:24 +06:00

* Gemma 3n * initial commit of Gemma 3n scaffold * Fixing param pass through on Gemm3p5RMSNorm * Adds Einsum layer to Gemma 3n * Updating EinsumLayer API * Undoing erroneous force push * Reverting RMSNorm to with_scale by default * Adds LAuReL to Gemma 3n * Adds AltUp to Gemma 3n * Adding Gemma3p5 overall and text config with vision and audio config placeholders (#3) * Adding gemma3p5 text configs * Adding audio config placeholders * Adding a placeholder for vision configs * Updating MobileNetVisionConfig, inheriting TimmWrapperConfig * Updating text configs * Update src/transformers/models/gemma3p5/modular_gemma3p5.py Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Removing altup configs to accept the suggested configs * Update src/transformers/models/gemma3p5/modular_gemma3p5.py Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Updating altup config * Update modular Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Update modular Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Update modular Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Update modular Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Addressing review comments and updating text configs * Adding a config for activation sparsity * Updating configs to pass through options to super class init and adjust some name prefixes * Updating laurel and altup with corrected config values * Normalizing sub_config initializers --------- Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Updating MLP with activation sparsity (#2) * Updating DecoderBlock for Gemma 3n (#3) * Initial Gemm3nTextModel (#4) NOTE: This implementation WILL CHANGE in the coming weeks, however, changes will be strictly additive and this will remain a suitable baseline for downstream implementations to reference. * Adding KV Cache Sharing * Adds Einsum layer to Gemma 3n * Updating EinsumLayer API * Refactored kv cache sharing in attention * Adding KVStore for cache sharing * Update modular Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Update modular Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Update modular Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Update src/transformers/cache_utils.py Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Undoing erroneous force push * Reverting RMSNorm to with_scale by default * Adds LAuReL to Gemma 3n * Updating KV Cache Sharing implementation * Updating the q and k norm definitions in the attention module * Fixing name error for q,k,v RMS norm to use the right 3n module * Updating MLP with activation sparsity * Updating DecoderBlock for Gemma 3.5 * Updating kv cache sharing implementation with the use of a cache buffer and refactoring some lines of code * Isolating KV Cache logic to relevant components * Fixing logic error in Gemma3nAttention.forward * Refactoring caching contributions and fixing kv_store initialization * Simplifying Configs * Remove errant self from super init call * Bug fix in the Attention module - changing self.head_dim to config.head_dim * Bug fixes in the LaurelBlock and RMS Norm super init call * removing redundant code from a merge * Adding per_layer_inputs to TextModel * Adding preprocess embeddings with altup * Adds per-layer-to-single output and a host of TODOs * Integrating altup predict with the model workflow and other minor bug fixes * Using nn.Embedding temporarily for text model * It goes forward * Minor refactor of attention sparsity and RoPE initialization * Fixing duplicate rope_scaling param bug when loading from pretrained --------- Co-authored-by: Sindhu Raghuram <sindhuraghuram@google.com> Co-authored-by: SindhuRaghuram97 <114270661+SindhuRaghuram97@users.noreply.github.com> * Normalizing on altup_num_inputs config option * regenerating modeling file after syncing to HEAD * Use torch.std(..., unbiased=False) for activation sparsity (#8) * Refactoring to a single QVK Norm (#13) * AltUp: support scale_corrected_output (#14) * Converts einsums to nn.Linear (#7) * Converts einsums to nn.Linear * Removing unused variables * Aligning SharedKVCache with HybridCache (#11) * Alinging SharedKVStore with HybridCache * Remove KVStore. Refactor apply_rotary_pos_emb for sharing * Addressing review comments * Supporting split modality embeddings in Gemma3n (#10) * Adding the Embedder class * Update modular Co-authored-by: Ryan Mullins <ryan@ryanmullins.org> * Update modular Co-authored-by: Ryan Mullins <ryan@ryanmullins.org> * Update modular Co-authored-by: Ryan Mullins <ryan@ryanmullins.org> * Update modular Co-authored-by: Ryan Mullins <ryan@ryanmullins.org> * Update modular Co-authored-by: Ryan Mullins <ryan@ryanmullins.org> * Update modular Co-authored-by: Ryan Mullins <ryan@ryanmullins.org> * Addressing review comments, adding audio embedding layers, integrating embedder with the remaining architecture, adding a forward method for conditional generation * Apply suggestions from code review Co-authored-by: Ryan Mullins <ryan@ryanmullins.org> * Update modular Co-authored-by: Ryan Mullins <ryan@ryanmullins.org> * Addressing review comments, prop drilling audio and vision configs to the text config * Removing TODO's that have been addressed * Simplify Embedder init and add audio embeddings * Embeddings refactor. Adds Gemma3nAudioEmbedder and Gemma3nVisionEmbedder * Refactoring vision and audio embeddings into ConditionalGeneration model --------- Co-authored-by: Ryan Mullins <ryan@ryanmullins.org> Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Updating attention mask for Gemma 3.5 (#15) * xxx_token_index to xxx_token_id * remvoing deprecated last_cache_position * Removing references to SigLIP * Always init per-layer inputs * Using torch.finfo().min for epsilon_tensor * Gemma3nDecoderLayer inherits from Gemma3DecoderLayer. Remove gating lambdas * fix modular GEMMA3N_INPUTS_DOCSTRING * Gemma3nAttention inherits from Gemma3Attention * Modular inheritance fixes * CausalLM conversion script for 4B model (#16) * Add Gemma3n Audio Encoder (#6) * initial commit of Gemma 3.5 scaffold * Fixing param pass through on Gemm3nRMSNorm * Adds Einsum layer to Gemma 3.5 * Updating EinsumLayer API * Undoing erroneous force push * Reverting RMSNorm to with_scale by default * Adds LAuReL to Gemma 3n * Adds AltUp to Gemma 3n * Adding Gemma3n overall and text config with vision and audio config placeholders (#3) * Adding gemma3n text configs * Adding audio config placeholders * Adding a placeholder for vision configs * Updating MobileNetVisionConfig, inheriting TimmWrapperConfig * Updating text configs * Update modular Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Removing altup configs to accept the suggested configs * Update modular Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Updating altup config * Update modular Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Update modular Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Update modular Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Update modular Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Addressing review comments and updating text configs * Adding a config for activation sparsity * Updating configs to pass through options to super class init and adjust some name prefixes * Updating laurel and altup with corrected config values * Normalizing sub_config initializers --------- Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Updating MLP with activation sparsity (#2) * Updating DecoderBlock for Gemma 3.5 (#3) * Initial Gemm3nTextModel (#4) NOTE: This implementation WILL CHANGE in the coming weeks, however, changes will be strictly additive and this will remain a suitable baseline for downstream implementations to reference. * Adding KV Cache Sharing * Adds Einsum layer to Gemma 3.5 * Updating EinsumLayer API * Refactored kv cache sharing in attention * Adding KVStore for cache sharing * Update modular Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Update modular Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Update modular Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Update src/transformers/cache_utils.py Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Undoing erroneous force push * Reverting RMSNorm to with_scale by default * Adds LAuReL to Gemma 3n * Updating KV Cache Sharing implementation * Updating the q and k norm definitions in the attention module * Fixing name error for q,k,v RMS norm to use the right Gemma 3n module * Updating MLP with activation sparsity * Updating DecoderBlock for Gemma 3.5 * Updating kv cache sharing implementation with the use of a cache buffer and refactoring some lines of code * Isolating KV Cache logic to relevant components * Fixing logic error in Gemma3nAttention.forward * Refactoring caching contributions and fixing kv_store initialization * Simplifying Configs * Remove errant self from super init call * Bug fix in the Attention module - changing self.head_dim to config.head_dim * Bug fixes in the LaurelBlock and RMS Norm super init call * removing redundant code from a merge * Adding per_layer_inputs to TextModel * Adding preprocess embeddings with altup * Adds per-layer-to-single output and a host of TODOs * Integrating altup predict with the model workflow and other minor bug fixes * Using nn.Embedding temporarily for text model * It goes forward * Minor refactor of attention sparsity and RoPE initialization * Fixing duplicate rope_scaling param bug when loading from pretrained --------- Co-authored-by: Sindhu Raghuram <sindhuraghuram@google.com> Co-authored-by: SindhuRaghuram97 <114270661+SindhuRaghuram97@users.noreply.github.com> * Normalizing on altup_num_inputs config option * Adding audio encoder config * Adds high-level components for Audio Encoder * Implement uniform reducer for Audio Encoder * Adding placeholders for Conformer components in Audio Encoder * Adding placeholders for SubSampleConvProjection components in Audio Encoder * Adding SequenceLayer component placeholders * Implementing Gemma3nAudioEncoder with nn.Sequential * Implementing Gemma3nAudioSubSampleConvProjection with nn.Sequential * Implementing Conformer model with SequenceLayers * Use OrderedDict in nn.Sequential initializers * Implements sl.Residual in Torch with nn.Sequential and OrderedDict * Adopting a base SequenceLayer class with default forward() method * Implementing sl.GatedLinearUnit in Torch * Implementing sl.Swish in Torch * Implementing sl.ReLU in Torch * Implementing sl.Scale in Torch * Removing sl.Dropout after tree-shaking * Implementing sl.RMSNorm in Torch with fake shape * Implementing sl.GroupNorm in Torch * Implementing sl.Conv2d in Torch * Implementing sl.Dense in Torch * Removing sl.Delay layers, which act as pass-throughs * Connecting shapes to configs in initializers * Removing sl.Emit * Implementing sl.ExpandDims in Torch * Adding sl.GradientClipping to Torch * Implementing sl.DenseShaped in Torch * Implementing sl.LDPA in Torch * Removing unused sl.CombinedQKVProj class * Fixing erroneous type hint * Implemnenting sl.DepthwiseConv1D in Torch * Implementing sl.MaskInvalid in Torch * Fixes for initialization * Fixes for saving weights * Removing einsums per feedback from HF staff * Removing Sequence Layers idioms from audio encoder * Fixes for reviewer comments * CausalLM conversion script for 4B model * inv_timescales to non-persistent buffer * Addressing audio encoder Attention feedback * Addressing Gemma3nAudioSSCPConvBlock feedback * Addressing Gemma3nAudioConformerAttention feedback * Addressing padding feedback * Weights conversion loads audio state dict * Always use vision_config so saving works * Token id updates for configs * Stubs for interleaving audio embs * Addressing reviewer feedback --------- Co-authored-by: SindhuRaghuram97 <114270661+SindhuRaghuram97@users.noreply.github.com> Co-authored-by: Sindhu Raghuram <sindhuraghuram@google.com> * Fixing cache access error * Removing duplicate code from a bad merge * Gemma 3n Text + Vision Part 1 (#17) * testing utilities for numerics comparisons * Corrected einsum to nn.Linear weights conversion * Inherit scaled word embs from Gemma3 not Bart * Fixing transposes for collapsed linears * More transpose fixes * numpy api fix * RMSNorm: Explicit kwargs, scale_shift=0.0 when with_scale=True * Force AltUp to float32 * Updating debugging script for AudioEncoder debugging * Support divide_weight_by_sqrt_fan_in from JAX for per-layer inputs * Correcting attention einsum conversions * RMSNorm in type of x * Fixing douplicate laurel norm/gating * KV sharing using the right previous indices * Refactor kv shared index computation. Correct frac_shared_layers * Use num_shared_layers instead of inferring from a fraction * fixing a bug for logging * Fix shared data_ptrs in altup inits * rope: adjust proj -> norm -> rope to preserve computation (#20) * rope: adjust proj -> norm -> rope to preserve computation * Removing some breaking language model fluff in ConditionalGeneration * Consolidate query_states transforms --------- Co-authored-by: Douglas Reid <21148125+douglas-reid@users.noreply.github.com> Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Vectorize the loops in AltUp (#19) * Vectorize the loops in AltUp * fix typo * Expanding to support batched inputs * remove extra debug script * Fix AltUp.forward --------- Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Add 'scale_shift=0.0, with_scale=True' to the final norm in TextModel * Convert norm to 1/sqrt (#21) * Convert norm to 1/sqrt * Scale shift change per Phil's rec * Adding default activation sparsity * Fixing 2B config in weights conversion script * Fixing RMSNorm parameters - adding scale_shift and with_scale * Correcting query pre-attention scaling * Adding query_rescale_scalar to text config * Adding layer_idx to MLP * Permafix for input_layernorm * Use 1/sqrt instead of rsqrt in DecoderLayer * Fix o_proj conversion * Conversion script update for vision encoder * Removing logging for debugging timm model * Fixing bugs in Gemma3nForConditionalGeneration for text generation * Generating the modeling_gemma3n.py file * Removing the addition of an erroneous line in the modeling file * Adding gemma3n text model to modeling_auto * Bugfix: Updating the interleaving of inputs_embeds and vision_embeds * Updating the modeling file with the latest bugfix changes * Updating models/auto for Gemma 3n * using AutoTokenizer in forward test * Adding processing_gemma3n.py * Gemma 3n configured for AutoModel. Conversion script updated. * Removing errant merge artifacts --------- Co-authored-by: Mayank Chaturvedi <imayank@google.com> Co-authored-by: Douglas Reid <douglas-reid@users.noreply.github.com> Co-authored-by: Douglas Reid <21148125+douglas-reid@users.noreply.github.com> Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com> Co-authored-by: Sindhu Raghuram <sindhuraghuram@google.com> * Removing errant debugging statements from Gemma 3 * Gemma3n audio model (#18) * testing utilities for numerics comparisons * Implement CumulativeGroupNorm and add to SubSampleConvProjection and SSCPConvBlock * Add audio version of forward script based on RyanMullins' implementation * Updating to match encoder tests. WIP: config question needs resolving * Updates to audio classes to enable end-to-end running * Removing vestigial classes, cleaning up print statements * Adding SiLU / Swish to audio conformer feed forward block * Shifted Gemma3p5Audio naming prefix to Gemma3NanoAudio * Adding outputs to audio test * Fixes to padding in SSCP and 1D convolution, align RMS Norm with wider model * Update forward test to load from local weights * Update conversion to process / output audio layers * Update __all__ to export audio encoder * AutoModel registration for Gemma 3n Audio * Use AutoModel for ConditionalGeneration.audio_tower * Fixing input_proj_linear transpose * Fixing Gemma3NanoAudioConformerAttention.post conversion * Fixing Gemma3NanoAudioSSCPConvBlock.conv weights conversion * Correcting indentation issue on Gemma3p5RMSNorm --------- Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Text + Vision Part 2 (#23) * Updates for ConditionalGeneration.get_image_features * Adding a WIP draft of image_processing_gemma3p5.py * Update src/transformers/models/gemma3p5/modular_gemma3p5.py Co-authored-by: SindhuRaghuram97 <114270661+SindhuRaghuram97@users.noreply.github.com> * Modular conversion after github suggested change * Text + image gives good results * Fixing image size preset * Updating configs for the 2B variant in the conversion script * Using final generation config in conversion script --------- Co-authored-by: Sindhu Raghuram <sindhuraghuram@google.com> Co-authored-by: SindhuRaghuram97 <114270661+SindhuRaghuram97@users.noreply.github.com> * Audio Integration (#12) * initial commit of Gemma 3n scaffold * Fixing param pass through on Gemm3nRMSNorm * Adds Einsum layer to Gemma 3n * Updating EinsumLayer API * Undoing erroneous force push * Reverting RMSNorm to with_scale by default * Adds LAuReL to Gemma 3n * Adds AltUp to Gemma 3n * Adding Gemma 3n overall and text config with vision and audio config placeholders (#3) * Adding Gemma 3n text configs * Adding audio config placeholders * Adding a placeholder for vision configs * Updating MobileNetVisionConfig, inheriting TimmWrapperConfig * Updating text configs * Update modular Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Removing altup configs to accept the suggested configs * Update modular Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Updating altup config * Update modular Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Update modular Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Update modular Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Update modular Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Addressing review comments and updating text configs * Adding a config for activation sparsity * Updating configs to pass through options to super class init and adjust some name prefixes * Updating laurel and altup with corrected config values * Normalizing sub_config initializers --------- Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Updating MLP with activation sparsity (#2) * Updating DecoderBlock for Gemma 3n (#3) * Initial Gemma3nTextModel (#4) NOTE: This implementation WILL CHANGE in the coming weeks, however, changes will be strictly additive and this will remain a suitable baseline for downstream implementations to reference. * Adding KV Cache Sharing * Adds Einsum layer to Gemma 3n * Updating EinsumLayer API * Refactored kv cache sharing in attention * Adding KVStore for cache sharing * Update modular Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Update modular Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Update modular Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Update src/transformers/cache_utils.py Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Undoing erroneous force push * Reverting RMSNorm to with_scale by default * Adds LAuReL to Gemma 3n * Updating KV Cache Sharing implementation * Updating the q and k norm definitions in the attention module * Fixing name error for q,k,v RMS norm to use the right 3n module * Updating MLP with activation sparsity * Updating DecoderBlock for Gemma 3n * Updating kv cache sharing implementation with the use of a cache buffer and refactoring some lines of code * Isolating KV Cache logic to relevant components * Fixing logic error in Gemma3nAttention.forward * Refactoring caching contributions and fixing kv_store initialization * Simplifying Configs * Remove errant self from super init call * Bug fix in the Attention module - changing self.head_dim to config.head_dim * Bug fixes in the LaurelBlock and RMS Norm super init call * removing redundant code from a merge * Adding per_layer_inputs to TextModel * Adding preprocess embeddings with altup * Adds per-layer-to-single output and a host of TODOs * Integrating altup predict with the model workflow and other minor bug fixes * Using nn.Embedding temporarily for text model * It goes forward * Minor refactor of attention sparsity and RoPE initialization * Fixing duplicate rope_scaling param bug when loading from pretrained --------- Co-authored-by: Sindhu Raghuram <sindhuraghuram@google.com> Co-authored-by: SindhuRaghuram97 <114270661+SindhuRaghuram97@users.noreply.github.com> * Normalizing on altup_num_inputs config option * Adding audio encoder config * Adds high-level components for Audio Encoder * Implement uniform reducer for Audio Encoder * Adding placeholders for Conformer components in Audio Encoder * Adding placeholders for SubSampleConvProjection components in Audio Encoder * Adding SequenceLayer component placeholders * Implementing Gemma3nAudioEncoder with nn.Sequential * Implementing Gemma3nAudioSubSampleConvProjection with nn.Sequential * Implementing Conformer model with SequenceLayers * Use OrderedDict in nn.Sequential initializers * Implements sl.Residual in Torch with nn.Sequential and OrderedDict * Adopting a base SequenceLayer class with default forward() method * Implementing sl.GatedLinearUnit in Torch * Implementing sl.Swish in Torch * Implementing sl.ReLU in Torch * Implementing sl.Scale in Torch * Removing sl.Dropout after tree-shaking * Implementing sl.RMSNorm in Torch with fake shape * Implementing sl.GroupNorm in Torch * Implementing sl.Conv2d in Torch * Implementing sl.Dense in Torch * Removing sl.Delay layers, which act as pass-throughs * Connecting shapes to configs in initializers * Removing sl.Emit * Implementing sl.ExpandDims in Torch * Adding sl.GradientClipping to Torch * Implementing sl.DenseShaped in Torch * Implementing sl.LDPA in Torch * Removing unused sl.CombinedQKVProj class * Fixing erroneous type hint * Implemnenting sl.DepthwiseConv1D in Torch * Implementing sl.MaskInvalid in Torch * Fixes for initialization * Fixes for saving weights * Removing einsums per feedback from HF staff * Removing Sequence Layers idioms from audio encoder * Fixes for reviewer comments * Converting sl.Frontend to FeatureExtractor * Updates for ConditionalGeneration.get_image_features * Adding a WIP draft of image_processing_gemma3n.py * Update modular Co-authored-by: SindhuRaghuram97 <114270661+SindhuRaghuram97@users.noreply.github.com> * Modular conversion after github suggested change * Text + image gives good results * Fixing image size preset * Draft of audio data in chat template * Removing image processing. Using SigLIP instead. * Audio input going end-to-end * Fixing dtype issues in audio encoder * x-lib formatting consistency * Adding example data * Save preprocessor_config.json from conversion script * Instrumentaiton for debugging * Additional instrumentation for preprocessing debugging * Updates to preprocessor, padding; produces correct end-to-end results on sample * Tackling configuraiton TODOs * Start of feature extractor refatcor * Adds Numpy version of USM extractor, removes Torch version and dependencies * Fixing AltUp.correct coef permute * Supporting batches of single audio segment inputs * Docstrings updates for config * In-lining audio feature extraction * Adjustments to conversion script and smoke test script --------- Co-authored-by: SindhuRaghuram97 <114270661+SindhuRaghuram97@users.noreply.github.com> Co-authored-by: Sindhu Raghuram <sindhuraghuram@google.com> Co-authored-by: pculliton <phillipculliton@gmail.com> * Gemma 3n renaming * Removing test data and utilities * Renaming test files * Gemma 3n refactor * Fix tokenizer config in conversion script * Address reviewer feedback * FeatureExtractor returns float32 by default * Adding basic tests for audio, and input name for audio encoder * Audio integration test, updates to model_id for other integration tests * Use scales for q and k norms (#26) * Update audio integration test to use HF dataset * Reviewer feedback * Expand embedding table to full vocab size in weights conversion * Mix-n-match MatFormers for Gemma 3n (#25) * Remove in-place operations (#30) * chore: removing inplace ops * remove [tensor] * n pattern * chore: reviewer feedback in AudioEncoder and AltUp * More grad clipping * Dynamo compatibility * fix: cache slicing error * chore: simplify shared kv cache slicing * chore: vision encoder rename in timm * fix: image processor do_normalize=False * fixup: style * chore: model_doc * fix: docs for code quality * chore: repo consistency * fix: RMSNorm in float as in prior Gemmas * fix: per_layer_inputs = None * chore: Gemma3nForCausalLM from Gemma3nForConditionalGeneration checkpoint * chore: repo consistency * Add initial unit tests for Gemma3nAudioFeatureExtractor (#27) * Add initial unit tests for Gemma3nAudioFeatureExtractor * Add basic unit tests for Gemma3nProcessor (#28) Co-authored-by: Douglas Reid <21148125+douglas-reid@users.noreply.github.com> * parameterize tests --------- Co-authored-by: Douglas Reid <21148125+douglas-reid@users.noreply.github.com> * chore: code style * fix: test cases * style and consistency * fix config in the test to be coherent with layer cache sharing * fix hidden states in tests and code * inits and mappings * fix modality prefixes * test order and prefixes * fix test exception * fix class order and reduce model size for faster tests * restore _checkpoint_conversion_mapping to load Caual from Conditional * fix config mapping! * fix: reviewer feedback --------- Co-authored-by: SindhuRaghuram97 <114270661+SindhuRaghuram97@users.noreply.github.com> Co-authored-by: Sindhu Raghuram <sindhuraghuram@google.com> Co-authored-by: raushan <raushan@huggingface.co> Co-authored-by: Mayank Chaturvedi <imayank@google.com> Co-authored-by: Douglas Reid <douglas-reid@users.noreply.github.com> Co-authored-by: Douglas Reid <21148125+douglas-reid@users.noreply.github.com> Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com> Co-authored-by: pculliton <phillipculliton@gmail.com> Co-authored-by: Aritra Roy Gosthipaty <aritra.born2fly@gmail.com> Co-authored-by: Cyril Vallez <cyril.vallez@gmail.com> * fix import test * add model args * auto_docstring * replace test path * consistency * skip tests for now * fix docstring for doc builder * skip unused attr --------- Co-authored-by: SindhuRaghuram97 <114270661+SindhuRaghuram97@users.noreply.github.com> Co-authored-by: Sindhu Raghuram <sindhuraghuram@google.com> Co-authored-by: raushan <raushan@huggingface.co> Co-authored-by: Mayank Chaturvedi <imayank@google.com> Co-authored-by: Douglas Reid <douglas-reid@users.noreply.github.com> Co-authored-by: Douglas Reid <21148125+douglas-reid@users.noreply.github.com> Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com> Co-authored-by: pculliton <phillipculliton@gmail.com> Co-authored-by: Aritra Roy Gosthipaty <aritra.born2fly@gmail.com> Co-authored-by: Cyril Vallez <cyril.vallez@gmail.com> Co-authored-by: Arthur <arthur.zucker@gmail.com>
186 lines
7.6 KiB
Python
186 lines
7.6 KiB
Python
# Copyright 2025 The HuggingFace Team. All rights reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
import shutil
|
|
import tempfile
|
|
import unittest
|
|
|
|
import numpy as np
|
|
from parameterized import parameterized
|
|
|
|
from transformers import GemmaTokenizerFast, SiglipImageProcessorFast, is_speech_available
|
|
from transformers.testing_utils import require_sentencepiece, require_torch, require_torchaudio, require_vision
|
|
|
|
from .test_feature_extraction_gemma3n import floats_list
|
|
|
|
|
|
if is_speech_available():
|
|
from transformers.models.gemma3n import Gemma3nAudioFeatureExtractor, Gemma3nProcessor
|
|
|
|
|
|
@require_torch
|
|
@require_torchaudio
|
|
@require_vision
|
|
@require_sentencepiece
|
|
class Gemma3nProcessorTest(unittest.TestCase):
|
|
def setUp(self):
|
|
# TODO: update to google?
|
|
self.model_id = "Google/gemma-3n-E4B-it"
|
|
self.tmpdirname = tempfile.mkdtemp(suffix="gemma3n")
|
|
self.maxDiff = None
|
|
|
|
def get_tokenizer(self, **kwargs):
|
|
return GemmaTokenizerFast.from_pretrained(self.model_id, **kwargs)
|
|
|
|
def get_feature_extractor(self, **kwargs):
|
|
return Gemma3nAudioFeatureExtractor.from_pretrained(self.model_id, **kwargs)
|
|
|
|
def get_image_processor(self, **kwargs):
|
|
return SiglipImageProcessorFast.from_pretrained(self.model_id, **kwargs)
|
|
|
|
def tearDown(self):
|
|
shutil.rmtree(self.tmpdirname)
|
|
|
|
def test_save_load_pretrained_default(self):
|
|
# NOTE: feature_extractor and image_processor both use the same filename, preprocessor_config.json, when saved to
|
|
# disk, but the files are overwritten by processor.save_pretrained(). This test does not attempt to address
|
|
# this potential issue, and as such, does not guarantee content accuracy.
|
|
|
|
tokenizer = self.get_tokenizer()
|
|
feature_extractor = self.get_feature_extractor()
|
|
image_processor = self.get_image_processor()
|
|
|
|
processor = Gemma3nProcessor(
|
|
tokenizer=tokenizer, feature_extractor=feature_extractor, image_processor=image_processor
|
|
)
|
|
|
|
processor.save_pretrained(self.tmpdirname)
|
|
processor = Gemma3nProcessor.from_pretrained(self.tmpdirname)
|
|
|
|
self.assertIsInstance(processor.tokenizer, GemmaTokenizerFast)
|
|
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab())
|
|
|
|
self.assertIsInstance(processor.feature_extractor, Gemma3nAudioFeatureExtractor)
|
|
self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor.to_json_string())
|
|
|
|
def test_save_load_pretrained_additional_features(self):
|
|
tokenizer = self.get_tokenizer()
|
|
feature_extractor = self.get_feature_extractor()
|
|
image_processor = self.get_image_processor()
|
|
|
|
processor = Gemma3nProcessor(
|
|
tokenizer=tokenizer, feature_extractor=feature_extractor, image_processor=image_processor
|
|
)
|
|
processor.save_pretrained(self.tmpdirname)
|
|
|
|
tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS-BOS)", eos_token="(EOS-EOS)")
|
|
feature_extractor_add_kwargs = self.get_feature_extractor(dither=5.0, padding_value=1.0)
|
|
|
|
processor = Gemma3nProcessor.from_pretrained(
|
|
self.tmpdirname, bos_token="(BOS-BOS)", eos_token="(EOS-EOS)", dither=5.0, padding_value=1.0
|
|
)
|
|
|
|
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
|
|
self.assertIsInstance(processor.tokenizer, GemmaTokenizerFast)
|
|
|
|
self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string())
|
|
self.assertIsInstance(processor.feature_extractor, Gemma3nAudioFeatureExtractor)
|
|
|
|
@parameterized.expand([256, 512, 768, 1024])
|
|
def test_image_processor(self, image_size: int):
|
|
feature_extractor = self.get_feature_extractor()
|
|
tokenizer = self.get_tokenizer()
|
|
image_processor = self.get_image_processor()
|
|
processor = Gemma3nProcessor(
|
|
tokenizer=tokenizer, feature_extractor=feature_extractor, image_processor=image_processor
|
|
)
|
|
|
|
raw_image = np.random.randint(0, 256, size=(image_size, image_size, 3), dtype=np.uint8)
|
|
input_image_processor = image_processor(raw_image, return_tensors="pt")
|
|
input_processor = processor(text="Describe:", images=raw_image, return_tensors="pt")
|
|
|
|
for key in input_image_processor.keys():
|
|
self.assertAlmostEqual(input_image_processor[key].sum(), input_processor[key].sum(), delta=1e-2)
|
|
if "pixel_values" in key:
|
|
# NOTE: all images should be re-scaled to 768x768
|
|
self.assertEqual(input_image_processor[key].shape, (1, 3, 768, 768))
|
|
self.assertEqual(input_processor[key].shape, (1, 3, 768, 768))
|
|
|
|
def test_audio_feature_extractor(self):
|
|
feature_extractor = self.get_feature_extractor()
|
|
tokenizer = self.get_tokenizer()
|
|
image_processor = self.get_image_processor()
|
|
processor = Gemma3nProcessor(
|
|
tokenizer=tokenizer, feature_extractor=feature_extractor, image_processor=image_processor
|
|
)
|
|
|
|
raw_speech = floats_list((3, 1000))
|
|
input_feat_extract = feature_extractor(raw_speech, return_tensors="pt")
|
|
input_processor = processor(text="Transcribe:", audio=raw_speech, return_tensors="pt")
|
|
|
|
for key in input_feat_extract.keys():
|
|
self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
|
|
|
|
def test_tokenizer(self):
|
|
feature_extractor = self.get_feature_extractor()
|
|
tokenizer = self.get_tokenizer()
|
|
image_processor = self.get_image_processor()
|
|
processor = Gemma3nProcessor(
|
|
tokenizer=tokenizer, feature_extractor=feature_extractor, image_processor=image_processor
|
|
)
|
|
|
|
input_str = "This is a test string"
|
|
|
|
encoded_processor = processor(text=input_str)
|
|
|
|
encoded_tok = tokenizer(input_str)
|
|
|
|
for key in encoded_tok.keys():
|
|
self.assertListEqual(encoded_tok[key], encoded_processor[key][0])
|
|
|
|
def test_tokenizer_decode(self):
|
|
feature_extractor = self.get_feature_extractor()
|
|
tokenizer = self.get_tokenizer()
|
|
image_processor = self.get_image_processor()
|
|
processor = Gemma3nProcessor(
|
|
tokenizer=tokenizer, feature_extractor=feature_extractor, image_processor=image_processor
|
|
)
|
|
|
|
predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]]
|
|
|
|
decoded_processor = processor.batch_decode(predicted_ids)
|
|
decoded_tok = tokenizer.batch_decode(predicted_ids)
|
|
|
|
self.assertListEqual(decoded_tok, decoded_processor)
|
|
|
|
def test_model_input_names(self):
|
|
feature_extractor = self.get_feature_extractor()
|
|
tokenizer = self.get_tokenizer()
|
|
image_processor = self.get_image_processor()
|
|
processor = Gemma3nProcessor(
|
|
tokenizer=tokenizer, feature_extractor=feature_extractor, image_processor=image_processor
|
|
)
|
|
|
|
for key in feature_extractor.model_input_names:
|
|
self.assertIn(
|
|
key,
|
|
processor.model_input_names,
|
|
)
|
|
|
|
for key in image_processor.model_input_names:
|
|
self.assertIn(
|
|
key,
|
|
processor.model_input_names,
|
|
)
|