Mirror of https://github.com/huggingface/transformers.git (synced 2025-07-03 21:00:08 +06:00)

* Gemma 3n * initial commit of Gemma 3n scaffold * Fixing param pass through on Gemm3p5RMSNorm * Adds Einsum layer to Gemma 3n * Updating EinsumLayer API * Undoing erroneous force push * Reverting RMSNorm to with_scale by default * Adds LAuReL to Gemma 3n * Adds AltUp to Gemma 3n * Adding Gemma3p5 overall and text config with vision and audio config placeholders (#3) * Adding gemma3p5 text configs * Adding audio config placeholders * Adding a placeholder for vision configs * Updating MobileNetVisionConfig, inheriting TimmWrapperConfig * Updating text configs * Update src/transformers/models/gemma3p5/modular_gemma3p5.py Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Removing altup configs to accept the suggested configs * Update src/transformers/models/gemma3p5/modular_gemma3p5.py Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Updating altup config * Update modular Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Update modular Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Update modular Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Update modular Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Addressing review comments and updating text configs * Adding a config for activation sparsity * Updating configs to pass through options to super class init and adjust some name prefixes * Updating laurel and altup with corrected config values * Normalizing sub_config initializers --------- Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Updating MLP with activation sparsity (#2) * Updating DecoderBlock for Gemma 3n (#3) * Initial Gemm3nTextModel (#4) NOTE: This implementation WILL CHANGE in the coming weeks, however, changes will be strictly additive and this will remain a suitable baseline for downstream implementations to reference. 
* Adding KV Cache Sharing * Adds Einsum layer to Gemma 3n * Updating EinsumLayer API * Refactored kv cache sharing in attention * Adding KVStore for cache sharing * Update modular Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Update modular Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Update modular Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Update src/transformers/cache_utils.py Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Undoing erroneous force push * Reverting RMSNorm to with_scale by default * Adds LAuReL to Gemma 3n * Updating KV Cache Sharing implementation * Updating the q and k norm definitions in the attention module * Fixing name error for q,k,v RMS norm to use the right 3n module * Updating MLP with activation sparsity * Updating DecoderBlock for Gemma 3.5 * Updating kv cache sharing implementation with the use of a cache buffer and refactoring some lines of code * Isolating KV Cache logic to relevant components * Fixing logic error in Gemma3nAttention.forward * Refactoring caching contributions and fixing kv_store initialization * Simplifying Configs * Remove errant self from super init call * Bug fix in the Attention module - changing self.head_dim to config.head_dim * Bug fixes in the LaurelBlock and RMS Norm super init call * removing redundant code from a merge * Adding per_layer_inputs to TextModel * Adding preprocess embeddings with altup * Adds per-layer-to-single output and a host of TODOs * Integrating altup predict with the model workflow and other minor bug fixes * Using nn.Embedding temporarily for text model * It goes forward * Minor refactor of attention sparsity and RoPE initialization * Fixing duplicate rope_scaling param bug when loading from pretrained --------- Co-authored-by: Sindhu Raghuram <sindhuraghuram@google.com> Co-authored-by: SindhuRaghuram97 <114270661+SindhuRaghuram97@users.noreply.github.com> * Normalizing on altup_num_inputs config option * regenerating modeling file after syncing to HEAD * Use torch.std(..., unbiased=False) for activation sparsity (#8) * Refactoring to a single QVK Norm (#13) * AltUp: support scale_corrected_output (#14) * Converts einsums to nn.Linear (#7) * Converts einsums to nn.Linear * Removing unused variables * Aligning SharedKVCache with HybridCache (#11) * Alinging SharedKVStore with HybridCache * Remove KVStore. Refactor apply_rotary_pos_emb for sharing * Addressing review comments * Supporting split modality embeddings in Gemma3n (#10) * Adding the Embedder class * Update modular Co-authored-by: Ryan Mullins <ryan@ryanmullins.org> * Update modular Co-authored-by: Ryan Mullins <ryan@ryanmullins.org> * Update modular Co-authored-by: Ryan Mullins <ryan@ryanmullins.org> * Update modular Co-authored-by: Ryan Mullins <ryan@ryanmullins.org> * Update modular Co-authored-by: Ryan Mullins <ryan@ryanmullins.org> * Update modular Co-authored-by: Ryan Mullins <ryan@ryanmullins.org> * Addressing review comments, adding audio embedding layers, integrating embedder with the remaining architecture, adding a forward method for conditional generation * Apply suggestions from code review Co-authored-by: Ryan Mullins <ryan@ryanmullins.org> * Update modular Co-authored-by: Ryan Mullins <ryan@ryanmullins.org> * Addressing review comments, prop drilling audio and vision configs to the text config * Removing TODO's that have been addressed * Simplify Embedder init and add audio embeddings * Embeddings refactor. 
Adds Gemma3nAudioEmbedder and Gemma3nVisionEmbedder * Refactoring vision and audio embeddings into ConditionalGeneration model --------- Co-authored-by: Ryan Mullins <ryan@ryanmullins.org> Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Updating attention mask for Gemma 3.5 (#15) * xxx_token_index to xxx_token_id * remvoing deprecated last_cache_position * Removing references to SigLIP * Always init per-layer inputs * Using torch.finfo().min for epsilon_tensor * Gemma3nDecoderLayer inherits from Gemma3DecoderLayer. Remove gating lambdas * fix modular GEMMA3N_INPUTS_DOCSTRING * Gemma3nAttention inherits from Gemma3Attention * Modular inheritance fixes * CausalLM conversion script for 4B model (#16) * Add Gemma3n Audio Encoder (#6) * initial commit of Gemma 3.5 scaffold * Fixing param pass through on Gemm3nRMSNorm * Adds Einsum layer to Gemma 3.5 * Updating EinsumLayer API * Undoing erroneous force push * Reverting RMSNorm to with_scale by default * Adds LAuReL to Gemma 3n * Adds AltUp to Gemma 3n * Adding Gemma3n overall and text config with vision and audio config placeholders (#3) * Adding gemma3n text configs * Adding audio config placeholders * Adding a placeholder for vision configs * Updating MobileNetVisionConfig, inheriting TimmWrapperConfig * Updating text configs * Update modular Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Removing altup configs to accept the suggested configs * Update modular Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Updating altup config * Update modular Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Update modular Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Update modular Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Update modular Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Addressing review comments and updating text configs * Adding a config for activation sparsity * Updating configs to pass through options to super class init and adjust some name prefixes * Updating laurel and altup with corrected config values * Normalizing sub_config initializers --------- Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Updating MLP with activation sparsity (#2) * Updating DecoderBlock for Gemma 3.5 (#3) * Initial Gemm3nTextModel (#4) NOTE: This implementation WILL CHANGE in the coming weeks, however, changes will be strictly additive and this will remain a suitable baseline for downstream implementations to reference. 
* Adding KV Cache Sharing * Adds Einsum layer to Gemma 3.5 * Updating EinsumLayer API * Refactored kv cache sharing in attention * Adding KVStore for cache sharing * Update modular Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Update modular Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Update modular Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Update src/transformers/cache_utils.py Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Undoing erroneous force push * Reverting RMSNorm to with_scale by default * Adds LAuReL to Gemma 3n * Updating KV Cache Sharing implementation * Updating the q and k norm definitions in the attention module * Fixing name error for q,k,v RMS norm to use the right Gemma 3n module * Updating MLP with activation sparsity * Updating DecoderBlock for Gemma 3.5 * Updating kv cache sharing implementation with the use of a cache buffer and refactoring some lines of code * Isolating KV Cache logic to relevant components * Fixing logic error in Gemma3nAttention.forward * Refactoring caching contributions and fixing kv_store initialization * Simplifying Configs * Remove errant self from super init call * Bug fix in the Attention module - changing self.head_dim to config.head_dim * Bug fixes in the LaurelBlock and RMS Norm super init call * removing redundant code from a merge * Adding per_layer_inputs to TextModel * Adding preprocess embeddings with altup * Adds per-layer-to-single output and a host of TODOs * Integrating altup predict with the model workflow and other minor bug fixes * Using nn.Embedding temporarily for text model * It goes forward * Minor refactor of attention sparsity and RoPE initialization * Fixing duplicate rope_scaling param bug when loading from pretrained --------- Co-authored-by: Sindhu Raghuram <sindhuraghuram@google.com> Co-authored-by: SindhuRaghuram97 <114270661+SindhuRaghuram97@users.noreply.github.com> * Normalizing on altup_num_inputs config option * Adding audio encoder config * Adds high-level components for Audio Encoder * Implement uniform reducer for Audio Encoder * Adding placeholders for Conformer components in Audio Encoder * Adding placeholders for SubSampleConvProjection components in Audio Encoder * Adding SequenceLayer component placeholders * Implementing Gemma3nAudioEncoder with nn.Sequential * Implementing Gemma3nAudioSubSampleConvProjection with nn.Sequential * Implementing Conformer model with SequenceLayers * Use OrderedDict in nn.Sequential initializers * Implements sl.Residual in Torch with nn.Sequential and OrderedDict * Adopting a base SequenceLayer class with default forward() method * Implementing sl.GatedLinearUnit in Torch * Implementing sl.Swish in Torch * Implementing sl.ReLU in Torch * Implementing sl.Scale in Torch * Removing sl.Dropout after tree-shaking * Implementing sl.RMSNorm in Torch with fake shape * Implementing sl.GroupNorm in Torch * Implementing sl.Conv2d in Torch * Implementing sl.Dense in Torch * Removing sl.Delay layers, which act as pass-throughs * Connecting shapes to configs in initializers * Removing sl.Emit * Implementing sl.ExpandDims in Torch * Adding sl.GradientClipping to Torch * Implementing sl.DenseShaped in Torch * Implementing sl.LDPA in Torch * Removing unused sl.CombinedQKVProj class * Fixing erroneous type hint * Implemnenting sl.DepthwiseConv1D in Torch * Implementing sl.MaskInvalid in Torch * Fixes for initialization * Fixes for saving weights * Removing einsums per feedback from HF staff * Removing Sequence Layers idioms from audio encoder * 
Fixes for reviewer comments * CausalLM conversion script for 4B model * inv_timescales to non-persistent buffer * Addressing audio encoder Attention feedback * Addressing Gemma3nAudioSSCPConvBlock feedback * Addressing Gemma3nAudioConformerAttention feedback * Addressing padding feedback * Weights conversion loads audio state dict * Always use vision_config so saving works * Token id updates for configs * Stubs for interleaving audio embs * Addressing reviewer feedback --------- Co-authored-by: SindhuRaghuram97 <114270661+SindhuRaghuram97@users.noreply.github.com> Co-authored-by: Sindhu Raghuram <sindhuraghuram@google.com> * Fixing cache access error * Removing duplicate code from a bad merge * Gemma 3n Text + Vision Part 1 (#17) * testing utilities for numerics comparisons * Corrected einsum to nn.Linear weights conversion * Inherit scaled word embs from Gemma3 not Bart * Fixing transposes for collapsed linears * More transpose fixes * numpy api fix * RMSNorm: Explicit kwargs, scale_shift=0.0 when with_scale=True * Force AltUp to float32 * Updating debugging script for AudioEncoder debugging * Support divide_weight_by_sqrt_fan_in from JAX for per-layer inputs * Correcting attention einsum conversions * RMSNorm in type of x * Fixing douplicate laurel norm/gating * KV sharing using the right previous indices * Refactor kv shared index computation. Correct frac_shared_layers * Use num_shared_layers instead of inferring from a fraction * fixing a bug for logging * Fix shared data_ptrs in altup inits * rope: adjust proj -> norm -> rope to preserve computation (#20) * rope: adjust proj -> norm -> rope to preserve computation * Removing some breaking language model fluff in ConditionalGeneration * Consolidate query_states transforms --------- Co-authored-by: Douglas Reid <21148125+douglas-reid@users.noreply.github.com> Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Vectorize the loops in AltUp (#19) * Vectorize the loops in AltUp * fix typo * Expanding to support batched inputs * remove extra debug script * Fix AltUp.forward --------- Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Add 'scale_shift=0.0, with_scale=True' to the final norm in TextModel * Convert norm to 1/sqrt (#21) * Convert norm to 1/sqrt * Scale shift change per Phil's rec * Adding default activation sparsity * Fixing 2B config in weights conversion script * Fixing RMSNorm parameters - adding scale_shift and with_scale * Correcting query pre-attention scaling * Adding query_rescale_scalar to text config * Adding layer_idx to MLP * Permafix for input_layernorm * Use 1/sqrt instead of rsqrt in DecoderLayer * Fix o_proj conversion * Conversion script update for vision encoder * Removing logging for debugging timm model * Fixing bugs in Gemma3nForConditionalGeneration for text generation * Generating the modeling_gemma3n.py file * Removing the addition of an erroneous line in the modeling file * Adding gemma3n text model to modeling_auto * Bugfix: Updating the interleaving of inputs_embeds and vision_embeds * Updating the modeling file with the latest bugfix changes * Updating models/auto for Gemma 3n * using AutoTokenizer in forward test * Adding processing_gemma3n.py * Gemma 3n configured for AutoModel. Conversion script updated. 
* Removing errant merge artifacts --------- Co-authored-by: Mayank Chaturvedi <imayank@google.com> Co-authored-by: Douglas Reid <douglas-reid@users.noreply.github.com> Co-authored-by: Douglas Reid <21148125+douglas-reid@users.noreply.github.com> Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com> Co-authored-by: Sindhu Raghuram <sindhuraghuram@google.com> * Removing errant debugging statements from Gemma 3 * Gemma3n audio model (#18) * testing utilities for numerics comparisons * Implement CumulativeGroupNorm and add to SubSampleConvProjection and SSCPConvBlock * Add audio version of forward script based on RyanMullins' implementation * Updating to match encoder tests. WIP: config question needs resolving * Updates to audio classes to enable end-to-end running * Removing vestigial classes, cleaning up print statements * Adding SiLU / Swish to audio conformer feed forward block * Shifted Gemma3p5Audio naming prefix to Gemma3NanoAudio * Adding outputs to audio test * Fixes to padding in SSCP and 1D convolution, align RMS Norm with wider model * Update forward test to load from local weights * Update conversion to process / output audio layers * Update __all__ to export audio encoder * AutoModel registration for Gemma 3n Audio * Use AutoModel for ConditionalGeneration.audio_tower * Fixing input_proj_linear transpose * Fixing Gemma3NanoAudioConformerAttention.post conversion * Fixing Gemma3NanoAudioSSCPConvBlock.conv weights conversion * Correcting indentation issue on Gemma3p5RMSNorm --------- Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Text + Vision Part 2 (#23) * Updates for ConditionalGeneration.get_image_features * Adding a WIP draft of image_processing_gemma3p5.py * Update src/transformers/models/gemma3p5/modular_gemma3p5.py Co-authored-by: SindhuRaghuram97 <114270661+SindhuRaghuram97@users.noreply.github.com> * Modular conversion after github suggested change * Text + image gives good results * Fixing image size preset * Updating configs for the 2B variant in the conversion script * Using final generation config in conversion script --------- Co-authored-by: Sindhu Raghuram <sindhuraghuram@google.com> Co-authored-by: SindhuRaghuram97 <114270661+SindhuRaghuram97@users.noreply.github.com> * Audio Integration (#12) * initial commit of Gemma 3n scaffold * Fixing param pass through on Gemm3nRMSNorm * Adds Einsum layer to Gemma 3n * Updating EinsumLayer API * Undoing erroneous force push * Reverting RMSNorm to with_scale by default * Adds LAuReL to Gemma 3n * Adds AltUp to Gemma 3n * Adding Gemma 3n overall and text config with vision and audio config placeholders (#3) * Adding Gemma 3n text configs * Adding audio config placeholders * Adding a placeholder for vision configs * Updating MobileNetVisionConfig, inheriting TimmWrapperConfig * Updating text configs * Update modular Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Removing altup configs to accept the suggested configs * Update modular Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Updating altup config * Update modular Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Update modular Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Update modular Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Update modular Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Addressing review comments and updating text configs * Adding a config for activation sparsity * Updating configs to pass through options to super class init and adjust some name prefixes * Updating laurel and altup with 
corrected config values * Normalizing sub_config initializers --------- Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Updating MLP with activation sparsity (#2) * Updating DecoderBlock for Gemma 3n (#3) * Initial Gemma3nTextModel (#4) NOTE: This implementation WILL CHANGE in the coming weeks, however, changes will be strictly additive and this will remain a suitable baseline for downstream implementations to reference. * Adding KV Cache Sharing * Adds Einsum layer to Gemma 3n * Updating EinsumLayer API * Refactored kv cache sharing in attention * Adding KVStore for cache sharing * Update modular Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Update modular Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Update modular Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Update src/transformers/cache_utils.py Co-authored-by: Ryan Mullins <ryanmullins@google.com> * Undoing erroneous force push * Reverting RMSNorm to with_scale by default * Adds LAuReL to Gemma 3n * Updating KV Cache Sharing implementation * Updating the q and k norm definitions in the attention module * Fixing name error for q,k,v RMS norm to use the right 3n module * Updating MLP with activation sparsity * Updating DecoderBlock for Gemma 3n * Updating kv cache sharing implementation with the use of a cache buffer and refactoring some lines of code * Isolating KV Cache logic to relevant components * Fixing logic error in Gemma3nAttention.forward * Refactoring caching contributions and fixing kv_store initialization * Simplifying Configs * Remove errant self from super init call * Bug fix in the Attention module - changing self.head_dim to config.head_dim * Bug fixes in the LaurelBlock and RMS Norm super init call * removing redundant code from a merge * Adding per_layer_inputs to TextModel * Adding preprocess embeddings with altup * Adds per-layer-to-single output and a host of TODOs * Integrating altup predict with the model workflow and other minor bug fixes * Using nn.Embedding temporarily for text model * It goes forward * Minor refactor of attention sparsity and RoPE initialization * Fixing duplicate rope_scaling param bug when loading from pretrained --------- Co-authored-by: Sindhu Raghuram <sindhuraghuram@google.com> Co-authored-by: SindhuRaghuram97 <114270661+SindhuRaghuram97@users.noreply.github.com> * Normalizing on altup_num_inputs config option * Adding audio encoder config * Adds high-level components for Audio Encoder * Implement uniform reducer for Audio Encoder * Adding placeholders for Conformer components in Audio Encoder * Adding placeholders for SubSampleConvProjection components in Audio Encoder * Adding SequenceLayer component placeholders * Implementing Gemma3nAudioEncoder with nn.Sequential * Implementing Gemma3nAudioSubSampleConvProjection with nn.Sequential * Implementing Conformer model with SequenceLayers * Use OrderedDict in nn.Sequential initializers * Implements sl.Residual in Torch with nn.Sequential and OrderedDict * Adopting a base SequenceLayer class with default forward() method * Implementing sl.GatedLinearUnit in Torch * Implementing sl.Swish in Torch * Implementing sl.ReLU in Torch * Implementing sl.Scale in Torch * Removing sl.Dropout after tree-shaking * Implementing sl.RMSNorm in Torch with fake shape * Implementing sl.GroupNorm in Torch * Implementing sl.Conv2d in Torch * Implementing sl.Dense in Torch * Removing sl.Delay layers, which act as pass-throughs * Connecting shapes to configs in initializers * Removing sl.Emit * Implementing sl.ExpandDims in 
Torch * Adding sl.GradientClipping to Torch * Implementing sl.DenseShaped in Torch * Implementing sl.LDPA in Torch * Removing unused sl.CombinedQKVProj class * Fixing erroneous type hint * Implemnenting sl.DepthwiseConv1D in Torch * Implementing sl.MaskInvalid in Torch * Fixes for initialization * Fixes for saving weights * Removing einsums per feedback from HF staff * Removing Sequence Layers idioms from audio encoder * Fixes for reviewer comments * Converting sl.Frontend to FeatureExtractor * Updates for ConditionalGeneration.get_image_features * Adding a WIP draft of image_processing_gemma3n.py * Update modular Co-authored-by: SindhuRaghuram97 <114270661+SindhuRaghuram97@users.noreply.github.com> * Modular conversion after github suggested change * Text + image gives good results * Fixing image size preset * Draft of audio data in chat template * Removing image processing. Using SigLIP instead. * Audio input going end-to-end * Fixing dtype issues in audio encoder * x-lib formatting consistency * Adding example data * Save preprocessor_config.json from conversion script * Instrumentaiton for debugging * Additional instrumentation for preprocessing debugging * Updates to preprocessor, padding; produces correct end-to-end results on sample * Tackling configuraiton TODOs * Start of feature extractor refatcor * Adds Numpy version of USM extractor, removes Torch version and dependencies * Fixing AltUp.correct coef permute * Supporting batches of single audio segment inputs * Docstrings updates for config * In-lining audio feature extraction * Adjustments to conversion script and smoke test script --------- Co-authored-by: SindhuRaghuram97 <114270661+SindhuRaghuram97@users.noreply.github.com> Co-authored-by: Sindhu Raghuram <sindhuraghuram@google.com> Co-authored-by: pculliton <phillipculliton@gmail.com> * Gemma 3n renaming * Removing test data and utilities * Renaming test files * Gemma 3n refactor * Fix tokenizer config in conversion script * Address reviewer feedback * FeatureExtractor returns float32 by default * Adding basic tests for audio, and input name for audio encoder * Audio integration test, updates to model_id for other integration tests * Use scales for q and k norms (#26) * Update audio integration test to use HF dataset * Reviewer feedback * Expand embedding table to full vocab size in weights conversion * Mix-n-match MatFormers for Gemma 3n (#25) * Remove in-place operations (#30) * chore: removing inplace ops * remove [tensor] * n pattern * chore: reviewer feedback in AudioEncoder and AltUp * More grad clipping * Dynamo compatibility * fix: cache slicing error * chore: simplify shared kv cache slicing * chore: vision encoder rename in timm * fix: image processor do_normalize=False * fixup: style * chore: model_doc * fix: docs for code quality * chore: repo consistency * fix: RMSNorm in float as in prior Gemmas * fix: per_layer_inputs = None * chore: Gemma3nForCausalLM from Gemma3nForConditionalGeneration checkpoint * chore: repo consistency * Add initial unit tests for Gemma3nAudioFeatureExtractor (#27) * Add initial unit tests for Gemma3nAudioFeatureExtractor * Add basic unit tests for Gemma3nProcessor (#28) Co-authored-by: Douglas Reid <21148125+douglas-reid@users.noreply.github.com> * parameterize tests --------- Co-authored-by: Douglas Reid <21148125+douglas-reid@users.noreply.github.com> * chore: code style * fix: test cases * style and consistency * fix config in the test to be coherent with layer cache sharing * fix hidden states in tests and code * inits and mappings 
* fix modality prefixes * test order and prefixes * fix test exception * fix class order and reduce model size for faster tests * restore _checkpoint_conversion_mapping to load Caual from Conditional * fix config mapping! * fix: reviewer feedback --------- Co-authored-by: SindhuRaghuram97 <114270661+SindhuRaghuram97@users.noreply.github.com> Co-authored-by: Sindhu Raghuram <sindhuraghuram@google.com> Co-authored-by: raushan <raushan@huggingface.co> Co-authored-by: Mayank Chaturvedi <imayank@google.com> Co-authored-by: Douglas Reid <douglas-reid@users.noreply.github.com> Co-authored-by: Douglas Reid <21148125+douglas-reid@users.noreply.github.com> Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com> Co-authored-by: pculliton <phillipculliton@gmail.com> Co-authored-by: Aritra Roy Gosthipaty <aritra.born2fly@gmail.com> Co-authored-by: Cyril Vallez <cyril.vallez@gmail.com> * fix import test * add model args * auto_docstring * replace test path * consistency * skip tests for now * fix docstring for doc builder * skip unused attr --------- Co-authored-by: SindhuRaghuram97 <114270661+SindhuRaghuram97@users.noreply.github.com> Co-authored-by: Sindhu Raghuram <sindhuraghuram@google.com> Co-authored-by: raushan <raushan@huggingface.co> Co-authored-by: Mayank Chaturvedi <imayank@google.com> Co-authored-by: Douglas Reid <douglas-reid@users.noreply.github.com> Co-authored-by: Douglas Reid <21148125+douglas-reid@users.noreply.github.com> Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com> Co-authored-by: pculliton <phillipculliton@gmail.com> Co-authored-by: Aritra Roy Gosthipaty <aritra.born2fly@gmail.com> Co-authored-by: Cyril Vallez <cyril.vallez@gmail.com> Co-authored-by: Arthur <arthur.zucker@gmail.com>
502 lines
19 KiB
Python
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import inspect
import os
import re

from transformers.configuration_utils import PretrainedConfig
from transformers.utils import direct_transformers_import


# All paths are set with the intent you should run this script from the root of the repo with the command
# python utils/check_config_attributes.py
PATH_TO_TRANSFORMERS = "src/transformers"


# This is to make sure the transformers module imported is the one in the repo.
transformers = direct_transformers_import(PATH_TO_TRANSFORMERS)

CONFIG_MAPPING = transformers.models.auto.configuration_auto.CONFIG_MAPPING

SPECIAL_CASES_TO_ALLOW = {
    # used internally during generation to provide the custom logit processors with their necessary information
    "DiaConfig": [
        "delay_pattern",
    ],
    "BambaConfig": [
        "attn_layer_indices",
    ],
    "Dots1Config": ["max_window_layers"],
    # 'max_position_embeddings' is not used in modeling file, but needed for eval frameworks like Huggingface's lighteval (https://github.com/huggingface/lighteval/blob/af24080ea4f16eaf1683e353042a2dfc9099f038/src/lighteval/models/base_model.py#L264).
    # periods and offsets are not used in modeling file, but used in the configuration file to define `layers_block_type` and `layers_num_experts`.
    "JambaConfig": [
        "max_position_embeddings",
        "attn_layer_offset",
        "attn_layer_period",
        "expert_layer_offset",
        "expert_layer_period",
    ],
"Qwen2Config": ["use_sliding_window", "max_window_layers"],
|
|
"Qwen2MoeConfig": ["use_sliding_window"],
|
|
"Qwen2VLTextConfig": ["use_sliding_window", "max_window_layers"],
|
|
"Qwen2_5_VLTextConfig": ["use_sliding_window", "max_window_layers"],
|
|
"Qwen2_5OmniTextConfig": ["use_sliding_window", "max_window_layers"],
|
|
"Qwen2_5OmniTalkerConfig": ["use_sliding_window", "max_window_layers"],
|
|
"Qwen3Config": ["max_window_layers", "use_sliding_window"], # now use `layer_types` instead
|
|
"Qwen3MoeConfig": ["max_window_layers", "use_sliding_window"],
|
|
# `cache_implementation` should be in the default generation config, but we don't yet support per-model
|
|
# generation configs (TODO joao)
|
|
"Gemma2Config": ["tie_word_embeddings", "cache_implementation"],
|
|
"Cohere2Config": ["cache_implementation"],
|
|
# Dropout with this value was declared but never used
|
|
"Phi3Config": ["embd_pdrop"],
|
|
# used to compute the property `self.chunk_length`
|
|
"EncodecConfig": ["overlap"],
|
|
# used to compute the property `self.layers_block_type`
|
|
"RecurrentGemmaConfig": ["block_types"],
|
|
# used as in the config to define `intermediate_size`
|
|
"MambaConfig": ["expand"],
|
|
# used as in the config to define `intermediate_size`
|
|
"FalconMambaConfig": ["expand"],
|
|
# used as `self.bert_model = BertModel(config, ...)`
|
|
"DPRConfig": True,
|
|
"FuyuConfig": True,
|
|
# not used in modeling files, but it's an important information
|
|
"FSMTConfig": ["langs"],
|
|
# used internally in the configuration class file
|
|
"GPTNeoConfig": ["attention_types"],
|
|
# used internally in the configuration class file
|
|
"EsmConfig": ["is_folding_model"],
|
|
# used during training (despite we don't have training script for these models yet)
|
|
"Mask2FormerConfig": ["ignore_value"],
|
|
# `ignore_value` used during training (despite we don't have training script for these models yet)
|
|
# `norm` used in conversion script (despite not using in the modeling file)
|
|
"OneFormerConfig": ["ignore_value", "norm"],
|
|
# used internally in the configuration class file
|
|
"T5Config": ["feed_forward_proj"],
|
|
# used internally in the configuration class file
|
|
# `tokenizer_class` get default value `T5Tokenizer` intentionally
|
|
"MT5Config": ["feed_forward_proj", "tokenizer_class"],
|
|
"UMT5Config": ["feed_forward_proj", "tokenizer_class"],
|
|
# used internally in the configuration class file
|
|
"LongT5Config": ["feed_forward_proj"],
|
|
# used internally in the configuration class file
|
|
"Pop2PianoConfig": ["feed_forward_proj"],
|
|
# used internally in the configuration class file
|
|
"SwitchTransformersConfig": ["feed_forward_proj"],
|
|
# having default values other than `1e-5` - we can't fix them without breaking
|
|
"BioGptConfig": ["layer_norm_eps"],
|
|
# having default values other than `1e-5` - we can't fix them without breaking
|
|
"GLPNConfig": ["layer_norm_eps"],
|
|
# having default values other than `1e-5` - we can't fix them without breaking
|
|
"SegformerConfig": ["layer_norm_eps"],
|
|
# having default values other than `1e-5` - we can't fix them without breaking
|
|
"CvtConfig": ["layer_norm_eps"],
|
|
# having default values other than `1e-5` - we can't fix them without breaking
|
|
"PerceiverConfig": ["layer_norm_eps"],
|
|
# used internally to calculate the feature size
|
|
"InformerConfig": ["num_static_real_features", "num_time_features"],
|
|
# used internally to calculate the feature size
|
|
"TimeSeriesTransformerConfig": ["num_static_real_features", "num_time_features"],
|
|
# used internally to calculate the feature size
|
|
"AutoformerConfig": ["num_static_real_features", "num_time_features"],
|
|
# used internally to calculate `mlp_dim`
|
|
"SamVisionConfig": ["mlp_ratio"],
|
|
# used internally to calculate `mlp_dim`
|
|
"SamHQVisionConfig": ["mlp_ratio"],
|
|
# For (head) training, but so far not implemented
|
|
"ClapAudioConfig": ["num_classes"],
|
|
# Not used, but providing useful information to users
|
|
"SpeechT5HifiGanConfig": ["sampling_rate"],
|
|
# used internally in the configuration class file
|
|
"UdopConfig": ["feed_forward_proj"],
|
|
# Actually used in the config or generation config, in that case necessary for the sub-components generation
|
|
"SeamlessM4TConfig": [
|
|
"max_new_tokens",
|
|
"t2u_max_new_tokens",
|
|
"t2u_decoder_attention_heads",
|
|
"t2u_decoder_ffn_dim",
|
|
"t2u_decoder_layers",
|
|
"t2u_encoder_attention_heads",
|
|
"t2u_encoder_ffn_dim",
|
|
"t2u_encoder_layers",
|
|
"t2u_max_position_embeddings",
|
|
],
|
|
# Actually used in the config or generation config, in that case necessary for the sub-components generation
|
|
"SeamlessM4Tv2Config": [
|
|
"max_new_tokens",
|
|
"t2u_decoder_attention_heads",
|
|
"t2u_decoder_ffn_dim",
|
|
"t2u_decoder_layers",
|
|
"t2u_encoder_attention_heads",
|
|
"t2u_encoder_ffn_dim",
|
|
"t2u_encoder_layers",
|
|
"t2u_max_position_embeddings",
|
|
"t2u_variance_pred_dropout",
|
|
"t2u_variance_predictor_embed_dim",
|
|
"t2u_variance_predictor_hidden_dim",
|
|
"t2u_variance_predictor_kernel_size",
|
|
],
|
|
"ZambaConfig": [
|
|
"tie_word_embeddings",
|
|
"attn_layer_offset",
|
|
"attn_layer_period",
|
|
],
|
|
"MllamaTextConfig": [
|
|
"initializer_range",
|
|
],
|
|
"MllamaVisionConfig": [
|
|
"initializer_range",
|
|
"supported_aspect_ratios",
|
|
],
|
|
"ConditionalDetrConfig": [
|
|
"bbox_cost",
|
|
"bbox_loss_coefficient",
|
|
"class_cost",
|
|
"cls_loss_coefficient",
|
|
"dice_loss_coefficient",
|
|
"focal_alpha",
|
|
"giou_cost",
|
|
"giou_loss_coefficient",
|
|
"mask_loss_coefficient",
|
|
],
|
|
"DabDetrConfig": [
|
|
"dilation",
|
|
"bbox_cost",
|
|
"bbox_loss_coefficient",
|
|
"class_cost",
|
|
"cls_loss_coefficient",
|
|
"focal_alpha",
|
|
"giou_cost",
|
|
"giou_loss_coefficient",
|
|
],
|
|
"DetrConfig": [
|
|
"bbox_cost",
|
|
"bbox_loss_coefficient",
|
|
"class_cost",
|
|
"dice_loss_coefficient",
|
|
"eos_coefficient",
|
|
"giou_cost",
|
|
"giou_loss_coefficient",
|
|
"mask_loss_coefficient",
|
|
],
|
|
"DFineConfig": [
|
|
"eos_coefficient",
|
|
"focal_loss_alpha",
|
|
"focal_loss_gamma",
|
|
"matcher_alpha",
|
|
"matcher_bbox_cost",
|
|
"matcher_class_cost",
|
|
"matcher_gamma",
|
|
"matcher_giou_cost",
|
|
"use_focal_loss",
|
|
"weight_loss_bbox",
|
|
"weight_loss_giou",
|
|
"weight_loss_vfl",
|
|
"weight_loss_fgl",
|
|
"weight_loss_ddf",
|
|
],
|
|
"GroundingDinoConfig": [
|
|
"bbox_cost",
|
|
"bbox_loss_coefficient",
|
|
"class_cost",
|
|
"focal_alpha",
|
|
"giou_cost",
|
|
"giou_loss_coefficient",
|
|
],
|
|
"RTDetrConfig": [
|
|
"eos_coefficient",
|
|
"focal_loss_alpha",
|
|
"focal_loss_gamma",
|
|
"matcher_alpha",
|
|
"matcher_bbox_cost",
|
|
"matcher_class_cost",
|
|
"matcher_gamma",
|
|
"matcher_giou_cost",
|
|
"use_focal_loss",
|
|
"weight_loss_bbox",
|
|
"weight_loss_giou",
|
|
"weight_loss_vfl",
|
|
],
|
|
"RTDetrV2Config": [
|
|
"eos_coefficient",
|
|
"focal_loss_alpha",
|
|
"focal_loss_gamma",
|
|
"matcher_alpha",
|
|
"matcher_bbox_cost",
|
|
"matcher_class_cost",
|
|
"matcher_gamma",
|
|
"matcher_giou_cost",
|
|
"use_focal_loss",
|
|
"weight_loss_bbox",
|
|
"weight_loss_giou",
|
|
"weight_loss_vfl",
|
|
],
|
|
"YolosConfig": [
|
|
"bbox_cost",
|
|
"bbox_loss_coefficient",
|
|
"class_cost",
|
|
"eos_coefficient",
|
|
"giou_cost",
|
|
"giou_loss_coefficient",
|
|
],
|
|
"GPTNeoXConfig": ["rotary_emb_base"],
|
|
"Gemma3Config": ["boi_token_index", "eoi_token_index"],
|
|
"Gemma3TextConfig": ["cache_implementation", "tie_word_embeddings"],
|
|
"ShieldGemma2Config": [
|
|
"boi_token_index",
|
|
"eoi_token_index",
|
|
"initializer_range",
|
|
"mm_tokens_per_image",
|
|
"text_config",
|
|
"vision_config",
|
|
],
|
|
"Llama4Config": ["boi_token_index", "eoi_token_index"],
|
|
"Llama4TextConfig": [
|
|
"interleave_moe_layer_step",
|
|
"no_rope_layer_interval",
|
|
"no_rope_layers",
|
|
"output_router_logits",
|
|
"router_aux_loss_coef",
|
|
"router_jitter_noise",
|
|
"cache_implementation",
|
|
"attention_chunk_size",
|
|
],
|
|
"Llama4VisionConfig": ["multi_modal_projector_bias", "norm_eps"],
|
|
"SmolLM3Config": ["no_rope_layer_interval"],
|
|
"Gemma3nVisionConfig": ["architecture", "do_pooling", "model_args"], # this is for use in `timm`
|
|
}


# TODO (ydshieh): Check the failing cases, try to fix them or move some cases to the above block once we are sure
SPECIAL_CASES_TO_ALLOW.update(
    {
        "CLIPSegConfig": True,
        "DeformableDetrConfig": True,
        "DinatConfig": True,
        "DonutSwinConfig": True,
        "FastSpeech2ConformerConfig": True,
        "FSMTConfig": True,
        "LayoutLMv2Config": True,
        "MaskFormerSwinConfig": True,
        "MT5Config": True,
        # For backward compatibility with trust remote code models
        "MptConfig": True,
        "MptAttentionConfig": True,
        "OneFormerConfig": True,
        "PerceiverConfig": True,
        "RagConfig": True,
        "SpeechT5Config": True,
        "SwinConfig": True,
        "Swin2SRConfig": True,
        "Swinv2Config": True,
        "SwitchTransformersConfig": True,
        "TableTransformerConfig": True,
        "TapasConfig": True,
        "UniSpeechConfig": True,
        "UniSpeechSatConfig": True,
        "WavLMConfig": True,
        "WhisperConfig": True,
        # TODO: @Arthur (for `alignment_head` and `alignment_layer`)
        "JukeboxPriorConfig": True,
        # TODO: @Younes (for `is_decoder`)
        "Pix2StructTextConfig": True,
        "IdeficsConfig": True,
        "IdeficsVisionConfig": True,
        "IdeficsPerceiverConfig": True,
    }
)
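

# --- Added illustration (not part of the original utility) ---------------------------------------------------
# Entries in `SPECIAL_CASES_TO_ALLOW` map a config class name either to a list of attribute names that may
# legitimately go unused, or to `True` to skip the check for that class entirely (see how `allowed_cases` is
# consumed in `check_attribute_being_used` below). The config class names here are hypothetical and this helper
# is never called by the script itself.
def _demo_allow_list_entry_forms():
    example = dict(SPECIAL_CASES_TO_ALLOW)  # work on a copy so the real allow list stays untouched
    example["SomeHypotheticalConfig"] = ["some_unused_attribute"]  # skip selected attributes only
    example["AnotherHypotheticalConfig"] = True  # skip every attribute of that config class
    return example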


def check_attribute_being_used(config_class, attributes, default_value, source_strings):
    """Check if any name in `attributes` is used in one of the strings in `source_strings`

    Args:
        config_class (`type`):
            The configuration class for which the arguments in its `__init__` will be checked.
        attributes (`List[str]`):
            The name of an argument (or attribute) and its variant names if any.
        default_value (`Any`):
            A default value for the attribute in `attributes` assigned in the `__init__` of `config_class`.
        source_strings (`List[str]`):
            The python source code strings in the same modeling directory where `config_class` is defined. The file
            containing the definition of `config_class` should be excluded.
    """
    attribute_used = False
    for attribute in attributes:
        for modeling_source in source_strings:
            # check if we can find `config.xxx`, `getattr(config, "xxx", ...)` or `getattr(self.config, "xxx", ...)`
            if (
                f"config.{attribute}" in modeling_source
                or f'getattr(config, "{attribute}"' in modeling_source
                or f'getattr(self.config, "{attribute}"' in modeling_source
                or (
                    "TextConfig" in config_class.__name__
                    and f"config.get_text_config().{attribute}" in modeling_source
                )
            ):
                attribute_used = True
            # Deal with multi-line cases
            elif (
                re.search(
                    rf'getattr[ \t\v\n\r\f]*\([ \t\v\n\r\f]*(self\.)?config,[ \t\v\n\r\f]*"{attribute}"',
                    modeling_source,
                )
                is not None
            ):
                attribute_used = True
            if attribute_used:
                break
        if attribute_used:
            break

    # common and important attributes, even if they do not always appear in the modeling files
    attributes_to_allow = [
        "initializer_range",
        "bos_index",
        "eos_index",
        "pad_index",
        "unk_index",
        "mask_index",
        "image_token_id",  # for VLMs
        "video_token_id",
        "image_seq_length",
        "video_seq_length",
        "image_size",
        "text_config",  # may appear as `get_text_config()`
        "use_cache",
        "out_features",
        "out_indices",
        "sampling_rate",
        # backbone related arguments passed to load_backbone
        "use_pretrained_backbone",
        "backbone",
        "backbone_config",
        "use_timm_backbone",
        "backbone_kwargs",
        # rope attributes may not appear directly in the modeling but are used
        "rope_theta",
        "partial_rotary_factor",
        "pretraining_tp",
        "boi_token_id",
        "eoi_token_id",
    ]
    attributes_used_in_generation = ["encoder_no_repeat_ngram_size"]

    # Special cases to be allowed
    case_allowed = True
    if not attribute_used:
        case_allowed = False
        for attribute in attributes:
            # Allow if the default value in the configuration class is different from the one in `PretrainedConfig`
            if attribute in ["is_encoder_decoder"] and default_value is True:
                case_allowed = True
            elif attribute in ["tie_word_embeddings"] and default_value is False:
                case_allowed = True

            # Allow cases without checking the default value in the configuration class
            elif attribute in attributes_to_allow + attributes_used_in_generation:
                case_allowed = True
            elif attribute.endswith("_token_id"):
                case_allowed = True

            # configuration class specific cases
            if not case_allowed:
                allowed_cases = SPECIAL_CASES_TO_ALLOW.get(config_class.__name__, [])
                case_allowed = allowed_cases is True or attribute in allowed_cases

    return attribute_used or case_allowed
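

# --- Added illustration (not part of the original utility) ---------------------------------------------------
# A minimal, hypothetical example of what the detection above considers "used": the toy class and source string
# below are invented for demonstration, and this helper is never called by the script itself.
def _demo_check_attribute_being_used():
    class ToyConfig:  # stands in for a real `PretrainedConfig` subclass; only `__name__` is consulted here
        pass

    toy_sources = ["hidden = nn.Linear(config.hidden_size, config.hidden_size)"]
    # `hidden_size` appears as `config.hidden_size` in the toy source, so it is reported as used.
    assert check_attribute_being_used(ToyConfig, ["hidden_size"], 32, toy_sources)
    # `unused_knob` never appears and is not allow-listed anywhere, so it is reported as unused.
    assert not check_attribute_being_used(ToyConfig, ["unused_knob"], 0.1, toy_sources)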


def check_config_attributes_being_used(config_class):
    """Check that the arguments in `__init__` of `config_class` are used in the modeling files in the same directory

    Args:
        config_class (`type`):
            The configuration class for which the arguments in its `__init__` will be checked.
    """
    # Get the parameters in `__init__` of the configuration class, and the default values if any
    signature = dict(inspect.signature(config_class.__init__).parameters)
    parameter_names = [x for x in list(signature.keys()) if x not in ["self", "kwargs"]]
    parameter_defaults = [signature[param].default for param in parameter_names]

    # If `attribute_map` exists, an attribute can have different names to be used in the modeling files, and as long
    # as one variant is used, the test should pass
    reversed_attribute_map = {}
    if len(config_class.attribute_map) > 0:
        reversed_attribute_map = {v: k for k, v in config_class.attribute_map.items()}

    # Get the path to modeling source files
    config_source_file = inspect.getsourcefile(config_class)
    model_dir = os.path.dirname(config_source_file)
    # Let's check against all frameworks: as long as one framework uses an attribute, we are good.
    modeling_paths = [os.path.join(model_dir, fn) for fn in os.listdir(model_dir) if fn.startswith("modeling_")]

    # Get the source code strings
    modeling_sources = []
    for path in modeling_paths:
        if os.path.isfile(path):
            with open(path, encoding="utf8") as fp:
                modeling_sources.append(fp.read())

    unused_attributes = []
    for config_param, default_value in zip(parameter_names, parameter_defaults):
        # `attributes` here is all the variant names for `config_param`
        attributes = [config_param]
        # some configuration classes have non-empty `attribute_map`, and both names could be used in the
        # corresponding modeling files. As long as one of them appears, it is fine.
        if config_param in reversed_attribute_map:
            attributes.append(reversed_attribute_map[config_param])

        if not check_attribute_being_used(config_class, attributes, default_value, modeling_sources):
            unused_attributes.append(attributes[0])

    return sorted(unused_attributes)
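

# --- Added illustration (not part of the original utility) ---------------------------------------------------
# A small sketch of running the per-class check on a single configuration class; "gemma" is only an example key,
# any model type present in `CONFIG_MAPPING` would do. This helper is never called by the script itself.
def _demo_check_single_config_class():
    config_class = CONFIG_MAPPING["gemma"]
    # Returns the sorted `__init__` arguments that never appear in the sibling `modeling_*.py` files
    # (after the allow lists are applied); an empty list means the config is clean.
    print(check_config_attributes_being_used(config_class))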


def check_config_attributes():
    """Check that the arguments in `__init__` of all configuration classes are used in the modeling files"""
    configs_with_unused_attributes = {}
    for _config_class in list(CONFIG_MAPPING.values()):
        # Skip deprecated models
        if "models.deprecated" in _config_class.__module__:
            continue
        # Some config classes are not in `CONFIG_MAPPING` (e.g. `CLIPVisionConfig`, `Blip2VisionConfig`, etc.)
        config_classes_in_module = [
            cls
            for name, cls in inspect.getmembers(
                inspect.getmodule(_config_class),
                lambda x: inspect.isclass(x)
                and issubclass(x, PretrainedConfig)
                and inspect.getmodule(x) == inspect.getmodule(_config_class),
            )
        ]
        for config_class in config_classes_in_module:
            unused_attributes = check_config_attributes_being_used(config_class)
            if len(unused_attributes) > 0:
                configs_with_unused_attributes[config_class.__name__] = unused_attributes

    if len(configs_with_unused_attributes) > 0:
        error = "The following configuration classes contain unused attributes in the corresponding modeling files:\n"
        for name, attributes in configs_with_unused_attributes.items():
            error += f"{name}: {attributes}\n"

        raise ValueError(error)


if __name__ == "__main__":
    check_config_attributes()