diff --git a/docs/source/en/internal/import_utils.md b/docs/source/en/internal/import_utils.md index 749ece15da6..0d76c2bbe33 100644 --- a/docs/source/en/internal/import_utils.md +++ b/docs/source/en/internal/import_utils.md @@ -38,7 +38,7 @@ However, no method can be called on that object: ```python >>> DetrImageProcessorFast.from_pretrained() ImportError: -DetrImageProcessorFast requires the Torchvision library but it was not found in your environment. Checkout the instructions on the +DetrImageProcessorFast requires the Torchvision library but it was not found in your environment. Check out the instructions on the installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment. Please note that you may need to restart your runtime after installation. ``` diff --git a/examples/flax/question-answering/run_qa.py b/examples/flax/question-answering/run_qa.py index 22d93c54f89..7f307eaf707 100644 --- a/examples/flax/question-answering/run_qa.py +++ b/examples/flax/question-answering/run_qa.py @@ -546,7 +546,7 @@ def main(): # region Tokenizer check: this script requires a fast tokenizer. if not isinstance(tokenizer, PreTrainedTokenizerFast): raise ValueError( - "This example script only works for models that have a fast tokenizer. Checkout the big table of models at" + "This example script only works for models that have a fast tokenizer. Check out the big table of models at" " https://huggingface.co/transformers/index.html#supported-frameworks to find the model types that meet" " this requirement" ) diff --git a/examples/modular-transformers/configuration_my_new_model.py b/examples/modular-transformers/configuration_my_new_model.py index febd1b88675..863217d304f 100644 --- a/examples/modular-transformers/configuration_my_new_model.py +++ b/examples/modular-transformers/configuration_my_new_model.py @@ -36,7 +36,7 @@ class MyNewModelConfig(PretrainedConfig): `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this + by meanpooling all the original heads within that group. For more details, check out [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `num_attention_heads`. hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): diff --git a/examples/modular-transformers/configuration_new_model.py b/examples/modular-transformers/configuration_new_model.py index ba05b4ea51b..f9954a3c7a0 100644 --- a/examples/modular-transformers/configuration_new_model.py +++ b/examples/modular-transformers/configuration_new_model.py @@ -34,7 +34,7 @@ class NewModelConfig(PretrainedConfig): `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this + by meanpooling all the original heads within that group. For more details, check out [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `num_attention_heads`. 
head_dim (`int`, *optional*, defaults to 256): diff --git a/examples/pytorch/question-answering/run_qa.py b/examples/pytorch/question-answering/run_qa.py index 6651c8690be..b673a699f96 100755 --- a/examples/pytorch/question-answering/run_qa.py +++ b/examples/pytorch/question-answering/run_qa.py @@ -357,7 +357,7 @@ def main(): # Tokenizer check: this script requires a fast tokenizer. if not isinstance(tokenizer, PreTrainedTokenizerFast): raise ValueError( - "This example script only works for models that have a fast tokenizer. Checkout the big table of models at" + "This example script only works for models that have a fast tokenizer. Check out the big table of models at" " https://huggingface.co/transformers/index.html#supported-frameworks to find the model types that meet" " this requirement" ) diff --git a/examples/pytorch/token-classification/run_ner.py b/examples/pytorch/token-classification/run_ner.py index af1a14b2199..6b32267c924 100755 --- a/examples/pytorch/token-classification/run_ner.py +++ b/examples/pytorch/token-classification/run_ner.py @@ -399,7 +399,7 @@ def main(): # Tokenizer check: this script requires a fast tokenizer. if not isinstance(tokenizer, PreTrainedTokenizerFast): raise ValueError( - "This example script only works for models that have a fast tokenizer. Checkout the big table of models at" + "This example script only works for models that have a fast tokenizer. Check out the big table of models at" " https://huggingface.co/transformers/index.html#supported-frameworks to find the model types that meet" " this requirement" ) diff --git a/examples/tensorflow/question-answering/run_qa.py b/examples/tensorflow/question-answering/run_qa.py index b0a78f4355b..5979e351d0a 100755 --- a/examples/tensorflow/question-answering/run_qa.py +++ b/examples/tensorflow/question-answering/run_qa.py @@ -378,7 +378,7 @@ def main(): # region Tokenizer check: this script requires a fast tokenizer. if not isinstance(tokenizer, PreTrainedTokenizerFast): raise ValueError( - "This example script only works for models that have a fast tokenizer. Checkout the big table of models at" + "This example script only works for models that have a fast tokenizer. Check out the big table of models at" " https://huggingface.co/transformers/index.html#supported-frameworks to find the model types that meet" " this requirement" ) diff --git a/src/transformers/models/aria/configuration_aria.py b/src/transformers/models/aria/configuration_aria.py index 5843e726d64..761dd2f722a 100644 --- a/src/transformers/models/aria/configuration_aria.py +++ b/src/transformers/models/aria/configuration_aria.py @@ -49,7 +49,7 @@ class AriaTextConfig(PretrainedConfig): `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this + by meanpooling all the original heads within that group. For more details, check out [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `num_attention_heads`. 
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): diff --git a/src/transformers/models/aria/modular_aria.py b/src/transformers/models/aria/modular_aria.py index 24922deea30..c80351cd9a8 100644 --- a/src/transformers/models/aria/modular_aria.py +++ b/src/transformers/models/aria/modular_aria.py @@ -120,7 +120,7 @@ class AriaTextConfig(LlamaConfig): `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this + by meanpooling all the original heads within that group. For more details, check out [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `num_attention_heads`. hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): diff --git a/src/transformers/models/bamba/configuration_bamba.py b/src/transformers/models/bamba/configuration_bamba.py index 36ac30ccca4..488f414444b 100644 --- a/src/transformers/models/bamba/configuration_bamba.py +++ b/src/transformers/models/bamba/configuration_bamba.py @@ -53,7 +53,7 @@ class BambaConfig(PretrainedConfig): `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this + by meanpooling all the original heads within that group. For more details, check out [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`. hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): The non-linear activation function (function or string) in the decoder. diff --git a/src/transformers/models/bitnet/configuration_bitnet.py b/src/transformers/models/bitnet/configuration_bitnet.py index 645af2699e2..87177bab982 100644 --- a/src/transformers/models/bitnet/configuration_bitnet.py +++ b/src/transformers/models/bitnet/configuration_bitnet.py @@ -48,7 +48,7 @@ class BitNetConfig(PretrainedConfig): `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this + by meanpooling all the original heads within that group. For more details, check out [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `num_attention_heads`. 
hidden_act (`str` or `function`, *optional*, defaults to `"relu2"`): diff --git a/src/transformers/models/chameleon/configuration_chameleon.py b/src/transformers/models/chameleon/configuration_chameleon.py index 70d9a7cd7a0..7760f55b7c1 100644 --- a/src/transformers/models/chameleon/configuration_chameleon.py +++ b/src/transformers/models/chameleon/configuration_chameleon.py @@ -125,7 +125,7 @@ class ChameleonConfig(PretrainedConfig): `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this + by meanpooling all the original heads within that group. For more details, check out [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `num_attention_heads`. hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): diff --git a/src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py b/src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py index 59b253b5ece..dbfa8232313 100644 --- a/src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py +++ b/src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py @@ -446,7 +446,7 @@ def main(): "--model_size", choices=["7B", "30B"], help="" - " models correspond to the finetuned versions, and are specific to the Chameleon official release. For more details on Chameleon, checkout the original repo: https://github.com/facebookresearch/chameleon", + " models correspond to the finetuned versions, and are specific to the Chameleon official release. For more details on Chameleon, check out the original repo: https://github.com/facebookresearch/chameleon", ) parser.add_argument( "--output_dir", diff --git a/src/transformers/models/cohere/configuration_cohere.py b/src/transformers/models/cohere/configuration_cohere.py index eeeb2364280..3e257448bc2 100644 --- a/src/transformers/models/cohere/configuration_cohere.py +++ b/src/transformers/models/cohere/configuration_cohere.py @@ -56,7 +56,7 @@ class CohereConfig(PretrainedConfig): `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this + by meanpooling all the original heads within that group. For more details, check out [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `num_attention_heads`. hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): diff --git a/src/transformers/models/cohere2/configuration_cohere2.py b/src/transformers/models/cohere2/configuration_cohere2.py index e407fb83dfd..8da5a81c09c 100644 --- a/src/transformers/models/cohere2/configuration_cohere2.py +++ b/src/transformers/models/cohere2/configuration_cohere2.py @@ -52,7 +52,7 @@ class Cohere2Config(PretrainedConfig): `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. 
When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this + by meanpooling all the original heads within that group. For more details, check out [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `num_attention_heads`. hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): diff --git a/src/transformers/models/cohere2/modular_cohere2.py b/src/transformers/models/cohere2/modular_cohere2.py index 7a5cab506e2..7153b1e5473 100644 --- a/src/transformers/models/cohere2/modular_cohere2.py +++ b/src/transformers/models/cohere2/modular_cohere2.py @@ -74,7 +74,7 @@ class Cohere2Config(PretrainedConfig): `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this + by meanpooling all the original heads within that group. For more details, check out [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `num_attention_heads`. hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): diff --git a/src/transformers/models/csm/configuration_csm.py b/src/transformers/models/csm/configuration_csm.py index b13b9d2a873..56a60cda24e 100644 --- a/src/transformers/models/csm/configuration_csm.py +++ b/src/transformers/models/csm/configuration_csm.py @@ -54,7 +54,7 @@ class CsmDepthDecoderConfig(PretrainedConfig): `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this + by meanpooling all the original heads within that group. For more details, check out [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `num_attention_heads`. hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): @@ -235,7 +235,7 @@ class CsmConfig(PretrainedConfig): `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this + by meanpooling all the original heads within that group. For more details, check out [this paper](https://arxiv.org/pdf/2305.13245.pdf). hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): The non-linear activation function (function or string) in the backbone model Transformer decoder. 
diff --git a/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py b/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py index 82b8701cb57..4372cad67f1 100644 --- a/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py +++ b/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py @@ -52,7 +52,7 @@ class DeepseekV3Config(PretrainedConfig): `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this + by meanpooling all the original heads within that group. For more details, check out [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `num_attention_heads`. n_shared_experts (`int`, *optional*, defaults to 1): diff --git a/src/transformers/models/diffllama/configuration_diffllama.py b/src/transformers/models/diffllama/configuration_diffllama.py index 1b38f55d390..3e0b918e909 100644 --- a/src/transformers/models/diffllama/configuration_diffllama.py +++ b/src/transformers/models/diffllama/configuration_diffllama.py @@ -48,7 +48,7 @@ class DiffLlamaConfig(PretrainedConfig): `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this + by meanpooling all the original heads within that group. For more details, check out [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `num_attention_heads`. hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): diff --git a/src/transformers/models/emu3/configuration_emu3.py b/src/transformers/models/emu3/configuration_emu3.py index 19315003dfb..509150df8d9 100644 --- a/src/transformers/models/emu3/configuration_emu3.py +++ b/src/transformers/models/emu3/configuration_emu3.py @@ -138,7 +138,7 @@ class Emu3TextConfig(PretrainedConfig): `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this + by meanpooling all the original heads within that group. For more details, check out [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `num_attention_heads`. 
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): diff --git a/src/transformers/models/falcon_h1/configuration_falcon_h1.py b/src/transformers/models/falcon_h1/configuration_falcon_h1.py index 2a686ecc612..94ca7f848d7 100644 --- a/src/transformers/models/falcon_h1/configuration_falcon_h1.py +++ b/src/transformers/models/falcon_h1/configuration_falcon_h1.py @@ -50,7 +50,7 @@ class FalconH1Config(PretrainedConfig): `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this + by meanpooling all the original heads within that group. For more details, check out [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`. hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): The non-linear activation function (function or string) in the decoder. diff --git a/src/transformers/models/gemma/configuration_gemma.py b/src/transformers/models/gemma/configuration_gemma.py index e372817bf71..d8f26d38450 100644 --- a/src/transformers/models/gemma/configuration_gemma.py +++ b/src/transformers/models/gemma/configuration_gemma.py @@ -47,7 +47,7 @@ class GemmaConfig(PretrainedConfig): `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this + by meanpooling all the original heads within that group. For more details, check out [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `num_attention_heads`. head_dim (`int`, *optional*, defaults to 256): diff --git a/src/transformers/models/gemma/convert_gemma_weights_to_hf.py b/src/transformers/models/gemma/convert_gemma_weights_to_hf.py index fd275c157f3..494e2c7187e 100644 --- a/src/transformers/models/gemma/convert_gemma_weights_to_hf.py +++ b/src/transformers/models/gemma/convert_gemma_weights_to_hf.py @@ -151,7 +151,7 @@ def main(): "--model_size", default="7B", choices=["2B", "7B", "tokenizer_only"], - help="'f' models correspond to the finetuned versions, and are specific to the Gemma2 official release. For more details on Gemma2, checkout the original repo: https://huggingface.co/google/gemma-7b", + help="'f' models correspond to the finetuned versions, and are specific to the Gemma2 official release. For more details on Gemma2, check out the original repo: https://huggingface.co/google/gemma-7b", ) parser.add_argument( "--output_dir", diff --git a/src/transformers/models/gemma/modular_gemma.py b/src/transformers/models/gemma/modular_gemma.py index 648caea73d7..b10bd51f0cf 100644 --- a/src/transformers/models/gemma/modular_gemma.py +++ b/src/transformers/models/gemma/modular_gemma.py @@ -74,7 +74,7 @@ class GemmaConfig(PretrainedConfig): `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. 
When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this + by meanpooling all the original heads within that group. For more details, check out [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `num_attention_heads`. head_dim (`int`, *optional*, defaults to 256): diff --git a/src/transformers/models/gemma2/configuration_gemma2.py b/src/transformers/models/gemma2/configuration_gemma2.py index 810c10cc928..c1390bf205c 100644 --- a/src/transformers/models/gemma2/configuration_gemma2.py +++ b/src/transformers/models/gemma2/configuration_gemma2.py @@ -47,7 +47,7 @@ class Gemma2Config(PretrainedConfig): `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this + by meanpooling all the original heads within that group. For more details, check out [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `num_attention_heads`. head_dim (`int`, *optional*, defaults to 256): diff --git a/src/transformers/models/gemma2/convert_gemma2_weights_to_hf.py b/src/transformers/models/gemma2/convert_gemma2_weights_to_hf.py index c41f9a2fdbb..1a3a4a92f3a 100644 --- a/src/transformers/models/gemma2/convert_gemma2_weights_to_hf.py +++ b/src/transformers/models/gemma2/convert_gemma2_weights_to_hf.py @@ -184,7 +184,7 @@ def main(): "--model_size", default="9B", choices=["9B", "27B", "tokenizer_only"], - help="'f' models correspond to the finetuned versions, and are specific to the Gemma22 official release. For more details on Gemma2, checkout the original repo: https://huggingface.co/google/gemma-7b", + help="'f' models correspond to the finetuned versions, and are specific to the Gemma2 official release. For more details on Gemma2, check out the original repo: https://huggingface.co/google/gemma-7b", ) parser.add_argument( "--output_dir", diff --git a/src/transformers/models/gemma2/modular_gemma2.py b/src/transformers/models/gemma2/modular_gemma2.py index 31b251f4ca7..f35fdefac6d 100644 --- a/src/transformers/models/gemma2/modular_gemma2.py +++ b/src/transformers/models/gemma2/modular_gemma2.py @@ -71,7 +71,7 @@ class Gemma2Config(PretrainedConfig): `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this + by meanpooling all the original heads within that group. For more details, check out [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `num_attention_heads`.
head_dim (`int`, *optional*, defaults to 256): diff --git a/src/transformers/models/gemma3/configuration_gemma3.py b/src/transformers/models/gemma3/configuration_gemma3.py index db2749644cc..d6935cfef7c 100644 --- a/src/transformers/models/gemma3/configuration_gemma3.py +++ b/src/transformers/models/gemma3/configuration_gemma3.py @@ -55,7 +55,7 @@ class Gemma3TextConfig(PretrainedConfig): `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this + by meanpooling all the original heads within that group. For more details, check out [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `num_attention_heads`. head_dim (`int`, *optional*, defaults to 256): diff --git a/src/transformers/models/gemma3/modular_gemma3.py b/src/transformers/models/gemma3/modular_gemma3.py index d679d30c8b9..3c1f5b0d5fd 100644 --- a/src/transformers/models/gemma3/modular_gemma3.py +++ b/src/transformers/models/gemma3/modular_gemma3.py @@ -82,7 +82,7 @@ class Gemma3TextConfig(Gemma2Config, PretrainedConfig): `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this + by meanpooling all the original heads within that group. For more details, check out [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `num_attention_heads`. head_dim (`int`, *optional*, defaults to 256): diff --git a/src/transformers/models/glm/configuration_glm.py b/src/transformers/models/glm/configuration_glm.py index f9a3ab53a93..94b7eb528fd 100644 --- a/src/transformers/models/glm/configuration_glm.py +++ b/src/transformers/models/glm/configuration_glm.py @@ -42,7 +42,7 @@ class GlmConfig(PretrainedConfig): `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this + by meanpooling all the original heads within that group. For more details, check out [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `num_attention_heads`. partial_rotary_factor (`float`, *optional*, defaults to 0.5): The factor of the partial rotary position. diff --git a/src/transformers/models/glm4/configuration_glm4.py b/src/transformers/models/glm4/configuration_glm4.py index 05d129a23e5..7ed2fbf0ee2 100644 --- a/src/transformers/models/glm4/configuration_glm4.py +++ b/src/transformers/models/glm4/configuration_glm4.py @@ -42,7 +42,7 @@ class Glm4Config(PretrainedConfig): `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. 
When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this + by meanpooling all the original heads within that group. For more details, check out [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `num_attention_heads`. partial_rotary_factor (`float`, *optional*, defaults to 0.5): The factor of the partial rotary position. diff --git a/src/transformers/models/granite/configuration_granite.py b/src/transformers/models/granite/configuration_granite.py index bfe98c3a529..c7f32402604 100644 --- a/src/transformers/models/granite/configuration_granite.py +++ b/src/transformers/models/granite/configuration_granite.py @@ -54,7 +54,7 @@ class GraniteConfig(PretrainedConfig): `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this + by meanpooling all the original heads within that group. For more details, check out [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `num_attention_heads`. hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): diff --git a/src/transformers/models/granitemoe/configuration_granitemoe.py b/src/transformers/models/granitemoe/configuration_granitemoe.py index e17860a41d6..cb1c30da5ab 100644 --- a/src/transformers/models/granitemoe/configuration_granitemoe.py +++ b/src/transformers/models/granitemoe/configuration_granitemoe.py @@ -54,7 +54,7 @@ class GraniteMoeConfig(PretrainedConfig): `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this + by meanpooling all the original heads within that group. For more details, check out [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `num_attention_heads`. hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): diff --git a/src/transformers/models/granitemoehybrid/configuration_granitemoehybrid.py b/src/transformers/models/granitemoehybrid/configuration_granitemoehybrid.py index 4a3b1da88a8..cd38fc8f9e8 100644 --- a/src/transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +++ b/src/transformers/models/granitemoehybrid/configuration_granitemoehybrid.py @@ -49,7 +49,7 @@ class GraniteMoeHybridConfig(PretrainedConfig): `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this + by meanpooling all the original heads within that group. For more details, check out [this paper](https://arxiv.org/pdf/2305.13245.pdf). 
If it is not specified, will default to `num_attention_heads`. hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): diff --git a/src/transformers/models/granitemoeshared/configuration_granitemoeshared.py b/src/transformers/models/granitemoeshared/configuration_granitemoeshared.py index 2f81cad8932..dc71da6a5f5 100644 --- a/src/transformers/models/granitemoeshared/configuration_granitemoeshared.py +++ b/src/transformers/models/granitemoeshared/configuration_granitemoeshared.py @@ -54,7 +54,7 @@ class GraniteMoeSharedConfig(PretrainedConfig): `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this + by meanpooling all the original heads within that group. For more details, check out [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `num_attention_heads`. hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): diff --git a/src/transformers/models/helium/configuration_helium.py b/src/transformers/models/helium/configuration_helium.py index 7b27c6e54b6..fe3a5d95d12 100644 --- a/src/transformers/models/helium/configuration_helium.py +++ b/src/transformers/models/helium/configuration_helium.py @@ -42,7 +42,7 @@ class HeliumConfig(PretrainedConfig): `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this + by meanpooling all the original heads within that group. For more details, check out [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `num_attention_heads`. head_dim (`int`, *optional*, defaults to 128): diff --git a/src/transformers/models/jamba/configuration_jamba.py b/src/transformers/models/jamba/configuration_jamba.py index 185380383e2..5980a28e4a0 100644 --- a/src/transformers/models/jamba/configuration_jamba.py +++ b/src/transformers/models/jamba/configuration_jamba.py @@ -55,7 +55,7 @@ class JambaConfig(PretrainedConfig): `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this + by meanpooling all the original heads within that group. For more details, check out [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`. hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): The non-linear activation function (function or string) in the decoder. 
diff --git a/src/transformers/models/llama/configuration_llama.py b/src/transformers/models/llama/configuration_llama.py index 1ea1f124279..1bca8282701 100644 --- a/src/transformers/models/llama/configuration_llama.py +++ b/src/transformers/models/llama/configuration_llama.py @@ -51,7 +51,7 @@ class LlamaConfig(PretrainedConfig): `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this + by meanpooling all the original heads within that group. For more details, check out [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `num_attention_heads`. hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): diff --git a/src/transformers/models/llama/convert_llama_weights_to_hf.py b/src/transformers/models/llama/convert_llama_weights_to_hf.py index 84b5c53a916..e8282ef7438 100644 --- a/src/transformers/models/llama/convert_llama_weights_to_hf.py +++ b/src/transformers/models/llama/convert_llama_weights_to_hf.py @@ -528,7 +528,7 @@ def main(): parser.add_argument( "--model_size", default=None, - help="'f' Deprecated in favor of `num_shards`: models correspond to the finetuned versions, and are specific to the Llama2 official release. For more details on Llama2, checkout the original repo: https://huggingface.co/meta-llama", + help="'f' Deprecated in favor of `num_shards`: models correspond to the finetuned versions, and are specific to the Llama2 official release. For more details on Llama2, check out the original repo: https://huggingface.co/meta-llama", ) parser.add_argument( "--output_dir", diff --git a/src/transformers/models/mimi/configuration_mimi.py b/src/transformers/models/mimi/configuration_mimi.py index 52031411fe6..4d5a1b00784 100644 --- a/src/transformers/models/mimi/configuration_mimi.py +++ b/src/transformers/models/mimi/configuration_mimi.py @@ -95,7 +95,7 @@ class MimiConfig(PretrainedConfig): `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this + by meanpooling all the original heads within that group. For more details, check out [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`. head_dim (`int`, *optional*, defaults to `hidden_size // num_attention_heads`): The attention head dimension. diff --git a/src/transformers/models/minimax/configuration_minimax.py b/src/transformers/models/minimax/configuration_minimax.py index c0d8611af5c..b8ec562de09 100644 --- a/src/transformers/models/minimax/configuration_minimax.py +++ b/src/transformers/models/minimax/configuration_minimax.py @@ -51,7 +51,7 @@ class MiniMaxConfig(PretrainedConfig): `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. 
When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this + by meanpooling all the original heads within that group. For more details, check out [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`. head_dim (`int`, *optional*, defaults to `hidden_size // num_attention_heads`): The attention head dimension. diff --git a/src/transformers/models/minimax/modular_minimax.py b/src/transformers/models/minimax/modular_minimax.py index 9a44d666563..0028dcbfb6c 100644 --- a/src/transformers/models/minimax/modular_minimax.py +++ b/src/transformers/models/minimax/modular_minimax.py @@ -76,7 +76,7 @@ class MiniMaxConfig(MixtralConfig): `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this + by meanpooling all the original heads within that group. For more details, check out [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`. head_dim (`int`, *optional*, defaults to `hidden_size // num_attention_heads`): The attention head dimension. diff --git a/src/transformers/models/mistral/configuration_mistral.py b/src/transformers/models/mistral/configuration_mistral.py index c362dbef286..5a3cac5225f 100644 --- a/src/transformers/models/mistral/configuration_mistral.py +++ b/src/transformers/models/mistral/configuration_mistral.py @@ -51,7 +51,7 @@ class MistralConfig(PretrainedConfig): `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this + by meanpooling all the original heads within that group. For more details, check out [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`. head_dim (`int`, *optional*, defaults to `hidden_size // num_attention_heads`): The attention head dimension. diff --git a/src/transformers/models/mixtral/configuration_mixtral.py b/src/transformers/models/mixtral/configuration_mixtral.py index 066b045ee1e..ef2d870c1b8 100644 --- a/src/transformers/models/mixtral/configuration_mixtral.py +++ b/src/transformers/models/mixtral/configuration_mixtral.py @@ -51,7 +51,7 @@ class MixtralConfig(PretrainedConfig): `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this + by meanpooling all the original heads within that group. For more details, check out [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`. 
head_dim (`int`, *optional*, defaults to `hidden_size // num_attention_heads`): The attention head dimension. diff --git a/src/transformers/models/mixtral/convert_mixtral_weights_to_hf.py b/src/transformers/models/mixtral/convert_mixtral_weights_to_hf.py index 7e9f25d37f4..f6df5901fef 100644 --- a/src/transformers/models/mixtral/convert_mixtral_weights_to_hf.py +++ b/src/transformers/models/mixtral/convert_mixtral_weights_to_hf.py @@ -227,7 +227,7 @@ def main(): parser.add_argument( "--model_size", choices=["7B"], - help="'f' models correspond to the finetuned versions, and are specific to the Mixtral official release. For more details on Mixtral, checkout the original repo: https://huggingface.co/mistral-ai", + help="'f' models correspond to the finetuned versions, and are specific to the Mixtral official release. For more details on Mixtral, check out the original repo: https://huggingface.co/mistral-ai", default="7B", ) parser.add_argument("--output_dir", help="Location to write HF model", required=True) diff --git a/src/transformers/models/moonshine/configuration_moonshine.py b/src/transformers/models/moonshine/configuration_moonshine.py index 0ea6f149e43..dba0d973af8 100644 --- a/src/transformers/models/moonshine/configuration_moonshine.py +++ b/src/transformers/models/moonshine/configuration_moonshine.py @@ -53,7 +53,7 @@ class MoonshineConfig(PretrainedConfig): `encoder_num_key_value_heads=encoder_num_attention_heads`, the model will use Multi Head Attention (MHA), if `encoder_num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this + by meanpooling all the original heads within that group. For more details, check out [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `num_attention_heads`. decoder_num_key_value_heads (`int`, *optional*): @@ -61,7 +61,7 @@ class MoonshineConfig(PretrainedConfig): `decoder_num_key_value_heads=decoder_num_attention_heads`, the model will use Multi Head Attention (MHA), if `decoder_num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this + by meanpooling all the original heads within that group. For more details, check out [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `decoder_num_attention_heads`. pad_head_dim_to_multiple_of (`int`, *optional*): diff --git a/src/transformers/models/moonshine/modular_moonshine.py b/src/transformers/models/moonshine/modular_moonshine.py index 6abc22ae997..f99de20eb02 100644 --- a/src/transformers/models/moonshine/modular_moonshine.py +++ b/src/transformers/models/moonshine/modular_moonshine.py @@ -75,7 +75,7 @@ class MoonshineConfig(PretrainedConfig): `encoder_num_key_value_heads=encoder_num_attention_heads`, the model will use Multi Head Attention (MHA), if `encoder_num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. 
For more details checkout [this + by meanpooling all the original heads within that group. For more details, check out [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `num_attention_heads`. decoder_num_key_value_heads (`int`, *optional*): @@ -83,7 +83,7 @@ class MoonshineConfig(PretrainedConfig): `decoder_num_key_value_heads=decoder_num_attention_heads`, the model will use Multi Head Attention (MHA), if `decoder_num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this + by meanpooling all the original heads within that group. For more details, check out [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `decoder_num_attention_heads`. pad_head_dim_to_multiple_of (`int`, *optional*): diff --git a/src/transformers/models/moshi/configuration_moshi.py b/src/transformers/models/moshi/configuration_moshi.py index 1da69740a7e..02b82ee5ed5 100644 --- a/src/transformers/models/moshi/configuration_moshi.py +++ b/src/transformers/models/moshi/configuration_moshi.py @@ -47,7 +47,7 @@ class MoshiDepthConfig(PretrainedConfig): `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this + by meanpooling all the original heads within that group. For more details, check out [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `num_attention_heads`. audio_vocab_size (`int`, *optional*, defaults to 2048): Vocabulary size of the audio part of model. Defines the number of different tokens that can be @@ -171,7 +171,7 @@ class MoshiConfig(PretrainedConfig): `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this + by meanpooling all the original heads within that group. For more details, check out [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `num_attention_heads`. audio_vocab_size (`int`, *optional*): Vocabulary size of the audio part of model. Defines the number of different tokens that can be diff --git a/src/transformers/models/nemotron/configuration_nemotron.py b/src/transformers/models/nemotron/configuration_nemotron.py index a6f1ad0c5d9..e52d6d14484 100644 --- a/src/transformers/models/nemotron/configuration_nemotron.py +++ b/src/transformers/models/nemotron/configuration_nemotron.py @@ -52,7 +52,7 @@ class NemotronConfig(PretrainedConfig): `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. 
When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this + by meanpooling all the original heads within that group. For more details, check out [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `num_attention_heads`. hidden_act (`str` or `function`, *optional*, defaults to `"relu2"`): diff --git a/src/transformers/models/olmo/configuration_olmo.py b/src/transformers/models/olmo/configuration_olmo.py index 4ad5de61520..2c2da014635 100644 --- a/src/transformers/models/olmo/configuration_olmo.py +++ b/src/transformers/models/olmo/configuration_olmo.py @@ -53,7 +53,7 @@ class OlmoConfig(PretrainedConfig): `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this + by meanpooling all the original heads within that group. For more details, check out [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `num_attention_heads`. hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): diff --git a/src/transformers/models/olmo2/configuration_olmo2.py b/src/transformers/models/olmo2/configuration_olmo2.py index 3c1f396e0f8..96ed3993ed4 100644 --- a/src/transformers/models/olmo2/configuration_olmo2.py +++ b/src/transformers/models/olmo2/configuration_olmo2.py @@ -35,7 +35,7 @@ class Olmo2Config(PretrainedConfig): `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this + by meanpooling all the original heads within that group. For more details, check out [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `num_attention_heads`. hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): diff --git a/src/transformers/models/olmo2/modular_olmo2.py b/src/transformers/models/olmo2/modular_olmo2.py index 103d6616c5f..22d716f8559 100644 --- a/src/transformers/models/olmo2/modular_olmo2.py +++ b/src/transformers/models/olmo2/modular_olmo2.py @@ -49,7 +49,7 @@ class Olmo2Config(OlmoConfig): `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this + by meanpooling all the original heads within that group. For more details, check out [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `num_attention_heads`. 
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): diff --git a/src/transformers/models/olmoe/configuration_olmoe.py b/src/transformers/models/olmoe/configuration_olmoe.py index 2b85ea55e3d..79976597375 100644 --- a/src/transformers/models/olmoe/configuration_olmoe.py +++ b/src/transformers/models/olmoe/configuration_olmoe.py @@ -42,7 +42,7 @@ class OlmoeConfig(PretrainedConfig): `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this + by meanpooling all the original heads within that group. For more details, check out [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `num_attention_heads`. hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): diff --git a/src/transformers/models/phi/configuration_phi.py b/src/transformers/models/phi/configuration_phi.py index 4ffd89db3ad..bd6eb48003c 100644 --- a/src/transformers/models/phi/configuration_phi.py +++ b/src/transformers/models/phi/configuration_phi.py @@ -50,7 +50,7 @@ class PhiConfig(PretrainedConfig): `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this + by meanpooling all the original heads within that group. For more details, check out [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `num_attention_heads`. resid_pdrop (`float`, *optional*, defaults to 0.0): diff --git a/src/transformers/models/phi3/configuration_phi3.py b/src/transformers/models/phi3/configuration_phi3.py index ec133fbd7d5..4b91cbcd147 100644 --- a/src/transformers/models/phi3/configuration_phi3.py +++ b/src/transformers/models/phi3/configuration_phi3.py @@ -49,7 +49,7 @@ class Phi3Config(PretrainedConfig): `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this + by meanpooling all the original heads within that group. For more details, check out [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `num_attention_heads`. 
resid_pdrop (`float`, *optional*, defaults to 0.0): diff --git a/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py b/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py index 3f776b0b71e..fee669feb83 100644 --- a/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +++ b/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py @@ -268,7 +268,7 @@ class Phi4MultimodalConfig(PretrainedConfig): `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this + by meanpooling all the original heads within that group. For more details, check out [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `num_attention_heads`. resid_pdrop (`float`, *optional*, defaults to 0.0): diff --git a/src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py b/src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py index e6ab2c1cb0d..76925919eb2 100644 --- a/src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py +++ b/src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py @@ -304,7 +304,7 @@ class Phi4MultimodalConfig(Phi3Config): `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this + by meanpooling all the original heads within that group. For more details, check out [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `num_attention_heads`. resid_pdrop (`float`, *optional*, defaults to 0.0): diff --git a/src/transformers/models/phimoe/configuration_phimoe.py b/src/transformers/models/phimoe/configuration_phimoe.py index 33123ff8ef2..f30ed7435ca 100644 --- a/src/transformers/models/phimoe/configuration_phimoe.py +++ b/src/transformers/models/phimoe/configuration_phimoe.py @@ -48,7 +48,7 @@ class PhimoeConfig(PretrainedConfig): `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this + by meanpooling all the original heads within that group. For more details, check out [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`. hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): The non-linear activation function (function or string) in the decoder. 
diff --git a/src/transformers/models/qwen2/configuration_qwen2.py b/src/transformers/models/qwen2/configuration_qwen2.py
index f89be07bf13..d8171da3cf7 100644
--- a/src/transformers/models/qwen2/configuration_qwen2.py
+++ b/src/transformers/models/qwen2/configuration_qwen2.py
@@ -50,7 +50,7 @@ class Qwen2Config(PretrainedConfig):
             `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
             `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
             converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
-            by meanpooling all the original heads within that group. For more details checkout [this
+            by meanpooling all the original heads within that group. For more details, check out [this
             paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
         hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
             The non-linear activation function (function or string) in the decoder.
diff --git a/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py
index 3f76da5e3eb..6367355994c 100644
--- a/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py
+++ b/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py
@@ -238,7 +238,7 @@ class Qwen2_5OmniTextConfig(PretrainedConfig):
             `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
             `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
             converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
-            by meanpooling all the original heads within that group. For more details checkout [this
+            by meanpooling all the original heads within that group. For more details, check out [this
             paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
         hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
             The non-linear activation function (function or string) in the decoder.
@@ -584,7 +584,7 @@ class Qwen2_5OmniTalkerConfig(PretrainedConfig):
             `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
             `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
             converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
-            by meanpooling all the original heads within that group. For more details checkout [this
+            by meanpooling all the original heads within that group. For more details, check out [this
             paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
         hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
             The non-linear activation function (function or string) in the decoder.
diff --git a/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py
index 46bd0f23209..bed63636cd7 100644
--- a/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py
+++ b/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py
@@ -277,7 +277,7 @@ class Qwen2_5OmniTextConfig(PretrainedConfig):
             `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
             `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used.
             When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
-            by meanpooling all the original heads within that group. For more details checkout [this
+            by meanpooling all the original heads within that group. For more details, check out [this
             paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
         hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
             The non-linear activation function (function or string) in the decoder.
@@ -623,7 +623,7 @@ class Qwen2_5OmniTalkerConfig(PretrainedConfig):
             `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
             `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used.
             When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
-            by meanpooling all the original heads within that group. For more details checkout [this
+            by meanpooling all the original heads within that group. For more details, check out [this
             paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
         hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
             The non-linear activation function (function or string) in the decoder.
diff --git a/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py
index c053de54b39..01f0599b083 100644
--- a/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py
+++ b/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py
@@ -94,7 +94,7 @@ class Qwen2_5_VLTextConfig(PretrainedConfig):
             `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
             `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
             converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
-            by meanpooling all the original heads within that group. For more details checkout [this
+            by meanpooling all the original heads within that group. For more details, check out [this
             paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
         hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
             The non-linear activation function (function or string) in the decoder.
diff --git a/src/transformers/models/qwen2_moe/configuration_qwen2_moe.py b/src/transformers/models/qwen2_moe/configuration_qwen2_moe.py
index 113f6c09eae..bb7839fa442 100644
--- a/src/transformers/models/qwen2_moe/configuration_qwen2_moe.py
+++ b/src/transformers/models/qwen2_moe/configuration_qwen2_moe.py
@@ -49,7 +49,7 @@ class Qwen2MoeConfig(PretrainedConfig):
             `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
             `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
             converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
-            by meanpooling all the original heads within that group. For more details checkout [this
+            by meanpooling all the original heads within that group. For more details, check out [this
             paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
         hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
             The non-linear activation function (function or string) in the decoder.
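In practice, `num_key_value_heads` is just a config argument. A quick illustration with `Qwen2Config` (the numbers are made up for the example, not any released checkpoint's defaults):

```python
from transformers import Qwen2Config

mha = Qwen2Config(num_attention_heads=32, num_key_value_heads=32)  # MHA: one KV head per query head
gqa = Qwen2Config(num_attention_heads=32, num_key_value_heads=8)   # GQA: 4 query heads share each KV head
mqa = Qwen2Config(num_attention_heads=32, num_key_value_heads=1)   # MQA: all query heads share one KV head
```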
diff --git a/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py b/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py
index 03bfa66c41f..a63e32be849 100644
--- a/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py
+++ b/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py
@@ -83,7 +83,7 @@ class Qwen2VLTextConfig(PretrainedConfig):
             `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
             `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
             converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
-            by meanpooling all the original heads within that group. For more details checkout [this
+            by meanpooling all the original heads within that group. For more details, check out [this
             paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
         hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
             The non-linear activation function (function or string) in the decoder.
diff --git a/src/transformers/models/qwen3/configuration_qwen3.py b/src/transformers/models/qwen3/configuration_qwen3.py
index 8335c798d16..0af5d1f70f3 100644
--- a/src/transformers/models/qwen3/configuration_qwen3.py
+++ b/src/transformers/models/qwen3/configuration_qwen3.py
@@ -50,7 +50,7 @@ class Qwen3Config(PretrainedConfig):
             `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
             `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
             converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
-            by meanpooling all the original heads within that group. For more details checkout [this
+            by meanpooling all the original heads within that group. For more details, check out [this
             paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
         head_dim (`int`, *optional*, defaults to 128):
             The attention head dimension.
diff --git a/src/transformers/models/qwen3_moe/configuration_qwen3_moe.py b/src/transformers/models/qwen3_moe/configuration_qwen3_moe.py
index 082b8ffb8cb..e0029710924 100644
--- a/src/transformers/models/qwen3_moe/configuration_qwen3_moe.py
+++ b/src/transformers/models/qwen3_moe/configuration_qwen3_moe.py
@@ -49,7 +49,7 @@ class Qwen3MoeConfig(PretrainedConfig):
             `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
             `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
             converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
-            by meanpooling all the original heads within that group. For more details checkout [this
+            by meanpooling all the original heads within that group. For more details, check out [this
             paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
         hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
             The non-linear activation function (function or string) in the decoder.
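The reason these configs expose this knob at all is KV-cache memory: the cache scales with the number of key/value heads, not query heads. A back-of-envelope sketch, with all numbers illustrative:

```python
# KV cache = 2 tensors (K and V) per layer, each of shape (kv_heads, seq_len, head_dim).
def kv_cache_bytes(layers: int, kv_heads: int, head_dim: int, seq_len: int, bytes_per_el: int = 2) -> int:
    return 2 * layers * kv_heads * seq_len * head_dim * bytes_per_el

full = kv_cache_bytes(layers=32, kv_heads=32, head_dim=128, seq_len=4096)  # MHA-style cache
gqa = kv_cache_bytes(layers=32, kv_heads=8, head_dim=128, seq_len=4096)   # GQA with 8 KV heads
print(f"{full / 2**30:.1f} GiB vs {gqa / 2**30:.1f} GiB")  # 2.0 GiB vs 0.5 GiB at fp16
```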
diff --git a/src/transformers/models/recurrent_gemma/convert_recurrent_gemma_to_hf.py b/src/transformers/models/recurrent_gemma/convert_recurrent_gemma_to_hf.py
index ea1cdd58ec9..a31c9c3b7ad 100644
--- a/src/transformers/models/recurrent_gemma/convert_recurrent_gemma_to_hf.py
+++ b/src/transformers/models/recurrent_gemma/convert_recurrent_gemma_to_hf.py
@@ -167,7 +167,7 @@ def main():
         "--model_size",
         default="2B",
         choices=["2B", "7B", "tokenizer_only"],
-        help="'f' models correspond to the finetuned versions, and are specific to the Gemma2 official release. For more details on Gemma2, checkout the original repo: https://huggingface.co/google/gemma-7b",
+        help="'f' models correspond to the finetuned versions, and are specific to the Gemma2 official release. For more details on Gemma2, check out the original repo: https://huggingface.co/google/gemma-7b",
     )
     parser.add_argument(
         "--output_dir",
diff --git a/src/transformers/models/stablelm/configuration_stablelm.py b/src/transformers/models/stablelm/configuration_stablelm.py
index 06501792c42..a30f9510c43 100644
--- a/src/transformers/models/stablelm/configuration_stablelm.py
+++ b/src/transformers/models/stablelm/configuration_stablelm.py
@@ -51,7 +51,7 @@ class StableLmConfig(PretrainedConfig):
             `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
             `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
             converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
-            by meanpooling all the original heads within that group. For more details checkout [this
+            by meanpooling all the original heads within that group. For more details, check out [this
             paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
             `num_attention_heads`.
         hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
diff --git a/src/transformers/models/starcoder2/configuration_starcoder2.py b/src/transformers/models/starcoder2/configuration_starcoder2.py
index b617a1cad84..b58e8ddf1a2 100644
--- a/src/transformers/models/starcoder2/configuration_starcoder2.py
+++ b/src/transformers/models/starcoder2/configuration_starcoder2.py
@@ -50,7 +50,7 @@ class Starcoder2Config(PretrainedConfig):
             `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
             `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
             converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
-            by meanpooling all the original heads within that group. For more details checkout [this
+            by meanpooling all the original heads within that group. For more details, check out [this
             paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`.
         hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
             The non-linear activation function (function or string) in the decoder.
diff --git a/src/transformers/models/zamba/configuration_zamba.py b/src/transformers/models/zamba/configuration_zamba.py
index 46d99a32272..e51d0e4ef42 100644
--- a/src/transformers/models/zamba/configuration_zamba.py
+++ b/src/transformers/models/zamba/configuration_zamba.py
@@ -59,7 +59,7 @@ class ZambaConfig(PretrainedConfig):
             `num_key_value_heads=None`, the model will use Multi Head Attention (MHA), if
             `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used.
             When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
-            by meanpooling all the original heads within that group. For more details checkout [this
+            by meanpooling all the original heads within that group. For more details, check out [this
             paper](https://arxiv.org/pdf/2305.13245.pdf).
         n_mamba_heads (`int`, *optional*, defaults to 2):
             Number of mamba heads for each mamba layer.
diff --git a/src/transformers/models/zamba2/configuration_zamba2.py b/src/transformers/models/zamba2/configuration_zamba2.py
index cda81abd430..1392eab1b3c 100644
--- a/src/transformers/models/zamba2/configuration_zamba2.py
+++ b/src/transformers/models/zamba2/configuration_zamba2.py
@@ -79,7 +79,7 @@ class Zamba2Config(PretrainedConfig):
             `num_key_value_heads=None`, the model will use Multi Head Attention (MHA), if
             `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used.
             When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
-            by meanpooling all the original heads within that group. For more details checkout [this
+            by meanpooling all the original heads within that group. For more details, check out [this
             paper](https://arxiv.org/pdf/2305.13245.pdf).
         attention_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for the attention probabilities.
diff --git a/src/transformers/quantizers/quantizer_bitnet.py b/src/transformers/quantizers/quantizer_bitnet.py
index bac1f01a9ac..4df34d22a7d 100644
--- a/src/transformers/quantizers/quantizer_bitnet.py
+++ b/src/transformers/quantizers/quantizer_bitnet.py
@@ -34,7 +34,7 @@ class BitNetHfQuantizer(HfQuantizer):
     1.58-bit quantization from BitNet quantization method:
     Before loading: it converts the linear layers into BitLinear layers during loading.
 
-    Checkout the paper introducing this method : https://arxiv.org/pdf/2402.17764
+    Check out the paper introducing this method: https://arxiv.org/pdf/2402.17764
     """
 
     requires_parameters_quantization = False
diff --git a/src/transformers/utils/hub.py b/src/transformers/utils/hub.py
index ba880f7bb55..f1104591295 100644
--- a/src/transformers/utils/hub.py
+++ b/src/transformers/utils/hub.py
@@ -90,7 +90,7 @@ torch_cache_home = os.getenv("TORCH_HOME", os.path.join(os.getenv("XDG_CACHE_HOM
 default_cache_path = constants.default_cache_path
 
 # Determine default cache directory. Lots of legacy environment variables to ensure backward compatibility.
-# The best way to set the cache path is with the environment variable HF_HOME. For more details, checkout this
+# The best way to set the cache path is with the environment variable HF_HOME. For more details, check out this
 # documentation page: https://huggingface.co/docs/huggingface_hub/package_reference/environment_variables.
 #
 # In code, use `HF_HUB_CACHE` as the default cache path. This variable is set by the library and is guaranteed
@@ -542,7 +542,7 @@ def cached_files(
         elif _raise_exceptions_for_missing_entries:
             raise OSError(
                 f"We couldn't connect to '{HUGGINGFACE_CO_RESOLVE_ENDPOINT}' to load the files, and couldn't find them in the"
-                f" cached files.\nCheckout your internet connection or see how to run the library in offline mode at"
+                f" cached files.\nCheck your internet connection or see how to run the library in offline mode at"
                 " 'https://huggingface.co/docs/transformers/installation#offline-mode'."
             ) from e
         # snapshot_download will not raise EntryNotFoundError, but hf_hub_download can. If this is the case, it will be treated
diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py
index 6db912589ed..0d412406d9f 100644
--- a/src/transformers/utils/import_utils.py
+++ b/src/transformers/utils/import_utils.py
@@ -1492,7 +1492,7 @@ Please note that you may need to restart your runtime after installation.
 
 # docstyle-ignore
 SENTENCEPIECE_IMPORT_ERROR = """
-{0} requires the SentencePiece library but it was not found in your environment. Checkout the instructions on the
+{0} requires the SentencePiece library but it was not found in your environment. Check out the instructions on the
 installation page of its repo: https://github.com/google/sentencepiece#installation and follow the ones
 that match your environment. Please note that you may need to restart your runtime after installation.
 """
@@ -1500,7 +1500,7 @@ that match your environment. Please note that you may need to restart your runti
 
 # docstyle-ignore
 PROTOBUF_IMPORT_ERROR = """
-{0} requires the protobuf library but it was not found in your environment. Checkout the instructions on the
+{0} requires the protobuf library but it was not found in your environment. Check out the instructions on the
 installation page of its repo: https://github.com/protocolbuffers/protobuf/tree/master/python#installation and follow the ones
 that match your environment. Please note that you may need to restart your runtime after installation.
 """
@@ -1508,7 +1508,7 @@ that match your environment. Please note that you may need to restart your runti
 
 # docstyle-ignore
 FAISS_IMPORT_ERROR = """
-{0} requires the faiss library but it was not found in your environment. Checkout the instructions on the
+{0} requires the faiss library but it was not found in your environment. Check out the instructions on the
 installation page of its repo: https://github.com/facebookresearch/faiss/blob/master/INSTALL.md and follow the ones
 that match your environment. Please note that you may need to restart your runtime after installation.
 """
@@ -1516,7 +1516,7 @@ that match your environment. Please note that you may need to restart your runti
 
 # docstyle-ignore
 PYTORCH_IMPORT_ERROR = """
-{0} requires the PyTorch library but it was not found in your environment. Checkout the instructions on the
+{0} requires the PyTorch library but it was not found in your environment. Check out the instructions on the
 installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment.
 Please note that you may need to restart your runtime after installation.
 """
@@ -1524,7 +1524,7 @@ Please note that you may need to restart your runtime after installation.
 
 # docstyle-ignore
 TORCHVISION_IMPORT_ERROR = """
-{0} requires the Torchvision library but it was not found in your environment. Checkout the instructions on the
+{0} requires the Torchvision library but it was not found in your environment. Check out the instructions on the
 installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment.
 Please note that you may need to restart your runtime after installation.
 """
@@ -1576,7 +1576,7 @@ Please note that you may need to restart your runtime after installation.
 
 # docstyle-ignore
 TENSORFLOW_IMPORT_ERROR = """
-{0} requires the TensorFlow library but it was not found in your environment. Checkout the instructions on the
+{0} requires the TensorFlow library but it was not found in your environment. Check out the instructions on the
 installation page: https://www.tensorflow.org/install and follow the ones that match your environment.
 Please note that you may need to restart your runtime after installation.
 """
@@ -1584,7 +1584,7 @@ Please note that you may need to restart your runtime after installation.
 
 # docstyle-ignore
 DETECTRON2_IMPORT_ERROR = """
-{0} requires the detectron2 library but it was not found in your environment. Checkout the instructions on the
+{0} requires the detectron2 library but it was not found in your environment. Check out the instructions on the
 installation page: https://github.com/facebookresearch/detectron2/blob/master/INSTALL.md and follow the ones
 that match your environment. Please note that you may need to restart your runtime after installation.
 """
@@ -1592,14 +1592,14 @@ that match your environment. Please note that you may need to restart your runti
 
 # docstyle-ignore
 FLAX_IMPORT_ERROR = """
-{0} requires the FLAX library but it was not found in your environment. Checkout the instructions on the
+{0} requires the FLAX library but it was not found in your environment. Check out the instructions on the
 installation page: https://github.com/google/flax and follow the ones that match your environment.
 Please note that you may need to restart your runtime after installation.
 """
 
 # docstyle-ignore
 FTFY_IMPORT_ERROR = """
-{0} requires the ftfy library but it was not found in your environment. Checkout the instructions on the
+{0} requires the ftfy library but it was not found in your environment. Check out the instructions on the
 installation section: https://github.com/rspeer/python-ftfy/tree/master#installing and follow the ones
 that match your environment. Please note that you may need to restart your runtime after installation.
diff --git a/templates/adding_a_new_model/README.md b/templates/adding_a_new_model/README.md
index 8c8c7af0b33..7f60be589b8 100644
--- a/templates/adding_a_new_model/README.md
+++ b/templates/adding_a_new_model/README.md
@@ -19,5 +19,5 @@ limitations under the License.
 This page has been updated in light of the removal of the `add_new_model` script in favor of the more complete
 `add_new_model_like` script.
 
-We recommend you checkout the documentation of [How to add a model](https://huggingface.co/docs/transformers/main/en/add_new_model)
-in the Hugging Face Transformers documentation for complete and up-to-date instructions.
+We recommend you check out the documentation on [how to add a model](https://huggingface.co/docs/transformers/main/en/add_new_model)
+for complete and up-to-date instructions.
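For context, the import-error templates edited above are plain Python format strings; the library fills `{0}` with the name of the object whose backend is missing. A standalone illustration, reusing the FTFY template verbatim (the class name is a hypothetical stand-in):

```python
# Standalone demo of how these templates render; the template text is copied
# from the diff above, and the class name is made up for the example.
FTFY_IMPORT_ERROR = """
{0} requires the ftfy library but it was not found in your environment. Check out the instructions on the
installation section: https://github.com/rspeer/python-ftfy/tree/master#installing and follow the ones
that match your environment. Please note that you may need to restart your runtime after installation.
"""

print(FTFY_IMPORT_ERROR.format("SomeFtfyBackedClass"))
```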