From 1d3a1cc44b6af824f708fab114b174a27f9db2c5 Mon Sep 17 00:00:00 2001
From: Matt
Date: Wed, 1 Mar 2023 16:57:06 +0000
Subject: [PATCH] Add check for different embedding types in examples (#21881)

* Add check for different embedding types in examples

* Correctly update summarization example
---
 examples/tensorflow/language-modeling/run_clm.py    | 10 +++++++++-
 examples/tensorflow/language-modeling/run_mlm.py    | 10 +++++++++-
 .../tensorflow/summarization/run_summarization.py   | 10 +++++++++-
 examples/tensorflow/token-classification/run_ner.py | 10 +++++++++-
 examples/tensorflow/translation/run_translation.py  | 11 ++++++++++-
 5 files changed, 46 insertions(+), 5 deletions(-)

diff --git a/examples/tensorflow/language-modeling/run_clm.py b/examples/tensorflow/language-modeling/run_clm.py
index 861929afb58..7829a887a28 100755
--- a/examples/tensorflow/language-modeling/run_clm.py
+++ b/examples/tensorflow/language-modeling/run_clm.py
@@ -475,7 +475,15 @@ def main():
 
     # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch
     # on a small vocab and want a smaller embedding size, remove this test.
-    embedding_size = model.get_input_embeddings().weight.shape[0]
+    embeddings = model.get_input_embeddings()
+
+    # Matt: This is a temporary workaround as we transition our models to exclusively using Keras embeddings.
+    # As soon as the transition is complete, all embeddings should be keras.Embeddings layers, and
+    # the weights will always be in embeddings.embeddings.
+    if hasattr(embeddings, "embeddings"):
+        embedding_size = embeddings.embeddings.shape[0]
+    else:
+        embedding_size = embeddings.weight.shape[0]
     if len(tokenizer) > embedding_size:
         model.resize_token_embeddings(len(tokenizer))
     # endregion
diff --git a/examples/tensorflow/language-modeling/run_mlm.py b/examples/tensorflow/language-modeling/run_mlm.py
index 5db7130df55..14fe7cefd20 100755
--- a/examples/tensorflow/language-modeling/run_mlm.py
+++ b/examples/tensorflow/language-modeling/run_mlm.py
@@ -491,7 +491,15 @@ def main():
 
     # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch
     # on a small vocab and want a smaller embedding size, remove this test.
-    embedding_size = model.get_input_embeddings().weight.shape[0]
+    embeddings = model.get_input_embeddings()
+
+    # Matt: This is a temporary workaround as we transition our models to exclusively using Keras embeddings.
+    # As soon as the transition is complete, all embeddings should be keras.Embeddings layers, and
+    # the weights will always be in embeddings.embeddings.
+    if hasattr(embeddings, "embeddings"):
+        embedding_size = embeddings.embeddings.shape[0]
+    else:
+        embedding_size = embeddings.weight.shape[0]
     if len(tokenizer) > embedding_size:
         model.resize_token_embeddings(len(tokenizer))
     # endregion
diff --git a/examples/tensorflow/summarization/run_summarization.py b/examples/tensorflow/summarization/run_summarization.py
index 61ee9c2ba6d..d8705ad6334 100644
--- a/examples/tensorflow/summarization/run_summarization.py
+++ b/examples/tensorflow/summarization/run_summarization.py
@@ -518,7 +518,15 @@ def main():
 
     # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch
     # on a small vocab and want a smaller embedding size, remove this test.
-    embedding_size = model.get_input_embeddings().weight.shape[0]
+    embeddings = model.get_input_embeddings()
+
+    # Matt: This is a temporary workaround as we transition our models to exclusively using Keras embeddings.
+    # As soon as the transition is complete, all embeddings should be keras.Embeddings layers, and
+    # the weights will always be in embeddings.embeddings.
+    if hasattr(embeddings, "embeddings"):
+        embedding_size = embeddings.embeddings.shape[0]
+    else:
+        embedding_size = embeddings.weight.shape[0]
     if len(tokenizer) > embedding_size:
         model.resize_token_embeddings(len(tokenizer))
     # endregion
diff --git a/examples/tensorflow/token-classification/run_ner.py b/examples/tensorflow/token-classification/run_ner.py
index 7b90938f02d..b8897358805 100644
--- a/examples/tensorflow/token-classification/run_ner.py
+++ b/examples/tensorflow/token-classification/run_ner.py
@@ -387,7 +387,15 @@ def main():
 
     # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch
     # on a small vocab and want a smaller embedding size, remove this test.
-    embedding_size = model.get_input_embeddings().weight.shape[0]
+    embeddings = model.get_input_embeddings()
+
+    # Matt: This is a temporary workaround as we transition our models to exclusively using Keras embeddings.
+    # As soon as the transition is complete, all embeddings should be keras.Embeddings layers, and
+    # the weights will always be in embeddings.embeddings.
+    if hasattr(embeddings, "embeddings"):
+        embedding_size = embeddings.embeddings.shape[0]
+    else:
+        embedding_size = embeddings.weight.shape[0]
     if len(tokenizer) > embedding_size:
         model.resize_token_embeddings(len(tokenizer))
     # endregion
diff --git a/examples/tensorflow/translation/run_translation.py b/examples/tensorflow/translation/run_translation.py
index 09c0b8a9ea7..c9a753139e6 100644
--- a/examples/tensorflow/translation/run_translation.py
+++ b/examples/tensorflow/translation/run_translation.py
@@ -471,9 +471,18 @@ def main():
 
     # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch
     # on a small vocab and want a smaller embedding size, remove this test.
-    embedding_size = model.get_input_embeddings().weight.shape[0]
+    embeddings = model.get_input_embeddings()
+
+    # Matt: This is a temporary workaround as we transition our models to exclusively using Keras embeddings.
+    # As soon as the transition is complete, all embeddings should be keras.Embeddings layers, and
+    # the weights will always be in embeddings.embeddings.
+    if hasattr(embeddings, "embeddings"):
+        embedding_size = embeddings.embeddings.shape[0]
+    else:
+        embedding_size = embeddings.weight.shape[0]
     if len(tokenizer) > embedding_size:
         model.resize_token_embeddings(len(tokenizer))
+
     if isinstance(tokenizer, tuple(MULTILINGUAL_TOKENIZERS)):
         model.config.forced_bos_token_id = forced_bos_token_id
     # endregion
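
Note on the pattern above: the same branch is duplicated verbatim in all five example scripts. As a minimal standalone sketch of the check (not part of this patch; the helper name get_embedding_size is hypothetical, introduced here only for illustration):

    def get_embedding_size(model):
        """Return the input-embedding vocab size for either embedding type.

        During the transition described in the patch comments, a TF model's
        input embeddings may be either a Keras Embedding layer (weight matrix
        stored in the `.embeddings` attribute) or an older layer whose weight
        matrix lives in `.weight`. We branch on which attribute is present.
        """
        embeddings = model.get_input_embeddings()
        if hasattr(embeddings, "embeddings"):
            # Keras embedding layer: weight matrix has shape (vocab_size, hidden_dim)
            return embeddings.embeddings.shape[0]
        else:
            # Legacy embedding layer: weight matrix stored in `.weight`
            return embeddings.weight.shape[0]

    # Usage mirroring the patched scripts:
    # if len(tokenizer) > get_embedding_size(model):
    #     model.resize_token_embeddings(len(tokenizer))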