diff --git a/src/transformers/models/clip/convert_clip_original_pytorch_to_hf.py b/src/transformers/models/clip/convert_clip_original_pytorch_to_hf.py
index ff716a5b93f..60849c2efb7 100644
--- a/src/transformers/models/clip/convert_clip_original_pytorch_to_hf.py
+++ b/src/transformers/models/clip/convert_clip_original_pytorch_to_hf.py
@@ -82,7 +82,7 @@ def copy_encoder(hf_encoder, pt_model):
 
 def copy_text_model_and_projection(hf_model, pt_model):
     # copy projection
-    hf_model.text_projection.weight.data = pt_model.text_projection.data.T
+    hf_model.text_projection.weight.data = pt_model.text_projection.data.T.contiguous()
 
     # copy text encoder
     copy_encoder(hf_model.text_model, pt_model)
@@ -90,7 +90,7 @@ def copy_text_model_and_projection(hf_model, pt_model):
 
 def copy_vison_model_and_projection(hf_model, pt_model):
     # copy projection
-    hf_model.visual_projection.weight.data = pt_model.visual.proj.data.T
+    hf_model.visual_projection.weight.data = pt_model.visual.proj.data.T.contiguous()
 
     # copy layer norms
     copy_linear(hf_model.vision_model.pre_layrnorm, pt_model.visual.ln_pre)
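
For context, the added `.contiguous()` call matters because `Tensor.T` returns a stride-swapped view rather than a fresh buffer, and downstream consumers of the converted checkpoint can reject non-contiguous tensors. The sketch below is illustrative only; the tensor shape and the serialization rationale are assumptions, not taken from the diff itself:

```python
import torch

# Stand-in for pt_model.text_projection; the 512x768 shape is an assumption.
proj = torch.randn(512, 768)

transposed = proj.T                # a view sharing storage, with strides swapped
print(transposed.is_contiguous())  # False

fixed = proj.T.contiguous()        # copies the data into a fresh, contiguous buffer
print(fixed.is_contiguous())       # True
```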