Mirror of https://github.com/huggingface/transformers.git (synced 2025-07-31 02:02:21 +06:00)
Add TVLT (#20725)
* Update image_processing_tvlt.py * Update modeling_tvlt.py * Update * Update modeling_tvlt.py * Create tvlt.mdx * Update configuration_tvlt.py * Update modeling_tvlt.py * Update test_modeling_tvlt.py * Update modeling_tvlt.py * Update modeling_tvlt.py * Update image_processing_tvlt.py * Update feature_extraction_tvlt.py * Update tvlt models * Update tests * Update * Update * Update tests * Update README_ko.md * Update README_ja.md * Update README_ko.md * Update README_zh-hans.md * Update docs/source/en/model_doc/tvlt.mdx Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * Update docs/source/en/model_doc/tvlt.mdx Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * Update src/transformers/models/tvlt/configuration_tvlt.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * Update src/transformers/models/tvlt/configuration_tvlt.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * Update src/transformers/models/tvlt/configuration_tvlt.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * Update src/transformers/models/tvlt/configuration_tvlt.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * Update tvlt.mdx * Update modeling_tvlt.py * Update configuration_tvlt.py * Update modeling_tvlt.py * Update modeling_tvlt.py * Update modeling_tvlt.py * Update modeling_tvlt.py * Add files via upload * Update model * Update modeling_tvlt.py * Update tvlt models * Update src/transformers/models/tvlt/__init__.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/tvlt/__init__.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/tvlt/image_processing_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/tvlt/image_processing_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Add files via upload * Add files via upload * Delete modeling_tvlt.py * Delete feature_extraction_tvlt.py * Delete configuration_tvlt.py * Delete image_processing_tvlt.py * Delete processing_tvlt.py * Update tvlt * Update src/transformers/models/tvlt/configuration_tvlt.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/models/tvlt/image_processing_tvlt.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * Update src/transformers/models/tvlt/configuration_tvlt.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * Update tests/models/tvlt/test_modeling_tvlt.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update tests/models/tvlt/test_modeling_tvlt.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/models/tvlt/modeling_tvlt.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/models/tvlt/modeling_tvlt.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update README.md Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update README_es.md * Update README_hd.md * Update README_ja.md * Update README_ko.md * Update README_zh-hans.md * Update README_zh-hant.md * Update index.mdx * Update tvlt.mdx * Update tvlt.mdx * Update configuration_tvlt.py * Update 
src/transformers/models/tvlt/image_processing_tvlt.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/models/tvlt/image_processing_tvlt.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/models/tvlt/image_processing_tvlt.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/models/tvlt/image_processing_tvlt.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/models/tvlt/modeling_tvlt.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update modeling_tvlt.py * Add files via upload * Update tvlt.mdx * Update modeling_auto.py * Add files via upload * Add files via upload * Update dummy_pt_objects.py * Update __init__.py * Update feature_extraction_tvlt.py * Update feature_extraction_tvlt.py * Update image_processing_tvlt.py * Update modeling_auto.py * Update test_feature_extraction_tvlt.py * Update test_processor_tvlt.py * Update test_feature_extraction_tvlt.py * Add files via upload * Update test_image_processor_tvlt.py * Update tests/models/tvlt/test_processor_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/tvlt/modeling_tvlt.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * Update src/transformers/models/tvlt/processing_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/tvlt/modeling_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/tvlt/modeling_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update tests/models/tvlt/test_image_processor_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/tvlt/modeling_tvlt.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * Update tests/models/tvlt/test_image_processor_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/tvlt/modeling_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update tests/models/tvlt/test_image_processor_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update tests/models/tvlt/test_image_processor_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update tests/models/tvlt/test_modeling_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/tvlt/modeling_tvlt.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * Update src/transformers/models/tvlt/modeling_tvlt.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * Update src/transformers/models/tvlt/configuration_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update tests/models/tvlt/test_feature_extraction_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/tvlt/processing_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update docs/source/en/model_doc/tvlt.mdx Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update 
src/transformers/models/tvlt/configuration_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/tvlt/configuration_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/tvlt/configuration_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/tvlt/configuration_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/tvlt/configuration_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/tvlt/configuration_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/tvlt/configuration_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/tvlt/configuration_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/tvlt/configuration_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/tvlt/configuration_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/tvlt/feature_extraction_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/tvlt/configuration_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/tvlt/feature_extraction_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/tvlt/feature_extraction_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/tvlt/feature_extraction_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update feature_extraction_tvlt.py * Update feature_extraction_tvlt.py * Update src/transformers/models/tvlt/image_processing_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/tvlt/image_processing_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update image_processing_tvlt.py * Update src/transformers/models/tvlt/modeling_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update modeling_tvlt.py * Update modeling_tvlt.py * Update modeling_tvlt.py * Update test_image_processor_tvlt.py * Update tests/models/tvlt/test_modeling_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update tests/models/tvlt/test_modeling_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update tests/models/tvlt/test_modeling_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update tests/models/tvlt/test_modeling_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update tests/models/tvlt/test_modeling_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update tests/models/tvlt/test_modeling_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update tests/models/tvlt/test_modeling_tvlt.py Co-authored-by: amyeroberts 
<22614925+amyeroberts@users.noreply.github.com> * Update tests/models/tvlt/test_modeling_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update tests/models/tvlt/test_modeling_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Add files via upload * Add files via upload * Update modeling_tvlt.py * Update modeling_tvlt.py * Update modeling_tvlt.py * Add files via upload * Update docs/source/en/model_doc/tvlt.mdx Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update image_processing_tvlt.py * Add files via upload * Add files via upload * Update tvlt.mdx * Update docs/source/en/model_doc/tvlt.mdx Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * Update docs/source/en/model_doc/tvlt.mdx Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * Update src/transformers/models/tvlt/image_processing_tvlt.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * Update src/transformers/models/tvlt/image_processing_tvlt.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * Update docs/source/en/model_doc/tvlt.mdx Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com> * Update docs/source/en/model_doc/tvlt.mdx Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com> * Update src/transformers/models/tvlt/configuration_tvlt.py Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com> * Add files via upload * Add files via upload * Add files via upload * Add files via upload * Update modeling_auto.py * Update tvlt.mdx * Update dummy_pt_objects.py * Update feature_extraction_tvlt.py * Update modeling_tvlt.py * Update test_feature_extraction_tvlt.py * Update test_image_processor_tvlt.py * Update test_feature_extraction_tvlt.py * Update modeling_tvlt.py * Update dummy_pt_objects.py * Update dummy_speech_objects.py * Add files via upload * Update README_hd.md * Update modeling_tvlt.py * Update modeling_tvlt.py * Update modeling_tvlt.py * Update modeling_tvlt.py * Update modeling_tvlt.py * Update modeling_tvlt.py * Update test_modeling_tvlt.py * Update src/transformers/models/tvlt/configuration_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/tvlt/feature_extraction_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/tvlt/image_processing_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/tvlt/image_processing_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/tvlt/modeling_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/tvlt/image_processing_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/tvlt/modeling_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update MAE processing * Update modeling_tvlt.py * Update modeling_tvlt.py * Update modeling * Update style * Update src/transformers/models/tvlt/modeling_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/tvlt/modeling_tvlt.py Co-authored-by: amyeroberts 
<22614925+amyeroberts@users.noreply.github.com> * Update check_repo.py * Update tvlt.mdx * Update __init__.py * Update tests * Update tvlt models * Update configuration_tvlt.py * Update configuration_tvlt.py * Update image_processing_tvlt.py * Update dummy_pt_objects.py * Add files via upload * Update test_modeling_tvlt.py * Update test_feature_extraction_tvlt.py * Update test_feature_extraction_tvlt.py * Update test_feature_extraction_tvlt.py * Update test_feature_extraction_tvlt.py * Update test_feature_extraction_tvlt.py * Update test_feature_extraction_tvlt.py --------- Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com>
This commit is contained in: parent 7bac51837b, commit a0e69a9375
@ -421,6 +421,7 @@ Current number of checkpoints:
1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine
|
||||
1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
|
||||
1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
|
||||
1. **[TVLT](https://huggingface.co/docs/transformers/main/model_doc/tvlt)** (from UNC Chapel Hill) released with the paper [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) by Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal.
|
||||
1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler
|
||||
1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
|
||||
1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu.
|
||||
|
@ -414,6 +414,7 @@ Número actual de puntos de control:
1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine
|
||||
1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
|
||||
1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
|
||||
1. **[TVLT](https://huggingface.co/docs/transformers/main/model_doc/tvlt)** (from UNC Chapel Hill) released with the paper [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) by Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal.
|
||||
1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler
|
||||
1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
|
||||
1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu.
|
||||
|
@ -386,6 +386,7 @@ conda install -c huggingface transformers
|
||||
1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine
|
||||
1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
|
||||
1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft) released with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
|
||||
1. **[TVLT](https://huggingface.co/docs/transformers/main/model_doc/tvlt)** (from UNC Chapel Hill) released with the paper [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) by Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal.
|
||||
1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler
|
||||
1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (माइक्रोसॉफ्ट रिसर्च से) साथ में दिया गया पेपर [UniSpeech: यूनिफाइड स्पीच रिप्रेजेंटेशन लर्निंग विद लेबलेड एंड अनलेबल्ड डेटा](https://arxiv.org/abs/2101.07597) चेंगई वांग, यू वू, याओ कियान, केनिची कुमातानी, शुजी लियू, फुरु वेई, माइकल ज़ेंग, ज़ुएदोंग हुआंग द्वारा।
|
||||
1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (माइक्रोसॉफ्ट रिसर्च से) कागज के साथ [UNISPEECH-SAT: यूनिवर्सल स्पीच रिप्रेजेंटेशन लर्निंग विद स्पीकर अवेयर प्री-ट्रेनिंग ](https://arxiv.org/abs/2110.05752) सानयुआन चेन, यू वू, चेंग्यी वांग, झेंगयांग चेन, झूओ चेन, शुजी लियू, जियान वू, याओ कियान, फुरु वेई, जिन्यु ली, जियांगज़ान यू द्वारा पोस्ट किया गया।
|
||||
|
@ -448,6 +448,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ
|
||||
1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (the University of California at Berkeley から) Michael Janner, Qiyang Li, Sergey Levine から公開された研究論文: [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039)
|
||||
1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (Google/CMU から) Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov から公開された研究論文: [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860)
|
||||
1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (Microsoft から), Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei から公開された研究論文: [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282)
|
||||
1. **[TVLT](https://huggingface.co/docs/transformers/main/model_doc/tvlt)** (UNC Chapel Hill から), Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal から公開された研究論文: [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156)
|
||||
1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (Google Research から) Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler から公開された研究論文: [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1)
|
||||
1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (Microsoft Research から) Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang から公開された研究論文: [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597)
|
||||
1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (Microsoft Research から) Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu から公開された研究論文: [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752)
|
||||
|
@ -363,6 +363,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는
|
||||
1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (the University of California at Berkeley 에서) Michael Janner, Qiyang Li, Sergey Levine 의 [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) 논문과 함께 발표했습니다.
|
||||
1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (Google/CMU 에서) Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov 의 [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) 논문과 함께 발표했습니다.
|
||||
1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (Microsoft 에서) Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei 의 [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) 논문과 함께 발표했습니다.
|
||||
1. **[TVLT](https://huggingface.co/docs/transformers/main/model_doc/tvlt)** (UNC Chapel Hill 에서) Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal 의 [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) 논문과 함께 발표했습니다.
|
||||
1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (Google Research 에서) Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler 의 [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) 논문과 함께 발표했습니다.
|
||||
1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (Microsoft Research 에서) Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang 의 [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) 논문과 함께 발표했습니다.
|
||||
1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (Microsoft Research 에서) Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu 의 [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) 논문과 함께 발표했습니다.
|
||||
|
@ -387,6 +387,7 @@ conda install -c huggingface transformers
|
||||
1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine
|
||||
1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (来自 Google/CMU) 伴随论文 [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) 由 Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov 发布。
|
||||
1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (来自 Microsoft) 伴随论文 [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) 由 Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei 发布。
|
||||
1. **[TVLT](https://huggingface.co/docs/transformers/main/model_doc/tvlt)** (来自 UNC Chapel Hill) 伴随论文 [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) 由 Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal 发布。
|
||||
1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler
|
||||
1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (来自 Microsoft Research) 伴随论文 [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) 由 Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang 发布。
|
||||
1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (来自 Microsoft Research) 伴随论文 [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) 由 Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu 发布。
|
||||
|
@ -399,6 +399,7 @@ conda install -c huggingface transformers
|
||||
1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine
|
||||
1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
|
||||
1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft) released with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
|
||||
1. **[TVLT](https://huggingface.co/docs/transformers/main/model_doc/tvlt)** (from UNC Chapel Hill) released with the paper [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) by Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal.
|
||||
1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler
|
||||
1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
|
||||
1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu.
|
||||
|
@ -576,6 +576,8 @@
|
||||
title: TAPAS
|
||||
- local: model_doc/trocr
|
||||
title: TrOCR
|
||||
- local: model_doc/tvlt
|
||||
title: TVLT
|
||||
- local: model_doc/vilt
|
||||
title: ViLT
|
||||
- local: model_doc/vision-encoder-decoder
|
||||
|
@ -200,6 +200,7 @@ The documentation is organized into five sections:
|
||||
1. **[Trajectory Transformer](model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine
|
||||
1. **[Transformer-XL](model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
|
||||
1. **[TrOCR](model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
|
||||
1. **[TVLT](model_doc/tvlt)** (from UNC Chapel Hill) released with the paper [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) by Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal.
|
||||
1. **[UL2](model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler
|
||||
1. **[UniSpeech](model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
|
||||
1. **[UniSpeechSat](model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu.
|
||||
@ -382,6 +383,7 @@ Flax), PyTorch, and/or TensorFlow.
|
||||
| Trajectory Transformer | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| Transformer-XL | ✅ | ❌ | ✅ | ✅ | ❌ |
|
||||
| TrOCR | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| TVLT | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| UniSpeech | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| UniSpeechSat | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| UPerNet | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
@ -410,4 +412,4 @@ Flax), PyTorch, and/or TensorFlow.
|
||||
| YOLOS | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| YOSO | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
|
||||
<!-- End table-->
|
||||
|
73
docs/source/en/model_doc/tvlt.mdx
Normal file
@ -0,0 +1,73 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# TVLT
|
||||
|
||||
## Overview
|
||||
|
||||
The TVLT model was proposed in [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156)
|
||||
by Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal (the first three authors contributed equally). The Textless Vision-Language Transformer (TVLT) is a model that uses raw visual and audio inputs for vision-and-language representation learning, without using text-specific modules such as tokenization or automatic speech recognition (ASR). It can perform various audiovisual and vision-language tasks like retrieval, question answering, etc.
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
*In this work, we present the Textless Vision-Language Transformer (TVLT), where homogeneous transformer blocks take raw visual and audio inputs for vision-and-language representation learning with minimal modality-specific design, and do not use text-specific modules such as tokenization or automatic speech recognition (ASR). TVLT is trained by reconstructing masked patches of continuous video frames and audio spectrograms (masked autoencoding) and contrastive modeling to align video and audio. TVLT attains performance comparable to its text-based counterpart on various multimodal tasks, such as visual question answering, image retrieval, video retrieval, and multimodal sentiment analysis, with 28x faster inference speed and only 1/3 of the parameters. Our findings suggest the possibility of learning compact and efficient visual-linguistic representations from low-level visual and audio signals without assuming the prior existence of text.*
|
||||
|
||||
Tips:

- TVLT is a model that takes both `pixel_values` and `audio_values` as input. One can use [`TvltProcessor`] to prepare data for the model (see the sketch below).
  This processor wraps an image processor (for the image/video modality) and an audio feature extractor (for the audio modality) into one.
- TVLT is trained with images/videos and audios of various sizes: the authors resize and crop the input images/videos to 224x224 and limit the length of the audio spectrogram to 2048. To make batching of videos and audios possible, the authors use a `pixel_mask` that indicates which pixels are real/padding and an `audio_mask` that indicates which audio values are real/padding.
- The design of TVLT is very similar to that of a standard Vision Transformer (ViT) and masked autoencoder (MAE), as in [ViTMAE](vitmae). The difference is that the model includes embedding layers for the audio modality.
- The PyTorch version of this model is only available in torch 1.10 and higher.
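The tips above can be made concrete with a short, illustrative snippet. This is a hedged sketch rather than code from this PR: the checkpoint name `ZinengTang/tvlt-base` is the one referenced in `TVLT_PRETRAINED_CONFIG_ARCHIVE_MAP` further below, the random arrays only stand in for real video frames and audio, and the exact keyword names should be checked against the `TvltProcessor.__call__` documentation.

```python
import numpy as np
import torch
from transformers import TvltProcessor, TvltModel

# Checkpoint name assumed from the config archive map added in this PR.
processor = TvltProcessor.from_pretrained("ZinengTang/tvlt-base")
model = TvltModel.from_pretrained("ZinengTang/tvlt-base")

# Dummy inputs: 8 RGB frames at 224x224 and ~2 seconds of 44.1 kHz mono audio.
video_frames = list(np.random.rand(8, 224, 224, 3))
audio = np.random.rand(2 * 44100)

# The processor routes frames to the image processor and audio to the feature extractor,
# returning pixel_values, audio_values and the corresponding padding masks.
inputs = processor(images=video_frames, audio=audio, sampling_rate=44100, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)
print(outputs.last_hidden_state.shape)
```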
|
||||
|
||||
<p align="center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/tvlt_architecture.png"
|
||||
alt="drawing" width="600"/>
|
||||
</p>
|
||||
|
||||
<small> TVLT architecture. Taken from the <a href="https://arxiv.org/abs/2209.14156">original paper</a>. </small>
|
||||
|
||||
The original code can be found [here](https://github.com/zinengtang/TVLT). This model was contributed by [Zineng Tang](https://huggingface.co/ZinengTang).
|
||||
|
||||
## TvltConfig
|
||||
|
||||
[[autodoc]] TvltConfig
|
||||
|
||||
## TvltProcessor
|
||||
|
||||
[[autodoc]] TvltProcessor
|
||||
- __call__
|
||||
|
||||
## TvltImageProcessor
|
||||
|
||||
[[autodoc]] TvltImageProcessor
|
||||
- preprocess
|
||||
|
||||
## TvltFeatureExtractor
|
||||
|
||||
[[autodoc]] TvltFeatureExtractor
|
||||
- __call__
|
||||
|
||||
## TvltModel
|
||||
|
||||
[[autodoc]] TvltModel
|
||||
- forward
|
||||
|
||||
## TvltForPreTraining
|
||||
|
||||
[[autodoc]] TvltForPreTraining
|
||||
- forward
|
||||
|
||||
## TvltForAudioVisualClassification
|
||||
|
||||
[[autodoc]] TvltForAudioVisualClassification
|
||||
- forward
|
26
src/transformers/__init__.py
Executable file → Normal file
@ -444,6 +444,11 @@ _import_structure = {
|
||||
"TrOCRConfig",
|
||||
"TrOCRProcessor",
|
||||
],
|
||||
"models.tvlt": [
|
||||
"TVLT_PRETRAINED_CONFIG_ARCHIVE_MAP",
|
||||
"TvltConfig",
|
||||
"TvltProcessor",
|
||||
],
|
||||
"models.unispeech": [
|
||||
"UNISPEECH_PRETRAINED_CONFIG_ARCHIVE_MAP",
|
||||
"UniSpeechConfig",
|
||||
@ -750,6 +755,7 @@ else:
|
||||
_import_structure["models.mctct"].append("MCTCTFeatureExtractor")
|
||||
_import_structure["models.speech_to_text"].append("Speech2TextFeatureExtractor")
|
||||
_import_structure["models.speecht5"].append("SpeechT5FeatureExtractor")
|
||||
_import_structure["models.tvlt"].append("TvltFeatureExtractor")
|
||||
|
||||
# Tensorflow-text-specific objects
|
||||
try:
|
||||
@ -826,6 +832,7 @@ else:
|
||||
_import_structure["models.poolformer"].extend(["PoolFormerFeatureExtractor", "PoolFormerImageProcessor"])
|
||||
_import_structure["models.segformer"].extend(["SegformerFeatureExtractor", "SegformerImageProcessor"])
|
||||
_import_structure["models.swin2sr"].append("Swin2SRImageProcessor")
|
||||
_import_structure["models.tvlt"].append("TvltImageProcessor")
|
||||
_import_structure["models.videomae"].extend(["VideoMAEFeatureExtractor", "VideoMAEImageProcessor"])
|
||||
_import_structure["models.vilt"].extend(["ViltFeatureExtractor", "ViltImageProcessor", "ViltProcessor"])
|
||||
_import_structure["models.vit"].extend(["ViTFeatureExtractor", "ViTImageProcessor"])
|
||||
@ -2348,6 +2355,15 @@ else:
|
||||
_import_structure["models.trocr"].extend(
|
||||
["TROCR_PRETRAINED_MODEL_ARCHIVE_LIST", "TrOCRForCausalLM", "TrOCRPreTrainedModel"]
|
||||
)
|
||||
_import_structure["models.tvlt"].extend(
|
||||
[
|
||||
"TVLT_PRETRAINED_MODEL_ARCHIVE_LIST",
|
||||
"TvltForAudioVisualClassification",
|
||||
"TvltForPreTraining",
|
||||
"TvltModel",
|
||||
"TvltPreTrainedModel",
|
||||
]
|
||||
)
|
||||
_import_structure["models.unispeech"].extend(
|
||||
[
|
||||
"UNISPEECH_PRETRAINED_MODEL_ARCHIVE_LIST",
|
||||
@ -3937,6 +3953,7 @@ if TYPE_CHECKING:
|
||||
TransfoXLTokenizer,
|
||||
)
|
||||
from .models.trocr import TROCR_PRETRAINED_CONFIG_ARCHIVE_MAP, TrOCRConfig, TrOCRProcessor
|
||||
from .models.tvlt import TVLT_PRETRAINED_CONFIG_ARCHIVE_MAP, TvltConfig, TvltProcessor
|
||||
from .models.unispeech import UNISPEECH_PRETRAINED_CONFIG_ARCHIVE_MAP, UniSpeechConfig
|
||||
from .models.unispeech_sat import UNISPEECH_SAT_PRETRAINED_CONFIG_ARCHIVE_MAP, UniSpeechSatConfig
|
||||
from .models.upernet import UperNetConfig
|
||||
@ -4212,6 +4229,7 @@ if TYPE_CHECKING:
|
||||
from .models.mctct import MCTCTFeatureExtractor
|
||||
from .models.speech_to_text import Speech2TextFeatureExtractor
|
||||
from .models.speecht5 import SpeechT5FeatureExtractor
|
||||
from .models.tvlt import TvltFeatureExtractor
|
||||
|
||||
try:
|
||||
if not is_tensorflow_text_available():
|
||||
@ -4269,6 +4287,7 @@ if TYPE_CHECKING:
|
||||
from .models.poolformer import PoolFormerFeatureExtractor, PoolFormerImageProcessor
|
||||
from .models.segformer import SegformerFeatureExtractor, SegformerImageProcessor
|
||||
from .models.swin2sr import Swin2SRImageProcessor
|
||||
from .models.tvlt import TvltImageProcessor
|
||||
from .models.videomae import VideoMAEFeatureExtractor, VideoMAEImageProcessor
|
||||
from .models.vilt import ViltFeatureExtractor, ViltImageProcessor, ViltProcessor
|
||||
from .models.vit import ViTFeatureExtractor, ViTImageProcessor
|
||||
@ -5510,6 +5529,13 @@ if TYPE_CHECKING:
|
||||
load_tf_weights_in_transfo_xl,
|
||||
)
|
||||
from .models.trocr import TROCR_PRETRAINED_MODEL_ARCHIVE_LIST, TrOCRForCausalLM, TrOCRPreTrainedModel
|
||||
from .models.tvlt import (
|
||||
TVLT_PRETRAINED_MODEL_ARCHIVE_LIST,
|
||||
TvltForAudioVisualClassification,
|
||||
TvltForPreTraining,
|
||||
TvltModel,
|
||||
TvltPreTrainedModel,
|
||||
)
|
||||
from .models.unispeech import (
|
||||
UNISPEECH_PRETRAINED_MODEL_ARCHIVE_LIST,
|
||||
UniSpeechForCTC,
|
||||
|
@ -169,6 +169,7 @@ from . import (
|
||||
trajectory_transformer,
|
||||
transfo_xl,
|
||||
trocr,
|
||||
tvlt,
|
||||
unispeech,
|
||||
unispeech_sat,
|
||||
upernet,
|
||||
|
@ -169,6 +169,7 @@ CONFIG_MAPPING_NAMES = OrderedDict(
|
||||
("trajectory_transformer", "TrajectoryTransformerConfig"),
|
||||
("transfo-xl", "TransfoXLConfig"),
|
||||
("trocr", "TrOCRConfig"),
|
||||
("tvlt", "TvltConfig"),
|
||||
("unispeech", "UniSpeechConfig"),
|
||||
("unispeech-sat", "UniSpeechSatConfig"),
|
||||
("upernet", "UperNetConfig"),
|
||||
@ -329,6 +330,7 @@ CONFIG_ARCHIVE_MAP_MAPPING_NAMES = OrderedDict(
|
||||
("time_series_transformer", "TIME_SERIES_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"),
|
||||
("timesformer", "TIMESFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"),
|
||||
("transfo-xl", "TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP"),
|
||||
("tvlt", "TVLT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
|
||||
("unispeech", "UNISPEECH_PRETRAINED_CONFIG_ARCHIVE_MAP"),
|
||||
("unispeech-sat", "UNISPEECH_SAT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
|
||||
("van", "VAN_PRETRAINED_CONFIG_ARCHIVE_MAP"),
|
||||
@ -516,6 +518,7 @@ MODEL_NAMES_MAPPING = OrderedDict(
|
||||
("trajectory_transformer", "Trajectory Transformer"),
|
||||
("transfo-xl", "Transformer-XL"),
|
||||
("trocr", "TrOCR"),
|
||||
("tvlt", "TVLT"),
|
||||
("ul2", "UL2"),
|
||||
("unispeech", "UniSpeech"),
|
||||
("unispeech-sat", "UniSpeechSat"),
|
||||
|
@ -163,6 +163,7 @@ MODEL_MAPPING_NAMES = OrderedDict(
|
||||
("timesformer", "TimesformerModel"),
|
||||
("trajectory_transformer", "TrajectoryTransformerModel"),
|
||||
("transfo-xl", "TransfoXLModel"),
|
||||
("tvlt", "TvltModel"),
|
||||
("unispeech", "UniSpeechModel"),
|
||||
("unispeech-sat", "UniSpeechSatModel"),
|
||||
("van", "VanModel"),
|
||||
@ -235,6 +236,7 @@ MODEL_FOR_PRETRAINING_MAPPING_NAMES = OrderedDict(
|
||||
("t5", "T5ForConditionalGeneration"),
|
||||
("tapas", "TapasForMaskedLM"),
|
||||
("transfo-xl", "TransfoXLLMHeadModel"),
|
||||
("tvlt", "TvltForPreTraining"),
|
||||
("unispeech", "UniSpeechForPreTraining"),
|
||||
("unispeech-sat", "UniSpeechSatForPreTraining"),
|
||||
("videomae", "VideoMAEForPreTraining"),
|
||||
|
101
src/transformers/models/tvlt/__init__.py
Normal file
@ -0,0 +1,101 @@
|
||||
# flake8: noqa
|
||||
# There's no way to ignore "F401 '...' imported but unused" warnings in this
|
||||
# module, but to preserve other warnings. So, don't check this module at all.
|
||||
|
||||
# Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from ...utils import (
|
||||
OptionalDependencyNotAvailable,
|
||||
_LazyModule,
|
||||
is_speech_available,
|
||||
is_torch_available,
|
||||
is_vision_available,
|
||||
)
|
||||
|
||||
|
||||
_import_structure = {
|
||||
"configuration_tvlt": ["TVLT_PRETRAINED_CONFIG_ARCHIVE_MAP", "TvltConfig"],
|
||||
"processing_tvlt": ["TvltProcessor"],
|
||||
}
|
||||
|
||||
try:
|
||||
if not is_torch_available():
|
||||
raise OptionalDependencyNotAvailable()
|
||||
except OptionalDependencyNotAvailable:
|
||||
pass
|
||||
else:
|
||||
_import_structure["modeling_tvlt"] = [
|
||||
"TVLT_PRETRAINED_MODEL_ARCHIVE_LIST",
|
||||
"TvltModel",
|
||||
"TvltForPreTraining",
|
||||
"TvltForAudioVisualClassification",
|
||||
"TvltPreTrainedModel",
|
||||
]
|
||||
|
||||
try:
|
||||
if not is_vision_available():
|
||||
raise OptionalDependencyNotAvailable()
|
||||
except OptionalDependencyNotAvailable:
|
||||
pass
|
||||
else:
|
||||
_import_structure["image_processing_tvlt"] = ["TvltImageProcessor"]
|
||||
|
||||
try:
|
||||
if not is_speech_available():
|
||||
raise OptionalDependencyNotAvailable()
|
||||
except OptionalDependencyNotAvailable:
|
||||
pass
|
||||
else:
|
||||
_import_structure["feature_extraction_tvlt"] = ["TvltFeatureExtractor"]
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .configuration_tvlt import TVLT_PRETRAINED_CONFIG_ARCHIVE_MAP, TvltConfig
|
||||
from .processing_tvlt import TvltProcessor
|
||||
|
||||
try:
|
||||
if not is_torch_available():
|
||||
raise OptionalDependencyNotAvailable()
|
||||
except OptionalDependencyNotAvailable:
|
||||
pass
|
||||
else:
|
||||
from .modeling_tvlt import (
|
||||
TVLT_PRETRAINED_MODEL_ARCHIVE_LIST,
|
||||
TvltForAudioVisualClassification,
|
||||
TvltForPreTraining,
|
||||
TvltModel,
|
||||
TvltPreTrainedModel,
|
||||
)
|
||||
|
||||
try:
|
||||
if not is_vision_available():
|
||||
raise OptionalDependencyNotAvailable()
|
||||
except OptionalDependencyNotAvailable:
|
||||
pass
|
||||
else:
|
||||
from .image_processing_tvlt import TvltImageProcessor
|
||||
|
||||
try:
|
||||
if not is_speech_available():
|
||||
raise OptionalDependencyNotAvailable()
|
||||
except OptionalDependencyNotAvailable:
|
||||
pass
|
||||
else:
|
||||
from .feature_extraction_tvlt import TvltFeatureExtractor
|
||||
|
||||
else:
|
||||
import sys
|
||||
|
||||
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
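The practical effect of this lazy-import layout is that the configuration and processor are always exported, while the model, image processor and feature extractor only resolve when the corresponding optional backend (torch, vision, speech) is installed. A rough illustration of what a caller sees, assuming all backends are available:

```python
# Always available once the module is registered.
from transformers import TvltConfig, TvltProcessor

# Resolved lazily; these need the torch, vision and speech backends respectively.
from transformers import TvltModel, TvltImageProcessor, TvltFeatureExtractor

print(TvltConfig().model_type)  # "tvlt"
```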
|
187
src/transformers/models/tvlt/configuration_tvlt.py
Normal file
@ -0,0 +1,187 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2023 MURGe-Lab and The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" TVLT model configuration"""
|
||||
|
||||
from ...configuration_utils import PretrainedConfig
|
||||
from ...utils import logging
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
TVLT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
||||
"ZinengTang/tvlt-base": "https://huggingface.co/ZinengTang/tvlt-base/blob/main/config.json",
|
||||
}
|
||||
|
||||
|
||||
class TvltConfig(PretrainedConfig):
|
||||
r"""
|
||||
This is the configuration class to store the configuration of a [`TvltModel`]. It is used to instantiate a TVLT
|
||||
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
|
||||
defaults will yield a similar configuration to that of the TVLT
|
||||
[ZinengTang/tvlt-base](https://huggingface.co/ZinengTang/tvlt-base) architecture.
|
||||
|
||||
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
|
||||
documentation from [`PretrainedConfig`] for more information.
|
||||
|
||||
Args:
|
||||
image_size (`int`, *optional*, defaults to 224):
|
||||
The size (resolution) of each image.
|
||||
spectrogram_length (`int`, *optional*, defaults to 2048):
|
||||
The time length of each audio spectrogram.
|
||||
frequency_length (`int`, *optional*, defaults to 128):
|
||||
The frequency length of audio spectrogram.
|
||||
image_patch_size (`List[int]`, *optional*, defaults to `[16, 16]`):
|
||||
The size (resolution) of each image patch.
|
||||
audio_patch_size (`List[int]`, *optional*, defaults to `[16, 16]`):
|
||||
The size (resolution) of each audio patch.
|
||||
num_image_channels (`int`, *optional*, defaults to 3):
|
||||
The number of input image channels.
|
||||
num_audio_channels (`int`, *optional*, defaults to 1):
|
||||
The number of input audio channels.
|
||||
num_frames (`int`, *optional*, defaults to 8):
|
||||
The maximum number of frames for an input video.
|
||||
hidden_size (`int`, *optional*, defaults to 768):
|
||||
Dimensionality of the encoder layers and the pooler layer.
|
||||
num_hidden_layers (`int`, *optional*, defaults to 12):
|
||||
Number of hidden layers in the Transformer encoder.
|
||||
num_attention_heads (`int`, *optional*, defaults to 12):
|
||||
Number of attention heads for each attention layer in the Transformer encoder.
|
||||
intermediate_size (`int`, *optional*, defaults to 3072):
|
||||
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
|
||||
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
|
||||
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
|
||||
`"relu"`, `"selu"` and `"gelu_new"` are supported.
|
||||
hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
|
||||
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
|
||||
attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
|
||||
The dropout ratio for the attention probabilities.
|
||||
initializer_range (`float`, *optional*, defaults to 0.02):
|
||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||
layer_norm_eps (`float`, *optional*, defaults to 1e-6):
|
||||
The epsilon used by the layer normalization layers.
|
||||
qkv_bias (`bool`, *optional*, defaults to `True`):
|
||||
Whether to add a bias to the queries, keys and values.
|
||||
use_mean_pooling (`bool`, *optional*, defaults to `False`):
|
||||
Whether to mean pool the final hidden states instead of using the final hidden state of the [CLS] token.
|
||||
decoder_num_attention_heads (`int`, *optional*, defaults to 16):
|
||||
Number of attention heads for each attention layer in the decoder.
|
||||
decoder_hidden_size (`int`, *optional*, defaults to 512):
|
||||
Dimensionality of the decoder.
|
||||
decoder_num_hidden_layers (`int`, *optional*, defaults to 8):
|
||||
Number of hidden layers in the decoder.
|
||||
decoder_intermediate_size (`int`, *optional*, defaults to 2048):
|
||||
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the decoder.
|
||||
pixel_mask_ratio (`float`, *optional*, defaults to 0.75):
|
||||
Image patch masking ratio.
|
||||
audio_mask_ratio (`float`, *optional*, defaults to 0.15):
|
||||
Audio patch masking ratio.
|
||||
audio_mask_type (`str`, *optional*, defaults to `"frame-level"`):
|
||||
Audio patch masking type, choose between "frame-level" and "patch-level".
|
||||
task_matching (`bool`, *optional*, defaults to `True`):
|
||||
Whether to use vision audio matching task in pretraining.
|
||||
task_mae (`bool`, *optional*, defaults to `True`):
|
||||
Whether to use the masked auto-encoder (MAE) in pretraining.
|
||||
loss_type (`str`, *optional*, defaults to `"classification"`):
|
||||
The type of loss to compute; one of `"regression"` or `"classification"`.
|
||||
|
||||
Example:
|
||||
|
||||
```python
|
||||
>>> from transformers import TvltConfig, TvltModel
|
||||
|
||||
>>> # Initializing a TVLT ZinengTang/tvlt-base style configuration
|
||||
>>> configuration = TvltConfig()
|
||||
|
||||
>>> # Initializing a model (with random weights) from the ZinengTang/tvlt-base style configuration
|
||||
>>> model = TvltModel(configuration)
|
||||
|
||||
>>> # Accessing the model configuration
|
||||
>>> configuration = model.config
|
||||
```"""
|
||||
model_type = "tvlt"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
image_size=224,
|
||||
spectrogram_length=2048,
|
||||
frequency_length=128,
|
||||
image_patch_size=[16, 16],
|
||||
audio_patch_size=[16, 16],
|
||||
num_image_channels=3,
|
||||
num_audio_channels=1,
|
||||
num_frames=8,
|
||||
hidden_size=768,
|
||||
num_hidden_layers=12,
|
||||
num_attention_heads=12,
|
||||
intermediate_size=3072,
|
||||
hidden_act="gelu",
|
||||
hidden_dropout_prob=0.0,
|
||||
attention_probs_dropout_prob=0.0,
|
||||
initializer_range=0.02,
|
||||
layer_norm_eps=1e-6,
|
||||
qkv_bias=True,
|
||||
use_mean_pooling=False,
|
||||
decoder_num_attention_heads=16,
|
||||
decoder_hidden_size=512,
|
||||
decoder_num_hidden_layers=8,
|
||||
decoder_intermediate_size=2048,
|
||||
pixel_mask_ratio=0.75,
|
||||
audio_mask_ratio=0.15,
|
||||
audio_mask_type="frame-level",
|
||||
task_matching=True,
|
||||
task_mae=True,
|
||||
loss_type="classification",
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
if audio_mask_type not in ("frame-level", "patch-level"):
|
||||
raise ValueError(
|
||||
"audio_mask_type must be one of two acceptable strategies - {'frame_level', 'patch-level') "
|
||||
f"got {audio_mask_type}"
|
||||
)
|
||||
|
||||
self.image_size = image_size
|
||||
self.spectrogram_length = spectrogram_length
|
||||
self.frequency_length = frequency_length
|
||||
self.image_patch_size = image_patch_size
|
||||
self.audio_patch_size = audio_patch_size
|
||||
self.num_image_channels = num_image_channels
|
||||
self.num_audio_channels = num_audio_channels
|
||||
self.num_frames = num_frames
|
||||
|
||||
self.hidden_size = hidden_size
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.intermediate_size = intermediate_size
|
||||
self.hidden_act = hidden_act
|
||||
self.hidden_dropout_prob = hidden_dropout_prob
|
||||
self.attention_probs_dropout_prob = attention_probs_dropout_prob
|
||||
self.initializer_range = initializer_range
|
||||
self.layer_norm_eps = layer_norm_eps
|
||||
self.qkv_bias = qkv_bias
|
||||
self.use_mean_pooling = use_mean_pooling
|
||||
|
||||
self.decoder_num_attention_heads = decoder_num_attention_heads
|
||||
self.decoder_hidden_size = decoder_hidden_size
|
||||
self.decoder_num_hidden_layers = decoder_num_hidden_layers
|
||||
self.decoder_intermediate_size = decoder_intermediate_size
|
||||
self.pixel_mask_ratio = pixel_mask_ratio
|
||||
self.audio_mask_ratio = audio_mask_ratio
|
||||
self.audio_mask_type = audio_mask_type
|
||||
|
||||
self.task_matching = task_matching
|
||||
self.task_mae = task_mae
|
||||
self.loss_type = loss_type
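As a quick illustration of the validation above (a hedged sketch, assuming `TvltConfig` is exported from `transformers` once this PR is merged):

```python
from transformers import TvltConfig

# "frame-level" (the default) and "patch-level" are the only accepted masking strategies
config = TvltConfig(audio_mask_type="patch-level", audio_mask_ratio=0.3)

# Any other value triggers the ValueError raised in __init__ above
try:
    TvltConfig(audio_mask_type="block-level")
except ValueError as err:
    print(err)
```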
src/transformers/models/tvlt/feature_extraction_tvlt.py (new file, 336 lines)
# coding=utf-8
|
||||
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Feature extractor class for TVLT."""
|
||||
|
||||
from math import ceil
|
||||
from typing import List, Optional, Union
|
||||
|
||||
import numpy as np
|
||||
from numpy.fft import fft
|
||||
|
||||
from transformers.feature_extraction_sequence_utils import BatchFeature, SequenceFeatureExtractor
|
||||
from transformers.utils import TensorType, logging
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class TvltFeatureExtractor(SequenceFeatureExtractor):
|
||||
r"""
|
||||
Constructs a TVLT audio feature extractor. This feature extractor can be used to prepare audios for the model.
|
||||
|
||||
This feature extractor inherits from [`FeatureExtractionMixin`] which contains most of the main methods. Users
|
||||
should refer to this superclass for more information regarding those methods.
|
||||
|
||||
Args:
|
||||
spectrogram_length (`int`, *optional*, defaults to 2048):
|
||||
The time length of each audio spectrogram.
|
||||
num_channels (`int` *optional*, defaults to 1):
|
||||
Number of audio channels.
|
||||
patch_size (`List[int]` *optional*, defaults to `[16, 16]`):
|
||||
The patch size of audio patch embedding.
|
||||
feature_size (`int`, defaults to 128):
|
||||
The frequency length of audio spectrogram.
|
||||
sampling_rate (`int`, defaults to 44100):
|
||||
The sampling rate at which the audio files should be digitized, expressed in hertz (Hz).
|
||||
hop_length_to_sampling_rate (`int`, defaults to 86):
|
||||
Hop length is the number of audio samples between successive STFT windows used to obtain the mel-frequency
coefficients. For example, with a sampling rate of 44100 the hop length is 44100 // 86 = 512.
|
||||
n_fft (`int`, defaults to 2048):
|
||||
Size of the Fourier transform.
|
||||
padding_value (`float`, *optional*, defaults to 0.0):
|
||||
Padding value used to pad the audio. Should correspond to silences.
|
||||
"""
|
||||
|
||||
model_input_names = ["audio_values", "audio_mask"]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
spectrogram_length=2048,
|
||||
num_channels=1,
|
||||
patch_size=[16, 16],
|
||||
feature_size=128,
|
||||
sampling_rate=44100,
|
||||
hop_length_to_sampling_rate=86,
|
||||
n_fft=2048,
|
||||
padding_value=0.0,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(
|
||||
feature_size=feature_size,
|
||||
sampling_rate=sampling_rate,
|
||||
padding_value=padding_value,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
self.spectrogram_length = spectrogram_length
|
||||
self.num_channels = num_channels
|
||||
self.patch_size = patch_size
|
||||
self.freq_len = feature_size // self.patch_size[1]
|
||||
self.n_fft = n_fft
|
||||
self.hop_length = sampling_rate // hop_length_to_sampling_rate
|
||||
self.sampling_rate = sampling_rate
|
||||
self.padding_value = padding_value
|
||||
self.mel_filters = self.get_mel_filters(sampling_rate, n_fft, n_mels=feature_size)
|
||||
|
||||
# Copied from transformers.models.whisper.feature_extraction_whisper.WhisperFeatureExtractor.get_mel_filters with 45.245640471924965->59.99247463746737
|
||||
def get_mel_filters(self, sr, n_fft, n_mels=128, dtype=np.float32):
|
||||
# Initialize the weights
|
||||
n_mels = int(n_mels)
|
||||
weights = np.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype)
|
||||
|
||||
# Center freqs of each FFT bin
|
||||
fftfreqs = np.fft.rfftfreq(n=n_fft, d=1.0 / sr)
|
||||
|
||||
# 'Center freqs' of mel bands - uniformly spaced between limits
|
||||
min_mel = 0.0
|
||||
max_mel = 59.99247463746737
|
||||
|
||||
mels = np.linspace(min_mel, max_mel, n_mels + 2)
|
||||
|
||||
mels = np.asanyarray(mels)
|
||||
|
||||
# Fill in the linear scale
|
||||
f_min = 0.0
|
||||
f_sp = 200.0 / 3
|
||||
freqs = f_min + f_sp * mels
|
||||
|
||||
# And now the nonlinear scale
|
||||
min_log_hz = 1000.0 # beginning of log region (Hz)
|
||||
min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels)
|
||||
logstep = np.log(6.4) / 27.0 # step size for log region
|
||||
|
||||
# If we have vector data, vectorize
|
||||
log_t = mels >= min_log_mel
|
||||
freqs[log_t] = min_log_hz * np.exp(logstep * (mels[log_t] - min_log_mel))
|
||||
|
||||
mel_f = freqs
|
||||
|
||||
fdiff = np.diff(mel_f)
|
||||
ramps = np.subtract.outer(mel_f, fftfreqs)
|
||||
|
||||
for i in range(n_mels):
|
||||
# lower and upper slopes for all bins
|
||||
lower = -ramps[i] / fdiff[i]
|
||||
upper = ramps[i + 2] / fdiff[i + 1]
|
||||
|
||||
# .. then intersect them with each other and zero
|
||||
weights[i] = np.maximum(0, np.minimum(lower, upper))
|
||||
|
||||
# Slaney-style mel is scaled to be approx constant energy per channel
|
||||
enorm = 2.0 / (mel_f[2 : n_mels + 2] - mel_f[:n_mels])
|
||||
weights *= enorm[:, np.newaxis]
|
||||
|
||||
return weights
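A small sketch of what the filter bank above looks like in practice (illustrative only; it assumes the `TvltFeatureExtractor` defined in this file with its default arguments):

```python
import numpy as np

from transformers import TvltFeatureExtractor

extractor = TvltFeatureExtractor()   # n_fft=2048, feature_size=128 by default
filters = extractor.mel_filters      # built by get_mel_filters at init time

print(filters.shape)                 # (128, 1025) == (n_mels, 1 + n_fft // 2)
print(bool(np.all(filters >= 0)))    # True: the triangular filters are non-negative
```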
|
||||
|
||||
# Copied from transformers.models.whisper.feature_extraction_whisper.WhisperFeatureExtractor.fram_wave
|
||||
def fram_wave(self, waveform, center=True):
|
||||
"""
|
||||
Transform a raw waveform into a list of smaller waveforms. The window length defines how much of the signal is
contained in each frame (smaller waveform), while the hop length defines the step between the beginnings of
consecutive frames.
|
||||
|
||||
Centering is done by reflecting the waveform which is first centered around `frame_idx * hop_length`.
|
||||
"""
|
||||
frames = []
|
||||
for i in range(0, waveform.shape[0] + 1, self.hop_length):
|
||||
half_window = (self.n_fft - 1) // 2 + 1
|
||||
if center:
|
||||
start = i - half_window if i > half_window else 0
|
||||
end = i + half_window if i < waveform.shape[0] - half_window else waveform.shape[0]
|
||||
|
||||
frame = waveform[start:end]
|
||||
|
||||
if start == 0:
|
||||
padd_width = (-i + half_window, 0)
|
||||
frame = np.pad(frame, pad_width=padd_width, mode="reflect")
|
||||
|
||||
elif end == waveform.shape[0]:
|
||||
padd_width = (0, (i - waveform.shape[0] + half_window))
|
||||
frame = np.pad(frame, pad_width=padd_width, mode="reflect")
|
||||
|
||||
else:
|
||||
frame = waveform[i : i + self.n_fft]
|
||||
frame_width = frame.shape[0]
|
||||
if frame_width < waveform.shape[0]:
|
||||
frame = np.lib.pad(
|
||||
frame, pad_width=(0, self.n_fft - frame_width), mode="constant", constant_values=0
|
||||
)
|
||||
|
||||
frames.append(frame)
|
||||
return np.stack(frames, 0)
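For reference, a hedged example of how the framing behaves with the default hop length and FFT size (the one-second random waveform is purely illustrative):

```python
import numpy as np

from transformers import TvltFeatureExtractor

extractor = TvltFeatureExtractor()                     # hop_length = 44100 // 86 = 512, n_fft = 2048
waveform = np.random.randn(44100).astype(np.float32)  # one second of dummy audio

frames = extractor.fram_wave(waveform)
print(frames.shape)  # (87, 2048): one frame per hop (44100 // 512 + 1), each n_fft samples long
```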
|
||||
|
||||
# Copied from transformers.models.whisper.feature_extraction_whisper.WhisperFeatureExtractor.stft
|
||||
def stft(self, frames, window):
|
||||
"""
|
||||
Calculates the complex Short-Time Fourier Transform (STFT) of the given framed signal. Should give the same
|
||||
results as `torch.stft`.
|
||||
"""
|
||||
frame_size = frames.shape[1]
|
||||
fft_size = self.n_fft
|
||||
|
||||
if fft_size is None:
|
||||
fft_size = frame_size
|
||||
|
||||
if fft_size < frame_size:
|
||||
raise ValueError("FFT size must be greater than or equal to the frame size")
|
||||
# number of FFT bins to store
|
||||
num_fft_bins = (fft_size >> 1) + 1
|
||||
|
||||
data = np.empty((len(frames), num_fft_bins), dtype=np.complex64)
|
||||
fft_signal = np.zeros(fft_size)
|
||||
|
||||
for f, frame in enumerate(frames):
|
||||
if window is not None:
|
||||
np.multiply(frame, window, out=fft_signal[:frame_size])
|
||||
else:
|
||||
fft_signal[:frame_size] = frame
|
||||
data[f] = fft(fft_signal, axis=0)[:num_fft_bins]
|
||||
return data.T
|
||||
|
||||
def _np_extract_fbank_features(self, waveform: np.array) -> np.ndarray:
|
||||
"""
|
||||
Compute the log-mel spectrogram of the provided audio. This gives results similar to Whisper's original torch
implementation, within a 1e-5 tolerance.
|
||||
"""
|
||||
window = np.hanning(self.n_fft + 1)[:-1]
|
||||
|
||||
frames = self.fram_wave(waveform)
|
||||
stft = self.stft(frames, window=window)
|
||||
magnitudes = np.abs(stft[:, :-1]) ** 2
|
||||
|
||||
filters = self.mel_filters
|
||||
mel_spec = filters @ magnitudes
|
||||
|
||||
log_spec = 10.0 * np.log10(np.maximum(1e-10, mel_spec))
|
||||
log_spec -= 10.0 * np.log10(np.maximum(1e-10, 1.0))
|
||||
log_spec = np.maximum(log_spec, log_spec.max() - 80.0)
|
||||
log_spec = log_spec - 20.0
|
||||
log_spec = np.clip(log_spec / 40.0, -2.0, 0.0) + 1.0
|
||||
|
||||
return log_spec
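A short sketch of the resulting log-mel features (calling the private helper directly here only for illustration; shapes assume the default settings):

```python
import numpy as np

from transformers import TvltFeatureExtractor

extractor = TvltFeatureExtractor()
waveform = np.random.randn(44100).astype(np.float32)  # one second of dummy audio

log_mel = extractor._np_extract_fbank_features(waveform)
print(log_mel.shape)                                   # (128, 86): (feature_size, num_frames - 1)
print(float(log_mel.min()) >= -1.0, float(log_mel.max()) <= 1.0)  # True True: values are clipped to [-1, 1]
```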
|
||||
|
||||
def __call__(
|
||||
self,
|
||||
raw_speech: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_attention_mask: Optional[bool] = True,
|
||||
sampling_rate: Optional[int] = None,
|
||||
resample: bool = False,
|
||||
mask_audio: bool = False,
|
||||
**kwargs,
|
||||
) -> BatchFeature:
|
||||
"""
|
||||
Main method to prepare one or several audio(s) for the model.
|
||||
|
||||
Args:
|
||||
raw_speech (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`):
|
||||
The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
|
||||
values, a list of numpy arrays or a list of list of float values.
|
||||
return_tensors (`str` or [`~utils.TensorType`], *optional*):
|
||||
If set, will return tensors instead of list of python integers. Acceptable values are:
|
||||
- `'pt'`: Return PyTorch `torch.Tensor` objects.
|
||||
- `'np'`: Return Numpy `np.ndarray` objects.
|
||||
return_attention_mask (`bool`, *optional*, default to `True`):
|
||||
Whether to return the attention mask. If left to the default, will return the attention mask according
|
||||
to the specific feature_extractor's default. [What are attention masks?](../glossary#attention-mask)
|
||||
|
||||
<Tip>
|
||||
|
||||
For TvltTransformer models, `attention_mask` should always be passed for batched inference, to avoid
|
||||
subtle bugs.
|
||||
|
||||
</Tip>
|
||||
|
||||
sampling_rate (`int`, *optional*):
|
||||
The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
|
||||
`sampling_rate` at the forward call to prevent silent errors and to allow automatic speech recognition
pipelines. The current model supports sampling rates of 16000 and 44100.
|
||||
resample (`bool`, *optional*, defaults to `False`):
|
||||
If the sampling rate is not matched, resample the input audio to match.
|
||||
mask_audio (`bool`, *optional*, defaults to `False`):
|
||||
Whether or not to mask input audio for MAE task.
|
||||
|
||||
Returns:
|
||||
[`BatchFeature`]: A [`BatchFeature`] with the following fields:
|
||||
|
||||
- **audio_values** -- Audio values to be fed to a model, of shape (batch_size, num_channels, height,
|
||||
width).
|
||||
|
||||
- **audio_mask** -- Audio masks to be fed to a model, of shape (batch_size, num_audio_patches).
|
||||
"""
|
||||
|
||||
if sampling_rate is not None:
|
||||
if sampling_rate != self.sampling_rate:
|
||||
raise ValueError(
|
||||
"This feature extractor is set to support sampling rate"
|
||||
f" of {self.sampling_rate}. Please make sure that the provided `raw_speech` input was sampled"
|
||||
f" with {self.sampling_rate} and not {sampling_rate}."
|
||||
)
|
||||
else:
|
||||
logger.warning(
|
||||
"It is strongly recommended to pass the `sampling_rate` argument to this function. "
|
||||
"Failing to do so can result in silent errors that might be hard to debug."
|
||||
)
|
||||
|
||||
is_batched = bool(
|
||||
isinstance(raw_speech, (list, tuple))
|
||||
and (isinstance(raw_speech[0], np.ndarray) or isinstance(raw_speech[0], (tuple, list)))
|
||||
)
|
||||
if is_batched:
|
||||
raw_speech = [np.asarray([speech], dtype=np.float32).T for speech in raw_speech]
|
||||
elif not is_batched and not isinstance(raw_speech, np.ndarray):
|
||||
raw_speech = np.asarray(raw_speech, dtype=np.float32)
|
||||
elif isinstance(raw_speech, np.ndarray) and raw_speech.dtype is np.dtype(np.float64):
|
||||
raw_speech = raw_speech.astype(np.float32)
|
||||
# always return batch
|
||||
if not is_batched:
|
||||
raw_speech = [np.asarray([raw_speech]).T]
|
||||
|
||||
# Convert audio signals to log mel spectrograms, truncate by time axis
|
||||
audio_features = [
|
||||
self._np_extract_fbank_features(waveform.squeeze()).T[: self.spectrogram_length] for waveform in raw_speech
|
||||
]
|
||||
if isinstance(audio_features[0], List):
|
||||
audio_features = [np.asarray(feature, dtype=np.float32) for feature in audio_features]
|
||||
|
||||
# Create audio attention mask
|
||||
max_patch_len = max(
|
||||
[ceil(feature.shape[0] / self.patch_size[0]) * self.freq_len for feature in audio_features]
|
||||
) # The maximum number of audio patches in a batch
|
||||
if return_attention_mask:
|
||||
audio_mask = [
|
||||
(ceil(feature.shape[0] / self.patch_size[0]) * self.freq_len) * [1]
|
||||
+ (max_patch_len - ceil(feature.shape[0] / self.patch_size[0]) * self.freq_len) * [0]
|
||||
for feature in audio_features
|
||||
]
|
||||
audio_mask = np.array(audio_mask).astype(np.float32)
|
||||
|
||||
# convert into correct format for padding
|
||||
max_time_len = max_patch_len // self.freq_len * self.patch_size[0] # The maximum audio size in a batch
|
||||
padded_audio_features = np.ones([len(audio_features), 1, max_time_len, self.feature_size]).astype(np.float32)
|
||||
padded_audio_features = padded_audio_features * self.padding_value
|
||||
for i in range(len(audio_features)):
|
||||
feature = audio_features[i]
|
||||
padded_audio_features[i, :, : feature.shape[0], :] = feature
|
||||
|
||||
# return as BatchFeature
|
||||
if return_attention_mask:
|
||||
data = {"audio_values": padded_audio_features, "audio_mask": audio_mask}
|
||||
else:
|
||||
data = {"audio_values": padded_audio_features}
|
||||
|
||||
encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
|
||||
return encoded_inputs
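Putting the extractor together, a hedged end-to-end usage sketch (the random clips and their lengths are only illustrative):

```python
import numpy as np

from transformers import TvltFeatureExtractor

feature_extractor = TvltFeatureExtractor()
audio = [
    np.random.randn(44100).astype(np.float32),  # 1.0 s clip
    np.random.randn(22050).astype(np.float32),  # 0.5 s clip
]

inputs = feature_extractor(audio, sampling_rate=44100, return_tensors="np")
print(inputs["audio_values"].shape)  # (2, 1, max_time_len, 128): padded log-mel spectrograms
print(inputs["audio_mask"].shape)    # (2, max_num_audio_patches): 1 for real patches, 0 for padding
```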
src/transformers/models/tvlt/image_processing_tvlt.py (new file, 455 lines)
# coding=utf-8
|
||||
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Image processor class for TVLT."""
|
||||
from typing import Dict, List, Optional, Union
|
||||
|
||||
import numpy as np
|
||||
|
||||
from transformers.utils.generic import TensorType
|
||||
|
||||
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
|
||||
from ...image_transforms import (
|
||||
center_crop,
|
||||
get_resize_output_image_size,
|
||||
normalize,
|
||||
rescale,
|
||||
resize,
|
||||
to_channel_dimension_format,
|
||||
)
|
||||
from ...image_utils import (
|
||||
IMAGENET_STANDARD_MEAN,
|
||||
IMAGENET_STANDARD_STD,
|
||||
ChannelDimension,
|
||||
ImageInput,
|
||||
PILImageResampling,
|
||||
is_valid_image,
|
||||
to_numpy_array,
|
||||
valid_images,
|
||||
)
|
||||
from ...utils import logging
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
def make_batched(videos) -> List[List[ImageInput]]:
|
||||
if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)):
|
||||
return videos
|
||||
|
||||
elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]):
|
||||
videos_dim = np.array(videos[0]).ndim
|
||||
if videos_dim == 3:
|
||||
return [videos]
|
||||
elif videos_dim == 4:
|
||||
return videos
|
||||
|
||||
elif is_valid_image(videos):
|
||||
videos_dim = np.array(videos).ndim
|
||||
if videos_dim == 3:
|
||||
return [[videos]]
|
||||
elif videos_dim == 4:
|
||||
return [videos]
|
||||
elif videos_dim == 5:
|
||||
return videos
|
||||
|
||||
raise ValueError(f"Could not make batched video from {videos}")
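A small sketch of how `make_batched` normalizes its input (the array shapes are chosen only for illustration):

```python
import numpy as np

# a single 3-dimensional image becomes a batch with one single-frame video
image = np.zeros((224, 224, 3), dtype=np.uint8)
assert np.array(make_batched(image)).shape == (1, 1, 224, 224, 3)

# a list of frames (one video) becomes a batch containing that one video
video = [image] * 8
batched = make_batched(video)
assert len(batched) == 1 and len(batched[0]) == 8
```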
|
||||
|
||||
|
||||
class TvltImageProcessor(BaseImageProcessor):
|
||||
r"""
|
||||
Constructs a TVLT image processor.
|
||||
|
||||
This processor can be used to prepare either videos or images for the model by converting images to 1-frame videos.
|
||||
|
||||
Args:
|
||||
do_resize (`bool`, *optional*, defaults to `True`):
|
||||
Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the
|
||||
`do_resize` parameter in the `preprocess` method.
|
||||
size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 224}`):
|
||||
Size of the output image after resizing. The shortest edge of the image will be resized to
|
||||
`size["shortest_edge"]` while maintaining the aspect ratio of the original image. Can be overridden by
|
||||
`size` in the `preprocess` method.
|
||||
patch_size (`List[int]` *optional*, defaults to [16,16]):
|
||||
The patch size of image patch embedding.
|
||||
num_frames (`int` *optional*, defaults to 8):
|
||||
The maximum number of video frames.
|
||||
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
|
||||
Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the
|
||||
`preprocess` method.
|
||||
do_center_crop (`bool`, *optional*, defaults to `True`):
|
||||
Whether to center crop the image to the specified `crop_size`. Can be overridden by the `do_center_crop`
|
||||
parameter in the `preprocess` method.
|
||||
crop_size (`Dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`):
|
||||
Size of the image after applying the center crop. Can be overridden by the `crop_size` parameter in the
|
||||
`preprocess` method.
|
||||
do_rescale (`bool`, *optional*, defaults to `True`):
|
||||
Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale`
|
||||
parameter in the `preprocess` method.
|
||||
rescale_factor (`int` or `float`, *optional*, defaults to 1/255):
|
||||
Defines the scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter
|
||||
in the `preprocess` method.
|
||||
do_normalize (`bool`, *optional*, defaults to `True`):
|
||||
Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
|
||||
method.
|
||||
image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
|
||||
Mean to use if normalizing the image. This is a float or list of floats the length of the number of
|
||||
channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
|
||||
image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
|
||||
Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
|
||||
number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
|
||||
"""
|
||||
|
||||
model_input_names = [
|
||||
"pixel_values",
|
||||
"pixel_mask",
|
||||
"pixel_values_mixed",
|
||||
"pixel_mask_mixed",
|
||||
]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
do_resize: bool = True,
|
||||
size: Dict[str, int] = None,
|
||||
patch_size: List[int] = [16, 16],
|
||||
num_frames: int = 8,
|
||||
resample: PILImageResampling = PILImageResampling.BILINEAR,
|
||||
do_center_crop: bool = True,
|
||||
crop_size: Dict[str, int] = None,
|
||||
do_rescale: bool = True,
|
||||
rescale_factor: Union[int, float] = 1 / 255,
|
||||
do_normalize: bool = True,
|
||||
image_mean: Optional[Union[float, List[float]]] = IMAGENET_STANDARD_MEAN,
|
||||
image_std: Optional[Union[float, List[float]]] = IMAGENET_STANDARD_STD,
|
||||
init_mask_generator=False,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
super().__init__(**kwargs)
|
||||
size = size if size is not None else {"shortest_edge": 224}
|
||||
size = get_size_dict(size, default_to_square=False)
|
||||
crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224}
|
||||
crop_size = get_size_dict(crop_size, param_name="crop_size")
|
||||
|
||||
self.do_resize = do_resize
|
||||
self.size = size
|
||||
self.patch_size = patch_size
|
||||
self.num_frames = num_frames
|
||||
self.do_center_crop = do_center_crop
|
||||
self.crop_size = crop_size
|
||||
self.resample = resample
|
||||
self.do_rescale = do_rescale
|
||||
self.rescale_factor = rescale_factor
|
||||
self.do_normalize = do_normalize
|
||||
self.image_mean = image_mean
|
||||
self.image_std = image_std
|
||||
|
||||
def resize(
|
||||
self,
|
||||
image: np.ndarray,
|
||||
size: Dict[str, int],
|
||||
resample: PILImageResampling = PILImageResampling.BILINEAR,
|
||||
data_format: Optional[Union[str, ChannelDimension]] = None,
|
||||
**kwargs,
|
||||
) -> np.ndarray:
|
||||
"""
|
||||
Resize an image.
|
||||
|
||||
Args:
|
||||
image (`np.ndarray`):
|
||||
Image to resize.
|
||||
size (`Dict[str, int]`):
|
||||
Size of the output image. If `size` is of the form `{"height": h, "width": w}`, the output image will
|
||||
have the size `(h, w)`. If `size` is of the form `{"shortest_edge": s}`, the output image will have its
|
||||
shortest edge of length `s` while keeping the aspect ratio of the original image.
|
||||
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
|
||||
Resampling filter to use when resizing the image.
|
||||
data_format (`str` or `ChannelDimension`, *optional*):
|
||||
The channel dimension format of the image. If not provided, it will be the same as the input image.
|
||||
"""
|
||||
size = get_size_dict(size, default_to_square=False)
|
||||
if "shortest_edge" in size:
|
||||
output_size = get_resize_output_image_size(image, size["shortest_edge"], default_to_square=False)
|
||||
elif "height" in size and "width" in size:
|
||||
output_size = (size["height"], size["width"])
|
||||
else:
|
||||
raise ValueError(f"Size must have 'height' and 'width' or 'shortest_edge' as keys. Got {size.keys()}")
|
||||
return resize(image, size=output_size, resample=resample, data_format=data_format, **kwargs)
|
||||
|
||||
def center_crop(
|
||||
self,
|
||||
image: np.ndarray,
|
||||
size: Dict[str, int],
|
||||
data_format: Optional[Union[str, ChannelDimension]] = None,
|
||||
**kwargs,
|
||||
) -> np.ndarray:
|
||||
"""
|
||||
Center crop an image to `(size["height"], size["width"])`. If the input size is smaller than `size` along any
|
||||
edge, the image is padded with 0's and then center cropped.
|
||||
|
||||
Args:
|
||||
image (`np.ndarray`):
|
||||
Image to center crop.
|
||||
size (`Dict[str, int]`):
|
||||
Size of the output image.
|
||||
data_format (`str` or `ChannelDimension`, *optional*):
|
||||
The channel dimension format of the image. If not provided, it will be the same as the input image.
|
||||
"""
|
||||
size = get_size_dict(size)
|
||||
if "height" not in size or "width" not in size:
|
||||
raise ValueError(f"Size must have 'height' and 'width' as keys. Got {size.keys()}")
|
||||
return center_crop(image, size=(size["height"], size["width"]), data_format=data_format, **kwargs)
|
||||
|
||||
def rescale(
|
||||
self,
|
||||
image: np.ndarray,
|
||||
scale: Union[int, float],
|
||||
data_format: Optional[Union[str, ChannelDimension]] = None,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
Rescale an image by a scale factor. image = image * scale.
|
||||
|
||||
Args:
|
||||
image (`np.ndarray`):
|
||||
Image to rescale.
|
||||
scale (`int` or `float`):
|
||||
Scale to apply to the image.
|
||||
data_format (`str` or `ChannelDimension`, *optional*):
|
||||
The channel dimension format of the image. If not provided, it will be the same as the input image.
|
||||
"""
|
||||
return rescale(image, scale=scale, data_format=data_format, **kwargs)
|
||||
|
||||
# Copied from transformers.models.vit.image_processing_vit.ViTImageProcessor.normalize
|
||||
def normalize(
|
||||
self,
|
||||
image: np.ndarray,
|
||||
mean: Union[float, List[float]],
|
||||
std: Union[float, List[float]],
|
||||
data_format: Optional[Union[str, ChannelDimension]] = None,
|
||||
**kwargs,
|
||||
) -> np.ndarray:
|
||||
"""
|
||||
Normalize an image. image = (image - image_mean) / image_std.
|
||||
|
||||
Args:
|
||||
image (`np.ndarray`):
|
||||
Image to normalize.
|
||||
mean (`float` or `List[float]`):
|
||||
Image mean to use for normalization.
|
||||
std (`float` or `List[float]`):
|
||||
Image standard deviation to use for normalization.
|
||||
data_format (`str` or `ChannelDimension`, *optional*):
|
||||
The channel dimension format for the output image. If unset, the channel dimension format of the input
|
||||
image is used. Can be one of:
|
||||
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
|
||||
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
|
||||
|
||||
Returns:
|
||||
`np.ndarray`: The normalized image.
|
||||
"""
|
||||
return normalize(image, mean=mean, std=std, data_format=data_format, **kwargs)
|
||||
|
||||
def _preprocess_image(
|
||||
self,
|
||||
image: ImageInput,
|
||||
do_resize: bool = None,
|
||||
size: Dict[str, int] = None,
|
||||
resample: PILImageResampling = None,
|
||||
do_center_crop: bool = None,
|
||||
crop_size: Dict[str, int] = None,
|
||||
do_rescale: bool = None,
|
||||
rescale_factor: float = None,
|
||||
do_normalize: bool = None,
|
||||
image_mean: Optional[Union[float, List[float]]] = None,
|
||||
image_std: Optional[Union[float, List[float]]] = None,
|
||||
data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
|
||||
) -> np.ndarray:
|
||||
"""Preprocesses a single image."""
|
||||
if do_resize and (size is None or resample is None):
|
||||
raise ValueError("Size and resample must be specified if do_resize is True.")
|
||||
|
||||
if do_center_crop and crop_size is None:
|
||||
raise ValueError("Crop size must be specified if do_center_crop is True.")
|
||||
|
||||
if do_rescale and rescale_factor is None:
|
||||
raise ValueError("Rescale factor must be specified if do_rescale is True.")
|
||||
|
||||
if do_normalize and (image_mean is None or image_std is None):
|
||||
raise ValueError("Image mean and std must be specified if do_normalize is True.")
|
||||
|
||||
# All transformations expect numpy arrays.
|
||||
image = to_numpy_array(image)
|
||||
|
||||
if do_resize:
|
||||
image = self.resize(image=image, size=size, resample=resample)
|
||||
|
||||
if do_center_crop:
|
||||
image = self.center_crop(image, size=crop_size)
|
||||
|
||||
if do_rescale:
|
||||
image = self.rescale(image=image, scale=rescale_factor)
|
||||
|
||||
if do_normalize:
|
||||
image = self.normalize(image=image, mean=image_mean, std=image_std)
|
||||
image = to_channel_dimension_format(image, data_format)
|
||||
return image
|
||||
|
||||
def preprocess(
|
||||
self,
|
||||
videos: ImageInput,
|
||||
do_resize: bool = None,
|
||||
size: Dict[str, int] = None,
|
||||
patch_size: List[int] = None,
|
||||
num_frames: int = None,
|
||||
resample: PILImageResampling = None,
|
||||
do_center_crop: bool = None,
|
||||
crop_size: Dict[str, int] = None,
|
||||
do_rescale: bool = None,
|
||||
rescale_factor: float = None,
|
||||
do_normalize: bool = None,
|
||||
image_mean: Optional[Union[float, List[float]]] = None,
|
||||
image_std: Optional[Union[float, List[float]]] = None,
|
||||
is_mixed: bool = False,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
data_format: ChannelDimension = ChannelDimension.FIRST,
|
||||
**kwargs,
|
||||
) -> BatchFeature:
|
||||
"""
|
||||
Preprocess a video, an image, or a batch of videos or images.
|
||||
|
||||
Args:
|
||||
videos (`ImageInput`):
|
||||
Images or videos to preprocess.
|
||||
do_resize (`bool`, *optional*, defaults to `self.do_resize`):
|
||||
Whether to resize the image.
|
||||
size (`Dict[str, int]`, *optional*, defaults to `self.size`):
|
||||
Size of the image after applying resize.
|
||||
patch_size (`List[int]` *optional*, defaults to self.patch_size):
|
||||
The patch size of image patch embedding.
|
||||
num_frames (`int` *optional*, defaults to self.num_frames):
|
||||
The maximum number of video frames.
|
||||
resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
|
||||
Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`, Only
|
||||
has an effect if `do_resize` is set to `True`.
|
||||
do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`):
Whether to center crop the image.
crop_size (`Dict[str, int]`, *optional*, defaults to `self.crop_size`):
Size of the image after applying the center crop.
|
||||
do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
|
||||
Whether to rescale the image values between [0 - 1].
|
||||
rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
|
||||
Rescale factor to rescale the image by if `do_rescale` is set to `True`.
|
||||
do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
|
||||
Whether to normalize the image.
|
||||
image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
|
||||
Image mean.
|
||||
image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
|
||||
Image standard deviation.
|
||||
is_mixed (`bool`, *optional*):
|
||||
Whether the input video contains negative samples (used for the video-audio matching task).
|
||||
return_tensors (`str` or `TensorType`, *optional*):
|
||||
The type of tensors to return. Can be one of:
|
||||
- Unset: Return a list of `np.ndarray`.
|
||||
- `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
|
||||
- `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
|
||||
- `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
|
||||
- `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
|
||||
data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
|
||||
The channel dimension format for the output image. Can be one of:
|
||||
- `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
|
||||
- `ChannelDimension.LAST`: image in (height, width, num_channels) format.
|
||||
- Unset: Use the inferred channel dimension format of the input image.
|
||||
|
||||
Returns:
|
||||
[`BatchFeature`]: A [`BatchFeature`] with the following fields:
|
||||
|
||||
- **pixel_values** -- Pixel values to be fed to a model, of shape (batch_size, num_channels, height,
|
||||
width).
|
||||
|
||||
- **pixel_mask** -- Pixel masks to be fed to a model, of shape (batch_size, num_pixel_patches).
|
||||
|
||||
- **pixel_values_mixed** -- Pixel values of both positive and negative samples to be fed to a model, of shape
(batch_size, num_channels, height, width).
- **pixel_mask_mixed** -- Pixel masks of both positive and negative samples to be fed to a model, of shape
(batch_size, num_pixel_patches).
|
||||
"""
|
||||
do_resize = do_resize if do_resize is not None else self.do_resize
|
||||
resample = resample if resample is not None else self.resample
|
||||
do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop
|
||||
do_rescale = do_rescale if do_rescale is not None else self.do_rescale
|
||||
rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
|
||||
do_normalize = do_normalize if do_normalize is not None else self.do_normalize
|
||||
image_mean = image_mean if image_mean is not None else self.image_mean
|
||||
image_std = image_std if image_std is not None else self.image_std
|
||||
|
||||
size = size if size is not None else self.size
|
||||
size = get_size_dict(size, default_to_square=False)
|
||||
crop_size = crop_size if crop_size is not None else self.crop_size
|
||||
crop_size = get_size_dict(crop_size, param_name="crop_size")
|
||||
patch_size = patch_size if patch_size is not None else self.patch_size
|
||||
num_frames = num_frames if num_frames is not None else self.num_frames
|
||||
|
||||
if not valid_images(videos):
|
||||
raise ValueError(
|
||||
"Invalid image or video type. Must be of type PIL.Image.Image, numpy.ndarray, "
|
||||
"torch.Tensor, tf.Tensor or jax.ndarray."
|
||||
)
|
||||
|
||||
videos = make_batched(videos)
|
||||
|
||||
# Check number of frames is fewer than maximum frames
|
||||
for video in videos:
|
||||
if len(video) > self.num_frames:
|
||||
raise ValueError(
f"number of frames must not be greater than the maximum number of frames of the model ({self.num_frames})."
)
|
||||
|
||||
max_num_frames = max([len(video) for video in videos])
|
||||
num_patches_per_image = (size["shortest_edge"] // patch_size[0]) ** 2
|
||||
video_masks = np.array(
|
||||
[
|
||||
len(video) * num_patches_per_image * [1] + (max_num_frames - len(video)) * num_patches_per_image * [0]
|
||||
for video in videos
|
||||
]
|
||||
)
|
||||
|
||||
videos = [
|
||||
[
|
||||
self._preprocess_image(
|
||||
image=img,
|
||||
do_resize=do_resize,
|
||||
size=size,
|
||||
resample=resample,
|
||||
do_center_crop=do_center_crop,
|
||||
crop_size=crop_size,
|
||||
do_rescale=do_rescale,
|
||||
rescale_factor=rescale_factor,
|
||||
do_normalize=do_normalize,
|
||||
image_mean=image_mean,
|
||||
image_std=image_std,
|
||||
data_format=data_format,
|
||||
)
|
||||
for img in video
|
||||
]
|
||||
for video in videos
|
||||
]
|
||||
|
||||
# If videos contain both positive/negative, use mixed key for video-audio matching task
|
||||
if is_mixed:
|
||||
data = {"pixel_values_mixed": videos, "pixel_mask_mixed": video_masks}
|
||||
else:
|
||||
data = {"pixel_values": videos, "pixel_mask": video_masks}
|
||||
|
||||
return BatchFeature(data=data, tensor_type=return_tensors)
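A hedged usage sketch of the image processor above (random frames, default size and crop settings):

```python
import numpy as np

from transformers import TvltImageProcessor

image_processor = TvltImageProcessor()
# one video of 8 RGB frames in (num_channels, height, width) layout
video = [np.random.randint(0, 256, (3, 240, 320), dtype=np.uint8) for _ in range(8)]

inputs = image_processor(video, return_tensors="np")
print(inputs["pixel_values"].shape)  # (1, 8, 3, 224, 224) after resizing and center cropping
print(inputs["pixel_mask"].shape)    # (1, 8 * (224 // 16) ** 2) == (1, 1568)
```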
src/transformers/models/tvlt/modeling_tvlt.py (new file, 1314 lines; diff not shown because it is too large)
src/transformers/models/tvlt/processing_tvlt.py (new file, 88 lines)
|
||||
# coding=utf-8
|
||||
# Copyright 2023 The HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
Processor class for TVLT.
|
||||
"""
|
||||
|
||||
from ...processing_utils import ProcessorMixin
|
||||
|
||||
|
||||
class TvltProcessor(ProcessorMixin):
|
||||
r"""
|
||||
Constructs a TVLT processor which wraps a TVLT image processor and TVLT feature extractor into a single processor.
|
||||
|
||||
[`TvltProcessor`] offers all the functionalities of [`TvltImageProcessor`] and [`TvltFeatureExtractor`]. See the
|
||||
docstring of [`~TvltProcessor.__call__`] for more information.
|
||||
|
||||
Args:
|
||||
image_processor (`TvltImageProcessor`):
|
||||
An instance of [`TvltImageProcessor`]. The image processor is a required input.
|
||||
feature_extractor (`TvltFeatureExtractor`):
|
||||
An instance of [`TvltFeatureExtractor`]. The feature extractor is a required input.
|
||||
"""
|
||||
attributes = ["image_processor", "feature_extractor"]
|
||||
image_processor_class = "TvltImageProcessor"
|
||||
feature_extractor_class = "TvltFeatureExtractor"
|
||||
|
||||
def __init__(self, image_processor, feature_extractor):
|
||||
super().__init__(image_processor=image_processor, feature_extractor=feature_extractor)
|
||||
|
||||
self.image_processor = image_processor
|
||||
self.feature_extractor = feature_extractor
|
||||
|
||||
def __call__(
|
||||
self,
|
||||
images=None,
|
||||
audio=None,
|
||||
images_mixed=None,
|
||||
sampling_rate=None,
|
||||
mask_audio=False,
|
||||
mask_pixel=False,
|
||||
*args,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
Forwards the `images` argument to TvltImageProcessor's [`~TvltImageProcessor.preprocess`] and the `audio`
|
||||
argument to TvltFeatureExtractor's [`~TvltFeatureExtractor.__call__`]. Please refer to the docstring of the
|
||||
above two methods for more information.
|
||||
"""
|
||||
|
||||
if images is None and audio is None:
|
||||
raise ValueError("You need to specify either an `images` or `audio` input to process.")
|
||||
|
||||
images_mixed_dict = None
|
||||
if images is not None:
|
||||
images_dict = self.image_processor(images, mask_pixel=mask_pixel, *args, **kwargs)
|
||||
if images_mixed is not None:
|
||||
images_mixed_dict = self.image_processor(images_mixed, is_mixed=True, *args, **kwargs)
|
||||
if audio is not None:
|
||||
audio_dict = self.feature_extractor(
|
||||
audio, *args, sampling_rate=sampling_rate, mask_audio=mask_audio, **kwargs
|
||||
)
|
||||
|
||||
output_dict = {}
|
||||
if audio is not None:
|
||||
output_dict.update(audio_dict)
|
||||
if images is not None:
|
||||
output_dict.update(images_dict)
|
||||
if images_mixed_dict is not None:
|
||||
output_dict.update(images_mixed_dict)
|
||||
return output_dict
|
||||
|
||||
@property
|
||||
def model_input_names(self):
|
||||
image_processor_input_names = self.image_processor.model_input_names
|
||||
feature_extractor_input_names = self.feature_extractor.model_input_names
|
||||
return list(dict.fromkeys(image_processor_input_names + feature_extractor_input_names))
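Finally, a hedged sketch of the combined processor (it assumes the three TVLT classes added in this PR are exported from `transformers`):

```python
import numpy as np

from transformers import TvltFeatureExtractor, TvltImageProcessor, TvltProcessor

processor = TvltProcessor(TvltImageProcessor(), TvltFeatureExtractor())

video = [np.random.randint(0, 256, (3, 224, 224), dtype=np.uint8) for _ in range(8)]
audio = np.random.randn(44100).astype(np.float32)

inputs = processor(images=video, audio=audio, sampling_rate=44100, return_tensors="np")
print(sorted(inputs.keys()))  # ['audio_mask', 'audio_values', 'pixel_mask', 'pixel_values']
```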
@@ -6068,6 +6068,37 @@ class TrOCRPreTrainedModel(metaclass=DummyObject):
|
||||
requires_backends(self, ["torch"])
|
||||
|
||||
|
||||
TVLT_PRETRAINED_MODEL_ARCHIVE_LIST = None
|
||||
|
||||
|
||||
class TvltForAudioVisualClassification(metaclass=DummyObject):
|
||||
_backends = ["torch"]
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
requires_backends(self, ["torch"])
|
||||
|
||||
|
||||
class TvltForPreTraining(metaclass=DummyObject):
|
||||
_backends = ["torch"]
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
requires_backends(self, ["torch"])
|
||||
|
||||
|
||||
class TvltModel(metaclass=DummyObject):
|
||||
_backends = ["torch"]
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
requires_backends(self, ["torch"])
|
||||
|
||||
|
||||
class TvltPreTrainedModel(metaclass=DummyObject):
|
||||
_backends = ["torch"]
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
requires_backends(self, ["torch"])
|
||||
|
||||
|
||||
UNISPEECH_PRETRAINED_MODEL_ARCHIVE_LIST = None
|
||||
|
||||
|
||||
|
@@ -28,3 +28,10 @@ class SpeechT5FeatureExtractor(metaclass=DummyObject):
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
requires_backends(self, ["speech"])
|
||||
|
||||
|
||||
class TvltFeatureExtractor(metaclass=DummyObject):
|
||||
_backends = ["speech"]
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
requires_backends(self, ["speech"])
|
||||
|
@@ -415,6 +415,13 @@ class Swin2SRImageProcessor(metaclass=DummyObject):
|
||||
requires_backends(self, ["vision"])
|
||||
|
||||
|
||||
class TvltImageProcessor(metaclass=DummyObject):
|
||||
_backends = ["vision"]
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
requires_backends(self, ["vision"])
|
||||
|
||||
|
||||
class VideoMAEFeatureExtractor(metaclass=DummyObject):
|
||||
_backends = ["vision"]
|
||||
|
||||
|
tests/models/tvlt/__init__.py (new empty file)
tests/models/tvlt/test_feature_extraction_tvlt.py (new file, 207 lines)
|
||||
# coding=utf-8
|
||||
# Copyright 2023 HuggingFace Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" Testing suite for the TVLT feature extraction. """
|
||||
|
||||
import itertools
|
||||
import os
|
||||
import random
|
||||
import tempfile
|
||||
import unittest
|
||||
|
||||
import numpy as np
|
||||
|
||||
from transformers import is_datasets_available, is_speech_available
|
||||
from transformers.testing_utils import check_json_file_has_correct_format, require_torch, require_torchaudio
|
||||
from transformers.utils.import_utils import is_torch_available
|
||||
|
||||
from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
if is_datasets_available():
|
||||
from datasets import load_dataset
|
||||
|
||||
if is_speech_available():
|
||||
from transformers import TvltFeatureExtractor
|
||||
|
||||
global_rng = random.Random()
|
||||
|
||||
|
||||
def floats_list(shape, scale=1.0, rng=None, name=None):
|
||||
"""Creates a random float32 tensor"""
|
||||
if rng is None:
|
||||
rng = global_rng
|
||||
|
||||
values = []
|
||||
for batch_idx in range(shape[0]):
|
||||
values.append([])
|
||||
for _ in range(shape[1]):
|
||||
values[-1].append(rng.random() * scale)
|
||||
|
||||
return values
|
||||
|
||||
|
||||
class TvltFeatureExtractionTester(unittest.TestCase):
|
||||
def __init__(
|
||||
self,
|
||||
parent,
|
||||
batch_size=7,
|
||||
min_seq_length=400,
|
||||
max_seq_length=2000,
|
||||
spectrogram_length=2048,
|
||||
feature_size=128,
|
||||
num_audio_channels=1,
|
||||
hop_length=512,
|
||||
chunk_length=30,
|
||||
sampling_rate=44100,
|
||||
):
|
||||
self.parent = parent
|
||||
self.batch_size = batch_size
|
||||
self.min_seq_length = min_seq_length
|
||||
self.max_seq_length = max_seq_length
|
||||
self.seq_length_diff = (self.max_seq_length - self.min_seq_length) // (self.batch_size - 1)
|
||||
self.spectrogram_length = spectrogram_length
|
||||
self.feature_size = feature_size
|
||||
self.num_audio_channels = num_audio_channels
|
||||
self.hop_length = hop_length
|
||||
self.chunk_length = chunk_length
|
||||
self.sampling_rate = sampling_rate
|
||||
|
||||
def prepare_feat_extract_dict(self):
|
||||
return {
|
||||
"spectrogram_length": self.spectrogram_length,
|
||||
"feature_size": self.feature_size,
|
||||
"num_audio_channels": self.num_audio_channels,
|
||||
"hop_length": self.hop_length,
|
||||
"chunk_length": self.chunk_length,
|
||||
"sampling_rate": self.sampling_rate,
|
||||
}
|
||||
|
||||
def prepare_inputs_for_common(self, equal_length=False, numpify=False):
|
||||
def _flatten(list_of_lists):
|
||||
return list(itertools.chain(*list_of_lists))
|
||||
|
||||
if equal_length:
|
||||
speech_inputs = [floats_list((self.max_seq_length, self.feature_size)) for _ in range(self.batch_size)]
|
||||
else:
|
||||
# make sure that inputs increase in size
|
||||
speech_inputs = [
|
||||
floats_list((x, self.feature_size))
|
||||
for x in range(self.min_seq_length, self.max_seq_length, self.seq_length_diff)
|
||||
]
|
||||
if numpify:
|
||||
speech_inputs = [np.asarray(x) for x in speech_inputs]
|
||||
return speech_inputs
|
||||
|
||||
|
||||
@require_torch
|
||||
@require_torchaudio
|
||||
class TvltFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
|
||||
feature_extraction_class = TvltFeatureExtractor if is_speech_available() else None
|
||||
|
||||
def setUp(self):
|
||||
self.feat_extract_tester = TvltFeatureExtractionTester(self)
|
||||
|
||||
def test_feat_extract_properties(self):
|
||||
feature_extractor = self.feature_extraction_class(**self.feat_extract_dict)
|
||||
self.assertTrue(hasattr(feature_extractor, "spectrogram_length"))
|
||||
self.assertTrue(hasattr(feature_extractor, "feature_size"))
|
||||
self.assertTrue(hasattr(feature_extractor, "num_audio_channels"))
|
||||
self.assertTrue(hasattr(feature_extractor, "hop_length"))
|
||||
self.assertTrue(hasattr(feature_extractor, "chunk_length"))
|
||||
self.assertTrue(hasattr(feature_extractor, "sampling_rate"))
|
||||
|
||||
def test_feat_extract_from_and_save_pretrained(self):
|
||||
feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict)
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdirname:
|
||||
saved_file = feat_extract_first.save_pretrained(tmpdirname)[0]
|
||||
check_json_file_has_correct_format(saved_file)
|
||||
feat_extract_second = self.feature_extraction_class.from_pretrained(tmpdirname)
|
||||
|
||||
dict_first = feat_extract_first.to_dict()
|
||||
dict_second = feat_extract_second.to_dict()
|
||||
mel_1 = dict_first.pop("mel_filters")
|
||||
mel_2 = dict_second.pop("mel_filters")
|
||||
self.assertTrue(np.allclose(mel_1, mel_2))
|
||||
self.assertEqual(dict_first, dict_second)
|
||||
|
||||
def test_feat_extract_to_json_file(self):
|
||||
feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict)
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdirname:
|
||||
json_file_path = os.path.join(tmpdirname, "feat_extract.json")
|
||||
feat_extract_first.to_json_file(json_file_path)
|
||||
feat_extract_second = self.feature_extraction_class.from_json_file(json_file_path)
|
||||
|
||||
dict_first = feat_extract_first.to_dict()
|
||||
dict_second = feat_extract_second.to_dict()
|
||||
mel_1 = dict_first.pop("mel_filters")
|
||||
mel_2 = dict_second.pop("mel_filters")
|
||||
self.assertTrue(np.allclose(mel_1, mel_2))
|
||||
self.assertEqual(dict_first, dict_second)
|
||||
|
||||
def test_call(self):
|
||||
# Initialize feature_extractor
|
||||
feature_extractor = self.feature_extraction_class(**self.feat_extract_dict)
|
||||
|
||||
# create a single input of length 8000
|
||||
speech_inputs = [floats_list((1, x))[0] for x in range(8000, 14000, 20000)]
|
||||
np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs]
|
||||
|
||||
# Test not batched input
|
||||
encoded_audios = feature_extractor(np_speech_inputs[0], return_tensors="np", sampling_rate=44100).audio_values
|
||||
|
||||
self.assertTrue(encoded_audios.ndim == 4)
|
||||
self.assertTrue(encoded_audios.shape[-1] == feature_extractor.feature_size)
|
||||
self.assertTrue(encoded_audios.shape[-2] <= feature_extractor.spectrogram_length)
|
||||
self.assertTrue(encoded_audios.shape[-3] == feature_extractor.num_channels)
|
||||
|
||||
# Test batched
|
||||
encoded_audios = feature_extractor(np_speech_inputs, return_tensors="np", sampling_rate=44100).audio_values
|
||||
|
||||
self.assertTrue(encoded_audios.ndim == 4)
|
||||
self.assertTrue(encoded_audios.shape[-1] == feature_extractor.feature_size)
|
||||
self.assertTrue(encoded_audios.shape[-2] <= feature_extractor.spectrogram_length)
|
||||
self.assertTrue(encoded_audios.shape[-3] == feature_extractor.num_channels)
|
||||
|
||||
# Test audio masking
|
||||
encoded_audios = feature_extractor(
|
||||
np_speech_inputs, return_tensors="np", sampling_rate=44100, mask_audio=True
|
||||
).audio_values
|
||||
|
||||
self.assertTrue(encoded_audios.ndim == 4)
|
||||
self.assertTrue(encoded_audios.shape[-1] == feature_extractor.feature_size)
|
||||
self.assertTrue(encoded_audios.shape[-2] <= feature_extractor.spectrogram_length)
|
||||
self.assertTrue(encoded_audios.shape[-3] == feature_extractor.num_channels)
|
||||
|
||||
def _load_datasamples(self, num_samples):
|
||||
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
||||
# automatic decoding with librispeech
|
||||
speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
|
||||
|
||||
return [x["array"] for x in speech_samples]
|
||||
|
||||
def test_integration(self):
|
||||
input_speech = self._load_datasamples(1)
|
||||
feature_extractor = TvltFeatureExtractor()
audio_values = feature_extractor(input_speech, return_tensors="pt").audio_values
|
||||
|
||||
self.assertEqual(audio_values.shape, torch.Size([1, 1, 192, 128]))
|
||||
|
||||
expected_slice = torch.tensor([[-0.3032, -0.2708], [-0.4434, -0.4007]])
|
||||
self.assertTrue(torch.allclose(audio_values[0, 0, :2, :2], expected_slice, atol=1e-4))
|
tests/models/tvlt/test_image_processor_tvlt.py (new file, 253 lines)
|
||||
# coding=utf-8
|
||||
# Copyright 2023 HuggingFace Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" Testing suite for the TVLT image processor. """
|
||||
|
||||
import unittest
|
||||
|
||||
import numpy as np
|
||||
|
||||
from transformers.testing_utils import require_torch, require_vision
|
||||
from transformers.utils import is_torch_available, is_vision_available
|
||||
|
||||
from ...test_image_processing_common import ImageProcessingSavingTestMixin
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
from transformers import TvltImageProcessor
|
||||
|
||||
|
||||
def prepare_video(image_processor_tester, width=10, height=10, numpify=False, torchify=False):
|
||||
"""This function prepares a video as a list of PIL images/NumPy arrays/PyTorch tensors."""
|
||||
|
||||
video = []
|
||||
for i in range(image_processor_tester.num_frames):
|
||||
video.append(np.random.randint(255, size=(image_processor_tester.num_channels, width, height), dtype=np.uint8))
|
||||
|
||||
if not numpify and not torchify:
|
||||
# PIL expects the channel dimension as last dimension
|
||||
video = [Image.fromarray(np.moveaxis(frame, 0, -1)) for frame in video]
|
||||
|
||||
if torchify:
|
||||
video = [torch.from_numpy(frame) for frame in video]
|
||||
|
||||
return video
|
||||
|
||||
|
||||
def prepare_video_inputs(image_processor_tester, equal_resolution=False, numpify=False, torchify=False):
|
||||
"""This function prepares a batch of videos: a list of list of PIL images, or a list of list of numpy arrays if
|
||||
one specifies numpify=True, or a list of list of PyTorch tensors if one specifies torchify=True.
|
||||
One can specify whether the videos are of the same resolution or not.
|
||||
"""
|
||||
|
||||
assert not (numpify and torchify), "You cannot specify both numpy and PyTorch tensors at the same time"
|
||||
|
||||
video_inputs = []
|
||||
for i in range(image_processor_tester.batch_size):
|
||||
        if equal_resolution:
            width = height = image_processor_tester.max_resolution
        else:
            width, height = np.random.choice(
                np.arange(image_processor_tester.min_resolution, image_processor_tester.max_resolution), 2
            )
        video = prepare_video(
            image_processor_tester=image_processor_tester,
            width=width,
            height=height,
            numpify=numpify,
            torchify=torchify,
        )
        video_inputs.append(video)

    return video_inputs


class TvltImageProcessorTester(unittest.TestCase):
    def __init__(
        self,
        parent,
        batch_size=7,
        num_channels=3,
        num_frames=4,
        image_size=18,
        min_resolution=30,
        max_resolution=400,
        do_resize=True,
        size=None,
        do_normalize=True,
        image_mean=[0.5, 0.5, 0.5],
        image_std=[0.5, 0.5, 0.5],
        do_center_crop=True,
        crop_size=None,
    ):
        size = size if size is not None else {"shortest_edge": 18}
        crop_size = crop_size if crop_size is not None else {"height": 18, "width": 18}

        self.parent = parent
        self.batch_size = batch_size
        self.num_channels = num_channels
        self.num_frames = num_frames
        self.image_size = image_size
        self.min_resolution = min_resolution
        self.max_resolution = max_resolution
        self.do_resize = do_resize
        self.size = size
        self.do_normalize = do_normalize
        self.image_mean = image_mean
        self.image_std = image_std
        self.do_center_crop = do_center_crop
        self.crop_size = crop_size

    def prepare_image_processor_dict(self):
        return {
            "image_mean": self.image_mean,
            "image_std": self.image_std,
            "do_normalize": self.do_normalize,
            "do_resize": self.do_resize,
            "size": self.size,
            "do_center_crop": self.do_center_crop,
            "crop_size": self.crop_size,
        }


@require_torch
@require_vision
class TvltImageProcessorTest(ImageProcessingSavingTestMixin, unittest.TestCase):
    image_processing_class = TvltImageProcessor if is_vision_available() else None

    def setUp(self):
        self.image_processor_tester = TvltImageProcessorTester(self)

    @property
    def image_processor_dict(self):
        return self.image_processor_tester.prepare_image_processor_dict()

    def test_image_processor_properties(self):
        image_processor = self.image_processing_class(**self.image_processor_dict)
        self.assertTrue(hasattr(image_processor, "image_mean"))
        self.assertTrue(hasattr(image_processor, "image_std"))
        self.assertTrue(hasattr(image_processor, "do_normalize"))
        self.assertTrue(hasattr(image_processor, "do_resize"))
        self.assertTrue(hasattr(image_processor, "do_center_crop"))
        self.assertTrue(hasattr(image_processor, "size"))

    def test_call_pil(self):
        # Initialize image_processor
        image_processor = self.image_processing_class(**self.image_processor_dict)
        # create random PIL videos
        video_inputs = prepare_video_inputs(self.image_processor_tester, equal_resolution=False)
        for video in video_inputs:
            self.assertIsInstance(video, list)
            self.assertIsInstance(video[0], Image.Image)

        # Test not batched input
        encoded_videos = image_processor(video_inputs[0], return_tensors="pt").pixel_values
        self.assertEqual(
            encoded_videos.shape,
            (
                1,
                self.image_processor_tester.num_frames,
                self.image_processor_tester.num_channels,
                self.image_processor_tester.crop_size["height"],
                self.image_processor_tester.crop_size["width"],
            ),
        )

        # Test batched
        encoded_videos = image_processor(video_inputs, return_tensors="pt").pixel_values
        self.assertEqual(
            encoded_videos.shape,
            (
                self.image_processor_tester.batch_size,
                self.image_processor_tester.num_frames,
                self.image_processor_tester.num_channels,
                self.image_processor_tester.crop_size["height"],
                self.image_processor_tester.crop_size["width"],
            ),
        )

    def test_call_numpy(self):
        # Initialize image_processor
        image_processor = self.image_processing_class(**self.image_processor_dict)
        # create random numpy tensors
        video_inputs = prepare_video_inputs(self.image_processor_tester, equal_resolution=False, numpify=True)
        for video in video_inputs:
            self.assertIsInstance(video, list)
            self.assertIsInstance(video[0], np.ndarray)

        # Test not batched input
        encoded_videos = image_processor(video_inputs[0], return_tensors="pt").pixel_values
        self.assertEqual(
            encoded_videos.shape,
            (
                1,
                self.image_processor_tester.num_frames,
                self.image_processor_tester.num_channels,
                self.image_processor_tester.crop_size["height"],
                self.image_processor_tester.crop_size["width"],
            ),
        )

        # Test batched
        encoded_videos = image_processor(video_inputs, return_tensors="pt").pixel_values
        self.assertEqual(
            encoded_videos.shape,
            (
                self.image_processor_tester.batch_size,
                self.image_processor_tester.num_frames,
                self.image_processor_tester.num_channels,
                self.image_processor_tester.crop_size["height"],
                self.image_processor_tester.crop_size["width"],
            ),
        )

    def test_call_pytorch(self):
        # Initialize image_processor
        image_processor = self.image_processing_class(**self.image_processor_dict)
        # create random PyTorch tensors
        video_inputs = prepare_video_inputs(self.image_processor_tester, equal_resolution=False, torchify=True)
        for video in video_inputs:
            self.assertIsInstance(video, list)
            self.assertIsInstance(video[0], torch.Tensor)

        # Test not batched input
        encoded_videos = image_processor(video_inputs[0], return_tensors="pt").pixel_values
        self.assertEqual(
            encoded_videos.shape,
            (
                1,
                self.image_processor_tester.num_frames,
                self.image_processor_tester.num_channels,
                self.image_processor_tester.crop_size["height"],
                self.image_processor_tester.crop_size["width"],
            ),
        )

        # Test batched
        encoded_videos = image_processor(video_inputs, return_tensors="pt").pixel_values
        self.assertEqual(
            encoded_videos.shape,
            (
                self.image_processor_tester.batch_size,
                self.image_processor_tester.num_frames,
                self.image_processor_tester.num_channels,
                self.image_processor_tester.crop_size["height"],
                self.image_processor_tester.crop_size["width"],
            ),
        )
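For quick reference, the shape contract that the three test_call_* methods above check can be exercised directly with a small standalone sketch. This is an illustrative snippet rather than part of the PR: the frame size, frame count and the default TvltImageProcessor settings below are assumptions, but the call pattern (a video passed as a list of frames, returned as one 5D pixel_values tensor) mirrors the tests.

import numpy as np

from transformers import TvltImageProcessor

image_processor = TvltImageProcessor()  # default resize/crop settings, as in the integration test further down
video = [np.random.randint(0, 256, (240, 320, 3), dtype=np.uint8) for _ in range(8)]  # 8 dummy RGB frames
pixel_values = image_processor(video, return_tensors="pt").pixel_values
print(pixel_values.shape)  # (1, 8, 3, crop_height, crop_width), where the crop size comes from the processor config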
tests/models/tvlt/test_modeling_tvlt.py (new file, 628 lines)
@@ -0,0 +1,628 @@
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Testing suite for the PyTorch TVLT model. """

import copy
import inspect
import unittest

import numpy as np
from huggingface_hub import hf_hub_download

from transformers import (
    TvltConfig,
    is_datasets_available,
    is_speech_available,
    is_torch_available,
    is_vision_available,
)
from transformers.testing_utils import require_torch, require_vision, slow, torch_device
from transformers.utils import cached_property

from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, floats_tensor


if is_torch_available():
    import torch
    import torch.nn as nn

    from transformers import TvltForAudioVisualClassification, TvltForPreTraining, TvltModel
    from transformers.models.tvlt.modeling_tvlt import TVLT_PRETRAINED_MODEL_ARCHIVE_LIST
    from transformers.pytorch_utils import is_torch_greater_or_equal_than_1_10
else:
    is_torch_greater_or_equal_than_1_10 = False


if is_datasets_available():
    from datasets import load_dataset

if is_vision_available():
    from transformers import TvltImageProcessor

if is_speech_available():
    from transformers import TvltFeatureExtractor


class TvltModelTester:
    def __init__(
        self,
        parent,
        batch_size=2,
        image_size=32,
        spectrogram_length=32,
        frequency_length=16,
        image_patch_size=[2, 2],
        audio_patch_size=[2, 2],
        num_image_channels=3,
        num_audio_channels=1,
        num_frames=2,
        hidden_size=128,
        num_hidden_layers=12,
        num_attention_heads=4,
        intermediate_size=128,
        hidden_act="gelu",
        hidden_dropout_prob=0.0,
        attention_probs_dropout_prob=0.0,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        qkv_bias=True,
        use_mean_pooling=True,
        decoder_num_attention_heads=4,
        decoder_hidden_size=64,
        decoder_num_hidden_layers=2,
        decoder_intermediate_size=128,
        image_mask_ratio=0.75,
        audio_mask_ratio=0.15,
        audio_mask_type="frame-level",
        task_matching=True,
        task_mae=True,
        num_labels=1,
        is_training=True,
    ):
        self.parent = parent
        self.batch_size = batch_size
        self.image_size = image_size
        self.spectrogram_length = spectrogram_length
        self.frequency_length = frequency_length
        self.image_patch_size = image_patch_size
        self.audio_patch_size = audio_patch_size
        self.num_image_channels = num_image_channels
        self.num_audio_channels = num_audio_channels
        self.num_frames = num_frames

        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.qkv_bias = qkv_bias
        self.use_mean_pooling = use_mean_pooling

        self.decoder_num_attention_heads = decoder_num_attention_heads
        self.decoder_hidden_size = decoder_hidden_size
        self.decoder_num_hidden_layers = decoder_num_hidden_layers
        self.decoder_intermediate_size = decoder_intermediate_size
        self.image_mask_ratio = image_mask_ratio
        self.audio_mask_ratio = audio_mask_ratio

        self.task_matching = task_matching
        self.task_mae = task_mae
        self.num_labels = num_labels

        self.expected_pixel_seq_len = (self.image_size // self.image_patch_size[0]) ** 2 * self.num_frames
        self.expected_audio_seq_len = (self.spectrogram_length // self.audio_patch_size[0]) * (
            self.frequency_length // self.audio_patch_size[1]
        )
        # we set the expected sequence length (which is used in several tests)
        # this is equal to the number of image/video patches + the number of audio patches + 1
        self.expected_seq_len = self.expected_pixel_seq_len + self.expected_audio_seq_len + 1

        self.image_mae_output_dim = image_patch_size[0] ** 2 * num_image_channels
        self.audio_mae_output_dim = audio_patch_size[0] * audio_patch_size[1] * num_audio_channels
        self.is_training = is_training
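    # Note added for clarity (pure arithmetic on the defaults above, not new behavior):
    # expected_pixel_seq_len = (32 // 2) ** 2 = 256 patches per frame, times 2 frames = 512,
    # expected_audio_seq_len = (32 // 2) * (16 // 2) = 128 audio patches,
    # so expected_seq_len = 512 + 128 + 1 = 641 for the default tester configuration.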
    def prepare_config_and_inputs(self):
        pixel_values = floats_tensor(
            [self.batch_size, self.num_frames, self.num_image_channels, self.image_size, self.image_size]
        )
        audio_values = floats_tensor(
            [self.batch_size, self.num_audio_channels, self.spectrogram_length, self.frequency_length]
        )

        pixel_mask = floats_tensor([self.batch_size, self.expected_pixel_seq_len])
        audio_mask = floats_tensor([self.batch_size, self.expected_audio_seq_len])

        config = self.get_config()

        return (config, pixel_values, audio_values, pixel_mask, audio_mask)

    def prepare_config_and_inputs_for_pretraining(self):
        pixel_values = floats_tensor(
            [self.batch_size, self.num_frames, self.num_image_channels, self.image_size, self.image_size]
        )
        audio_values = floats_tensor(
            [self.batch_size, self.num_audio_channels, self.spectrogram_length, self.frequency_length]
        )

        pixel_mask = floats_tensor([self.batch_size, self.expected_pixel_seq_len])
        audio_mask = floats_tensor([self.batch_size, self.expected_audio_seq_len])

        pixel_values_mixed = floats_tensor(
            [self.batch_size, self.num_frames, self.num_image_channels, self.image_size, self.image_size]
        )
        pixel_mask_mixed = floats_tensor([self.batch_size, self.expected_pixel_seq_len])
        labels = floats_tensor([self.batch_size])
        config = self.get_config()

        return (
            config,
            pixel_values,
            audio_values,
            pixel_mask,
            audio_mask,
            pixel_values_mixed,
            pixel_mask_mixed,
            labels,
        )

    def get_config(self):
        return TvltConfig(
            image_size=self.image_size,
            spectrogram_length=self.spectrogram_length,
            frequency_length=self.frequency_length,
            image_patch_size=self.image_patch_size,
            audio_patch_size=self.audio_patch_size,
            num_image_channels=self.num_image_channels,
            num_audio_channels=self.num_audio_channels,
            num_frames=self.num_frames,
            hidden_size=self.hidden_size,
            num_hidden_layers=self.num_hidden_layers,
            num_attention_heads=self.num_attention_heads,
            intermediate_size=self.intermediate_size,
            hidden_act=self.hidden_act,
            hidden_dropout_prob=self.hidden_dropout_prob,
            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
            initializer_range=self.initializer_range,
            layer_norm_eps=self.layer_norm_eps,
            qkv_bias=self.qkv_bias,
            use_mean_pooling=self.use_mean_pooling,
            decoder_num_attention_heads=self.decoder_num_attention_heads,
            decoder_hidden_size=self.decoder_hidden_size,
            decoder_num_hidden_layers=self.decoder_num_hidden_layers,
            decoder_intermediate_size=self.decoder_intermediate_size,
            image_mask_ratio=self.image_mask_ratio,
            audio_mask_ratio=self.audio_mask_ratio,
            task_matching=self.task_matching,
            task_mae=self.task_mae,
            num_labels=self.num_labels,
        )

    def create_and_check_model(self, config, pixel_values, audio_values, pixel_mask, audio_mask):
        model = TvltModel(config=config)
        model.to(torch_device)
        model.eval()
        result = model(pixel_values, audio_values, pixel_mask=pixel_mask, audio_mask=audio_mask)
        result = model(pixel_values, audio_values)
        self.parent.assertEqual(
            result.last_hidden_state.shape, (self.batch_size, self.expected_seq_len, self.hidden_size)
        )

    def create_and_check_for_audiovisual_classification(
        self, config, pixel_values, audio_values, pixel_mask, audio_mask
    ):
        model = TvltForAudioVisualClassification(config=config)
        model.to(torch_device)
        model.eval()
        result = model(pixel_values, audio_values, pixel_mask=pixel_mask, audio_mask=audio_mask)
        result = model(pixel_values, audio_values)
        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))

    def create_and_check_for_pretraining(
        self,
        config,
        pixel_values,
        audio_values,
        pixel_mask,
        audio_mask,
        pixel_values_mixed,
        pixel_mask_mixed,
        labels,
    ):
        model = TvltForPreTraining(config=config)
        model.to(torch_device)
        model.train()
        result = model(
            pixel_values,
            audio_values,
            pixel_mask,
            audio_mask,
            pixel_values_mixed=pixel_values_mixed,
            pixel_mask_mixed=pixel_mask_mixed,
            labels=labels,
        )
        self.parent.assertEqual(
            result.pixel_logits.shape, (self.batch_size, self.expected_pixel_seq_len, self.image_mae_output_dim)
        )
        self.parent.assertEqual(
            result.audio_logits.shape, (self.batch_size, self.expected_audio_seq_len, self.audio_mae_output_dim)
        )
        self.parent.assertEqual(result.matching_logits.shape, (self.batch_size, self.num_labels))

    def create_and_check_for_pretraining_inference(
        self,
        config,
        pixel_values,
        audio_values,
        pixel_mask,
        audio_mask,
        pixel_values_mixed,
        pixel_mask_mixed,
        labels,
    ):
        model = TvltForPreTraining(config=config)
        model.to(torch_device)
        model.eval()
        result = model(
            pixel_values,
            audio_values,
            pixel_mask,
            audio_mask,
            pixel_values_mixed=pixel_values_mixed,
            pixel_mask_mixed=pixel_mask_mixed,
            labels=labels,
        )
        if result.pixel_logits is not None:
            self.parent.assertEqual(
                result.pixel_logits.shape, (self.batch_size, self.expected_pixel_seq_len, self.image_mae_output_dim)
            )
        if result.audio_logits is not None:
            self.parent.assertEqual(
                result.audio_logits.shape, (self.batch_size, self.expected_audio_seq_len, self.audio_mae_output_dim)
            )
        self.parent.assertEqual(result.matching_logits.shape, (self.batch_size, self.num_labels))

    def prepare_config_and_inputs_for_common(self):
        config_and_inputs = self.prepare_config_and_inputs()
        (config, pixel_values, audio_values, pixel_mask, audio_mask) = config_and_inputs
        inputs_dict = {
            "pixel_values": pixel_values,
            "audio_values": audio_values,
            "pixel_mask": pixel_mask,
            "audio_mask": audio_mask,
        }
        return config, inputs_dict

    def prepare_pixel_values(self):
        return floats_tensor(
            [self.batch_size, self.num_frames, self.num_image_channels, self.image_size, self.image_size]
        )

    def prepare_audio_values(self):
        return floats_tensor(
            [self.batch_size, self.num_audio_channels, self.spectrogram_length, self.frequency_length]
        )


@require_torch
@unittest.skipIf(not is_torch_greater_or_equal_than_1_10, "TVLT is only available in torch v1.10+")
class TvltModelTest(ModelTesterMixin, unittest.TestCase):
    all_model_classes = (
        (TvltModel, TvltForPreTraining, TvltForAudioVisualClassification) if is_torch_available() else ()
    )

    fx_compatible = False
    test_pruning = False
    test_headmasking = False
    test_torchscript = False
    test_resize_embeddings = False
    main_input_name = "pixel_values"

    # TvltForAudioVisualClassification and TvltForPreTraining require special treatment
    def _prepare_for_class(self, inputs_dict, model_class, return_labels=True):
        inputs_dict = copy.deepcopy(inputs_dict)

        if return_labels:
            if model_class.__name__ == "TvltForAudioVisualClassification":
                inputs_dict["labels"] = torch.zeros(
                    (self.model_tester.batch_size,), dtype=torch.long, device=torch_device
                )
            elif model_class.__name__ == "TvltForPreTraining":
                inputs_dict["labels"] = torch.zeros(
                    (self.model_tester.batch_size,), dtype=torch.float, device=torch_device
                )
                inputs_dict["pixel_values_mixed"] = torch.zeros(
                    (
                        self.model_tester.batch_size,
                        self.model_tester.num_frames,
                        self.model_tester.num_image_channels,
                        self.model_tester.image_size,
                        self.model_tester.image_size,
                    ),
                    dtype=torch.float,
                    device=torch_device,
                )
                inputs_dict["pixel_mask_mixed"] = torch.zeros(
                    (self.model_tester.batch_size, self.model_tester.expected_pixel_seq_len),
                    dtype=torch.float,
                    device=torch_device,
                )

        return inputs_dict

    def setUp(self):
        self.model_tester = TvltModelTester(self)
        self.config_tester = ConfigTester(self, config_class=TvltConfig, has_text_modality=False, hidden_size=37)

    def test_config(self):
        self.config_tester.run_common_tests()

    @unittest.skip(reason="TVLT does not use inputs_embeds")
    def test_inputs_embeds(self):
        pass

    def test_model_common_attributes(self):
        config, _ = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            model = model_class(config)
            input_embeddings = model.get_input_embeddings()
            self.assertIsInstance(input_embeddings, (tuple))
            for embedding in input_embeddings:
                self.assertIsInstance(embedding, (nn.Module))
            x = model.get_output_embeddings()
            self.assertTrue(x is None or isinstance(x, nn.Linear))

    def test_forward_signature(self):
        config, _ = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            model = model_class(config)
            signature = inspect.signature(model.forward)
            # signature.parameters is an OrderedDict => so arg_names order is deterministic
            arg_names = [*signature.parameters.keys()]

            expected_arg_names = ["pixel_values", "audio_values"]
            self.assertListEqual(arg_names[:2], expected_arg_names)

    def test_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_model(*config_and_inputs)

    def test_for_audiovisual_classification(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_for_audiovisual_classification(*config_and_inputs)

    def test_for_pretraining(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_pretraining()
        self.model_tester.create_and_check_for_pretraining(*config_and_inputs)
        self.model_tester.create_and_check_for_pretraining_inference(*config_and_inputs)

    @slow
    def test_model_from_pretrained(self):
        for model_name in TVLT_PRETRAINED_MODEL_ARCHIVE_LIST:
            model = TvltModel.from_pretrained(model_name)
            self.assertIsNotNone(model)

    def test_training(self):
        if not self.model_tester.is_training:
            return

        for model_class in self.all_model_classes[1:]:
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
            config.return_dict = True

            model = model_class(config)
            model.to(torch_device)
            model.train()
            inputs = self._prepare_for_class(inputs_dict, model_class)
            for k, v in inputs.items():
                print(k, v.shape)
            loss = model(**inputs).loss
            loss.backward()

    def test_training_gradient_checkpointing(self):
        if not self.model_tester.is_training:
            return

        for model_class in self.all_model_classes[1:]:
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
            config.use_cache = False
            config.return_dict = True

            model = model_class(config)
            model.to(torch_device)
            model.gradient_checkpointing_enable()
            model.train()
            inputs = self._prepare_for_class(inputs_dict, model_class)
            loss = model(**inputs).loss
            loss.backward()

    def test_attention_outputs(self):
        if not self.has_attentions:
            pass

        else:
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
            config.return_dict = True

            for model_class in self.all_model_classes[2:]:
                seq_len = self.model_tester.expected_seq_len

                inputs_dict["output_attentions"] = True
                inputs_dict["output_hidden_states"] = False
                config.return_dict = True
                model = model_class(config)
                model.to(torch_device)
                model.eval()
                with torch.no_grad():
                    outputs = model(**self._prepare_for_class(inputs_dict, model_class))
                attentions = outputs.attentions
                self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)

                # check that output_attentions also work using config
                del inputs_dict["output_attentions"]
                config.output_attentions = True
                model = model_class(config)
                model.to(torch_device)
                model.eval()
                with torch.no_grad():
                    outputs = model(**self._prepare_for_class(inputs_dict, model_class))
                attentions = outputs.attentions
                self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)

                self.assertListEqual(
                    list(attentions[0].shape[-3:]),
                    [self.model_tester.num_attention_heads, seq_len, seq_len],
                )
                out_len = len(outputs)

                # Check attention is always last and order is fine
                inputs_dict["output_attentions"] = True
                inputs_dict["output_hidden_states"] = True
                model = model_class(config)
                model.to(torch_device)
                model.eval()
                with torch.no_grad():
                    outputs = model(**self._prepare_for_class(inputs_dict, model_class))

                self.assertEqual(out_len + 1, len(outputs))

                self_attentions = outputs.attentions

                self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
                self.assertListEqual(
                    list(self_attentions[0].shape[-3:]),
                    [self.model_tester.num_attention_heads, seq_len, seq_len],
                )

    def test_hidden_states_output(self):
        def check_hidden_states_output(inputs_dict, config, model_class):
            model = model_class(config)
            model.to(torch_device)
            model.eval()

            with torch.no_grad():
                outputs = model(**self._prepare_for_class(inputs_dict, model_class))

            hidden_states = outputs.hidden_states
            expected_num_layers = self.model_tester.num_hidden_layers + 1
            self.assertEqual(len(hidden_states), expected_num_layers)

            seq_length = self.model_tester.expected_seq_len

            self.assertListEqual(
                list(hidden_states[0].shape[-2:]),
                [seq_length, self.model_tester.hidden_size],
            )

        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes[2:]:
            inputs_dict["output_hidden_states"] = True
            check_hidden_states_output(inputs_dict, config, model_class)

            # check that output_hidden_states also work using config
            del inputs_dict["output_hidden_states"]
            config.output_hidden_states = True

            check_hidden_states_output(inputs_dict, config, model_class)
# We will verify our results on a video of eating spaghetti
# Frame indices used: [164 168 172 176 181 185 189 193 198 202 206 210 215 219 223 227]
def prepare_video(num_frames=8):
    file = hf_hub_download(
        repo_id="hf-internal-testing/spaghetti-video", filename="eating_spaghetti.npy", repo_type="dataset"
    )
    video = np.load(file)[:num_frames]
    return list(video)


def prepare_audio(num_samples=1):
    ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
    # automatic decoding with librispeech
    speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
    return [x["array"] for x in speech_samples]


@require_torch
@require_vision
class TvltModelIntegrationTest(unittest.TestCase):
    @cached_property
    def default_feature_extractor(self):
        # logits were tested with a different mean and std, so we use the same here
        return (
            TvltImageProcessor() if is_vision_available() else None,
            TvltFeatureExtractor(),
        )

    def test_inference_for_base_model(self):
        model = TvltModel.from_pretrained("ZinengTang/tvlt-base").to(torch_device)

        image_processor, audio_feature_extractor = self.default_feature_extractor
        video = prepare_video()
        audio = prepare_audio()
        video_inputs = image_processor(video, return_tensors="pt").to(torch_device)
        audio_inputs = audio_feature_extractor(audio, return_tensors="pt").to(torch_device)
        inputs = dict()
        inputs.update(video_inputs)
        inputs.update(audio_inputs)

        # forward pass
        with torch.no_grad():
            outputs = model(**inputs)

        # verify the logits
        expected_last_hidden_state_slice = torch.tensor([[-0.0186, -0.0691], [0.0242, -0.0398]])
        self.assertTrue(
            torch.allclose(outputs.last_hidden_state[:, :2, :2], expected_last_hidden_state_slice, atol=1e-4)
        )

    def test_inference_for_pretraining(self):
        model = TvltForPreTraining.from_pretrained("ZinengTang/tvlt-base").to(torch_device)

        image_processor, audio_feature_extractor = self.default_feature_extractor
        video = prepare_video()
        video_mixed = prepare_video()
        audio = prepare_audio()
        video_inputs = image_processor(video, return_tensors="pt", mask_pixel=True).to(torch_device)
        video_mixed_inputs = image_processor(video_mixed, is_mixed=True, return_tensors="pt").to(torch_device)
        audio_inputs = audio_feature_extractor(audio, return_tensors="pt", mask_audio=True).to(torch_device)
        labels = torch.tensor([[0.0]], device=torch_device)
        inputs = dict()
        inputs.update(video_inputs)
        inputs.update(video_mixed_inputs)
        inputs.update(audio_inputs)
        inputs.update({"labels": labels})

        # forward pass
        with torch.no_grad():
            outputs = model(**inputs)

        # verify the logits
        expected_pixel_logits_shape = torch.Size([1, 1568, 768])
        expected_audio_logits_shape = torch.Size([1, 96, 256])
        expected_matching_logits_shape = torch.Size([1, 1])

        if outputs.pixel_logits is not None:
            self.assertEqual(outputs.pixel_logits.shape, expected_pixel_logits_shape)
        if outputs.audio_logits is not None:
            self.assertEqual(outputs.audio_logits.shape, expected_audio_logits_shape)
        self.assertEqual(outputs.matching_logits.shape, expected_matching_logits_shape)
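The expected shapes hard-coded in test_inference_for_pretraining above can be reproduced with a little patch arithmetic. The sketch below is illustrative and assumes the ZinengTang/tvlt-base checkpoint keeps the TvltConfig defaults (224x224 frames, 16x16 image patches with 3 channels, 16x16 audio patches with a single channel); the audio patch count of 96 is not derived here because it depends on the spectrogram length the feature extractor produces for this particular clip.

num_frames = 8  # prepare_video(num_frames=8) above
pixel_patches = num_frames * (224 // 16) ** 2  # 8 * 196 = 1568 video patches
image_mae_output_dim = 16 * 16 * 3  # 768 pixel values reconstructed per video patch
audio_mae_output_dim = 16 * 16 * 1  # 256 spectrogram values reconstructed per audio patch
assert (pixel_patches, image_mae_output_dim, audio_mae_output_dim) == (1568, 768, 256)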
tests/models/tvlt/test_processor_tvlt.py (new file, 116 lines)
@@ -0,0 +1,116 @@
# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import shutil
import tempfile
import unittest

import numpy as np
import pytest

from transformers import is_speech_available, is_vision_available
from transformers.testing_utils import require_torch


if is_vision_available():
    from transformers import TvltImageProcessor

if is_speech_available():
    from transformers import TvltFeatureExtractor

from transformers import TvltProcessor


@require_torch
class TvltProcessorTest(unittest.TestCase):
    def setUp(self):
        self.checkpoint = "ZinengTang/tvlt-base"
        self.tmpdirname = tempfile.mkdtemp()

    def get_image_processor(self, **kwargs):
        return TvltImageProcessor.from_pretrained(self.checkpoint, **kwargs)

    def get_feature_extractor(self, **kwargs):
        return TvltFeatureExtractor.from_pretrained(self.checkpoint, **kwargs)

    def tearDown(self):
        shutil.rmtree(self.tmpdirname)

    def test_save_load_pretrained_default(self):
        image_processor = self.get_image_processor()
        feature_extractor = self.get_feature_extractor()

        processor = TvltProcessor(image_processor=image_processor, feature_extractor=feature_extractor)
        processor.save_pretrained(self.tmpdirname)
        processor = TvltProcessor.from_pretrained(self.tmpdirname)

        self.assertIsInstance(processor.feature_extractor, TvltFeatureExtractor)
        self.assertIsInstance(processor.image_processor, TvltImageProcessor)

    def test_feature_extractor(self):
        image_processor = self.get_image_processor()
        feature_extractor = self.get_feature_extractor()

        processor = TvltProcessor(image_processor=image_processor, feature_extractor=feature_extractor)

        audio = np.ones([12000])

        audio_dict = feature_extractor(audio, return_tensors="np")
        input_processor = processor(audio=audio, return_tensors="np")

        for key in audio_dict.keys():
            self.assertAlmostEqual(audio_dict[key].sum(), input_processor[key].sum(), delta=1e-2)

    def test_image_processor(self):
        image_processor = self.get_image_processor()
        feature_extractor = self.get_feature_extractor()

        processor = TvltProcessor(image_processor=image_processor, feature_extractor=feature_extractor)

        images = np.ones([3, 224, 224])

        image_dict = image_processor(images, return_tensors="np")
        input_processor = processor(images=images, return_tensors="np")

        for key in image_dict.keys():
            self.assertAlmostEqual(image_dict[key].sum(), input_processor[key].sum(), delta=1e-2)

    def test_processor(self):
        image_processor = self.get_image_processor()
        feature_extractor = self.get_feature_extractor()

        processor = TvltProcessor(image_processor=image_processor, feature_extractor=feature_extractor)

        audio = np.ones([12000])
        images = np.ones([3, 224, 224])

        inputs = processor(audio=audio, images=images)

        self.assertListEqual(list(inputs.keys()), ["audio_values", "audio_mask", "pixel_values", "pixel_mask"])

        # test if it raises when no input is passed
        with pytest.raises(ValueError):
            processor()

    def test_model_input_names(self):
        image_processor = self.get_image_processor()
        feature_extractor = self.get_feature_extractor()

        processor = TvltProcessor(image_processor=image_processor, feature_extractor=feature_extractor)

        self.assertListEqual(
            processor.model_input_names,
            image_processor.model_input_names + feature_extractor.model_input_names,
            msg="`processor` and `image_processor`+`feature_extractor` model input names do not match",
        )
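Taken together, the processor and model tests above imply the following end-to-end usage. This is a minimal sketch rather than code from the PR: it assumes TvltProcessor forwards return_tensors="pt" to both of its sub-processors (the tests only exercise return_tensors="np" and the default), and it reuses the all-ones dummy inputs from test_processor, where a real application would pass a list of video frames and a recorded waveform.

import numpy as np
import torch

from transformers import TvltModel, TvltProcessor

processor = TvltProcessor.from_pretrained("ZinengTang/tvlt-base")
model = TvltModel.from_pretrained("ZinengTang/tvlt-base")

audio = np.ones([12000])  # dummy waveform, as in test_processor
images = np.ones([3, 224, 224])  # dummy frame, as in test_processor
inputs = processor(audio=audio, images=images, return_tensors="pt")
# inputs holds audio_values, audio_mask, pixel_values and pixel_mask (see test_processor above)

with torch.no_grad():
    outputs = model(**inputs)
print(outputs.last_hidden_state.shape)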
utils/check_repo.py (1 line changed; mode changed from Executable file to Normal file)
@@ -287,6 +287,7 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [
     "AltCLIPTextModel",
     "AltCLIPVisionModel",
     "AltRobertaModel",
+    "TvltForAudioVisualClassification",
     "SpeechT5ForSpeechToSpeech",
     "SpeechT5ForTextToSpeech",
     "SpeechT5HifiGan",