* Update image_processing_tvlt.py

* Update modeling_tvlt.py

* Update

* Update modeling_tvlt.py

* Create tvlt.mdx

* Update configuration_tvlt.py

* Update modeling_tvlt.py

* Update test_modeling_tvlt.py

* Update modeling_tvlt.py

* Update modeling_tvlt.py

* Update image_processing_tvlt.py

* Update feature_extraction_tvlt.py

* Update tvlt models

* Update tests

* Update

* Update

* Update tests

* Update README_ko.md

* Update README_ja.md

* Update README_ko.md

* Update README_zh-hans.md

* Update docs/source/en/model_doc/tvlt.mdx

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Update docs/source/en/model_doc/tvlt.mdx

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Update src/transformers/models/tvlt/configuration_tvlt.py

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Update src/transformers/models/tvlt/configuration_tvlt.py

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Update src/transformers/models/tvlt/configuration_tvlt.py

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Update src/transformers/models/tvlt/configuration_tvlt.py

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Update tvlt.mdx

* Update modeling_tvlt.py

* Update configuration_tvlt.py

* Update modeling_tvlt.py

* Update modeling_tvlt.py

* Update modeling_tvlt.py

* Update modeling_tvlt.py

* Add files via upload

* Update model

* Update modeling_tvlt.py

* Update tvlt models

* Update src/transformers/models/tvlt/__init__.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/__init__.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/image_processing_tvlt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/image_processing_tvlt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Add files via upload

* Add files via upload

* Delete modeling_tvlt.py

* Delete feature_extraction_tvlt.py

* Delete configuration_tvlt.py

* Delete image_processing_tvlt.py

* Delete processing_tvlt.py

* Update tvlt

* Update src/transformers/models/tvlt/configuration_tvlt.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/models/tvlt/image_processing_tvlt.py

Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>

* Update src/transformers/models/tvlt/configuration_tvlt.py

Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>

* Update tests/models/tvlt/test_modeling_tvlt.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update tests/models/tvlt/test_modeling_tvlt.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/models/tvlt/modeling_tvlt.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/models/tvlt/modeling_tvlt.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update README.md

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update README_es.md

* Update README_hd.md

* Update README_ja.md

* Update README_ko.md

* Update README_zh-hans.md

* Update README_zh-hant.md

* Update index.mdx

* Update tvlt.mdx

* Update tvlt.mdx

* Update configuration_tvlt.py

* Update src/transformers/models/tvlt/image_processing_tvlt.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/models/tvlt/image_processing_tvlt.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/models/tvlt/image_processing_tvlt.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/models/tvlt/image_processing_tvlt.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/models/tvlt/modeling_tvlt.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update modeling_tvlt.py

* Add files via upload

* Update tvlt.mdx

* Update modeling_auto.py

* Add files via upload

* Add files via upload

* Update dummy_pt_objects.py

* Update __init__.py

* Update feature_extraction_tvlt.py

* Update feature_extraction_tvlt.py

* Update image_processing_tvlt.py

* Update modeling_auto.py

* Update test_feature_extraction_tvlt.py

* Update test_processor_tvlt.py

* Update test_feature_extraction_tvlt.py

* Add files via upload

* Update test_image_processor_tvlt.py

* Update tests/models/tvlt/test_processor_tvlt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/modeling_tvlt.py

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Update src/transformers/models/tvlt/processing_tvlt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/modeling_tvlt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/modeling_tvlt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update tests/models/tvlt/test_image_processor_tvlt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/modeling_tvlt.py

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Update tests/models/tvlt/test_image_processor_tvlt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/modeling_tvlt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update tests/models/tvlt/test_image_processor_tvlt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update tests/models/tvlt/test_image_processor_tvlt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update tests/models/tvlt/test_modeling_tvlt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/modeling_tvlt.py

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Update src/transformers/models/tvlt/modeling_tvlt.py

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Update src/transformers/models/tvlt/configuration_tvlt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update tests/models/tvlt/test_feature_extraction_tvlt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/processing_tvlt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update docs/source/en/model_doc/tvlt.mdx

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/configuration_tvlt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/configuration_tvlt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/configuration_tvlt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/configuration_tvlt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/configuration_tvlt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/configuration_tvlt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/configuration_tvlt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/configuration_tvlt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/configuration_tvlt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/configuration_tvlt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/feature_extraction_tvlt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/configuration_tvlt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/feature_extraction_tvlt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/feature_extraction_tvlt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/feature_extraction_tvlt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update feature_extraction_tvlt.py

* Update feature_extraction_tvlt.py

* Update src/transformers/models/tvlt/image_processing_tvlt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/image_processing_tvlt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update image_processing_tvlt.py

* Update src/transformers/models/tvlt/modeling_tvlt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update modeling_tvlt.py

* Update modeling_tvlt.py

* Update modeling_tvlt.py

* Update test_image_processor_tvlt.py

* Update tests/models/tvlt/test_modeling_tvlt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update tests/models/tvlt/test_modeling_tvlt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update tests/models/tvlt/test_modeling_tvlt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update tests/models/tvlt/test_modeling_tvlt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update tests/models/tvlt/test_modeling_tvlt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update tests/models/tvlt/test_modeling_tvlt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update tests/models/tvlt/test_modeling_tvlt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update tests/models/tvlt/test_modeling_tvlt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update tests/models/tvlt/test_modeling_tvlt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Add files via upload

* Add files via upload

* Update modeling_tvlt.py

* Update modeling_tvlt.py

* Update modeling_tvlt.py

* Add files via upload

* Update docs/source/en/model_doc/tvlt.mdx

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update image_processing_tvlt.py

* Add files via upload

* Add files via upload

* Update tvlt.mdx

* Update docs/source/en/model_doc/tvlt.mdx

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Update docs/source/en/model_doc/tvlt.mdx

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Update src/transformers/models/tvlt/image_processing_tvlt.py

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Update src/transformers/models/tvlt/image_processing_tvlt.py

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Update docs/source/en/model_doc/tvlt.mdx

Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com>

* Update docs/source/en/model_doc/tvlt.mdx

Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com>

* Update src/transformers/models/tvlt/configuration_tvlt.py

Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com>

* Add files via upload

* Add files via upload

* Add files via upload

* Add files via upload

* Update modeling_auto.py

* Update tvlt.mdx

* Update dummy_pt_objects.py

* Update feature_extraction_tvlt.py

* Update modeling_tvlt.py

* Update test_feature_extraction_tvlt.py

* Update test_image_processor_tvlt.py

* Update test_feature_extraction_tvlt.py

* Update modeling_tvlt.py

* Update dummy_pt_objects.py

* Update dummy_speech_objects.py

* Add files via upload

* Update README_hd.md

* Update modeling_tvlt.py

* Update modeling_tvlt.py

* Update modeling_tvlt.py

* Update modeling_tvlt.py

* Update modeling_tvlt.py

* Update modeling_tvlt.py

* Update test_modeling_tvlt.py

* Update src/transformers/models/tvlt/configuration_tvlt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/feature_extraction_tvlt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/image_processing_tvlt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/image_processing_tvlt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/modeling_tvlt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/image_processing_tvlt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/modeling_tvlt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update MAE processing

* Update modeling_tvlt.py

* Update modeling_tvlt.py

* Update modeling

* Update style

* Update src/transformers/models/tvlt/modeling_tvlt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/modeling_tvlt.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update check_repo.py

* Update tvlt.mdx

* Update __init__.py

* Update tests

* Update tvlt models

* Update configuration_tvlt.py

* Update configuration_tvlt.py

* Update image_processing_tvlt.py

* Update dummy_pt_objects.py

* Add files via upload

* Update test_modeling_tvlt.py

* Update test_feature_extraction_tvlt.py

* Update test_feature_extraction_tvlt.py

* Update test_feature_extraction_tvlt.py

* Update test_feature_extraction_tvlt.py

* Update test_feature_extraction_tvlt.py

* Update test_feature_extraction_tvlt.py

---------

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com>
Author: Zineng Tang
Date: 2023-02-15 13:10:30 -05:00 (committed by GitHub)
Parent: 7bac51837b
Commit: a0e69a9375
29 changed files with 3848 additions and 1 deletion


@ -421,6 +421,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h
1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine
1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
1. **[TVLT](https://huggingface.co/docs/transformers/main/model_doc/tvlt)** (from UNC Chapel Hill) released with the paper [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) by Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal.
1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler
1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu.


@ -414,6 +414,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt
1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine
1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
1. **[TVLT](https://huggingface.co/docs/transformers/main/model_doc/tvlt)** (from UNC Chapel Hill) released with the paper [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) by Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal.
1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler
1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu.


@ -386,6 +386,7 @@ conda install -c huggingface transformers
1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine
1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (Google/CMU की ओर से) कागज के साथ [संस्करण-एक्स: एक ब्लॉग मॉडल चौकस चौक मॉडल मॉडल] (https://arxivorg/abs/1901.02860) क्वोकोक वी. ले, रुस्लैन सलाखुतदी
1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft) released with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
1. **[TVLT](https://huggingface.co/docs/transformers/main/model_doc/tvlt)** (from UNC Chapel Hill) released with the paper [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) by Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal.
1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler
1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (माइक्रोसॉफ्ट रिसर्च से) साथ में दिया गया पेपर [UniSpeech: यूनिफाइड स्पीच रिप्रेजेंटेशन लर्निंग विद लेबलेड एंड अनलेबल्ड डेटा](https:/ /arxiv.org/abs/2101.07597) चेंगई वांग, यू वू, याओ कियान, केनिची कुमातानी, शुजी लियू, फुरु वेई, माइकल ज़ेंग, ज़ुएदोंग हुआंग द्वारा।
1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (माइक्रोसॉफ्ट रिसर्च से) कागज के साथ [UNISPEECH-SAT: यूनिवर्सल स्पीच रिप्रेजेंटेशन लर्निंग विद स्पीकर अवेयर प्री-ट्रेनिंग ](https://arxiv.org/abs/2110.05752) सानयुआन चेन, यू वू, चेंग्यी वांग, झेंगयांग चेन, झूओ चेन, शुजी लियू, जियान वू, याओ कियान, फुरु वेई, जिन्यु ली, जियांगज़ान यू द्वारा पोस्ट किया गया।


@ -448,6 +448,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ
1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (the University of California at Berkeley から) Michael Janner, Qiyang Li, Sergey Levine から公開された研究論文: [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039)
1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (Google/CMU から) Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov から公開された研究論文: [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860)
1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (Microsoft から), Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei から公開された研究論文: [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282)
1. **[TVLT](https://huggingface.co/docs/transformers/main/model_doc/tvlt)** (from UNC Chapel Hill から), Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal から公開された研究論文: [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156)
1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (Google Research から) Yi Tay, Mostafa Dehghani, Vinh Q から公開された研究論文: [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler
1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (Microsoft Research から) Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang から公開された研究論文: [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597)
1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (Microsoft Research から) Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu から公開された研究論文: [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752)


@ -363,6 +363,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는
1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (the University of California at Berkeley 에서) Michael Janner, Qiyang Li, Sergey Levin 의 [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) 논문과 함께 발표했습니다.
1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (Google/CMU 에서) Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov 의 [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) 논문과 함께 발표했습니다.
1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (Microsoft 에서) Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei 의 [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) 논문과 함께 발표했습니다.
1. **[TVLT](https://huggingface.co/docs/transformers/main/model_doc/tvlt)** (from UNC Chapel Hill 에서) Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal 의 [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) 논문과 함께 발표했습니다.
1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (Google Research 에서) Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzle 의 [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) 논문과 함께 발표했습니다.
1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (Microsoft Research 에서) Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang 의 [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) 논문과 함께 발표했습니다.
1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (Microsoft Research 에서) Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu 의 [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) 논문과 함께 발표했습니다.


@ -387,6 +387,7 @@ conda install -c huggingface transformers
1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine
1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (来自 Google/CMU) 伴随论文 [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) 由 Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov 发布。
1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (来自 Microsoft) 伴随论文 [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) 由 Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei 发布。
1. **[TVLT](https://huggingface.co/docs/transformers/main/model_doc/tvlt)** (来自 UNC Chapel Hill) 伴随论文 [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) 由 Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal 发布。
1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler
1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (来自 Microsoft Research) 伴随论文 [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) 由 Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang 发布。
1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (来自 Microsoft Research) 伴随论文 [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) 由 Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu 发布。


@ -399,6 +399,7 @@ conda install -c huggingface transformers
1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine
1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft) released with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
1. **[TVLT](https://huggingface.co/docs/transformers/main/model_doc/tvlt)** (from UNC Chapel Hill) released with the paper [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) by Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal.
1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler
1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu.


@ -576,6 +576,8 @@
title: TAPAS
- local: model_doc/trocr
title: TrOCR
- local: model_doc/tvlt
title: TVLT
- local: model_doc/vilt
title: ViLT
- local: model_doc/vision-encoder-decoder


@ -200,6 +200,7 @@ The documentation is organized into five sections:
1. **[Trajectory Transformer](model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine
1. **[Transformer-XL](model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
1. **[TrOCR](model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
1. **[TVLT](model_doc/tvlt)** (from UNC Chapel Hill) released with the paper [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) by Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal.
1. **[UL2](model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler
1. **[UniSpeech](model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
1. **[UniSpeechSat](model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu.
@ -382,6 +383,7 @@ Flax), PyTorch, and/or TensorFlow.
| Trajectory Transformer | ❌ | ❌ | ✅ | ❌ | ❌ |
| Transformer-XL | ✅ | ❌ | ✅ | ✅ | ❌ |
| TrOCR | ❌ | ❌ | ✅ | ❌ | ❌ |
| TVLT | ❌ | ❌ | ✅ | ❌ | ❌ |
| UniSpeech | ❌ | ❌ | ✅ | ❌ | ❌ |
| UniSpeechSat | ❌ | ❌ | ✅ | ❌ | ❌ |
| UPerNet | ❌ | ❌ | ✅ | ❌ | ❌ |
@ -410,4 +412,4 @@ Flax), PyTorch, and/or TensorFlow.
| YOLOS | ❌ | ❌ | ✅ | ❌ | ❌ |
| YOSO | ❌ | ❌ | ✅ | ❌ | ❌ |
<!-- End table-->
<!-- End table-->


@ -0,0 +1,73 @@
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->
# TVLT
## Overview
The TVLT model was proposed in [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156)
by Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal (the first three authors contributed equally). The Textless Vision-Language Transformer (TVLT) is a model that uses raw visual and audio inputs for vision-and-language representation learning, without using text-specific modules such as tokenization or automatic speech recognition (ASR). It can perform various audiovisual and vision-language tasks like retrieval, question answering, etc.
The abstract from the paper is the following:
*In this work, we present the Textless Vision-Language Transformer (TVLT), where homogeneous transformer blocks take raw visual and audio inputs for vision-and-language representation learning with minimal modality-specific design, and do not use text-specific modules such as tokenization or automatic speech recognition (ASR). TVLT is trained by reconstructing masked patches of continuous video frames and audio spectrograms (masked autoencoding) and contrastive modeling to align video and audio. TVLT attains performance comparable to its text-based counterpart on various multimodal tasks, such as visual question answering, image retrieval, video retrieval, and multimodal sentiment analysis, with 28x faster inference speed and only 1/3 of the parameters. Our findings suggest the possibility of learning compact and efficient visual-linguistic representations from low-level visual and audio signals without assuming the prior existence of text.*
Tips:
- TVLT is a model that takes both `pixel_values` and `audio_values` as input. One can use [`TvltProcessor`] to prepare data for the model (see the minimal sketch below).
This processor wraps an image processor (for the image/video modality) and an audio feature extractor (for the audio modality) into one.
- TVLT is trained with images/videos and audio of various sizes: the authors resize and crop the input images/videos to 224 and limit the length of the audio spectrogram to 2048. To make batching of videos and audio possible, the authors use a `pixel_mask` that indicates which pixels are real/padding and an `audio_mask` that indicates which audio values are real/padding.
- The design of TVLT is very similar to that of a standard Vision Transformer (ViT) and masked autoencoder (MAE) as in [ViTMAE](vitmae). The difference is that the model includes embedding layers for the audio modality.
- The PyTorch version of this model is only available in torch 1.10 and higher.
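A minimal usage sketch is shown below. The dummy frames and waveform are hypothetical placeholders, and the exact processor keyword arguments (`images`, `audio`, `sampling_rate`) are assumed here; refer to the [`TvltProcessor`] API reference for the definitive signature.
```python
import numpy as np
import torch
from transformers import TvltProcessor, TvltModel

processor = TvltProcessor.from_pretrained("ZinengTang/tvlt-base")
model = TvltModel.from_pretrained("ZinengTang/tvlt-base")

# Hypothetical dummy inputs: 8 RGB frames of 224x224 and ~1 second of mono audio at 44.1 kHz.
video_frames = list(np.random.rand(8, 224, 224, 3).astype(np.float32))
audio_waveform = np.random.rand(44100).astype(np.float32)

# The processor wraps the image processor and the audio feature extractor and returns
# `pixel_values`, `audio_values` and the corresponding padding masks.
inputs = processor(images=video_frames, audio=audio_waveform, sampling_rate=44100, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)
print(outputs.last_hidden_state.shape)
```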
<p align="center">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/tvlt_architecture.png"
alt="drawing" width="600"/>
</p>
<small> TVLT architecture. Taken from the <a href="https://arxiv.org/abs/2209.14156">original paper</a>. </small>
The original code can be found [here](https://github.com/zinengtang/TVLT). This model was contributed by [Zineng Tang](https://huggingface.co/ZinengTang).
## TvltConfig
[[autodoc]] TvltConfig
## TvltProcessor
[[autodoc]] TvltProcessor
- __call__
## TvltImageProcessor
[[autodoc]] TvltImageProcessor
- preprocess
## TvltFeatureExtractor
[[autodoc]] TvltFeatureExtractor
- __call__
## TvltModel
[[autodoc]] TvltModel
- forward
## TvltForPreTraining
[[autodoc]] TvltForPreTraining
- forward
## TvltForAudioVisualClassification
[[autodoc]] TvltForAudioVisualClassification
- forward

src/transformers/__init__.py (Executable file → Normal file)

@ -444,6 +444,11 @@ _import_structure = {
"TrOCRConfig",
"TrOCRProcessor",
],
"models.tvlt": [
"TVLT_PRETRAINED_CONFIG_ARCHIVE_MAP",
"TvltConfig",
"TvltProcessor",
],
"models.unispeech": [
"UNISPEECH_PRETRAINED_CONFIG_ARCHIVE_MAP",
"UniSpeechConfig",
@ -750,6 +755,7 @@ else:
_import_structure["models.mctct"].append("MCTCTFeatureExtractor")
_import_structure["models.speech_to_text"].append("Speech2TextFeatureExtractor")
_import_structure["models.speecht5"].append("SpeechT5FeatureExtractor")
_import_structure["models.tvlt"].append("TvltFeatureExtractor")
# Tensorflow-text-specific objects
try:
@ -826,6 +832,7 @@ else:
_import_structure["models.poolformer"].extend(["PoolFormerFeatureExtractor", "PoolFormerImageProcessor"])
_import_structure["models.segformer"].extend(["SegformerFeatureExtractor", "SegformerImageProcessor"])
_import_structure["models.swin2sr"].append("Swin2SRImageProcessor")
_import_structure["models.tvlt"].append("TvltImageProcessor")
_import_structure["models.videomae"].extend(["VideoMAEFeatureExtractor", "VideoMAEImageProcessor"])
_import_structure["models.vilt"].extend(["ViltFeatureExtractor", "ViltImageProcessor", "ViltProcessor"])
_import_structure["models.vit"].extend(["ViTFeatureExtractor", "ViTImageProcessor"])
@ -2348,6 +2355,15 @@ else:
_import_structure["models.trocr"].extend(
["TROCR_PRETRAINED_MODEL_ARCHIVE_LIST", "TrOCRForCausalLM", "TrOCRPreTrainedModel"]
)
_import_structure["models.tvlt"].extend(
[
"TVLT_PRETRAINED_MODEL_ARCHIVE_LIST",
"TvltForAudioVisualClassification",
"TvltForPreTraining",
"TvltModel",
"TvltPreTrainedModel",
]
)
_import_structure["models.unispeech"].extend(
[
"UNISPEECH_PRETRAINED_MODEL_ARCHIVE_LIST",
@ -3937,6 +3953,7 @@ if TYPE_CHECKING:
TransfoXLTokenizer,
)
from .models.trocr import TROCR_PRETRAINED_CONFIG_ARCHIVE_MAP, TrOCRConfig, TrOCRProcessor
from .models.tvlt import TVLT_PRETRAINED_CONFIG_ARCHIVE_MAP, TvltConfig, TvltProcessor
from .models.unispeech import UNISPEECH_PRETRAINED_CONFIG_ARCHIVE_MAP, UniSpeechConfig
from .models.unispeech_sat import UNISPEECH_SAT_PRETRAINED_CONFIG_ARCHIVE_MAP, UniSpeechSatConfig
from .models.upernet import UperNetConfig
@ -4212,6 +4229,7 @@ if TYPE_CHECKING:
from .models.mctct import MCTCTFeatureExtractor
from .models.speech_to_text import Speech2TextFeatureExtractor
from .models.speecht5 import SpeechT5FeatureExtractor
from .models.tvlt import TvltFeatureExtractor
try:
if not is_tensorflow_text_available():
@ -4269,6 +4287,7 @@ if TYPE_CHECKING:
from .models.poolformer import PoolFormerFeatureExtractor, PoolFormerImageProcessor
from .models.segformer import SegformerFeatureExtractor, SegformerImageProcessor
from .models.swin2sr import Swin2SRImageProcessor
from .models.tvlt import TvltImageProcessor
from .models.videomae import VideoMAEFeatureExtractor, VideoMAEImageProcessor
from .models.vilt import ViltFeatureExtractor, ViltImageProcessor, ViltProcessor
from .models.vit import ViTFeatureExtractor, ViTImageProcessor
@ -5510,6 +5529,13 @@ if TYPE_CHECKING:
load_tf_weights_in_transfo_xl,
)
from .models.trocr import TROCR_PRETRAINED_MODEL_ARCHIVE_LIST, TrOCRForCausalLM, TrOCRPreTrainedModel
from .models.tvlt import (
TVLT_PRETRAINED_MODEL_ARCHIVE_LIST,
TvltForAudioVisualClassification,
TvltForPreTraining,
TvltModel,
TvltPreTrainedModel,
)
from .models.unispeech import (
UNISPEECH_PRETRAINED_MODEL_ARCHIVE_LIST,
UniSpeechForCTC,


@ -169,6 +169,7 @@ from . import (
trajectory_transformer,
transfo_xl,
trocr,
tvlt,
unispeech,
unispeech_sat,
upernet,


@ -169,6 +169,7 @@ CONFIG_MAPPING_NAMES = OrderedDict(
("trajectory_transformer", "TrajectoryTransformerConfig"),
("transfo-xl", "TransfoXLConfig"),
("trocr", "TrOCRConfig"),
("tvlt", "TvltConfig"),
("unispeech", "UniSpeechConfig"),
("unispeech-sat", "UniSpeechSatConfig"),
("upernet", "UperNetConfig"),
@ -329,6 +330,7 @@ CONFIG_ARCHIVE_MAP_MAPPING_NAMES = OrderedDict(
("time_series_transformer", "TIME_SERIES_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("timesformer", "TIMESFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("transfo-xl", "TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("tvlt", "TVLT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("unispeech", "UNISPEECH_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("unispeech-sat", "UNISPEECH_SAT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("van", "VAN_PRETRAINED_CONFIG_ARCHIVE_MAP"),
@ -516,6 +518,7 @@ MODEL_NAMES_MAPPING = OrderedDict(
("trajectory_transformer", "Trajectory Transformer"),
("transfo-xl", "Transformer-XL"),
("trocr", "TrOCR"),
("tvlt", "TVLT"),
("ul2", "UL2"),
("unispeech", "UniSpeech"),
("unispeech-sat", "UniSpeechSat"),


@ -163,6 +163,7 @@ MODEL_MAPPING_NAMES = OrderedDict(
("timesformer", "TimesformerModel"),
("trajectory_transformer", "TrajectoryTransformerModel"),
("transfo-xl", "TransfoXLModel"),
("tvlt", "TvltModel"),
("unispeech", "UniSpeechModel"),
("unispeech-sat", "UniSpeechSatModel"),
("van", "VanModel"),
@ -235,6 +236,7 @@ MODEL_FOR_PRETRAINING_MAPPING_NAMES = OrderedDict(
("t5", "T5ForConditionalGeneration"),
("tapas", "TapasForMaskedLM"),
("transfo-xl", "TransfoXLLMHeadModel"),
("tvlt", "TvltForPreTraining"),
("unispeech", "UniSpeechForPreTraining"),
("unispeech-sat", "UniSpeechSatForPreTraining"),
("videomae", "VideoMAEForPreTraining"),


@ -0,0 +1,101 @@
# flake8: noqa
# There's no way to ignore "F401 '...' imported but unused" warnings in this
# module, but to preserve other warnings. So, don't check this module at all.
# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_speech_available,
is_torch_available,
is_vision_available,
)
_import_structure = {
"configuration_tvlt": ["TVLT_PRETRAINED_CONFIG_ARCHIVE_MAP", "TvltConfig"],
"processing_tvlt": ["TvltProcessor"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_tvlt"] = [
"TVLT_PRETRAINED_MODEL_ARCHIVE_LIST",
"TvltModel",
"TvltForPreTraining",
"TvltForAudioVisualClassification",
"TvltPreTrainedModel",
]
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["image_processing_tvlt"] = ["TvltImageProcessor"]
try:
if not is_speech_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["feature_extraction_tvlt"] = ["TvltFeatureExtractor"]
if TYPE_CHECKING:
from .configuration_tvlt import TVLT_PRETRAINED_CONFIG_ARCHIVE_MAP, TvltConfig
from .processing_tvlt import TvltProcessor
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_tvlt import (
TVLT_PRETRAINED_MODEL_ARCHIVE_LIST,
TvltForAudioVisualClassification,
TvltForPreTraining,
TvltModel,
TvltPreTrainedModel,
)
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .image_processing_tvlt import TvltImageProcessor
try:
if not is_speech_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .feature_extraction_tvlt import TvltFeatureExtractor
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
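As a rough illustration of how these optional-dependency guards surface at the top level of the library (a sketch relying only on the availability helpers imported above):
```python
# Config and processor classes carry no optional-dependency guard and are always importable;
# the heavier pieces only resolve when their dependency is installed.
from transformers import TvltConfig, TvltProcessor
from transformers.utils import is_speech_available, is_torch_available, is_vision_available

if is_torch_available():
    from transformers import TvltModel, TvltForPreTraining  # guarded by is_torch_available()
if is_vision_available():
    from transformers import TvltImageProcessor  # guarded by is_vision_available()
if is_speech_available():
    from transformers import TvltFeatureExtractor  # guarded by is_speech_available()
```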


@ -0,0 +1,187 @@
# coding=utf-8
# Copyright 2023 MURGe-Lab and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" TVLT model configuration"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
TVLT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"ZinengTang/tvlt-base": "https://huggingface.co/ZinengTang/tvlt-base/blob/main/config.json",
}
class TvltConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`TvltModel`]. It is used to instantiate a TVLT
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the TVLT
[ZinengTang/tvlt-base](https://huggingface.co/ZinengTang/tvlt-base) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
image_size (`int`, *optional*, defaults to 224):
The size (resolution) of each image.
spectrogram_length (`int`, *optional*, defaults to 2048):
The time length of each audio spectrogram.
frequency_length (`int`, *optional*, defaults to 128):
The frequency length of audio spectrogram.
image_patch_size (`List[int]`, *optional*, defaults to `[16, 16]`):
The size (resolution) of each image patch.
audio_patch_size (`List[int]`, *optional*, defaults to `[16, 16]`):
The size (resolution) of each audio patch.
num_image_channels (`int`, *optional*, defaults to 3):
The number of input image channels.
num_audio_channels (`int`, *optional*, defaults to 1):
The number of input audio channels.
num_frames (`int`, *optional*, defaults to 8):
The maximum number of frames for an input video.
hidden_size (`int`, *optional*, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (`int`, *optional*, defaults to 3072):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` are supported.
hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (`float`, *optional*, defaults to 1e-6):
The epsilon used by the layer normalization layers.
qkv_bias (`bool`, *optional*, defaults to `True`):
Whether to add a bias to the queries, keys and values.
use_mean_pooling (`bool`, *optional*, defaults to `False`):
Whether to mean pool the final hidden states instead of using the final hidden state of the [CLS] token.
decoder_num_attention_heads (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the decoder.
decoder_hidden_size (`int`, *optional*, defaults to 512):
Dimensionality of the decoder.
decoder_num_hidden_layers (`int`, *optional*, defaults to 8):
Number of hidden layers in the decoder.
decoder_intermediate_size (`int`, *optional*, defaults to 2048):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the decoder.
pixel_mask_ratio (`float`, *optional*, defaults to 0.75):
Image patch masking ratio.
audio_mask_ratio (`float`, *optional*, defaults to 0.15):
Audio patch masking ratio.
audio_mask_type (`str`, *optional*, defaults to `"frame-level"`):
Audio patch masking type, choose between "frame-level" and "patch-level".
task_matching (`bool`, *optional*, defaults to `True`):
Whether to use vision audio matching task in pretraining.
task_mae (`bool`, *optional*, defaults to `True`):
Whether to use the masked auto-encoder (MAE) in pretraining.
loss_type (`str`, *optional*, defaults to `"classification"`):
Type of loss to use; `"regression"` and `"classification"` are supported.
Example:
```python
>>> from transformers import TvltConfig, TvltModel
>>> # Initializing a TVLT ZinengTang/tvlt-base style configuration
>>> configuration = TvltConfig()
>>> # Initializing a model (with random weights) from the ZinengTang/tvlt-base style configuration
>>> model = TvltModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "tvlt"
def __init__(
self,
image_size=224,
spectrogram_length=2048,
frequency_length=128,
image_patch_size=[16, 16],
audio_patch_size=[16, 16],
num_image_channels=3,
num_audio_channels=1,
num_frames=8,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.0,
attention_probs_dropout_prob=0.0,
initializer_range=0.02,
layer_norm_eps=1e-6,
qkv_bias=True,
use_mean_pooling=False,
decoder_num_attention_heads=16,
decoder_hidden_size=512,
decoder_num_hidden_layers=8,
decoder_intermediate_size=2048,
pixel_mask_ratio=0.75,
audio_mask_ratio=0.15,
audio_mask_type="frame-level",
task_matching=True,
task_mae=True,
loss_type="classification",
**kwargs,
):
super().__init__(**kwargs)
if audio_mask_type not in ("frame-level", "patch-level"):
raise ValueError(
"audio_mask_type must be one of two acceptable strategies - {'frame-level', 'patch-level'}, "
f"got {audio_mask_type}"
)
self.image_size = image_size
self.spectrogram_length = spectrogram_length
self.frequency_length = frequency_length
self.image_patch_size = image_patch_size
self.audio_patch_size = audio_patch_size
self.num_image_channels = num_image_channels
self.num_audio_channels = num_audio_channels
self.num_frames = num_frames
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.qkv_bias = qkv_bias
self.use_mean_pooling = use_mean_pooling
self.decoder_num_attention_heads = decoder_num_attention_heads
self.decoder_hidden_size = decoder_hidden_size
self.decoder_num_hidden_layers = decoder_num_hidden_layers
self.decoder_intermediate_size = decoder_intermediate_size
self.pixel_mask_ratio = pixel_mask_ratio
self.audio_mask_ratio = audio_mask_ratio
self.audio_mask_type = audio_mask_type
self.task_matching = task_matching
self.task_mae = task_mae
self.loss_type = loss_type
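A short sketch of how the `audio_mask_type` validation above behaves (the invalid value is hypothetical, chosen only for illustration):
```python
from transformers import TvltConfig

config = TvltConfig()  # default audio_mask_type is "frame-level"
print(config.audio_mask_type)

# Values outside the documented strategies fail fast with a ValueError at construction time.
try:
    TvltConfig(audio_mask_type="segment-level")
except ValueError as err:
    print(err)
```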


@ -0,0 +1,336 @@
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Feature extractor class for TVLT."""
from math import ceil
from typing import List, Optional, Union
import numpy as np
from numpy.fft import fft
from transformers.feature_extraction_sequence_utils import BatchFeature, SequenceFeatureExtractor
from transformers.utils import TensorType, logging
logger = logging.get_logger(__name__)
class TvltFeatureExtractor(SequenceFeatureExtractor):
r"""
Constructs a TVLT audio feature extractor. This feature extractor can be used to prepare audios for the model.
This feature extractor inherits from [`FeatureExtractionMixin`] which contains most of the main methods. Users
should refer to this superclass for more information regarding those methods.
Args:
spectrogram_length (`int`, *optional*, defaults to 2048):
The time length of each audio spectrogram.
num_channels (`int` *optional*, defaults to 1):
Number of audio channels.
patch_size (`List[int]` *optional*, defaults to `[16, 16]`):
The patch size of audio patch embedding.
feature_size (`int`, defaults to 128):
The frequency length of audio spectrogram.
sampling_rate (`int`, defaults to 44100):
The sampling rate at which the audio files should be digitalized expressed in Hertz (Hz).
hop_length_to_sampling_rate (`int`, defaults to 86):
Ratio between the sampling rate and the hop length, i.e. the step between the overlapping windows of the
STFT used to obtain the Mel Frequency coefficients. For example, with a sampling rate of 44100 the hop
length is 44100 // 86 = 512.
n_fft (`int`, defaults to 2048):
Size of the Fourier transform.
padding_value (`float`, *optional*, defaults to 0.0):
Padding value used to pad the audio. Should correspond to silences.
"""
model_input_names = ["audio_values", "audio_mask"]
def __init__(
self,
spectrogram_length=2048,
num_channels=1,
patch_size=[16, 16],
feature_size=128,
sampling_rate=44100,
hop_length_to_sampling_rate=86,
n_fft=2048,
padding_value=0.0,
**kwargs,
):
super().__init__(
feature_size=feature_size,
sampling_rate=sampling_rate,
padding_value=padding_value,
**kwargs,
)
self.spectrogram_length = spectrogram_length
self.num_channels = num_channels
self.patch_size = patch_size
self.freq_len = feature_size // self.patch_size[1]
self.n_fft = n_fft
self.hop_length = sampling_rate // hop_length_to_sampling_rate
self.sampling_rate = sampling_rate
self.padding_value = padding_value
self.mel_filters = self.get_mel_filters(sampling_rate, n_fft, n_mels=feature_size)
# Copied from transformers.models.whisper.feature_extraction_whisper.WhisperFeatureExtractor.get_mel_filters with 45.245640471924965->59.99247463746737
def get_mel_filters(self, sr, n_fft, n_mels=128, dtype=np.float32):
# Initialize the weights
n_mels = int(n_mels)
weights = np.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype)
# Center freqs of each FFT bin
fftfreqs = np.fft.rfftfreq(n=n_fft, d=1.0 / sr)
# 'Center freqs' of mel bands - uniformly spaced between limits
min_mel = 0.0
max_mel = 59.99247463746737
mels = np.linspace(min_mel, max_mel, n_mels + 2)
mels = np.asanyarray(mels)
# Fill in the linear scale
f_min = 0.0
f_sp = 200.0 / 3
freqs = f_min + f_sp * mels
# And now the nonlinear scale
min_log_hz = 1000.0 # beginning of log region (Hz)
min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels)
logstep = np.log(6.4) / 27.0 # step size for log region
# If we have vector data, vectorize
log_t = mels >= min_log_mel
freqs[log_t] = min_log_hz * np.exp(logstep * (mels[log_t] - min_log_mel))
mel_f = freqs
fdiff = np.diff(mel_f)
ramps = np.subtract.outer(mel_f, fftfreqs)
for i in range(n_mels):
# lower and upper slopes for all bins
lower = -ramps[i] / fdiff[i]
upper = ramps[i + 2] / fdiff[i + 1]
# .. then intersect them with each other and zero
weights[i] = np.maximum(0, np.minimum(lower, upper))
# Slaney-style mel is scaled to be approx constant energy per channel
enorm = 2.0 / (mel_f[2 : n_mels + 2] - mel_f[:n_mels])
weights *= enorm[:, np.newaxis]
return weights
# Copied from transformers.models.whisper.feature_extraction_whisper.WhisperFeatureExtractor.fram_wave
def fram_wave(self, waveform, center=True):
"""
Transform a raw waveform into a list of smaller waveforms. The window length defines how much of the signal is
contained in each frame (smaller waveform), while the hop length defines the step between the beginnings of
consecutive frames.
Centering is done by reflecting the waveform, which is first centered around `frame_idx * hop_length`.
"""
frames = []
for i in range(0, waveform.shape[0] + 1, self.hop_length):
half_window = (self.n_fft - 1) // 2 + 1
if center:
start = i - half_window if i > half_window else 0
end = i + half_window if i < waveform.shape[0] - half_window else waveform.shape[0]
frame = waveform[start:end]
if start == 0:
padd_width = (-i + half_window, 0)
frame = np.pad(frame, pad_width=padd_width, mode="reflect")
elif end == waveform.shape[0]:
padd_width = (0, (i - waveform.shape[0] + half_window))
frame = np.pad(frame, pad_width=padd_width, mode="reflect")
else:
frame = waveform[i : i + self.n_fft]
frame_width = frame.shape[0]
if frame_width < waveform.shape[0]:
frame = np.lib.pad(
frame, pad_width=(0, self.n_fft - frame_width), mode="constant", constant_values=0
)
frames.append(frame)
return np.stack(frames, 0)
# Copied from transformers.models.whisper.feature_extraction_whisper.WhisperFeatureExtractor.stft
def stft(self, frames, window):
"""
Calculates the complex Short-Time Fourier Transform (STFT) of the given framed signal. Should give the same
results as `torch.stft`.
"""
frame_size = frames.shape[1]
fft_size = self.n_fft
if fft_size is None:
fft_size = frame_size
if fft_size < frame_size:
raise ValueError("FFT size must greater or equal the frame size")
# number of FFT bins to store
num_fft_bins = (fft_size >> 1) + 1
data = np.empty((len(frames), num_fft_bins), dtype=np.complex64)
fft_signal = np.zeros(fft_size)
for f, frame in enumerate(frames):
if window is not None:
np.multiply(frame, window, out=fft_signal[:frame_size])
else:
fft_signal[:frame_size] = frame
data[f] = fft(fft_signal, axis=0)[:num_fft_bins]
return data.T
def _np_extract_fbank_features(self, waveform: np.array) -> np.ndarray:
"""
Compute the log-mel spectrogram of the provided audio. This gives similar results to Whisper's original torch
implementation, with a tolerance of 1e-5.
"""
window = np.hanning(self.n_fft + 1)[:-1]
frames = self.fram_wave(waveform)
stft = self.stft(frames, window=window)
magnitudes = np.abs(stft[:, :-1]) ** 2
filters = self.mel_filters
mel_spec = filters @ magnitudes
log_spec = 10.0 * np.log10(np.maximum(1e-10, mel_spec))
log_spec -= 10.0 * np.log10(np.maximum(1e-10, 1.0))
log_spec = np.maximum(log_spec, log_spec.max() - 80.0)
log_spec = log_spec - 20.0
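# rescale and clip so the final log-mel spectrogram lies in [-1, 1]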
log_spec = np.clip(log_spec / 40.0, -2.0, 0.0) + 1.0
return log_spec
def __call__(
self,
raw_speech: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
return_tensors: Optional[Union[str, TensorType]] = None,
return_attention_mask: Optional[bool] = True,
sampling_rate: Optional[int] = None,
resample: bool = False,
mask_audio: bool = False,
**kwargs,
) -> BatchFeature:
"""
Main method to prepare one or several audio(s) for the model.
Args:
raw_speech (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`):
The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
values, a list of numpy arrays or a list of list of float values.
return_tensors (`str` or [`~utils.TensorType`], *optional*):
If set, will return tensors instead of list of python integers. Acceptable values are:
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return Numpy `np.ndarray` objects.
return_attention_mask (`bool`, *optional*, defaults to `True`):
Whether to return the attention mask. If left to the default, will return the attention mask according
to the specific feature_extractor's default. [What are attention masks?](../glossary#attention-mask)
<Tip>
For TvltTransformer models, `attention_mask` should always be passed for batched inference, to avoid
subtle bugs.
</Tip>
sampling_rate (`int`, *optional*):
The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
`sampling_rate` at the forward call to prevent silent errors and to allow automatic speech recognition
pipelines to work correctly. The current model supports sampling rates of 16000 and 44100.
resample (`bool`, *optional*, defaults to `False`):
If the sampling rate is not matched, resample the input audio to match.
mask_audio (`bool`, *optional*, defaults to `False`):
Whether or not to mask input audio for MAE task.
Returns:
[`BatchFeature`]: A [`BatchFeature`] with the following fields:
- **audio_values** -- Audio values to be fed to a model, of shape (batch_size, num_channels, height,
width).
- **audio_mask** -- Audio masks to be fed to a model, of shape (batch_size, num_audio_patches).
"""
if sampling_rate is not None:
if sampling_rate != self.sampling_rate:
raise ValueError(
"This feature extractor is set to support sampling rate"
f" of {self.sampling_rate}. Please make sure that the provided `raw_speech` input was sampled"
f" with {self.sampling_rate} and not {sampling_rate}."
)
else:
logger.warning(
"It is strongly recommended to pass the `sampling_rate` argument to this function. "
"Failing to do so can result in silent errors that might be hard to debug."
)
is_batched = bool(
isinstance(raw_speech, (list, tuple))
and (isinstance(raw_speech[0], np.ndarray) or isinstance(raw_speech[0], (tuple, list)))
)
if is_batched:
raw_speech = [np.asarray([speech], dtype=np.float32).T for speech in raw_speech]
elif not is_batched and not isinstance(raw_speech, np.ndarray):
raw_speech = np.asarray(raw_speech, dtype=np.float32)
elif isinstance(raw_speech, np.ndarray) and raw_speech.dtype is np.dtype(np.float64):
raw_speech = raw_speech.astype(np.float32)
# always return batch
if not is_batched:
raw_speech = [np.asarray([raw_speech]).T]
# Convert audio signals to log mel spectrograms, truncate by time axis
audio_features = [
self._np_extract_fbank_features(waveform.squeeze()).T[: self.spectrogram_length] for waveform in raw_speech
]
if isinstance(audio_features[0], List):
audio_features = [np.asarray(feature, dtype=np.float32) for feature in audio_features]
# Create audio attention mask
max_patch_len = max(
[ceil(feature.shape[0] / self.patch_size[0]) * self.freq_len for feature in audio_features]
) # The maximum number of audio patches in a batch
if return_attention_mask:
audio_mask = [
(ceil(feature.shape[0] / self.patch_size[0]) * self.freq_len) * [1]
+ (max_patch_len - ceil(feature.shape[0] / self.patch_size[0]) * self.freq_len) * [0]
for feature in audio_features
]
audio_mask = np.array(audio_mask).astype(np.float32)
# convert into correct format for padding
max_time_len = max_patch_len // self.freq_len * self.patch_size[0] # The maximum audio size in a batch
padded_audio_features = np.ones([len(audio_features), 1, max_time_len, self.feature_size]).astype(np.float32)
padded_audio_features = padded_audio_features * self.padding_value
for i in range(len(audio_features)):
feature = audio_features[i]
padded_audio_features[i, :, : feature.shape[0], :] = feature
# return as BatchFeature
if return_attention_mask:
data = {"audio_values": padded_audio_features, "audio_mask": audio_mask}
else:
data = {"audio_values": padded_audio_features}
encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
return encoded_inputs
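
A minimal usage sketch for the feature extractor above, assuming `TvltFeatureExtractor` is importable from `transformers` as this PR intends; the random waveform is only a stand-in for real audio:

import numpy as np
from transformers import TvltFeatureExtractor  # export assumed to be added by this PR
feature_extractor = TvltFeatureExtractor()
waveform = np.random.randn(44100).astype(np.float32)  # one second of fake audio at 44.1 kHz
inputs = feature_extractor(waveform, sampling_rate=44100, return_tensors="np")
# audio_values: (batch_size, num_channels, time, feature_size); audio_mask: (batch_size, num_audio_patches)
print(inputs["audio_values"].shape, inputs["audio_mask"].shape)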


@@ -0,0 +1,455 @@
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Image processor class for TVLT."""
from typing import Dict, List, Optional, Union
import numpy as np
from transformers.utils.generic import TensorType
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from ...image_transforms import (
center_crop,
get_resize_output_image_size,
normalize,
rescale,
resize,
to_channel_dimension_format,
)
from ...image_utils import (
IMAGENET_STANDARD_MEAN,
IMAGENET_STANDARD_STD,
ChannelDimension,
ImageInput,
PILImageResampling,
is_valid_image,
to_numpy_array,
valid_images,
)
from ...utils import logging
logger = logging.get_logger(__name__)
def make_batched(videos) -> List[List[ImageInput]]:
if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)):
return videos
elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]):
videos_dim = np.array(videos[0]).ndim
if videos_dim == 3:
return [videos]
elif videos_dim == 4:
return videos
elif is_valid_image(videos):
videos_dim = np.array(videos).ndim
if videos_dim == 3:
return [[videos]]
elif videos_dim == 4:
return [videos]
elif videos_dim == 5:
return videos
raise ValueError(f"Could not make batched video from {videos}")
class TvltImageProcessor(BaseImageProcessor):
r"""
Constructs a TVLT image processor.
This processor can be used to prepare either videos or images for the model by converting images to 1-frame videos.
Args:
do_resize (`bool`, *optional*, defaults to `True`):
Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the
`do_resize` parameter in the `preprocess` method.
size (`Dict[str, int]`, *optional*, defaults to `{"shortest_edge": 224}`):
Size of the output image after resizing. The shortest edge of the image will be resized to
`size["shortest_edge"]` while maintaining the aspect ratio of the original image. Can be overridden by
`size` in the `preprocess` method.
patch_size (`List[int]`, *optional*, defaults to `[16, 16]`):
The patch size of image patch embedding.
num_frames (`int`, *optional*, defaults to 8):
The maximum number of video frames.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the
`preprocess` method.
do_center_crop (`bool`, *optional*, defaults to `True`):
Whether to center crop the image to the specified `crop_size`. Can be overridden by the `do_center_crop`
parameter in the `preprocess` method.
crop_size (`Dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`):
Size of the image after applying the center crop. Can be overridden by the `crop_size` parameter in the
`preprocess` method.
do_rescale (`bool`, *optional*, defaults to `True`):
Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale`
parameter in the `preprocess` method.
rescale_factor (`int` or `float`, *optional*, defaults to 1/255):
Defines the scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter
in the `preprocess` method.
do_normalize (`bool`, *optional*, defaults to `True`):
Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
method.
image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
Mean to use if normalizing the image. This is a float or list of floats the length of the number of
channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
"""
model_input_names = [
"pixel_values",
"pixel_mask",
"pixel_values_mixed",
"pixel_mask_mixed",
]
def __init__(
self,
do_resize: bool = True,
size: Dict[str, int] = None,
patch_size: List[int] = [16, 16],
num_frames: int = 8,
resample: PILImageResampling = PILImageResampling.BILINEAR,
do_center_crop: bool = True,
crop_size: Dict[str, int] = None,
do_rescale: bool = True,
rescale_factor: Union[int, float] = 1 / 255,
do_normalize: bool = True,
image_mean: Optional[Union[float, List[float]]] = IMAGENET_STANDARD_MEAN,
image_std: Optional[Union[float, List[float]]] = IMAGENET_STANDARD_STD,
init_mask_generator=False,
**kwargs,
) -> None:
super().__init__(**kwargs)
size = size if size is not None else {"shortest_edge": 224}
size = get_size_dict(size, default_to_square=False)
crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224}
crop_size = get_size_dict(crop_size, param_name="crop_size")
self.do_resize = do_resize
self.size = size
self.patch_size = patch_size
self.num_frames = num_frames
self.do_center_crop = do_center_crop
self.crop_size = crop_size
self.resample = resample
self.do_rescale = do_rescale
self.rescale_factor = rescale_factor
self.do_normalize = do_normalize
self.image_mean = image_mean
self.image_std = image_std
def resize(
self,
image: np.ndarray,
size: Dict[str, int],
resample: PILImageResampling = PILImageResampling.BILINEAR,
data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
) -> np.ndarray:
"""
Resize an image.
Args:
image (`np.ndarray`):
Image to resize.
size (`Dict[str, int]`):
Size of the output image. If `size` is of the form `{"height": h, "width": w}`, the output image will
have the size `(h, w)`. If `size` is of the form `{"shortest_edge": s}`, the output image will have its
shortest edge of length `s` while keeping the aspect ratio of the original image.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Resampling filter to use when resizing the image.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the image. If not provided, it will be the same as the input image.
"""
size = get_size_dict(size, default_to_square=False)
if "shortest_edge" in size:
output_size = get_resize_output_image_size(image, size["shortest_edge"], default_to_square=False)
elif "height" in size and "width" in size:
output_size = (size["height"], size["width"])
else:
raise ValueError(f"Size must have 'height' and 'width' or 'shortest_edge' as keys. Got {size.keys()}")
return resize(image, size=output_size, resample=resample, data_format=data_format, **kwargs)
def center_crop(
self,
image: np.ndarray,
size: Dict[str, int],
data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
) -> np.ndarray:
"""
Center crop an image to `(size["height"], size["width"])`. If the input size is smaller than `size` along any
edge, the image is padded with 0's and then center cropped.
Args:
image (`np.ndarray`):
Image to center crop.
size (`Dict[str, int]`):
Size of the output image.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the image. If not provided, it will be the same as the input image.
"""
size = get_size_dict(size)
if "height" not in size or "width" not in size:
raise ValueError(f"Size must have 'height' and 'width' as keys. Got {size.keys()}")
return center_crop(image, size=(size["height"], size["width"]), data_format=data_format, **kwargs)
def rescale(
self,
image: np.ndarray,
scale: Union[int, float],
data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
):
"""
Rescale an image by a scale factor. image = image * scale.
Args:
image (`np.ndarray`):
Image to rescale.
scale (`int` or `float`):
Scale to apply to the image.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the image. If not provided, it will be the same as the input image.
"""
return rescale(image, scale=scale, data_format=data_format, **kwargs)
# Copied from transformers.models.vit.image_processing_vit.ViTImageProcessor.normalize
def normalize(
self,
image: np.ndarray,
mean: Union[float, List[float]],
std: Union[float, List[float]],
data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
) -> np.ndarray:
"""
Normalize an image. image = (image - image_mean) / image_std.
Args:
image (`np.ndarray`):
Image to normalize.
mean (`float` or `List[float]`):
Image mean to use for normalization.
std (`float` or `List[float]`):
Image standard deviation to use for normalization.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format for the output image. If unset, the channel dimension format of the input
image is used. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
Returns:
`np.ndarray`: The normalized image.
"""
return normalize(image, mean=mean, std=std, data_format=data_format, **kwargs)
def _preprocess_image(
self,
image: ImageInput,
do_resize: bool = None,
size: Dict[str, int] = None,
resample: PILImageResampling = None,
do_center_crop: bool = None,
crop_size: Dict[str, int] = None,
do_rescale: bool = None,
rescale_factor: float = None,
do_normalize: bool = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
) -> np.ndarray:
"""Preprocesses a single image."""
if do_resize and (size is None or resample is None):
raise ValueError("Size and resample must be specified if do_resize is True.")
if do_center_crop and crop_size is None:
raise ValueError("Crop size must be specified if do_center_crop is True.")
if do_rescale and rescale_factor is None:
raise ValueError("Rescale factor must be specified if do_rescale is True.")
if do_normalize and (image_mean is None or image_std is None):
raise ValueError("Image mean and std must be specified if do_normalize is True.")
# All transformations expect numpy arrays.
image = to_numpy_array(image)
if do_resize:
image = self.resize(image=image, size=size, resample=resample)
if do_center_crop:
image = self.center_crop(image, size=crop_size)
if do_rescale:
image = self.rescale(image=image, scale=rescale_factor)
if do_normalize:
image = self.normalize(image=image, mean=image_mean, std=image_std)
image = to_channel_dimension_format(image, data_format)
return image
def preprocess(
self,
videos: ImageInput,
do_resize: bool = None,
size: Dict[str, int] = None,
patch_size: List[int] = None,
num_frames: int = None,
resample: PILImageResampling = None,
do_center_crop: bool = None,
crop_size: Dict[str, int] = None,
do_rescale: bool = None,
rescale_factor: float = None,
do_normalize: bool = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
is_mixed: bool = False,
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: ChannelDimension = ChannelDimension.FIRST,
**kwargs,
) -> BatchFeature:
"""
Preprocess a video or image, or a batch of videos or images.
Args:
videos (`ImageInput`):
Images or videos to preprocess.
do_resize (`bool`, *optional*, defaults to `self.do_resize`):
Whether to resize the image.
size (`Dict[str, int]`, *optional*, defaults to `self.size`):
Size of the image after applying resize.
patch_size (`List[int]` *optional*, defaults to self.patch_size):
The patch size of image patch embedding.
num_frames (`int` *optional*, defaults to self.num_frames):
The maximum number of video frames.
resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
has an effect if `do_resize` is set to `True`.
do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`):
Whether to center crop the image.
crop_size (`Dict[str, int]`, *optional*, defaults to `self.crop_size`):
Size of the image after applying the center crop.
do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
Whether to rescale the image values between [0 - 1].
rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
Rescale factor to rescale the image by if `do_rescale` is set to `True`.
do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
Whether to normalize the image.
image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
Image mean.
image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
Image standard deviation.
is_mixed (`bool`, *optional*, defaults to `False`):
Whether the input video contains negative samples.
return_tensors (`str` or `TensorType`, *optional*):
The type of tensors to return. Can be one of:
- Unset: Return a list of `np.ndarray`.
- `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
- `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
- `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
- `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
The channel dimension format for the output image. Can be one of:
- `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- Unset: Use the inferred channel dimension format of the input image.
Returns:
[`BatchFeature`]: A [`BatchFeature`] with the following fields:
- **pixel_values** -- Pixel values to be fed to a model, of shape (batch_size, num_channels, height,
width).
- **pixel_mask** -- Pixel masks to be fed to a model, of shape (batch_size, num_pixel_patches).
- **pixel_values_mixed** -- Pixel values with both positive and negative samples, to be fed to a model, of shape
(batch_size, num_channels, height, width).
- **pixel_mask_mixed** -- Pixel masks with both positive and negative samples, to be fed to a model, of shape
(batch_size, num_pixel_patches).
"""
do_resize = do_resize if do_resize is not None else self.do_resize
resample = resample if resample is not None else self.resample
do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop
do_rescale = do_rescale if do_rescale is not None else self.do_rescale
rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
do_normalize = do_normalize if do_normalize is not None else self.do_normalize
image_mean = image_mean if image_mean is not None else self.image_mean
image_std = image_std if image_std is not None else self.image_std
size = size if size is not None else self.size
size = get_size_dict(size, default_to_square=False)
crop_size = crop_size if crop_size is not None else self.crop_size
crop_size = get_size_dict(crop_size, param_name="crop_size")
patch_size = patch_size if patch_size is not None else self.patch_size
num_frames = num_frames if num_frames is not None else self.num_frames
if not valid_images(videos):
raise ValueError(
"Invalid image or video type. Must be of type PIL.Image.Image, numpy.ndarray, "
"torch.Tensor, tf.Tensor or jax.ndarray."
)
videos = make_batched(videos)
# Check number of frames is fewer than maximum frames
for video in videos:
if len(video) > self.num_frames:
raise ValueError(
f"number of frames must not be greater than the maximum frames of the model {self.num_frames}."
)
max_num_frames = max([len(video) for video in videos])
num_patches_per_image = (size["shortest_edge"] // patch_size[0]) ** 2
video_masks = np.array(
[
len(video) * num_patches_per_image * [1] + (max_num_frames - len(video)) * num_patches_per_image * [0]
for video in videos
]
)
videos = [
[
self._preprocess_image(
image=img,
do_resize=do_resize,
size=size,
resample=resample,
do_center_crop=do_center_crop,
crop_size=crop_size,
do_rescale=do_rescale,
rescale_factor=rescale_factor,
do_normalize=do_normalize,
image_mean=image_mean,
image_std=image_std,
data_format=data_format,
)
for img in video
]
for video in videos
]
# If videos contain both positive/negative, use mixed key for video-audio matching task
if is_mixed:
data = {"pixel_values_mixed": videos, "pixel_mask_mixed": video_masks}
else:
data = {"pixel_values": videos, "pixel_mask": video_masks}
return BatchFeature(data=data, tensor_type=return_tensors)
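
A short usage sketch for the image processor above; the 4-frame channels-first video is fabricated, and the import assumes the `transformers` export added in this PR:

import numpy as np
from transformers import TvltImageProcessor  # export assumed to be added by this PR
image_processor = TvltImageProcessor()
video = [np.random.randint(0, 256, (3, 240, 320), dtype=np.uint8) for _ in range(4)]  # fake 4-frame video
outputs = image_processor(video, return_tensors="np")
# pixel_values: (batch_size, num_frames, num_channels, height, width); pixel_mask: (batch_size, num_pixel_patches)
print(outputs["pixel_values"].shape, outputs["pixel_mask"].shape)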

File diff suppressed because it is too large.


@@ -0,0 +1,88 @@
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Processor class for TVLT.
"""
from ...processing_utils import ProcessorMixin
class TvltProcessor(ProcessorMixin):
r"""
Constructs a TVLT processor which wraps a TVLT image processor and TVLT feature extractor into a single processor.
[`TvltProcessor`] offers all the functionalities of [`TvltImageProcessor`] and [`TvltFeatureExtractor`]. See the
docstring of [`~TvltProcessor.__call__`] for more information.
Args:
image_processor (`TvltImageProcessor`):
An instance of [`TvltImageProcessor`]. The image processor is a required input.
feature_extractor (`TvltFeatureExtractor`):
An instance of [`TvltFeatureExtractor`]. The feature extractor is a required input.
"""
attributes = ["image_processor", "feature_extractor"]
image_processor_class = "TvltImageProcessor"
feature_extractor_class = "TvltFeatureExtractor"
def __init__(self, image_processor, feature_extractor):
super().__init__(image_processor=image_processor, feature_extractor=feature_extractor)
self.image_processor = image_processor
self.feature_extractor = feature_extractor
def __call__(
self,
images=None,
audio=None,
images_mixed=None,
sampling_rate=None,
mask_audio=False,
mask_pixel=False,
*args,
**kwargs,
):
"""
Forwards the `images` argument to TvltImageProcessor's [`~TvltImageProcessor.preprocess`] and the `audio`
argument to TvltFeatureExtractor's [`~TvltFeatureExtractor.__call__`]. Please refer to the docstring of the
above two methods for more information.
"""
if images is None and audio is None:
raise ValueError("You need to specify either an `images` or `audio` input to process.")
images_mixed_dict = None
if images is not None:
images_dict = self.image_processor(images, mask_pixel=mask_pixel, *args, **kwargs)
if images_mixed is not None:
images_mixed_dict = self.image_processor(images_mixed, is_mixed=True, *args, **kwargs)
if audio is not None:
audio_dict = self.feature_extractor(
audio, *args, sampling_rate=sampling_rate, mask_audio=mask_audio, **kwargs
)
output_dict = {}
if audio is not None:
output_dict.update(audio_dict)
if images is not None:
output_dict.update(images_dict)
if images_mixed_dict is not None:
output_dict.update(images_mixed_dict)
return output_dict
@property
def model_input_names(self):
image_processor_input_names = self.image_processor.model_input_names
feature_extractor_input_names = self.feature_extractor.model_input_names
return list(dict.fromkeys(image_processor_input_names + feature_extractor_input_names))
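
A sketch of how the processor combines the two preprocessors above, again assuming the `transformers` exports introduced in this PR and using fabricated inputs:

import numpy as np
from transformers import TvltFeatureExtractor, TvltImageProcessor, TvltProcessor  # exports assumed from this PR
processor = TvltProcessor(image_processor=TvltImageProcessor(), feature_extractor=TvltFeatureExtractor())
video = [np.random.randint(0, 256, (3, 240, 320), dtype=np.uint8) for _ in range(4)]  # fake 4-frame video
waveform = np.random.randn(44100).astype(np.float32)  # fake 1-second waveform at 44.1 kHz
inputs = processor(images=video, audio=waveform, sampling_rate=44100, return_tensors="np")
print(sorted(inputs.keys()))  # expect audio_mask, audio_values, pixel_mask, pixel_values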


@@ -6068,6 +6068,37 @@ class TrOCRPreTrainedModel(metaclass=DummyObject):
requires_backends(self, ["torch"])
TVLT_PRETRAINED_MODEL_ARCHIVE_LIST = None
class TvltForAudioVisualClassification(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class TvltForPreTraining(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class TvltModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class TvltPreTrainedModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
UNISPEECH_PRETRAINED_MODEL_ARCHIVE_LIST = None


@@ -28,3 +28,10 @@ class SpeechT5FeatureExtractor(metaclass=DummyObject):
def __init__(self, *args, **kwargs):
requires_backends(self, ["speech"])
class TvltFeatureExtractor(metaclass=DummyObject):
_backends = ["speech"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["speech"])


@@ -415,6 +415,13 @@ class Swin2SRImageProcessor(metaclass=DummyObject):
requires_backends(self, ["vision"])
class TvltImageProcessor(metaclass=DummyObject):
_backends = ["vision"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["vision"])
class VideoMAEFeatureExtractor(metaclass=DummyObject):
_backends = ["vision"]


@@ -0,0 +1,207 @@
# coding=utf-8
# Copyright 2023 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Testing suite for the TVLT feature extraction. """
import itertools
import os
import random
import tempfile
import unittest
import numpy as np
from transformers import is_datasets_available, is_speech_available
from transformers.testing_utils import check_json_file_has_correct_format, require_torch, require_torchaudio
from transformers.utils.import_utils import is_torch_available
from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin
if is_torch_available():
import torch
if is_datasets_available():
from datasets import load_dataset
if is_speech_available():
from transformers import TvltFeatureExtractor
global_rng = random.Random()
def floats_list(shape, scale=1.0, rng=None, name=None):
"""Creates a random float32 tensor"""
if rng is None:
rng = global_rng
values = []
for batch_idx in range(shape[0]):
values.append([])
for _ in range(shape[1]):
values[-1].append(rng.random() * scale)
return values
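# e.g. floats_list((2, 3)) returns a 2 x 3 nested list of random floats in [0, 1) (values differ on every call)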
class TvltFeatureExtractionTester(unittest.TestCase):
def __init__(
self,
parent,
batch_size=7,
min_seq_length=400,
max_seq_length=2000,
spectrogram_length=2048,
feature_size=128,
num_audio_channels=1,
hop_length=512,
chunk_length=30,
sampling_rate=44100,
):
self.parent = parent
self.batch_size = batch_size
self.min_seq_length = min_seq_length
self.max_seq_length = max_seq_length
self.seq_length_diff = (self.max_seq_length - self.min_seq_length) // (self.batch_size - 1)
self.spectrogram_length = spectrogram_length
self.feature_size = feature_size
self.num_audio_channels = num_audio_channels
self.hop_length = hop_length
self.chunk_length = chunk_length
self.sampling_rate = sampling_rate
def prepare_feat_extract_dict(self):
return {
"spectrogram_length": self.spectrogram_length,
"feature_size": self.feature_size,
"num_audio_channels": self.num_audio_channels,
"hop_length": self.hop_length,
"chunk_length": self.chunk_length,
"sampling_rate": self.sampling_rate,
}
def prepare_inputs_for_common(self, equal_length=False, numpify=False):
def _flatten(list_of_lists):
return list(itertools.chain(*list_of_lists))
if equal_length:
speech_inputs = [floats_list((self.max_seq_length, self.feature_size)) for _ in range(self.batch_size)]
else:
# make sure that inputs increase in size
speech_inputs = [
floats_list((x, self.feature_size))
for x in range(self.min_seq_length, self.max_seq_length, self.seq_length_diff)
]
if numpify:
speech_inputs = [np.asarray(x) for x in speech_inputs]
return speech_inputs
@require_torch
@require_torchaudio
class TvltFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
feature_extraction_class = TvltFeatureExtractor if is_speech_available() else None
def setUp(self):
self.feat_extract_tester = TvltFeatureExtractionTester(self)
def test_feat_extract_properties(self):
feature_extractor = self.feature_extraction_class(**self.feat_extract_dict)
self.assertTrue(hasattr(feature_extractor, "spectrogram_length"))
self.assertTrue(hasattr(feature_extractor, "feature_size"))
self.assertTrue(hasattr(feature_extractor, "num_audio_channels"))
self.assertTrue(hasattr(feature_extractor, "hop_length"))
self.assertTrue(hasattr(feature_extractor, "chunk_length"))
self.assertTrue(hasattr(feature_extractor, "sampling_rate"))
def test_feat_extract_from_and_save_pretrained(self):
feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict)
with tempfile.TemporaryDirectory() as tmpdirname:
saved_file = feat_extract_first.save_pretrained(tmpdirname)[0]
check_json_file_has_correct_format(saved_file)
feat_extract_second = self.feature_extraction_class.from_pretrained(tmpdirname)
dict_first = feat_extract_first.to_dict()
dict_second = feat_extract_second.to_dict()
mel_1 = dict_first.pop("mel_filters")
mel_2 = dict_second.pop("mel_filters")
self.assertTrue(np.allclose(mel_1, mel_2))
self.assertEqual(dict_first, dict_second)
def test_feat_extract_to_json_file(self):
feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict)
with tempfile.TemporaryDirectory() as tmpdirname:
json_file_path = os.path.join(tmpdirname, "feat_extract.json")
feat_extract_first.to_json_file(json_file_path)
feat_extract_second = self.feature_extraction_class.from_json_file(json_file_path)
dict_first = feat_extract_first.to_dict()
dict_second = feat_extract_second.to_dict()
mel_1 = dict_first.pop("mel_filters")
mel_2 = dict_second.pop("mel_filters")
self.assertTrue(np.allclose(mel_1, mel_2))
self.assertEqual(dict_first, dict_second)
def test_call(self):
# Initialize feature_extractor
feature_extractor = self.feature_extraction_class(**self.feat_extract_dict)
# create three inputs of length 800, 1000, and 1200
speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs]
# Test not batched input
encoded_audios = feature_extractor(np_speech_inputs[0], return_tensors="np", sampling_rate=44100).audio_values
self.assertTrue(encoded_audios.ndim == 4)
self.assertTrue(encoded_audios.shape[-1] == feature_extractor.feature_size)
self.assertTrue(encoded_audios.shape[-2] <= feature_extractor.spectrogram_length)
self.assertTrue(encoded_audios.shape[-3] == feature_extractor.num_channels)
# Test batched
encoded_audios = feature_extractor(np_speech_inputs, return_tensors="np", sampling_rate=44100).audio_values
self.assertTrue(encoded_audios.ndim == 4)
self.assertTrue(encoded_audios.shape[-1] == feature_extractor.feature_size)
self.assertTrue(encoded_audios.shape[-2] <= feature_extractor.spectrogram_length)
self.assertTrue(encoded_audios.shape[-3] == feature_extractor.num_channels)
# Test audio masking
encoded_audios = feature_extractor(
np_speech_inputs, return_tensors="np", sampling_rate=44100, mask_audio=True
).audio_values
self.assertTrue(encoded_audios.ndim == 4)
self.assertTrue(encoded_audios.shape[-1] == feature_extractor.feature_size)
self.assertTrue(encoded_audios.shape[-2] <= feature_extractor.spectrogram_length)
self.assertTrue(encoded_audios.shape[-3] == feature_extractor.num_channels)
def _load_datasamples(self, num_samples):
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
# automatic decoding with librispeech
speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
return [x["array"] for x in speech_samples]
def test_integration(self):
input_speech = self._load_datasamples(1)
feature_extractor = TvltFeatureExtractor()
audio_values = feature_extractor(input_speech, return_tensors="pt").audio_values
self.assertEqual(audio_values.shape, torch.Size([1, 1, 192, 128]))
expected_slice = torch.tensor([[-0.3032, -0.2708], [-0.4434, -0.4007]])
self.assertTrue(torch.allclose(audio_values[0, 0, :2, :2], expected_slice, atol=1e-4))


@@ -0,0 +1,253 @@
# coding=utf-8
# Copyright 2023 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Testing suite for the TVLT image processor. """
import unittest
import numpy as np
from transformers.testing_utils import require_torch, require_vision
from transformers.utils import is_torch_available, is_vision_available
from ...test_image_processing_common import ImageProcessingSavingTestMixin
if is_torch_available():
import torch
if is_vision_available():
from PIL import Image
from transformers import TvltImageProcessor
def prepare_video(image_processor_tester, width=10, height=10, numpify=False, torchify=False):
"""This function prepares a video as a list of PIL images/NumPy arrays/PyTorch tensors."""
video = []
for i in range(image_processor_tester.num_frames):
video.append(np.random.randint(255, size=(image_processor_tester.num_channels, width, height), dtype=np.uint8))
if not numpify and not torchify:
# PIL expects the channel dimension as last dimension
video = [Image.fromarray(np.moveaxis(frame, 0, -1)) for frame in video]
if torchify:
video = [torch.from_numpy(frame) for frame in video]
return video
def prepare_video_inputs(image_processor_tester, equal_resolution=False, numpify=False, torchify=False):
"""This function prepares a batch of videos: a list of list of PIL images, or a list of list of numpy arrays if
one specifies numpify=True, or a list of list of PyTorch tensors if one specifies torchify=True.
One can specify whether the videos are of the same resolution or not.
"""
assert not (numpify and torchify), "You cannot specify both numpy and PyTorch tensors at the same time"
video_inputs = []
for i in range(image_processor_tester.batch_size):
if equal_resolution:
width = height = image_processor_tester.max_resolution
else:
width, height = np.random.choice(
np.arange(image_processor_tester.min_resolution, image_processor_tester.max_resolution), 2
)
video = prepare_video(
image_processor_tester=image_processor_tester,
width=width,
height=height,
numpify=numpify,
torchify=torchify,
)
video_inputs.append(video)
return video_inputs
class TvltImageProcessorTester(unittest.TestCase):
def __init__(
self,
parent,
batch_size=7,
num_channels=3,
num_frames=4,
image_size=18,
min_resolution=30,
max_resolution=400,
do_resize=True,
size=None,
do_normalize=True,
image_mean=[0.5, 0.5, 0.5],
image_std=[0.5, 0.5, 0.5],
do_center_crop=True,
crop_size=None,
):
size = size if size is not None else {"shortest_edge": 18}
crop_size = crop_size if crop_size is not None else {"height": 18, "width": 18}
self.parent = parent
self.batch_size = batch_size
self.num_channels = num_channels
self.num_frames = num_frames
self.image_size = image_size
self.min_resolution = min_resolution
self.max_resolution = max_resolution
self.do_resize = do_resize
self.size = size
self.do_normalize = do_normalize
self.image_mean = image_mean
self.image_std = image_std
self.do_center_crop = do_center_crop
self.crop_size = crop_size
def prepare_image_processor_dict(self):
return {
"image_mean": self.image_mean,
"image_std": self.image_std,
"do_normalize": self.do_normalize,
"do_resize": self.do_resize,
"size": self.size,
"do_center_crop": self.do_center_crop,
"crop_size": self.crop_size,
}
@require_torch
@require_vision
class TvltImageProcessorTest(ImageProcessingSavingTestMixin, unittest.TestCase):
image_processing_class = TvltImageProcessor if is_vision_available() else None
def setUp(self):
self.image_processor_tester = TvltImageProcessorTester(self)
@property
def image_processor_dict(self):
return self.image_processor_tester.prepare_image_processor_dict()
def test_image_processor_properties(self):
image_processor = self.image_processing_class(**self.image_processor_dict)
self.assertTrue(hasattr(image_processor, "image_mean"))
self.assertTrue(hasattr(image_processor, "image_std"))
self.assertTrue(hasattr(image_processor, "do_normalize"))
self.assertTrue(hasattr(image_processor, "do_resize"))
self.assertTrue(hasattr(image_processor, "do_center_crop"))
self.assertTrue(hasattr(image_processor, "size"))
def test_call_pil(self):
# Initialize image_processor
image_processor = self.image_processing_class(**self.image_processor_dict)
# create random PIL videos
video_inputs = prepare_video_inputs(self.image_processor_tester, equal_resolution=False)
for video in video_inputs:
self.assertIsInstance(video, list)
self.assertIsInstance(video[0], Image.Image)
# Test not batched input
encoded_videos = image_processor(video_inputs[0], return_tensors="pt").pixel_values
self.assertEqual(
encoded_videos.shape,
(
1,
self.image_processor_tester.num_frames,
self.image_processor_tester.num_channels,
self.image_processor_tester.crop_size["height"],
self.image_processor_tester.crop_size["width"],
),
)
# Test batched
encoded_videos = image_processor(video_inputs, return_tensors="pt").pixel_values
self.assertEqual(
encoded_videos.shape,
(
self.image_processor_tester.batch_size,
self.image_processor_tester.num_frames,
self.image_processor_tester.num_channels,
self.image_processor_tester.crop_size["height"],
self.image_processor_tester.crop_size["width"],
),
)
def test_call_numpy(self):
# Initialize image_processor
image_processor = self.image_processing_class(**self.image_processor_dict)
# create random numpy tensors
video_inputs = prepare_video_inputs(self.image_processor_tester, equal_resolution=False, numpify=True)
for video in video_inputs:
self.assertIsInstance(video, list)
self.assertIsInstance(video[0], np.ndarray)
# Test not batched input
encoded_videos = image_processor(video_inputs[0], return_tensors="pt").pixel_values
self.assertEqual(
encoded_videos.shape,
(
1,
self.image_processor_tester.num_frames,
self.image_processor_tester.num_channels,
self.image_processor_tester.crop_size["height"],
self.image_processor_tester.crop_size["width"],
),
)
# Test batched
encoded_videos = image_processor(video_inputs, return_tensors="pt").pixel_values
self.assertEqual(
encoded_videos.shape,
(
self.image_processor_tester.batch_size,
self.image_processor_tester.num_frames,
self.image_processor_tester.num_channels,
self.image_processor_tester.crop_size["height"],
self.image_processor_tester.crop_size["width"],
),
)
def test_call_pytorch(self):
# Initialize image_processor
image_processor = self.image_processing_class(**self.image_processor_dict)
# create random PyTorch tensors
video_inputs = prepare_video_inputs(self.image_processor_tester, equal_resolution=False, torchify=True)
for video in video_inputs:
self.assertIsInstance(video, list)
self.assertIsInstance(video[0], torch.Tensor)
# Test not batched input
encoded_videos = image_processor(video_inputs[0], return_tensors="pt").pixel_values
self.assertEqual(
encoded_videos.shape,
(
1,
self.image_processor_tester.num_frames,
self.image_processor_tester.num_channels,
self.image_processor_tester.crop_size["height"],
self.image_processor_tester.crop_size["width"],
),
)
# Test batched
encoded_videos = image_processor(video_inputs, return_tensors="pt").pixel_values
self.assertEqual(
encoded_videos.shape,
(
self.image_processor_tester.batch_size,
self.image_processor_tester.num_frames,
self.image_processor_tester.num_channels,
self.image_processor_tester.crop_size["height"],
self.image_processor_tester.crop_size["width"],
),
)


@@ -0,0 +1,628 @@
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Testing suite for the PyTorch TVLT model. """
import copy
import inspect
import unittest
import numpy as np
from huggingface_hub import hf_hub_download
from transformers import (
TvltConfig,
is_datasets_available,
is_speech_available,
is_torch_available,
is_vision_available,
)
from transformers.testing_utils import require_torch, require_vision, slow, torch_device
from transformers.utils import cached_property
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, floats_tensor
if is_torch_available():
import torch
import torch.nn as nn
from transformers import TvltForAudioVisualClassification, TvltForPreTraining, TvltModel
from transformers.models.tvlt.modeling_tvlt import TVLT_PRETRAINED_MODEL_ARCHIVE_LIST
from transformers.pytorch_utils import is_torch_greater_or_equal_than_1_10
else:
is_torch_greater_or_equal_than_1_10 = False
if is_datasets_available():
from datasets import load_dataset
if is_vision_available():
from transformers import TvltImageProcessor
if is_speech_available():
from transformers import TvltFeatureExtractor
class TvltModelTester:
def __init__(
self,
parent,
batch_size=2,
image_size=32,
spectrogram_length=32,
frequency_length=16,
image_patch_size=[2, 2],
audio_patch_size=[2, 2],
num_image_channels=3,
num_audio_channels=1,
num_frames=2,
hidden_size=128,
num_hidden_layers=12,
num_attention_heads=4,
intermediate_size=128,
hidden_act="gelu",
hidden_dropout_prob=0.0,
attention_probs_dropout_prob=0.0,
initializer_range=0.02,
layer_norm_eps=1e-12,
qkv_bias=True,
use_mean_pooling=True,
decoder_num_attention_heads=4,
decoder_hidden_size=64,
decoder_num_hidden_layers=2,
decoder_intermediate_size=128,
image_mask_ratio=0.75,
audio_mask_ratio=0.15,
audio_mask_type="frame-level",
task_matching=True,
task_mae=True,
num_labels=1,
is_training=True,
):
self.parent = parent
self.batch_size = batch_size
self.image_size = image_size
self.spectrogram_length = spectrogram_length
self.frequency_length = frequency_length
self.image_patch_size = image_patch_size
self.audio_patch_size = audio_patch_size
self.num_image_channels = num_image_channels
self.num_audio_channels = num_audio_channels
self.num_frames = num_frames
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.qkv_bias = qkv_bias
self.use_mean_pooling = use_mean_pooling
self.decoder_num_attention_heads = decoder_num_attention_heads
self.decoder_hidden_size = decoder_hidden_size
self.decoder_num_hidden_layers = decoder_num_hidden_layers
self.decoder_intermediate_size = decoder_intermediate_size
self.image_mask_ratio = image_mask_ratio
self.audio_mask_ratio = audio_mask_ratio
self.task_matching = task_matching
self.task_mae = task_mae
self.num_labels = num_labels
self.expected_pixel_seq_len = (self.image_size // self.image_patch_size[0]) ** 2 * self.num_frames
self.expected_audio_seq_len = (self.spectrogram_length // self.audio_patch_size[0]) * (
self.frequency_length // self.audio_patch_size[1]
)
# we set the expected sequence length (which is used in several tests)
# this is equal to the seq length of number of image/video patches + number of audio patches
self.expected_seq_len = self.expected_pixel_seq_len + self.expected_audio_seq_len + 1
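# e.g. with the defaults above: (32 // 2) ** 2 * 2 = 512 pixel patches, (32 // 2) * (16 // 2) = 128 audio patches, plus 1 extra token -> 641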
self.image_mae_output_dim = image_patch_size[0] ** 2 * num_image_channels
self.audio_mae_output_dim = audio_patch_size[0] * audio_patch_size[1] * num_audio_channels
self.is_training = is_training
def prepare_config_and_inputs(self):
pixel_values = floats_tensor(
[self.batch_size, self.num_frames, self.num_image_channels, self.image_size, self.image_size]
)
audio_values = floats_tensor(
[self.batch_size, self.num_audio_channels, self.spectrogram_length, self.frequency_length]
)
pixel_mask = floats_tensor([self.batch_size, self.expected_pixel_seq_len])
audio_mask = floats_tensor([self.batch_size, self.expected_audio_seq_len])
config = self.get_config()
return (config, pixel_values, audio_values, pixel_mask, audio_mask)
def prepare_config_and_inputs_for_pretraining(self):
pixel_values = floats_tensor(
[self.batch_size, self.num_frames, self.num_image_channels, self.image_size, self.image_size]
)
audio_values = floats_tensor(
[self.batch_size, self.num_audio_channels, self.spectrogram_length, self.frequency_length]
)
pixel_mask = floats_tensor([self.batch_size, self.expected_pixel_seq_len])
audio_mask = floats_tensor([self.batch_size, self.expected_audio_seq_len])
pixel_values_mixed = floats_tensor(
[self.batch_size, self.num_frames, self.num_image_channels, self.image_size, self.image_size]
)
pixel_mask_mixed = floats_tensor([self.batch_size, self.expected_pixel_seq_len])
labels = floats_tensor([self.batch_size])
config = self.get_config()
return (
config,
pixel_values,
audio_values,
pixel_mask,
audio_mask,
pixel_values_mixed,
pixel_mask_mixed,
labels,
)
def get_config(self):
return TvltConfig(
image_size=self.image_size,
spectrogram_length=self.spectrogram_length,
frequency_length=self.frequency_length,
image_patch_size=self.image_patch_size,
audio_patch_size=self.audio_patch_size,
num_image_channels=self.num_image_channels,
num_audio_channels=self.num_audio_channels,
num_frames=self.num_frames,
hidden_size=self.hidden_size,
num_hidden_layers=self.num_hidden_layers,
num_attention_heads=self.num_attention_heads,
intermediate_size=self.intermediate_size,
hidden_act=self.hidden_act,
hidden_dropout_prob=self.hidden_dropout_prob,
attention_probs_dropout_prob=self.attention_probs_dropout_prob,
initializer_range=self.initializer_range,
layer_norm_eps=self.layer_norm_eps,
qkv_bias=self.qkv_bias,
use_mean_pooling=self.use_mean_pooling,
decoder_num_attention_heads=self.decoder_num_attention_heads,
decoder_hidden_size=self.decoder_hidden_size,
decoder_num_hidden_layers=self.decoder_num_hidden_layers,
decoder_intermediate_size=self.decoder_intermediate_size,
image_mask_ratio=self.image_mask_ratio,
audio_mask_ratio=self.audio_mask_ratio,
task_matching=self.task_matching,
task_mae=self.task_mae,
num_labels=self.num_labels,
)
def create_and_check_model(self, config, pixel_values, audio_values, pixel_mask, audio_mask):
model = TvltModel(config=config)
model.to(torch_device)
model.eval()
result = model(pixel_values, audio_values, pixel_mask=pixel_mask, audio_mask=audio_mask)
result = model(pixel_values, audio_values)
self.parent.assertEqual(
result.last_hidden_state.shape, (self.batch_size, self.expected_seq_len, self.hidden_size)
)
def create_and_check_for_audiovisual_classification(
self, config, pixel_values, audio_values, pixel_mask, audio_mask
):
model = TvltForAudioVisualClassification(config=config)
model.to(torch_device)
model.eval()
result = model(pixel_values, audio_values, pixel_mask=pixel_mask, audio_mask=audio_mask)
result = model(pixel_values, audio_values)
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
def create_and_check_for_pretraining(
self,
config,
pixel_values,
audio_values,
pixel_mask,
audio_mask,
pixel_values_mixed,
pixel_mask_mixed,
labels,
):
model = TvltForPreTraining(config=config)
model.to(torch_device)
model.train()
result = model(
pixel_values,
audio_values,
pixel_mask,
audio_mask,
pixel_values_mixed=pixel_values_mixed,
pixel_mask_mixed=pixel_mask_mixed,
labels=labels,
)
self.parent.assertEqual(
result.pixel_logits.shape, (self.batch_size, self.expected_pixel_seq_len, self.image_mae_output_dim)
)
self.parent.assertEqual(
result.audio_logits.shape, (self.batch_size, self.expected_audio_seq_len, self.audio_mae_output_dim)
)
self.parent.assertEqual(result.matching_logits.shape, (self.batch_size, self.num_labels))
def create_and_check_for_pretraining_inference(
self,
config,
pixel_values,
audio_values,
pixel_mask,
audio_mask,
pixel_values_mixed,
pixel_mask_mixed,
labels,
):
model = TvltForPreTraining(config=config)
model.to(torch_device)
model.eval()
result = model(
pixel_values,
audio_values,
pixel_mask,
audio_mask,
pixel_values_mixed=pixel_values_mixed,
pixel_mask_mixed=pixel_mask_mixed,
labels=labels,
)
if result.pixel_logits is not None:
self.parent.assertEqual(
result.pixel_logits.shape, (self.batch_size, self.expected_pixel_seq_len, self.image_mae_output_dim)
)
if result.audio_logits is not None:
self.parent.assertEqual(
result.audio_logits.shape, (self.batch_size, self.expected_audio_seq_len, self.audio_mae_output_dim)
)
self.parent.assertEqual(result.matching_logits.shape, (self.batch_size, self.num_labels))
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
(config, pixel_values, audio_values, pixel_mask, audio_mask) = config_and_inputs
inputs_dict = {
"pixel_values": pixel_values,
"audio_values": audio_values,
"pixel_mask": pixel_mask,
"audio_mask": audio_mask,
}
return config, inputs_dict
def prepare_pixel_values(self):
return floats_tensor(
[self.batch_size, self.num_frames, self.num_image_channels, self.image_size, self.image_size]
)
def prepare_audio_values(self):
return floats_tensor(
[self.batch_size, self.num_audio_channels, self.spectrogram_length, self.frequency_length]
)
@require_torch
@unittest.skipIf(not is_torch_greater_or_equal_than_1_10, "TVLT is only available in torch v1.10+")
class TvltModelTest(ModelTesterMixin, unittest.TestCase):
all_model_classes = (
(TvltModel, TvltForPreTraining, TvltForAudioVisualClassification) if is_torch_available() else ()
)
fx_compatible = False
test_pruning = False
test_headmasking = False
test_torchscript = False
test_resize_embeddings = False
main_input_name = "pixel_values"
# TvltForAudioVisualClassification and TvltForPreTraining require special treatment
def _prepare_for_class(self, inputs_dict, model_class, return_labels=True):
inputs_dict = copy.deepcopy(inputs_dict)
if return_labels:
if model_class.__name__ == "TvltForAudioVisualClassification":
inputs_dict["labels"] = torch.zeros(
(self.model_tester.batch_size,), dtype=torch.long, device=torch_device
)
elif model_class.__name__ == "TvltForPreTraining":
inputs_dict["labels"] = torch.zeros(
(self.model_tester.batch_size,), dtype=torch.float, device=torch_device
)
inputs_dict["pixel_values_mixed"] = torch.zeros(
(
self.model_tester.batch_size,
self.model_tester.num_frames,
self.model_tester.num_image_channels,
self.model_tester.image_size,
self.model_tester.image_size,
),
dtype=torch.float,
device=torch_device,
)
inputs_dict["pixel_mask_mixed"] = torch.zeros(
(self.model_tester.batch_size, self.model_tester.expected_pixel_seq_len),
dtype=torch.float,
device=torch_device,
)
return inputs_dict
def setUp(self):
self.model_tester = TvltModelTester(self)
self.config_tester = ConfigTester(self, config_class=TvltConfig, has_text_modality=False, hidden_size=37)
def test_config(self):
self.config_tester.run_common_tests()
@unittest.skip(reason="TVLT does not use inputs_embeds")
def test_inputs_embeds(self):
pass
def test_model_common_attributes(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
input_embeddings = model.get_input_embeddings()
self.assertIsInstance(input_embeddings, (tuple))
for embedding in input_embeddings:
self.assertIsInstance(embedding, (nn.Module))
x = model.get_output_embeddings()
self.assertTrue(x is None or isinstance(x, nn.Linear))
def test_forward_signature(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
signature = inspect.signature(model.forward)
# signature.parameters is an OrderedDict => so arg_names order is deterministic
arg_names = [*signature.parameters.keys()]
expected_arg_names = ["pixel_values", "audio_values"]
self.assertListEqual(arg_names[:2], expected_arg_names)
def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)
def test_for_audiovisual_classification(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_audiovisual_classification(*config_and_inputs)
def test_for_pretraining(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs_for_pretraining()
self.model_tester.create_and_check_for_pretraining(*config_and_inputs)
self.model_tester.create_and_check_for_pretraining_inference(*config_and_inputs)
@slow
def test_model_from_pretrained(self):
for model_name in TVLT_PRETRAINED_MODEL_ARCHIVE_LIST:
model = TvltModel.from_pretrained(model_name)
self.assertIsNotNone(model)
def test_training(self):
if not self.model_tester.is_training:
return
for model_class in self.all_model_classes[1:]:
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
config.return_dict = True
model = model_class(config)
model.to(torch_device)
model.train()
inputs = self._prepare_for_class(inputs_dict, model_class)
loss = model(**inputs).loss
loss.backward()
def test_training_gradient_checkpointing(self):
if not self.model_tester.is_training:
return
for model_class in self.all_model_classes[1:]:
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
config.use_cache = False
config.return_dict = True
model = model_class(config)
model.to(torch_device)
model.gradient_checkpointing_enable()
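            # with gradient checkpointing, activations are recomputed during backward to save memory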
model.train()
inputs = self._prepare_for_class(inputs_dict, model_class)
loss = model(**inputs).loss
loss.backward()
def test_attention_outputs(self):
if not self.has_attentions:
pass
else:
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
config.return_dict = True
for model_class in self.all_model_classes[2:]:
seq_len = self.model_tester.expected_seq_len
inputs_dict["output_attentions"] = True
inputs_dict["output_hidden_states"] = False
config.return_dict = True
model = model_class(config)
model.to(torch_device)
model.eval()
with torch.no_grad():
outputs = model(**self._prepare_for_class(inputs_dict, model_class))
attentions = outputs.attentions
self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
# check that output_attentions also work using config
del inputs_dict["output_attentions"]
config.output_attentions = True
model = model_class(config)
model.to(torch_device)
model.eval()
with torch.no_grad():
outputs = model(**self._prepare_for_class(inputs_dict, model_class))
attentions = outputs.attentions
self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
self.assertListEqual(
list(attentions[0].shape[-3:]),
[self.model_tester.num_attention_heads, seq_len, seq_len],
)
out_len = len(outputs)
# Check attention is always last and order is fine
inputs_dict["output_attentions"] = True
inputs_dict["output_hidden_states"] = True
model = model_class(config)
model.to(torch_device)
model.eval()
with torch.no_grad():
outputs = model(**self._prepare_for_class(inputs_dict, model_class))
self.assertEqual(out_len + 1, len(outputs))
self_attentions = outputs.attentions
self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
self.assertListEqual(
list(self_attentions[0].shape[-3:]),
[self.model_tester.num_attention_heads, seq_len, seq_len],
)
def test_hidden_states_output(self):
def check_hidden_states_output(inputs_dict, config, model_class):
model = model_class(config)
model.to(torch_device)
model.eval()
with torch.no_grad():
outputs = model(**self._prepare_for_class(inputs_dict, model_class))
hidden_states = outputs.hidden_states
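            # one hidden state per layer plus the initial embedding output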
expected_num_layers = self.model_tester.num_hidden_layers + 1
self.assertEqual(len(hidden_states), expected_num_layers)
seq_length = self.model_tester.expected_seq_len
self.assertListEqual(
list(hidden_states[0].shape[-2:]),
[seq_length, self.model_tester.hidden_size],
)
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes[2:]:
inputs_dict["output_hidden_states"] = True
check_hidden_states_output(inputs_dict, config, model_class)
# check that output_hidden_states also work using config
del inputs_dict["output_hidden_states"]
config.output_hidden_states = True
check_hidden_states_output(inputs_dict, config, model_class)
# We will verify our results on a video of eating spaghetti
# Frame indices used: [164 168 172 176 181 185 189 193 198 202 206 210 215 219 223 227]
def prepare_video(num_frames=8):
file = hf_hub_download(
repo_id="hf-internal-testing/spaghetti-video", filename="eating_spaghetti.npy", repo_type="dataset"
)
video = np.load(file)[:num_frames]
return list(video)
def prepare_audio(num_samples=1):
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
# automatic decoding with librispeech
speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
return [x["array"] for x in speech_samples]
@require_torch
@require_vision
class TvltModelIntegrationTest(unittest.TestCase):
@cached_property
def default_feature_extractor(self):
# logits were tested with a different mean and std, so we use the same here
return (
TvltImageProcessor() if is_vision_available() else None,
TvltFeatureExtractor(),
)
def test_inference_for_base_model(self):
model = TvltModel.from_pretrained("ZinengTang/tvlt-base").to(torch_device)
image_processor, audio_feature_extractor = self.default_feature_extractor
video = prepare_video()
audio = prepare_audio()
video_inputs = image_processor(video, return_tensors="pt").to(torch_device)
audio_inputs = audio_feature_extractor(audio, return_tensors="pt").to(torch_device)
inputs = dict()
inputs.update(video_inputs)
inputs.update(audio_inputs)
# forward pass
with torch.no_grad():
outputs = model(**inputs)
# verify the logits
expected_last_hidden_state_slice = torch.tensor([[-0.0186, -0.0691], [0.0242, -0.0398]])
self.assertTrue(
torch.allclose(outputs.last_hidden_state[:, :2, :2], expected_last_hidden_state_slice, atol=1e-4)
)
def test_inference_for_pretraining(self):
model = TvltForPreTraining.from_pretrained("ZinengTang/tvlt-base").to(torch_device)
image_processor, audio_feature_extractor = self.default_feature_extractor
video = prepare_video()
video_mixed = prepare_video()
audio = prepare_audio()
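        # mask_pixel / mask_audio request pretraining masks from the processors;
        # is_mixed marks the mismatched video used for the audio-visual matching objective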
video_inputs = image_processor(video, return_tensors="pt", mask_pixel=True).to(torch_device)
video_mixed_inputs = image_processor(video_mixed, is_mixed=True, return_tensors="pt").to(torch_device)
audio_inputs = audio_feature_extractor(audio, return_tensors="pt", mask_audio=True).to(torch_device)
labels = torch.tensor([[0.0]], device=torch_device)
inputs = dict()
inputs.update(video_inputs)
inputs.update(video_mixed_inputs)
inputs.update(audio_inputs)
inputs.update({"labels": labels})
# forward pass
with torch.no_grad():
outputs = model(**inputs)
# verify the logits
expected_pixel_logits_shape = torch.Size([1, 1568, 768])
expected_audio_logits_shape = torch.Size([1, 96, 256])
expected_matching_logits_shape = torch.Size([1, 1])
if outputs.pixel_logits is not None:
self.assertEqual(outputs.pixel_logits.shape, expected_pixel_logits_shape)
if outputs.audio_logits is not None:
self.assertEqual(outputs.audio_logits.shape, expected_audio_logits_shape)
        self.assertEqual(outputs.matching_logits.shape, expected_matching_logits_shape)


@@ -0,0 +1,116 @@
# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import shutil
import tempfile
import unittest
import numpy as np
import pytest
from transformers import is_speech_available, is_vision_available
from transformers.testing_utils import require_torch
if is_vision_available():
from transformers import TvltImageProcessor
if is_speech_available():
from transformers import TvltFeatureExtractor
from transformers import TvltProcessor
@require_torch
class TvltProcessorTest(unittest.TestCase):
def setUp(self):
self.checkpoint = "ZinengTang/tvlt-base"
self.tmpdirname = tempfile.mkdtemp()
def get_image_processor(self, **kwargs):
return TvltImageProcessor.from_pretrained(self.checkpoint, **kwargs)
def get_feature_extractor(self, **kwargs):
return TvltFeatureExtractor.from_pretrained(self.checkpoint, **kwargs)
def tearDown(self):
shutil.rmtree(self.tmpdirname)
def test_save_load_pretrained_default(self):
image_processor = self.get_image_processor()
feature_extractor = self.get_feature_extractor()
processor = TvltProcessor(image_processor=image_processor, feature_extractor=feature_extractor)
processor.save_pretrained(self.tmpdirname)
processor = TvltProcessor.from_pretrained(self.tmpdirname)
self.assertIsInstance(processor.feature_extractor, TvltFeatureExtractor)
self.assertIsInstance(processor.image_processor, TvltImageProcessor)
def test_feature_extractor(self):
image_processor = self.get_image_processor()
feature_extractor = self.get_feature_extractor()
processor = TvltProcessor(image_processor=image_processor, feature_extractor=feature_extractor)
audio = np.ones([12000])
audio_dict = feature_extractor(audio, return_tensors="np")
input_processor = processor(audio=audio, return_tensors="np")
for key in audio_dict.keys():
self.assertAlmostEqual(audio_dict[key].sum(), input_processor[key].sum(), delta=1e-2)
def test_image_processor(self):
image_processor = self.get_image_processor()
feature_extractor = self.get_feature_extractor()
processor = TvltProcessor(image_processor=image_processor, feature_extractor=feature_extractor)
images = np.ones([3, 224, 224])
image_dict = image_processor(images, return_tensors="np")
input_processor = processor(images=images, return_tensors="np")
for key in image_dict.keys():
self.assertAlmostEqual(image_dict[key].sum(), input_processor[key].sum(), delta=1e-2)
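    # when both audio and images are passed, the processor should return the union of both sets of model inputs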
def test_processor(self):
image_processor = self.get_image_processor()
feature_extractor = self.get_feature_extractor()
processor = TvltProcessor(image_processor=image_processor, feature_extractor=feature_extractor)
audio = np.ones([12000])
images = np.ones([3, 224, 224])
inputs = processor(audio=audio, images=images)
self.assertListEqual(list(inputs.keys()), ["audio_values", "audio_mask", "pixel_values", "pixel_mask"])
# test if it raises when no input is passed
with pytest.raises(ValueError):
processor()
def test_model_input_names(self):
image_processor = self.get_image_processor()
feature_extractor = self.get_feature_extractor()
processor = TvltProcessor(image_processor=image_processor, feature_extractor=feature_extractor)
self.assertListEqual(
processor.model_input_names,
image_processor.model_input_names + feature_extractor.model_input_names,
msg="`processor` and `image_processor`+`feature_extractor` model input names do not match",
)

utils/check_repo.py Executable file → Normal file

@@ -287,6 +287,7 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [
"AltCLIPTextModel",
"AltCLIPVisionModel",
"AltRobertaModel",
"TvltForAudioVisualClassification",
"SpeechT5ForSpeechToSpeech",
"SpeechT5ForTextToSpeech",
"SpeechT5HifiGan",