From c25f27fa6a2c04fb344a55f817b4976dd823c0c9 Mon Sep 17 00:00:00 2001
From: NielsRogge <48327001+NielsRogge@users.noreply.github.com>
Date: Wed, 7 Sep 2022 12:24:12 +0200
Subject: [PATCH] [VideoMAE] Improve code examples (#18919)

* Simplify code example

* Add seed
---
 .../models/videomae/modeling_videomae.py | 29 +++++++++----------
 1 file changed, 13 insertions(+), 16 deletions(-)

diff --git a/src/transformers/models/videomae/modeling_videomae.py b/src/transformers/models/videomae/modeling_videomae.py
index a807ed7208f..7efff490d8c 100644
--- a/src/transformers/models/videomae/modeling_videomae.py
+++ b/src/transformers/models/videomae/modeling_videomae.py
@@ -598,21 +598,18 @@ class VideoMAEModel(VideoMAEPreTrainedModel):
         >>> file_path = hf_hub_download(
         ...     repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
         ... )
-        >>> vr = VideoReader(file_path, num_threads=1, ctx=cpu(0))
+        >>> videoreader = VideoReader(file_path, num_threads=1, ctx=cpu(0))
 
         >>> # sample 16 frames
-        >>> vr.seek(0)
-        >>> indices = sample_frame_indices(clip_len=16, frame_sample_rate=4, seg_len=len(vr))
-        >>> buffer = vr.get_batch(indices).asnumpy()
-
-        >>> # create a list of NumPy arrays
-        >>> video = [buffer[i] for i in range(buffer.shape[0])]
+        >>> videoreader.seek(0)
+        >>> indices = sample_frame_indices(clip_len=16, frame_sample_rate=4, seg_len=len(videoreader))
+        >>> video = videoreader.get_batch(indices).asnumpy()
 
         >>> feature_extractor = VideoMAEFeatureExtractor.from_pretrained("MCG-NJU/videomae-base")
         >>> model = VideoMAEModel.from_pretrained("MCG-NJU/videomae-base")
 
         >>> # prepare video for the model
-        >>> inputs = feature_extractor(video, return_tensors="pt")
+        >>> inputs = feature_extractor(list(video), return_tensors="pt")
 
         >>> # forward pass
         >>> outputs = model(**inputs)
@@ -943,10 +940,13 @@ class VideoMAEForVideoClassification(VideoMAEPreTrainedModel):
         ```python
         >>> from decord import VideoReader, cpu
         >>> import torch
+        >>> import numpy as np
 
         >>> from transformers import VideoMAEFeatureExtractor, VideoMAEForVideoClassification
         >>> from huggingface_hub import hf_hub_download
 
+        >>> np.random.seed(0)
+
 
         >>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
         ...     converted_len = int(clip_len * frame_sample_rate)
@@ -961,20 +961,17 @@
         >>> file_path = hf_hub_download(
         ...     repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
         ... )
-        >>> vr = VideoReader(file_path, num_threads=1, ctx=cpu(0))
+        >>> videoreader = VideoReader(file_path, num_threads=1, ctx=cpu(0))
 
         >>> # sample 16 frames
-        >>> vr.seek(0)
-        >>> indices = sample_frame_indices(clip_len=16, frame_sample_rate=4, seg_len=len(vr))
-        >>> buffer = vr.get_batch(indices).asnumpy()
-
-        >>> # create a list of NumPy arrays
-        >>> video = [buffer[i] for i in range(buffer.shape[0])]
+        >>> videoreader.seek(0)
+        >>> indices = sample_frame_indices(clip_len=16, frame_sample_rate=4, seg_len=len(videoreader))
+        >>> video = videoreader.get_batch(indices).asnumpy()
 
         >>> feature_extractor = VideoMAEFeatureExtractor.from_pretrained("MCG-NJU/videomae-base-finetuned-kinetics")
         >>> model = VideoMAEForVideoClassification.from_pretrained("MCG-NJU/videomae-base-finetuned-kinetics")
 
-        >>> inputs = feature_extractor(video, return_tensors="pt")
+        >>> inputs = feature_extractor(list(video), return_tensors="pt")
 
         >>> with torch.no_grad():
         ...     outputs = model(**inputs)
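
For convenience, here is the patched classification example assembled into a single runnable script. This is a reviewer sketch, not part of the patch: it assumes `decord` is installed, the body of `sample_frame_indices` beyond its first statement is an assumption (the diff only shows `converted_len = int(clip_len * frame_sample_rate)` as context), and the final two prediction lines are added for illustration.

```python
# End-to-end sketch of the patched VideoMAEForVideoClassification example.
from decord import VideoReader, cpu
import torch
import numpy as np

from transformers import VideoMAEFeatureExtractor, VideoMAEForVideoClassification
from huggingface_hub import hf_hub_download

np.random.seed(0)  # added by the patch so the randomly sampled clip is reproducible


def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
    # Pick a random window of clip_len * frame_sample_rate frames and spread
    # clip_len indices evenly across it (assumed implementation; the diff only
    # shows the first line of this helper).
    converted_len = int(clip_len * frame_sample_rate)
    end_idx = np.random.randint(converted_len, seg_len)
    start_idx = end_idx - converted_len
    indices = np.linspace(start_idx, end_idx, num=clip_len)
    return np.clip(indices, start_idx, end_idx - 1).astype(np.int64)


file_path = hf_hub_download(
    repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
)
videoreader = VideoReader(file_path, num_threads=1, ctx=cpu(0))

# sample 16 frames as a single (16, height, width, 3) NumPy array
videoreader.seek(0)
indices = sample_frame_indices(clip_len=16, frame_sample_rate=4, seg_len=len(videoreader))
video = videoreader.get_batch(indices).asnumpy()

feature_extractor = VideoMAEFeatureExtractor.from_pretrained("MCG-NJU/videomae-base-finetuned-kinetics")
model = VideoMAEForVideoClassification.from_pretrained("MCG-NJU/videomae-base-finetuned-kinetics")

# the feature extractor expects a sequence of frames, hence list(video)
inputs = feature_extractor(list(video), return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)

# illustration only: map the top logit to a Kinetics label
predicted_class = outputs.logits.argmax(-1).item()
print(model.config.id2label[predicted_class])
```

On the simplification itself: `VideoReader.get_batch(indices).asnumpy()` returns one `(num_frames, height, width, 3)` array, and iterating over its first axis yields per-frame arrays, so `list(video)` replaces the old explicit list comprehension over `buffer`. Seeding NumPy makes the randomly chosen clip window deterministic, which keeps the documented output stable across runs.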