transformers/benchmark.py
import os
import time

import av
import cv2
import numpy as np
import torch
from decord import VideoReader, cpu, gpu
from numba import jit, cuda  # only needed if the @jit decorator below is enabled
from torchvision import io

video_dir = "/raid/raushan/temp_dir/"
NUM_FRAMES = 32


# @jit(nopython=True, target_backend='cuda')  # <-- uncomment if you have a CUDA GPU
def process_video_cv2(video: cv2.VideoCapture, indices: np.ndarray, length: int):
    index = 0
    frames = []
    while video.isOpened():
        success, frame = video.read()
        if not success:
            # stop on read failure instead of touching a None frame
            break
        if index in indices:
            # OpenCV decodes to BGR channel order (0: B, 1: G, 2: R)
            height, width, channel = frame.shape
            frames.append(frame[0:height, 0:width, 0:channel])
        index += 1
        if index >= length:
            break
    video.release()
    return frames
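
# Caveat (an observation, not a fix applied by this benchmark): OpenCV yields
# frames in BGR order, while the Decord/PyAV readers below return RGB, so pixel
# values are not directly comparable across backends. A sketch of the conversion,
# if needed:
#
#     frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)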


def read_video_opencv(video_path, num_frames=NUM_FRAMES):
    '''
    Decode the video with the OpenCV decoder.
    Args:
        video_path (str): Path to the video file.
        num_frames (int): Number of frames to sample uniformly. Defaults to NUM_FRAMES.
    Returns:
        np.ndarray: Array of decoded frames of shape (num_frames, height, width, 3).
    '''
    video = cv2.VideoCapture(video_path)
    total_num_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
    indices = np.arange(0, total_num_frames, total_num_frames / num_frames).astype(int)
    frames = process_video_cv2(video, indices, total_num_frames)
    return np.stack(frames)
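
# A possible refinement (an assumption, not what this benchmark measures): the
# np.arange call above can return slightly more or fewer than num_frames indices
# because of the fractional step; np.linspace always yields exactly num_frames:
#
#     indices = np.linspace(0, total_num_frames - 1, num_frames).astype(int)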


def read_video_decord(video_path, num_frames=NUM_FRAMES):
    '''
    Decode the video with the Decord decoder.
    Args:
        video_path (str): Path to the video file.
        num_frames (int): Number of frames to sample uniformly. Defaults to NUM_FRAMES.
    Returns:
        np.ndarray: Array of decoded frames of shape (num_frames, height, width, 3).
    '''
    vr = VideoReader(uri=video_path, ctx=cpu(0))  # install decord from source to use a gpu ctx
    indices = np.arange(0, len(vr), len(vr) / num_frames).astype(int)
    frames = vr.get_batch(indices).asnumpy()
    return frames


def read_video_pyav(video_path, num_frames=NUM_FRAMES):
    '''
    Decode the video with the PyAV decoder.
    Args:
        video_path (str): Path to the video file.
        num_frames (int): Number of frames to sample uniformly. Defaults to NUM_FRAMES.
    Returns:
        np.ndarray: Array of decoded frames of shape (num_frames, height, width, 3).
    '''
    container = av.open(video_path)
    # sample "num_frames" frames uniformly from the video
    total_frames = container.streams.video[0].frames
    indices = np.arange(0, total_frames, total_frames / num_frames).astype(int)
    frames = []
    container.seek(0)
    start_index = indices[0]
    end_index = indices[-1]
    for i, frame in enumerate(container.decode(video=0)):
        if i > end_index:
            break
        if i >= start_index and i in indices:
            frames.append(frame)
    return np.stack([x.to_ndarray(format="rgb24") for x in frames])


def read_video_torchvision(video_path, num_frames=NUM_FRAMES):
    '''
    Decode the video with the torchvision decoder.
    Args:
        video_path (str): Path to the video file.
        num_frames (int): Number of frames to sample uniformly. Defaults to NUM_FRAMES.
    Returns:
        torch.Tensor: Tensor of decoded frames of shape (num_frames, 3, height, width).
    '''
    video, _, _ = io.read_video(
        video_path,
        start_pts=0.0,
        end_pts=None,
        pts_unit="sec",
        output_format="TCHW",
    )
    idx = torch.linspace(0, video.size(0) - 1, num_frames, dtype=torch.int64)
    return video[idx]
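
# Note: torchvision's io.read_video decodes the entire clip into memory before we
# index frames, unlike the seek-and-skip strategies above, which likely
# contributes to its ranking in the timings below.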


decoders = {
    "decord": read_video_decord,
    "opencv": read_video_opencv,
    "av": read_video_pyav,
    "torchvision": read_video_torchvision,
}

for name, fn in decoders.items():
    start = time.perf_counter()
    for video_file in os.listdir(video_dir):
        path = os.path.join(video_dir, video_file)
        output = fn(path)
    end = time.perf_counter()
    print(f"Time taken for {name}: {(end - start):.04f} sec")
# Time taken for decord: 475.2979 sec
# Time taken for opencv: 614.6062 sec
# Time taken for av: 1067.0860 sec
# Time taken for torchvision: 1924.0433 sec
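
# Sanity check (a minimal sketch, not part of the timings above): confirm each
# decoder returns the expected number of sampled frames. Note the layout
# difference: torchvision returns (T, C, H, W) while the other readers return
# (T, H, W, C).
sample_path = os.path.join(video_dir, os.listdir(video_dir)[0])
for name, fn in decoders.items():
    frames = fn(sample_path)
    print(f"{name}: shape {tuple(frames.shape)}")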