From 135e86aa54577cc0a5cafcb595b2d73d79a05add Mon Sep 17 00:00:00 2001
From: Arthur Zucker
Date: Mon, 20 Jan 2025 13:40:57 +0100
Subject: [PATCH] Remove read_video and run

---
 read_video.py |  77 ------------------------------------
 run.py        | 107 --------------------------------------------------
 2 files changed, 184 deletions(-)
 delete mode 100644 read_video.py
 delete mode 100644 run.py

diff --git a/read_video.py b/read_video.py
deleted file mode 100644
index 25e201a6e48..00000000000
--- a/read_video.py
+++ /dev/null
@@ -1,77 +0,0 @@
-import numpy as np
-import cv2
-import requests
-from yt_dlp import YoutubeDL
-from contextlib import redirect_stdout
-from pathlib import Path
-import io
-import imageio.v3 as iio
-
-
-url = "https://test-videos.co.uk/vids/bigbuckbunny/mp4/h264/720/Big_Buck_Bunny_720_10s_10MB.mp4"
-vid = cv2.VideoCapture(url)
-# ret, frame = vid.read()
-
-while(True):
-    # Capture frame-by-frame
-    ret, frame = vid.read()
-    #print cap.isOpened(), ret
-    if frame is not None:
-        pass
-        # print(frame.shape)
-    else:
-        break
-
-print(vid.isOpened(), frame is not None)
-
-buffer = io.BytesIO(requests.get(url).content)
-video = buffer.getvalue()
-frames = iio.imread(video, index=None)
-print(frames.shape)
-
-
-
-
-
-youtube_id = "https://www.youtube.com/watch?v=BaW_jenozKc"
-
-ctx = {
-    "outtmpl": "-",
-    'logtostderr': True
-}
-
-buffer = io.BytesIO()
-with redirect_stdout(buffer), YoutubeDL(ctx) as foo:
-    foo.download([youtube_id])
-# Path(f"vi.mp4").write_bytes(buffer.getvalue())
-
-video = buffer.getvalue()
-print(type(video))
-frames = iio.imread(video, index=None)
-print(frames.shape)
-
-
-import decord
-file_obj = io.BytesIO(video)
-container = decord.VideoReader(file_obj)
-print(container[2].shape)
-
-# print(np.frombuffer(video, dtype=np.uint8).shape)
-# img_array = np.asarray(bytearray(video), dtype=np.uint8)
-# im = cv2.imdecode(img_array, cv2.IMREAD_UNCHANGED)
-
-
-
-import av
-
-file_obj = io.BytesIO(video)
-container = av.open(file_obj)
-container.seek(0)
-frames = []
-for i, frame in enumerate(container.decode(video=0)):
-    if i > 10:
-        break
-    if i >= 0:
-        frames.append(frame)
-out = np.stack([x.to_ndarray(format="rgb24") for x in frames])
-print(out.shape)
diff --git a/run.py b/run.py
deleted file mode 100644
index b79ba1ecf3f..00000000000
--- a/run.py
+++ /dev/null
@@ -1,107 +0,0 @@
-import av
-import torch
-import decord
-from decord import VideoReader, cpu
-
-import numpy as np
-from PIL import Image
-from huggingface_hub import hf_hub_download
-from transformers import LlavaNextVideoProcessor, LlavaNextVideoForConditionalGeneration, SiglipImageProcessor
-
-model_id = "/raid/raushan/llava-next-video-qwen-7b"
-
-model = LlavaNextVideoForConditionalGeneration.from_pretrained(
-    model_id,
-    torch_dtype=torch.bfloat16,
-    low_cpu_mem_usage=True,
-).to(0)
-
-processor = LlavaNextVideoProcessor.from_pretrained(model_id, torch_dtype=torch.bfloat16)
-img_proc = SiglipImageProcessor.from_pretrained("google/siglip-so400m-patch14-384")
-
-image = Image.open("/raid/raushan/image.png")
-
-
-def load_video(video_path, max_frames_num,fps=1,force_sample=False):
-
-    vr = VideoReader(video_path)
-    total_frame_num = len(vr)
-    video_time = total_frame_num / vr.get_avg_fps()
-    fps = round(vr.get_avg_fps()/fps)
-    frame_idx = [i for i in range(0, len(vr), fps)]
-    frame_time = [i/fps for i in frame_idx]
-    if len(frame_idx) > max_frames_num or force_sample:
-        sample_fps = max_frames_num
-        uniform_sampled_frames = np.linspace(0, total_frame_num - 1, sample_fps, dtype=int)
-        frame_idx = uniform_sampled_frames.tolist()
-        frame_time = [i/vr.get_avg_fps() for i in frame_idx]
-    frame_time = ",".join([f"{i:.2f}s" for i in frame_time])
-    spare_frames = vr.get_batch(frame_idx).asnumpy()
-    print(spare_frames.shape)
-    return spare_frames,frame_time,video_time
-
-
-def read_video_pyav(container, indices):
-    '''
-    Decode the video with PyAV decoder.
-    Args:
-        container (`av.container.input.InputContainer`): PyAV container.
-        indices (`List[int]`): List of frame indices to decode.
-    Returns:
-        result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
-    '''
-    frames = []
-    container.seek(0)
-    start_index = indices[0]
-    end_index = indices[-1]
-    for i, frame in enumerate(container.decode(video=0)):
-        if i > end_index:
-            break
-        if i >= start_index and i in indices:
-            frames.append(frame)
-    return np.stack([x.to_ndarray(format="rgb24") for x in frames])
-
-
-# define a chat history and use `apply_chat_template` to get correctly formatted prompt
-# Each value in "content" has to be a list of dicts with types ("text", "image", "video")
-# <|im_start|>system
-# You are a helpful assistant.<|im_end|>
-# <|im_start|>user
-# Time farmes are this moments and we ahev 64 frames
-# Please describe this video in detail.<|im_end|>
-# <|im_start|>assistant
-
-conversation = [
-    {
-
-        "role": "system",
-        "content": [
-            {"type": "text", "text": "You are a helpful assistant."},
-        ],
-    },
-    {
-
-        "role": "user",
-        "content": [
-            {"type": "text", "text": "The video lasts for 19.97 seconds, and 64 frames are uniformly sampled from it. These frames are located at 0.00s,0.30s,0.60s,0.93s,1.23s,1.57s,1.87s,2.20s,2.50s,2.83s,3.13s,3.47s,3.77s,4.10s,4.40s,4.73s,5.03s,5.37s,5.67s,6.00s,6.30s,6.63s,6.93s,7.27s,7.57s,7.90s,8.20s,8.53s,8.83s,9.17s,9.47s,9.80s,10.10s,10.43s,10.73s,11.07s,11.37s,11.70s,12.00s,12.33s,12.63s,12.97s,13.27s,13.60s,13.90s,14.23s,14.53s,14.87s,15.17s,15.50s,15.80s,16.13s,16.43s,16.77s,17.07s,17.40s,17.70s,18.03s,18.33s,18.67s,18.97s,19.30s,19.60s,19.93s.Please answer the following questions related to this video.\nPlease describe this video in detail."},
-            {"type": "video"},
-        ],
-    },
-]
-
-prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
-prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n