|
import torch |
|
import gradio as gr |
|
from flash_vstream.serve.demo import Chat, title_markdown, block_css |
|
from flash_vstream.constants import * |
|
from flash_vstream.conversation import conv_templates, Conversation |
|
import os |
|
from PIL import Image |
|
import tempfile |
|
import imageio |
|
import shutil |
|
|
|
|
|
model_path = "IVGSZ/Flash-VStream-7b"  # HuggingFace hub id of the checkpoint to load

load_8bit = False  # load the LLM with 8-bit quantization (passed to Chat below)

load_4bit = False  # load the LLM with 4-bit quantization (passed to Chat below)
|
|
|
def save_image_to_local(image):
    """Persist an uploaded image as a JPEG under the local ``temp`` directory.

    Args:
        image: A path or file-like object accepted by ``PIL.Image.open``.

    Returns:
        Path of the saved ``.jpg`` file (inside ``temp/``).
    """
    # tempfile._get_candidate_names() is a private CPython API; mkstemp is the
    # supported, race-free way to reserve a unique filename.
    fd, filename = tempfile.mkstemp(suffix='.jpg', dir='temp')
    os.close(fd)  # PIL opens the path itself; release the raw descriptor.
    image = Image.open(image)
    image.save(filename)
    return filename
|
|
|
|
|
def save_video_to_local(video_path): |
|
filename = os.path.join('temp', next(tempfile._get_candidate_names()) + '.mp4') |
|
shutil.copyfile(video_path, filename) |
|
return filename |
|
|
|
|
|
def generate(video, textbox_in, first_run, state, state_, images_tensor):
    """Run one chat turn: encode the video (if any), query the model, and
    update both conversation states for the Gradio UI.

    Args:
        video: Path of the uploaded video, or a falsy value when absent.
        textbox_in: User prompt; empty when re-running the previous turn.
        first_run: Whether this is the first turn of the conversation.
        state: Conversation shown in the chatbot widget.
        state_: Internal Conversation passed to the model.
        images_tensor: Cached video features from a previous turn.

    Returns:
        7-tuple matching the click-handler outputs:
        (state, state_, chatbot messages, first_run, textbox update,
         images_tensor, video update).
    """
    # flag == 1: the user typed new text. flag == 0: regeneration — recover
    # the previous prompt from the last message of the internal state.
    flag = 1
    if not textbox_in:
        if len(state_.messages) > 0:
            textbox_in = state_.messages[-1][1]
            state_.messages.pop(-1)
            flag = 0
        else:
            # NOTE(review): this returns a bare string while the click handler
            # expects a 7-tuple of outputs; Gradio will error on this path.
            # Kept as-is to avoid changing the caller-visible contract.
            return "Please enter instruction"

    video = video if video else "none"

    # On the first call (or after clearing) Gradio hands us raw gr.State
    # placeholders, not Conversation objects — initialize fresh ones.
    if type(state) is not Conversation:
        state = conv_templates[conv_mode].copy()
        state_ = conv_templates[conv_mode].copy()
        images_tensor = []

    first_run = False if len(state.messages) > 0 else True

    text_en_in = textbox_in.replace("picture", "image")

    image_processor = handler.image_processor

    if os.path.exists(video):
        video_tensor = handler._get_rawvideo_dec(video, image_processor, max_frames=MAX_IMAGE_LENGTH)
        images_tensor = image_processor(video_tensor, return_tensors='pt')['pixel_values'].to(handler.model.device, dtype=torch.float16)
        print("video_tensor", video_tensor.shape)
        # Prepend the image token so the model attends to the video frames.
        text_en_in = DEFAULT_IMAGE_TOKEN + '\n' + text_en_in

    text_en_out, state_ = handler.generate(images_tensor, text_en_in, first_run=first_run, state=state_)
    state_.messages[-1] = (state_.roles[1], text_en_out)

    # Keep only the text before the first '#' — presumably a stop marker
    # emitted by the model; TODO confirm against handler.generate.
    text_en_out = text_en_out.split('#')[0]
    textbox_out = text_en_out

    show_images = ""
    if os.path.exists(video):
        filename = save_video_to_local(video)
        # BUG FIX: the original f-string contained no placeholder, so the
        # saved filename was never embedded and the video tag pointed nowhere.
        show_images += f'<video controls playsinline width="500" style="display: inline-block;" src="./file={filename}"></video>'

    if flag:
        state.append_message(state.roles[0], textbox_in + "\n" + show_images)
        state.append_message(state.roles[1], textbox_out)

    return (state, state_, state.to_gradio_chatbot(), False,
            gr.update(value=None, interactive=True), images_tensor,
            gr.update(value=None, interactive=True))
|
|
|
|
|
def regenerate(state, state_):
    """Drop the last message from both conversation states so the previous
    turn can be re-run by the chained `generate` call.

    Returns:
        (state, state_, chatbot messages, first_run) — first_run is True
        only when no messages remain after popping.
    """
    for conversation in (state, state_):
        conversation.messages.pop(-1)
    no_history = len(state.messages) == 0
    return state, state_, state.to_gradio_chatbot(), no_history
|
|
|
|
|
def clear_history(state, state_):
    """Reset both conversation states and blank out the UI widgets.

    Returns updates for, in order: video, textbox, first_run, state,
    state_, chatbot, images_tensor.
    """
    fresh = conv_templates[conv_mode].copy()
    fresh_ = conv_templates[conv_mode].copy()
    return (
        gr.update(value=None, interactive=True),  # video input
        gr.update(value=None, interactive=True),  # textbox
        True,                                     # first_run
        fresh,
        fresh_,
        fresh.to_gradio_chatbot(),
        [],                                       # images_tensor cache
    )
|
|
|
|
|
# ---- Global setup (runs at import time) ----
conv_mode = "vicuna_v1"  # conversation template used for every chat turn
handler = Chat(model_path, conv_mode=conv_mode, load_4bit=load_4bit, load_8bit=load_8bit)

# Directory where uploaded videos/images are copied so Gradio can serve them.
# exist_ok=True replaces the original check-then-create, which was race-prone.
os.makedirs("temp", exist_ok=True)

# Log GPU memory after model load for a quick sanity check.
print(torch.cuda.memory_allocated())
print(torch.cuda.max_memory_allocated())
|
|
|
# Build the Gradio UI: video input on the left, chatbot on the right,
# example prompts below, and Send / Regenerate / Clear controls.
with gr.Blocks(title='Flash-VStream', theme=gr.themes.Soft(), css=block_css) as demo:
    gr.Markdown(title_markdown)
    # Per-session state: `state` is the conversation shown in the chatbot,
    # `state_` the internal one passed to the model; `first_run` and
    # `images_tensor` carry the turn flag and cached video features
    # between event callbacks.
    state = gr.State()
    state_ = gr.State()
    first_run = gr.State()
    images_tensor = gr.State()

    with gr.Row():
        with gr.Column(scale=3):
            video = gr.Video(label="Input Video")

        with gr.Column(scale=7):
            # NOTE(review): .style() is the gradio 3.x API; newer versions
            # take height= directly in the Chatbot constructor.
            chatbot = gr.Chatbot(label="Flash-VStream", bubble_full_width=True).style(height=700)
            with gr.Row():
                with gr.Column(scale=8):
                    textbox = gr.Textbox(show_label=False,
                                         placeholder="Enter text and press Send",
                                         container=False)
                with gr.Column(scale=2, min_width=50):
                    submit_btn = gr.Button(value="Send", variant="primary", interactive=True)

    with gr.Row(visible=True) as button_row:
        regenerate_btn = gr.Button(value="🔄 Regenerate", interactive=True)
        clear_btn = gr.Button(value="🗑️ Clear history", interactive=True)

    # Example assets are resolved relative to this file's directory.
    cur_dir = os.path.dirname(os.path.abspath(__file__))

    # Clickable (video, prompt) examples that pre-fill the inputs.
    with gr.Row():
        gr.Examples(
            examples=[
                [
                    f"{cur_dir}/examples/video1.mp4",
                    "Describe the video briefly.",
                ]
            ],
            inputs=[video, textbox],
        )

        gr.Examples(
            examples=[
                [
                    f"{cur_dir}/examples/video4.mp4",
                    "What is the boy doing?",
                ]
            ],
            inputs=[video, textbox],
        )

        gr.Examples(
            examples=[
                [
                    f"{cur_dir}/examples/video5.mp4",
                    "Why is this video funny?",
                ]
            ],
            inputs=[video, textbox],
        )

    # Send: run one chat turn, then clear the textbox and video inputs.
    submit_btn.click(generate, [video, textbox, first_run, state, state_, images_tensor], [state, state_, chatbot, first_run, textbox, images_tensor, video])

    # Regenerate: drop the last answer, then re-run generate, which recovers
    # the previous prompt from state_ because the textbox is empty.
    regenerate_btn.click(regenerate, [state, state_], [state, state_, chatbot, first_run]).then(
        generate, [video, textbox, first_run, state, state_, images_tensor], [state, state_, chatbot, first_run, textbox, images_tensor, video])

    # Clear: reset both conversations and blank every widget.
    clear_btn.click(clear_history, [state, state_],
                    [video, textbox, first_run, state, state_, chatbot, images_tensor])


demo.launch()