Spaces:

aidealab
/

AIdeaLab-VideoJP

Running on Zero

App Files Files Community

AIdeaLab-VideoJP / app.py

alfredplpl

Update app.py

f62b8e3 verified 17 days ago

raw

history blame contribute delete

7.72 kB

	import gradio as gr

	import torch
	from transformers import AutoTokenizer, AutoModelForCausalLM
	from diffusers import (
	AutoencoderKLCogVideoX,
	CogVideoXTransformer3DModel,
	)
	from diffusers.utils import export_to_video
	import tqdm
	from torchvision.transforms import ToPILImage
	import os
	import spaces

	#from torchao.quantization import autoquant

	device="cuda"
	shape=(1,48//4,16,256//8,256//8)
	sample_N=25
	torch_dtype=torch.bfloat16
	eps=1
	cfg=2.5

	tokenizer = AutoTokenizer.from_pretrained(
	"llm-jp/llm-jp-3-1.8b"
	)

	text_encoder = AutoModelForCausalLM.from_pretrained(
	"llm-jp/llm-jp-3-1.8b",
	torch_dtype=torch_dtype
	)
	text_encoder=text_encoder.to(device)

	transformer = CogVideoXTransformer3DModel.from_pretrained(
	"aidealab/AIdeaLab-VideoJP",
	torch_dtype=torch_dtype,
	token=os.environ['TOKEN']
	)
	#transformer = autoquant(transformer, error_on_unseen=False)
	#transformer.to(memory_format=torch.channels_last)
	#transformer = torch.compile(transformer, mode="max-autotune", fullgraph=True)
	transformer=transformer.to(device)

	vae = AutoencoderKLCogVideoX.from_pretrained(
	"THUDM/CogVideoX-2b",
	subfolder="vae"
	)
	vae=vae.to(dtype=torch_dtype, device=device)
	vae.enable_slicing()
	vae.enable_tiling()

	@spaces.GPU
	def text_to_video(prompt, cfg=cfg):
	text_inputs = tokenizer(
	prompt,
	padding="max_length",
	max_length=512,
	truncation=True,
	add_special_tokens=True,
	return_tensors="pt",
	)
	text_input_ids = text_inputs.input_ids
	prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True, attention_mask=text_inputs.attention_mask.to(device)).hidden_states[-1]
	prompt_embeds = prompt_embeds.to(dtype=torch_dtype, device=device)

	null_text_inputs = tokenizer(
	"",
	padding="max_length",
	max_length=512,
	truncation=True,
	add_special_tokens=True,
	return_tensors="pt",
	)
	null_text_input_ids = null_text_inputs.input_ids
	null_prompt_embeds = text_encoder(null_text_input_ids.to(device), output_hidden_states=True, attention_mask=null_text_inputs.attention_mask.to(device)).hidden_states[-1]
	null_prompt_embeds = null_prompt_embeds.to(dtype=torch_dtype, device=device)

	# euler discreate sampler with cfg
	z0 = torch.randn(shape, device=device)
	latents = z0.detach().clone().to(torch_dtype)

	dt = 1.0 / sample_N
	with torch.no_grad():
	for i in tqdm.tqdm(range(sample_N)):
	num_t = i / sample_N
	t = torch.ones(shape[0], device=device) * num_t
	psudo_t=(1000-eps)*(1-t)+eps
	positive_conditional = transformer(hidden_states=latents, timestep=psudo_t, encoder_hidden_states=prompt_embeds, image_rotary_emb=None)
	null_conditional = transformer(hidden_states=latents, timestep=psudo_t, encoder_hidden_states=null_prompt_embeds, image_rotary_emb=None)
	pred = null_conditional.sample+cfg*(positive_conditional.sample-null_conditional.sample)
	latents = latents.detach().clone() + dt * pred.detach().clone()

	# Free vram
	latents = latents / vae.config.scaling_factor
	latents = latents.permute(0, 2, 1, 3, 4) # [B, F, C, H, W]
	x=vae.decode(latents).sample
	x = x / 2 + 0.5
	x = x.clamp(0,1)
	x=x.permute(0, 2, 1, 3, 4).to(torch.float32)# [B, F, C, H, W]
	print(x.shape)
	x=[ToPILImage()(frame) for frame in x[0]]

	export_to_video(x,"output.mp4",fps=24)
	return "output.mp4"

	css="""
	#col-container {
	margin: 0 auto;
	max-width: 520px;
	}
	"""

	# Gradio アプリケーションのレイアウトを定義
	with gr.Blocks(css=css) as demo:
	with gr.Column(elem_id="col-container"):
	gr.Markdown("""# AIdeaLab VideoJP Demo
	AIdeaLab VideoJPは、Rectified Flow Transformerで作られている軽量な動画生成モデルです ([詳細](https://note.com/aidealab/n/n677018ea1953)、[モデル](https://huggingface.co/aidealab/AIdeaLab-VideoJP))。十数秒で動画を作ることができます。なお、AIdeaLab VideoJPは経済産業省と国立研究開発法人新エネルギー・産業技術総合開発機構（ＮＥＤＯ）が実施する、国内の生成AIの開発力強化を目的としたプロジェクト「GENIAC（Generative AI Accelerator Challenge）」の成果をもとに作成されました。""")
	# Dropdown（コンボボックス）で入力
	sample_prompts = [
	"チューリップや菜の花、色とりどりの花が果てしなく続く畑を埋め尽くし、まるでパッチワークのようにカラフルに彩る。朝の柔らかな光が花びらを透かし、淡いグラデーションが映える。風に揺れる花々をスローモーションで捉え、花びらが優雅に舞う姿を映画のような演出で撮影。背景には遠くに連なる山並みや青い空、浮かぶ白い雲が立体感を引き立てる。",
	"The waves crash against the jagged rocks of the shoreline, sending spray high into the air.The rocks are a dark gray color, with sharp edges and deep crevices. The water is a clear blue-green, with white foam where the waves break against the rocks. The sky is a light gray, with a few white clouds dotting the horizon.",
	"遠くまで広がる青い海が、波打ち際で白く砕ける飛沫を上げる。視点はドローン撮影のように、上空から浜辺とサンゴ礁を一望する。柔らかな南国の陽光が海面に反射して宝石のように輝き、うっすらと海底のカラフルなサンゴが見える。",
	"A clear, turquoise river flows through a rocky canyon, cascading over a small waterfall and forming a pool of water at the bottom.The river is the main focus of the scene, with its clear water reflecting the surrounding trees and rocks. The canyon walls are steep and rocky, with some vegetation growing on them. ",
	"静かな森の中を、やわらかな朝陽が差し込む。木漏れ日に照らされた小川には小さな魚が泳ぎ、森の奥からは小鳥のさえずりが聞こえる。少し幻想的な雰囲気を持ちながらも穏やかな情景を映し出す動画。",
	"The camera pans over a snow-covered mountain range, revealing a vast expanse of snow-capped peaks and valleys.The mountains are covered in a thick layer of snow, with some areas appearing almost white while others have a slightly darker, almost grayish hue. The peaks are jagged and irregular, with some rising sharply into the sky while others are more rounded. The valleys are deep and narrow, with steep slopes that are also covered in snow. The trees in the foreground are mostly bare, with only a few leaves remaining on their branches. The sky is overcast, with thick clouds obscuring the sun. The overall impression is one of peace and tranquility, with the snow-covered mountains standing as a testament to the power and beauty of nature.",
	# "黒い短い髪を持った男性がカメラに向かって話している。背景には白い壁が見える。男性の顔ははっきりしていて、黒い目や口がくっきり見えている。",
	]
	text_input = gr.Dropdown(choices=sample_prompts, label="サンプルプロンプトを選んでください")
	generate_button = gr.Button("生成")

	output_video = gr.Video(label="生成された動画")

	# ボタンクリック時の挙動を設定
	generate_button.click(
	fn=text_to_video,
	inputs=text_input,
	outputs=output_video
	)

	demo.launch()