Why not small fp8 models

#10
by ryg81 - opened

Hey, why don't you release an fp8 model that can easily be used on consumer-level GPUs?

I think we can do fp8 quantization of the text encoder.
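
As a rough illustration (not an official recipe), one way to try that is torchao's weight-only fp8 quantization applied to the loaded text encoder. This is a minimal sketch, assuming a recent torchao build, the accelerate package for offloading, and a GPU/driver stack that supports fp8 weights; the quant settings are illustrative only:

import torch
import diffusers
from torchao.quantization import quantize_, float8_weight_only  # assumed available in a recent torchao

pipe = diffusers.DiffusionPipeline.from_pretrained("THUDM/CogView4-6B", torch_dtype=torch.bfloat16)

# Convert the text encoder's linear weights to fp8 (weight-only); transformer and VAE stay in bf16.
quantize_(pipe.text_encoder, float8_weight_only())

# Offload idle modules to CPU to keep peak VRAM within consumer-GPU limits (requires accelerate).
pipe.enable_model_cpu_offload()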

Here is a consumer-level approach with no quantization:

prompt = "A photorealistic close-up of an iridescent hummingbird hovering mid-air, its wings a blur of sapphire and emerald, drinking nectar from a bioluminescent flower"
negative_prompt = "cartoon, anime, poor quality, poor clarity, ugly"
g_scale, steps, width, height = 4, 15, 1536, 640

import diffusers, torch, gc

def flush():
    gc.collect()
    torch.cuda.empty_cache()

device, dtype, model_id = "cuda", torch.bfloat16, "THUDM/CogView4-6B"

# Stage 1: load only the text encoder + tokenizer (no transformer/VAE) and encode the prompts.
emb_prompts = diffusers.DiffusionPipeline.from_pretrained(model_id, transformer=None, vae=None, torch_dtype=dtype).to(device)
with torch.no_grad():
    prompt_embeds, negative_prompt_embeds = emb_prompts.encode_prompt(prompt=prompt, negative_prompt=negative_prompt)

# Free the text encoder before loading the transformer, so both never sit in VRAM at the same time.
del emb_prompts
flush()

# Stage 2: reload the pipeline without the text encoder/tokenizer and run the transformer + VAE.
pipeline = diffusers.DiffusionPipeline.from_pretrained(model_id, text_encoder=None, tokenizer=None, torch_dtype=dtype).to(device)
with torch.inference_mode():
    image = pipeline(prompt_embeds=prompt_embeds.to(dtype), negative_prompt_embeds=negative_prompt_embeds.to(dtype),
                     guidance_scale=g_scale, num_inference_steps=steps, width=width, height=height).images[0]

display(image)  # assumes a Jupyter/IPython session
del pipeline, image
flush()