Why not small fp8 models
Hey, why don't you release an fp8 model that can be used easily on consumer-level GPUs?
I think we can do fp8 quantization of the text encoder.
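A minimal sketch of that idea, not an official recipe: quantize only the text encoder and pass it into the pipeline. Assumptions here: the GLM text encoder lives in the text_encoder subfolder and loads via AutoModel, and bitsandbytes int8 stands in for fp8 (a torchao float8 config would be the closer match, but the principle is the same).

# Sketch only: quantize just the text encoder to 8-bit and hand it to the pipeline.
# Assumptions: "text_encoder" subfolder, AutoModel loading path, int8 as a stand-in for fp8.
import torch, diffusers
from transformers import AutoModel, BitsAndBytesConfig
model_id = "THUDM/CogView4-6B"
text_encoder_8bit = AutoModel.from_pretrained(
    model_id,
    subfolder="text_encoder",
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    torch_dtype=torch.bfloat16,
)
pipe = diffusers.DiffusionPipeline.from_pretrained(
    model_id,
    text_encoder=text_encoder_8bit,
    torch_dtype=torch.bfloat16,
)
pipe.enable_model_cpu_offload()  # keep the unquantized transformer/VAE off the GPU until needed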
Here is a consumer-level example with no quantization at all:
prompt = "A photorealistic close-up of a iridescent hummingbird hovering mid-air, its wings a blur of sapphire and emerald, drinking nectar from a bioluminescent flower"
negative_prompt = "cartoon, anime, poor quality, poor clarity, ugly"
g_scale, steps, width, height = 4, 15, 1536, 640
import diffusers, torch, gc
def flush():
    # release Python and CUDA memory between the two loading stages
    gc.collect()
    torch.cuda.empty_cache()
device, dtype, model_id = "cuda", torch.bfloat16, "THUDM/CogView4-6B"
# first stage: load only the text encoder / tokenizer to compute prompt embeddings
emb_prompts = diffusers.DiffusionPipeline.from_pretrained(model_id, transformer=None, vae=None, torch_dtype=dtype).to(device)
with torch.no_grad():
    prompt_embeds, negative_prompt_embeds = emb_prompts.encode_prompt(prompt=prompt, negative_prompt=negative_prompt)
# free the text-encoder pipeline before loading the denoising stage
del emb_prompts
flush()
# second stage: reload the pipeline without the text encoder / tokenizer
pipeline = diffusers.DiffusionPipeline.from_pretrained(model_id, text_encoder=None, tokenizer=None, torch_dtype=dtype).to(device)
with torch.inference_mode():
    image = pipeline(prompt_embeds=prompt_embeds.to(dtype), negative_prompt_embeds=negative_prompt_embeds.to(dtype), guidance_scale=g_scale, num_inference_steps=steps, width=width, height=height).images[0]
display(image)  # display() assumes a Jupyter/IPython notebook environment
del pipeline, image
flush()
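For comparison, a simpler single-load variant that relies on diffusers' model CPU offloading instead of the two-stage load above; it is slower per image but avoids reloading the pipeline. This reuses the prompt and settings variables defined at the top; the output filename is arbitrary.

# Sketch of the simpler path: one pipeline, components offloaded to CPU while idle.
import diffusers, torch
pipe = diffusers.DiffusionPipeline.from_pretrained("THUDM/CogView4-6B", torch_dtype=torch.bfloat16)
pipe.enable_model_cpu_offload()  # each component is moved to the GPU only while it runs
image = pipe(prompt=prompt, negative_prompt=negative_prompt,
             guidance_scale=g_scale, num_inference_steps=steps,
             width=width, height=height).images[0]
image.save("hummingbird.png")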