Why not small fp8 models
Hey, why don't you release an fp8 model that can be used easily on consumer-level GPUs?
I think we can do fp8 quantization of the text encoder.
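A minimal sketch of that idea, not an official recipe: quantize only the text encoder and pass it into the pipeline. Assumptions here: the GLM text encoder lives in the text_encoder subfolder and loads via AutoModel, and bitsandbytes int8 stands in for fp8 (a torchao float8 config would be the closer match, but the principle is the same).

# Sketch only: quantize just the text encoder to 8-bit and hand it to the pipeline.
# Assumptions: "text_encoder" subfolder, AutoModel loading path, int8 as a stand-in for fp8.
import torch, diffusers
from transformers import AutoModel, BitsAndBytesConfig
model_id = "THUDM/CogView4-6B"
text_encoder_8bit = AutoModel.from_pretrained(
    model_id,
    subfolder="text_encoder",
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    torch_dtype=torch.bfloat16,
)
pipe = diffusers.DiffusionPipeline.from_pretrained(
    model_id,
    text_encoder=text_encoder_8bit,
    torch_dtype=torch.bfloat16,
)
pipe.enable_model_cpu_offload()  # keep the unquantized transformer/VAE off the GPU until needed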
Here is a consumer-level example with no quantization at all:
prompt = "A photorealistic close-up of a iridescent hummingbird hovering mid-air, its wings a blur of sapphire and emerald, drinking nectar from a bioluminescent flower"
negative_prompt = "cartoon, anime, poor quality, poor clarity, ugly"
g_scale, steps, width, height = 4, 15, 1536, 640
import diffusers, torch, gc
def flush():
    # release Python and CUDA memory between the two loading stages
    gc.collect()
    torch.cuda.empty_cache()
device, dtype, model_id = "cuda", torch.bfloat16, "THUDM/CogView4-6B"
# first stage: load only the text encoder / tokenizer to compute prompt embeddings
emb_prompts = diffusers.DiffusionPipeline.from_pretrained(model_id, transformer=None, vae=None, torch_dtype=dtype).to(device)
with torch.no_grad():
    prompt_embeds, negative_prompt_embeds = emb_prompts.encode_prompt(prompt=prompt, negative_prompt=negative_prompt)
# free the text-encoder pipeline before loading the denoising stage
del emb_prompts
flush()
# second stage: reload the pipeline without the text encoder / tokenizer
pipeline = diffusers.DiffusionPipeline.from_pretrained(model_id, text_encoder=None, tokenizer=None, torch_dtype=dtype).to(device)
with torch.inference_mode():
    image = pipeline(prompt_embeds=prompt_embeds.to(dtype), negative_prompt_embeds=negative_prompt_embeds.to(dtype), guidance_scale=g_scale, num_inference_steps=steps, width=width, height=height).images[0]
display(image)  # display() assumes a Jupyter/IPython notebook environment
del pipeline, image
flush()
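For comparison, a simpler single-load variant that relies on diffusers' model CPU offloading instead of the two-stage load above; it is slower per image but avoids reloading the pipeline. This reuses the prompt and settings variables defined at the top; the output filename is arbitrary.

# Sketch of the simpler path: one pipeline, components offloaded to CPU while idle.
import diffusers, torch
pipe = diffusers.DiffusionPipeline.from_pretrained("THUDM/CogView4-6B", torch_dtype=torch.bfloat16)
pipe.enable_model_cpu_offload()  # each component is moved to the GPU only while it runs
image = pipe(prompt=prompt, negative_prompt=negative_prompt,
             guidance_scale=g_scale, num_inference_steps=steps,
             width=width, height=height).images[0]
image.save("hummingbird.png")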