SmolVLM: Redefining small and efficient multimodal models
Paper
•
2504.05299
•
Published
•
158
None defined yet.
pip install -U huggingface_hub[hf_xet]
from huggingface_hub import InferenceClient
client = InferenceClient(provider="fal-ai", bill_to="my-cool-company")
image = client.text_to_image(
"A majestic lion in a fantasy forest",
model="black-forest-labs/FLUX.1-schnell",
)
image.save("lion.png")
transformers
in dedicated releases!v4.49.0-SmolVLM-2
and v4.49.0-SigLIP-2
.import { KokoroTTS } from "kokoro-js";
const tts = await KokoroTTS.from_pretrained(
"onnx-community/Kokoro-82M-ONNX",
{ dtype: "q8" }, // fp32, fp16, q8, q4, q4f16
);
const text = "Life is like a box of chocolates. You never know what you're gonna get.";
const audio = await tts.generate(text,
{ voice: "af_sky" }, // See `tts.list_voices()`
);
audio.save("audio.wav");