Spaces:
Running
Running
import gradio as gr | |
from transformers import AutoProcessor, AutoModelForVision2Seq | |
from PIL import Image | |
import torch | |
# Load Model & Processor | |
model_name = "ds4sd/SmolDocling-256M-preview" | |
processor = AutoProcessor.from_pretrained(model_name) | |
model = AutoModelForVision2Seq.from_pretrained(model_name, torch_dtype=torch.bfloat16).to("cuda" if torch.cuda.is_available() else "cpu") | |
def process_image(image): | |
if image is None: | |
return "Error: No image provided." | |
# Convert image to RGB format to ensure compatibility | |
image = image.convert("RGB") | |
# Process the image | |
inputs = processor(images=[image], return_tensors="pt").to(model.device) | |
# Generate output (remove unnecessary kwargs) | |
output = model.generate(**inputs) | |
# Decode output text | |
result = processor.batch_decode(output, skip_special_tokens=True)[0] | |
return result | |
# Create Gradio Interface | |
iface = gr.Interface( | |
fn=process_image, | |
inputs=gr.Image(type="pil"), # Fixed input format | |
outputs="text", | |
title="SmolDocling Document Processing", | |
description="Upload a document image to extract text." | |
) | |
iface.launch(server_name="0.0.0.0", server_port=7860) |