mansari722 committed on
Commit
a6f2409
·
verified ·
1 Parent(s): 551e50f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -6
app.py CHANGED
@@ -3,24 +3,35 @@ from transformers import AutoProcessor, AutoModelForVision2Seq
3
  from PIL import Image
4
  import torch
5
 
6
- # Load model & processor
7
  model_name = "ds4sd/SmolDocling-256M-preview"
8
  processor = AutoProcessor.from_pretrained(model_name)
9
  model = AutoModelForVision2Seq.from_pretrained(model_name, torch_dtype=torch.bfloat16).to("cuda" if torch.cuda.is_available() else "cpu")
10
 
11
  def process_image(image):
12
- inputs = processor(images=image, return_tensors="pt").to(model.device)
13
- output = model.generate(**inputs, max_new_tokens=1024)
 
 
 
 
 
 
 
 
 
 
 
14
  result = processor.batch_decode(output, skip_special_tokens=True)[0]
15
  return result
16
 
17
- # Create Gradio interface
18
  iface = gr.Interface(
19
  fn=process_image,
20
- inputs=gr.Image(type="pil"), # FIXED
21
  outputs="text",
22
  title="SmolDocling Document Processing",
23
  description="Upload a document image to extract text."
24
  )
25
 
26
- iface.launch(share=True)
 
3
  from PIL import Image
4
  import torch
5
 
6
+ # Load Model & Processor
7
  model_name = "ds4sd/SmolDocling-256M-preview"
8
  processor = AutoProcessor.from_pretrained(model_name)
9
  model = AutoModelForVision2Seq.from_pretrained(model_name, torch_dtype=torch.bfloat16).to("cuda" if torch.cuda.is_available() else "cpu")
10
 
11
  def process_image(image):
12
+ if image is None:
13
+ return "Error: No image provided."
14
+
15
+ # Convert image to RGB format to ensure compatibility
16
+ image = image.convert("RGB")
17
+
18
+ # Process the image
19
+ inputs = processor(images=[image], return_tensors="pt").to(model.device)
20
+
21
+ # Generate output (remove unnecessary kwargs)
22
+ output = model.generate(**inputs)
23
+
24
+ # Decode output text
25
  result = processor.batch_decode(output, skip_special_tokens=True)[0]
26
  return result
27
 
28
+ # Create Gradio Interface
29
  iface = gr.Interface(
30
  fn=process_image,
31
+ inputs=gr.Image(type="pil"), # Fixed input format
32
  outputs="text",
33
  title="SmolDocling Document Processing",
34
  description="Upload a document image to extract text."
35
  )
36
 
37
+ iface.launch(server_name="0.0.0.0", server_port=7860)