bulentsoykan committed
Commit feae4d7 · verified · 1 Parent(s): 00821bd

Update app.py

Files changed (1):
  1. app.py +44 -28
app.py CHANGED
@@ -19,24 +19,38 @@ if not HF_API_KEY:
     HF_API_KEY = st.secrets.get("HF_API_KEY", "")  # Try getting from Streamlit secrets
 
 # Hugging Face API function
-def process_image_with_hf(image_bytes):
-    # Use an available multimodal model that can handle images and text
-    API_URL = "https://api-inference.huggingface.co/models/llava-hf/llava-1.5-7b-hf"
+def process_image_with_hf(image_bytes, model_id):
+    API_URL = f"https://api-inference.huggingface.co/models/{model_id}"
     headers = {"Authorization": f"Bearer {HF_API_KEY}"}
 
     # Convert image to base64
     image_b64 = base64.b64encode(image_bytes).decode('utf-8')
 
-    # Prepare payload
-    payload = {
-        "inputs": {
-            "image": image_b64,
-            "text": """Analyze the text in the provided image. Extract all readable content
-            and present it in a structured Markdown format that is clear, concise,
-            and well-organized. Ensure proper formatting (e.g., headings, lists, or
-            code blocks) as necessary to represent the content effectively."""
+    # Prepare payload based on model type
+    if "llava" in model_id.lower():
+        payload = {
+            "inputs": {
+                "image": image_b64,
+                "prompt": """Analyze the text in the provided image. Extract all readable content
+                and present it in a structured Markdown format that is clear, concise,
+                and well-organized. Ensure proper formatting (e.g., headings, lists, or
+                code blocks) as necessary to represent the content effectively."""
+            },
+            "parameters": {
+                "max_new_tokens": 1024
+            }
+        }
+    else:
+        # Generic payload format for other models
+        payload = {
+            "inputs": {
+                "image": image_b64,
+                "text": """Analyze the text in the provided image. Extract all readable content
+                and present it in a structured Markdown format that is clear, concise,
+                and well-organized. Ensure proper formatting (e.g., headings, lists, or
+                code blocks) as necessary to represent the content effectively."""
+            }
         }
-    }
 
     # Make API request
     response = requests.post(API_URL, headers=headers, json=payload)
@@ -45,12 +59,17 @@ def process_image_with_hf(image_bytes):
         raise Exception(f"API request failed with status code {response.status_code}: {response.text}")
 
     # Handle different response formats
-    if isinstance(response.json(), list):
-        return response.json()[0]["generated_text"]
-    elif isinstance(response.json(), dict) and "generated_text" in response.json():
-        return response.json()["generated_text"]
-    else:
-        return str(response.json())
+    response_json = response.json()
+    if isinstance(response_json, list):
+        return response_json[0]["generated_text"]
+    elif isinstance(response_json, dict):
+        if "generated_text" in response_json:
+            return response_json["generated_text"]
+        elif "text" in response_json:
+            return response_json["text"]
+
+    # Fallback
+    return str(response_json)
 
 # Title and description in main area
 try:
@@ -78,15 +97,15 @@ with st.sidebar:
     st.header("Settings")
     model_option = st.selectbox(
         "Select Vision Model",
-        ["LLaVA 1.5 (7B)", "CLIP-ViT", "BLIP-2"],
+        ["LLaVA-1.5-7B", "MiniGPT-4", "Idefics"],
         index=0
     )
 
-    # Map selection to model ID
+    # Updated model mapping with confirmed working models
     model_mapping = {
-        "LLaVA 1.5 (7B)": "llava-hf/llava-1.5-7b-hf",
-        "CLIP-ViT": "openai/clip-vit-base-patch32",
-        "BLIP-2": "Salesforce/blip2-opt-2.7b"
+        "LLaVA-1.5-7B": "llava-hf/llava-1.5-7b-hf",
+        "MiniGPT-4": "Vision-CAIR/MiniGPT-4",
+        "Idefics": "HuggingFaceM4/idefics-9b-instruct"
     }
 
     selected_model = model_mapping[model_option]
@@ -106,14 +125,11 @@ with st.sidebar:
     if st.button("Extract Text 🔍", type="primary"):
        with st.spinner(f"Processing image with {model_option}..."):
             try:
-                # Update the model URL
-                API_URL = f"https://api-inference.huggingface.co/models/{selected_model}"
-
                 # Get image bytes
                 img_bytes = uploaded_file.getvalue()
 
-                # Process with Hugging Face API
-                result = process_image_with_hf(img_bytes)
+                # Process with Hugging Face API using selected model
+                result = process_image_with_hf(img_bytes, selected_model)
                 st.session_state['ocr_result'] = result
             except Exception as e:
                 st.error(f"Error processing image: {str(e)}")
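
Since this commit routes every request through the serverless Inference API, a minimal standalone sketch like the one below can exercise the same request/response path outside Streamlit. It mirrors the LLaVA branch of the updated process_image_with_hf(); the HF_API_KEY environment variable and the sample.png path are hypothetical stand-ins, and the base64 "inputs" payload shape is the app's own assumption rather than a documented contract for every model.

# Standalone sketch of the request path used by app.py after this commit.
# Hypothetical stand-ins: the HF_API_KEY environment variable and sample.png.
import base64
import os

import requests

MODEL_ID = "llava-hf/llava-1.5-7b-hf"  # the app's default selection
API_URL = f"https://api-inference.huggingface.co/models/{MODEL_ID}"
headers = {"Authorization": f"Bearer {os.environ['HF_API_KEY']}"}

with open("sample.png", "rb") as f:
    image_b64 = base64.b64encode(f.read()).decode("utf-8")

# Mirrors the LLaVA branch of process_image_with_hf(); the Inference API's
# accepted input schema varies by model, so treat this shape as the app's
# assumption, not a documented contract.
payload = {
    "inputs": {
        "image": image_b64,
        "prompt": "Extract all readable text and format it as Markdown.",
    },
    "parameters": {"max_new_tokens": 1024},
}

response = requests.post(API_URL, headers=headers, json=payload, timeout=120)

if response.status_code == 503:
    # The serverless API has been known to answer 503 with an "estimated_time"
    # hint while a model cold-loads; the app surfaces this as a plain error.
    print("Model loading, retry in:", response.json().get("estimated_time"))
elif response.status_code != 200:
    raise RuntimeError(f"{response.status_code}: {response.text}")
else:
    # Same fallback chain as the updated response handling in app.py:
    # list -> "generated_text" -> "text" -> raw JSON.
    data = response.json()
    if isinstance(data, list):
        print(data[0].get("generated_text", data))
    elif isinstance(data, dict):
        print(data.get("generated_text") or data.get("text") or data)
    else:
        print(data)

A design note on the commit itself: passing model_id into process_image_with_hf(), instead of recomputing API_URL at the call site as the removed lines did, keeps endpoint and payload selection in one place, which is what lets the sidebar mapping swap models without touching the request code.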