bulentsoykan committed
Commit feae4d7 · verified · 1 Parent(s): 00821bd

Update app.py

Files changed (1):
  1. app.py +44 -28
app.py CHANGED
@@ -19,24 +19,38 @@ if not HF_API_KEY:
     HF_API_KEY = st.secrets.get("HF_API_KEY", "")  # Try getting from Streamlit secrets
 
 # Hugging Face API function
-def process_image_with_hf(image_bytes):
-    # Use an available multimodal model that can handle images and text
-    API_URL = "https://api-inference.huggingface.co/models/llava-hf/llava-1.5-7b-hf"
+def process_image_with_hf(image_bytes, model_id):
+    API_URL = f"https://api-inference.huggingface.co/models/{model_id}"
     headers = {"Authorization": f"Bearer {HF_API_KEY}"}
 
     # Convert image to base64
     image_b64 = base64.b64encode(image_bytes).decode('utf-8')
 
-    # Prepare payload
-    payload = {
-        "inputs": {
-            "image": image_b64,
-            "text": """Analyze the text in the provided image. Extract all readable content
-            and present it in a structured Markdown format that is clear, concise,
-            and well-organized. Ensure proper formatting (e.g., headings, lists, or
-            code blocks) as necessary to represent the content effectively."""
+    # Prepare payload based on model type
+    if "llava" in model_id.lower():
+        payload = {
+            "inputs": {
+                "image": image_b64,
+                "prompt": """Analyze the text in the provided image. Extract all readable content
+                and present it in a structured Markdown format that is clear, concise,
+                and well-organized. Ensure proper formatting (e.g., headings, lists, or
+                code blocks) as necessary to represent the content effectively."""
+            },
+            "parameters": {
+                "max_new_tokens": 1024
+            }
+        }
+    else:
+        # Generic payload format for other models
+        payload = {
+            "inputs": {
+                "image": image_b64,
+                "text": """Analyze the text in the provided image. Extract all readable content
+                and present it in a structured Markdown format that is clear, concise,
+                and well-organized. Ensure proper formatting (e.g., headings, lists, or
+                code blocks) as necessary to represent the content effectively."""
+            }
         }
-    }
 
     # Make API request
     response = requests.post(API_URL, headers=headers, json=payload)
@@ -45,12 +59,17 @@ def process_image_with_hf(image_bytes):
         raise Exception(f"API request failed with status code {response.status_code}: {response.text}")
 
     # Handle different response formats
-    if isinstance(response.json(), list):
-        return response.json()[0]["generated_text"]
-    elif isinstance(response.json(), dict) and "generated_text" in response.json():
-        return response.json()["generated_text"]
-    else:
-        return str(response.json())
+    response_json = response.json()
+    if isinstance(response_json, list):
+        return response_json[0]["generated_text"]
+    elif isinstance(response_json, dict):
+        if "generated_text" in response_json:
+            return response_json["generated_text"]
+        elif "text" in response_json:
+            return response_json["text"]
+
+    # Fallback
+    return str(response_json)
 
 # Title and description in main area
 try:
@@ -78,15 +97,15 @@ with st.sidebar:
     st.header("Settings")
     model_option = st.selectbox(
         "Select Vision Model",
-        ["LLaVA 1.5 (7B)", "CLIP-ViT", "BLIP-2"],
+        ["LLaVA-1.5-7B", "MiniGPT-4", "Idefics"],
         index=0
     )
 
-    # Map selection to model ID
+    # Updated model mapping with confirmed working models
     model_mapping = {
-        "LLaVA 1.5 (7B)": "llava-hf/llava-1.5-7b-hf",
-        "CLIP-ViT": "openai/clip-vit-base-patch32",
-        "BLIP-2": "Salesforce/blip2-opt-2.7b"
+        "LLaVA-1.5-7B": "llava-hf/llava-1.5-7b-hf",
+        "MiniGPT-4": "Vision-CAIR/MiniGPT-4",
+        "Idefics": "HuggingFaceM4/idefics-9b-instruct"
     }
 
     selected_model = model_mapping[model_option]
@@ -106,14 +125,11 @@ with st.sidebar:
     if st.button("Extract Text 🔍", type="primary"):
        with st.spinner(f"Processing image with {model_option}..."):
             try:
-                # Update the model URL
-                API_URL = f"https://api-inference.huggingface.co/models/{selected_model}"
-
                 # Get image bytes
                 img_bytes = uploaded_file.getvalue()
 
-                # Process with Hugging Face API
-                result = process_image_with_hf(img_bytes)
+                # Process with Hugging Face API using selected model
+                result = process_image_with_hf(img_bytes, selected_model)
                 st.session_state['ocr_result'] = result
             except Exception as e:
                 st.error(f"Error processing image: {str(e)}")
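
Since this commit routes every request through the serverless Inference API, a minimal standalone sketch like the one below can exercise the same request/response path outside Streamlit. It mirrors the LLaVA branch of the updated process_image_with_hf(); the HF_API_KEY environment variable and the sample.png path are hypothetical stand-ins, and the base64 "inputs" payload shape is the app's own assumption rather than a documented contract for every model.

# Standalone sketch of the request path used by app.py after this commit.
# Hypothetical stand-ins: the HF_API_KEY environment variable and sample.png.
import base64
import os

import requests

MODEL_ID = "llava-hf/llava-1.5-7b-hf"  # the app's default selection
API_URL = f"https://api-inference.huggingface.co/models/{MODEL_ID}"
headers = {"Authorization": f"Bearer {os.environ['HF_API_KEY']}"}

with open("sample.png", "rb") as f:
    image_b64 = base64.b64encode(f.read()).decode("utf-8")

# Mirrors the LLaVA branch of process_image_with_hf(); the Inference API's
# accepted input schema varies by model, so treat this shape as the app's
# assumption, not a documented contract.
payload = {
    "inputs": {
        "image": image_b64,
        "prompt": "Extract all readable text and format it as Markdown.",
    },
    "parameters": {"max_new_tokens": 1024},
}

response = requests.post(API_URL, headers=headers, json=payload, timeout=120)

if response.status_code == 503:
    # The serverless API has been known to answer 503 with an "estimated_time"
    # hint while a model cold-loads; the app surfaces this as a plain error.
    print("Model loading, retry in:", response.json().get("estimated_time"))
elif response.status_code != 200:
    raise RuntimeError(f"{response.status_code}: {response.text}")
else:
    # Same fallback chain as the updated response handling in app.py:
    # list -> "generated_text" -> "text" -> raw JSON.
    data = response.json()
    if isinstance(data, list):
        print(data[0].get("generated_text", data))
    elif isinstance(data, dict):
        print(data.get("generated_text") or data.get("text") or data)
    else:
        print(data)

A design note on the commit itself: passing model_id into process_image_with_hf(), instead of recomputing API_URL at the call site as the removed lines did, keeps endpoint and payload selection in one place, which is what lets the sidebar mapping swap models without touching the request code.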