Anuji committed
Commit 3c6abc9 · verified · 1 Parent(s): 97a3816

init app.py

Files changed (1):
  app.py +128 -0
app.py ADDED
@@ -0,0 +1,128 @@
+ # Step 2: Verify GPU
+ import torch
+ print("CUDA Available:", torch.cuda.is_available())
+ print("Device:", torch.cuda.current_device() if torch.cuda.is_available() else "CPU")
+ print("Device Name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "N/A")
+
+ # Step 3: Modified app.py for Colab with debugging
+ import os
+ import gradio as gr
+ import torch
+ from PIL import Image
+ from deepseek_vl2.serve.inference import load_model, deepseek_generate, convert_conversation_to_prompts
+ from deepseek_vl2.models.conversation import SeparatorStyle
+ from deepseek_vl2.serve.app_modules.utils import configure_logger, strip_stop_words, pil_to_base64
+ from google.colab import files  # Colab-only import; 'files' is not used below
+
+ logger = configure_logger()
+
+ MODELS = ["deepseek-ai/deepseek-vl2-tiny"]
+ DEPLOY_MODELS = {}
+ IMAGE_TOKEN = "<image>"
+
+ def fetch_model(model_name: str, dtype=torch.bfloat16):
+     global DEPLOY_MODELS
+     if model_name not in DEPLOY_MODELS:
+         print(f"Loading {model_name}...")
+         model_info = load_model(model_name, dtype=dtype)
+         tokenizer, model, vl_chat_processor = model_info
+         device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+         model = model.to(device)
+         DEPLOY_MODELS[model_name] = (tokenizer, model, vl_chat_processor)
+         print(f"Loaded {model_name} on {device}")
+     return DEPLOY_MODELS[model_name]
+
+ def generate_prompt_with_history(text, images, history, vl_chat_processor, tokenizer, max_length=2048):
+     conversation = vl_chat_processor.new_chat_template()
+     if history:
+         conversation.messages = history
+     if images:
+         text = f"{IMAGE_TOKEN}\n{text}"
+         text = (text, images)
+     conversation.append_message(conversation.roles[0], text)
+     conversation.append_message(conversation.roles[1], "")
+     return conversation
+
+ def to_gradio_chatbot(conv):
+     ret = []
+     for i, (role, msg) in enumerate(conv.messages[conv.offset:]):
+         if i % 2 == 0:
+             if isinstance(msg, tuple):
+                 msg, images = msg
+                 for image in images:
+                     img_b64 = pil_to_base64(image, "user upload", max_size=800, min_size=400)
+                     msg = msg.replace(IMAGE_TOKEN, img_b64, 1)
+             ret.append([msg, None])
+         else:
+             ret[-1][-1] = msg
+     return ret
+
+ def predict(text, images, chatbot, history, model_name="deepseek-ai/deepseek-vl2-tiny"):
+     print("Starting predict function...")
+     tokenizer, vl_gpt, vl_chat_processor = fetch_model(model_name)
+     if not text:
+         print("Empty text input detected.")
+         yield chatbot, history, "Empty context."  # yield, not return: predict is a generator
+         return
+
+     print("Processing images...")
+     pil_images = [Image.open(img).convert("RGB") for img in images] if images else []
+     conversation = generate_prompt_with_history(
+         text, pil_images, history, vl_chat_processor, tokenizer
+     )
+     all_conv, _ = convert_conversation_to_prompts(conversation)
+     stop_words = conversation.stop_str
+     gradio_chatbot_output = to_gradio_chatbot(conversation)
+
+     full_response = ""
+     print("Generating response...")
+     try:
+         with torch.no_grad():
+             for x in deepseek_generate(
+                 conversations=all_conv,
+                 vl_gpt=vl_gpt,
+                 vl_chat_processor=vl_chat_processor,
+                 tokenizer=tokenizer,
+                 stop_words=stop_words,
+                 max_length=2048,
+                 temperature=0.1,
+                 top_p=0.9,
+                 repetition_penalty=1.1
+             ):
+                 full_response += x
+                 response = strip_stop_words(full_response, stop_words)
+                 conversation.update_last_message(response)
+                 gradio_chatbot_output[-1][1] = response
+                 print(f"Yielding partial response: {response[:50]}...")
+                 yield gradio_chatbot_output, conversation.messages, "Generating..."
+
+         print("Generation complete.")
+         torch.cuda.empty_cache()
+         yield gradio_chatbot_output, conversation.messages, "Success"
+     except Exception as e:
+         print(f"Error in generation: {str(e)}")
+         yield gradio_chatbot_output, conversation.messages, f"Error: {str(e)}"
+
+ # Gradio interface for OCR
+ def upload_and_process(image):
+     if image is None:
+         return "Please upload an image.", []
+     prompt = "Extract all text from this image exactly as it appears, ensuring the output is in English only. Preserve spaces, bullets, numbers, and all formatting. Do not translate, generate, or include text in any other language. Stop at the last character of the image text."
+     chatbot = []
+     history = []
+     print("Starting upload_and_process...")
+     for chatbot_output, history_output, status in predict(prompt, [image], chatbot, history):
+         print(f"Status: {status}")
+         if status == "Success":
+             return chatbot_output[-1][1], history_output
+     return "Processing failed.", []
+
+ # Launch Gradio app
+ with gr.Blocks() as demo:
+     gr.Markdown("### DeepSeek-VL2 OCR in Colab")
+     image_input = gr.Image(type="filepath", label="Upload Image")
+     output_text = gr.Textbox(label="Extracted Text")
+     history_state = gr.State([])
+     submit_btn = gr.Button("Extract Text")
+     submit_btn.click(upload_and_process, inputs=image_input, outputs=[output_text, history_state])
+
+ demo.launch(share=True, debug=True)  # Added debug=True for more Gradio logs
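
Since the script is meant to be run in Colab cells, the OCR helper can also be called from a later cell without the Gradio UI. A minimal sketch, assuming the definitions above have already been executed in the notebook and that a test image has been uploaded to "/content/sample.png" (a placeholder path, not part of this commit):

    # Hypothetical Colab cell: reuse the helper defined above without Gradio.
    # "/content/sample.png" is a placeholder path for an uploaded test image.
    text, history = upload_and_process("/content/sample.png")
    print(text)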