updated
app.py
CHANGED
@@ -1,49 +1,33 @@
 import os
-import subprocess
-import sys
 import torch
-import gradio as gr
+import streamlit as st
 from PIL import Image
 from deepseek_vl2.serve.inference import load_model, deepseek_generate, convert_conversation_to_prompts
 from deepseek_vl2.models.conversation import SeparatorStyle
 from deepseek_vl2.serve.app_modules.utils import configure_logger, strip_stop_words, pil_to_base64
-from google.colab import files

-#
-repo_dir = "/content/DeepSeek-VL2"
-if not os.path.exists(repo_dir):
-    subprocess.run(["git", "clone", "https://github.com/deepseek-ai/DeepSeek-VL2"], check=True)
-    sys.path.append(repo_dir)  # Add the DeepSeek-VL2 repository to the Python path
-else:
-    print("Repository already cloned.")
-
-# Step 2: Install dependencies if not already installed
-subprocess.run([sys.executable, "-m", "pip", "install", "-r", "/content/DeepSeek-VL2/requirements.txt"])
-
-# Step 3: Verify GPU (Optional)
-print("CUDA Available:", torch.cuda.is_available())
-print("Device:", torch.cuda.current_device() if torch.cuda.is_available() else "CPU")
-print("Device Name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "N/A")
-
-# Step 4: Define your model and prediction functions
+# Initialize logger
 logger = configure_logger()

+# Global variables for model loading
 MODELS = ["deepseek-ai/deepseek-vl2-tiny"]
 DEPLOY_MODELS = {}
 IMAGE_TOKEN = "<image>"

+# Load model function
 def fetch_model(model_name: str, dtype=torch.bfloat16):
     global DEPLOY_MODELS
     if model_name not in DEPLOY_MODELS:
-
+        st.write(f"Loading {model_name}...")
         model_info = load_model(model_name, dtype=dtype)
         tokenizer, model, vl_chat_processor = model_info
         device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
         model = model.to(device)
         DEPLOY_MODELS[model_name] = (tokenizer, model, vl_chat_processor)
-
+        st.write(f"Loaded {model_name} on {device}")
     return DEPLOY_MODELS[model_name]

+# Generate prompt with conversation history
 def generate_prompt_with_history(text, images, history, vl_chat_processor, tokenizer, max_length=2048):
     conversation = vl_chat_processor.new_chat_template()
     if history:
@@ -55,6 +39,7 @@ def generate_prompt_with_history(text, images, history, vl_chat_processor, tokenizer, max_length=2048):
     conversation.append_message(conversation.roles[1], "")
     return conversation

+# Convert conversation to Gradio-compatible format
 def to_gradio_chatbot(conv):
     ret = []
     for i, (role, msg) in enumerate(conv.messages[conv.offset:]):
@@ -69,14 +54,12 @@ def to_gradio_chatbot(conv):
             ret[-1][-1] = msg
     return ret

+# Prediction function
 def predict(text, images, chatbot, history, model_name="deepseek-ai/deepseek-vl2-tiny"):
-    print("Starting predict function...")
     tokenizer, vl_gpt, vl_chat_processor = fetch_model(model_name)
     if not text:
-        print("Empty text input detected.")
         return chatbot, history, "Empty context."

-    print("Processing images...")
     pil_images = [Image.open(img).convert("RGB") for img in images] if images else []
     conversation = generate_prompt_with_history(
         text, pil_images, history, vl_chat_processor, tokenizer
@@ -86,7 +69,6 @@ def predict(text, images, chatbot, history, model_name="deepseek-ai/deepseek-vl2-tiny"):
     gradio_chatbot_output = to_gradio_chatbot(conversation)

     full_response = ""
-    print("Generating response...")
     try:
         with torch.no_grad():
             for x in deepseek_generate(
@@ -104,38 +86,31 @@ def predict(text, images, chatbot, history, model_name="deepseek-ai/deepseek-vl2-tiny"):
                 response = strip_stop_words(full_response, stop_words)
                 conversation.update_last_message(response)
                 gradio_chatbot_output[-1][1] = response
-                print(f"Yielding partial response: {response[:50]}...")
                 yield gradio_chatbot_output, conversation.messages, "Generating..."

-        print("Generation complete.")
         torch.cuda.empty_cache()
         yield gradio_chatbot_output, conversation.messages, "Success"
     except Exception as e:
-        print(f"Error in generation: {str(e)}")
         yield gradio_chatbot_output, conversation.messages, f"Error: {str(e)}"

-#
-
-
-        return "Please upload an image.", []
-    prompt = "Extract all text from this image exactly as it appears, ensuring the output is in English only. Preserve spaces, bullets, numbers, and all formatting. Do not translate, generate, or include text in any other language. Stop at the last character of the image text."
-    chatbot = []
-    history = []
-    print("Starting upload_and_process...")
-    for chatbot_output, history_output, status in predict(prompt, [image], chatbot, history):
-        print(f"Status: {status}")
-        if status == "Success":
-            return chatbot_output[-1][1], history_output
-    return "Processing failed.", []
+# Streamlit UI setup
+st.title("DeepSeek-VL2 OCR in Colab")
+st.write("Upload an image and get the extracted text.")

-#
-
-    gr.Markdown("### DeepSeek-VL2 OCR in Colab")
-    image_input = gr.Image(type="filepath", label="Upload Image")
-    output_text = gr.Textbox(label="Extracted Text")
-    history_state = gr.State([])
-    submit_btn = gr.Button("Extract Text")
-    submit_btn.click(upload_and_process, inputs=image_input, outputs=[output_text, history_state])
+# Image upload
+image_input = st.file_uploader("Upload Image", type=["png", "jpg", "jpeg"])

-
+# Output text
+output_text = st.text_area("Extracted Text", "")

+# Handle the image upload and processing
+if image_input:
+    prompt = "Extract all text from this image exactly as it appears, ensuring the output is in English only."
+    chatbot = []
+    history = []
+    for chatbot_output, history_output, status in predict(prompt, [image_input], chatbot, history):
+        if status == "Success":
+            output_text = chatbot_output[-1][1]
+            st.write("Extracted Text:", output_text)
+        else:
+            st.error(f"Error: {status}")
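For a quick smoke test, the streaming predict() generator above can also be driven without the Streamlit front end. The sketch below is illustrative only: it assumes the new app.py from this diff is importable, that the deepseek_vl2 package and model weights can be loaded in the current environment, and that "sample.png" is a placeholder path to a local test image; the tuple layout and status strings come from the code above.

# Illustrative smoke test for the predict() generator (not part of app.py).
# "sample.png" is a placeholder path; importing app also executes the
# Streamlit calls in bare mode, which only log warnings outside `streamlit run`.
from app import predict

prompt = "Extract all text from this image exactly as it appears."
extracted = ""
for chatbot_output, messages, status in predict(prompt, ["sample.png"], [], []):
    if status == "Success":
        # The final model reply is the second element of the last chat turn.
        extracted = chatbot_output[-1][1]
    elif status.startswith("Error"):
        raise RuntimeError(status)

print(extracted)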