# OCR-app / app.py
# Step 2: Verify GPU
import torch
print("CUDA Available:", torch.cuda.is_available())
print("Device:", torch.cuda.current_device() if torch.cuda.is_available() else "CPU")
print("Device Name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "N/A")
# Step 3: Modified app.py for Colab with debugging
import os
import gradio as gr
import torch
from PIL import Image
from deepseek_vl2.serve.inference import load_model, deepseek_generate, convert_conversation_to_prompts
from deepseek_vl2.models.conversation import SeparatorStyle
from deepseek_vl2.serve.app_modules.utils import configure_logger, strip_stop_words, pil_to_base64
logger = configure_logger()
MODELS = ["deepseek-ai/deepseek-vl2-tiny"]
DEPLOY_MODELS = {}
IMAGE_TOKEN = "<image>"
def fetch_model(model_name: str, dtype=torch.bfloat16):
    """Load a model once and cache it in DEPLOY_MODELS for reuse."""
    global DEPLOY_MODELS
    if model_name not in DEPLOY_MODELS:
        print(f"Loading {model_name}...")
        model_info = load_model(model_name, dtype=dtype)
        tokenizer, model, vl_chat_processor = model_info
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = model.to(device)
        DEPLOY_MODELS[model_name] = (tokenizer, model, vl_chat_processor)
        print(f"Loaded {model_name} on {device}")
    return DEPLOY_MODELS[model_name]
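# Example (hypothetical call site): the first call loads the weights, later
# calls hit the DEPLOY_MODELS cache, so repeated Gradio events stay cheap:
#
#   tokenizer, vl_gpt, vl_chat_processor = fetch_model(MODELS[0])
#   tokenizer, vl_gpt, vl_chat_processor = fetch_model(MODELS[0])  # cached, no reload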
def generate_prompt_with_history(text, images, history, vl_chat_processor, tokenizer, max_length=2048):
    """Build a conversation object from the new user turn plus any prior history."""
    conversation = vl_chat_processor.new_chat_template()
    if history:
        conversation.messages = history
    if images:
        # Prepend the image placeholder token and pair the text with its images.
        text = f"{IMAGE_TOKEN}\n{text}"
        text = (text, images)
    conversation.append_message(conversation.roles[0], text)
    conversation.append_message(conversation.roles[1], "")
    return conversation
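# For a single image the appended user message ends up as a (text, images)
# tuple, e.g. ("<image>\nExtract all text...", [<PIL.Image>]), which is the
# shape convert_conversation_to_prompts and to_gradio_chatbot expect below.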
def to_gradio_chatbot(conv):
    ret = []
    for i, (role, msg) in enumerate(conv.messages[conv.offset:]):
        if i % 2 == 0:
            if isinstance(msg, tuple):
                msg, images = msg
                for image in images:
                    img_b64 = pil_to_base64(image, "user upload", max_size=800, min_size=400)
                    msg = msg.replace(IMAGE_TOKEN, img_b64, 1)
            ret.append([msg, None])
        else:
            ret[-1][-1] = msg
    return ret
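# The returned structure is a list of [user_message, assistant_message] pairs,
# e.g. [["<img src='data:image/...'>\nExtract all text...", None]] before the
# model has answered; predict() fills in the second slot as tokens stream in.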
def predict(text, images, chatbot, history, model_name="deepseek-ai/deepseek-vl2-tiny"):
    print("Starting predict function...")
    tokenizer, vl_gpt, vl_chat_processor = fetch_model(model_name)
    if not text:
        print("Empty text input detected.")
        # predict is a generator, so yield here (a bare return would drop the message).
        yield chatbot, history, "Empty context."
        return
    print("Processing images...")
    pil_images = [Image.open(img).convert("RGB") for img in images] if images else []
    conversation = generate_prompt_with_history(
        text, pil_images, history, vl_chat_processor, tokenizer
    )
    all_conv, _ = convert_conversation_to_prompts(conversation)
    stop_words = conversation.stop_str
    gradio_chatbot_output = to_gradio_chatbot(conversation)
    full_response = ""
    print("Generating response...")
    try:
        with torch.no_grad():
            for x in deepseek_generate(
                conversations=all_conv,
                vl_gpt=vl_gpt,
                vl_chat_processor=vl_chat_processor,
                tokenizer=tokenizer,
                stop_words=stop_words,
                max_length=2048,
                temperature=0.1,
                top_p=0.9,
                repetition_penalty=1.1,
            ):
                full_response += x
                response = strip_stop_words(full_response, stop_words)
                conversation.update_last_message(response)
                gradio_chatbot_output[-1][1] = response
                print(f"Yielding partial response: {response[:50]}...")
                yield gradio_chatbot_output, conversation.messages, "Generating..."
        print("Generation complete.")
        torch.cuda.empty_cache()
        yield gradio_chatbot_output, conversation.messages, "Success"
    except Exception as e:
        print(f"Error in generation: {str(e)}")
        yield gradio_chatbot_output, conversation.messages, f"Error: {str(e)}"
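# predict() is a generator: each yielded triple is (chatbot pairs, raw
# conversation messages, status string), so callers can stream partial
# output or, as upload_and_process does below, just wait for "Success".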
# Gradio interface for OCR
def upload_and_process(image):
    if image is None:
        return "Please upload an image.", []
    prompt = (
        "Extract all text from this image exactly as it appears, ensuring the "
        "output is in English only. Preserve spaces, bullets, numbers, and all "
        "formatting. Do not translate, generate, or include text in any other "
        "language. Stop at the last character of the image text."
    )
    chatbot = []
    history = []
    print("Starting upload_and_process...")
    for chatbot_output, history_output, status in predict(prompt, [image], chatbot, history):
        print(f"Status: {status}")
        if status == "Success":
            return chatbot_output[-1][1], history_output
    return "Processing failed.", []
# Launch Gradio app
with gr.Blocks() as demo:
    gr.Markdown("### DeepSeek-VL2 OCR in Colab")
    image_input = gr.Image(type="filepath", label="Upload Image")
    output_text = gr.Textbox(label="Extracted Text")
    history_state = gr.State([])
    submit_btn = gr.Button("Extract Text")
    submit_btn.click(upload_and_process, inputs=image_input, outputs=[output_text, history_state])

demo.launch(share=True, debug=True)  # debug=True for more Gradio logs
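# share=True asks Gradio to create a temporary public *.gradio.live URL,
# which is what makes the app reachable from a Colab notebook session.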