Spaces:

ZennyKenny
/

Novoyaz

Runtime error

App Files Files Community

ZennyKenny committed on Aug 9

Commit

818e485

verified ·

1 Parent(s): 129f697

Update app.py

Browse files

Files changed (1) hide show

app.py +109 -161

app.py CHANGED Viewed

@@ -1,229 +1,177 @@
 # app.py
-# ZeroGPU-friendly Gradio app:
-# 1) Upload image with pre-reform Russian.
-# 2) OCR via rednote-hilab/dots.ocr.
-# 3) Convert to modern Russian via your HF model.
-# Notes:
-#  - Import `spaces` FIRST and avoid any CUDA/tensor ops at module import.
-#  - All torch/transformers/qwen_vl_utils imports happen INSIDE the @spaces.GPU() path.
-#  - Attn impl defaults to "eager" (no flash-attn required). If flash_attn is present & compatible, we'll use it.
 import os
-os.environ.setdefault("PYTORCH_NVML_BASED_CUDA_CHECK", "0")  # avoid NVML probe before ZeroGPU init
-import spaces  # MUST be imported before anything that may touch CUDA
 import gradio as gr
 from PIL import Image
-# --- Repos & constants ---
 OCR_REPO = "rednote-hilab/dots.ocr"
 CONVERT_REPO = "ZennyKenny/oss-20b-prereform-to-modern-ru-merged"
 SYSTEM_MSG = (
     "You convert Russian text from pre-1918 orthography to modern Russian spelling. "
     "Keep wording and punctuation; change only orthography."
 )
-# --- Lazy state (populated on first GPU call) ---
-_state = {
-    "ocr_model": None,
-    "ocr_processor": None,
-    "conv_model": None,
-    "conv_tok": None,
-    "ocr_prompt": None,
-}
-def _get_ocr_prompt():
-    """Fetch OCR text-extraction prompt from dots.ocr utils if available, else fallback."""
-    if _state["ocr_prompt"] is not None:
-        return _state["ocr_prompt"]
-    try:
-        # Import lazily to avoid early CUDA init
-        from dots_ocr.utils import dict_promptmode_to_prompt  # type: ignore
-        _state["ocr_prompt"] = dict_promptmode_to_prompt().get("prompt_ocr") or (
-            "Extract the original text from this image as plain text. "
-            "Keep the reading order. Do not translate. Do not add extra formatting."
-        )
-    except Exception:
-        _state["ocr_prompt"] = (
-            "Extract the original text from this image as plain text. "
-            "Keep the reading order. Do not translate. Do not add extra formatting."
-        )
-    return _state["ocr_prompt"]
-def _pick_attn_impl():
-    """
-    Decide attention backend for OCR model.
-    If flash_attn can be imported successfully (and matches Torch/CUDA), use flash_attention_2.
-    Otherwise fall back to eager (most stable on Spaces/ZeroGPU).
-    """
-    try:
-        import importlib
-        _ = importlib.import_module("flash_attn")  # may raise
-        return "flash_attention_2"
-    except Exception:
-        return "eager"
-def _ensure_models_on_gpu():
-    """
-    Create/load models ONLY when on the GPU worker.
-    No torch/transformers imports at module scope.
-    """
-    from transformers import AutoModelForCausalLM, AutoTokenizer, AutoProcessor
-    # OCR model/processor
-    if _state["ocr_model"] is None or _state["ocr_processor"] is None:
-        _state["ocr_model"] = AutoModelForCausalLM.from_pretrained(
-            OCR_REPO,
-            trust_remote_code=True,
-            attn_implementation=_pick_attn_impl(),  # "eager" if flash-attn unavailable
-            device_map="auto",
-            torch_dtype="auto",
-        )
-        _state["ocr_processor"] = AutoProcessor.from_pretrained(
-            OCR_REPO, trust_remote_code=True
-        )
-    # Conversion model/tokenizer (pre-reform -> modern Russian)
-    if _state["conv_model"] is None or _state["conv_tok"] is None:
-        _state["conv_tok"] = AutoTokenizer.from_pretrained(CONVERT_REPO, use_fast=True)
-        _state["conv_model"] = AutoModelForCausalLM.from_pretrained(
-            CONVERT_REPO,
-            device_map="auto",
-            torch_dtype="auto",
-        )
-def _run_ocr_on_gpu(pil_image: Image.Image) -> str:
-    """Run dots.ocr on the given image and return raw OCR text."""
-    # Heavy imports inside GPU context
-    import torch
-    from qwen_vl_utils import process_vision_info
-    ocr_model = _state["ocr_model"]
-    ocr_processor = _state["ocr_processor"]
-    ocr_prompt = _get_ocr_prompt()
-    # Build chat-style message with image + text
     messages = [
         {
             "role": "user",
             "content": [
                 {"type": "image", "image": pil_image},
-                {"type": "text", "text": ocr_prompt},
             ],
         }
     ]
-    # Apply the processor's chat template and package inputs
-    text = ocr_processor.apply_chat_template(
-        messages, tokenize=False, add_generation_prompt=True
-    )
     image_inputs, video_inputs = process_vision_info(messages)
-    inputs = ocr_processor(
         text=[text],
         images=image_inputs,
         videos=video_inputs,
         padding=True,
         return_tensors="pt",
-    )
-    # Move to model device
-    dev = next(ocr_model.parameters()).device
-    inputs = {k: (v.to(dev) if hasattr(v, "to") else v) for k, v in inputs.items()}
-    # Generate
     with torch.no_grad():
-        gen_ids = ocr_model.generate(**inputs, max_new_tokens=2048)
-        prompt_len = inputs["input_ids"].shape[1]
-        out_ids = gen_ids[0][prompt_len:]
-        text_out = ocr_processor.decode(out_ids, skip_special_tokens=True).strip()
-    return text_out
-def _convert_on_gpu(pre_reform_text: str) -> str:
     """Use your merged model to convert pre-reform Russian -> modern Russian."""
-    import torch
-    conv_model = _state["conv_model"]
-    conv_tok = _state["conv_tok"]
     messages = [
         {"role": "system", "content": SYSTEM_MSG},
         {"role": "user", "content": pre_reform_text},
     ]
-    prompt = conv_tok.apply_chat_template(
-        messages, tokenize=False, add_generation_prompt=True
-    )
-    inputs = conv_tok([prompt], return_tensors="pt")
-    dev = next(conv_model.parameters()).device
-    inputs = {k: (v.to(dev) if hasattr(v, "to") else v) for k, v in inputs.items()}
     with torch.no_grad():
-        gen = conv_model.generate(
             **inputs,
             max_new_tokens=1024,
-            do_sample=False,      # deterministic for orthography conversion
             temperature=0.0,
             repetition_penalty=1.05,
         )
     gen_only = gen[0][inputs["input_ids"].shape[1]:]
-    return conv_tok.decode(gen_only, skip_special_tokens=True).strip()
-@spaces.GPU()  # ZeroGPU entrypoint: all CUDA must happen inside here (or helpers it calls)
-def transcribe_and_convert(pil_image: Image.Image):
-    if pil_image is None:
-        return None, "", "", "Please upload an image."
-    # Lazily load models on the GPU worker
-    _ensure_models_on_gpu()
-    # 1) OCR
-    ocr_text = _run_ocr_on_gpu(pil_image)
-    # 2) Convert pre-reform -> modern Russian
-    modern_text = _convert_on_gpu(ocr_text)
-    # 3) Markdown code block for easy copy
-    md = f"```text\n{modern_text}\n```"
-    return pil_image, ocr_text, modern_text, md
-# ---------------- UI ----------------
 with gr.Blocks(title="Pre-reform → Modern Russian (OCR + Conversion)") as demo:
     gr.Markdown(
         "## Pre-reform → Modern Russian (OCR + Conversion)\n"
-        "1) Upload an image containing pre-1918 Russian text.\n"
-        "2) Click **Transcribe & Convert** — the app will OCR via `rednote-hilab/dots.ocr` and convert to modern spelling."
     )
     with gr.Row():
         with gr.Column(scale=1):
-            image_in = gr.Image(type="pil", label="Upload image")
             run_btn = gr.Button("Transcribe & Convert", variant="primary")
-            gr.Markdown("Tip: higher-resolution images OCR better. For PDFs, export a page as an image.")
         with gr.Column(scale=2):
             with gr.Row():
-                image_preview = gr.Image(label="Preview", interactive=False)
                 ocr_box = gr.Textbox(label="Transcribed (pre-reform)", lines=14)
                 modern_box = gr.Textbox(label="Modern Russian", lines=14)
-            md_block = gr.Markdown(label="Modern Russian (markdown code block)")
     run_btn.click(
         transcribe_and_convert,
-        inputs=[image_in],
-        outputs=[image_preview, ocr_box, modern_box, md_block],
         api_name="transcribe_convert",
     )
-demo.queue(max_size=16).launch()

 # app.py
+# Gradio + ZeroGPU: OCR pre-reform RU with dots.ocr -> convert to modern RU with your model.
+# The setup mirrors a known-working ZeroGPU Space:
+# - import `spaces` first
+# - snapshot the OCR repo locally
+# - load models at module scope (after spaces import)
+# - use @spaces.GPU() for the heavy call
+import spaces  # must be first so ZeroGPU patches CUDA init correctly
+import importlib.util
 import os
+import traceback
+from io import BytesIO
+from typing import Optional, Tuple
 import gradio as gr
+import requests
+import torch
+from huggingface_hub import snapshot_download
 from PIL import Image
+from qwen_vl_utils import process_vision_info
+from transformers import AutoModelForCausalLM, AutoProcessor, AutoTokenizer
+# ---------- Config ----------
+# Hub repo of the OCR model; it is snapshotted into OCR_LOCAL_DIR before loading.
 OCR_REPO = "rednote-hilab/dots.ocr"
+OCR_LOCAL_DIR = "./models/dots-ocr-local"  # local snapshot dir
+# Hub repo the conversion tokenizer and model are loaded from.
 CONVERT_REPO = "ZennyKenny/oss-20b-prereform-to-modern-ru-merged"
+# System message sent with every conversion request.
 SYSTEM_MSG = (
     "You convert Russian text from pre-1918 orthography to modern Russian spelling. "
     "Keep wording and punctuation; change only orthography."
 )
+# Text instruction sent to the OCR model together with the image.
+OCR_PROMPT = (
+    "Extract the original text from this image as plain text. "
+    "Keep the reading order. Do not translate. Do not add extra formatting."
+)
+# ---------- Utils ----------
+def fetch_image(image_input) -> Image.Image:
+    """Return *image_input* as a PIL image in RGB mode.
+
+    Args:
+        image_input: A PIL image, an ``http(s)://`` URL, or a local file path.
+
+    Returns:
+        A new RGB image that does not depend on the file it was read from.
+
+    Raises:
+        ValueError: If *image_input* is neither a PIL image nor a string.
+        requests.HTTPError: If a URL responds with an error status.
+    """
+    if isinstance(image_input, Image.Image):
+        return image_input.convert("RGB")
+    if isinstance(image_input, str):
+        if image_input.startswith(("http://", "https://")):
+            # NOTE(review): fetches an arbitrary URL; keep string inputs away from
+            # untrusted callers unless the target host is validated first.
+            resp = requests.get(image_input, timeout=30)
+            resp.raise_for_status()
+            source = BytesIO(resp.content)
+        else:
+            source = image_input
+        # Image.open() is lazy and keeps the underlying file open. convert() loads
+        # the pixels into a new image, so the source can be closed right away.
+        with Image.open(source) as img:
+            return img.convert("RGB")
+    raise ValueError(f"Unsupported image input: {type(image_input)}")
+# ---------- Snapshot + load models at module scope (after spaces import) ----------
+# Snapshot OCR model locally to avoid dynamic code churn and speed up cold starts.
+snapshot_download(
+    repo_id=OCR_REPO,
+    local_dir=OCR_LOCAL_DIR,
+    local_dir_use_symlinks=False,
+)
+# flash-attn is an optional CUDA extension. Asking for "flash_attention_2" when the
+# package is not installed makes from_pretrained() raise while the module is being
+# imported, which takes the whole app down. Use it only when it can actually be
+# loaded; otherwise fall back to the built-in "eager" attention.
+_ocr_attn_impl = (
+    "flash_attention_2"
+    if torch.cuda.is_available() and importlib.util.find_spec("flash_attn") is not None
+    else "eager"
+)
+_ocr_model = AutoModelForCausalLM.from_pretrained(
+    OCR_LOCAL_DIR,
+    attn_implementation=_ocr_attn_impl,
+    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else "auto",
+    device_map="auto",
+    trust_remote_code=True,
+)
+_ocr_processor = AutoProcessor.from_pretrained(OCR_LOCAL_DIR, trust_remote_code=True)
+# Load conversion model (pre-reform -> modern Russian)
+_convert_tokenizer = AutoTokenizer.from_pretrained(CONVERT_REPO, use_fast=True)
+_convert_model = AutoModelForCausalLM.from_pretrained(
+    CONVERT_REPO,
+    device_map="auto",
+    torch_dtype="auto",
+)
+# Device (safe after spaces import)
+_device = "cuda" if torch.cuda.is_available() else "cpu"
+# ---------- Core pipeline ----------
+def run_ocr(pil_image: Image.Image) -> str:
+    """Transcribe the text in *pil_image* with the dots.ocr model.
+
+    Args:
+        pil_image: Image to read. It is sent to the model together with
+            ``OCR_PROMPT`` as a single user message.
+
+    Returns:
+        The decoded model output with the prompt tokens removed and surrounding
+        whitespace stripped ("" if nothing was decoded).
+    """
     messages = [
         {
             "role": "user",
             "content": [
                 {"type": "image", "image": pil_image},
+                {"type": "text", "text": OCR_PROMPT},
             ],
         }
     ]
+    # Render the chat template to a prompt string and pull out the vision inputs.
+    text = _ocr_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     image_inputs, video_inputs = process_vision_info(messages)
+    inputs = _ocr_processor(
         text=[text],
         images=image_inputs,
         videos=video_inputs,
         padding=True,
         return_tensors="pt",
+    ).to(_ocr_model.device)  # follow the model's device, as the converter does
     with torch.no_grad():
+        # Greedy decoding. `temperature` only applies when sampling, so it is not
+        # passed alongside do_sample=False.
+        generated_ids = _ocr_model.generate(
+            **inputs,
+            max_new_tokens=4096,
+            do_sample=False,
+        )
+    # Each output row starts with its prompt tokens; keep only what was generated.
+    trimmed = [out[len(inp):] for inp, out in zip(inputs["input_ids"], generated_ids)]
+    out_text = _ocr_processor.batch_decode(
+        trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+    )[0]
+    return (out_text or "").strip()
+def convert_prereform_to_modern(pre_reform_text: str) -> str:
+    """Convert pre-reform Russian text to modern spelling with the merged model.
+
+    Args:
+        pre_reform_text: Text in pre-1918 orthography, typically the OCR output.
+
+    Returns:
+        The model's completion with the prompt removed and whitespace stripped.
+        Empty or whitespace-only input returns "" without calling the model.
+    """
+    # Nothing to convert: skip generation rather than let the model invent text.
+    if not pre_reform_text or not pre_reform_text.strip():
+        return ""
     messages = [
         {"role": "system", "content": SYSTEM_MSG},
         {"role": "user", "content": pre_reform_text},
     ]
+    prompt = _convert_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    inputs = _convert_tokenizer([prompt], return_tensors="pt").to(_convert_model.device)
     with torch.no_grad():
+        # Greedy decoding keeps the conversion deterministic. `temperature` only
+        # applies when sampling, so it is not passed alongside do_sample=False.
+        gen = _convert_model.generate(
             **inputs,
             max_new_tokens=1024,
+            do_sample=False,
             repetition_penalty=1.05,
         )
+    # Drop the prompt tokens that precede the generated continuation.
     gen_only = gen[0][inputs["input_ids"].shape[1]:]
+    return _convert_tokenizer.decode(gen_only, skip_special_tokens=True).strip()
+@spaces.GPU()  # heavy work happens on ZeroGPU worker
+def transcribe_and_convert(image_in) -> Tuple[Optional[Image.Image], str, str, str]:
+    """Run OCR and orthography conversion for one uploaded image.
+
+    Args:
+        image_in: Value of the image input (a PIL image, or ``None`` when
+            nothing was uploaded).
+
+    Returns:
+        ``(preview_image, ocr_text, modern_text, markdown)``. On success the
+        markdown is the modern text in a fenced code block. When no image was
+        given or any step fails, the image is ``None``, both texts are empty
+        and the markdown slot carries the message for the user.
+    """
+    # Without this guard a missing upload surfaces as an "Unsupported image input"
+    # error, which tells the user nothing about what to do.
+    if image_in is None:
+        return None, "", "", "Please upload an image."
+    try:
+        pil = fetch_image(image_in)
+        ocr_text = run_ocr(pil)
+        modern_text = convert_prereform_to_modern(ocr_text)
+        md_block = f"```text\n{modern_text}\n```"
+        return pil, ocr_text, modern_text, md_block
+    except Exception as e:
+        # UI boundary: log the full traceback and show a short message to the
+        # user instead of raising.
+        traceback.print_exc()
+        err = f"Error: {e}"
+        return None, "", "", err
+# ---------- UI ----------
 with gr.Blocks(title="Pre-reform → Modern Russian (OCR + Conversion)") as demo:
     gr.Markdown(
         "## Pre-reform → Modern Russian (OCR + Conversion)\n"
+        "Upload an image containing pre-1918 Russian text → OCR via **dots.ocr** → convert to modern Russian."
     )
+    # Two columns: upload and button on the left (scale=1), results on the right (scale=2).
     with gr.Row():
         with gr.Column(scale=1):
+            img_in = gr.Image(type="pil", label="Upload image (pre-reform Russian)")
             run_btn = gr.Button("Transcribe & Convert", variant="primary")
         with gr.Column(scale=2):
             with gr.Row():
+                img_preview = gr.Image(label="Preview", interactive=False)
                 ocr_box = gr.Textbox(label="Transcribed (pre-reform)", lines=14)
                 modern_box = gr.Textbox(label="Modern Russian", lines=14)
+            md_box = gr.Markdown(label="Modern Russian (markdown code block)")
+    # The outputs are listed in the same order as the tuple that
+    # transcribe_and_convert returns: image, OCR text, modern text, markdown.
     run_btn.click(
         transcribe_and_convert,
+        inputs=[img_in],
+        outputs=[img_preview, ocr_box, modern_box, md_box],
         api_name="transcribe_convert",
     )
+# Enable the request queue (max_size=10) and serve on all interfaces, port 7860.
+demo.queue(max_size=10).launch(server_name="0.0.0.0", server_port=7860, debug=True, show_error=True)