Spaces:

sagar007
/

phi-vision-math-assistant

Paused

File size: 7,742 Bytes

import os
import subprocess
import sys
from io import BytesIO

# NOTE: keep `spaces` imported before `torch`/`transformers`, as in the original order.
import gradio as gr
import requests
import spaces
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor

# Install flash-attn at startup (best effort).
# The child process gets the current environment plus
# FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE. Passing only that one variable as
# `env` would replace the whole environment and drop PATH, HOME, proxy
# settings, etc. for the installer.
subprocess.run(
    # Use the running interpreter's pip so the package lands in this environment,
    # and pass an argument list so no shell is involved.
    [sys.executable, "-m", "pip", "install", "flash-attn", "--no-build-isolation"],
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    check=False,  # a failed install must not prevent the app from starting
)

# Load the model and processor once at import time; the model stays on the CPU
# here and is moved to the GPU only inside the GPU-decorated request handler.
model_id = "microsoft/Phi-3.5-vision-instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    torch_dtype=torch.float16,
    use_flash_attention_2=False,  # Explicitly disable Flash Attention 2
)
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True, num_crops=16)

@spaces.GPU(duration=120)  # Adjust the duration as needed
def solve_math_problem(image):
    """Generate a step-by-step solution for the math problem shown in *image*.

    Args:
        image: The problem image (the UI supplies a PIL image). ``None`` means
            nothing was uploaded.

    Returns:
        The decoded model response as a string, or a short prompt asking for
        an image when *image* is ``None``.
    """
    # Without an image the prompt would still contain the <|image_1|>
    # placeholder, so bail out early instead of generating from text alone.
    if image is None:
        return "Please upload an image of a math problem first."

    # Prepare the input
    messages = [
        {"role": "user", "content": "<|image_1|>\nSolve this math problem step by step. Explain your reasoning clearly."},
    ]
    prompt = processor.tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    generation_args = {
        "max_new_tokens": 1000,
        "temperature": 0.2,
        "do_sample": True,
    }

    try:
        # Move model to GPU for this function call
        model.to('cuda')

        # Process the input
        inputs = processor(prompt, image, return_tensors="pt").to("cuda")

        # Generate the response
        generate_ids = model.generate(
            **inputs,
            eos_token_id=processor.tokenizer.eos_token_id,
            **generation_args,
        )

        # Drop the prompt tokens so only the newly generated text is decoded
        generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
        response = processor.batch_decode(
            generate_ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )[0]
    finally:
        # Move model back to CPU to free up GPU memory, even when
        # preprocessing or generation raised.
        model.to('cpu')

    return response

# Function to load image from URL
def load_image_from_url(url, timeout=10):
    """Download *url* and open the response body as a PIL image.

    Args:
        url: Address of the image to fetch.
        timeout: Seconds to wait for the server before giving up; without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        The image opened from the downloaded bytes.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail on 4xx/5xx here rather than handing an error page to the image decoder.
    response.raise_for_status()
    return Image.open(BytesIO(response.content))

# Custom CSS
# This string is handed to gr.Blocks(css=...), which takes raw CSS rules, so it
# must not be wrapped in <style>...</style> tags: the stray opening tag would be
# parsed as part of the first selector and invalidate the `body` rule.
custom_css = """
    body {
        background: linear-gradient(135deg, #1a1c2c, #4a4e69, #9a8c98);
        font-family: 'Arial', sans-serif;
        color: #f2e9e4;
        margin: 0;
        padding: 0;
        min-height: 100vh;
    }
    #app-header {
        text-align: center;
        background: rgba(255, 255, 255, 0.1);
        padding: 30px;
        border-radius: 20px;
        box-shadow: 0 10px 30px rgba(0, 0, 0, 0.3);
        position: relative;
        overflow: hidden;
        margin: 20px auto;
        max-width: 800px;
    }
    #app-header::before {
        content: "";
        position: absolute;
        top: -50%;
        left: -50%;
        width: 200%;
        height: 200%;
        background: radial-gradient(circle, rgba(255,255,255,0.1) 0%, rgba(255,255,255,0) 70%);
        animation: shimmer 15s infinite linear;
    }
    @keyframes shimmer {
        0% { transform: rotate(0deg); }
        100% { transform: rotate(360deg); }
    }
    #app-header h1 {
        color: #f2e9e4;
        font-size: 2.5em;
        margin-bottom: 15px;
        text-shadow: 2px 2px 4px rgba(0,0,0,0.5);
    }
    #app-header p {
        font-size: 1.2em;
        color: #c9ada7;
    }
    .concept-container {
        display: flex;
        justify-content: center;
        gap: 20px;
        margin-top: 30px;
        flex-wrap: wrap;
    }
    .concept {
        position: relative;
        transition: transform 0.3s, box-shadow 0.3s;
        border-radius: 15px;
        overflow: hidden;
        background: rgba(255, 255, 255, 0.1);
        box-shadow: 0 5px 15px rgba(0,0,0,0.2);
        width: 150px;
        height: 150px;
        display: flex;
        flex-direction: column;
        justify-content: center;
        align-items: center;
    }
    .concept:hover {
        transform: translateY(-10px) rotate(3deg);
        box-shadow: 0 15px 30px rgba(0,0,0,0.4);
    }
    .concept-emoji {
        font-size: 60px;
        margin-bottom: 10px;
    }
    .concept-description {
        background-color: rgba(110, 72, 170, 0.8);
        color: white;
        padding: 10px;
        font-size: 0.9em;
        text-align: center;
        width: 100%;
        position: absolute;
        bottom: 0;
    }
    .artifact {
        position: absolute;
        background: radial-gradient(circle, rgba(255,255,255,0.1) 0%, rgba(255,255,255,0) 70%);
        border-radius: 50%;
        opacity: 0.5;
        pointer-events: none;
    }
    .artifact.large {
        width: 400px;
        height: 400px;
        top: -100px;
        left: -200px;
        animation: float 20s infinite ease-in-out;
    }
    .artifact.medium {
        width: 300px;
        height: 300px;
        bottom: -150px;
        right: -150px;
        animation: float 15s infinite ease-in-out reverse;
    }
    .artifact.small {
        width: 150px;
        height: 150px;
        top: 50%;
        left: 50%;
        transform: translate(-50%, -50%);
        animation: pulse 5s infinite alternate;
    }
    @keyframes float {
        0%, 100% { transform: translateY(0) rotate(0deg); }
        50% { transform: translateY(-20px) rotate(10deg); }
    }
    @keyframes pulse {
        0% { transform: translate(-50%, -50%) scale(1); opacity: 0.5; }
        100% { transform: translate(-50%, -50%) scale(1.1); opacity: 0.8; }
    }
    /* Gradio component styling */
    .gr-box {
        background-color: rgba(255, 255, 255, 0.1) !important;
        border: 1px solid rgba(255, 255, 255, 0.2) !important;
    }
    .gr-input, .gr-button {
        background-color: rgba(255, 255, 255, 0.1) !important;
        color: #f2e9e4 !important;
        border: 1px solid rgba(255, 255, 255, 0.2) !important;
    }
    .gr-button:hover {
        background-color: rgba(255, 255, 255, 0.2) !important;
    }
    .gr-form {
        background-color: transparent !important;
    }
"""

# Custom HTML
# Header banner markup: three decorative "artifact" layers, the title and
# tagline, then one card per (emoji, caption) pair below.
_concept_cards = "".join(
    f"""        <div class="concept">
            <div class="concept-emoji">{emoji}</div>
            <div class="concept-description">{caption}</div>
        </div>
"""
    for emoji, caption in (
        ("🧮", "Problem Solving"),
        ("📷", "Image Recognition"),
        ("🤖", "AI-Powered"),
        ("📝", "Step-by-Step"),
    )
)

custom_html = (
    """
<div id="app-header">
    <div class="artifact large"></div>
    <div class="artifact medium"></div>
    <div class="artifact small"></div>
    <h1>Visual Math Problem Solver</h1>
    <p>Upload an image of a math problem, and I'll try to solve it step by step!</p>
    <div class="concept-container">
"""
    + _concept_cards
    + """    </div>
</div>
"""
)


# Create the Gradio interface
def _solve_example(example):
    """Solve one example entry, given either as an image URL or a loaded image.

    Example values are routed through the image input component before the
    example function is called, so the value arriving here may already be a
    decoded image rather than the URL string listed in ``examples``. Only
    strings are downloaded; anything else is passed to the solver as-is.
    """
    image = load_image_from_url(example) if isinstance(example, str) else example
    return solve_math_problem(image)


with gr.Blocks(css=custom_css) as iface:
    # Decorative header banner
    gr.HTML(custom_html)
    with gr.Row():
        # Left column: image upload and trigger button
        with gr.Column(scale=1):
            input_image = gr.Image(type="pil", label="Upload Math Problem Image")
            submit_btn = gr.Button("Solve Problem")
        # Right column: generated solution
        with gr.Column(scale=1):
            output_text = gr.Textbox(label="Solution", lines=10)

    submit_btn.click(fn=solve_math_problem, inputs=input_image, outputs=output_text)

    gr.Examples(
        examples=[
            "https://i.imgur.com/2Gwd3bN.jpg",  # Replace with actual URLs of math problem images
            "https://i.imgur.com/wPw5YtB.jpg"
        ],
        inputs=input_image,
        outputs=output_text,
        fn=_solve_example,
        cache_examples=True,
    )

# Launch the app
iface.launch()