Spaces:

whyumesh
/

eterniq_vision_to_code

Runtime error

File size: 6,475 Bytes

import gradio as gr
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
from PIL import Image
import cv2
import numpy as np
import os
import torch

# Allow TF32 math for CUDA matmuls and cuDNN kernels.
for _tf32_backend in (torch.backends.cuda.matmul, torch.backends.cudnn):
    _tf32_backend.allow_tf32 = True

# Startup diagnostics: report the PyTorch build and, when present, the GPU.
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

if torch.cuda.is_available():
    print(f"CUDA version: {torch.version.cuda}")
    print(f"Device count: {torch.cuda.device_count()}")
    print(f"Current device: {torch.cuda.current_device()}")
    print(f"Device name: {torch.cuda.get_device_name()}")

    # Make newly created float tensors default to the GPU.
    # NOTE(review): torch.set_default_tensor_type appears to be deprecated in
    # recent PyTorch releases -- confirm against the installed version.
    torch.set_default_tensor_type('torch.cuda.FloatTensor')


def load_model():
    """Load the Qwen2-VL model and its processor.

    Returns:
        A ``(model, processor, device)`` tuple. ``device`` is the CUDA device
        when CUDA is available and the CPU device otherwise. If anything goes
        wrong while loading, the error is printed and ``(None, None, None)``
        is returned instead of raising.
    """
    checkpoint = "Qwen/Qwen2-VL-2B-Instruct"
    try:
        has_cuda = torch.cuda.is_available()
        target_device = torch.device("cuda" if has_cuda else "cpu")
        print(f"Using device: {target_device}")

        # Half precision on GPU, full precision on CPU.
        weights_dtype = torch.float16 if has_cuda else torch.float32
        vl_model = Qwen2VLForConditionalGeneration.from_pretrained(
            checkpoint,
            torch_dtype=weights_dtype,
            device_map="auto",
            low_cpu_mem_usage=True,
        )
        vl_processor = AutoProcessor.from_pretrained(checkpoint)
    except Exception as e:
        print(f"Error loading model: {e}")
        return None, None, None
    return vl_model, vl_processor, target_device

# Load the model once at import time. load_model() returns (None, None, None)
# when loading fails, so all three names may be None.
model, processor, device = load_model()

# System message sent as the first turn of every conversation built by
# analyze_image() and analyze_video().
SYSTEM_PROMPT = """You are an expert technical analyst specializing in identifying bugs, fixing errors, and explaining code functions from visual inputs. When presented with an image or video:
1. If you see code, analyze it for potential bugs or errors, and suggest fixes.
2. If you see a function or algorithm, explain its purpose and how it works.
3. If you see a technical diagram or flowchart, interpret its meaning and purpose.
4. For any technical content, provide detailed explanations and insights.
Always maintain a professional and technical tone in your responses."""

def process_content(file, user_prompt):
    """Route an uploaded file to the image or video analyzer.

    Args:
        file: The upload handed over by the Gradio ``File`` component. It may
            be a filesystem path (``str`` / ``os.PathLike``) or an object
            exposing the path as ``.name``; ``None`` means nothing was
            uploaded.
        user_prompt: The question to ask about the uploaded content.

    Returns:
        The analysis text, or a human-readable message when no file was given
        or the file extension is not supported.
    """
    if file is None:
        return "No content provided. Please upload an image or video of technical content."

    # Depending on the Gradio version/configuration the component yields
    # either a plain path or a tempfile-like wrapper; a plain str has no
    # ``.name`` attribute, so handle both shapes explicitly.
    if isinstance(file, (str, os.PathLike)):
        file_path = os.fspath(file)
    else:
        file_path = file.name
    file_extension = os.path.splitext(file_path)[1].lower()

    if file_extension in ['.jpg', '.jpeg', '.png', '.bmp']:
        image = Image.open(file_path)
        return analyze_image(image, user_prompt)
    elif file_extension in ['.mp4', '.avi', '.mov']:
        return analyze_video(file_path, user_prompt)
    else:
        return "Unsupported file type. Please provide an image (jpg, jpeg, png, bmp) or video (mp4, avi, mov) of technical content."

def analyze_image(image, prompt):
    """Ask the model a question about a single image.

    Args:
        image: The image to analyze, passed through as the ``image`` entry of
            the user turn.
        prompt: The user's question; it is appended to a fixed lead-in that
            points the model back at the system instructions.

    Returns:
        Whatever ``generate_response`` returns for the two-turn conversation.
    """
    system_turn = {"role": "system", "content": SYSTEM_PROMPT}
    question = {"type": "text", "text": f"Based on the system instructions, {prompt}"}
    user_turn = {
        "role": "user",
        "content": [{"type": "image", "image": image}, question],
    }
    return generate_response([system_turn, user_turn])

def analyze_video(video_path, prompt, max_frames=16, frame_interval=30, max_resolution=224):
    """Sample frames from a video and ask the model a question about them.

    Frames are read sequentially; every ``frame_interval``-th frame is kept
    until ``max_frames`` have been collected or the video ends. Each kept
    frame is resized so its longer side equals ``max_resolution`` (aspect
    ratio preserved), converted from BGR to RGB and wrapped as a PIL image.

    Args:
        video_path: Path of the video file to open with OpenCV.
        prompt: The user's question about the video.
        max_frames: Upper bound on the number of frames sent to the model.
        frame_interval: Keep one frame out of every ``frame_interval`` read.
        max_resolution: Length in pixels of the longer side after resizing.

    Returns:
        The model's answer, or an ``Error...`` message when the video cannot
        be opened, yields no frames, or processing raises.
    """
    # Bound before the try so the finally clause can always tell whether a
    # capture object was created.
    cap = None
    try:
        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            return "Error: Could not open video file."

        frames = []
        frame_count = 0

        while len(frames) < max_frames:
            ret, frame = cap.read()
            if not ret:
                break

            if frame_count % frame_interval == 0:
                h, w = frame.shape[:2]
                if h > w:
                    new_h, new_w = max_resolution, int(w * max_resolution / h)
                else:
                    new_h, new_w = int(h * max_resolution / w), max_resolution
                # Extreme aspect ratios can truncate the short side to 0,
                # which cv2.resize rejects; keep at least one pixel.
                new_h, new_w = max(1, new_h), max(1, new_w)
                frame = cv2.resize(frame, (new_w, new_h))

                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frame = Image.fromarray(frame)

                frames.append(frame)

            frame_count += 1

        # An empty frame list would only fail later with an obscure error
        # inside the vision preprocessing; report it directly instead.
        if not frames:
            return "Error: No frames could be read from the video file."

        return generate_response([
            {"role": "system", "content": SYSTEM_PROMPT},
            {
                "role": "user",
                "content": [
                    {"type": "video", "video": frames},
                    {"type": "text", "text": f"Based on the system instructions, {prompt}"},
                ],
            }
        ])
    except Exception as e:
        return f"Error processing video: {e}"
    finally:
        if cap is not None:
            cap.release()
                

def generate_response(messages):
    """Run the vision-language model on a chat conversation.

    Args:
        messages: Chat turns in the format accepted by
            ``processor.apply_chat_template`` and ``process_vision_info``
            (a list of ``{"role": ..., "content": ...}`` dicts).

    Returns:
        The decoded text of the first generated sequence, with the prompt
        tokens stripped. On any failure an ``Error generating response: ...``
        message is returned instead of raising.
    """
    try:
        # load_model() leaves these as None when loading failed; say so
        # plainly rather than surfacing a NoneType attribute error.
        if model is None or processor is None:
            return "Error generating response: model is not loaded."

        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        image_inputs, video_inputs = process_vision_info(messages)

        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt"
        )

        # Move inputs to the target device. Use the batch object's own .to()
        # rather than rebuilding a plain dict, and index by key below: a
        # plain dict has no ``.input_ids`` attribute, which made every call
        # fail after generation.
        inputs = inputs.to(device)

        with torch.no_grad():
            generated_ids = model.generate(
                **inputs,
                max_new_tokens=512,
                do_sample=True,
                top_k=20,
                top_p=0.9,
                temperature=0.7
            )

        # generate() returns prompt + continuation; drop the prompt part of
        # each sequence so only the newly generated tokens are decoded.
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs["input_ids"], generated_ids)
        ]
        output_text = processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False
        )

        return output_text[0]
    except Exception as e:
        return f"Error generating response: {e}"
    finally:
        # Clear the CUDA cache on success and on failure alike, so a failed
        # request does not keep cached blocks allocated.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        
# Gradio interface: one file upload plus a free-text question, text output.
_upload_input = gr.File(label="Upload Image or Video of Technical Content")
_question_input = gr.Textbox(
    label="Enter your technical question",
    placeholder="e.g., Identify any bugs in this code and suggest fixes",
    value="Analyze this technical content and provide insights.",
)

iface = gr.Interface(
    fn=process_content,
    inputs=[_upload_input, _question_input],
    outputs="text",
    title="Technical Content Analysis",
    description="Upload an image or video of code, diagrams, or technical content. Ask questions about bugs, errors, or explanations of functions.",
)

# Start the web server and request a public share link.
iface.launch(share=True)