import gradio as gr
from PIL import Image
import torch
import soundfile as sf  # Ses işleme için
from transformers import AutoModelForCausalLM, AutoProcessor
import spaces

# Marker strings placed around the user turn and before the assistant turn
# when prompts are assembled for the model.
user_prompt = "<|user|>"
assistant_prompt = "<|assistant|>"
prompt_suffix = "<|end|>"

# Load the processor and the model once at import time so every request
# reuses the same instances.
model_path = "microsoft/Phi-4-multimodal-instruct"
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)

# Options forwarded verbatim to ``from_pretrained`` for the model weights.
_model_load_options = {
    "device_map": "auto",
    "torch_dtype": "auto",
    # NOTE: trust_remote_code runs Python code shipped in the model repository.
    "trust_remote_code": True,
    "_attn_implementation": "eager",
}
model = AutoModelForCausalLM.from_pretrained(model_path, **_model_load_options)

@spaces.GPU
def process_input(input_type, file):
    """Run the Phi-4 multimodal model on an uploaded file and return its text.

    Args:
        input_type: ``"Image"`` to analyse an artwork or ``"Audio"`` to
            transcribe a recording.
        file: The uploaded file, passed straight to ``Image.open`` or
            ``soundfile.read``. Falsy when nothing was uploaded.

    Returns:
        The decoded model response, or a short user-facing message when no
        file was uploaded, the file could not be read for the selected input
        type, or ``input_type`` is not recognised.
    """
    if not file:
        # Match the hint to the selected mode instead of always asking for an
        # image.
        if input_type == "Audio":
            return "Please upload an audio file to transcribe."
        return "Please upload an image of an artwork."

    if input_type == "Image":
        # Prompt for artworks.
        prompt = (
            f"{user_prompt} You are an expert art historian and critic. Analyze the given artwork with these aspects:\n\n"
            "1. **General Description**: Describe the colors, composition, and subject.\n"
            "2. **Artistic Style**: Identify the artistic movement.\n"
            "3. **Historical Context**: Discuss the period and influences.\n"
            "4. **Symbolism & Meaning**: Interpret the messages conveyed.\n"
            "5. **Technical Analysis**: Examine brushwork, lighting, and composition.\n"
            "6. **Impact & Significance**: Explain its relevance in art history.\n\n"
            "Here is the artwork for analysis:\n"
            "<|image_1|>\n"
            f"{prompt_suffix}{assistant_prompt}"
        )
        try:
            # ``convert`` loads the pixel data, so the file handle can be
            # closed right away; it also normalises palette/alpha/greyscale
            # images to plain RGB.
            with Image.open(file) as uploaded:
                image = uploaded.convert("RGB")
        except OSError:
            # Covers missing files and PIL.UnidentifiedImageError (an OSError
            # subclass), e.g. an audio file uploaded in "Image" mode.
            return "The uploaded file could not be read as an image."
        inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device)
        # temperature/top_k only take effect when sampling is enabled; without
        # do_sample=True they are ignored and decoding is greedy.
        decoding_options = {"do_sample": True, "temperature": 0.7, "top_k": 50}

    elif input_type == "Audio":
        prompt = (
            f"{user_prompt} Please transcribe the given audio into text accurately.\n\n"
            "<|audio_1|>\n"
            f"{prompt_suffix}{assistant_prompt}"
        )
        try:
            audio, samplerate = sf.read(file)
        except (RuntimeError, OSError):
            # soundfile reports unreadable/unsupported files through
            # RuntimeError-derived exceptions.
            return "The uploaded file could not be read as audio."
        inputs = processor(text=prompt, audios=[(audio, samplerate)], return_tensors="pt").to(model.device)
        # Transcription should be repeatable, so decode greedily and do not
        # pass sampling flags that would be ignored anyway.
        decoding_options = {"do_sample": False}

    else:
        return "Geçersiz giriş türü seçildi."

    with torch.no_grad():
        generate_ids = model.generate(
            **inputs,
            max_new_tokens=1000,
            num_logits_to_keep=0,
            **decoding_options,
        )
    # Drop the prompt tokens so only the newly generated continuation is decoded.
    prompt_length = inputs["input_ids"].shape[1]
    generate_ids = generate_ids[:, prompt_length:]
    response = processor.batch_decode(
        generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]

    return response

# Introductory Markdown rendered at the top of the page.
_intro_markdown = """
        # 🎨🗣️ Multimodal Art Analysis with Phi-4  
        - **Art Analysis**: Upload a piece of art, AI will perform a detailed analysis.
        - **Audio Transcription**: Upload your audio file, AI will convert it to text.
        
        🚀 Powered by Microsoft's `Phi-4-multimodal-instruct` model.

        With this project, you can both analyze the uploaded image and convert the guide's verbal explanation into text while visiting any exhibition or museum.
        
        """

with gr.Blocks(title="Art & Audio Analysis with Phi-4") as demo:
    gr.Markdown(_intro_markdown)

    with gr.Row():
        # Narrow column: input-type selector, file upload and submit button.
        with gr.Column(scale=1):
            input_type = gr.Radio(
                label="Select Input Type",
                choices=["Image", "Audio"],
                value="Image",
            )
            file_input = gr.File(
                file_types=["image", "audio"],
                label="Upload File",
            )
            submit_btn = gr.Button("Analyze", variant="primary")

        # Wide column: read-only box showing the model's answer.
        with gr.Column(scale=2):
            output_text = gr.Textbox(
                label="AI Response",
                lines=12,
                interactive=False,
                placeholder="The AI's response will appear here...",
            )

    # Clicking the button sends the selected type and the uploaded file to
    # process_input and writes its return value into the response box.
    submit_btn.click(
        fn=process_input,
        inputs=[input_type, file_input],
        outputs=output_text,
    )

demo.launch()