"""Gradio demo: artwork analysis and audio transcription with Phi-4 multimodal.

The script loads ``microsoft/Phi-4-multimodal-instruct`` once at import time,
exposes ``process_input`` as the single inference entry point, and wires it to
a small Gradio Blocks UI.
"""

import gradio as gr
from PIL import Image
import torch
import soundfile as sf  # audio file decoding
from transformers import AutoModelForCausalLM, AutoProcessor
import spaces

# Load the model and its processor once, at import time.
model_path = "microsoft/Phi-4-multimodal-instruct"

processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    torch_dtype="auto",
    trust_remote_code=True,
    _attn_implementation="eager",
)

# Chat-template markers used to assemble the prompts below.
user_prompt = "<|user|>"
assistant_prompt = "<|assistant|>"
prompt_suffix = "<|end|>"


@spaces.GPU
def process_input(input_type, file):
    """Run the model on an uploaded image or audio file and return its reply.

    Args:
        input_type: ``"Image"`` to request an art-historical analysis, or
            ``"Audio"`` to request a transcription.
        file: The uploaded file as delivered by the ``gr.File`` component
            (anything accepted by ``PIL.Image.open`` / ``soundfile.read``).
            A falsy value means nothing was uploaded.

    Returns:
        The decoded model response, or a short human-readable message when no
        file was uploaded, the file cannot be decoded as the selected type, or
        the input type is not recognised.
    """
    if not file:
        # Tailor the hint to the selected mode instead of always asking for
        # an image.
        if input_type == "Audio":
            return "Please upload an audio file to transcribe."
        return "Please upload an image of an artwork."

    if input_type == "Image":
        # Prompt for artworks.
        prompt = (
            f"{user_prompt} You are an expert art historian and critic. Analyze the given artwork with these aspects:\n\n"
            "1. **General Description**: Describe the colors, composition, and subject.\n"
            "2. **Artistic Style**: Identify the artistic movement.\n"
            "3. **Historical Context**: Discuss the period and influences.\n"
            "4. **Symbolism & Meaning**: Interpret the messages conveyed.\n"
            "5. **Technical Analysis**: Examine brushwork, lighting, and composition.\n"
            "6. **Impact & Significance**: Explain its relevance in art history.\n\n"
            "Here is the artwork for analysis:\n"
            "<|image_1|>\n"
            f"{prompt_suffix}{assistant_prompt}"
        )
        try:
            # Image.open is lazy; convert() forces the pixels to be read so
            # the underlying file can be closed right away.
            with Image.open(file) as opened:
                image = opened.convert("RGB")
        except OSError:
            # Covers PIL.UnidentifiedImageError (e.g. an audio file uploaded
            # while "Image" is selected) as well as unreadable paths.
            return "The uploaded file could not be read as an image."
        inputs = processor(
            text=prompt, images=image, return_tensors="pt"
        ).to(model.device)

    elif input_type == "Audio":
        prompt = (
            f"{user_prompt} Please transcribe the given audio into text accurately.\n\n"
            "<|audio_1|>\n"
            f"{prompt_suffix}{assistant_prompt}"
        )
        try:
            audio, samplerate = sf.read(file)
        except (OSError, RuntimeError):
            # soundfile reports undecodable input through RuntimeError
            # subclasses.
            return "The uploaded file could not be read as audio."
        inputs = processor(
            text=prompt, audios=[(audio, samplerate)], return_tensors="pt"
        ).to(model.device)

    else:
        return "Geçersiz giriş türü seçildi."

    with torch.no_grad():
        # NOTE(review): temperature/top_k only influence decoding when
        # sampling is enabled (do_sample=True). Confirm whether the model's
        # generation config turns it on; otherwise these two are ignored.
        generate_ids = model.generate(
            **inputs,
            max_new_tokens=1000,
            num_logits_to_keep=0,
            temperature=0.7,
            top_k=50,
        )

    # Drop the prompt tokens so only the newly generated text is decoded.
    prompt_length = inputs["input_ids"].shape[1]
    generate_ids = generate_ids[:, prompt_length:]
    response = processor.batch_decode(
        generate_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]
    return response


with gr.Blocks(title="Art & Audio Analysis with Phi-4") as demo:
    gr.Markdown(
        """
        # 🎨🗣️ Multimodal Art Analysis with Phi-4

        - **Art Analysis**: Upload a piece of art, AI will perform a detailed analysis.
        - **Audio Transcription**: Upload your audio file, AI will convert it to text.

        🚀 Powered by Microsoft's `Phi-4-multimodal-instruct` model.

        With this project, you can both analyze the uploaded image and convert the guide's verbal explanation into text while visiting any exhibition or museum.
        """
    )

    with gr.Row():
        # Left column: inputs and the trigger button.
        with gr.Column(scale=1):
            input_type = gr.Radio(
                choices=["Image", "Audio"],
                label="Select Input Type",
                value="Image",
            )
            file_input = gr.File(
                label="Upload File",
                file_types=["image", "audio"],
            )
            submit_btn = gr.Button("Analyze", variant="primary")

        # Right column: read-only model output.
        with gr.Column(scale=2):
            output_text = gr.Textbox(
                label="AI Response",
                placeholder="The AI's response will appear here...",
                lines=12,
                interactive=False,
            )

    submit_btn.click(
        fn=process_input,
        inputs=[input_type, file_input],
        outputs=output_text,
    )

demo.launch()