# Repository-page header captured together with the file (not program code):
#   gizemsarsinlar's picture
#   Update app.py
#   2cf5592 verified
import gradio as gr
from PIL import Image
import torch
import soundfile as sf # Ses işleme için
from transformers import AutoModelForCausalLM, AutoProcessor
import spaces
# Marker strings that delimit the user turn, the end of a turn, and the start
# of the assistant turn when a prompt is assembled for the model.
user_prompt = "<|user|>"
assistant_prompt = "<|assistant|>"
prompt_suffix = "<|end|>"

# Load the processor and the model once, at import time, so every request
# served by this module reuses the same objects.
model_path = "microsoft/Phi-4-multimodal-instruct"
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    trust_remote_code=True,
    torch_dtype="auto",
    device_map="auto",
    _attn_implementation="eager",
)
@spaces.GPU
def process_input(input_type, file):
    """Run the model on an uploaded file and return its text response.

    Args:
        input_type: ``"Image"`` to request an art analysis of the upload, or
            ``"Audio"`` to request a transcription of it.
        file: The uploaded file as handed over by the UI. It is passed to
            ``Image.open`` or ``sf.read`` depending on ``input_type``.

    Returns:
        The decoded model output as a string, or a short message when no
        file was uploaded, the input type is not recognised, or the file
        cannot be read as the selected kind of media.
    """
    if not file:
        # Ask for the kind of file that matches the selected mode.
        if input_type == "Audio":
            return "Please upload an audio file to transcribe."
        return "Please upload an image of an artwork."

    # Options shared by both modes; mode-specific options are added below.
    generation_kwargs = {"max_new_tokens": 1000, "num_logits_to_keep": 0}

    if input_type == "Image":
        # Prompt for artworks.
        prompt = (
            f"{user_prompt} You are an expert art historian and critic. Analyze the given artwork with these aspects:\n\n"
            "1. **General Description**: Describe the colors, composition, and subject.\n"
            "2. **Artistic Style**: Identify the artistic movement.\n"
            "3. **Historical Context**: Discuss the period and influences.\n"
            "4. **Symbolism & Meaning**: Interpret the messages conveyed.\n"
            "5. **Technical Analysis**: Examine brushwork, lighting, and composition.\n"
            "6. **Impact & Significance**: Explain its relevance in art history.\n\n"
            "Here is the artwork for analysis:\n"
            "<|image_1|>\n"
            f"{prompt_suffix}{assistant_prompt}"
        )
        try:
            # Image.open is lazy; convert() forces the decode while the file
            # is still open, so unreadable uploads fail here and the file
            # handle is released as soon as the pixels are in memory.
            with Image.open(file) as uploaded:
                image = uploaded.convert("RGB")
        except OSError:
            return "Could not read the uploaded file as an image."
        inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device)
        # temperature/top_k only take effect when sampling is enabled;
        # without do_sample=True generate() ignores them and decodes greedily.
        generation_kwargs.update(do_sample=True, temperature=0.7, top_k=50)
    elif input_type == "Audio":
        prompt = (
            f"{user_prompt} Please transcribe the given audio into text accurately.\n\n"
            "<|audio_1|>\n"
            f"{prompt_suffix}{assistant_prompt}"
        )
        try:
            audio, samplerate = sf.read(file)
        except (OSError, RuntimeError):
            # soundfile reports undecodable input through RuntimeError
            # subclasses; a missing or unreadable path surfaces as OSError.
            return "Could not read the uploaded file as audio."
        inputs = processor(text=prompt, audios=[(audio, samplerate)], return_tensors="pt").to(model.device)
        # Transcription keeps deterministic (greedy) decoding, so no sampling
        # options are added for this mode.
    else:
        return "Geçersiz giriş türü seçildi."

    with torch.no_grad():
        generate_ids = model.generate(**inputs, **generation_kwargs)

    # generate() returns prompt tokens followed by new tokens; keep only the
    # newly generated part before decoding.
    prompt_length = inputs["input_ids"].shape[1]
    generate_ids = generate_ids[:, prompt_length:]
    response = processor.batch_decode(
        generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    return response
# Build the web UI: controls in a narrow left column, the model's answer in a
# wider right column.
with gr.Blocks(title="Art & Audio Analysis with Phi-4") as demo:
    # Introductory text shown at the top of the page. The literal is kept
    # flush-left so the rendered Markdown text is exactly what is written here.
    gr.Markdown(
        """
# 🎨🗣️ Multimodal Art Analysis with Phi-4
- **Art Analysis**: Upload a piece of art, AI will perform a detailed analysis.
- **Audio Transcription**: Upload your audio file, AI will convert it to text.
🚀 Powered by Microsoft's `Phi-4-multimodal-instruct` model.
With this project, you can both analyze the uploaded image and convert the guide's verbal explanation into text while visiting any exhibition or museum.
"""
    )

    with gr.Row():
        # Left column: mode selector, file upload and the trigger button.
        with gr.Column(scale=1):
            input_type = gr.Radio(
                ["Image", "Audio"],
                value="Image",
                label="Select Input Type",
            )
            file_input = gr.File(
                file_types=["image", "audio"],
                label="Upload File",
            )
            submit_btn = gr.Button("Analyze", variant="primary")

        # Right column: read-only box that receives the model's response.
        with gr.Column(scale=2):
            output_text = gr.Textbox(
                lines=12,
                interactive=False,
                label="AI Response",
                placeholder="The AI's response will appear here...",
            )

    # Clicking the button sends the selected mode and the uploaded file to
    # process_input and writes its return value into the response box.
    submit_btn.click(process_input, [input_type, file_input], output_text)

demo.launch()