# Hugging Face Spaces status: Running on Zero
import gradio as gr | |
from PIL import Image | |
import torch | |
import soundfile as sf # Ses işleme için | |
from transformers import AutoModelForCausalLM, AutoProcessor | |
import spaces | |
# Marker tokens that process_input wraps around every prompt it builds.
user_prompt = "<|user|>"
assistant_prompt = "<|assistant|>"
prompt_suffix = "<|end|>"

# Load the model and its processor once, at import time, so every request
# reuses the same instances.
# NOTE(review): trust_remote_code=True allows code shipped in the model
# repository to be executed while loading - keep model_path pinned to a
# trusted repository.
model_path = "microsoft/Phi-4-multimodal-instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    trust_remote_code=True,
    torch_dtype="auto",
    device_map="auto",
    _attn_implementation="eager",
)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
def process_input(input_type, file):
    """Analyze an uploaded artwork image or transcribe an uploaded audio file.

    Args:
        input_type: Mode selected in the UI; "Image" or "Audio".
        file: The upload as delivered by the file input. It is handed
            straight to ``Image.open`` / ``sf.read``.

    Returns:
        The decoded model response, or a short user-facing message when no
        file was supplied, the file cannot be decoded for the selected mode,
        or ``input_type`` is not recognised.
    """
    if not file:
        # Tailor the hint to the selected mode; the image wording stays the
        # default so every other mode keeps the message it always had.
        if input_type == "Audio":
            return "Please upload an audio file."
        return "Please upload an image of an artwork."

    if input_type == "Image":
        # Prompt for artworks.
        prompt = (
            f"{user_prompt} You are an expert art historian and critic. Analyze the given artwork with these aspects:\n\n"
            f"1. **General Description**: Describe the colors, composition, and subject.\n"
            f"2. **Artistic Style**: Identify the artistic movement.\n"
            f"3. **Historical Context**: Discuss the period and influences.\n"
            f"4. **Symbolism & Meaning**: Interpret the messages conveyed.\n"
            f"5. **Technical Analysis**: Examine brushwork, lighting, and composition.\n"
            f"6. **Impact & Significance**: Explain its relevance in art history.\n\n"
            f"Here is the artwork for analysis:\n"
            f"<|image_1|>\n"
            f"{prompt_suffix}{assistant_prompt}"
        )
        try:
            # convert() loads the pixel data, so the file handle can be
            # released right away; RGB also normalises palette/alpha images.
            with Image.open(file) as raw_image:
                image = raw_image.convert("RGB")
        except OSError:
            # Covers PIL.UnidentifiedImageError (e.g. an audio file uploaded
            # while "Image" is selected) as well as plain I/O failures.
            return "Could not read the uploaded file as an image. Please upload a valid image file."
        inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device)
    elif input_type == "Audio":
        prompt = (
            f"{user_prompt} Please transcribe the given audio into text accurately.\n\n"
            f"<|audio_1|>\n"
            f"{prompt_suffix}{assistant_prompt}"
        )
        try:
            audio, samplerate = sf.read(file)
        except (OSError, RuntimeError):
            # soundfile reports undecodable input through RuntimeError
            # subclasses.
            return "Could not read the uploaded file as audio. Please upload a valid audio file."
        inputs = processor(text=prompt, audios=[(audio, samplerate)], return_tensors="pt").to(model.device)
    else:
        return "Geçersiz giriş türü seçildi."

    with torch.no_grad():
        generate_ids = model.generate(
            **inputs,
            max_new_tokens=1000,
            num_logits_to_keep=0,
            # temperature/top_k are only honoured when sampling is enabled;
            # without do_sample=True generation is greedy and ignores them.
            do_sample=True,
            temperature=0.7,
            top_k=50,
        )

    # generate() returns prompt + completion; keep only the new tokens.
    prompt_length = inputs["input_ids"].shape[1]
    generate_ids = generate_ids[:, prompt_length:]
    response = processor.batch_decode(
        generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    return response
# Build the Gradio UI: a mode selector, a file upload and a submit button on
# the left, the model's text response on the right.
with gr.Blocks(title="Art & Audio Analysis with Phi-4") as demo:
    gr.Markdown(
        """
# 🎨🗣️ Multimodal Art Analysis with Phi-4
- **Art Analysis**: Upload a piece of art, AI will perform a detailed analysis.
- **Audio Transcription**: Upload your audio file, AI will convert it to text.
🚀 Powered by Microsoft's `Phi-4-multimodal-instruct` model.
With this project, you can both analyze the uploaded image and convert the guide's verbal explanation into text while visiting any exhibition or museum.
"""
    )
    with gr.Row():
        with gr.Column(scale=1):
            input_type = gr.Radio(
                choices=["Image", "Audio"],
                label="Select Input Type",
                value="Image",
            )
            file_input = gr.File(
                label="Upload File",
                file_types=["image", "audio"],
            )
            submit_btn = gr.Button("Analyze", variant="primary")
        with gr.Column(scale=2):
            output_text = gr.Textbox(
                label="AI Response",
                placeholder="The AI's response will appear here...",
                lines=12,
                interactive=False,
            )
    submit_btn.click(
        # On ZeroGPU hardware a GPU is requested only while a function
        # wrapped by spaces.GPU is running; the wrapper is documented as a
        # no-op in other environments.
        fn=spaces.GPU(process_input),
        inputs=[input_type, file_input],
        outputs=output_text,
    )

demo.launch()