ShahadFawaz99 committed on
Commit 1e29672 · verified · 1 Parent(s): b159748

Create app.py

Files changed (1): app.py +167 -0
app.py ADDED
import gradio as gr  # Import Gradio for creating web interfaces
import torch  # Import PyTorch for deep learning
from PIL import Image  # Import PIL for image processing
from transformers import pipeline, CLIPProcessor, CLIPModel  # Import the needed classes from Hugging Face Transformers
import requests  # Import requests for making HTTP requests
from bs4 import BeautifulSoup  # Import BeautifulSoup for web scraping
from gtts import gTTS  # Import gTTS for text-to-speech conversion

# Select the device to use (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the BLIP model for image captioning
caption_image = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large", device=device)

# Load the CLIP model for zero-shot image classification
clip_model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14").to(device)
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")

# Load the English summarization model
summarization_pipeline = pipeline("summarization", model="google/pegasus-xsum")

# Load the Arabic summarization model
arabic_summarization_pipeline = pipeline("summarization", model="abdalrahmanshahrour/auto-arabic-summarization")

# Load the translation model
translation_pipeline = pipeline("translation", model="facebook/nllb-200-distilled-600M")

# Function to fetch long texts from Wikipedia
def get_wikipedia_summary(landmark_name, language='en'):
    url = f"https://{language}.wikipedia.org/wiki/{landmark_name.replace(' ', '_')}"  # Construct the article URL
    response = requests.get(url, timeout=10)  # Fetch the page (with a timeout so a hung request cannot block the app)
    response.raise_for_status()  # Fail early on HTTP errors (e.g., a missing article) instead of parsing an error page
    soup = BeautifulSoup(response.content, 'html.parser')  # Parse the HTML content with BeautifulSoup

    paragraphs = soup.find_all('p')  # Extract all paragraph elements
    summary_text = ' '.join(para.get_text() for para in paragraphs if para.get_text())  # Join text from all paragraphs

    return summary_text[:2000]  # Return the first 2000 characters of the text

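# Quick sanity check (illustrative sketch, commented out so it does not run at
# import time; "Eiffel Tower" is an example article title, not necessarily one
# of this app's labels):
# print(get_wikipedia_summary("Eiffel Tower", "en")[:200])
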
# Function to load landmarks from an external file
def load_landmarks(filename):
    landmarks = {}
    with open(filename, 'r', encoding='utf-8') as file:  # Open the file in read mode
        for line in file:
            if line.strip():
                english_name, arabic_name = line.strip().split('|', 1)  # Split on the first '|' so a stray delimiter in the Arabic name cannot break unpacking
                landmarks[english_name] = arabic_name  # Map the English name to the Arabic name
    return landmarks  # Return the dictionary of landmarks

# Load landmarks from the file
landmarks_dict = load_landmarks("landmarks.txt")

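# Expected format of landmarks.txt: one landmark per line, with the English and
# Arabic names separated by '|'. The rows below are illustrative examples, not
# the actual contents of the repository's file:
#   Eiffel Tower|برج إيفل
#   Great Wall of China|سور الصين العظيم
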
# Function to convert text to speech
def text_to_speech(text, language='en'):
    tts = gTTS(text=text, lang=language)  # Create a gTTS object for text-to-speech
    audio_file = "summary.mp3"  # Define the audio file name
    tts.save(audio_file)  # Save the audio file
    return audio_file  # Return the path to the audio file

# Function to generate a caption for the image
def generate_caption(image):
    return caption_image(image)[0]['generated_text']  # Take the generated caption from the model output

# Function to classify the image against the landmark labels using the CLIP model
def classify_image(image, labels):
    inputs = clip_processor(text=labels, images=image, return_tensors="pt", padding=True).to(device)  # Prepare inputs and move them to the same device as the model
    with torch.no_grad():  # No gradients are needed at inference time
        outputs = clip_model(**inputs)  # Get model outputs
    logits_per_image = outputs.logits_per_image  # Image-text similarity logits
    probs = logits_per_image.softmax(dim=1).cpu().numpy()[0]  # Convert logits to probabilities
    top_label = labels[probs.argmax()]  # Get the label with the highest probability
    top_prob = probs.max()  # Get the highest probability value
    return top_label, top_prob  # Return the top label and its probability

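# Quick sanity check (illustrative sketch, commented out so it does not run at
# import time; "tower.jpg" is a hypothetical local file, not part of the repo):
# img = Image.open("tower.jpg")
# label, prob = classify_image(img, list(landmarks_dict.keys()))
# print(label, prob)
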
# Function to summarize the description
def summarize_description(full_description, language):
    if language == 'ar':
        return arabic_summarization_pipeline(full_description, max_length=150, min_length=50, do_sample=False, truncation=True)[0]['summary_text']  # Summarize in Arabic (truncation keeps the input within the model's context window)
    else:
        return summarization_pipeline(full_description, max_length=150, min_length=50, do_sample=False, truncation=True)[0]['summary_text']  # Summarize in English

# Function to translate the caption and build the classification result
def translate_results(caption, top_label, top_prob, landmarks_dict, language):
    if language == 'ar':
        caption_translated = translation_pipeline(caption, src_lang='eng_Latn', tgt_lang='arb_Arab')[0]['translation_text']  # Translate the caption to Arabic
        classification_result = f"أفضل مطابقة: {landmarks_dict[top_label]} باحتمالية {top_prob:.4f}"  # Build the result directly in Arabic ("Best match: ... with probability ..."); the original routed this already-Arabic string through the English-to-Arabic translator
    else:
        caption_translated = caption  # Keep the caption in English
        classification_result = f"Best match: {top_label} with probability {top_prob:.4f}"  # English classification result

    return caption_translated, classification_result  # Return the localized results

# Function to process the image and generate all results
def process_image(image, language='en'):
    try:
        # Generate a caption for the image
        caption = generate_caption(image)

        # Classify the image against the known landmark labels
        top_label, top_prob = classify_image(image, list(landmarks_dict.keys()))

        # Pick the article title that matches the requested language
        landmark_name = top_label if language == 'en' else landmarks_dict[top_label]
        full_description = get_wikipedia_summary(landmark_name, language)  # Fetch the Wikipedia text for the top label

        # Summarize the full description
        summarized_description = summarize_description(full_description, language)

        # Localize the caption and classification result
        caption_translated, classification_result = translate_results(caption, top_label, top_prob, landmarks_dict, language)

        # Convert the summarized description to speech
        audio_file = text_to_speech(summarized_description, language)

        # Format the text outputs right-to-left for Arabic
        if language == 'ar':
            return f"<div style='text-align: right;'>{caption_translated}</div>", \
                   f"<div style='text-align: right;'>{classification_result}</div>", \
                   f"<div style='text-align: right;'>{summarized_description}</div>", \
                   audio_file
        else:
            return caption_translated, classification_result, summarized_description, audio_file
    except Exception as e:
        return "Error processing the image.", str(e), "", None  # Return None for the audio output so gr.Audio does not try to load "" as a file path

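# End-to-end sanity check (illustrative sketch, commented out so it does not
# run at import time; "tower.jpg" is a hypothetical local file):
# caption, classification, summary, audio = process_image(Image.open("tower.jpg"), language='en')
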
# Create the Gradio interface for English
english_interface = gr.Interface(
    fn=lambda image: process_image(image, language='en'),  # Function to call on image upload
    inputs=gr.Image(type="pil", label="Upload Image"),  # Input field for image upload
    outputs=[  # Define the output fields
        gr.Textbox(label="Generated Caption"),  # Output for the generated caption
        gr.Textbox(label="Classification Result"),  # Output for the classification result
        gr.Textbox(label="Summarized Description", lines=10),  # Output for the summarized description
        gr.Audio(label="Summary Audio", type="filepath")  # Output for the audio summary
    ],
    title="Landmark Recognition",  # Title of the interface
    description="Upload an image of a landmark, and we will generate a description, classify it, and provide simple information.",  # Description of the tool
    examples=[  # Examples for the user
        ["SOL.jfif"],
        ["OIP.jfif"]
    ]
)

# Create the Gradio interface for Arabic
arabic_interface = gr.Interface(
    fn=lambda image: process_image(image, language='ar'),  # Function to call on image upload
    inputs=gr.Image(type="pil", label="تحميل صورة"),  # Input field for image upload ("Upload Image")
    outputs=[  # Define the output fields
        gr.HTML(label="التعليق المولد"),  # Generated caption ("Generated Caption"); HTML so the text can be right-aligned
        gr.HTML(label="نتيجة التصنيف"),  # Classification result ("Classification Result")
        gr.HTML(label="الوصف الملخص"),  # Summarized description ("Summarized Description")
        gr.Audio(label="صوت الملخص", type="filepath")  # Audio summary ("Summary Audio")
    ],
    title="التعرف على المعالم",  # Title of the interface ("Landmark Recognition")
    description="قم بتحميل صورة لمعلم، وسنعمل على إنشاء وصف له وتصنيفه وتوفير معلومات بسيطة",  # Description of the tool ("Upload an image of a landmark, and we will create a description for it, classify it, and provide simple information")
    examples=[  # Examples for the user
        ["SOL.jfif"],
        ["OIP.jfif"]
    ]
)

# Merge the two interfaces into a tabbed interface
demo = gr.TabbedInterface(
    [english_interface, arabic_interface],  # List of interfaces to include
    ["English", "العربية"]  # Names of the tabs ("English", "Arabic")
)

# Launch the interface
demo.launch()  # Start the Gradio application
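# Note (sketch, not part of the original commit): on a hosted Space the plain
# demo.launch() above is all that is needed; when running locally, Gradio's
# standard share option can expose a temporary public link instead:
# demo.launch(share=True)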