# Hugging Face Space (running on ZeroGPU): Arabic-focused text analysis
# built on Qwen embedding models.
import gradio as gr
import torch
import torch.nn.functional as F
import pandas as pd
import spaces
from typing import List, Tuple, Dict
from torch import Tensor
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
# Shared embedder state: created lazily inside the GPU worker and rebuilt
# whenever the UI's model/dimension settings change
embedder = None
_embedder_config = None

AVAILABLE_MODELS = {
    "Qwen3-Embedding-0.6B": "Qwen/Qwen3-Embedding-0.6B",
    "Semantic-Ar-Qwen-Embed-0.6B": "Omartificial-Intelligence-Space/Semantic-Ar-Qwen-Embed-0.6B"
}
class QwenEmbedder:
    def __init__(self, model_name: str = "Qwen/Qwen3-Embedding-0.6B", embedding_dim: int = 768):
        self.model = SentenceTransformer(model_name)
        self.embedding_dim = embedding_dim
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)
        # Query the model's native output size instead of hardcoding it
        # (Qwen3-Embedding-0.6B does not emit 768-d vectors, so a fixed
        # Linear(768, ...) would fail at runtime)
        base_dim = self.model.get_sentence_embedding_dimension()
        if embedding_dim != base_dim:
            # Project the native dimension to the requested size
            self.projection = torch.nn.Linear(base_dim, embedding_dim)
            self.projection.to(self.device)
        else:
            self.projection = None

    def get_embeddings(self, texts: List[str], with_instruction: bool = False) -> torch.Tensor:
        if with_instruction:
            texts = [f"Represent this Arabic text for retrieval: {text}" for text in texts]
        embeddings = self.model.encode(texts, convert_to_tensor=True)
        if self.projection is not None:
            embeddings = self.projection(embeddings)
        # L2-normalize so dot products equal cosine similarities
        embeddings = F.normalize(embeddings, p=2, dim=1)
        return embeddings
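
# A minimal usage sketch for QwenEmbedder (illustrative only; nothing in the
# app calls this helper, and the 256-d setting is just an example value):
def _example_embedder_usage():
    emb = QwenEmbedder(embedding_dim=256)
    vecs = emb.get_embeddings(["مرحبا بالعالم", "Hello world"])
    print(vecs.shape)        # torch.Size([2, 256]) after the projection layer
    print(vecs.norm(dim=1))  # ~1.0 per row, since outputs are L2-normalized
    return vecs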
def initialize_embedder(embedding_dim=768):
    # Runs inside the GPU worker, where CUDA is actually visible
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Initializing embedder on device: {device}")
    # Create the model with the requested output dimension
    model = QwenEmbedder(embedding_dim=embedding_dim)
    return model
@spaces.GPU
def process_with_embedder(fn_name, *args):
    """Generic handler for embedder operations, run inside the ZeroGPU worker.

    Every UI callback passes (model_choice, embedding_dim) as its last two
    arguments; the shared embedder is rebuilt only when that pair changes,
    so the sidebar's model selection actually takes effect.
    """
    global embedder, _embedder_config
    model_choice, embedding_dim = args[-2], args[-1]
    config = (model_choice, int(embedding_dim or 768))
    if embedder is None or config != _embedder_config:
        model_name = AVAILABLE_MODELS.get(config[0], "Qwen/Qwen3-Embedding-0.6B")
        embedder = QwenEmbedder(model_name=model_name, embedding_dim=config[1])
        _embedder_config = config
    # Map function names to the actual handlers
    fn_map = {
        'compute_similarity': compute_similarity,
        'rerank_documents': rerank_documents,
        'process_batch_embeddings': process_batch_embeddings,
        'process_retrieval': process_retrieval,
        'process_cross_lingual': process_cross_lingual,
        'classify_text': classify_text,
        'cluster_documents': cluster_documents,
        'analyze_sentiment': analyze_sentiment,
        'extract_concepts': extract_concepts
    }
    return fn_map[fn_name](embedder, *args)
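
# Dispatch sketch: every UI callback funnels through the single entry point
# above, passing the handler name first and (model_choice, embedding_dim) last,
# e.g. process_with_embedder('compute_similarity',
#                            "أحب القراءة", "القراءة هوايتي",
#                            "Qwen3-Embedding-0.6B", 768)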
# Device check at import time. On ZeroGPU this prints 'cpu': CUDA only
# becomes visible inside functions decorated with @spaces.GPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
zero = torch.Tensor([0]).to(device)
print(f"Device being used: {zero.device}")
def last_token_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    """Pool each sequence to the hidden state of its final non-padding token."""
    # If every row attends at the last position, the batch is left-padded
    left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
    if left_padding:
        return last_hidden_states[:, -1]
    else:
        sequence_lengths = attention_mask.sum(dim=1) - 1
        batch_size = last_hidden_states.shape[0]
        return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]
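
# Worked sketch for last_token_pool (illustrative): with right padding the
# attention mask ends in zeros, so each sequence is pooled at index
# mask.sum() - 1:
#
#   hidden = torch.arange(12.0).reshape(2, 3, 2)  # batch=2, seq_len=3, dim=2
#   mask = torch.tensor([[1, 1, 0], [1, 1, 1]])   # row 0 has one padding slot
#   last_token_pool(hidden, mask)                 # picks positions 1 and 2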
def get_detailed_instruct(task_description: str, query: str) -> str:
    return f'Instruct: {task_description}\nQuery: {query}'
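
# Example:
#   get_detailed_instruct("Given a web search query, retrieve relevant passages",
#                         "ما هي عواصم الدول العربية؟")
# -> "Instruct: Given a web search query, retrieve relevant passages\nQuery: ما هي عواصم الدول العربية؟"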
def tokenize(tokenizer, input_texts, eod_id, max_length):
    # Reserve two positions so the appended EOD token stays within max_length
    batch_dict = tokenizer(input_texts, padding=False, truncation=True, max_length=max_length - 2)
    for seq, att in zip(batch_dict["input_ids"], batch_dict["attention_mask"]):
        seq.append(eod_id)
        att.append(1)
    batch_dict = tokenizer.pad(batch_dict, padding=True, return_tensors="pt")
    return batch_dict
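
# Sketch of the raw-transformers path these two helpers support (the Gradio
# flow uses SentenceTransformer instead; the EOD token shown is the usual Qwen
# choice but is an assumption here):
#
#   tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-Embedding-0.6B", padding_side="left")
#   batch = tokenize(tokenizer, ["query one", "query two"],
#                    eod_id=tokenizer.convert_tokens_to_ids("<|endoftext|>"),
#                    max_length=512)
#   # batch["input_ids"] / batch["attention_mask"] then feed AutoModel, and
#   # the hidden states are pooled with last_token_pool.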
def compute_similarity(embedder: QwenEmbedder, text1: str, text2: str, model_choice: str = None, embedding_dim: int = None) -> float:
    embeddings = embedder.get_embeddings([text1, text2])
    similarity = torch.cosine_similarity(embeddings[0:1], embeddings[1:2]).item()
    return round(similarity, 3)
def rerank_documents(embedder: QwenEmbedder, query: str, documents: str, model_choice: str = None, embedding_dim: int = None) -> List[Tuple[str, float]]:
    docs_list = [doc.strip() for doc in documents.split('\n') if doc.strip()]
    # Add the retrieval instruction to the query
    task = 'Given a search query, retrieve relevant passages that answer the query'
    query_with_instruct = get_detailed_instruct(task, query)
    # Get embeddings
    query_embedding = embedder.get_embeddings([query_with_instruct])
    doc_embeddings = embedder.get_embeddings(docs_list)
    # Calculate similarities
    scores = (query_embedding @ doc_embeddings.T).squeeze(0)
    results = [(doc, float(score)) for doc, score in zip(docs_list, scores)]
    results.sort(key=lambda x: x[1], reverse=True)
    return [(doc, round(score, 3)) for doc, score in results]
def process_batch_embeddings(embedder: QwenEmbedder, texts: str, model_choice: str = None, embedding_dim: int = None) -> pd.DataFrame:
    text_list = [text.strip() for text in texts.split('\n') if text.strip()]
    if len(text_list) < 1:
        return pd.DataFrame()
    embeddings = embedder.get_embeddings(text_list)
    scores = (embeddings @ embeddings.T).cpu().numpy()
    # Build the pairwise similarity matrix
    df_similarities = pd.DataFrame(
        scores,
        index=text_list,
        columns=text_list
    )
    return df_similarities.round(3)
def process_retrieval(embedder: QwenEmbedder, task_prompt: str, queries: str, documents: str, model_choice: str = None, embedding_dim: int = None) -> pd.DataFrame:
    # Process queries and documents
    query_list = [q.strip() for q in queries.split('\n') if q.strip()]
    doc_list = [d.strip() for d in documents.split('\n') if d.strip()]
    if not query_list or not doc_list:
        return pd.DataFrame()
    # Add the task instruction to each query
    instructed_queries = [get_detailed_instruct(task_prompt, q) for q in query_list]
    # Get embeddings for both queries and documents
    query_embeddings = embedder.get_embeddings(instructed_queries)
    doc_embeddings = embedder.get_embeddings(doc_list)
    # Calculate similarity scores
    scores = (query_embeddings @ doc_embeddings.T).cpu().numpy()
    # One row per query, one column per document
    df = pd.DataFrame(scores, index=query_list, columns=doc_list)
    return df.round(3)
def process_cross_lingual(embedder: QwenEmbedder, arabic_text: str, english_text: str, model_choice: str = None, embedding_dim: int = None) -> float:
    texts = [arabic_text, english_text]
    embeddings = embedder.get_embeddings(texts)
    similarity = torch.cosine_similarity(embeddings[0:1], embeddings[1:2]).item()
    # Return a bare float so it can feed the gr.Number output directly
    return round(similarity, 3)
def classify_text(embedder: QwenEmbedder, text: str, categories: str, model_choice: str = None, embedding_dim: int = None) -> List[Tuple[str, float]]:
    cat_list = [c.strip() for c in categories.split('\n') if c.strip()]
    text_embedding = embedder.get_embeddings([text])
    cat_embeddings = embedder.get_embeddings(cat_list)
    scores = (text_embedding @ cat_embeddings.T).squeeze(0)
    results = [(cat, float(score)) for cat, score in zip(cat_list, scores)]
    results.sort(key=lambda x: x[1], reverse=True)
    return [(cat, round(score, 3)) for cat, score in results]
def cluster_documents(embedder: QwenEmbedder, documents: str, num_clusters: int, model_choice: str = None, embedding_dim: int = None) -> pd.DataFrame:
    from sklearn.cluster import KMeans
    doc_list = [doc.strip() for doc in documents.split('\n') if doc.strip()]
    if len(doc_list) < num_clusters:
        return pd.DataFrame()
    embeddings = embedder.get_embeddings(doc_list)
    # Cluster on CPU copies of the embeddings
    kmeans = KMeans(n_clusters=int(num_clusters), random_state=42)
    clusters = kmeans.fit_predict(embeddings.cpu().numpy())
    # Find the document closest to each cluster centroid
    cluster_centers = kmeans.cluster_centers_
    cluster_center_docs = []
    for i in range(int(num_clusters)):
        cluster_docs = [doc for doc, cluster in zip(doc_list, clusters) if cluster == i]
        cluster_embeddings = embedder.get_embeddings(cluster_docs)
        # Match the embeddings' device and dtype; the raw numpy centroid would
        # otherwise be a float64 CPU tensor and break cosine_similarity on GPU
        center_embedding = torch.tensor(
            cluster_centers[i],
            dtype=cluster_embeddings.dtype,
            device=cluster_embeddings.device,
        ).unsqueeze(0)
        similarities = F.cosine_similarity(cluster_embeddings, center_embedding)
        center_doc = cluster_docs[similarities.argmax().item()]
        cluster_center_docs.append(center_doc)
    # Assemble per-document results
    df = pd.DataFrame({
        'Document': doc_list,
        'Cluster': clusters,
        'Cluster Center Document': [cluster_center_docs[c] for c in clusters]
    })
    return df.sort_values('Cluster')
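
# Standalone sketch (illustrative; the UI reaches this through
# process_with_embedder): four short documents, two clusters:
#
#   emb = QwenEmbedder()
#   docs = "\n".join(["قطتي تحب اللعب", "الكلب يلعب في الحديقة",
#                     "البورصة ارتفعت اليوم", "الأسهم في صعود مستمر"])
#   cluster_documents(emb, docs, num_clusters=2)
#   # -> DataFrame with Document / Cluster / Cluster Center Document columns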
def analyze_sentiment(embedder: QwenEmbedder, text: str, model_choice: str = None, embedding_dim: int = None) -> Tuple[str, dict]:
    # Arabic sentiment anchors, from very positive to very negative
    anchors = {
        "very_positive": "هذا رائع جداً ومدهش! أنا سعيد للغاية",
        "positive": "هذا جيد وممتع",
        "neutral": "هذا عادي ومقبول",
        "negative": "هذا سيء ومزعج",
        "very_negative": "هذا فظيع جداً ومحبط للغاية"
    }
    # Get embeddings
    text_embedding = embedder.get_embeddings([text])
    anchor_embeddings = embedder.get_embeddings(list(anchors.values()))
    # Calculate similarities
    scores = (text_embedding @ anchor_embeddings.T).squeeze(0)
    results = list(zip(anchors.keys(), scores.tolist()))
    results.sort(key=lambda x: x[1], reverse=True)
    # Return (top sentiment, per-anchor scores)
    return (
        results[0][0],
        {k: round(float(v), 3) for k, v in results}
    )
def extract_concepts(embedder: QwenEmbedder, text: str, concept_type: str, model_choice: str = None, embedding_dim: int = None) -> List[Tuple[str, float]]:
    # Anchor phrases for each concept type (Arabic)
    concept_anchors = {
        "emotions": [
            "الفرح والسعادة",
            "الحزن والأسى",
            "الغضب والإحباط",
            "الخوف والقلق",
            "الحب والعاطفة",
            "الأمل والتفاؤل"
        ],
        "topics": [
            "السياسة والحكم",
            "الاقتصاد والمال",
            "العلوم والتكنولوجيا",
            "الفن والثقافة",
            "الرياضة والترفيه",
            "التعليم والمعرفة"
        ],
        "themes": [
            "العدالة والمساواة",
            "التقدم والتطور",
            "التقاليد والتراث",
            "الحرية والاستقلال",
            "التعاون والوحدة",
            "الإبداع والابتكار"
        ]
    }
    anchors = concept_anchors.get(concept_type, concept_anchors["topics"])
    # Get embeddings
    text_embedding = embedder.get_embeddings([text])
    anchor_embeddings = embedder.get_embeddings(anchors)
    # Calculate similarities
    scores = (text_embedding @ anchor_embeddings.T).squeeze(0)
    results = [(anchor, float(score)) for anchor, score in zip(anchors, scores)]
    results.sort(key=lambda x: x[1], reverse=True)
    return [(concept, round(score, 3)) for concept, score in results]
def create_embedder(model_choice: str, embedding_dim: int = 768) -> QwenEmbedder:
    model_name = AVAILABLE_MODELS[model_choice]
    return QwenEmbedder(model_name=model_name, embedding_dim=embedding_dim)
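
# The helpers below are standalone, per-request variants: each call builds a
# fresh embedder via create_embedder, which is simple but reloads the model
# weights every time. The UI instead goes through the cached-global path in
# process_with_embedder. Example direct call (hypothetical values):
#
#   emb = create_embedder("Semantic-Ar-Qwen-Embed-0.6B", embedding_dim=512)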
def process_similarity(text1: str, text2: str, model_choice: str, embedding_dim: int) -> float:
    embedder = create_embedder(model_choice, embedding_dim)
    embeddings = embedder.get_embeddings([text1, text2])
    similarity = torch.nn.functional.cosine_similarity(embeddings[0].unsqueeze(0), embeddings[1].unsqueeze(0))
    return float(similarity)
def process_reranking(query: str, documents: str, model_choice: str, embedding_dim: int) -> Dict:
    embedder = create_embedder(model_choice, embedding_dim)
    documents = [doc.strip() for doc in documents.split('\n') if doc.strip()]
    query_embedding = embedder.get_embeddings([query], with_instruction=True)
    doc_embeddings = embedder.get_embeddings(documents)
    similarities = torch.nn.functional.cosine_similarity(query_embedding, doc_embeddings)
    # Sort documents by similarity
    sorted_indices = torch.argsort(similarities, descending=True)
    results = []
    for idx in sorted_indices:
        results.append({
            'document': documents[idx],
            'score': float(similarities[idx])
        })
    return {'results': results}
def process_batch(texts: str, model_choice: str, embedding_dim: int) -> Dict:
    embedder = create_embedder(model_choice, embedding_dim)
    texts = [text.strip() for text in texts.split('\n') if text.strip()]
    embeddings = embedder.get_embeddings(texts)
    similarity_matrix = torch.nn.functional.cosine_similarity(embeddings.unsqueeze(1), embeddings.unsqueeze(0), dim=2)
    df = pd.DataFrame(similarity_matrix.cpu().numpy(), index=texts, columns=texts)
    return {'similarity_matrix': df.to_dict()}
# Renamed from process_retrieval: the original name shadowed the matrix-based
# process_retrieval handler above, which broke the Multi-Query Retrieval tab's
# dispatch through fn_map.
def process_retrieval_standalone(prompt: str, queries: str, documents: str, model_choice: str, embedding_dim: int) -> Dict:
    embedder = create_embedder(model_choice, embedding_dim)
    # Process input strings
    queries = [q.strip() for q in queries.split('\n') if q.strip()]
    documents = [doc.strip() for doc in documents.split('\n') if doc.strip()]
    # Get embeddings
    prompt_embedding = embedder.get_embeddings([prompt], with_instruction=True)
    query_embeddings = embedder.get_embeddings(queries, with_instruction=True)
    doc_embeddings = embedder.get_embeddings(documents)
    # Calculate similarities against the prompt
    query_similarities = torch.nn.functional.cosine_similarity(prompt_embedding, query_embeddings)
    doc_similarities = torch.nn.functional.cosine_similarity(prompt_embedding.repeat(len(documents), 1), doc_embeddings)
    # Collect ranked results
    results = {
        'relevant_queries': [],
        'relevant_documents': []
    }
    # Sort queries
    query_indices = torch.argsort(query_similarities, descending=True)
    for idx in query_indices:
        results['relevant_queries'].append({
            'query': queries[idx],
            'similarity': float(query_similarities[idx])
        })
    # Sort documents
    doc_indices = torch.argsort(doc_similarities, descending=True)
    for idx in doc_indices:
        results['relevant_documents'].append({
            'document': documents[idx],
            'similarity': float(doc_similarities[idx])
        })
    return results
# Custom CSS: layout, sidebar, feature cards, and shared widget styles
custom_css = """
:root {
    --primary-color: #2196F3;
    --secondary-color: #1976D2;
    --background-color: #f8f9fa;
    --sidebar-bg: #ffffff;
    --text-color: #333333;
    --border-color: #e0e0e0;
}
.container {
    max-width: 1200px;
    margin: auto;
    padding: 20px;
}
.sidebar {
    background-color: var(--sidebar-bg);
    border-right: 1px solid var(--border-color);
    padding: 20px;
    margin-right: 20px;
    position: sticky;
    top: 0;
    height: 100vh;
    overflow-y: auto;
}
.main-content {
    background-color: var(--background-color);
    padding: 20px;
    border-radius: 10px;
}
.features-grid {
    display: grid;
    grid-template-columns: repeat(auto-fit, minmax(150px, 1fr));
    gap: 15px;
    margin: 15px 0;
}
.feature-card {
    background: white;
    padding: 15px;
    border-radius: 6px;
    box-shadow: 0 1px 3px rgba(0,0,0,0.1);
    transition: all 0.3s ease;
    border: 1px solid var(--border-color);
    text-align: center;
}
.feature-card:hover {
    transform: translateY(-3px);
    box-shadow: 0 3px 6px rgba(0,0,0,0.15);
    border-color: var(--primary-color);
}
.feature-icon {
    font-size: 24px;
    margin-bottom: 10px;
    color: var(--primary-color);
}
.feature-card h3 {
    color: var(--text-color);
    margin: 8px 0;
    font-size: 0.95em;
    font-weight: 600;
}
.feature-card p {
    color: #666;
    font-size: 0.8em;
    line-height: 1.3;
    margin: 5px 0;
}
.features-summary {
    margin: 40px 0;
    padding: 30px;
    background: white;
    border-radius: 12px;
    box-shadow: 0 2px 8px rgba(0,0,0,0.1);
}
.features-summary h2 {
    color: var(--text-color);
    margin-bottom: 25px;
    text-align: center;
    font-size: 1.5em;
}
.feature-list {
    display: grid;
    grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
    gap: 30px;
}
.feature-group {
    padding: 20px;
    background: var(--background-color);
    border-radius: 8px;
    border: 1px solid var(--border-color);
}
.feature-group h3 {
    color: var(--primary-color);
    margin-bottom: 15px;
    font-size: 1.2em;
}
.feature-group ul {
    list-style: none;
    padding: 0;
    margin: 0;
}
.feature-group li {
    padding: 8px 0;
    color: var(--text-color);
    position: relative;
    padding-left: 20px;
}
.feature-group li:before {
    content: "•";
    color: var(--primary-color);
    position: absolute;
    left: 0;
}
.description {
    margin: 20px 0;
    padding: 15px;
    border-radius: 8px;
    background-color: #ffffff;
    box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
.example {
    margin: 10px 0;
    padding: 15px;
    border-left: 4px solid var(--primary-color);
    background-color: #ffffff;
    box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
.warning {
    color: #721c24;
    background-color: #f8d7da;
    border: 1px solid #f5c6cb;
    padding: 15px;
    border-radius: 8px;
    margin: 10px 0;
}
.settings {
    background-color: #ffffff;
    padding: 20px;
    border-radius: 8px;
    box-shadow: 0 2px 4px rgba(0,0,0,0.1);
    margin: 20px 0;
}
.tab-content {
    padding: 20px;
    background-color: #ffffff;
    border-radius: 8px;
    box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
.heading {
    color: var(--text-color);
    margin-bottom: 20px;
    padding-bottom: 10px;
    border-bottom: 2px solid var(--primary-color);
}
button.primary {
    background-color: var(--primary-color) !important;
}
button.secondary {
    background-color: var(--secondary-color) !important;
}
"""
# Create the Gradio interface
def create_demo():
    demo = gr.Blocks(title="Advanced Text Processing with Qwen", css=custom_css, theme=gr.themes.Soft())
    with demo:
        with gr.Row():
            # Sidebar
            with gr.Column(scale=1, elem_classes="sidebar"):
                gr.Markdown("""
                # Qwen Embeddings
                ### Navigation
                - [Configuration](#configuration)
                - [Features](#features)
                - [Documentation](#documentation)
                """)
                with gr.Accordion("Configuration", open=True):
                    gr.Markdown("""
                    ### Model Settings
                    Configure the embedding model parameters below.
                    """)
                    model_choice = gr.Dropdown(
                        choices=list(AVAILABLE_MODELS.keys()),
                        value=list(AVAILABLE_MODELS.keys())[0],
                        label="Select Model"
                    )
                    embedding_dim = gr.Slider(
                        minimum=32,
                        maximum=1024,
                        value=768,
                        step=32,
                        label="Embedding Dimension",
                        elem_classes="settings"
                    )
                    update_dim_btn = gr.Button("Update Dimension", variant="secondary")
                    dim_status = gr.Textbox(label="Status", interactive=False)
                with gr.Accordion("Documentation", open=False):
                    gr.Markdown("""
                    ### Usage Guide
                    1. **Embedding Dimension**
                       - 32-128: Fast, simple tasks
                       - 256-512: Balanced performance
                       - 768: Default, full model
                       - 1024: Maximum detail
                    2. **Best Practices**
                       - Use appropriate dimensions for your task
                       - Consider batch size for multiple documents
                       - Test different settings for optimal results
                    """)
            # Main Content
            with gr.Column(scale=4):
                gr.Markdown("""
                # Advanced Text Processing Suite
                Welcome to the Advanced Text Processing Suite powered by Qwen Embeddings.
                This tool provides state-of-the-art text analysis capabilities with support for Arabic and multiple other languages.
                """)
                # Feature Grid
                gr.HTML("""
                <div class="features-grid">
                    <div class="feature-card">
                        <div class="feature-icon">🔄</div>
                        <h3>Text Similarity</h3>
                        <p>Compare text meanings</p>
                    </div>
                    <div class="feature-card">
                        <div class="feature-icon">🔍</div>
                        <h3>Semantic Search</h3>
                        <p>Find relevant docs</p>
                    </div>
                    <div class="feature-card">
                        <div class="feature-icon">📊</div>
                        <h3>Batch Analysis</h3>
                        <p>Process multiple texts</p>
                    </div>
                    <div class="feature-card">
                        <div class="feature-icon">🎯</div>
                        <h3>Multi-Query</h3>
                        <p>Advanced retrieval</p>
                    </div>
                    <div class="feature-card">
                        <div class="feature-icon">🌐</div>
                        <h3>Cross-Lingual</h3>
                        <p>Cross-language match</p>
                    </div>
                    <div class="feature-card">
                        <div class="feature-icon">🏷️</div>
                        <h3>Classification</h3>
                        <p>Categorize texts</p>
                    </div>
                    <div class="feature-card">
                        <div class="feature-icon">🔮</div>
                        <h3>Clustering</h3>
                        <p>Group documents</p>
                    </div>
                    <div class="feature-card">
                        <div class="feature-icon">😊</div>
                        <h3>Sentiment</h3>
                        <p>Analyze emotions</p>
                    </div>
                    <div class="feature-card">
                        <div class="feature-icon">🎨</div>
                        <h3>Concepts</h3>
                        <p>Extract themes</p>
                    </div>
                </div>
                """)
                with gr.Tabs() as tabs:
                    # Text Similarity Tab
                    with gr.Tab("Text Similarity Analysis"):
                        with gr.Column(elem_classes="tab-content"):
                            gr.Markdown("""
                            ### Text Similarity Analysis
                            Compare the semantic similarity between two texts. The score ranges from 0 (completely different) to 1 (identical meaning).
                            <div class="example">
                            <strong>Try these Arabic examples:</strong><br>
                            • "أحب القراءة كثيراً" and "القراءة من أحب هواياتي"<br>
                            • "السماء صافية اليوم" and "الطقس حار جداً"
                            </div>
                            """)
                            with gr.Row():
                                text1 = gr.Textbox(
                                    label="First Text",
                                    lines=3,
                                    placeholder="Enter first text here...",
                                    value="أحب القراءة كثيراً"
                                )
                                text2 = gr.Textbox(
                                    label="Second Text",
                                    lines=3,
                                    placeholder="Enter second text here...",
                                    value="القراءة من أحب هواياتي"
                                )
                            similarity_btn = gr.Button("Calculate Similarity", variant="primary")
                            similarity_score = gr.Number(label="Similarity Score")
                            similarity_btn.click(
                                fn=lambda t1, t2, m, d: process_with_embedder('compute_similarity', t1, t2, m, d),
                                inputs=[text1, text2, model_choice, embedding_dim],
                                outputs=similarity_score
                            )
                    # Document Reranking Tab
                    with gr.Tab("Semantic Search & Reranking"):
                        with gr.Column(elem_classes="tab-content"):
                            gr.Markdown("""
                            ### Semantic Search & Document Reranking
                            Search through a collection of documents and rank them by semantic relevance to your query.
                            <div class="example">
                            <strong>Try these Arabic queries:</strong><br>
                            • "ما هي عواصم الدول العربية؟"<br>
                            • "أين تقع أكبر المدن العربية؟"<br>
                            • "ما هي المراكز الثقافية العربية؟"
                            </div>
                            """)
                            query_text = gr.Textbox(
                                label="Search Query",
                                placeholder="Enter your search query...",
                                value="ما هي عواصم الدول العربية؟"
                            )
                            documents_text = gr.Textbox(
                                label="Documents Collection (one per line)",
                                lines=10,
                                placeholder="Enter documents here, one per line...",
                                value="""القاهرة هي عاصمة جمهورية مصر العربية وأكبر مدنها.
الرياض هي عاصمة المملكة العربية السعودية ومركزها الاقتصادي.
دمشق هي أقدم عاصمة مأهولة في التاريخ وهي عاصمة سوريا.
بغداد عاصمة العراق وتقع على نهر دجلة.
الدار البيضاء أكبر مدن المغرب وعاصمته الاقتصادية.
تونس هي عاصمة الجمهورية التونسية ومركزها الثقافي."""
                            )
                            rerank_btn = gr.Button("Search & Rank", variant="primary")
                            rerank_results = gr.Dataframe(
                                headers=["Document", "Relevance Score"],
                                label="Search Results"
                            )
                            rerank_btn.click(
                                fn=lambda q, d, m, e: process_with_embedder('rerank_documents', q, d, m, e),
                                inputs=[query_text, documents_text, model_choice, embedding_dim],
                                outputs=rerank_results
                            )
                    # Batch Analysis Tab
                    with gr.Tab("Batch Similarity Analysis"):
                        with gr.Column(elem_classes="tab-content"):
                            gr.Markdown("""
                            ### Batch Similarity Analysis
                            Analyze semantic relationships between multiple texts simultaneously.
                            <div class="example">
                            <strong>The example shows Arabic proverbs about friendship:</strong><br>
                            See how the model captures the semantic relationships between similar themes.
                            </div>
                            """)
                            batch_texts = gr.Textbox(
                                label="Input Texts (one per line)",
                                lines=10,
                                placeholder="Enter texts here, one per line...",
                                value="""الصديق وقت الضيق.
الصديق الحقيقي يظهر عند الشدائد.
عند المحن تعرف إخوانك.
وقت الشدة بتعرف صحابك.
الصاحب ساحب."""
                            )
                            process_btn = gr.Button("Analyze Relationships", variant="primary")
                            similarity_matrix = gr.Dataframe(
                                label="Similarity Matrix",
                                wrap=True
                            )
                            process_btn.click(
                                fn=lambda t, m, e: process_with_embedder('process_batch_embeddings', t, m, e),
                                inputs=[batch_texts, model_choice, embedding_dim],
                                outputs=[similarity_matrix]
                            )
                    # Multi-Query Retrieval Tab
                    with gr.Tab("Multi-Query Retrieval"):
                        with gr.Column(elem_classes="tab-content"):
                            gr.Markdown("""
                            ### Multi-Query Document Retrieval
                            Match multiple queries against multiple documents simultaneously using semantic search.
                            <div class="description">
                            This tab implements the retrieval logic from the Qwen example, allowing you to:
                            - Define a custom task prompt
                            - Input multiple queries
                            - Input multiple documents
                            - See all query-document match scores in a matrix
                            </div>
                            <div class="example">
                            <strong>Try these examples:</strong><br>
                            <strong>Task prompt:</strong> "Given a web search query, retrieve relevant passages that answer the query"<br>
                            <strong>Queries:</strong><br>
                            • "ما هي أكبر المدن العربية؟"<br>
                            • "أين تقع أهم المراكز الثقافية؟"<br>
                            <strong>Documents:</strong> Use the example documents or add your own
                            </div>
                            """)
                            task_prompt = gr.Textbox(
                                label="Task Prompt",
                                placeholder="Enter the task description here...",
                                value="Given a web search query, retrieve relevant passages that answer the query",
                                lines=2
                            )
                            with gr.Row():
                                queries_text = gr.Textbox(
                                    label="Queries (one per line)",
                                    placeholder="Enter your queries here, one per line...",
                                    value="""ما هي أكبر المدن العربية؟
أين تقع أهم المراكز الثقافية؟""",
                                    lines=5
                                )
                                documents_text = gr.Textbox(
                                    label="Documents (one per line)",
                                    placeholder="Enter your documents here, one per line...",
                                    value="""القاهرة هي أكبر مدينة عربية وعاصمة مصر، وتضم العديد من المعالم الثقافية والتاريخية.
الرياض عاصمة المملكة العربية السعودية ومركز ثقافي واقتصادي مهم.
دبي مدينة عالمية في الإمارات العربية المتحدة ومركز تجاري رئيسي.
بيروت عاصمة لبنان ومركز ثقافي مهم في العالم العربي.""",
                                    lines=5
                                )
                            retrieve_btn = gr.Button("Process Retrieval", variant="primary")
                            retrieval_matrix = gr.Dataframe(
                                label="Query-Document Relevance Matrix",
                                wrap=True
                            )
                            gr.Markdown("""
                            <div class="description">
                            <strong>How to read the results:</strong>
                            - Each row represents a query
                            - Each column represents a document
                            - Values show the relevance score (typically 0-1) between each query-document pair
                            - Higher scores indicate better matches
                            </div>
                            """)
                            retrieve_btn.click(
                                fn=lambda p, q, d, m, e: process_with_embedder('process_retrieval', p, q, d, m, e),
                                inputs=[task_prompt, queries_text, documents_text, model_choice, embedding_dim],
                                outputs=[retrieval_matrix]
                            )
                    # Cross-Lingual Matching Tab
                    with gr.Tab("Cross-Lingual Matching"):
                        with gr.Column(elem_classes="tab-content"):
                            gr.Markdown("""
                            ### Cross-Lingual Semantic Matching
                            Compare the meaning of texts across Arabic and English.
                            <div class="description">
                            This feature demonstrates the model's ability to understand semantic similarity across different languages.
                            Try comparing similar concepts expressed in Arabic and English to see how well the model captures cross-lingual meaning.
                            </div>
                            <div class="example">
                            <strong>Try these examples:</strong><br>
                            <strong>Arabic:</strong> "القراءة غذاء العقل والروح"<br>
                            <strong>English:</strong> "Reading nourishes the mind and soul"<br>
                            Or try your own pairs of semantically similar texts in both languages.
                            </div>
                            """)
                            with gr.Row():
                                arabic_text = gr.Textbox(
                                    label="Arabic Text",
                                    placeholder="Enter Arabic text here...",
                                    value="القراءة غذاء العقل والروح",
                                    lines=3
                                )
                                english_text = gr.Textbox(
                                    label="English Text",
                                    placeholder="Enter English text here...",
                                    value="Reading nourishes the mind and soul",
                                    lines=3
                                )
                            match_btn = gr.Button("Compare Texts", variant="primary")
                            with gr.Row():
                                cross_lingual_score = gr.Number(
                                    label="Cross-Lingual Similarity Score",
                                    value=None
                                )
                                gr.Markdown("""
                                <div class="description">
                                <strong>Understanding the score:</strong>
                                - Score ranges from 0 (completely different meaning) to 1 (same meaning)
                                - Scores above 0.7 usually indicate strong semantic similarity
                                - The model considers the meaning, not just word-for-word translation
                                </div>
                                """)
                            match_btn.click(
                                fn=lambda a, e, m, d: process_with_embedder('process_cross_lingual', a, e, m, d),
                                inputs=[arabic_text, english_text, model_choice, embedding_dim],
                                outputs=[cross_lingual_score]
                            )
                    # Text Classification Tab
                    with gr.Tab("Text Classification"):
                        with gr.Column(elem_classes="tab-content"):
                            gr.Markdown("""
                            ### Text Classification
                            Classify text into predefined categories using semantic similarity.
                            <div class="description">
                            The model will compare your text against each category and rank them by relevance.
                            You can define your own categories or use the provided examples.
                            </div>
                            """)
                            input_text = gr.Textbox(
                                label="Input Text",
                                placeholder="Enter the text to classify...",
                                value="الذكاء الاصطناعي يغير طريقة عملنا وتفكيرنا في المستقبل",
                                lines=3
                            )
                            categories_text = gr.Textbox(
                                label="Categories (one per line)",
                                placeholder="Enter categories here...",
                                value="""التكنولوجيا والابتكار
الاقتصاد والأعمال
التعليم والتدريب
الثقافة والفنون
الصحة والطب""",
                                lines=5
                            )
                            classify_btn = gr.Button("Classify Text", variant="primary")
                            classification_results = gr.Dataframe(
                                headers=["Category", "Relevance Score"],
                                label="Classification Results"
                            )
                            classify_btn.click(
                                fn=lambda t, c, m, e: process_with_embedder('classify_text', t, c, m, e),
                                inputs=[input_text, categories_text, model_choice, embedding_dim],
                                outputs=classification_results
                            )
                    # Document Clustering Tab
                    with gr.Tab("Document Clustering"):
                        with gr.Column(elem_classes="tab-content"):
                            gr.Markdown("""
                            ### Document Clustering
                            Group similar documents together using semantic clustering.
                            <div class="description">
                            This feature will:
                            - Group similar documents into clusters
                            - Identify the most representative document for each cluster
                            - Help discover themes and patterns in your document collection
                            </div>
                            """)
                            cluster_docs = gr.Textbox(
                                label="Documents (one per line)",
                                placeholder="Enter documents to cluster...",
                                value="""الذكاء الاصطناعي يفتح آفاقاً جديدة في مجال الطب.
الروبوتات تساعد الأطباء في إجراء العمليات الجراحية.
التعلم الآلي يحسن من دقة التشخيص الطبي.
الفن يعبر عن مشاعر الإنسان وأحاسيسه.
الموسيقى لغة عالمية تتخطى حدود الثقافات.
الرسم والنحت من أقدم أشكال التعبير الفني.
التجارة الإلكترونية تغير نمط التسوق التقليدي.
التسوق عبر الإنترنت يوفر الوقت والجهد.
المتاجر الرقمية تتيح خيارات أوسع للمستهلكين.""",
                                lines=10
                            )
                            num_clusters = gr.Slider(
                                minimum=2,
                                maximum=10,
                                value=3,
                                step=1,
                                label="Number of Clusters"
                            )
                            cluster_btn = gr.Button("Cluster Documents", variant="primary")
                            clustering_results = gr.Dataframe(
                                label="Clustering Results"
                            )
                            cluster_btn.click(
                                fn=lambda d, n, m, e: process_with_embedder('cluster_documents', d, n, m, e),
                                inputs=[cluster_docs, num_clusters, model_choice, embedding_dim],
                                outputs=clustering_results
                            )
                    # Sentiment Analysis Tab
                    with gr.Tab("Sentiment Analysis"):
                        with gr.Column(elem_classes="tab-content"):
                            gr.Markdown("""
                            ### Arabic Sentiment Analysis
                            Analyze the sentiment of Arabic text using semantic similarity to sentiment anchors.
                            <div class="description">
                            The model will compare your text against predefined sentiment anchors and determine:
                            - The overall sentiment
                            - Confidence scores for each sentiment level
                            </div>
                            """)
                            sentiment_text = gr.Textbox(
                                label="Text to Analyze",
                                placeholder="Enter text to analyze sentiment...",
                                value="هذا المشروع رائع جداً وسيحدث تغييراً إيجابياً في حياة الكثيرين",
                                lines=3
                            )
                            analyze_btn = gr.Button("Analyze Sentiment", variant="primary")
                            with gr.Row():
                                sentiment_label = gr.Label(label="Overall Sentiment")
                                sentiment_scores = gr.Json(label="Detailed Scores")
                            analyze_btn.click(
                                fn=lambda t, m, e: process_with_embedder('analyze_sentiment', t, m, e),
                                inputs=[sentiment_text, model_choice, embedding_dim],
                                outputs=[sentiment_label, sentiment_scores]
                            )
                    # Concept Extraction Tab
                    with gr.Tab("Concept Extraction"):
                        with gr.Column(elem_classes="tab-content"):
                            gr.Markdown("""
                            ### Concept Extraction
                            Extract key concepts and themes from Arabic text.
                            <div class="description">
                            Analyze text to identify:
                            - Emotional content
                            - Main topics
                            - Underlying themes
                            </div>
                            """)
                            concept_text = gr.Textbox(
                                label="Text to Analyze",
                                placeholder="Enter text to analyze...",
                                value="نحن نؤمن بأهمية التعليم والابتكار لبناء مستقبل أفضل لأجيالنا القادمة",
                                lines=3
                            )
                            concept_type = gr.Radio(
                                choices=["emotions", "topics", "themes"],
                                value="themes",
                                label="Concept Type"
                            )
                            extract_btn = gr.Button("Extract Concepts", variant="primary")
                            concept_results = gr.Dataframe(
                                headers=["Concept", "Relevance Score"],
                                label="Extracted Concepts"
                            )
                            extract_btn.click(
                                fn=lambda t, c, m, e: process_with_embedder('extract_concepts', t, c, m, e),
                                inputs=[concept_text, concept_type, model_choice, embedding_dim],
                                outputs=concept_results
                            )
        # Dimension update handler for the sidebar button
        def update_embedder_dim(dim):
            global embedder, _embedder_config
            try:
                # Sliders can deliver floats; Linear layers need an int size
                dim = int(dim)
                embedder = initialize_embedder(embedding_dim=dim)
                _embedder_config = (list(AVAILABLE_MODELS.keys())[0], dim)
                return f"Successfully updated embedding dimension to {dim}"
            except Exception as e:
                return f"Error updating dimension: {str(e)}"

        update_dim_btn.click(
            fn=update_embedder_dim,
            inputs=[embedding_dim],
            outputs=dim_status
        )
    return demo
if __name__ == "__main__":
    demo = create_demo()
    demo.queue()
    demo.launch()