import gradio as gr
import torch
import torch.nn.functional as F
import numpy as np
import plotly.express as px
import pandas as pd
import spaces
from typing import List, Tuple, Dict
from torch import Tensor
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
import json
# Global embedder handle; the model itself is created lazily inside the GPU worker
embedder = None
AVAILABLE_MODELS = {
"Qwen3-Embedding-0.6B": "Qwen/Qwen3-Embedding-0.6B",
"Semantic-Ar-Qwen-Embed-0.6B": "Omartificial-Intelligence-Space/Semantic-Ar-Qwen-Embed-0.6B"
}
class QwenEmbedder:
def __init__(self, model_name: str = "Qwen/Qwen3-Embedding-0.6B", embedding_dim: int = 768):
self.model = SentenceTransformer(model_name)
self.embedding_dim = embedding_dim
self.device = "cuda" if torch.cuda.is_available() else "cpu"
self.model.to(self.device)
        # Use the checkpoint's native output width; hardcoding 768 breaks models
        # such as Qwen3-Embedding-0.6B, whose native embedding size is 1024.
        native_dim = self.model.get_sentence_embedding_dimension()
        if embedding_dim != native_dim:
            # Randomly initialized projection head mapping native_dim -> embedding_dim
            self.projection = torch.nn.Linear(native_dim, embedding_dim)
            self.projection.to(self.device)
        else:
            self.projection = None
def get_embeddings(self, texts: List[str], with_instruction: bool = False) -> torch.Tensor:
if with_instruction:
texts = [f"Represent this Arabic text for retrieval: {text}" for text in texts]
        # Run without autograd so downstream .numpy()/.item() calls work even
        # when the projection head is attached.
        with torch.no_grad():
            embeddings = self.model.encode(texts, convert_to_tensor=True)
            if self.projection is not None:
                embeddings = self.projection(embeddings)
            # Normalize so that dot products equal cosine similarities
            embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
        return embeddings
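# Illustrative usage sketch (not executed here): embed two Arabic sentences and
# compare them with the same wrapper the app uses. Assumes the default
# Qwen/Qwen3-Embedding-0.6B checkpoint can be downloaded from the Hub.
#
#     embedder = QwenEmbedder()
#     vecs = embedder.get_embeddings(["أحب القراءة كثيراً", "القراءة من أحب هواياتي"])
#     score = torch.cosine_similarity(vecs[0:1], vecs[1:2]).item()
#     print(round(score, 3))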
@spaces.GPU(duration=120)
def initialize_embedder(embedding_dim=768):
# Initialize device inside the GPU worker
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Initializing embedder on device: {device}")
# Create model with specified dimension
model = QwenEmbedder(embedding_dim=embedding_dim)
return model
@spaces.GPU(duration=120)
def process_with_embedder(fn_name, *args):
"""Generic handler for embedder operations"""
global embedder
if embedder is None:
embedder = initialize_embedder()
# Map function names to actual functions
fn_map = {
'compute_similarity': compute_similarity,
'rerank_documents': rerank_documents,
'process_batch_embeddings': process_batch_embeddings,
'process_retrieval': process_retrieval,
'process_cross_lingual': process_cross_lingual,
'classify_text': classify_text,
'cluster_documents': cluster_documents,
'analyze_sentiment': analyze_sentiment,
'extract_concepts': extract_concepts
}
return fn_map[fn_name](embedder, *args)
# Check for GPU support and configure appropriately
device = "cuda" if torch.cuda.is_available() else "cpu"
zero = torch.Tensor([0]).to(device)
print(f"Device being used: {zero.device}")
def last_token_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
if left_padding:
return last_hidden_states[:, -1]
else:
sequence_lengths = attention_mask.sum(dim=1) - 1
batch_size = last_hidden_states.shape[0]
return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]
def get_detailed_instruct(task_description: str, query: str) -> str:
return f'Instruct: {task_description}\nQuery: {query}'
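# Example: get_detailed_instruct("Given a web search query, retrieve relevant passages that answer the query",
#                                "ما هي عواصم الدول العربية؟")
# returns "Instruct: Given a web search query, retrieve relevant passages that answer the query\nQuery: ما هي عواصم الدول العربية؟"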
def tokenize(tokenizer, input_texts, eod_id, max_length):
batch_dict = tokenizer(input_texts, padding=False, truncation=True, max_length=max_length-2)
for seq, att in zip(batch_dict["input_ids"], batch_dict["attention_mask"]):
seq.append(eod_id)
att.append(1)
batch_dict = tokenizer.pad(batch_dict, padding=True, return_tensors="pt")
return batch_dict
def compute_similarity(embedder: QwenEmbedder, text1: str, text2: str, model_choice: str = None, embedding_dim: int = None) -> float:
embeddings = embedder.get_embeddings([text1, text2])
similarity = torch.cosine_similarity(embeddings[0:1], embeddings[1:2]).item()
return round(similarity, 3)
def rerank_documents(embedder: QwenEmbedder, query: str, documents: str, model_choice: str = None, embedding_dim: int = None) -> List[Tuple[str, float]]:
docs_list = [doc.strip() for doc in documents.split('\n') if doc.strip()]
# Add instruction to query
task = 'Given a search query, retrieve relevant passages that answer the query'
query_with_instruct = get_detailed_instruct(task, query)
# Get embeddings
query_embedding = embedder.get_embeddings([query_with_instruct])
doc_embeddings = embedder.get_embeddings(docs_list)
# Calculate similarities
scores = (query_embedding @ doc_embeddings.T).squeeze(0)
results = [(doc, float(score)) for doc, score in zip(docs_list, scores)]
results.sort(key=lambda x: x[1], reverse=True)
return [(doc, round(score, 3)) for doc, score in results]
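# Because get_embeddings returns L2-normalized vectors, the query @ doc.T dot
# products above are cosine similarities in [-1, 1]; higher means more relevant.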
def process_batch_embeddings(embedder: QwenEmbedder, texts: str, model_choice: str = None, embedding_dim: int = None) -> pd.DataFrame:
text_list = [text.strip() for text in texts.split('\n') if text.strip()]
if len(text_list) < 1:
return pd.DataFrame()
embeddings = embedder.get_embeddings(text_list)
scores = (embeddings @ embeddings.T).cpu().numpy()
# Create similarity matrix DataFrame
df_similarities = pd.DataFrame(
scores,
index=text_list,
columns=text_list
)
return df_similarities.round(3)
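# process_retrieval below is the variant wired to the Multi-Query Retrieval tab
# (via fn_map in process_with_embedder); it returns a query x document
# cosine-score matrix as a pandas DataFrame.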
def process_retrieval(embedder: QwenEmbedder, task_prompt: str, queries: str, documents: str, model_choice: str = None, embedding_dim: int = None) -> pd.DataFrame:
# Process queries and documents
query_list = [q.strip() for q in queries.split('\n') if q.strip()]
doc_list = [d.strip() for d in documents.split('\n') if d.strip()]
if not query_list or not doc_list:
return pd.DataFrame()
# Add instruction to queries
instructed_queries = [get_detailed_instruct(task_prompt, q) for q in query_list]
# Get embeddings for both queries and documents
query_embeddings = embedder.get_embeddings(instructed_queries)
doc_embeddings = embedder.get_embeddings(doc_list)
# Calculate similarity scores
scores = (query_embeddings @ doc_embeddings.T).cpu().numpy()
# Create DataFrame with results
df = pd.DataFrame(scores, index=query_list, columns=doc_list)
return df.round(3)
def process_cross_lingual(embedder: QwenEmbedder, arabic_text: str, english_text: str, model_choice: str = None, embedding_dim: int = None) -> float:
    texts = [arabic_text, english_text]
    embeddings = embedder.get_embeddings(texts)
    similarity = torch.cosine_similarity(embeddings[0:1], embeddings[1:2]).item()
    # Return a plain float so the gr.Number output in the UI can render it
    return round(similarity, 3)
def classify_text(embedder: QwenEmbedder, text: str, categories: str, model_choice: str = None, embedding_dim: int = None) -> List[Tuple[str, float]]:
cat_list = [c.strip() for c in categories.split('\n') if c.strip()]
text_embedding = embedder.get_embeddings([text])
cat_embeddings = embedder.get_embeddings(cat_list)
scores = (text_embedding @ cat_embeddings.T).squeeze(0)
results = [(cat, float(score)) for cat, score in zip(cat_list, scores)]
results.sort(key=lambda x: x[1], reverse=True)
return [(cat, round(score, 3)) for cat, score in results]
def cluster_documents(embedder: QwenEmbedder, documents: str, num_clusters: int, model_choice: str = None, embedding_dim: int = None) -> pd.DataFrame:
    from sklearn.cluster import KMeans
    doc_list = [doc.strip() for doc in documents.split('\n') if doc.strip()]
    num_clusters = int(num_clusters)  # UI sliders may deliver floats; KMeans needs an int
    if len(doc_list) < num_clusters:
        return pd.DataFrame()
    embeddings = embedder.get_embeddings(doc_list)
    # Cluster the L2-normalized embeddings; with unit vectors, Euclidean k-means
    # behaves much like clustering by cosine similarity.
    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    clusters = kmeans.fit_predict(embeddings.cpu().numpy())
    # Pick the document closest to each cluster center as its representative
    cluster_centers = kmeans.cluster_centers_
    cluster_center_docs = []
    for i in range(num_clusters):
        cluster_docs = [doc for doc, cluster in zip(doc_list, clusters) if cluster == i]
        cluster_embeddings = embedder.get_embeddings(cluster_docs)
        # Keep the center on the same device/dtype as the embeddings before comparing
        center_embedding = torch.tensor(
            cluster_centers[i],
            device=cluster_embeddings.device,
            dtype=cluster_embeddings.dtype,
        ).unsqueeze(0)
        similarities = F.cosine_similarity(cluster_embeddings, center_embedding)
        center_doc = cluster_docs[similarities.argmax().item()]
        cluster_center_docs.append(center_doc)
# Create results DataFrame
df = pd.DataFrame({
'Document': doc_list,
'Cluster': clusters,
'Cluster Center Document': [cluster_center_docs[c] for c in clusters]
})
return df.sort_values('Cluster')
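# Zero-shot sentiment: the input is compared against a handful of Arabic anchor
# sentences and the closest anchor's label is reported, together with the
# similarity score for every anchor; this is a similarity heuristic, not a
# trained sentiment classifier.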
def analyze_sentiment(embedder: QwenEmbedder, text: str, model_choice: str = None, embedding_dim: int = None) -> Tuple[str, dict]:
# Define sentiment anchors
anchors = {
"very_positive": "هذا رائع جداً ومدهش! أنا سعيد للغاية",
"positive": "هذا جيد وممتع",
"neutral": "هذا عادي ومقبول",
"negative": "هذا سيء ومزعج",
"very_negative": "هذا فظيع جداً ومحبط للغاية"
}
# Get embeddings
text_embedding = embedder.get_embeddings([text])
anchor_embeddings = embedder.get_embeddings(list(anchors.values()))
# Calculate similarities
scores = (text_embedding @ anchor_embeddings.T).squeeze(0)
results = list(zip(anchors.keys(), scores.tolist()))
results.sort(key=lambda x: x[1], reverse=True)
# Return tuple of (sentiment, scores_dict)
return (
results[0][0],
{k: round(float(v), 3) for k, v in results}
)
def extract_concepts(embedder: QwenEmbedder, text: str, concept_type: str, model_choice: str = None, embedding_dim: int = None) -> List[Tuple[str, float]]:
# Define concept anchors based on type
concept_anchors = {
"emotions": [
"الفرح والسعادة",
"الحزن والأسى",
"الغضب والإحباط",
"الخوف والقلق",
"الحب والعاطفة",
"الأمل والتفاؤل"
],
"topics": [
"السياسة والحكم",
"الاقتصاد والمال",
"العلوم والتكنولوجيا",
"الفن والثقافة",
"الرياضة والترفيه",
"التعليم والمعرفة"
],
"themes": [
"العدالة والمساواة",
"التقدم والتطور",
"التقاليد والتراث",
"الحرية والاستقلال",
"التعاون والوحدة",
"الإبداع والابتكار"
]
}
anchors = concept_anchors.get(concept_type, concept_anchors["topics"])
# Get embeddings
text_embedding = embedder.get_embeddings([text])
anchor_embeddings = embedder.get_embeddings(anchors)
# Calculate similarities
scores = (text_embedding @ anchor_embeddings.T).squeeze(0)
results = [(anchor, float(score)) for anchor, score in zip(anchors, scores)]
results.sort(key=lambda x: x[1], reverse=True)
return [(concept, round(score, 3)) for concept, score in results]
def create_embedder(model_choice: str, embedding_dim: int = 768) -> QwenEmbedder:
model_name = AVAILABLE_MODELS[model_choice]
return QwenEmbedder(model_name=model_name, embedding_dim=embedding_dim)
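# The standalone helpers below (process_similarity, process_reranking,
# process_batch and process_retrieval_standalone) build a fresh embedder from
# the sidebar settings on every call. The Gradio UI instead routes through
# process_with_embedder, which reuses a single cached global embedder.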
def process_similarity(text1: str, text2: str, model_choice: str, embedding_dim: int) -> float:
embedder = create_embedder(model_choice, embedding_dim)
embeddings = embedder.get_embeddings([text1, text2])
similarity = torch.nn.functional.cosine_similarity(embeddings[0].unsqueeze(0), embeddings[1].unsqueeze(0))
return float(similarity)
def process_reranking(query: str, documents: str, model_choice: str, embedding_dim: int) -> Dict:
embedder = create_embedder(model_choice, embedding_dim)
documents = [doc.strip() for doc in documents.split('\n') if doc.strip()]
query_embedding = embedder.get_embeddings([query], with_instruction=True)
doc_embeddings = embedder.get_embeddings(documents)
similarities = torch.nn.functional.cosine_similarity(query_embedding, doc_embeddings)
# Sort documents by similarity
sorted_indices = torch.argsort(similarities, descending=True)
results = []
for idx in sorted_indices:
results.append({
'document': documents[idx],
'score': float(similarities[idx])
})
return {'results': results}
def process_batch(texts: str, model_choice: str, embedding_dim: int) -> Dict:
embedder = create_embedder(model_choice, embedding_dim)
texts = [text.strip() for text in texts.split('\n') if text.strip()]
embeddings = embedder.get_embeddings(texts)
similarity_matrix = torch.nn.functional.cosine_similarity(embeddings.unsqueeze(1), embeddings.unsqueeze(0), dim=2)
df = pd.DataFrame(similarity_matrix.cpu().numpy(), index=texts, columns=texts)
return {'similarity_matrix': df.to_dict()}
def process_retrieval_standalone(prompt: str, queries: str, documents: str, model_choice: str, embedding_dim: int) -> Dict:
embedder = create_embedder(model_choice, embedding_dim)
# Process input strings
queries = [q.strip() for q in queries.split('\n') if q.strip()]
documents = [doc.strip() for doc in documents.split('\n') if doc.strip()]
# Get embeddings
prompt_embedding = embedder.get_embeddings([prompt], with_instruction=True)
query_embeddings = embedder.get_embeddings(queries, with_instruction=True)
doc_embeddings = embedder.get_embeddings(documents)
# Calculate similarities
query_similarities = torch.nn.functional.cosine_similarity(prompt_embedding, query_embeddings)
doc_similarities = torch.nn.functional.cosine_similarity(prompt_embedding.repeat(len(documents), 1), doc_embeddings)
# Process results
results = {
'relevant_queries': [],
'relevant_documents': []
}
# Sort queries
query_indices = torch.argsort(query_similarities, descending=True)
for idx in query_indices:
results['relevant_queries'].append({
'query': queries[idx],
'similarity': float(query_similarities[idx])
})
# Sort documents
doc_indices = torch.argsort(doc_similarities, descending=True)
for idx in doc_indices:
results['relevant_documents'].append({
'document': documents[idx],
'similarity': float(doc_similarities[idx])
})
return results
# Custom CSS for the layout, sidebar and feature cards
custom_css = """
:root {
--primary-color: #2196F3;
--secondary-color: #1976D2;
--background-color: #f8f9fa;
--sidebar-bg: #ffffff;
--text-color: #333333;
--border-color: #e0e0e0;
}
.container {
max-width: 1200px;
margin: auto;
padding: 20px;
}
.sidebar {
background-color: var(--sidebar-bg);
border-right: 1px solid var(--border-color);
padding: 20px;
margin-right: 20px;
position: sticky;
top: 0;
height: 100vh;
overflow-y: auto;
}
.main-content {
background-color: var(--background-color);
padding: 20px;
border-radius: 10px;
}
.features-grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(150px, 1fr));
gap: 15px;
margin: 15px 0;
}
.feature-card {
background: white;
padding: 15px;
border-radius: 6px;
box-shadow: 0 1px 3px rgba(0,0,0,0.1);
transition: all 0.3s ease;
border: 1px solid var(--border-color);
text-align: center;
}
.feature-card:hover {
transform: translateY(-3px);
box-shadow: 0 3px 6px rgba(0,0,0,0.15);
border-color: var(--primary-color);
}
.feature-icon {
font-size: 24px;
margin-bottom: 10px;
color: var(--primary-color);
}
.feature-card h3 {
color: var(--text-color);
margin: 8px 0;
font-size: 0.95em;
font-weight: 600;
}
.feature-card p {
color: #666;
font-size: 0.8em;
line-height: 1.3;
margin: 5px 0;
}
.features-summary {
margin: 40px 0;
padding: 30px;
background: white;
border-radius: 12px;
box-shadow: 0 2px 8px rgba(0,0,0,0.1);
}
.features-summary h2 {
color: var(--text-color);
margin-bottom: 25px;
text-align: center;
font-size: 1.5em;
}
.feature-list {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
gap: 30px;
}
.feature-group {
padding: 20px;
background: var(--background-color);
border-radius: 8px;
border: 1px solid var(--border-color);
}
.feature-group h3 {
color: var(--primary-color);
margin-bottom: 15px;
font-size: 1.2em;
}
.feature-group ul {
list-style: none;
padding: 0;
margin: 0;
}
.feature-group li {
padding: 8px 0;
color: var(--text-color);
position: relative;
padding-left: 20px;
}
.feature-group li:before {
content: "•";
color: var(--primary-color);
position: absolute;
left: 0;
}
.description {
margin: 20px 0;
padding: 15px;
border-radius: 8px;
background-color: #ffffff;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
.example {
margin: 10px 0;
padding: 15px;
border-left: 4px solid var(--primary-color);
background-color: #ffffff;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
.warning {
color: #721c24;
background-color: #f8d7da;
border: 1px solid #f5c6cb;
padding: 15px;
border-radius: 8px;
margin: 10px 0;
}
.settings {
background-color: #ffffff;
padding: 20px;
border-radius: 8px;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
margin: 20px 0;
}
.tab-content {
padding: 20px;
background-color: #ffffff;
border-radius: 8px;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
.heading {
color: var(--text-color);
margin-bottom: 20px;
padding-bottom: 10px;
border-bottom: 2px solid var(--primary-color);
}
button.primary {
background-color: var(--primary-color) !important;
}
button.secondary {
background-color: var(--secondary-color) !important;
}
"""
# Create the Gradio interface
def create_demo():
demo = gr.Blocks(title="Advanced Text Processing with Qwen", css=custom_css, theme=gr.themes.Soft())
with demo:
with gr.Row():
# Sidebar
with gr.Column(scale=1, elem_classes="sidebar"):
gr.Markdown("""
# Qwen Embeddings
### Navigation
- [Configuration](#configuration)
- [Features](#features)
- [Documentation](#documentation)
""")
with gr.Accordion("Configuration", open=True):
gr.Markdown("""
### Model Settings
Configure the embedding model parameters below.
""")
model_choice = gr.Dropdown(
choices=list(AVAILABLE_MODELS.keys()),
value=list(AVAILABLE_MODELS.keys())[0],
label="Select Model"
)
embedding_dim = gr.Slider(
minimum=32,
maximum=1024,
value=768,
step=32,
label="Embedding Dimension",
elem_classes="settings"
)
update_dim_btn = gr.Button("Update Dimension", variant="secondary")
dim_status = gr.Textbox(label="Status", interactive=False)
with gr.Accordion("Documentation", open=False):
gr.Markdown("""
### Usage Guide
1. **Embedding Dimension**
- 32-128: Fast, simple tasks
- 256-512: Balanced performance
- 768: Default, full model
- 1024: Maximum detail
2. **Best Practices**
- Use appropriate dimensions for your task
- Consider batch size for multiple documents
- Test different settings for optimal results
""")
# Main Content
with gr.Column(scale=4):
gr.Markdown("""
# Advanced Text Processing Suite
Welcome to the Advanced Text Processing Suite powered by Qwen Embeddings.
This tool provides state-of-the-art text analysis capabilities with support for Arabic and multiple languages.
""")
# Feature Grid
gr.HTML("""
<div class="features-grid">
<div class="feature-card">
<div class="feature-icon">🔄</div>
<h3>Text Similarity</h3>
<p>Compare text meanings</p>
</div>
<div class="feature-card">
<div class="feature-icon">🔍</div>
<h3>Semantic Search</h3>
<p>Find relevant docs</p>
</div>
<div class="feature-card">
<div class="feature-icon">📊</div>
<h3>Batch Analysis</h3>
<p>Process multiple texts</p>
</div>
<div class="feature-card">
<div class="feature-icon">🎯</div>
<h3>Multi-Query</h3>
<p>Advanced retrieval</p>
</div>
<div class="feature-card">
<div class="feature-icon">🌐</div>
<h3>Cross-Lingual</h3>
<p>Cross-language match</p>
</div>
<div class="feature-card">
<div class="feature-icon">🏷️</div>
<h3>Classification</h3>
<p>Categorize texts</p>
</div>
<div class="feature-card">
<div class="feature-icon">🔮</div>
<h3>Clustering</h3>
<p>Group documents</p>
</div>
<div class="feature-card">
<div class="feature-icon">😊</div>
<h3>Sentiment</h3>
<p>Analyze emotions</p>
</div>
<div class="feature-card">
<div class="feature-icon">🎨</div>
<h3>Concepts</h3>
<p>Extract themes</p>
</div>
</div>
""")
with gr.Tabs() as tabs:
# Text Similarity Tab
with gr.Tab("Text Similarity Analysis"):
with gr.Column(elem_classes="tab-content"):
gr.Markdown("""
### Text Similarity Analysis
Compare the semantic similarity between two texts. The score ranges from 0 (completely different) to 1 (identical meaning).
<div class="example">
<strong>Try these Arabic examples:</strong><br>
• "أحب القراءة كثيراً" and "القراءة من أحب هواياتي"<br>
• "السماء صافية اليوم" and "الطقس حار جداً"
</div>
""")
with gr.Row():
text1 = gr.Textbox(
label="First Text",
lines=3,
placeholder="Enter first text here...",
value="أحب القراءة كثيراً"
)
text2 = gr.Textbox(
label="Second Text",
lines=3,
placeholder="Enter second text here...",
value="القراءة من أحب هواياتي"
)
similarity_btn = gr.Button("Calculate Similarity", variant="primary")
similarity_score = gr.Number(label="Similarity Score")
similarity_btn.click(
fn=lambda t1, t2, m, d: process_with_embedder('compute_similarity', t1, t2, m, d),
inputs=[text1, text2, model_choice, embedding_dim],
outputs=similarity_score
)
# Document Reranking Tab
with gr.Tab("Semantic Search & Reranking"):
with gr.Column(elem_classes="tab-content"):
gr.Markdown("""
### Semantic Search & Document Reranking
Search through a collection of documents and rank them by semantic relevance to your query.
<div class="example">
<strong>Try these Arabic queries:</strong><br>
• "ما هي عواصم الدول العربية؟"<br>
• "أين تقع أكبر المدن العربية؟"<br>
• "ما هي المراكز الثقافية العربية؟"
</div>
""")
query_text = gr.Textbox(
label="Search Query",
placeholder="Enter your search query...",
value="ما هي عواصم الدول العربية؟"
)
documents_text = gr.Textbox(
label="Documents Collection (one per line)",
lines=10,
placeholder="Enter documents here, one per line...",
value="""القاهرة هي عاصمة جمهورية مصر العربية وأكبر مدنها.
الرياض هي عاصمة المملكة العربية السعودية ومركزها الاقتصادي.
دمشق هي أقدم عاصمة مأهولة في التاريخ وهي عاصمة سوريا.
بغداد عاصمة العراق وتقع على نهر دجلة.
الدار البيضاء أكبر مدن المغرب وعاصمته الاقتصادية.
تونس هي عاصمة الجمهورية التونسية ومركزها الثقافي."""
)
rerank_btn = gr.Button("Search & Rank", variant="primary")
rerank_results = gr.Dataframe(
headers=["Document", "Relevance Score"],
label="Search Results"
)
rerank_btn.click(
fn=lambda q, d, m, e: process_with_embedder('rerank_documents', q, d, m, e),
inputs=[query_text, documents_text, model_choice, embedding_dim],
outputs=rerank_results
)
# Batch Analysis Tab
with gr.Tab("Batch Similarity Analysis"):
with gr.Column(elem_classes="tab-content"):
gr.Markdown("""
### Batch Similarity Analysis
Analyze semantic relationships between multiple texts simultaneously.
<div class="example">
<strong>The example shows Arabic proverbs about friendship:</strong><br>
See how the model captures the semantic relationships between similar themes.
</div>
""")
batch_texts = gr.Textbox(
label="Input Texts (one per line)",
lines=10,
placeholder="Enter texts here, one per line...",
value="""الصديق وقت الضيق.
الصديق الحقيقي يظهر عند الشدائد.
عند المحن تعرف إخوانك.
وقت الشدة بتعرف صحابك.
الصاحب ساحب."""
)
process_btn = gr.Button("Analyze Relationships", variant="primary")
similarity_matrix = gr.Dataframe(
label="Similarity Matrix",
wrap=True
)
process_btn.click(
fn=lambda t, m, e: process_with_embedder('process_batch_embeddings', t, m, e),
inputs=[batch_texts, model_choice, embedding_dim],
outputs=[similarity_matrix]
)
# Add new Retrieval Tab
with gr.Tab("Multi-Query Retrieval"):
with gr.Column(elem_classes="tab-content"):
gr.Markdown("""
### Multi-Query Document Retrieval
Match multiple queries against multiple documents simultaneously using semantic search.
<div class="description">
This tab implements the exact retrieval logic from the Qwen example, allowing you to:
- Define a custom task prompt
- Input multiple queries
- Input multiple documents
- See all query-document match scores in a matrix
</div>
<div class="example">
<strong>Try these examples:</strong><br>
<strong>Task prompt:</strong> "Given a web search query, retrieve relevant passages that answer the query"<br>
<strong>Queries:</strong>
• "ما هي أكبر المدن العربية؟"
• "أين تقع أهم المراكز الثقافية؟"<br>
<strong>Documents:</strong> Use the example documents or add your own
</div>
""")
task_prompt = gr.Textbox(
label="Task Prompt",
placeholder="Enter the task description here...",
value="Given a web search query, retrieve relevant passages that answer the query",
lines=2
)
with gr.Row():
queries_text = gr.Textbox(
label="Queries (one per line)",
placeholder="Enter your queries here, one per line...",
value="""ما هي أكبر المدن العربية؟
أين تقع أهم المراكز الثقافية؟""",
lines=5
)
documents_text = gr.Textbox(
label="Documents (one per line)",
placeholder="Enter your documents here, one per line...",
value="""القاهرة هي أكبر مدينة عربية وعاصمة مصر، وتضم العديد من المعالم الثقافية والتاريخية.
الرياض عاصمة المملكة العربية السعودية ومركز ثقافي واقتصادي مهم.
دبي مدينة عالمية في الإمارات العربية المتحدة ومركز تجاري رئيسي.
بيروت عاصمة لبنان ومركز ثقافي مهم في العالم العربي.""",
lines=5
)
retrieve_btn = gr.Button("Process Retrieval", variant="primary")
retrieval_matrix = gr.Dataframe(
label="Query-Document Relevance Matrix",
wrap=True
)
gr.Markdown("""
<div class="description">
<strong>How to read the results:</strong>
- Each row represents a query
- Each column represents a document
- Values show the relevance score (0-1) between each query-document pair
- Higher scores indicate better matches
</div>
""")
retrieve_btn.click(
fn=lambda p, q, d, m, e: process_with_embedder('process_retrieval', p, q, d, m, e),
inputs=[task_prompt, queries_text, documents_text, model_choice, embedding_dim],
outputs=[retrieval_matrix]
)
# Add Cross-Lingual Tab after the Multi-Query Retrieval tab
with gr.Tab("Cross-Lingual Matching"):
with gr.Column(elem_classes="tab-content"):
gr.Markdown("""
### Cross-Lingual Semantic Matching
Compare the meaning of texts across Arabic and English languages.
<div class="description">
This feature demonstrates the model's ability to understand semantic similarity across different languages.
Try comparing similar concepts expressed in Arabic and English to see how well the model captures cross-lingual meaning.
</div>
<div class="example">
<strong>Try these examples:</strong><br>
<strong>Arabic:</strong> "القراءة غذاء العقل والروح"<br>
<strong>English:</strong> "Reading nourishes the mind and soul"<br>
Or try your own pairs of semantically similar texts in both languages.
</div>
""")
with gr.Row():
arabic_text = gr.Textbox(
label="Arabic Text",
placeholder="Enter Arabic text here...",
value="القراءة غذاء العقل والروح",
lines=3
)
english_text = gr.Textbox(
label="English Text",
placeholder="Enter English text here...",
value="Reading nourishes the mind and soul",
lines=3
)
match_btn = gr.Button("Compare Texts", variant="primary")
with gr.Row():
cross_lingual_score = gr.Number(
label="Cross-Lingual Similarity Score",
value=None
)
gr.Markdown("""
<div class="description">
<strong>Understanding the score:</strong>
- Score ranges from 0 (completely different meaning) to 1 (same meaning)
- Scores above 0.7 usually indicate strong semantic similarity
- The model considers the meaning, not just word-for-word translation
</div>
""")
match_btn.click(
fn=lambda a, e, m, d: process_with_embedder('process_cross_lingual', a, e, m, d),
inputs=[arabic_text, english_text, model_choice, embedding_dim],
outputs=[cross_lingual_score]
)
# Add Text Classification Tab
with gr.Tab("Text Classification"):
with gr.Column(elem_classes="tab-content"):
gr.Markdown("""
### Text Classification
Classify text into predefined categories using semantic similarity.
<div class="description">
The model will compare your text against each category and rank them by relevance.
You can define your own categories or use the provided examples.
</div>
""")
input_text = gr.Textbox(
label="Input Text",
placeholder="Enter the text to classify...",
value="الذكاء الاصطناعي يغير طريقة عملنا وتفكيرنا في المستقبل",
lines=3
)
categories_text = gr.Textbox(
label="Categories (one per line)",
placeholder="Enter categories here...",
value="""التكنولوجيا والابتكار
الاقتصاد والأعمال
التعليم والتدريب
الثقافة والفنون
الصحة والطب""",
lines=5
)
classify_btn = gr.Button("Classify Text", variant="primary")
classification_results = gr.Dataframe(
headers=["Category", "Relevance Score"],
label="Classification Results"
)
classify_btn.click(
fn=lambda t, c, m, e: process_with_embedder('classify_text', t, c, m, e),
inputs=[input_text, categories_text, model_choice, embedding_dim],
outputs=classification_results
)
# Add Document Clustering Tab
with gr.Tab("Document Clustering"):
with gr.Column(elem_classes="tab-content"):
gr.Markdown("""
### Document Clustering
Group similar documents together using semantic clustering.
<div class="description">
This feature will:
- Group similar documents into clusters
- Identify the most representative document for each cluster
- Help discover themes and patterns in your document collection
</div>
""")
cluster_docs = gr.Textbox(
label="Documents (one per line)",
placeholder="Enter documents to cluster...",
value="""الذكاء الاصطناعي يفتح آفاقاً جديدة في مجال الطب.
الروبوتات تساعد الأطباء في إجراء العمليات الجراحية.
التعلم الآلي يحسن من دقة التشخيص الطبي.
الفن يعبر عن مشاعر الإنسان وأحاسيسه.
الموسيقى لغة عالمية تتخطى حدود الثقافات.
الرسم والنحت من أقدم أشكال التعبير الفني.
التجارة الإلكترونية تغير نمط التسوق التقليدي.
التسوق عبر الإنترنت يوفر الوقت والجهد.
المتاجر الرقمية تتيح خيارات أوسع للمستهلكين.""",
lines=10
)
num_clusters = gr.Slider(
minimum=2,
maximum=10,
value=3,
step=1,
label="Number of Clusters"
)
cluster_btn = gr.Button("Cluster Documents", variant="primary")
clustering_results = gr.Dataframe(
label="Clustering Results"
)
cluster_btn.click(
fn=lambda d, n, m, e: process_with_embedder('cluster_documents', d, n, m, e),
inputs=[cluster_docs, num_clusters, model_choice, embedding_dim],
outputs=clustering_results
)
# Add Sentiment Analysis Tab
with gr.Tab("Sentiment Analysis"):
with gr.Column(elem_classes="tab-content"):
gr.Markdown("""
### Arabic Sentiment Analysis
Analyze the sentiment of Arabic text using semantic similarity to sentiment anchors.
<div class="description">
The model will compare your text against predefined sentiment anchors and determine:
- The overall sentiment
- Confidence scores for each sentiment level
</div>
""")
sentiment_text = gr.Textbox(
label="Text to Analyze",
placeholder="Enter text to analyze sentiment...",
value="هذا المشروع رائع جداً وسيحدث تغييراً إيجابياً في حياة الكثيرين",
lines=3
)
analyze_btn = gr.Button("Analyze Sentiment", variant="primary")
with gr.Row():
sentiment_label = gr.Label(label="Overall Sentiment")
sentiment_scores = gr.Json(label="Detailed Scores")
analyze_btn.click(
fn=lambda t, m, e: process_with_embedder('analyze_sentiment', t, m, e),
inputs=[sentiment_text, model_choice, embedding_dim],
outputs=[sentiment_label, sentiment_scores]
)
# Add Concept Extraction Tab
with gr.Tab("Concept Extraction"):
with gr.Column(elem_classes="tab-content"):
gr.Markdown("""
### Concept Extraction
Extract key concepts and themes from Arabic text.
<div class="description">
Analyze text to identify:
- Emotional content
- Main topics
- Underlying themes
</div>
""")
concept_text = gr.Textbox(
label="Text to Analyze",
placeholder="Enter text to analyze...",
value="نحن نؤمن بأهمية التعليم والابتكار لبناء مستقبل أفضل لأجيالنا القادمة",
lines=3
)
concept_type = gr.Radio(
choices=["emotions", "topics", "themes"],
value="themes",
label="Concept Type"
)
extract_btn = gr.Button("Extract Concepts", variant="primary")
concept_results = gr.Dataframe(
headers=["Concept", "Relevance Score"],
label="Extracted Concepts"
)
extract_btn.click(
fn=lambda t, c, m, e: process_with_embedder('extract_concepts', t, c, m, e),
inputs=[concept_text, concept_type, model_choice, embedding_dim],
outputs=concept_results
)
# Update dimension handler
@spaces.GPU(duration=120)
def update_embedder_dim(dim):
global embedder
try:
embedder = initialize_embedder(embedding_dim=dim)
return f"Successfully updated embedding dimension to {dim}"
except Exception as e:
return f"Error updating dimension: {str(e)}"
update_dim_btn.click(
fn=update_embedder_dim,
inputs=[embedding_dim],
outputs=dim_status
)
return demo
if __name__ == "__main__":
demo = create_demo()
demo.queue()
demo.launch()