import gradio as gr
import torch
import torch.nn.functional as F
import numpy as np
import plotly.express as px
import pandas as pd
import spaces
from typing import List, Tuple, Dict
from torch import Tensor
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
import json

# Initialize the embedder at module level
embedder = None

AVAILABLE_MODELS = {
    "Qwen3-Embedding-0.6B": "Qwen/Qwen3-Embedding-0.6B",
    "Semantic-Ar-Qwen-Embed-0.6B": "Omartificial-Intelligence-Space/Semantic-Ar-Qwen-Embed-0.6B"
}


class QwenEmbedder:
    def __init__(self, model_name: str = "Qwen/Qwen3-Embedding-0.6B", embedding_dim: int = 768):
        self.model = SentenceTransformer(model_name)
        self.embedding_dim = embedding_dim
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)
        # Derive the model's native output width instead of hardcoding it. Only
        # add a projection layer when a different dimension is requested; note
        # the projection is randomly initialized (untrained), so reduced
        # dimensions give approximate, not learned, embeddings.
        native_dim = self.model.get_sentence_embedding_dimension()
        if embedding_dim != native_dim:
            self.projection = torch.nn.Linear(native_dim, embedding_dim)
            self.projection.to(self.device)
        else:
            self.projection = None

    def get_embeddings(self, texts: List[str], with_instruction: bool = False) -> torch.Tensor:
        if with_instruction:
            texts = [f"Represent this Arabic text for retrieval: {text}" for text in texts]
        embeddings = self.model.encode(texts, convert_to_tensor=True)
        if self.projection is not None:
            embeddings = self.projection(embeddings)
        # Normalize embeddings so dot products equal cosine similarities
        embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
        return embeddings


@spaces.GPU(duration=120)
def initialize_embedder(embedding_dim=768):
    # Initialize device inside the GPU worker
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Initializing embedder on device: {device}")
    # Create the model with the specified dimension
    return QwenEmbedder(embedding_dim=embedding_dim)


@spaces.GPU(duration=120)
def process_with_embedder(fn_name, *args):
    """Generic handler that routes UI callbacks to embedder operations."""
    global embedder
    if embedder is None:
        embedder = initialize_embedder()

    # Map function names to the actual functions
    fn_map = {
        'compute_similarity': compute_similarity,
        'rerank_documents': rerank_documents,
        'process_batch_embeddings': process_batch_embeddings,
        'process_retrieval': process_retrieval,
        'process_cross_lingual': process_cross_lingual,
        'classify_text': classify_text,
        'cluster_documents': cluster_documents,
        'analyze_sentiment': analyze_sentiment,
        'extract_concepts': extract_concepts
    }
    return fn_map[fn_name](embedder, *args)


# Check for GPU support and configure appropriately
device = "cuda" if torch.cuda.is_available() else "cpu"
zero = torch.Tensor([0]).to(device)
print(f"Device being used: {zero.device}")


def last_token_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    # If every sequence attends at the final position, the batch is left-padded
    # and the last hidden state is the last token for every row.
    left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
    if left_padding:
        return last_hidden_states[:, -1]
    else:
        sequence_lengths = attention_mask.sum(dim=1) - 1
        batch_size = last_hidden_states.shape[0]
        return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]


def get_detailed_instruct(task_description: str, query: str) -> str:
    return f'Instruct: {task_description}\nQuery: {query}'


def tokenize(tokenizer, input_texts, eod_id, max_length):
    # Truncate with headroom, then append the EOD token to every sequence
    batch_dict = tokenizer(input_texts, padding=False, truncation=True, max_length=max_length - 2)
    for seq, att in zip(batch_dict["input_ids"], batch_dict["attention_mask"]):
        seq.append(eod_id)
        att.append(1)
    batch_dict = tokenizer.pad(batch_dict, padding=True, return_tensors="pt")
    return batch_dict
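
# The three helpers above support a raw-transformers embedding path; the app
# itself goes through the SentenceTransformer wrapper instead. A minimal sketch
# of that path, for illustration only (not called anywhere): it assumes the
# tokenizer defines an "<|endoftext|>" token and that left padding is wanted,
# as in the published Qwen embedding examples.
def _example_raw_embedding_path():
    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-Embedding-0.6B", padding_side="left")
    model = AutoModel.from_pretrained("Qwen/Qwen3-Embedding-0.6B")
    texts = [get_detailed_instruct(
        'Given a web search query, retrieve relevant passages that answer the query',
        'ما هي عواصم الدول العربية؟')]
    eod_id = tokenizer.convert_tokens_to_ids("<|endoftext|>")
    batch = tokenize(tokenizer, texts, eod_id, max_length=512)
    with torch.no_grad():
        outputs = model(**batch)
    # Pool the last token, then L2-normalize as the app does elsewhere
    embeddings = last_token_pool(outputs.last_hidden_state, batch["attention_mask"])
    return F.normalize(embeddings, p=2, dim=1)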
def compute_similarity(embedder: QwenEmbedder, text1: str, text2: str,
                       model_choice: str = None, embedding_dim: int = None) -> float:
    embeddings = embedder.get_embeddings([text1, text2])
    similarity = torch.cosine_similarity(embeddings[0:1], embeddings[1:2]).item()
    return round(similarity, 3)


def rerank_documents(embedder: QwenEmbedder, query: str, documents: str,
                     model_choice: str = None, embedding_dim: int = None) -> List[Tuple[str, float]]:
    docs_list = [doc.strip() for doc in documents.split('\n') if doc.strip()]

    # Add an instruction to the query
    task = 'Given a search query, retrieve relevant passages that answer the query'
    query_with_instruct = get_detailed_instruct(task, query)

    # Get embeddings
    query_embedding = embedder.get_embeddings([query_with_instruct])
    doc_embeddings = embedder.get_embeddings(docs_list)

    # Calculate similarities (embeddings are normalized, so dot product = cosine)
    scores = (query_embedding @ doc_embeddings.T).squeeze(0)
    results = [(doc, float(score)) for doc, score in zip(docs_list, scores)]
    results.sort(key=lambda x: x[1], reverse=True)
    return [(doc, round(score, 3)) for doc, score in results]


def process_batch_embeddings(embedder: QwenEmbedder, texts: str,
                             model_choice: str = None, embedding_dim: int = None) -> pd.DataFrame:
    text_list = [text.strip() for text in texts.split('\n') if text.strip()]
    if len(text_list) < 1:
        return pd.DataFrame()

    embeddings = embedder.get_embeddings(text_list)
    scores = (embeddings @ embeddings.T).cpu().numpy()

    # Create a similarity-matrix DataFrame
    df_similarities = pd.DataFrame(scores, index=text_list, columns=text_list)
    return df_similarities.round(3)


def process_retrieval(embedder: QwenEmbedder, task_prompt: str, queries: str, documents: str,
                      model_choice: str = None, embedding_dim: int = None) -> pd.DataFrame:
    # Process queries and documents
    query_list = [q.strip() for q in queries.split('\n') if q.strip()]
    doc_list = [d.strip() for d in documents.split('\n') if d.strip()]

    if not query_list or not doc_list:
        return pd.DataFrame()

    # Add the task instruction to each query
    instructed_queries = [get_detailed_instruct(task_prompt, q) for q in query_list]

    # Get embeddings for both queries and documents
    query_embeddings = embedder.get_embeddings(instructed_queries)
    doc_embeddings = embedder.get_embeddings(doc_list)

    # Calculate similarity scores
    scores = (query_embeddings @ doc_embeddings.T).cpu().numpy()

    # Create a DataFrame with the results
    df = pd.DataFrame(scores, index=query_list, columns=doc_list)
    return df.round(3)


def process_cross_lingual(embedder: QwenEmbedder, arabic_text: str, english_text: str,
                          model_choice: str = None, embedding_dim: int = None) -> float:
    # Return a plain float: the UI feeds this into a gr.Number output,
    # which cannot render the dict this function previously returned.
    texts = [arabic_text, english_text]
    embeddings = embedder.get_embeddings(texts)
    similarity = torch.cosine_similarity(embeddings[0:1], embeddings[1:2]).item()
    return round(similarity, 3)


def classify_text(embedder: QwenEmbedder, text: str, categories: str,
                  model_choice: str = None, embedding_dim: int = None) -> List[Tuple[str, float]]:
    cat_list = [c.strip() for c in categories.split('\n') if c.strip()]

    text_embedding = embedder.get_embeddings([text])
    cat_embeddings = embedder.get_embeddings(cat_list)

    scores = (text_embedding @ cat_embeddings.T).squeeze(0)
    results = [(cat, float(score)) for cat, score in zip(cat_list, scores)]
    results.sort(key=lambda x: x[1], reverse=True)
    return [(cat, round(score, 3)) for cat, score in results]
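
# A hedged usage sketch for the helpers above, runnable outside Gradio.
# Illustration only (not called by the app); it assumes the model weights
# can be downloaded in the current environment.
def _example_similarity_usage():
    emb = QwenEmbedder()
    print(compute_similarity(emb, "أحب القراءة كثيراً", "القراءة من أحب هواياتي"))
    docs = "القاهرة هي عاصمة مصر.\nالدار البيضاء أكبر مدن المغرب."
    print(rerank_documents(emb, "ما هي عواصم الدول العربية؟", docs))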
def cluster_documents(embedder: QwenEmbedder, documents: str, num_clusters: int,
                      model_choice: str = None, embedding_dim: int = None) -> pd.DataFrame:
    from sklearn.cluster import KMeans

    doc_list = [doc.strip() for doc in documents.split('\n') if doc.strip()]
    if len(doc_list) < num_clusters:
        return pd.DataFrame()

    embeddings = embedder.get_embeddings(doc_list)

    # Perform clustering
    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    clusters = kmeans.fit_predict(embeddings.cpu().numpy())

    # Find the most representative (center-closest) document for each cluster,
    # reusing the embeddings computed above instead of re-encoding.
    cluster_centers = kmeans.cluster_centers_
    cluster_center_docs = []
    for i in range(num_clusters):
        member_idx = [j for j, cluster in enumerate(clusters) if cluster == i]
        cluster_docs = [doc_list[j] for j in member_idx]
        cluster_embeddings = embeddings[member_idx]
        # Move the center onto the same device/dtype as the embeddings
        # (otherwise this fails on CUDA with a device mismatch).
        center_embedding = torch.tensor(
            cluster_centers[i],
            dtype=cluster_embeddings.dtype,
            device=cluster_embeddings.device
        ).unsqueeze(0)
        similarities = F.cosine_similarity(cluster_embeddings, center_embedding)
        center_doc = cluster_docs[similarities.argmax().item()]
        cluster_center_docs.append(center_doc)

    # Create the results DataFrame
    df = pd.DataFrame({
        'Document': doc_list,
        'Cluster': clusters,
        'Cluster Center Document': [cluster_center_docs[c] for c in clusters]
    })
    return df.sort_values('Cluster')


def analyze_sentiment(embedder: QwenEmbedder, text: str,
                      model_choice: str = None, embedding_dim: int = None) -> Tuple[str, dict]:
    # Define sentiment anchors (Arabic phrases spanning the sentiment scale)
    anchors = {
        "very_positive": "هذا رائع جداً ومدهش! أنا سعيد للغاية",  # "This is wonderful and amazing! I am extremely happy"
        "positive": "هذا جيد وممتع",  # "This is good and enjoyable"
        "neutral": "هذا عادي ومقبول",  # "This is ordinary and acceptable"
        "negative": "هذا سيء ومزعج",  # "This is bad and annoying"
        "very_negative": "هذا فظيع جداً ومحبط للغاية"  # "This is awful and deeply frustrating"
    }

    # Get embeddings
    text_embedding = embedder.get_embeddings([text])
    anchor_embeddings = embedder.get_embeddings(list(anchors.values()))

    # Calculate similarities
    scores = (text_embedding @ anchor_embeddings.T).squeeze(0)
    results = list(zip(anchors.keys(), scores.tolist()))
    results.sort(key=lambda x: x[1], reverse=True)

    # Return a tuple of (top sentiment, scores dict)
    return (
        results[0][0],
        {k: round(float(v), 3) for k, v in results}
    )


def extract_concepts(embedder: QwenEmbedder, text: str, concept_type: str,
                     model_choice: str = None, embedding_dim: int = None) -> List[Tuple[str, float]]:
    # Define concept anchors for each analysis type
    concept_anchors = {
        "emotions": [
            "الفرح والسعادة",
            "الحزن والأسى",
            "الغضب والإحباط",
            "الخوف والقلق",
            "الحب والعاطفة",
            "الأمل والتفاؤل"
        ],
        "topics": [
            "السياسة والحكم",
            "الاقتصاد والمال",
            "العلوم والتكنولوجيا",
            "الفن والثقافة",
            "الرياضة والترفيه",
            "التعليم والمعرفة"
        ],
        "themes": [
            "العدالة والمساواة",
            "التقدم والتطور",
            "التقاليد والتراث",
            "الحرية والاستقلال",
            "التعاون والوحدة",
            "الإبداع والابتكار"
        ]
    }
    anchors = concept_anchors.get(concept_type, concept_anchors["topics"])

    # Get embeddings
    text_embedding = embedder.get_embeddings([text])
    anchor_embeddings = embedder.get_embeddings(anchors)

    # Calculate similarities
    scores = (text_embedding @ anchor_embeddings.T).squeeze(0)
    results = [(anchor, float(score)) for anchor, score in zip(anchors, scores)]
    results.sort(key=lambda x: x[1], reverse=True)
    return [(concept, round(score, 3)) for concept, score in results]
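
# Sketch: the analysis helpers used directly. Illustration only, not called by
# the app; it assumes at least `num_clusters` distinct documents are supplied.
def _example_analysis_usage():
    emb = QwenEmbedder()
    docs = "\n".join([
        "الفن يعبر عن مشاعر الإنسان.",
        "الموسيقى لغة عالمية.",
        "الذكاء الاصطناعي يغير العالم.",
        "التعلم الآلي يحسن دقة التشخيص.",
    ])
    print(cluster_documents(emb, docs, num_clusters=2))
    print(analyze_sentiment(emb, "هذا المشروع رائع جداً"))
    print(extract_concepts(emb, "نؤمن بأهمية التعليم والابتكار", "themes"))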
# --- Standalone variants: these build a fresh embedder on every call. They are
# kept as an alternative API but are not wired to the UI, which dispatches
# through `process_with_embedder` instead. ---

def create_embedder(model_choice: str, embedding_dim: int = 768) -> QwenEmbedder:
    model_name = AVAILABLE_MODELS[model_choice]
    return QwenEmbedder(model_name=model_name, embedding_dim=embedding_dim)


def process_similarity(text1: str, text2: str, model_choice: str, embedding_dim: int) -> float:
    embedder = create_embedder(model_choice, embedding_dim)
    embeddings = embedder.get_embeddings([text1, text2])
    similarity = torch.nn.functional.cosine_similarity(
        embeddings[0].unsqueeze(0), embeddings[1].unsqueeze(0))
    return float(similarity)


def process_reranking(query: str, documents: str, model_choice: str, embedding_dim: int) -> Dict:
    embedder = create_embedder(model_choice, embedding_dim)
    documents = [doc.strip() for doc in documents.split('\n') if doc.strip()]

    query_embedding = embedder.get_embeddings([query], with_instruction=True)
    doc_embeddings = embedder.get_embeddings(documents)
    similarities = torch.nn.functional.cosine_similarity(query_embedding, doc_embeddings)

    # Sort documents by similarity
    sorted_indices = torch.argsort(similarities, descending=True)
    results = []
    for idx in sorted_indices:
        results.append({
            'document': documents[idx],
            'score': float(similarities[idx])
        })
    return {'results': results}


def process_batch(texts: str, model_choice: str, embedding_dim: int) -> Dict:
    embedder = create_embedder(model_choice, embedding_dim)
    texts = [text.strip() for text in texts.split('\n') if text.strip()]

    embeddings = embedder.get_embeddings(texts)
    similarity_matrix = torch.nn.functional.cosine_similarity(
        embeddings.unsqueeze(1), embeddings.unsqueeze(0), dim=2)

    df = pd.DataFrame(similarity_matrix.cpu().numpy(), index=texts, columns=texts)
    return {'similarity_matrix': df.to_dict()}


# Renamed from `process_retrieval`: the original name shadowed the
# embedder-based `process_retrieval` above, which broke the UI dispatch
# through `process_with_embedder` (wrong signature at call time).
def process_retrieval_standalone(prompt: str, queries: str, documents: str,
                                 model_choice: str, embedding_dim: int) -> Dict:
    embedder = create_embedder(model_choice, embedding_dim)

    # Process input strings
    queries = [q.strip() for q in queries.split('\n') if q.strip()]
    documents = [doc.strip() for doc in documents.split('\n') if doc.strip()]

    # Get embeddings
    prompt_embedding = embedder.get_embeddings([prompt], with_instruction=True)
    query_embeddings = embedder.get_embeddings(queries, with_instruction=True)
    doc_embeddings = embedder.get_embeddings(documents)

    # Calculate similarities against the prompt
    query_similarities = torch.nn.functional.cosine_similarity(prompt_embedding, query_embeddings)
    doc_similarities = torch.nn.functional.cosine_similarity(
        prompt_embedding.repeat(len(documents), 1), doc_embeddings)

    # Collect results, sorted by similarity
    results = {
        'relevant_queries': [],
        'relevant_documents': []
    }

    query_indices = torch.argsort(query_similarities, descending=True)
    for idx in query_indices:
        results['relevant_queries'].append({
            'query': queries[idx],
            'similarity': float(query_similarities[idx])
        })

    doc_indices = torch.argsort(doc_similarities, descending=True)
    for idx in doc_indices:
        results['relevant_documents'].append({
            'document': documents[idx],
            'similarity': float(doc_similarities[idx])
        })
    return results
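
# Sketch: the standalone API used without the Gradio UI. Illustration only;
# the model key must be one of the AVAILABLE_MODELS entries defined above.
def _example_standalone_usage():
    score = process_similarity("السماء صافية", "الطقس جميل",
                               "Qwen3-Embedding-0.6B", 768)
    print(score)
    ranked = process_reranking("ما هي عواصم الدول العربية؟",
                               "القاهرة عاصمة مصر.\nبغداد عاصمة العراق.",
                               "Qwen3-Embedding-0.6B", 768)
    print(ranked['results'][0])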
# Update the CSS to improve feature visibility
custom_css = """
:root {
    --primary-color: #2196F3;
    --secondary-color: #1976D2;
    --background-color: #f8f9fa;
    --sidebar-bg: #ffffff;
    --text-color: #333333;
    --border-color: #e0e0e0;
}
.container { max-width: 1200px; margin: auto; padding: 20px; }
.sidebar { background-color: var(--sidebar-bg); border-right: 1px solid var(--border-color); padding: 20px; margin-right: 20px; position: sticky; top: 0; height: 100vh; overflow-y: auto; }
.main-content { background-color: var(--background-color); padding: 20px; border-radius: 10px; }
.features-grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(150px, 1fr)); gap: 15px; margin: 15px 0; }
.feature-card { background: white; padding: 15px; border-radius: 6px; box-shadow: 0 1px 3px rgba(0,0,0,0.1); transition: all 0.3s ease; border: 1px solid var(--border-color); text-align: center; }
.feature-card:hover { transform: translateY(-3px); box-shadow: 0 3px 6px rgba(0,0,0,0.15); border-color: var(--primary-color); }
.feature-icon { font-size: 24px; margin-bottom: 10px; color: var(--primary-color); }
.feature-card h3 { color: var(--text-color); margin: 8px 0; font-size: 0.95em; font-weight: 600; }
.feature-card p { color: #666; font-size: 0.8em; line-height: 1.3; margin: 5px 0; }
.features-summary { margin: 40px 0; padding: 30px; background: white; border-radius: 12px; box-shadow: 0 2px 8px rgba(0,0,0,0.1); }
.features-summary h2 { color: var(--text-color); margin-bottom: 25px; text-align: center; font-size: 1.5em; }
.feature-list { display: grid; grid-template-columns: repeat(auto-fit, minmax(250px, 1fr)); gap: 30px; }
.feature-group { padding: 20px; background: var(--background-color); border-radius: 8px; border: 1px solid var(--border-color); }
.feature-group h3 { color: var(--primary-color); margin-bottom: 15px; font-size: 1.2em; }
.feature-group ul { list-style: none; padding: 0; margin: 0; }
.feature-group li { padding: 8px 0; color: var(--text-color); position: relative; padding-left: 20px; }
.feature-group li:before { content: "•"; color: var(--primary-color); position: absolute; left: 0; }
.description { margin: 20px 0; padding: 15px; border-radius: 8px; background-color: #ffffff; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }
.example { margin: 10px 0; padding: 15px; border-left: 4px solid var(--primary-color); background-color: #ffffff; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }
.warning { color: #721c24; background-color: #f8d7da; border: 1px solid #f5c6cb; padding: 15px; border-radius: 8px; margin: 10px 0; }
.settings { background-color: #ffffff; padding: 20px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); margin: 20px 0; }
.tab-content { padding: 20px; background-color: #ffffff; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }
.heading { color: var(--text-color); margin-bottom: 20px; padding-bottom: 10px; border-bottom: 2px solid var(--primary-color); }
button.primary { background-color: var(--primary-color) !important; }
button.secondary { background-color: var(--secondary-color) !important; }
"""


# Create the Gradio interface
def create_demo():
    demo = gr.Blocks(title="Advanced Text Processing with Qwen", css=custom_css, theme=gr.themes.Soft())

    with demo:
        with gr.Row():
            # Sidebar
            with gr.Column(scale=1, elem_classes="sidebar"):
                gr.Markdown("""
# Qwen Embeddings

### Navigation
- [Configuration](#configuration)
- [Features](#features)
- [Documentation](#documentation)
""")

                with gr.Accordion("Configuration", open=True):
                    gr.Markdown("""
### Model Settings
Configure the embedding model parameters below.
""")
                    model_choice = gr.Dropdown(
                        choices=list(AVAILABLE_MODELS.keys()),
                        value=list(AVAILABLE_MODELS.keys())[0],
                        label="Select Model"
                    )
                    embedding_dim = gr.Slider(
                        minimum=32, maximum=1024, value=768, step=32,
                        label="Embedding Dimension",
                        elem_classes="settings"
                    )
                    update_dim_btn = gr.Button("Update Dimension", variant="secondary")
                    dim_status = gr.Textbox(label="Status", interactive=False)

                with gr.Accordion("Documentation", open=False):
                    gr.Markdown("""
### Usage Guide

1. **Embedding Dimension**
   - 32-128: Fast, simple tasks
   - 256-512: Balanced performance
   - 768: Default, full model
   - 1024: Maximum detail

2. **Best Practices**
   - Use appropriate dimensions for your task
   - Consider batch size for multiple documents
   - Test different settings for optimal results
""")

            # Main Content
            with gr.Column(scale=4):
                gr.Markdown("""
# Advanced Text Processing Suite

Welcome to the Advanced Text Processing Suite powered by Qwen Embeddings.
This tool provides state-of-the-art text analysis capabilities with support for Arabic and multiple languages.
""")

                # Feature Grid
                gr.HTML("""
<div class="features-grid">
    <div class="feature-card">
        <div class="feature-icon">🔄</div>
        <h3>Text Similarity</h3>
        <p>Compare text meanings</p>
    </div>
    <div class="feature-card">
        <div class="feature-icon">🔍</div>
        <h3>Semantic Search</h3>
        <p>Find relevant docs</p>
    </div>
    <div class="feature-card">
        <div class="feature-icon">📊</div>
        <h3>Batch Analysis</h3>
        <p>Process multiple texts</p>
    </div>
    <div class="feature-card">
        <div class="feature-icon">🎯</div>
        <h3>Multi-Query</h3>
        <p>Advanced retrieval</p>
    </div>
    <div class="feature-card">
        <div class="feature-icon">🌐</div>
        <h3>Cross-Lingual</h3>
        <p>Cross-language match</p>
    </div>
    <div class="feature-card">
        <div class="feature-icon">🏷️</div>
        <h3>Classification</h3>
        <p>Categorize texts</p>
    </div>
    <div class="feature-card">
        <div class="feature-icon">🔮</div>
        <h3>Clustering</h3>
        <p>Group documents</p>
    </div>
    <div class="feature-card">
        <div class="feature-icon">😊</div>
        <h3>Sentiment</h3>
        <p>Analyze emotions</p>
    </div>
    <div class="feature-card">
        <div class="feature-icon">🎨</div>
        <h3>Concepts</h3>
        <p>Extract themes</p>
    </div>
</div>
""") with gr.Tabs() as tabs: # Text Similarity Tab with gr.Tab("Text Similarity Analysis"): with gr.Column(elem_classes="tab-content"): gr.Markdown(""" ### Text Similarity Analysis Compare the semantic similarity between two texts. The score ranges from 0 (completely different) to 1 (identical meaning).
Try these Arabic examples:
• "أحب القراءة كثيراً" and "القراءة من أحب هواياتي"
• "السماء صافية اليوم" and "الطقس حار جداً"
""") with gr.Row(): text1 = gr.Textbox( label="First Text", lines=3, placeholder="Enter first text here...", value="أحب القراءة كثيراً" ) text2 = gr.Textbox( label="Second Text", lines=3, placeholder="Enter second text here...", value="القراءة من أحب هواياتي" ) similarity_btn = gr.Button("Calculate Similarity", variant="primary") similarity_score = gr.Number(label="Similarity Score") similarity_btn.click( fn=lambda t1, t2, m, d: process_with_embedder('compute_similarity', t1, t2, m, d), inputs=[text1, text2, model_choice, embedding_dim], outputs=similarity_score ) # Document Reranking Tab with gr.Tab("Semantic Search & Reranking"): with gr.Column(elem_classes="tab-content"): gr.Markdown(""" ### Semantic Search & Document Reranking Search through a collection of documents and rank them by semantic relevance to your query.
Try these Arabic queries:
• "ما هي عواصم الدول العربية؟"
• "أين تقع أكبر المدن العربية؟"
• "ما هي المراكز الثقافية العربية؟"
""") query_text = gr.Textbox( label="Search Query", placeholder="Enter your search query...", value="ما هي عواصم الدول العربية؟" ) documents_text = gr.Textbox( label="Documents Collection (one per line)", lines=10, placeholder="Enter documents here, one per line...", value="""القاهرة هي عاصمة جمهورية مصر العربية وأكبر مدنها. الرياض هي عاصمة المملكة العربية السعودية ومركزها الاقتصادي. دمشق هي أقدم عاصمة مأهولة في التاريخ وهي عاصمة سوريا. بغداد عاصمة العراق وتقع على نهر دجلة. الدار البيضاء أكبر مدن المغرب وعاصمته الاقتصادية. تونس هي عاصمة الجمهورية التونسية ومركزها الثقافي.""" ) rerank_btn = gr.Button("Search & Rank", variant="primary") rerank_results = gr.Dataframe( headers=["Document", "Relevance Score"], label="Search Results" ) rerank_btn.click( fn=lambda q, d, m, e: process_with_embedder('rerank_documents', q, d, m, e), inputs=[query_text, documents_text, model_choice, embedding_dim], outputs=rerank_results ) # Batch Analysis Tab with gr.Tab("Batch Similarity Analysis"): with gr.Column(elem_classes="tab-content"): gr.Markdown(""" ### Batch Similarity Analysis Analyze semantic relationships between multiple texts simultaneously.
The example shows Arabic proverbs about friendship:
See how the model captures the semantic relationships between similar themes.
""") batch_texts = gr.Textbox( label="Input Texts (one per line)", lines=10, placeholder="Enter texts here, one per line...", value="""الصديق وقت الضيق. الصديق الحقيقي يظهر عند الشدائد. عند المحن تعرف إخوانك. وقت الشدة بتعرف صحابك. الصاحب ساحب.""" ) process_btn = gr.Button("Analyze Relationships", variant="primary") similarity_matrix = gr.Dataframe( label="Similarity Matrix", wrap=True ) process_btn.click( fn=lambda t, m, e: process_with_embedder('process_batch_embeddings', t, m, e), inputs=[batch_texts, model_choice, embedding_dim], outputs=[similarity_matrix] ) # Add new Retrieval Tab with gr.Tab("Multi-Query Retrieval"): with gr.Column(elem_classes="tab-content"): gr.Markdown(""" ### Multi-Query Document Retrieval Match multiple queries against multiple documents simultaneously using semantic search.
This tab implements the retrieval logic from the Qwen usage example, allowing you to:
- Define a custom task prompt
- Input multiple queries
- Input multiple documents
- See all query-document match scores in a matrix

Try these examples:
Task prompt: "Given a web search query, retrieve relevant passages that answer the query"
Queries:
• "ما هي أكبر المدن العربية؟"
• "أين تقع أهم المراكز الثقافية؟"
Documents: Use the example documents or add your own
""") task_prompt = gr.Textbox( label="Task Prompt", placeholder="Enter the task description here...", value="Given a web search query, retrieve relevant passages that answer the query", lines=2 ) with gr.Row(): queries_text = gr.Textbox( label="Queries (one per line)", placeholder="Enter your queries here, one per line...", value="""ما هي أكبر المدن العربية؟ أين تقع أهم المراكز الثقافية؟""", lines=5 ) documents_text = gr.Textbox( label="Documents (one per line)", placeholder="Enter your documents here, one per line...", value="""القاهرة هي أكبر مدينة عربية وعاصمة مصر، وتضم العديد من المعالم الثقافية والتاريخية. الرياض عاصمة المملكة العربية السعودية ومركز ثقافي واقتصادي مهم. دبي مدينة عالمية في الإمارات العربية المتحدة ومركز تجاري رئيسي. بيروت عاصمة لبنان ومركز ثقافي مهم في العالم العربي.""", lines=5 ) retrieve_btn = gr.Button("Process Retrieval", variant="primary") retrieval_matrix = gr.Dataframe( label="Query-Document Relevance Matrix", wrap=True ) gr.Markdown("""
How to read the results:
- Each row represents a query
- Each column represents a document
- Values show the relevance score (0-1) between each query-document pair
- Higher scores indicate better matches
""") retrieve_btn.click( fn=lambda p, q, d, m, e: process_with_embedder('process_retrieval', p, q, d, m, e), inputs=[task_prompt, queries_text, documents_text, model_choice, embedding_dim], outputs=[retrieval_matrix] ) # Add Cross-Lingual Tab after the Multi-Query Retrieval tab with gr.Tab("Cross-Lingual Matching"): with gr.Column(elem_classes="tab-content"): gr.Markdown(""" ### Cross-Lingual Semantic Matching Compare the meaning of texts across Arabic and English languages.
This feature demonstrates the model's ability to understand semantic similarity across different languages. Try comparing similar concepts expressed in Arabic and English to see how well the model captures cross-lingual meaning.
Try these examples:
Arabic: "القراءة غذاء العقل والروح"
English: "Reading nourishes the mind and soul"
Or try your own pairs of semantically similar texts in both languages.
""") with gr.Row(): arabic_text = gr.Textbox( label="Arabic Text", placeholder="Enter Arabic text here...", value="القراءة غذاء العقل والروح", lines=3 ) english_text = gr.Textbox( label="English Text", placeholder="Enter English text here...", value="Reading nourishes the mind and soul", lines=3 ) match_btn = gr.Button("Compare Texts", variant="primary") with gr.Row(): cross_lingual_score = gr.Number( label="Cross-Lingual Similarity Score", value=None ) gr.Markdown("""
Understanding the score:
- Score ranges from 0 (completely different meaning) to 1 (same meaning)
- Scores above 0.7 usually indicate strong semantic similarity
- The model considers the meaning, not just word-for-word translation
""") match_btn.click( fn=lambda a, e, m, d: process_with_embedder('process_cross_lingual', a, e, m, d), inputs=[arabic_text, english_text, model_choice, embedding_dim], outputs=[cross_lingual_score] ) # Add Text Classification Tab with gr.Tab("Text Classification"): with gr.Column(elem_classes="tab-content"): gr.Markdown(""" ### Text Classification Classify text into predefined categories using semantic similarity.
The model will compare your text against each category and rank them by relevance. You can define your own categories or use the provided examples.
""") input_text = gr.Textbox( label="Input Text", placeholder="Enter the text to classify...", value="الذكاء الاصطناعي يغير طريقة عملنا وتفكيرنا في المستقبل", lines=3 ) categories_text = gr.Textbox( label="Categories (one per line)", placeholder="Enter categories here...", value="""التكنولوجيا والابتكار الاقتصاد والأعمال التعليم والتدريب الثقافة والفنون الصحة والطب""", lines=5 ) classify_btn = gr.Button("Classify Text", variant="primary") classification_results = gr.Dataframe( headers=["Category", "Relevance Score"], label="Classification Results" ) classify_btn.click( fn=lambda t, c, m, e: process_with_embedder('classify_text', t, c, m, e), inputs=[input_text, categories_text, model_choice, embedding_dim], outputs=classification_results ) # Add Document Clustering Tab with gr.Tab("Document Clustering"): with gr.Column(elem_classes="tab-content"): gr.Markdown(""" ### Document Clustering Group similar documents together using semantic clustering.
This feature will:
- Group similar documents into clusters
- Identify the most representative document for each cluster
- Help discover themes and patterns in your document collection
""") cluster_docs = gr.Textbox( label="Documents (one per line)", placeholder="Enter documents to cluster...", value="""الذكاء الاصطناعي يفتح آفاقاً جديدة في مجال الطب. الروبوتات تساعد الأطباء في إجراء العمليات الجراحية. التعلم الآلي يحسن من دقة التشخيص الطبي. الفن يعبر عن مشاعر الإنسان وأحاسيسه. الموسيقى لغة عالمية تتخطى حدود الثقافات. الرسم والنحت من أقدم أشكال التعبير الفني. التجارة الإلكترونية تغير نمط التسوق التقليدي. التسوق عبر الإنترنت يوفر الوقت والجهد. المتاجر الرقمية تتيح خيارات أوسع للمستهلكين.""", lines=10 ) num_clusters = gr.Slider( minimum=2, maximum=10, value=3, step=1, label="Number of Clusters" ) cluster_btn = gr.Button("Cluster Documents", variant="primary") clustering_results = gr.Dataframe( label="Clustering Results" ) cluster_btn.click( fn=lambda d, n, m, e: process_with_embedder('cluster_documents', d, n, m, e), inputs=[cluster_docs, num_clusters, model_choice, embedding_dim], outputs=clustering_results ) # Add Sentiment Analysis Tab with gr.Tab("Sentiment Analysis"): with gr.Column(elem_classes="tab-content"): gr.Markdown(""" ### Arabic Sentiment Analysis Analyze the sentiment of Arabic text using semantic similarity to sentiment anchors.
The model will compare your text against predefined sentiment anchors and determine:
- The overall sentiment
- Confidence scores for each sentiment level
""") sentiment_text = gr.Textbox( label="Text to Analyze", placeholder="Enter text to analyze sentiment...", value="هذا المشروع رائع جداً وسيحدث تغييراً إيجابياً في حياة الكثيرين", lines=3 ) analyze_btn = gr.Button("Analyze Sentiment", variant="primary") with gr.Row(): sentiment_label = gr.Label(label="Overall Sentiment") sentiment_scores = gr.Json(label="Detailed Scores") analyze_btn.click( fn=lambda t, m, e: process_with_embedder('analyze_sentiment', t, m, e), inputs=[sentiment_text, model_choice, embedding_dim], outputs=[sentiment_label, sentiment_scores] ) # Add Concept Extraction Tab with gr.Tab("Concept Extraction"): with gr.Column(elem_classes="tab-content"): gr.Markdown(""" ### Concept Extraction Extract key concepts and themes from Arabic text.
Analyze text to identify:
- Emotional content
- Main topics
- Underlying themes
""") concept_text = gr.Textbox( label="Text to Analyze", placeholder="Enter text to analyze...", value="نحن نؤمن بأهمية التعليم والابتكار لبناء مستقبل أفضل لأجيالنا القادمة", lines=3 ) concept_type = gr.Radio( choices=["emotions", "topics", "themes"], value="themes", label="Concept Type" ) extract_btn = gr.Button("Extract Concepts", variant="primary") concept_results = gr.Dataframe( headers=["Concept", "Relevance Score"], label="Extracted Concepts" ) extract_btn.click( fn=lambda t, c, m, e: process_with_embedder('extract_concepts', t, c, m, e), inputs=[concept_text, concept_type, model_choice, embedding_dim], outputs=concept_results ) # Update dimension handler @spaces.GPU(duration=120) def update_embedder_dim(dim): global embedder try: embedder = initialize_embedder(embedding_dim=dim) return f"Successfully updated embedding dimension to {dim}" except Exception as e: return f"Error updating dimension: {str(e)}" update_dim_btn.click( fn=update_embedder_dim, inputs=[embedding_dim], outputs=dim_status ) return demo if __name__ == "__main__": demo = create_demo() demo.queue() demo.launch()