import gradio as gr import torch import torch.nn.functional as F import numpy as np import plotly.express as px import pandas as pd import spaces from typing import List, Tuple, Dict from torch import Tensor from transformers import AutoTokenizer, AutoModel from sentence_transformers import SentenceTransformer import json # Initialize the embedder at module level embedder = None AVAILABLE_MODELS = { "Qwen3-Embedding-0.6B": "Qwen/Qwen3-Embedding-0.6B", "Semantic-Ar-Qwen-Embed-0.6B": "Omartificial-Intelligence-Space/Semantic-Ar-Qwen-Embed-0.6B" } class QwenEmbedder: def __init__(self, model_name: str = "Qwen/Qwen3-Embedding-0.6B", embedding_dim: int = 768): self.model = SentenceTransformer(model_name) self.embedding_dim = embedding_dim self.device = "cuda" if torch.cuda.is_available() else "cpu" self.model.to(self.device) if embedding_dim != 768: # Add projection layer if needed self.projection = torch.nn.Linear(768, embedding_dim) self.projection.to(self.device) else: self.projection = None def get_embeddings(self, texts: List[str], with_instruction: bool = False) -> torch.Tensor: if with_instruction: texts = [f"Represent this Arabic text for retrieval: {text}" for text in texts] embeddings = self.model.encode(texts, convert_to_tensor=True) if self.projection is not None: embeddings = self.projection(embeddings) # Normalize embeddings embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1) return embeddings @spaces.GPU(duration=120) def initialize_embedder(embedding_dim=768): # Initialize device inside the GPU worker device = "cuda" if torch.cuda.is_available() else "cpu" print(f"Initializing embedder on device: {device}") # Create model with specified dimension model = QwenEmbedder(embedding_dim=embedding_dim) return model @spaces.GPU(duration=120) def process_with_embedder(fn_name, *args): """Generic handler for embedder operations""" global embedder if embedder is None: embedder = initialize_embedder() # Map function names to actual functions fn_map = { 'compute_similarity': compute_similarity, 'rerank_documents': rerank_documents, 'process_batch_embeddings': process_batch_embeddings, 'process_retrieval': process_retrieval, 'process_cross_lingual': process_cross_lingual, 'classify_text': classify_text, 'cluster_documents': cluster_documents, 'analyze_sentiment': analyze_sentiment, 'extract_concepts': extract_concepts } return fn_map[fn_name](embedder, *args) # Check for GPU support and configure appropriately device = "cuda" if torch.cuda.is_available() else "cpu" zero = torch.Tensor([0]).to(device) print(f"Device being used: {zero.device}") def last_token_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor: left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0]) if left_padding: return last_hidden_states[:, -1] else: sequence_lengths = attention_mask.sum(dim=1) - 1 batch_size = last_hidden_states.shape[0] return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths] def get_detailed_instruct(task_description: str, query: str) -> str: return f'Instruct: {task_description}\nQuery: {query}' def tokenize(tokenizer, input_texts, eod_id, max_length): batch_dict = tokenizer(input_texts, padding=False, truncation=True, max_length=max_length-2) for seq, att in zip(batch_dict["input_ids"], batch_dict["attention_mask"]): seq.append(eod_id) att.append(1) batch_dict = tokenizer.pad(batch_dict, padding=True, return_tensors="pt") return batch_dict def compute_similarity(embedder: QwenEmbedder, text1: str, text2: str, model_choice: str = None, embedding_dim: int = None) -> float: embeddings = embedder.get_embeddings([text1, text2]) similarity = torch.cosine_similarity(embeddings[0:1], embeddings[1:2]).item() return round(similarity, 3) def rerank_documents(embedder: QwenEmbedder, query: str, documents: str, model_choice: str = None, embedding_dim: int = None) -> List[Tuple[str, float]]: docs_list = [doc.strip() for doc in documents.split('\n') if doc.strip()] # Add instruction to query task = 'Given a search query, retrieve relevant passages that answer the query' query_with_instruct = get_detailed_instruct(task, query) # Get embeddings query_embedding = embedder.get_embeddings([query_with_instruct]) doc_embeddings = embedder.get_embeddings(docs_list) # Calculate similarities scores = (query_embedding @ doc_embeddings.T).squeeze(0) results = [(doc, float(score)) for doc, score in zip(docs_list, scores)] results.sort(key=lambda x: x[1], reverse=True) return [(doc, round(score, 3)) for doc, score in results] def process_batch_embeddings(embedder: QwenEmbedder, texts: str, model_choice: str = None, embedding_dim: int = None) -> pd.DataFrame: text_list = [text.strip() for text in texts.split('\n') if text.strip()] if len(text_list) < 1: return pd.DataFrame() embeddings = embedder.get_embeddings(text_list) scores = (embeddings @ embeddings.T).cpu().numpy() # Create similarity matrix DataFrame df_similarities = pd.DataFrame( scores, index=text_list, columns=text_list ) return df_similarities.round(3) def process_retrieval(embedder: QwenEmbedder, task_prompt: str, queries: str, documents: str, model_choice: str = None, embedding_dim: int = None) -> pd.DataFrame: # Process queries and documents query_list = [q.strip() for q in queries.split('\n') if q.strip()] doc_list = [d.strip() for d in documents.split('\n') if d.strip()] if not query_list or not doc_list: return pd.DataFrame() # Add instruction to queries instructed_queries = [get_detailed_instruct(task_prompt, q) for q in query_list] # Get embeddings for both queries and documents query_embeddings = embedder.get_embeddings(instructed_queries) doc_embeddings = embedder.get_embeddings(doc_list) # Calculate similarity scores scores = (query_embeddings @ doc_embeddings.T).cpu().numpy() # Create DataFrame with results df = pd.DataFrame(scores, index=query_list, columns=doc_list) return df.round(3) def process_cross_lingual(embedder: QwenEmbedder, arabic_text: str, english_text: str, model_choice: str = None, embedding_dim: int = None) -> dict: texts = [arabic_text, english_text] embeddings = embedder.get_embeddings(texts) similarity = torch.cosine_similarity(embeddings[0:1], embeddings[1:2]).item() return {"similarity": round(similarity, 3)} def classify_text(embedder: QwenEmbedder, text: str, categories: str, model_choice: str = None, embedding_dim: int = None) -> List[Tuple[str, float]]: cat_list = [c.strip() for c in categories.split('\n') if c.strip()] text_embedding = embedder.get_embeddings([text]) cat_embeddings = embedder.get_embeddings(cat_list) scores = (text_embedding @ cat_embeddings.T).squeeze(0) results = [(cat, float(score)) for cat, score in zip(cat_list, scores)] results.sort(key=lambda x: x[1], reverse=True) return [(cat, round(score, 3)) for cat, score in results] def cluster_documents(embedder: QwenEmbedder, documents: str, num_clusters: int, model_choice: str = None, embedding_dim: int = None) -> pd.DataFrame: from sklearn.cluster import KMeans doc_list = [doc.strip() for doc in documents.split('\n') if doc.strip()] if len(doc_list) < num_clusters: return pd.DataFrame() embeddings = embedder.get_embeddings(doc_list) # Perform clustering kmeans = KMeans(n_clusters=num_clusters, random_state=42) clusters = kmeans.fit_predict(embeddings.cpu().numpy()) # Calculate center document for each cluster cluster_centers = kmeans.cluster_centers_ cluster_center_docs = [] for i in range(num_clusters): cluster_docs = [doc for doc, cluster in zip(doc_list, clusters) if cluster == i] cluster_embeddings = embedder.get_embeddings(cluster_docs) center_embedding = torch.tensor(cluster_centers[i]).unsqueeze(0) similarities = F.cosine_similarity(cluster_embeddings, center_embedding) center_doc = cluster_docs[similarities.argmax().item()] cluster_center_docs.append(center_doc) # Create results DataFrame df = pd.DataFrame({ 'Document': doc_list, 'Cluster': clusters, 'Cluster Center Document': [cluster_center_docs[c] for c in clusters] }) return df.sort_values('Cluster') def analyze_sentiment(embedder: QwenEmbedder, text: str, model_choice: str = None, embedding_dim: int = None) -> Tuple[str, dict]: # Define sentiment anchors anchors = { "very_positive": "هذا رائع جداً ومدهش! أنا سعيد للغاية", "positive": "هذا جيد وممتع", "neutral": "هذا عادي ومقبول", "negative": "هذا سيء ومزعج", "very_negative": "هذا فظيع جداً ومحبط للغاية" } # Get embeddings text_embedding = embedder.get_embeddings([text]) anchor_embeddings = embedder.get_embeddings(list(anchors.values())) # Calculate similarities scores = (text_embedding @ anchor_embeddings.T).squeeze(0) results = list(zip(anchors.keys(), scores.tolist())) results.sort(key=lambda x: x[1], reverse=True) # Return tuple of (sentiment, scores_dict) return ( results[0][0], {k: round(float(v), 3) for k, v in results} ) def extract_concepts(embedder: QwenEmbedder, text: str, concept_type: str, model_choice: str = None, embedding_dim: int = None) -> List[Tuple[str, float]]: # Define concept anchors based on type concept_anchors = { "emotions": [ "الفرح والسعادة", "الحزن والأسى", "الغضب والإحباط", "الخوف والقلق", "الحب والعاطفة", "الأمل والتفاؤل" ], "topics": [ "السياسة والحكم", "الاقتصاد والمال", "العلوم والتكنولوجيا", "الفن والثقافة", "الرياضة والترفيه", "التعليم والمعرفة" ], "themes": [ "العدالة والمساواة", "التقدم والتطور", "التقاليد والتراث", "الحرية والاستقلال", "التعاون والوحدة", "الإبداع والابتكار" ] } anchors = concept_anchors.get(concept_type, concept_anchors["topics"]) # Get embeddings text_embedding = embedder.get_embeddings([text]) anchor_embeddings = embedder.get_embeddings(anchors) # Calculate similarities scores = (text_embedding @ anchor_embeddings.T).squeeze(0) results = [(anchor, float(score)) for anchor, score in zip(anchors, scores)] results.sort(key=lambda x: x[1], reverse=True) return [(concept, round(score, 3)) for concept, score in results] def create_embedder(model_choice: str, embedding_dim: int = 768) -> QwenEmbedder: model_name = AVAILABLE_MODELS[model_choice] return QwenEmbedder(model_name=model_name, embedding_dim=embedding_dim) def process_similarity(text1: str, text2: str, model_choice: str, embedding_dim: int) -> float: embedder = create_embedder(model_choice, embedding_dim) embeddings = embedder.get_embeddings([text1, text2]) similarity = torch.nn.functional.cosine_similarity(embeddings[0].unsqueeze(0), embeddings[1].unsqueeze(0)) return float(similarity) def process_reranking(query: str, documents: str, model_choice: str, embedding_dim: int) -> Dict: embedder = create_embedder(model_choice, embedding_dim) documents = [doc.strip() for doc in documents.split('\n') if doc.strip()] query_embedding = embedder.get_embeddings([query], with_instruction=True) doc_embeddings = embedder.get_embeddings(documents) similarities = torch.nn.functional.cosine_similarity(query_embedding, doc_embeddings) # Sort documents by similarity sorted_indices = torch.argsort(similarities, descending=True) results = [] for idx in sorted_indices: results.append({ 'document': documents[idx], 'score': float(similarities[idx]) }) return {'results': results} def process_batch(texts: str, model_choice: str, embedding_dim: int) -> Dict: embedder = create_embedder(model_choice, embedding_dim) texts = [text.strip() for text in texts.split('\n') if text.strip()] embeddings = embedder.get_embeddings(texts) similarity_matrix = torch.nn.functional.cosine_similarity(embeddings.unsqueeze(1), embeddings.unsqueeze(0), dim=2) df = pd.DataFrame(similarity_matrix.cpu().numpy(), index=texts, columns=texts) return {'similarity_matrix': df.to_dict()} def process_retrieval(prompt: str, queries: str, documents: str, model_choice: str, embedding_dim: int) -> Dict: embedder = create_embedder(model_choice, embedding_dim) # Process input strings queries = [q.strip() for q in queries.split('\n') if q.strip()] documents = [doc.strip() for doc in documents.split('\n') if doc.strip()] # Get embeddings prompt_embedding = embedder.get_embeddings([prompt], with_instruction=True) query_embeddings = embedder.get_embeddings(queries, with_instruction=True) doc_embeddings = embedder.get_embeddings(documents) # Calculate similarities query_similarities = torch.nn.functional.cosine_similarity(prompt_embedding, query_embeddings) doc_similarities = torch.nn.functional.cosine_similarity(prompt_embedding.repeat(len(documents), 1), doc_embeddings) # Process results results = { 'relevant_queries': [], 'relevant_documents': [] } # Sort queries query_indices = torch.argsort(query_similarities, descending=True) for idx in query_indices: results['relevant_queries'].append({ 'query': queries[idx], 'similarity': float(query_similarities[idx]) }) # Sort documents doc_indices = torch.argsort(doc_similarities, descending=True) for idx in doc_indices: results['relevant_documents'].append({ 'document': documents[idx], 'similarity': float(doc_similarities[idx]) }) return results # Update the CSS to improve feature visibility custom_css = """ :root { --primary-color: #2196F3; --secondary-color: #1976D2; --background-color: #f8f9fa; --sidebar-bg: #ffffff; --text-color: #333333; --border-color: #e0e0e0; } .container { max-width: 1200px; margin: auto; padding: 20px; } .sidebar { background-color: var(--sidebar-bg); border-right: 1px solid var(--border-color); padding: 20px; margin-right: 20px; position: sticky; top: 0; height: 100vh; overflow-y: auto; } .main-content { background-color: var(--background-color); padding: 20px; border-radius: 10px; } .features-grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(150px, 1fr)); gap: 15px; margin: 15px 0; } .feature-card { background: white; padding: 15px; border-radius: 6px; box-shadow: 0 1px 3px rgba(0,0,0,0.1); transition: all 0.3s ease; border: 1px solid var(--border-color); text-align: center; } .feature-card:hover { transform: translateY(-3px); box-shadow: 0 3px 6px rgba(0,0,0,0.15); border-color: var(--primary-color); } .feature-icon { font-size: 24px; margin-bottom: 10px; color: var(--primary-color); } .feature-card h3 { color: var(--text-color); margin: 8px 0; font-size: 0.95em; font-weight: 600; } .feature-card p { color: #666; font-size: 0.8em; line-height: 1.3; margin: 5px 0; } .features-summary { margin: 40px 0; padding: 30px; background: white; border-radius: 12px; box-shadow: 0 2px 8px rgba(0,0,0,0.1); } .features-summary h2 { color: var(--text-color); margin-bottom: 25px; text-align: center; font-size: 1.5em; } .feature-list { display: grid; grid-template-columns: repeat(auto-fit, minmax(250px, 1fr)); gap: 30px; } .feature-group { padding: 20px; background: var(--background-color); border-radius: 8px; border: 1px solid var(--border-color); } .feature-group h3 { color: var(--primary-color); margin-bottom: 15px; font-size: 1.2em; } .feature-group ul { list-style: none; padding: 0; margin: 0; } .feature-group li { padding: 8px 0; color: var(--text-color); position: relative; padding-left: 20px; } .feature-group li:before { content: "•"; color: var(--primary-color); position: absolute; left: 0; } .description { margin: 20px 0; padding: 15px; border-radius: 8px; background-color: #ffffff; box-shadow: 0 2px 4px rgba(0,0,0,0.1); } .example { margin: 10px 0; padding: 15px; border-left: 4px solid var(--primary-color); background-color: #ffffff; box-shadow: 0 2px 4px rgba(0,0,0,0.1); } .warning { color: #721c24; background-color: #f8d7da; border: 1px solid #f5c6cb; padding: 15px; border-radius: 8px; margin: 10px 0; } .settings { background-color: #ffffff; padding: 20px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); margin: 20px 0; } .tab-content { padding: 20px; background-color: #ffffff; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); } .heading { color: var(--text-color); margin-bottom: 20px; padding-bottom: 10px; border-bottom: 2px solid var(--primary-color); } button.primary { background-color: var(--primary-color) !important; } button.secondary { background-color: var(--secondary-color) !important; } """ # Create the Gradio interface def create_demo(): demo = gr.Blocks(title="Advanced Text Processing with Qwen", css=custom_css, theme=gr.themes.Soft()) with demo: with gr.Row(): # Sidebar with gr.Column(scale=1, elem_classes="sidebar"): gr.Markdown(""" # Qwen Embeddings ### Navigation - [Configuration](#configuration) - [Features](#features) - [Documentation](#documentation) """) with gr.Accordion("Configuration", open=True): gr.Markdown(""" ### Model Settings Configure the embedding model parameters below. """) model_choice = gr.Dropdown( choices=list(AVAILABLE_MODELS.keys()), value=list(AVAILABLE_MODELS.keys())[0], label="Select Model" ) embedding_dim = gr.Slider( minimum=32, maximum=1024, value=768, step=32, label="Embedding Dimension", elem_classes="settings" ) update_dim_btn = gr.Button("Update Dimension", variant="secondary") dim_status = gr.Textbox(label="Status", interactive=False) with gr.Accordion("Documentation", open=False): gr.Markdown(""" ### Usage Guide 1. **Embedding Dimension** - 32-128: Fast, simple tasks - 256-512: Balanced performance - 768: Default, full model - 1024: Maximum detail 2. **Best Practices** - Use appropriate dimensions for your task - Consider batch size for multiple documents - Test different settings for optimal results """) # Main Content with gr.Column(scale=4): gr.Markdown(""" # Advanced Text Processing Suite Welcome to the Advanced Text Processing Suite powered by Qwen Embeddings. This tool provides state-of-the-art text analysis capabilities with support for Arabic and multiple languages. """) # Feature Grid gr.HTML("""
Compare text meanings
Find relevant docs
Process multiple texts
Advanced retrieval
Cross-language match
Categorize texts
Group documents
Analyze emotions
Extract themes