# src/semantic_retriever.py
from typing import List, Dict, Any, Tuple, Optional
import numpy as np
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.schema import Document
from src.ontology_manager import OntologyManager
class SemanticRetriever:
    """
    Enhanced retrieval system that combines vector search with ontology awareness.

    Builds a FAISS vector store over the ontology's text representation (plus
    optional caller-supplied text chunks) and augments plain similarity search
    with semantic context and entity paths obtained from the ontology manager.
    """

    def __init__(
        self,
        ontology_manager: OntologyManager,
        embeddings_model=None,
        text_chunks: Optional[List[str]] = None
    ):
        """
        Initialize the semantic retriever.

        Args:
            ontology_manager: The ontology manager instance
            embeddings_model: The embeddings model to use (defaults to OpenAIEmbeddings)
            text_chunks: Optional list of text chunks to add to the vector store
        """
        self.ontology_manager = ontology_manager
        self.embeddings = embeddings_model or OpenAIEmbeddings()

        # Index the ontology's textual representation.
        ontology_text = ontology_manager.get_text_representation()
        self.ontology_chunks = self._split_text(ontology_text)

        # Extra chunks go *after* the ontology chunks so provenance can be
        # derived from the index when tagging metadata below.
        self.text_chunks = text_chunks or []
        all_chunks = self.ontology_chunks + self.text_chunks

        # Wrap each chunk in a Document, tagging where it came from.
        n_ontology = len(self.ontology_chunks)
        documents = [
            Document(
                page_content=chunk,
                metadata={"source": "ontology" if i < n_ontology else "text"},
            )
            for i, chunk in enumerate(all_chunks)
        ]

        self.vector_store = FAISS.from_documents(documents, self.embeddings)

    def _split_text(self, text: str, chunk_size: int = 500, overlap: int = 50) -> List[str]:
        """Split text into overlapping chunks for embedding.

        Args:
            text: The text to split.
            chunk_size: Maximum characters per chunk.
            overlap: Characters shared by consecutive chunks; must be smaller
                than chunk_size.

        Returns:
            The list of chunks. Fragments shorter than 50 characters are
            dropped (too little signal to embed), so very short inputs may
            yield an empty list.

        Raises:
            ValueError: If overlap >= chunk_size (the stride would be <= 0;
                the original code silently returned [] or raised an opaque
                range() error in that case).
        """
        if overlap >= chunk_size:
            raise ValueError("overlap must be smaller than chunk_size")
        stride = chunk_size - overlap
        chunks = []
        for start in range(0, len(text), stride):
            chunk = text[start:start + chunk_size]
            if len(chunk) >= 50:  # Skip very small chunks
                chunks.append(chunk)
        return chunks

    def retrieve(self, query: str, k: int = 4, include_ontology_context: bool = True) -> List[Document]:
        """
        Retrieve relevant documents using a hybrid approach.

        Args:
            query: The query string
            k: Number of documents to retrieve
            include_ontology_context: Whether to include additional ontology context

        Returns:
            A list of retrieved documents (vector hits first, then any
            ontology-context documents).
        """
        # Optional semantic context from the ontology.
        ontology_context = (
            self.ontology_manager.get_semantic_context(query)
            if include_ontology_context
            else []
        )

        # Vector similarity search. Copy the result list: the original code
        # aliased it and then appended, mutating the list the vector store
        # handed back as a side effect.
        combined_results = list(self.vector_store.similarity_search(query, k=k))

        # Append ontology context as additional documents.
        for i, context in enumerate(ontology_context):
            combined_results.append(Document(
                page_content=context,
                metadata={"source": "ontology_context", "context_id": i},
            ))

        return combined_results

    def retrieve_with_paths(self, query: str, k: int = 4) -> Dict[str, Any]:
        """
        Enhanced retrieval that includes semantic paths between entities.

        Args:
            query: The query string
            k: Number of documents to retrieve

        Returns:
            A dictionary with keys:
                "documents": the basic retrieval results plus one document per
                    discovered semantic path;
                "paths": the raw path records (source, target, path, text).
        """
        # Basic retrieval first.
        basic_results = self.retrieve(query, k)

        # Extract potential entities from the query (simplified approach).
        # A more sophisticated approach would use NER or entity linking.
        entity_types = ["Product", "Department", "Employee", "Manager", "Customer", "Feedback"]
        query_words = query.lower().split()

        potential_entities = []
        for entity_type in entity_types:
            # Exact word match against the lowercased type name only.
            if entity_type.lower() in query_words:
                instances = self.ontology_manager.get_instances_of_class(entity_type)
                if instances:
                    # Just take the first few for demonstration.
                    potential_entities.extend(instances[:2])

        # Find paths between every unordered pair of candidate entities.
        paths = []
        if len(potential_entities) >= 2:
            for i in range(len(potential_entities)):
                for j in range(i + 1, len(potential_entities)):
                    source = potential_entities[i]
                    target = potential_entities[j]
                    entity_paths = self.ontology_manager.find_paths(source, target, max_length=3)
                    for path in entity_paths or []:
                        paths.append({
                            "source": source,
                            "target": target,
                            "path": path,
                            "text": self._path_to_text(path),
                        })

        # Surface each path as a retrievable document.
        path_documents = [
            Document(
                page_content=path_info["text"],
                metadata={
                    "source": "semantic_path",
                    "path_id": i,
                    "source_entity": path_info["source"],
                    "target_entity": path_info["target"],
                },
            )
            for i, path_info in enumerate(paths)
        ]

        return {
            "documents": basic_results + path_documents,
            "paths": paths,
        }

    def _path_to_text(self, path: List[Dict]) -> str:
        """Convert a relationship path to a human-readable description.

        Each edge dict must carry "source", "target" and "type" keys. Entity
        ids are replaced by their "name" property when the ontology provides
        one. Returns "" for an empty path.
        """
        if not path:
            return ""

        text_parts = []
        for edge in path:
            source = edge["source"]
            target = edge["target"]
            relation = edge["type"]

            # Prefer the entity's display name over its raw id when available.
            source_info = self.ontology_manager.get_entity_info(source)
            target_info = self.ontology_manager.get_entity_info(target)
            source_name = source_info.get("properties", {}).get("name", source)
            target_name = target_info.get("properties", {}).get("name", target)

            text_parts.append(f"{source_name} {relation} {target_name}")

        return " -> ".join(text_parts)

    def search_by_property(self, class_type: str, property_name: str, property_value: str) -> List[Document]:
        """
        Search for instances of a class with a specific property value.

        Matching is case-insensitive exact string comparison (could be
        enhanced with fuzzy matching).

        Args:
            class_type: The class to search in
            property_name: The property name to match
            property_value: The property value to match

        Returns:
            A list of matched entities as documents
        """
        wanted = property_value.lower()
        results = []

        for instance_id in self.ontology_manager.get_instances_of_class(class_type):
            entity_info = self.ontology_manager.get_entity_info(instance_id)
            properties = entity_info.get("properties", {})
            if property_name not in properties:
                continue
            if str(properties[property_name]).lower() != wanted:
                continue

            # Render the matched instance as a plain-text document.
            lines = [
                f"Instance: {instance_id}",
                f"Type: {class_type}",
                "Properties:",
            ]
            lines.extend(f"- {prop_name}: {prop_value}" for prop_name, prop_value in properties.items())

            results.append(Document(
                page_content="\n".join(lines) + "\n",
                metadata={
                    "source": "property_search",
                    "instance_id": instance_id,
                    "class_type": class_type,
                },
            ))

        return results