# src/semantic_retriever.py
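"""Ontology-aware semantic retrieval combining FAISS vector search with
context drawn from an OntologyManager."""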

from typing import Any, Dict, List, Optional
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.schema import Document
from src.ontology_manager import OntologyManager

class SemanticRetriever:
    """

    Enhanced retrieval system that combines vector search with ontology awareness.

    """
    
    def __init__(
        self,
        ontology_manager: OntologyManager,
        embeddings_model=None,
        text_chunks: Optional[List[str]] = None,
    ):
        """

        Initialize the semantic retriever.

        

        Args:

            ontology_manager: The ontology manager instance

            embeddings_model: The embeddings model to use (defaults to OpenAIEmbeddings)

            text_chunks: Optional list of text chunks to add to the vector store

        """
        self.ontology_manager = ontology_manager
        self.embeddings = embeddings_model or OpenAIEmbeddings()
        
        # Create a vector store with the text representation of the ontology
        ontology_text = ontology_manager.get_text_representation()
        self.ontology_chunks = self._split_text(ontology_text)
        
        # Add additional text chunks if provided
        if text_chunks:
            self.text_chunks = text_chunks
            all_chunks = self.ontology_chunks + text_chunks
        else:
            self.text_chunks = []
            all_chunks = self.ontology_chunks
        
        # Convert to Document objects for FAISS
        documents = [
            Document(
                page_content=chunk,
                metadata={"source": "ontology" if i < len(self.ontology_chunks) else "text"},
            )
            for i, chunk in enumerate(all_chunks)
        ]
        
        # Create the vector store
        self.vector_store = FAISS.from_documents(documents, self.embeddings)
    
    def _split_text(self, text: str, chunk_size: int = 500, overlap: int = 50) -> List[str]:
        """Split text into chunks for embedding."""
        chunks = []
        text_length = len(text)
        
        # Step by (chunk_size - overlap) so consecutive chunks share overlapping context
        for i in range(0, text_length, chunk_size - overlap):
            chunk = text[i:i + chunk_size]
            if len(chunk) < 50:  # Skip very small chunks, including short trailing fragments
                continue
            chunks.append(chunk)
            
        return chunks
    
    def retrieve(self, query: str, k: int = 4, include_ontology_context: bool = True) -> List[Document]:
        """

        Retrieve relevant documents using a hybrid approach.

        

        Args:

            query: The query string

            k: Number of documents to retrieve

            include_ontology_context: Whether to include additional ontology context

            

        Returns:

            A list of retrieved documents

        """
        # Get semantic context from the ontology
        if include_ontology_context:
            ontology_context = self.ontology_manager.get_semantic_context(query)
        else:
            ontology_context = []
        
        # Perform vector similarity search
        vector_results = self.vector_store.similarity_search(query, k=k)
        
        # Combine results (copy the list so the vector search results are not mutated in place)
        combined_results = list(vector_results)
        
        # Add ontology context as additional documents
        for i, context in enumerate(ontology_context):
            combined_results.append(Document(
                page_content=context,
                metadata={"source": "ontology_context", "context_id": i}
            ))
        
        return combined_results
    
    def retrieve_with_paths(self, query: str, k: int = 4) -> Dict[str, Any]:
        """

        Enhanced retrieval that includes semantic paths between entities.

        

        Args:

            query: The query string

            k: Number of documents to retrieve

            

        Returns:

            A dictionary containing retrieved documents and semantic paths

        """
        # Basic retrieval
        basic_results = self.retrieve(query, k)
        
        # Extract potential entities from the query (simplified approach)
        # A more sophisticated approach would use NER or entity linking
        entity_types = ["Product", "Department", "Employee", "Manager", "Customer", "Feedback"]
        query_words = query.lower().split()
        
        potential_entities = []
        for entity_type in entity_types:
            if entity_type.lower() in query_words:
                # Get instances of this type
                instances = self.ontology_manager.get_instances_of_class(entity_type)
                if instances:
                    # Just take the first few for demonstration
                    potential_entities.extend(instances[:2])
        
        # Find paths between potential entities
        paths = []
        if len(potential_entities) >= 2:
            for i in range(len(potential_entities)):
                for j in range(i+1, len(potential_entities)):
                    source = potential_entities[i]
                    target = potential_entities[j]
                    
                    # Find paths between these entities
                    entity_paths = self.ontology_manager.find_paths(source, target, max_length=3)
                    
                    if entity_paths:
                        for path in entity_paths:
                            # Convert path to text
                            path_text = self._path_to_text(path)
                            paths.append({
                                "source": source,
                                "target": target,
                                "path": path,
                                "text": path_text
                            })
        
        # Convert paths to documents
        path_documents = []
        for i, path_info in enumerate(paths):
            path_documents.append(Document(
                page_content=path_info["text"],
                metadata={
                    "source": "semantic_path",
                    "path_id": i,
                    "source_entity": path_info["source"],
                    "target_entity": path_info["target"]
                }
            ))
        
        return {
            "documents": basic_results + path_documents,
            "paths": paths
        }
    
    def _path_to_text(self, path: List[Dict]) -> str:
        """Convert a path to a text description."""
        if not path:
            return ""
        
        text_parts = []
        for edge in path:
            source = edge["source"]
            target = edge["target"]
            relation = edge["type"]
            
            # Get entity information
            source_info = self.ontology_manager.get_entity_info(source)
            target_info = self.ontology_manager.get_entity_info(target)
            
            # Use human-readable names when available, falling back to entity IDs
            source_name = source_info.get("properties", {}).get("name", source)
            target_name = target_info.get("properties", {}).get("name", target)
            
            # Describe the relationship
            text_parts.append(f"{source_name} {relation} {target_name}")
        
        return " -> ".join(text_parts)
    
    def search_by_property(self, class_type: str, property_name: str, property_value: str) -> List[Document]:
        """

        Search for instances of a class with a specific property value.

        

        Args:

            class_type: The class to search in

            property_name: The property name to match

            property_value: The property value to match

            

        Returns:

            A list of matched entities as documents

        """
        instances = self.ontology_manager.get_instances_of_class(class_type)
        
        results = []
        for instance_id in instances:
            entity_info = self.ontology_manager.get_entity_info(instance_id)
            properties = entity_info.get("properties", {})
            if property_name not in properties:
                continue

            # Simple string matching (could be enhanced with fuzzy matching)
            if str(properties[property_name]).lower() != property_value.lower():
                continue

            # Convert the matching instance to a document
            doc_content = f"Instance: {instance_id}\n"
            doc_content += f"Type: {class_type}\n"
            doc_content += "Properties:\n"

            for prop_name, prop_value in properties.items():
                doc_content += f"- {prop_name}: {prop_value}\n"

            results.append(Document(
                page_content=doc_content,
                metadata={
                    "source": "property_search",
                    "instance_id": instance_id,
                    "class_type": class_type
                }
            ))
        
        return results
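
# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only). It assumes a hypothetical
# OntologyManager constructed from an "ontology.json" file and a valid
# OPENAI_API_KEY in the environment for the default OpenAIEmbeddings;
# adjust both to match your actual OntologyManager setup.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Hypothetical ontology file path; OntologyManager's constructor
    # signature may differ in your codebase.
    manager = OntologyManager("ontology.json")
    retriever = SemanticRetriever(manager)

    # Hybrid retrieval: vector hits plus ontology context documents
    for doc in retriever.retrieve("Which department manages the Widget product?", k=4):
        print(doc.metadata["source"], "->", doc.page_content[:80])

    # Path-aware retrieval also surfaces semantic paths between query entities
    result = retriever.retrieve_with_paths("How is the Manager related to the Product?")
    for path_info in result["paths"]:
        print(path_info["text"])

    # Exact-match property search over ontology instances
    for doc in retriever.search_by_property("Employee", "name", "alice"):
        print(doc.page_content)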