# src/semantic_retriever.py
from typing import List, Dict, Any, Tuple, Optional
import numpy as np
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.schema import Document
from src.ontology_manager import OntologyManager
class SemanticRetriever:
    """
    Enhanced retrieval system that combines vector search with ontology awareness.

    Builds a FAISS vector store over the ontology's text representation (plus
    optional caller-supplied text chunks) and augments plain similarity search
    with semantic context and entity paths obtained from the ontology manager.
    """

    def __init__(
        self,
        ontology_manager: OntologyManager,
        embeddings_model=None,
        text_chunks: Optional[List[str]] = None
    ):
        """
        Initialize the semantic retriever.

        Args:
            ontology_manager: The ontology manager instance
            embeddings_model: The embeddings model to use (defaults to OpenAIEmbeddings)
            text_chunks: Optional list of text chunks to add to the vector store
        """
        self.ontology_manager = ontology_manager
        self.embeddings = embeddings_model or OpenAIEmbeddings()

        # Index the ontology's textual representation.
        ontology_text = ontology_manager.get_text_representation()
        self.ontology_chunks = self._split_text(ontology_text)

        # Extra chunks go *after* the ontology chunks so provenance can be
        # derived from the index when tagging metadata below.
        self.text_chunks = text_chunks or []
        all_chunks = self.ontology_chunks + self.text_chunks

        # Wrap each chunk in a Document, tagging where it came from.
        n_ontology = len(self.ontology_chunks)
        documents = [
            Document(
                page_content=chunk,
                metadata={"source": "ontology" if i < n_ontology else "text"},
            )
            for i, chunk in enumerate(all_chunks)
        ]

        self.vector_store = FAISS.from_documents(documents, self.embeddings)

    def _split_text(self, text: str, chunk_size: int = 500, overlap: int = 50) -> List[str]:
        """Split text into overlapping chunks for embedding.

        Args:
            text: The text to split.
            chunk_size: Maximum characters per chunk.
            overlap: Characters shared by consecutive chunks; must be smaller
                than chunk_size.

        Returns:
            The list of chunks. Fragments shorter than 50 characters are
            dropped (too little signal to embed), so very short inputs may
            yield an empty list.

        Raises:
            ValueError: If overlap >= chunk_size (the stride would be <= 0;
                the original code silently returned [] or raised an opaque
                range() error in that case).
        """
        if overlap >= chunk_size:
            raise ValueError("overlap must be smaller than chunk_size")
        stride = chunk_size - overlap
        chunks = []
        for start in range(0, len(text), stride):
            chunk = text[start:start + chunk_size]
            if len(chunk) >= 50:  # Skip very small chunks
                chunks.append(chunk)
        return chunks

    def retrieve(self, query: str, k: int = 4, include_ontology_context: bool = True) -> List[Document]:
        """
        Retrieve relevant documents using a hybrid approach.

        Args:
            query: The query string
            k: Number of documents to retrieve
            include_ontology_context: Whether to include additional ontology context

        Returns:
            A list of retrieved documents (vector hits first, then any
            ontology-context documents).
        """
        # Optional semantic context from the ontology.
        ontology_context = (
            self.ontology_manager.get_semantic_context(query)
            if include_ontology_context
            else []
        )

        # Vector similarity search. Copy the result list: the original code
        # aliased it and then appended, mutating the list the vector store
        # handed back as a side effect.
        combined_results = list(self.vector_store.similarity_search(query, k=k))

        # Append ontology context as additional documents.
        for i, context in enumerate(ontology_context):
            combined_results.append(Document(
                page_content=context,
                metadata={"source": "ontology_context", "context_id": i},
            ))

        return combined_results

    def retrieve_with_paths(self, query: str, k: int = 4) -> Dict[str, Any]:
        """
        Enhanced retrieval that includes semantic paths between entities.

        Args:
            query: The query string
            k: Number of documents to retrieve

        Returns:
            A dictionary with keys:
                "documents": the basic retrieval results plus one document per
                    discovered semantic path;
                "paths": the raw path records (source, target, path, text).
        """
        # Basic retrieval first.
        basic_results = self.retrieve(query, k)

        # Extract potential entities from the query (simplified approach).
        # A more sophisticated approach would use NER or entity linking.
        entity_types = ["Product", "Department", "Employee", "Manager", "Customer", "Feedback"]
        query_words = query.lower().split()

        potential_entities = []
        for entity_type in entity_types:
            # Exact word match against the lowercased type name only.
            if entity_type.lower() in query_words:
                instances = self.ontology_manager.get_instances_of_class(entity_type)
                if instances:
                    # Just take the first few for demonstration.
                    potential_entities.extend(instances[:2])

        # Find paths between every unordered pair of candidate entities.
        paths = []
        if len(potential_entities) >= 2:
            for i in range(len(potential_entities)):
                for j in range(i + 1, len(potential_entities)):
                    source = potential_entities[i]
                    target = potential_entities[j]
                    entity_paths = self.ontology_manager.find_paths(source, target, max_length=3)
                    for path in entity_paths or []:
                        paths.append({
                            "source": source,
                            "target": target,
                            "path": path,
                            "text": self._path_to_text(path),
                        })

        # Surface each path as a retrievable document.
        path_documents = [
            Document(
                page_content=path_info["text"],
                metadata={
                    "source": "semantic_path",
                    "path_id": i,
                    "source_entity": path_info["source"],
                    "target_entity": path_info["target"],
                },
            )
            for i, path_info in enumerate(paths)
        ]

        return {
            "documents": basic_results + path_documents,
            "paths": paths,
        }

    def _path_to_text(self, path: List[Dict]) -> str:
        """Convert a relationship path to a human-readable description.

        Each edge dict must carry "source", "target" and "type" keys. Entity
        ids are replaced by their "name" property when the ontology provides
        one. Returns "" for an empty path.
        """
        if not path:
            return ""

        text_parts = []
        for edge in path:
            source = edge["source"]
            target = edge["target"]
            relation = edge["type"]

            # Prefer the entity's display name over its raw id when available.
            source_info = self.ontology_manager.get_entity_info(source)
            target_info = self.ontology_manager.get_entity_info(target)
            source_name = source_info.get("properties", {}).get("name", source)
            target_name = target_info.get("properties", {}).get("name", target)

            text_parts.append(f"{source_name} {relation} {target_name}")

        return " -> ".join(text_parts)

    def search_by_property(self, class_type: str, property_name: str, property_value: str) -> List[Document]:
        """
        Search for instances of a class with a specific property value.

        Matching is case-insensitive exact string comparison (could be
        enhanced with fuzzy matching).

        Args:
            class_type: The class to search in
            property_name: The property name to match
            property_value: The property value to match

        Returns:
            A list of matched entities as documents
        """
        wanted = property_value.lower()
        results = []

        for instance_id in self.ontology_manager.get_instances_of_class(class_type):
            entity_info = self.ontology_manager.get_entity_info(instance_id)
            properties = entity_info.get("properties", {})
            if property_name not in properties:
                continue
            if str(properties[property_name]).lower() != wanted:
                continue

            # Render the matched instance as a plain-text document.
            lines = [
                f"Instance: {instance_id}",
                f"Type: {class_type}",
                "Properties:",
            ]
            lines.extend(f"- {prop_name}: {prop_value}" for prop_name, prop_value in properties.items())

            results.append(Document(
                page_content="\n".join(lines) + "\n",
                metadata={
                    "source": "property_search",
                    "instance_id": instance_id,
                    "class_type": class_type,
                },
            ))

        return results