# Suppress warnings - must be before any imports
import os

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'

import warnings
import logging

# Suppress all warnings
warnings.filterwarnings('ignore')
# Specific suppressions
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', message='.*benefit from vacuuming.*')
warnings.filterwarnings('ignore', message='.*sparse_softmax_cross_entropy.*')

# Suppress all logging
logging.getLogger().setLevel(logging.ERROR)
# Suppress TensorFlow logging
logging.getLogger('tensorflow').setLevel(logging.ERROR)

import json
import re
from typing import Dict, List, Optional

import chromadb
from chromadb.utils import embedding_functions

# Punctuation that already terminates a passage; no period is appended after it.
_SENTENCE_ENDINGS = ('.', '!', '?')

# Characters that are path separators or are rejected in file names on common
# platforms; they are replaced with "_" when a file name is derived from a
# question.
_UNSAFE_FILENAME_CHARS = re.compile(r'[<>:"/\\|?*\x00-\x1f]')


def clean_text(text: str) -> str:
    """Normalise a retrieved passage for display.

    The passage is collapsed to single-spaced text, the leading fragment up to
    and including the first period is dropped (it is treated as the tail of a
    sentence that started before the passage), and the result is made to end
    with sentence punctuation.

    Args:
        text: Raw passage text.

    Returns:
        The cleaned passage. An empty or whitespace-only input yields "".
    """
    # split() without arguments splits on any whitespace run, newlines
    # included, so this single step also removes line breaks.
    text = ' '.join(text.split())

    # Drop the leading fragment only when real content follows it; otherwise a
    # passage consisting of one complete sentence would be erased entirely.
    _, separator, remainder = text.partition('.')
    remainder = remainder.strip()
    if separator and any(char.isalnum() for char in remainder):
        text = remainder

    # Terminate the passage, without doubling punctuation that is already there.
    if text and not text.endswith(_SENTENCE_ENDINGS):
        text += '.'

    return text


def get_answer_from_osho(question: str, n_results: int = 5) -> Dict:
    """Get answer from Osho's books based on the question.

    Opens the persistent Chroma database in ``./vector_db`` (relative to the
    current working directory), queries the ``osho_books`` collection with the
    question and returns the matching passages.

    Args:
        question: The question to ask.
        n_results: Number of relevant passages to return.

    Returns:
        A dictionary with the keys ``"question"``, ``"answer_passages"`` (a
        list of dicts with ``"passage_number"``, ``"book"`` and ``"text"``)
        and ``"total_passages"``.
    """
    # Initialize ChromaDB client
    db_dir = os.path.join(os.getcwd(), "vector_db")
    if not os.path.exists(db_dir):
        # If local path doesn't exist, download from Hugging Face
        from huggingface_hub import snapshot_download
        db_dir = snapshot_download(repo_id="harithapliyal/osho-vector-db")

    client = chromadb.PersistentClient(path=db_dir)

    # Initialize embedding function
    embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name="all-MiniLM-L6-v2"
    )

    # Get the collection
    collection = client.get_collection(
        name="osho_books",
        embedding_function=embedding_function,
    )

    # Query the collection
    results = collection.query(
        query_texts=[question],
        n_results=n_results,
    )

    # One query text was sent, so the hits are in the first (only) result row.
    documents = results['documents'][0]
    metadatas = results['metadatas'][0]
    answer_parts = [
        {
            "passage_number": number,
            "book": metadata['book'],
            "text": clean_text(doc.strip()),
        }
        for number, (doc, metadata) in enumerate(zip(documents, metadatas), start=1)
    ]

    return {
        "question": question,
        "answer_passages": answer_parts,
        "total_passages": len(answer_parts),
    }


def save_qa_to_file(qa_response: Dict, output_file: Optional[str] = None) -> str:
    """Save the Q&A response to a JSON file.

    Args:
        qa_response: The Q&A response to save. Its ``"question"`` entry is
            used to derive the file name when ``output_file`` is None.
        output_file: Optional output file path. If None, the file is written
            to ``./answers`` (created if missing) under a name built from the
            first 30 characters of the question.

    Returns:
        The path of the file that was written.
    """
    if output_file is None:
        # Create answers directory if it doesn't exist
        answers_dir = os.path.join(os.getcwd(), "answers")
        os.makedirs(answers_dir, exist_ok=True)

        # Generate filename from question. Questions are free text, so path
        # separators and characters such as "?" or ":" are replaced to keep
        # the result a single valid file name.
        slug = qa_response['question'][:30].lower().replace(' ', '_')
        slug = _UNSAFE_FILENAME_CHARS.sub('_', slug)
        output_file = os.path.join(answers_dir, f"answer_{slug}.json")

    # Save to file
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(qa_response, f, ensure_ascii=False, indent=2)

    return output_file


def main() -> None:
    """Run an example query, save the response and print a short preview."""
    # Example usage
    question = "What is the nature of consciousness?"

    # Get answer
    response = get_answer_from_osho(question)

    # Save to file
    output_file = save_qa_to_file(response)

    # Print the response
    print(f"\nQuestion: {response['question']}\n")
    for passage in response['answer_passages']:
        print(f"\nPassage {passage['passage_number']}:")
        print(f"Book: {passage['book']}")
        print(f"Text: {passage['text'][:200]}...")
        print("-" * 80)

    print(f"\nResponse saved to: {output_file}")


if __name__ == "__main__":
    main()