File size: 3,585 Bytes
ae4184d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# utils.py (Updated for OpenAI context formatting)
import re
import os
import time
import traceback
import openai
from typing import Optional, List, Dict

# config is a project-local module that supplies OPENAI_API_KEY and
# EMBEDDING_MODEL; nothing in this file works without it, so abort on failure.
try:
    import config
except ImportError as exc:
    # ImportError.name is the module that could not be imported. Only report
    # "not found" when config itself is missing; a failing import *inside*
    # config.py (e.g. a missing dependency) gets its own, accurate message.
    if getattr(exc, "name", None) == "config":
        print("Error: config.py not found. Cannot proceed.")
        raise SystemExit("config.py not found") from exc
    print(f"Error: config.py could not be imported: {exc}")
    raise SystemExit(f"config.py import failed: {exc}") from exc

# --- OpenAI client initialization (shared by get_embedding) ---
# Module-level OpenAI client. It stays None when no API key is configured or
# when constructing the client raises; get_embedding checks for that.
openai_client = None
if not config.OPENAI_API_KEY:
    print("Utils: Warning - OPENAI_API_KEY not found. Embeddings will fail.")
else:
    try:
        openai_client = openai.OpenAI(api_key=config.OPENAI_API_KEY)
        print("Utils: OpenAI client initialized for embeddings.")
    except Exception as init_error:
        # Best-effort: report the problem and leave the client unset.
        print(f"Utils: Error initializing OpenAI client for embeddings: {init_error}")

def clean_source_text(text: Optional[str]) -> str:
    """Return *text* stripped of NUL/U+FFFD characters with whitespace collapsed.

    Falsy input (``None`` or ``""``) yields an empty string. Otherwise every
    run of whitespace becomes a single space and the result is trimmed at
    both ends.
    """
    if not text:
        return ""

    # Remove NUL and U+FFFD (replacement) characters.
    scrubbed = text
    for unwanted in ('\x00', '\ufffd'):
        scrubbed = scrubbed.replace(unwanted, '')

    single_spaced = re.sub(r'\s+', ' ', scrubbed)
    return single_spaced.strip()

def get_embedding(text: str, model: str = config.EMBEDDING_MODEL, max_retries: int = 3) -> Optional[List[float]]:
    """Return the embedding vector for *text*, or ``None`` on failure.

    Args:
        text: Text to embed. Newlines are replaced with spaces and the result
            is stripped before it is sent.
        model: Embedding model name passed to the OpenAI embeddings endpoint.
        max_retries: Total number of attempts. Zero or a negative value means
            no request is made and ``None`` is returned.

    Returns:
        ``response.data[0].embedding`` from the first successful request, or
        ``None`` when the client is not initialized, the input is not a
        non-empty string, the text is empty after cleaning, or every attempt
        fails. Failures are reported with ``print`` rather than raised.
    """
    if not openai_client:
        print("Error: OpenAI client not initialized (utils.py). Cannot get embedding.")
        return None
    if not text or not isinstance(text, str):
        print("Error: Invalid input text for embedding.")
        return None
    cleaned_text = text.replace("\n", " ").strip()
    if not cleaned_text:
        print("Warning: Text is empty after cleaning, cannot get embedding.")
        return None

    for attempt in range(max_retries):
        # Only wait when another attempt will actually follow; sleeping after
        # the final failure just delays the caller.
        will_retry = attempt + 1 < max_retries
        try:
            response = openai_client.embeddings.create(input=[cleaned_text], model=model)
            return response.data[0].embedding
        except openai.RateLimitError:
            if will_retry:
                wait_time = 2 ** attempt  # exponential backoff: 1s, 2s, 4s, ...
                print(f"Rate limit embedding. Retrying in {wait_time}s...")
                time.sleep(wait_time)
            else:
                print("Rate limit embedding. No retries left.")
        except openai.APIConnectionError:
            if will_retry:
                print("Connection error embedding. Retrying...")
                time.sleep(2)
            else:
                print("Connection error embedding. No retries left.")
        except Exception as e:
            # Any other error is retried immediately; include the message so
            # the cause is visible in the log, not just the exception type.
            print(f"Error generating embedding (Attempt {attempt + 1}/{max_retries}): {type(e).__name__}: {e}")

    print(f"Failed embedding after {max_retries} attempts.")
    return None

# --- REMOVED format_context_for_anthropic ---

# --- NEW Function to format context for OpenAI ---
def format_context_for_openai(
    documents: List[Dict],
    *,
    text_key: str = 'hebrew_text',
    id_key: str = 'original_id',
    source_key: str = 'source_name',
) -> str:
    """Format documents as a numbered list for the OpenAI prompt context section.

    Each usable document becomes::

        Source <n> (ID: <id>[, SourceName: <name>]):
        <cleaned text>
        ---

    where ``<n>`` is the document's 1-based position in *documents*, so
    skipped entries leave gaps in the numbering.

    Args:
        documents: Dicts holding the source text and optional metadata.
            Non-dict items are skipped with a warning.
        text_key: Key holding the document text.
        id_key: Key holding the document id; falls back to
            ``unknown_<n>`` when the key is absent.
        source_key: Key holding an optional source name, added to the header
            when truthy.

    Returns:
        The formatted entries joined by newlines; ``"No source texts
        provided."`` when *documents* is empty; or ``"No valid source texts
        could be formatted."`` when no document yields any text.
    """
    if not documents:
        return "No source texts provided."

    formatted_docs = []
    for index, doc in enumerate(documents):
        if not isinstance(doc, dict):
            print(f"Warning: Skipping non-dict item in documents list: {doc}")
            continue

        doc_id = doc.get(id_key, f'unknown_{index+1}')
        raw_text = doc.get(text_key, '')
        # clean_source_text only accepts str/None; skip anything else instead
        # of letting one malformed document abort the whole context.
        if raw_text is not None and not isinstance(raw_text, str):
            print(f"Warning: Skipping document with non-string text (ID: {doc_id})")
            continue

        text = clean_source_text(raw_text)
        if not text:
            continue

        source_name = doc.get(source_key, '')
        # 1-based numbering for readability in the prompt.
        header = f"Source {index + 1} (ID: {doc_id}"
        if source_name:
            header += f", SourceName: {source_name}"
        header += ")"
        formatted_docs.append(f"{header}:\n{text}\n---")  # trailing separator

    if not formatted_docs:
        return "No valid source texts could be formatted."

    return "\n".join(formatted_docs)