ginipick commited on
Commit
b4107f4
·
verified ·
1 Parent(s): 5516dab

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +308 -367
app.py CHANGED
@@ -17,7 +17,7 @@ from reportlab.lib.styles import getSampleStyleSheet
17
  import io
18
  import docx2txt
19
 
20
- # ๋กœ๊น… ์„ค์ •
21
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
22
  logger = logging.getLogger(__name__)
23
 
@@ -25,138 +25,139 @@ BASE = pathlib.Path(__file__).parent
25
app = FastAPI()
app.mount("/static", StaticFiles(directory=BASE), name="static")


def _ensure_dir(path: pathlib.Path) -> pathlib.Path:
    """Create *path* (with parents) when it does not exist yet, and return it."""
    if not path.exists():
        path.mkdir(parents=True)
    return path


# Working directory for PDFs bundled with the app.
PDF_DIR = _ensure_dir(BASE / "pdf")

# Permanent PDF storage (Hugging Face persistent disk when /data is mounted).
PERMANENT_PDF_DIR = _ensure_dir(
    pathlib.Path("/data/pdfs") if os.path.exists("/data") else BASE / "permanent_pdfs"
)

# Cache for rendered page images.
CACHE_DIR = _ensure_dir(BASE / "cache")

# Metadata directory and the id -> path mapping file.
METADATA_DIR = _ensure_dir(
    pathlib.Path("/data/metadata") if os.path.exists("/data") else BASE / "metadata"
)
PDF_METADATA_FILE = METADATA_DIR / "pdf_metadata.json"

# Cache for extracted PDF text ("embeddings").
EMBEDDING_DIR = _ensure_dir(
    pathlib.Path("/data/embeddings") if os.path.exists("/data") else BASE / "embeddings"
)

# Admin password from the environment; "admin" is a test-only fallback.
# NOTE(review): shipping a default password is unsafe for production — confirm.
ADMIN_PASSWORD = os.getenv("PASSWORD", "admin")

# OpenAI credentials; AI features are disabled when the key is absent/blank.
OPENAI_API_KEY = os.getenv("LLM_API", "")
HAS_VALID_API_KEY = bool(OPENAI_API_KEY and OPENAI_API_KEY.strip())

if HAS_VALID_API_KEY:
    try:
        openai_client = OpenAI(api_key=OPENAI_API_KEY, timeout=30.0)
        logger.info("OpenAI ํด๋ผ์ด์–ธํŠธ ์ดˆ๊ธฐํ™” ์„ฑ๊ณต")
    except Exception as e:
        logger.error(f"OpenAI ํด๋ผ์ด์–ธํŠธ ์ดˆ๊ธฐํ™” ์‹คํŒจ: {e}")
        HAS_VALID_API_KEY = False
else:
    logger.warning("์œ ํšจํ•œ OpenAI API ํ‚ค๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค. AI ๊ธฐ๋Šฅ์ด ์ œํ•œ๋ฉ๋‹ˆ๋‹ค.")
    openai_client = None

# In-memory page cache, keyed by PDF file stem.
pdf_cache: Dict[str, Dict[str, Any]] = {}
# Per-PDF locks guarding concurrent cache builds.
cache_locks = {}
# PDF id -> file path mapping.
pdf_metadata: Dict[str, str] = {}
# Extracted-text cache keyed by PDF id.
pdf_embeddings: Dict[str, Dict[str, Any]] = {}
81
 
82
- # PDF ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ ๋กœ๋“œ
 
83
def load_pdf_metadata():
    """Populate the global pdf_metadata mapping from PDF_METADATA_FILE.

    Resets the mapping to empty when the file is missing or unreadable.
    """
    global pdf_metadata
    pdf_metadata = {}
    if not PDF_METADATA_FILE.exists():
        return
    try:
        with open(PDF_METADATA_FILE, "r") as f:
            pdf_metadata = json.load(f)
        logger.info(f"PDF ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ ๋กœ๋“œ ์™„๋ฃŒ: {len(pdf_metadata)} ํ•ญ๋ชฉ")
    except Exception as e:
        logger.error(f"๋ฉ”ํƒ€๋ฐ์ดํ„ฐ ๋กœ๋“œ ์˜ค๋ฅ˜: {e}")
        pdf_metadata = {}
95
 
96
- # PDF ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ ์ €์žฅ
 
97
def save_pdf_metadata():
    """Persist the global pdf_metadata mapping to disk; failures are logged only."""
    try:
        PDF_METADATA_FILE.write_text(json.dumps(pdf_metadata))
    except Exception as e:
        logger.error(f"๋ฉ”ํƒ€๋ฐ์ดํ„ฐ ์ €์žฅ ์˜ค๋ฅ˜: {e}")
 
103
 
104
- # PDF ID ์ƒ์„ฑ (ํŒŒ์ผ๋ช… + ํƒ€์ž„์Šคํƒฌํ”„ ๊ธฐ๋ฐ˜) - ๋” ๋‹จ์ˆœํ•˜๊ณ  ์•ˆ์ „ํ•œ ๋ฐฉ์‹์œผ๋กœ ๋ณ€๊ฒฝ
105
def generate_pdf_id(filename: str) -> str:
    """Build a unique, URL-safe id of the form <sanitized-stem>_<epoch>_<6-hex>.

    The stem is sanitized directly (no URL-encoding); a timestamp plus a short
    random suffix guarantees uniqueness across uploads of the same file.
    """
    import re

    stem = os.path.splitext(filename)[0]
    sanitized = re.sub(r'[^\w\-_]', '_', stem.replace(" ", "_"))
    epoch = int(time.time())
    suffix = uuid.uuid4().hex[:6]
    return "{}_{}_{}".format(sanitized, epoch, suffix)
116
 
117
- # PDF ํŒŒ์ผ ๋ชฉ๋ก ๊ฐ€์ ธ์˜ค๊ธฐ (๋ฉ”์ธ ๋””๋ ‰ํ† ๋ฆฌ์šฉ)
 
118
def get_pdf_files():
    """Return every *.pdf path in the app-local PDF directory (or [])."""
    if not PDF_DIR.exists():
        return []
    return list(PDF_DIR.glob("*.pdf"))
123
 
124
- # ์˜๊ตฌ ์ €์žฅ์†Œ์˜ PDF ํŒŒ์ผ ๋ชฉ๋ก ๏ฟฝ๏ฟฝ์ ธ์˜ค๊ธฐ
 
125
def get_permanent_pdf_files():
    """Return every *.pdf path in the permanent storage directory (or [])."""
    if not PERMANENT_PDF_DIR.exists():
        return []
    return list(PERMANENT_PDF_DIR.glob("*.pdf"))
130
 
131
- # PDF ์ธ๋„ค์ผ ์ƒ์„ฑ ๋ฐ ํ”„๋กœ์ ํŠธ ๋ฐ์ดํ„ฐ ์ค€๋น„
 
132
  def generate_pdf_projects():
133
  projects_data = []
134
 
135
- # ๋ฉ”์ธ ๋””๋ ‰ํ† ๋ฆฌ์™€ ์˜๊ตฌ ์ €์žฅ์†Œ์˜ ํŒŒ์ผ๋“ค ๊ฐ€์ ธ์˜ค๊ธฐ
136
  pdf_files = get_pdf_files()
137
  permanent_pdf_files = get_permanent_pdf_files()
138
 
139
- # ๋ชจ๋“  ํŒŒ์ผ ํ•ฉ์น˜๊ธฐ (ํŒŒ์ผ๋ช… ๊ธฐ์ค€์œผ๋กœ ์ค‘๋ณต ์ œ๊ฑฐ)
140
  unique_files = {}
141
 
142
- # ๋จผ์ € ๋ฉ”์ธ ๋””๋ ‰ํ† ๋ฆฌ์˜ ํŒŒ์ผ๋“ค ์ถ”๊ฐ€
143
  for file in pdf_files:
144
  unique_files[file.name] = file
145
 
146
- # ์˜๊ตฌ ์ €์žฅ์†Œ์˜ ํŒŒ์ผ๋“ค ์ถ”๊ฐ€ (๋™์ผ ํŒŒ์ผ๋ช…์ด ์žˆ์œผ๋ฉด ์˜๊ตฌ ์ €์žฅ์†Œ ํŒŒ์ผ ์šฐ์„ )
147
  for file in permanent_pdf_files:
148
  unique_files[file.name] = file
149
 
150
- # ์ค‘๋ณต ์ œ๊ฑฐ๋œ ํŒŒ์ผ๋“ค๋กœ ํ”„๋กœ์ ํŠธ ๋ฐ์ดํ„ฐ ์ƒ์„ฑ
151
  for pdf_file in unique_files.values():
152
- # ํ•ด๋‹น ํŒŒ์ผ์˜ PDF ID ์ฐพ๊ธฐ
153
  pdf_id = None
154
  for pid, path in pdf_metadata.items():
155
  if os.path.basename(path) == pdf_file.name:
156
  pdf_id = pid
157
  break
158
 
159
- # ID๊ฐ€ ์—†์œผ๋ฉด ์ƒˆ๋กœ ์ƒ์„ฑํ•˜๊ณ  ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ์— ์ถ”๊ฐ€
160
  if not pdf_id:
161
  pdf_id = generate_pdf_id(pdf_file.name)
162
  pdf_metadata[pdf_id] = str(pdf_file)
@@ -171,15 +172,18 @@ def generate_pdf_projects():
171
 
172
  return projects_data
173
 
174
- # ์บ์‹œ ํŒŒ์ผ ๊ฒฝ๋กœ ์ƒ์„ฑ
 
175
def get_cache_path(pdf_name: str):
    """Location of the rendered-page cache JSON for *pdf_name*."""
    return CACHE_DIR / (pdf_name + "_cache.json")
177
 
178
- # ์ž„๋ฒ ๋”ฉ ์บ์‹œ ํŒŒ์ผ ๊ฒฝ๋กœ ์ƒ์„ฑ
 
179
def get_embedding_path(pdf_id: str):
    """Location of the extracted-text cache JSON for *pdf_id*."""
    return EMBEDDING_DIR / (pdf_id + "_embedding.json")
181
 
182
- # PDF ํ…์ŠคํŠธ ์ถ”์ถœ ํ•จ์ˆ˜
 
183
  def extract_pdf_text(pdf_path: str) -> List[Dict[str, Any]]:
184
  try:
185
  doc = fitz.open(pdf_path)
@@ -189,7 +193,7 @@ def extract_pdf_text(pdf_path: str) -> List[Dict[str, Any]]:
189
  page = doc[page_num]
190
  text = page.get_text()
191
 
192
- # ํŽ˜์ด์ง€ ํ…์ŠคํŠธ๊ฐ€ ์žˆ๋Š” ๊ฒฝ์šฐ๋งŒ ์ถ”๊ฐ€
193
  if text.strip():
194
  chunks.append({
195
  "page": page_num + 1,
@@ -199,32 +203,33 @@ def extract_pdf_text(pdf_path: str) -> List[Dict[str, Any]]:
199
 
200
  return chunks
201
  except Exception as e:
202
- logger.error(f"PDF ํ…์ŠคํŠธ ์ถ”์ถœ ์˜ค๋ฅ˜: {e}")
203
  return []
204
 
205
- # PDF ID๋กœ ์ž„๋ฒ ๋”ฉ ์ƒ์„ฑ ๋˜๋Š” ๊ฐ€์ ธ์˜ค๊ธฐ
 
206
  async def get_pdf_embedding(pdf_id: str) -> Dict[str, Any]:
207
  try:
208
- # ์ž„๋ฒ ๋”ฉ ์บ์‹œ ํ™•์ธ
209
  embedding_path = get_embedding_path(pdf_id)
210
  if embedding_path.exists():
211
  try:
212
  with open(embedding_path, "r", encoding="utf-8") as f:
213
  return json.load(f)
214
  except Exception as e:
215
- logger.error(f"์ž„๋ฒ ๋”ฉ ์บ์‹œ ๋กœ๋“œ ์˜ค๋ฅ˜: {e}")
216
 
217
- # PDF ๊ฒฝ๋กœ ์ฐพ๊ธฐ
218
  pdf_path = get_pdf_path_by_id(pdf_id)
219
  if not pdf_path:
220
- raise ValueError(f"PDF ID {pdf_id}์— ํ•ด๋‹นํ•˜๋Š” ํŒŒ์ผ์„ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค")
221
 
222
- # ํ…์ŠคํŠธ ์ถ”์ถœ
223
  chunks = extract_pdf_text(pdf_path)
224
  if not chunks:
225
- raise ValueError(f"PDF์—์„œ ํ…์ŠคํŠธ๋ฅผ ์ถ”์ถœํ•  ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค: {pdf_path}")
226
 
227
- # ์ž„๋ฒ ๋”ฉ ์ €์žฅ ๋ฐ ๋ฐ˜ํ™˜
228
  embedding_data = {
229
  "pdf_id": pdf_id,
230
  "pdf_path": pdf_path,
@@ -232,61 +237,69 @@ async def get_pdf_embedding(pdf_id: str) -> Dict[str, Any]:
232
  "created_at": time.time()
233
  }
234
 
235
- # ์ž„๋ฒ ๋”ฉ ์บ์‹œ ์ €์žฅ
236
  with open(embedding_path, "w", encoding="utf-8") as f:
237
  json.dump(embedding_data, f, ensure_ascii=False)
238
 
239
  return embedding_data
240
 
241
  except Exception as e:
242
- logger.error(f"PDF ์ž„๋ฒ ๋”ฉ ์ƒ์„ฑ ์˜ค๋ฅ˜: {e}")
243
  return {"error": str(e), "pdf_id": pdf_id}
244
 
245
- # PDF ๋‚ด์šฉ ๊ธฐ๋ฐ˜ ์งˆ์˜์‘๋‹ต
246
- # PDF ๋‚ด์šฉ ๊ธฐ๋ฐ˜ ์งˆ์˜์‘๋‹ต ํ•จ์ˆ˜ ๊ฐœ์„ 
247
  async def query_pdf(pdf_id: str, query: str) -> Dict[str, Any]:
248
  try:
249
- # API ํ‚ค๊ฐ€ ์—†๊ฑฐ๋‚˜ ์œ ํšจํ•˜์ง€ ์•Š์€ ๊ฒฝ์šฐ
250
  if not HAS_VALID_API_KEY or not openai_client:
251
  return {
252
- "error": "OpenAI API ํ‚ค๊ฐ€ ์„ค์ •๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค.",
253
- "answer": "์ฃ„์†กํ•ฉ๋‹ˆ๋‹ค. ํ˜„์žฌ AI ๊ธฐ๋Šฅ์ด ๋น„ํ™œ์„ฑํ™”๋˜์–ด ์žˆ์–ด ์งˆ๋ฌธ์— ๋‹ต๋ณ€ํ•  ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค. ์‹œ์Šคํ…œ ๊ด€๋ฆฌ์ž์—๊ฒŒ ๋ฌธ์˜ํ•˜์„ธ์š”."
254
  }
255
 
256
- # ์ž„๋ฒ ๋”ฉ ๋ฐ์ดํ„ฐ ๊ฐ€์ ธ์˜ค๊ธฐ
257
  embedding_data = await get_pdf_embedding(pdf_id)
258
  if "error" in embedding_data:
259
  return {"error": embedding_data["error"]}
260
 
261
- # ์ฒญํฌ ํ…์ŠคํŠธ ๋ชจ์œผ๊ธฐ (์ž„์‹œ๋กœ ๊ฐ„๋‹จํ•˜๊ฒŒ ์ „์ฒด ํ…์ŠคํŠธ ์‚ฌ์šฉ)
262
  all_text = "\n\n".join([f"Page {chunk['page']}: {chunk['text']}" for chunk in embedding_data["chunks"]])
263
 
264
- # ์ปจํ…์ŠคํŠธ ํฌ๊ธฐ๋ฅผ ๊ณ ๋ คํ•˜์—ฌ ํ…์ŠคํŠธ๊ฐ€ ๋„ˆ๋ฌด ๊ธธ๋ฉด ์•ž๋ถ€๋ถ„๋งŒ ์‚ฌ์šฉ
265
- max_context_length = 60000 # ํ† ํฐ ์ˆ˜๊ฐ€ ์•„๋‹Œ ๋ฌธ์ž ์ˆ˜ ๊ธฐ์ค€ (๋Œ€๋žต์ ์ธ ์ œํ•œ)
266
  if len(all_text) > max_context_length:
267
- all_text = all_text[:max_context_length] + "...(์ดํ•˜ ์ƒ๋žต)"
268
 
269
- # ์‹œ์Šคํ…œ ํ”„๋กฌํ”„ํŠธ ์ค€๋น„
270
  system_prompt = """
271
- The default language is set to English. However, please respond in the language used in the user's prompt (e.g., English, Korean, Japanese, Chinese, etc.).
272
- You are an assistant that answers questions based solely on the provided PDF context. Please use only the information from the provided PDF content to respond. If relevant information is not available in the context, honestly reply with, "The requested information could not be found in the provided PDF."
273
- Please ensure your responses are clear and concise, citing relevant page numbers. Always respond politely and courteously.
274
  """
275
 
276
- # gpt-4.1-mini ๋ชจ๋ธ ์‚ฌ์šฉ
277
  try:
278
- # ํƒ€์ž„์•„์›ƒ ๋ฐ ์žฌ์‹œ๋„ ์„ค์ • ๊ฐœ์„ 
279
- for attempt in range(3): # ์ตœ๋Œ€ 3๋ฒˆ ์žฌ์‹œ๋„
280
  try:
281
  response = openai_client.chat.completions.create(
282
  model="gpt-4.1-mini",
283
  messages=[
284
  {"role": "system", "content": system_prompt},
285
- {"role": "user", "content": f"The default language is set to English.๋‹ค์Œ PDF ๋‚ด์šฉ์„ ์ฐธ๊ณ ํ•˜์—ฌ ์งˆ๋ฌธ์— ๋‹ต๋ณ€ํ•ด์ฃผ์„ธ์š”.\n\nPDF ๋‚ด์šฉ:\n{all_text}\n\n์งˆ๋ฌธ: {query}"}
 
 
 
 
 
 
 
 
286
  ],
287
  temperature=0.7,
288
  max_tokens=2048,
289
- timeout=30.0 # 30์ดˆ ํƒ€์ž„์•„์›ƒ
290
  )
291
 
292
  answer = response.choices[0].message.content
@@ -296,68 +309,70 @@ Please ensure your responses are clear and concise, citing relevant page numbers
296
  "query": query
297
  }
298
  except Exception as api_error:
299
- logger.error(f"OpenAI API ํ˜ธ์ถœ ์˜ค๋ฅ˜ (์‹œ๋„ {attempt+1}/3): {api_error}")
300
- if attempt == 2: # ๋งˆ์ง€๋ง‰ ์‹œ๋„์—์„œ๋„ ์‹คํŒจ
301
  raise api_error
302
- await asyncio.sleep(1 * (attempt + 1)) # ์žฌ์‹œ๋„ ๊ฐ„ ์ง€์—ฐ ์‹œ๊ฐ„ ์ฆ๊ฐ€
303
 
304
- # ์—ฌ๊ธฐ๊นŒ์ง€ ๋„๋‹ฌํ•˜์ง€ ์•Š์•„์•ผ ํ•จ
305
- raise Exception("API ํ˜ธ์ถœ ์žฌ์‹œ๋„ ๋ชจ๋‘ ์‹คํŒจ")
306
  except Exception as api_error:
307
- logger.error(f"OpenAI API ํ˜ธ์ถœ ์ตœ์ข… ์˜ค๋ฅ˜: {api_error}")
308
- # ์˜ค๋ฅ˜ ์œ ํ˜•์— ๋”ฐ๋ฅธ ๋” ๋ช…ํ™•ํ•œ ๋ฉ”์‹œ์ง€ ์ œ๊ณต
309
  error_message = str(api_error)
310
  if "Connection" in error_message:
311
- return {"error": "OpenAI ์„œ๋ฒ„์™€ ์—ฐ๊ฒฐํ•  ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค. ์ธํ„ฐ๋„ท ์—ฐ๊ฒฐ์„ ํ™•์ธํ•˜์„ธ์š”."}
312
  elif "Unauthorized" in error_message or "Authentication" in error_message:
313
- return {"error": "API ํ‚ค๊ฐ€ ์œ ํšจํ•˜์ง€ ์•Š์Šต๋‹ˆ๋‹ค."}
314
  elif "Rate limit" in error_message:
315
- return {"error": "API ํ˜ธ์ถœ ํ•œ๋„๋ฅผ ์ดˆ๊ณผํ–ˆ์Šต๋‹ˆ๋‹ค. ์ž ์‹œ ํ›„ ๋‹ค์‹œ ์‹œ๋„ํ•˜์„ธ์š”."}
316
  else:
317
- return {"error": f"AI ์‘๋‹ต ์ƒ์„ฑ ์ค‘ ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค: {error_message}"}
318
 
319
  except Exception as e:
320
- logger.error(f"์งˆ์˜์‘๋‹ต ์ฒ˜๋ฆฌ ์˜ค๋ฅ˜: {e}")
321
  return {"error": str(e)}
322
 
323
- # PDF ์š”์•ฝ ์ƒ์„ฑ
324
- # PDF ์š”์•ฝ ์ƒ์„ฑ ํ•จ์ˆ˜ ๊ฐœ์„ 
325
  async def summarize_pdf(pdf_id: str) -> Dict[str, Any]:
326
  try:
327
- # API ํ‚ค๊ฐ€ ์—†๊ฑฐ๋‚˜ ์œ ํšจํ•˜์ง€ ์•Š์€ ๊ฒฝ์šฐ
328
  if not HAS_VALID_API_KEY or not openai_client:
329
  return {
330
- "error": "OpenAI API ํ‚ค๊ฐ€ ์„ค์ •๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค. 'LLM_API' ํ™˜๊ฒฝ ๋ณ€์ˆ˜๋ฅผ ํ™•์ธํ•˜์„ธ์š”.",
331
- "summary": "API ํ‚ค๊ฐ€ ์—†์–ด ์š”์•ฝ์„ ์ƒ์„ฑํ•  ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค. ์‹œ์Šคํ…œ ๊ด€๋ฆฌ์ž์—๊ฒŒ ๋ฌธ์˜ํ•˜์„ธ์š”."
332
  }
333
 
334
- # ์ž„๋ฒ ๋”ฉ ๋ฐ์ดํ„ฐ ๊ฐ€์ ธ์˜ค๊ธฐ
335
  embedding_data = await get_pdf_embedding(pdf_id)
336
  if "error" in embedding_data:
337
- return {"error": embedding_data["error"], "summary": "PDF์—์„œ ํ…์ŠคํŠธ๋ฅผ ์ถ”์ถœํ•  ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค."}
338
 
339
- # ์ฒญํฌ ํ…์ŠคํŠธ ๋ชจ์œผ๊ธฐ (์ œํ•œ๋œ ๊ธธ์ด)
340
  all_text = "\n\n".join([f"Page {chunk['page']}: {chunk['text']}" for chunk in embedding_data["chunks"]])
341
 
342
- # ์ปจํ…์ŠคํŠธ ํฌ๊ธฐ๋ฅผ ๊ณ ๋ คํ•˜์—ฌ ํ…์ŠคํŠธ๊ฐ€ ๋„ˆ๋ฌด ๊ธธ๋ฉด ์•ž๋ถ€๋ถ„๋งŒ ์‚ฌ์šฉ
343
- max_context_length = 60000 # ํ† ํฐ ์ˆ˜๊ฐ€ ์•„๋‹Œ ๋ฌธ์ž ์ˆ˜ ๊ธฐ์ค€ (๋Œ€๋žต์ ์ธ ์ œํ•œ)
344
  if len(all_text) > max_context_length:
345
- all_text = all_text[:max_context_length] + "...(์ดํ•˜ ์ƒ๋žต)"
346
 
347
- # OpenAI API ํ˜ธ์ถœ
348
  try:
349
- # ํƒ€์ž„์•„์›ƒ ๋ฐ ์žฌ์‹œ๋„ ์„ค์ • ๊ฐœ์„ 
350
- for attempt in range(3): # ์ตœ๋Œ€ 3๋ฒˆ ์žฌ์‹œ๋„
351
  try:
352
  response = openai_client.chat.completions.create(
353
  model="gpt-4.1-mini",
354
  messages=[
355
- {"role": "system", "content": "The default language is set to English. ๋‹ค์Œ PDF ๋‚ด์šฉ์„ ๊ฐ„๊ฒฐํ•˜๊ฒŒ ์š”์•ฝํ•ด์ฃผ์„ธ์š”. ํ•ต์‹ฌ ์ฃผ์ œ์™€ ์ฃผ์š” ํฌ์ธํŠธ๋ฅผ ํฌํ•จํ•œ ์š”์•ฝ์„ 500์ž ์ด๋‚ด๋กœ ์ž‘์„ฑํ•ด์ฃผ์„ธ์š”."},
356
- {"role": "user", "content": f"PDF ๋‚ด์šฉ:\n{all_text}"}
 
 
 
 
 
 
357
  ],
358
  temperature=0.7,
359
  max_tokens=1024,
360
- timeout=30.0 # 30์ดˆ ํƒ€์ž„์•„์›ƒ
361
  )
362
 
363
  summary = response.choices[0].message.content
@@ -366,60 +381,57 @@ async def summarize_pdf(pdf_id: str) -> Dict[str, Any]:
366
  "pdf_id": pdf_id
367
  }
368
  except Exception as api_error:
369
- logger.error(f"OpenAI API ํ˜ธ์ถœ ์˜ค๋ฅ˜ (์‹œ๋„ {attempt+1}/3): {api_error}")
370
- if attempt == 2: # ๋งˆ์ง€๋ง‰ ์‹œ๋„์—์„œ๋„ ์‹คํŒจ
371
  raise api_error
372
- await asyncio.sleep(1 * (attempt + 1)) # ์žฌ์‹œ๋„ ๊ฐ„ ์ง€์—ฐ ์‹œ๊ฐ„ ์ฆ๊ฐ€
373
 
374
- # ์—ฌ๊ธฐ๊นŒ์ง€ ๋„๋‹ฌํ•˜์ง€ ์•Š์•„์•ผ ํ•จ
375
- raise Exception("API ํ˜ธ์ถœ ์žฌ์‹œ๋„ ๋ชจ๋‘ ์‹คํŒจ")
376
  except Exception as api_error:
377
- logger.error(f"OpenAI API ํ˜ธ์ถœ ์ตœ์ข… ์˜ค๋ฅ˜: {api_error}")
378
- # ์˜ค๋ฅ˜ ์œ ํ˜•์— ๋”ฐ๋ฅธ ๋” ๋ช…ํ™•ํ•œ ๋ฉ”์‹œ์ง€ ์ œ๊ณต
379
  error_message = str(api_error)
380
  if "Connection" in error_message:
381
- return {"error": "OpenAI ์„œ๋ฒ„์™€ ์—ฐ๊ฒฐํ•  ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค. ์ธํ„ฐ๋„ท ์—ฐ๊ฒฐ์„ ํ™•์ธํ•˜์„ธ์š”.", "pdf_id": pdf_id}
382
  elif "Unauthorized" in error_message or "Authentication" in error_message:
383
- return {"error": "API ํ‚ค๊ฐ€ ์œ ํšจํ•˜์ง€ ์•Š์Šต๋‹ˆ๋‹ค.", "pdf_id": pdf_id}
384
  elif "Rate limit" in error_message:
385
- return {"error": "API ํ˜ธ์ถœ ํ•œ๋„๋ฅผ ์ดˆ๊ณผํ–ˆ์Šต๋‹ˆ๋‹ค. ์ž ์‹œ ํ›„ ๋‹ค์‹œ ์‹œ๋„ํ•˜์„ธ์š”.", "pdf_id": pdf_id}
386
  else:
387
- return {"error": f"AI ์š”์•ฝ ์ƒ์„ฑ ์ค‘ ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค: {error_message}", "pdf_id": pdf_id}
388
 
389
  except Exception as e:
390
- logger.error(f"PDF ์š”์•ฝ ์ƒ์„ฑ ์˜ค๋ฅ˜: {e}")
391
  return {
392
  "error": str(e),
393
- "summary": "PDF ์š”์•ฝ ์ค‘ ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค. PDF ํŽ˜์ด์ง€ ์ˆ˜๊ฐ€ ๋„ˆ๋ฌด ๋งŽ๊ฑฐ๋‚˜ ํ˜•์‹์ด ์ง€์›๋˜์ง€ ์•Š์„ ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค."
394
  }
395
 
396
-
397
- # ์ตœ์ ํ™”๋œ PDF ํŽ˜์ด์ง€ ์บ์‹ฑ ํ•จ์ˆ˜
398
  async def cache_pdf(pdf_path: str):
399
  try:
400
- import fitz # PyMuPDF
401
 
402
  pdf_file = pathlib.Path(pdf_path)
403
  pdf_name = pdf_file.stem
404
 
405
- # ๋ฝ ์ƒ์„ฑ - ๋™์ผํ•œ PDF์— ๋Œ€ํ•ด ๋™์‹œ ์บ์‹ฑ ๋ฐฉ์ง€
406
  if pdf_name not in cache_locks:
407
  cache_locks[pdf_name] = threading.Lock()
408
 
409
- # ์ด๋ฏธ ์บ์‹ฑ ์ค‘์ด๊ฑฐ๋‚˜ ์บ์‹ฑ ์™„๋ฃŒ๋œ PDF๋Š” ๊ฑด๋„ˆ๋›ฐ๊ธฐ
410
  if pdf_name in pdf_cache and pdf_cache[pdf_name].get("status") in ["processing", "completed"]:
411
- logger.info(f"PDF {pdf_name} ์ด๋ฏธ ์บ์‹ฑ ์™„๋ฃŒ ๋˜๋Š” ์ง„ํ–‰ ์ค‘")
412
  return
413
 
414
  with cache_locks[pdf_name]:
415
- # ์ด์ค‘ ์ฒดํฌ - ๋ฝ ํš๋“ ํ›„ ๋‹ค์‹œ ํ™•์ธ
416
  if pdf_name in pdf_cache and pdf_cache[pdf_name].get("status") in ["processing", "completed"]:
417
  return
418
 
419
- # ์บ์‹œ ์ƒํƒœ ์—…๋ฐ์ดํŠธ
420
  pdf_cache[pdf_name] = {"status": "processing", "progress": 0, "pages": []}
421
 
422
- # ์บ์‹œ ํŒŒ์ผ์ด ์ด๋ฏธ ์กด์žฌํ•˜๋Š”์ง€ ํ™•์ธ
423
  cache_path = get_cache_path(pdf_name)
424
  if cache_path.exists():
425
  try:
@@ -428,47 +440,41 @@ async def cache_pdf(pdf_path: str):
428
  if cached_data.get("status") == "completed" and cached_data.get("pages"):
429
  pdf_cache[pdf_name] = cached_data
430
  pdf_cache[pdf_name]["status"] = "completed"
431
- logger.info(f"์บ์‹œ ํŒŒ์ผ์—์„œ {pdf_name} ๏ฟฝ๏ฟฝ๋“œ ์™„๋ฃŒ")
432
  return
433
  except Exception as e:
434
- logger.error(f"์บ์‹œ ํŒŒ์ผ ๋กœ๋“œ ์‹คํŒจ: {e}")
435
 
436
- # PDF ํŒŒ์ผ ์—ด๊ธฐ
437
  doc = fitz.open(pdf_path)
438
  total_pages = doc.page_count
439
 
440
- # ๋ฏธ๋ฆฌ ์ธ๋„ค์ผ๋งŒ ๋จผ์ € ์ƒ์„ฑ (๋น ๋ฅธ UI ๋กœ๋”ฉ์šฉ)
441
  if total_pages > 0:
442
- # ์ฒซ ํŽ˜์ด์ง€ ์ธ๋„ค์ผ ์ƒ์„ฑ
443
  page = doc[0]
444
- pix_thumb = page.get_pixmap(matrix=fitz.Matrix(0.2, 0.2)) # ๋” ์ž‘์€ ์ธ๋„ค์ผ
445
  thumb_data = pix_thumb.tobytes("png")
446
  b64_thumb = base64.b64encode(thumb_data).decode('utf-8')
447
  thumb_src = f"data:image/png;base64,{b64_thumb}"
448
 
449
- # ์ธ๋„ค์ผ ํŽ˜์ด์ง€๋งŒ ๋จผ์ € ์บ์‹œ
450
  pdf_cache[pdf_name]["pages"] = [{"thumb": thumb_src, "src": ""}]
451
  pdf_cache[pdf_name]["progress"] = 1
452
  pdf_cache[pdf_name]["total_pages"] = total_pages
453
 
454
- # ์ด๋ฏธ์ง€ ํ•ด์ƒ๋„ ๋ฐ ์••์ถ• ํ’ˆ์งˆ ์„ค์ • (์„ฑ๋Šฅ ์ตœ์ ํ™”)
455
- scale_factor = 1.0 # ๊ธฐ๋ณธ ํ•ด์ƒ๋„ (๋‚ฎ์ถœ์ˆ˜๋ก ๋กœ๋”ฉ ๋น ๋ฆ„)
456
- jpeg_quality = 80 # JPEG ํ’ˆ์งˆ (๋‚ฎ์ถœ์ˆ˜๋ก ์šฉ๋Ÿ‰ ์ž‘์•„์ง)
457
 
458
- # ํŽ˜์ด์ง€ ์ฒ˜๋ฆฌ ์ž‘์—…์ž ํ•จ์ˆ˜ (๋ณ‘๋ ฌ ์ฒ˜๋ฆฌ์šฉ)
459
  def process_page(page_num):
460
  try:
461
  page = doc[page_num]
462
-
463
- # ์ด๋ฏธ์ง€๋กœ ๋ณ€ํ™˜ ์‹œ ๋งคํŠธ๋ฆญ์Šค ์Šค์ผ€์ผ๋ง ์ ์šฉ (์„ฑ๋Šฅ ์ตœ์ ํ™”)
464
  pix = page.get_pixmap(matrix=fitz.Matrix(scale_factor, scale_factor))
465
-
466
- # JPEG ํ˜•์‹์œผ๋กœ ์ธ์ฝ”๋”ฉ (PNG๋ณด๋‹ค ํฌ๊ธฐ ์ž‘์Œ)
467
  img_data = pix.tobytes("jpeg", jpeg_quality)
468
  b64_img = base64.b64encode(img_data).decode('utf-8')
469
  img_src = f"data:image/jpeg;base64,{b64_img}"
470
 
471
- # ์ธ๋„ค์ผ (์ฒซ ํŽ˜์ด์ง€๊ฐ€ ์•„๋‹ˆ๋ฉด ๋นˆ ๋ฌธ์ž์—ด)
472
  thumb_src = "" if page_num > 0 else pdf_cache[pdf_name]["pages"][0]["thumb"]
473
 
474
  return {
@@ -477,7 +483,7 @@ async def cache_pdf(pdf_path: str):
477
  "thumb": thumb_src
478
  }
479
  except Exception as e:
480
- logger.error(f"ํŽ˜์ด์ง€ {page_num} ์ฒ˜๋ฆฌ ์˜ค๋ฅ˜: {e}")
481
  return {
482
  "page_num": page_num,
483
  "src": "",
@@ -485,22 +491,19 @@ async def cache_pdf(pdf_path: str):
485
  "error": str(e)
486
  }
487
 
488
- # ๋ณ‘๋ ฌ ์ฒ˜๋ฆฌ๋กœ ๋ชจ๋“  ํŽ˜์ด์ง€ ์ฒ˜๋ฆฌ
489
  pages = [None] * total_pages
490
  processed_count = 0
491
 
492
- # ํŽ˜์ด์ง€ ๋ฐฐ์น˜ ์ฒ˜๋ฆฌ (๋ฉ”๋ชจ๋ฆฌ ๊ด€๋ฆฌ)
493
- batch_size = 5 # ํ•œ ๋ฒˆ์— ์ฒ˜๋ฆฌํ•  ํŽ˜์ด์ง€ ์ˆ˜
494
 
495
  for batch_start in range(0, total_pages, batch_size):
496
  batch_end = min(batch_start + batch_size, total_pages)
497
  current_batch = list(range(batch_start, batch_end))
498
 
499
- # ๋ณ‘๋ ฌ ์ฒ˜๋ฆฌ๋กœ ๋ฐฐ์น˜ ํŽ˜์ด์ง€ ๋ Œ๋”๋ง
500
  with concurrent.futures.ThreadPoolExecutor(max_workers=min(5, batch_size)) as executor:
501
  batch_results = list(executor.map(process_page, current_batch))
502
 
503
- # ๊ฒฐ๊ณผ ์ €์žฅ
504
  for result in batch_results:
505
  page_num = result["page_num"]
506
  pages[page_num] = {
@@ -512,7 +515,6 @@ async def cache_pdf(pdf_path: str):
512
  progress = round(processed_count / total_pages * 100)
513
  pdf_cache[pdf_name]["progress"] = progress
514
 
515
- # ์ค‘๊ฐ„ ์ €์žฅ
516
  pdf_cache[pdf_name]["pages"] = pages
517
  try:
518
  with open(cache_path, "w") as cache_file:
@@ -523,9 +525,8 @@ async def cache_pdf(pdf_path: str):
523
  "total_pages": total_pages
524
  }, cache_file)
525
  except Exception as e:
526
- logger.error(f"์ค‘๊ฐ„ ์บ์‹œ ์ €์žฅ ์‹คํŒจ: {e}")
527
 
528
- # ์บ์‹ฑ ์™„๋ฃŒ
529
  pdf_cache[pdf_name] = {
530
  "status": "completed",
531
  "progress": 100,
@@ -533,74 +534,66 @@ async def cache_pdf(pdf_path: str):
533
  "total_pages": total_pages
534
  }
535
 
536
- # ์ตœ์ข… ์บ์‹œ ํŒŒ์ผ ์ €์žฅ
537
  try:
538
  with open(cache_path, "w") as cache_file:
539
  json.dump(pdf_cache[pdf_name], cache_file)
540
- logger.info(f"PDF {pdf_name} ์บ์‹ฑ ์™„๋ฃŒ, {total_pages}ํŽ˜์ด์ง€")
541
  except Exception as e:
542
- logger.error(f"์ตœ์ข… ์บ์‹œ ์ €์žฅ ์‹คํŒจ: {e}")
543
 
544
  except Exception as e:
545
  import traceback
546
- logger.error(f"PDF ์บ์‹ฑ ์˜ค๋ฅ˜: {str(e)}\n{traceback.format_exc()}")
547
  if pdf_name in pdf_cache:
548
  pdf_cache[pdf_name]["status"] = "error"
549
  pdf_cache[pdf_name]["error"] = str(e)
550
 
551
- # PDF ID๋กœ PDF ๊ฒฝ๋กœ ์ฐพ๊ธฐ (๊ฐœ์„ ๋œ ๊ฒ€์ƒ‰ ๋กœ์ง)
 
552
  def get_pdf_path_by_id(pdf_id: str) -> str:
553
- logger.info(f"PDF ID๋กœ ํŒŒ์ผ ์กฐํšŒ: {pdf_id}")
554
 
555
- # 1. ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ์—์„œ ์ง์ ‘ ID๋กœ ๊ฒ€์ƒ‰
556
  if pdf_id in pdf_metadata:
557
  path = pdf_metadata[pdf_id]
558
- # ํŒŒ์ผ ์กด์žฌ ํ™•์ธ
559
  if os.path.exists(path):
560
  return path
561
 
562
- # ํŒŒ์ผ์ด ์ด๋™ํ–ˆ์„ ์ˆ˜ ์žˆ์œผ๋ฏ€๋กœ ํŒŒ์ผ๋ช…์œผ๋กœ ๊ฒ€์ƒ‰
563
  filename = os.path.basename(path)
564
 
565
- # ์˜๊ตฌ ์ €์žฅ์†Œ์—์„œ ๊ฒ€์ƒ‰
566
  perm_path = PERMANENT_PDF_DIR / filename
567
  if perm_path.exists():
568
- # ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ ์—…๋ฐ์ดํŠธ
569
  pdf_metadata[pdf_id] = str(perm_path)
570
  save_pdf_metadata()
571
  return str(perm_path)
572
-
573
- # ๋ฉ”์ธ ๋””๋ ‰ํ† ๋ฆฌ์—์„œ ๊ฒ€์ƒ‰
574
  main_path = PDF_DIR / filename
575
  if main_path.exists():
576
- # ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ ์—…๋ฐ์ดํŠธ
577
  pdf_metadata[pdf_id] = str(main_path)
578
  save_pdf_metadata()
579
  return str(main_path)
580
 
581
- # 2. ํŒŒ์ผ๋ช… ๋ถ€๋ถ„๋งŒ ์ถ”์ถœํ•˜์—ฌ ๋ชจ๋“  PDF ํŒŒ์ผ ๊ฒ€์ƒ‰
582
  try:
583
- # ID ํ˜•์‹: filename_timestamp_random
584
- # ํŒŒ์ผ๋ช… ๋ถ€๋ถ„๋งŒ ์ถ”์ถœ
585
  name_part = pdf_id.split('_')[0] if '_' in pdf_id else pdf_id
586
 
587
- # ๋ชจ๋“  PDF ํŒŒ์ผ ๊ฒ€์ƒ‰
588
  for file_path in get_pdf_files() + get_permanent_pdf_files():
589
- # ํŒŒ์ผ๋ช…์ด ID์˜ ์‹œ์ž‘ ๋ถ€๋ถ„๊ณผ ์ผ์น˜ํ•˜๋ฉด
590
  file_basename = os.path.basename(file_path)
591
  if file_basename.startswith(name_part) or file_path.stem.startswith(name_part):
592
- # ID ๋งคํ•‘ ์—…๋ฐ์ดํŠธ
593
  pdf_metadata[pdf_id] = str(file_path)
594
  save_pdf_metadata()
595
  return str(file_path)
596
  except Exception as e:
597
- logger.error(f"ํŒŒ์ผ๋ช… ๊ฒ€์ƒ‰ ์ค‘ ์˜ค๋ฅ˜: {e}")
598
 
599
- # 3. ๋ชจ๋“  PDF ํŒŒ์ผ์— ๋Œ€ํ•ด ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ ํ™•์ธ
600
  for pid, path in pdf_metadata.items():
601
  if os.path.exists(path):
602
  file_basename = os.path.basename(path)
603
- # ์œ ์‚ฌํ•œ ํŒŒ์ผ๋ช…์„ ๊ฐ€์ง„ ๊ฒฝ์šฐ
604
  if pdf_id in pid or pid in pdf_id:
605
  pdf_metadata[pdf_id] = path
606
  save_pdf_metadata()
@@ -608,28 +601,22 @@ def get_pdf_path_by_id(pdf_id: str) -> str:
608
 
609
  return None
610
 
611
- # ์‹œ์ž‘ ์‹œ ๋ชจ๋“  PDF ํŒŒ์ผ ์บ์‹ฑ
 
612
  async def init_cache_all_pdfs():
613
- logger.info("PDF ์บ์‹ฑ ์ž‘์—… ์‹œ์ž‘")
614
-
615
- # PDF ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ ๋กœ๋“œ
616
  load_pdf_metadata()
617
 
618
- # ๋ฉ”์ธ ๋ฐ ์˜๊ตฌ ๋””๋ ‰ํ† ๋ฆฌ์—์„œ PDF ํŒŒ์ผ ๋ชจ๋‘ ๊ฐ€์ ธ์˜ค๊ธฐ
619
  pdf_files = get_pdf_files() + get_permanent_pdf_files()
620
-
621
- # ์ค‘๋ณต ์ œ๊ฑฐ
622
  unique_pdf_paths = set(str(p) for p in pdf_files)
623
  pdf_files = [pathlib.Path(p) for p in unique_pdf_paths]
624
 
625
- # ํŒŒ์ผ ๊ธฐ๋ฐ˜ ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ ์—…๋ฐ์ดํŠธ
626
  for pdf_file in pdf_files:
627
- # ID๊ฐ€ ์—†๋Š” ํŒŒ์ผ์— ๋Œ€ํ•ด ID ์ƒ์„ฑ
628
  found = False
629
  for pid, path in pdf_metadata.items():
630
  if os.path.basename(path) == pdf_file.name:
631
  found = True
632
- # ๊ฒฝ๋กœ ์—…๋ฐ์ดํŠธ ํ•„์š”ํ•œ ๊ฒฝ์šฐ
633
  if not os.path.exists(path):
634
  pdf_metadata[pid] = str(pdf_file)
635
  break
@@ -638,10 +625,9 @@ async def init_cache_all_pdfs():
638
  pdf_id = generate_pdf_id(pdf_file.name)
639
  pdf_metadata[pdf_id] = str(pdf_file)
640
 
641
- # ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ ์ €์žฅ
642
  save_pdf_metadata()
643
 
644
- # ์ด๋ฏธ ์บ์‹œ๋œ PDF ํŒŒ์ผ ๋กœ๋“œ (๋น ๋ฅธ ์‹œ์ž‘์„ ์œ„ํ•ด ๋จผ์ € ์ˆ˜ํ–‰)
645
  for cache_file in CACHE_DIR.glob("*_cache.json"):
646
  try:
647
  pdf_name = cache_file.stem.replace("_cache", "")
@@ -650,69 +636,67 @@ async def init_cache_all_pdfs():
650
  if cached_data.get("status") == "completed" and cached_data.get("pages"):
651
  pdf_cache[pdf_name] = cached_data
652
  pdf_cache[pdf_name]["status"] = "completed"
653
- logger.info(f"๊ธฐ์กด ์บ์‹œ ๋กœ๋“œ: {pdf_name}")
654
  except Exception as e:
655
- logger.error(f"์บ์‹œ ํŒŒ์ผ ๋กœ๋“œ ์˜ค๋ฅ˜: {str(e)}")
656
 
657
- # ์บ์‹ฑ๋˜์ง€ ์•Š์€ PDF ํŒŒ์ผ ๋ณ‘๋ ฌ ์ฒ˜๋ฆฌ
658
- await asyncio.gather(*[asyncio.create_task(cache_pdf(str(pdf_file)))
659
- for pdf_file in pdf_files
660
- if pdf_file.stem not in pdf_cache
661
- or pdf_cache[pdf_file.stem].get("status") != "completed"])
 
 
662
 
663
- # ๋ฐฑ๊ทธ๋ผ์šด๋“œ ์ž‘์—… ์‹œ์ž‘ ํ•จ์ˆ˜
664
  @app.on_event("startup")
665
  async def startup_event():
666
- # PDF ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ ๋กœ๋“œ
667
  load_pdf_metadata()
668
 
669
- # ๋ˆ„๋ฝ๋œ PDF ํŒŒ์ผ์— ๋Œ€ํ•œ ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ ์ƒ์„ฑ
670
  for pdf_file in get_pdf_files() + get_permanent_pdf_files():
671
  found = False
672
  for pid, path in pdf_metadata.items():
673
  if os.path.basename(path) == pdf_file.name:
674
  found = True
675
- # ๊ฒฝ๋กœ ์—…๋ฐ์ดํŠธ
676
  if not os.path.exists(path):
677
  pdf_metadata[pid] = str(pdf_file)
678
  break
679
 
680
  if not found:
681
- # ์ƒˆ ID ์ƒ์„ฑ ๋ฐ ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ์— ์ถ”๊ฐ€
682
  pdf_id = generate_pdf_id(pdf_file.name)
683
  pdf_metadata[pdf_id] = str(pdf_file)
684
 
685
- # ๋ณ€๊ฒฝ์‚ฌํ•ญ ์ €์žฅ
686
  save_pdf_metadata()
687
 
688
- # ๋ฐฑ๊ทธ๋ผ์šด๋“œ ํƒœ์Šคํฌ๋กœ ์บ์‹ฑ ์‹คํ–‰
689
  asyncio.create_task(init_cache_all_pdfs())
690
 
691
- # API ์—”๋“œํฌ์ธํŠธ: PDF ํ”„๋กœ์ ํŠธ ๋ชฉ๋ก
 
692
  @app.get("/api/pdf-projects")
693
  async def get_pdf_projects_api():
694
  return generate_pdf_projects()
695
 
696
- # API ์—”๋“œํฌ์ธํŠธ: ์˜๊ตฌ ์ €์žฅ๋œ PDF ํ”„๋กœ์ ํŠธ ๋ชฉ๋ก
 
697
  @app.get("/api/permanent-pdf-projects")
698
  async def get_permanent_pdf_projects_api():
699
  pdf_files = get_permanent_pdf_files()
700
  projects_data = []
701
 
702
  for pdf_file in pdf_files:
703
- # PDF ID ์ฐพ๊ธฐ
704
  pdf_id = None
705
  for pid, path in pdf_metadata.items():
706
  if os.path.basename(path) == pdf_file.name:
707
  pdf_id = pid
708
  break
709
 
710
- # ID๊ฐ€ ์—†์œผ๋ฉด ์ƒ์„ฑ
711
  if not pdf_id:
712
  pdf_id = generate_pdf_id(pdf_file.name)
713
  pdf_metadata[pdf_id] = str(pdf_file)
714
  save_pdf_metadata()
715
-
716
  projects_data.append({
717
  "path": str(pdf_file),
718
  "name": pdf_file.stem,
@@ -722,7 +706,8 @@ async def get_permanent_pdf_projects_api():
722
 
723
  return projects_data
724
 
725
- # API ์—”๋“œํฌ์ธํŠธ: PDF ID๋กœ ์ •๋ณด ๊ฐ€์ ธ์˜ค๊ธฐ
 
726
  @app.get("/api/pdf-info-by-id/{pdf_id}")
727
  async def get_pdf_info_by_id(pdf_id: str):
728
  pdf_path = get_pdf_path_by_id(pdf_id)
@@ -735,40 +720,42 @@ async def get_pdf_info_by_id(pdf_id: str):
735
  "exists": True,
736
  "cached": pdf_file.stem in pdf_cache and pdf_cache[pdf_file.stem].get("status") == "completed"
737
  }
738
- return {"exists": False, "error": "PDF๋ฅผ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค"}
 
739
 
740
- # API ์—”๋“œํฌ์ธํŠธ: PDF ์ธ๋„ค์ผ ์ œ๊ณต (์ตœ์ ํ™”)
741
  @app.get("/api/pdf-thumbnail")
742
  async def get_pdf_thumbnail(path: str):
743
  try:
744
  pdf_file = pathlib.Path(path)
745
  pdf_name = pdf_file.stem
746
 
747
- # ์บ์‹œ์—์„œ ์ธ๋„ค์ผ ๊ฐ€์ ธ์˜ค๊ธฐ
748
  if pdf_name in pdf_cache and pdf_cache[pdf_name].get("pages"):
749
  if pdf_cache[pdf_name]["pages"][0].get("thumb"):
750
  return {"thumbnail": pdf_cache[pdf_name]["pages"][0]["thumb"]}
751
-
752
- # ์บ์‹œ์— ์—†์œผ๋ฉด ์ƒ์„ฑ (๋” ์ž‘๊ณ  ๋น ๋ฅธ ์ธ๋„ค์ผ)
753
  import fitz
754
  doc = fitz.open(path)
755
  if doc.page_count > 0:
756
  page = doc[0]
757
- pix = page.get_pixmap(matrix=fitz.Matrix(0.2, 0.2)) # ๋” ์ž‘์€ ์ธ๋„ค์ผ
758
- img_data = pix.tobytes("jpeg", 70) # JPEG ์••์ถ• ์‚ฌ์šฉ
759
  b64_img = base64.b64encode(img_data).decode('utf-8')
760
 
761
- # ๋ฐฑ๊ทธ๋ผ์šด๋“œ์—์„œ ์บ์‹ฑ ์‹œ์ž‘
762
  asyncio.create_task(cache_pdf(path))
763
 
764
  return {"thumbnail": f"data:image/jpeg;base64,{b64_img}"}
765
 
766
  return {"thumbnail": None}
767
  except Exception as e:
768
- logger.error(f"์ธ๋„ค์ผ ์ƒ์„ฑ ์˜ค๋ฅ˜: {str(e)}")
769
  return {"error": str(e), "thumbnail": None}
770
 
771
- # API ์—”๋“œํฌ์ธํŠธ: ์บ์‹œ ์ƒํƒœ ํ™•์ธ
 
772
  @app.get("/api/cache-status")
773
  async def get_cache_status(path: str = None):
774
  if path:
@@ -778,23 +765,24 @@ async def get_cache_status(path: str = None):
778
  return pdf_cache[pdf_name]
779
  return {"status": "not_cached"}
780
  else:
781
- return {name: {"status": info["status"], "progress": info.get("progress", 0)}
782
- for name, info in pdf_cache.items()}
 
 
 
783
 
784
- # API ์—”๋“œํฌ์ธํŠธ: PDF์— ๋Œ€ํ•œ ์งˆ์˜์‘๋‹ต
785
  @app.post("/api/ai/query-pdf/{pdf_id}")
786
  async def api_query_pdf(pdf_id: str, query: Dict[str, str]):
787
  try:
788
  user_query = query.get("query", "")
789
  if not user_query:
790
- return JSONResponse(content={"error": "์งˆ๋ฌธ์ด ์ œ๊ณต๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค"}, status_code=400)
791
 
792
- # PDF ๊ฒฝ๋กœ ํ™•์ธ
793
  pdf_path = get_pdf_path_by_id(pdf_id)
794
  if not pdf_path:
795
- return JSONResponse(content={"error": f"PDF ID {pdf_id}์— ํ•ด๋‹นํ•˜๋Š” ํŒŒ์ผ์„ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค"}, status_code=404)
796
 
797
- # ์งˆ์˜์‘๋‹ต ์ฒ˜๋ฆฌ
798
  result = await query_pdf(pdf_id, user_query)
799
 
800
  if "error" in result:
@@ -802,19 +790,18 @@ async def api_query_pdf(pdf_id: str, query: Dict[str, str]):
802
 
803
  return result
804
  except Exception as e:
805
- logger.error(f"์งˆ์˜์‘๋‹ต API ์˜ค๋ฅ˜: {e}")
806
  return JSONResponse(content={"error": str(e)}, status_code=500)
807
 
808
- # API ์—”๋“œํฌ์ธํŠธ: PDF ์š”์•ฝ
 
809
  @app.get("/api/ai/summarize-pdf/{pdf_id}")
810
  async def api_summarize_pdf(pdf_id: str):
811
  try:
812
- # PDF ๊ฒฝ๋กœ ํ™•์ธ
813
  pdf_path = get_pdf_path_by_id(pdf_id)
814
  if not pdf_path:
815
- return JSONResponse(content={"error": f"PDF ID {pdf_id}์— ํ•ด๋‹นํ•˜๋Š” ํŒŒ์ผ์„ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค"}, status_code=404)
816
 
817
- # ์š”์•ฝ ์ฒ˜๋ฆฌ
818
  result = await summarize_pdf(pdf_id)
819
 
820
  if "error" in result:
@@ -822,124 +809,114 @@ async def api_summarize_pdf(pdf_id: str):
822
 
823
  return result
824
  except Exception as e:
825
- logger.error(f"PDF ์š”์•ฝ API ์˜ค๋ฅ˜: {e}")
826
  return JSONResponse(content={"error": str(e)}, status_code=500)
827
 
828
- # API ์—”๋“œํฌ์ธํŠธ: ์บ์‹œ๋œ PDF ์ฝ˜ํ…์ธ  ์ œ๊ณต (์ ์ง„์  ๋กœ๋”ฉ ์ง€์›)
 
829
  @app.get("/api/cached-pdf")
830
  async def get_cached_pdf(path: str, background_tasks: BackgroundTasks):
831
  try:
832
  pdf_file = pathlib.Path(path)
833
  pdf_name = pdf_file.stem
834
 
835
- # ์บ์‹œ ํ™•์ธ
836
  if pdf_name in pdf_cache:
837
  status = pdf_cache[pdf_name].get("status", "")
838
 
839
- # ์™„๋ฃŒ๋œ ๊ฒฝ์šฐ ์ „์ฒด ๋ฐ์ดํ„ฐ ๋ฐ˜ํ™˜
840
  if status == "completed":
841
  return pdf_cache[pdf_name]
842
-
843
- # ์ฒ˜๋ฆฌ ์ค‘์ธ ๊ฒฝ์šฐ ํ˜„์žฌ๊นŒ์ง€์˜ ํŽ˜์ด์ง€ ๋ฐ์ดํ„ฐ ํฌํ•จ (์ ์ง„์  ๋กœ๋”ฉ)
844
  elif status == "processing":
845
  progress = pdf_cache[pdf_name].get("progress", 0)
846
  pages = pdf_cache[pdf_name].get("pages", [])
847
  total_pages = pdf_cache[pdf_name].get("total_pages", 0)
848
 
849
- # ์ผ๋ถ€๋งŒ ์ฒ˜๋ฆฌ๋œ ๊ฒฝ์šฐ์—๋„ ์‚ฌ์šฉ ๊ฐ€๋Šฅํ•œ ํŽ˜์ด์ง€ ์ œ๊ณต
850
  return {
851
- "status": "processing",
852
  "progress": progress,
853
  "pages": pages,
854
  "total_pages": total_pages,
855
  "available_pages": len([p for p in pages if p and p.get("src")])
856
  }
857
 
858
- # ์บ์‹œ๊ฐ€ ์—†๋Š” ๊ฒฝ์šฐ ๋ฐฑ๊ทธ๋ผ์šด๋“œ์—์„œ ์บ์‹ฑ ์‹œ์ž‘
859
  background_tasks.add_task(cache_pdf, path)
860
  return {"status": "started", "progress": 0}
861
 
862
  except Exception as e:
863
- logger.error(f"์บ์‹œ๋œ PDF ์ œ๊ณต ์˜ค๋ฅ˜: {str(e)}")
864
  return {"error": str(e), "status": "error"}
865
 
866
- # API ์—”๋“œํฌ์ธํŠธ: PDF ์›๋ณธ ์ฝ˜ํ…์ธ  ์ œ๊ณต(์บ์‹œ๊ฐ€ ์—†๋Š” ๊ฒฝ์šฐ)
 
867
  @app.get("/api/pdf-content")
868
  async def get_pdf_content(path: str, background_tasks: BackgroundTasks):
869
  try:
870
- # ์บ์‹ฑ ์ƒํƒœ ํ™•์ธ
871
  pdf_file = pathlib.Path(path)
872
  if not pdf_file.exists():
873
- return JSONResponse(content={"error": f"ํŒŒ์ผ์„ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค: {path}"}, status_code=404)
874
 
875
  pdf_name = pdf_file.stem
876
 
877
- # ์บ์‹œ๋œ ๊ฒฝ์šฐ ๋ฆฌ๋‹ค์ด๋ ‰ํŠธ
878
- if pdf_name in pdf_cache and (pdf_cache[pdf_name].get("status") == "completed"
879
- or (pdf_cache[pdf_name].get("status") == "processing"
880
- and pdf_cache[pdf_name].get("progress", 0) > 10)):
 
 
 
 
881
  return JSONResponse(content={"redirect": f"/api/cached-pdf?path={path}"})
882
 
883
- # ํŒŒ์ผ ์ฝ๊ธฐ
884
- with open(path, "rb") as pdf_file:
885
- content = pdf_file.read()
886
-
887
- # ํŒŒ์ผ๋ช… ์ฒ˜๋ฆฌ
888
  import urllib.parse
889
  filename = pdf_file.name
890
  encoded_filename = urllib.parse.quote(filename)
891
 
892
- # ๋ฐฑ๊ทธ๋ผ์šด๋“œ์—์„œ ์บ์‹ฑ ์‹œ์ž‘
893
  background_tasks.add_task(cache_pdf, path)
894
 
895
- # ์‘๋‹ต ํ—ค๋” ์„ค์ •
896
  headers = {
897
  "Content-Type": "application/pdf",
898
- "Content-Disposition": f"inline; filename=\"{encoded_filename}\"; filename*=UTF-8''{encoded_filename}"
899
  }
900
 
901
  return Response(content=content, media_type="application/pdf", headers=headers)
902
  except Exception as e:
903
  import traceback
904
  error_details = traceback.format_exc()
905
- logger.error(f"PDF ์ฝ˜ํ…์ธ  ๋กœ๋“œ ์˜ค๋ฅ˜: {str(e)}\n{error_details}")
906
  return JSONResponse(content={"error": str(e)}, status_code=500)
907
 
908
- # PDF ์—…๋กœ๋“œ ์—”๋“œํฌ์ธํŠธ - ์˜๊ตฌ ์ €์žฅ์†Œ์— ์ €์žฅ ๋ฐ ๋ฉ”์ธ ํ™”๋ฉด์— ์ž๋™ ํ‘œ์‹œ
 
909
  @app.post("/api/upload-pdf")
910
  async def upload_pdf(file: UploadFile = File(...)):
911
  try:
912
- # ํŒŒ์ผ ์ด๋ฆ„ ํ™•์ธ
913
  if not file.filename.lower().endswith('.pdf'):
914
- return JSONResponse(
915
- content={"success": False, "message": "PDF ํŒŒ์ผ๋งŒ ์—…๋กœ๋“œ ๊ฐ€๋Šฅํ•ฉ๋‹ˆ๋‹ค"},
916
- status_code=400
917
- )
918
 
919
- # ์˜๊ตฌ ์ €์žฅ์†Œ์— ํŒŒ์ผ ์ €์žฅ
920
  file_path = PERMANENT_PDF_DIR / file.filename
921
 
922
- # ํŒŒ์ผ ์ฝ๊ธฐ ๋ฐ ์ €์žฅ
923
  content = await file.read()
924
  with open(file_path, "wb") as buffer:
925
  buffer.write(content)
926
 
927
- # ๋ฉ”์ธ ๋””๋ ‰ํ† ๋ฆฌ์—๋„ ์ž๋™์œผ๋กœ ๋ณต์‚ฌ (์ž๋™ ํ‘œ์‹œ)
928
  with open(PDF_DIR / file.filename, "wb") as buffer:
929
  buffer.write(content)
930
 
931
- # PDF ID ์ƒ์„ฑ ๋ฐ ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ ์ €์žฅ
932
  pdf_id = generate_pdf_id(file.filename)
933
  pdf_metadata[pdf_id] = str(file_path)
934
  save_pdf_metadata()
935
 
936
- # ๋ฐฑ๊ทธ๋ผ์šด๋“œ์—์„œ ์บ์‹ฑ ์‹œ์ž‘
937
  asyncio.create_task(cache_pdf(str(file_path)))
938
 
939
  return JSONResponse(
940
  content={
941
- "success": True,
942
- "path": str(file_path),
943
  "name": file_path.stem,
944
  "id": pdf_id,
945
  "viewUrl": f"/view/{pdf_id}"
@@ -949,48 +926,39 @@ async def upload_pdf(file: UploadFile = File(...)):
949
  except Exception as e:
950
  import traceback
951
  error_details = traceback.format_exc()
952
- logger.error(f"PDF ์—…๋กœ๋“œ ์˜ค๋ฅ˜: {str(e)}\n{error_details}")
953
- return JSONResponse(
954
- content={"success": False, "message": str(e)},
955
- status_code=500
956
- )
957
 
958
- # ํ…์ŠคํŠธ ํŒŒ์ผ์„ PDF๋กœ ๋ณ€ํ™˜ํ•˜๋Š” ํ•จ์ˆ˜
 
959
  async def convert_text_to_pdf(text_content: str, title: str) -> str:
960
  try:
961
- # ์ œ๋ชฉ์—์„œ ์œ ํšจํ•œ ํŒŒ์ผ๋ช… ์ƒ์„ฑ
962
  import re
963
  safe_title = re.sub(r'[^\w\-_\. ]', '_', title)
964
  if not safe_title:
965
  safe_title = "aibook"
966
 
967
- # ํƒ€์ž„์Šคํƒฌํ”„ ์ถ”๊ฐ€๋กœ ๊ณ ์œ ํ•œ ํŒŒ์ผ๋ช… ์ƒ์„ฑ
968
  timestamp = int(time.time())
969
  filename = f"{safe_title}_{timestamp}.pdf"
970
 
971
- # ์˜๊ตฌ ์ €์žฅ์†Œ์˜ ํŒŒ์ผ ๊ฒฝ๋กœ
972
  file_path = PERMANENT_PDF_DIR / filename
973
 
974
- # ํ•œ๊ธ€ ํฐํŠธ ๋“ฑ๋ก - ์—…๋กœ๋“œ๋œ MaruBuri-SemiBold.ttf ์‚ฌ์šฉ
975
  from reportlab.pdfbase import pdfmetrics
976
  from reportlab.pdfbase.ttfonts import TTFont
977
 
978
- # ํฐํŠธ ๊ฒฝ๋กœ ์„ค์ • (app.py์™€ ๊ฐ™์€ ๋””๋ ‰ํ† ๋ฆฌ์— ์žˆ๋Š” ํฐํŠธ ์‚ฌ์šฉ)
979
  font_path = BASE / "MaruBuri-SemiBold.ttf"
980
 
981
- # ํฐํŠธ ๋“ฑ๋ก
982
  font_name = "MaruBuri"
983
  if font_path.exists():
984
  pdfmetrics.registerFont(TTFont(font_name, str(font_path)))
985
- logger.info(f"ํ•œ๊ธ€ ํฐํŠธ ๋“ฑ๋ก ์„ฑ๊ณต: {font_path}")
986
  else:
987
  font_name = "Helvetica"
988
- logger.warning(f"ํ•œ๊ธ€ ํฐํŠธ ํŒŒ์ผ์„ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค: {font_path}. ๊ธฐ๋ณธ ํฐํŠธ๋ฅผ ์‚ฌ์šฉํ•ฉ๋‹ˆ๋‹ค.")
989
 
990
- # ์ž„์‹œ PDF ํŒŒ์ผ ์ƒ์„ฑ
991
  pdf_buffer = io.BytesIO()
992
 
993
- # ํ•œ๊ธ€ ์ง€์›์„ ์œ„ํ•œ ์Šคํƒ€์ผ ์„ค์ •
994
  from reportlab.lib.pagesizes import letter
995
  from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
996
  from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
@@ -998,7 +966,6 @@ async def convert_text_to_pdf(text_content: str, title: str) -> str:
998
 
999
  doc = SimpleDocTemplate(pdf_buffer, pagesize=letter, encoding='utf-8')
1000
 
1001
- # ์‚ฌ์šฉ์ž ์ •์˜ ์Šคํƒ€์ผ ์ƒ์„ฑ
1002
  title_style = ParagraphStyle(
1003
  name='CustomTitle',
1004
  fontName=font_name,
@@ -1018,41 +985,34 @@ async def convert_text_to_pdf(text_content: str, title: str) -> str:
1018
  spaceAfter=6
1019
  )
1020
 
1021
- # ๋‚ด์šฉ์„ ๋ฌธ๋‹จ์œผ๋กœ ๋ถ„ํ• 
1022
  content = []
1023
 
1024
- # ์ œ๋ชฉ ์ถ”๊ฐ€
1025
  content.append(Paragraph(title, title_style))
1026
  content.append(Spacer(1, 20))
1027
 
1028
- # ํ…์ŠคํŠธ๋ฅผ ๋‹จ๋ฝ์œผ๋กœ ๋ถ„๋ฆฌํ•˜์—ฌ ์ถ”๊ฐ€
1029
  paragraphs = text_content.split('\n\n')
1030
  for para in paragraphs:
1031
  if para.strip():
1032
- # XML ํŠน์ˆ˜๋ฌธ์ž ์ด์Šค์ผ€์ดํ”„ ์ฒ˜๋ฆฌ
1033
  from xml.sax.saxutils import escape
1034
  safe_para = escape(para.replace('\n', '<br/>'))
1035
  p = Paragraph(safe_para, normal_style)
1036
  content.append(p)
1037
  content.append(Spacer(1, 10))
1038
 
1039
- # PDF ์ƒ์„ฑ
1040
  doc.build(content)
1041
 
1042
- # ํŒŒ์ผ๋กœ ์ €์žฅ
1043
  with open(file_path, 'wb') as f:
1044
  f.write(pdf_buffer.getvalue())
1045
 
1046
- # ๋ฉ”์ธ ๋””๋ ‰ํ† ๋ฆฌ์—๋„ ๋ณต์‚ฌ
1047
  with open(PDF_DIR / filename, 'wb') as f:
1048
  f.write(pdf_buffer.getvalue())
1049
 
1050
- # PDF ID ์ƒ์„ฑ ๋ฐ ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ ์ €์žฅ
1051
  pdf_id = generate_pdf_id(filename)
1052
  pdf_metadata[pdf_id] = str(file_path)
1053
  save_pdf_metadata()
1054
 
1055
- # ๋ฐฑ๊ทธ๋ผ์šด๋“œ์—์„œ ์บ์‹ฑ ์‹œ์ž‘
1056
  asyncio.create_task(cache_pdf(str(file_path)))
1057
 
1058
  return {
@@ -1062,77 +1022,68 @@ async def convert_text_to_pdf(text_content: str, title: str) -> str:
1062
  }
1063
 
1064
  except Exception as e:
1065
- logger.error(f"ํ…์ŠคํŠธ๋ฅผ PDF๋กœ ๋ณ€ํ™˜ ์ค‘ ์˜ค๋ฅ˜: {e}")
1066
  raise e
1067
 
1068
 
1069
- # AI๋ฅผ ์‚ฌ์šฉํ•˜์—ฌ ํ…์ŠคํŠธ๋ฅผ ๋” ๊ตฌ์กฐํ™”๋œ ํ˜•์‹์œผ๋กœ ๋ณ€ํ™˜ (OpenAI ์ œ๊ฑฐ ๋ฒ„์ „)
1070
  async def enhance_text_with_ai(text_content: str, title: str) -> str:
1071
- # ์›๋ณธ ํ…์ŠคํŠธ ๊ทธ๋Œ€๋กœ ๋ฐ˜ํ™˜ (AI ํ–ฅ์ƒ ๊ธฐ๋Šฅ ๋น„ํ™œ์„ฑํ™”)
1072
- return text_content
1073
-
1074
 
1075
 
1076
- # ํ…์ŠคํŠธ ํŒŒ์ผ์„ PDF๋กœ ๋ณ€ํ™˜ํ•˜๋Š” ์—”๋“œํฌ์ธํŠธ
1077
  @app.post("/api/text-to-pdf")
1078
  async def text_to_pdf(file: UploadFile = File(...)):
1079
  try:
1080
- # ์ง€์›ํ•˜๋Š” ํŒŒ์ผ ํ˜•์‹ ํ™•์ธ
1081
  filename = file.filename.lower()
1082
  if not (filename.endswith('.txt') or filename.endswith('.docx') or filename.endswith('.doc')):
1083
  return JSONResponse(
1084
- content={"success": False, "message": "์ง€์›ํ•˜๋Š” ํŒŒ์ผ ํ˜•์‹์€ .txt, .docx, .doc์ž…๋‹ˆ๋‹ค."},
1085
  status_code=400
1086
  )
1087
 
1088
- # ํŒŒ์ผ ๋‚ด์šฉ ์ฝ๊ธฐ
1089
  content = await file.read()
1090
 
1091
- # ํŒŒ์ผ ํƒ€์ž…์— ๋”ฐ๋ผ ํ…์ŠคํŠธ ์ถ”์ถœ
1092
  if filename.endswith('.txt'):
1093
- # ์ธ์ฝ”๋”ฉ ์ž๋™ ๊ฐ์ง€ ์‹œ๋„
1094
  encodings = ['utf-8', 'euc-kr', 'cp949', 'latin1']
1095
  text_content = None
1096
 
1097
  for encoding in encodings:
1098
  try:
1099
  text_content = content.decode(encoding, errors='strict')
1100
- logger.info(f"ํ…์ŠคํŠธ ํŒŒ์ผ ์ธ์ฝ”๋”ฉ ๊ฐ์ง€: {encoding}")
1101
  break
1102
  except UnicodeDecodeError:
1103
  continue
1104
 
1105
  if text_content is None:
1106
- # ๋ชจ๋“  ์ธ์ฝ”๋”ฉ ์‹œ๋„ ์‹คํŒจ ์‹œ ๊ธฐ๋ณธ์ ์œผ๋กœ UTF-8๋กœ ์‹œ๋„ํ•˜๊ณ  ์˜ค๋ฅ˜๋Š” ๋Œ€์ฒด ๋ฌธ์ž๋กœ ์ฒ˜๋ฆฌ
1107
  text_content = content.decode('utf-8', errors='replace')
1108
- logger.warning("ํ…์ŠคํŠธ ํŒŒ์ผ ์ธ์ฝ”๋”ฉ์„ ๊ฐ์ง€ํ•  ์ˆ˜ ์—†์–ด UTF-8์œผ๋กœ ์‹œ๋„ํ•ฉ๋‹ˆ๋‹ค.")
1109
 
1110
  elif filename.endswith('.docx') or filename.endswith('.doc'):
1111
- # ์ž„์‹œ ํŒŒ์ผ๋กœ ์ €์žฅ
1112
  with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(filename)[1]) as temp_file:
1113
  temp_file.write(content)
1114
  temp_path = temp_file.name
1115
 
1116
  try:
1117
- # docx2txt๋กœ ํ…์ŠคํŠธ ์ถ”์ถœ
1118
  text_content = docx2txt.process(temp_path)
1119
  finally:
1120
- # ์ž„์‹œ ํŒŒ์ผ ์‚ญ์ œ
1121
  os.unlink(temp_path)
1122
 
1123
- # ํŒŒ์ผ๋ช…์—์„œ ์ œ๋ชฉ ์ถ”์ถœ (ํ™•์žฅ์ž ์ œ์™ธ)
1124
  title = os.path.splitext(filename)[0]
1125
 
1126
- # AI๋กœ ํ…์ŠคํŠธ ๋‚ด์šฉ ํ–ฅ์ƒ
1127
  enhanced_text = await enhance_text_with_ai(text_content, title)
1128
 
1129
- # ํ…์ŠคํŠธ๋ฅผ PDF๋กœ ๋ณ€ํ™˜
1130
  pdf_info = await convert_text_to_pdf(enhanced_text, title)
1131
 
1132
  return JSONResponse(
1133
  content={
1134
- "success": True,
1135
- "path": pdf_info["path"],
1136
  "name": os.path.splitext(pdf_info["filename"])[0],
1137
  "id": pdf_info["id"],
1138
  "viewUrl": f"/view/{pdf_info['id']}"
@@ -1142,49 +1093,46 @@ async def text_to_pdf(file: UploadFile = File(...)):
1142
  except Exception as e:
1143
  import traceback
1144
  error_details = traceback.format_exc()
1145
- logger.error(f"ํ…์ŠคํŠธ๋ฅผ PDF๋กœ ๋ณ€ํ™˜ ์ค‘ ์˜ค๋ฅ˜: {str(e)}\n{error_details}")
1146
- return JSONResponse(
1147
- content={"success": False, "message": str(e)},
1148
- status_code=500
1149
- )
1150
 
1151
- # ๊ด€๋ฆฌ์ž ์ธ์ฆ ์—”๋“œํฌ์ธํŠธ
 
1152
  @app.post("/api/admin-login")
1153
  async def admin_login(password: str = Form(...)):
1154
  if password == ADMIN_PASSWORD:
1155
  return {"success": True}
1156
- return {"success": False, "message": "์ธ์ฆ ์‹คํŒจ"}
 
1157
 
1158
- # ๊ด€๋ฆฌ์ž์šฉ PDF ์‚ญ์ œ ์—”๋“œํฌ์ธํŠธ
1159
  @app.delete("/api/admin/delete-pdf")
1160
  async def delete_pdf(path: str):
1161
  try:
1162
  pdf_file = pathlib.Path(path)
1163
  if not pdf_file.exists():
1164
- return {"success": False, "message": "ํŒŒ์ผ์„ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค"}
1165
 
1166
- # PDF ํŒŒ์ผ๋ช… ๊ฐ€์ ธ์˜ค๊ธฐ
1167
  filename = pdf_file.name
1168
 
1169
- # PDF ํŒŒ์ผ ์‚ญ์ œ (์˜๊ตฌ ์ €์žฅ์†Œ์—์„œ)
1170
  pdf_file.unlink()
1171
 
1172
- # ๋ฉ”์ธ ๋””๋ ‰ํ† ๋ฆฌ์—์„œ๋„ ๋™์ผํ•œ ํŒŒ์ผ์ด ์žˆ์œผ๋ฉด ์‚ญ์ œ (๋ฒ„๊ทธ ์ˆ˜์ •)
1173
  main_file_path = PDF_DIR / filename
1174
  if main_file_path.exists():
1175
  main_file_path.unlink()
1176
 
1177
- # ๊ด€๋ จ ์บ์‹œ ํŒŒ์ผ ์‚ญ์ œ
1178
  pdf_name = pdf_file.stem
1179
  cache_path = get_cache_path(pdf_name)
1180
  if cache_path.exists():
1181
  cache_path.unlink()
1182
-
1183
- # ์บ์‹œ ๋ฉ”๋ชจ๋ฆฌ์—์„œ๋„ ์ œ๊ฑฐ
1184
  if pdf_name in pdf_cache:
1185
  del pdf_cache[pdf_name]
1186
-
1187
- # ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ์—์„œ ํ•ด๋‹น ํŒŒ์ผ ID ์ œ๊ฑฐ
1188
  to_remove = []
1189
  for pid, fpath in pdf_metadata.items():
1190
  if os.path.basename(fpath) == filename:
@@ -1194,30 +1142,31 @@ async def delete_pdf(path: str):
1194
  del pdf_metadata[pid]
1195
 
1196
  save_pdf_metadata()
1197
-
1198
  return {"success": True}
1199
  except Exception as e:
1200
- logger.error(f"PDF ์‚ญ์ œ ์˜ค๋ฅ˜: {str(e)}")
1201
  return {"success": False, "message": str(e)}
1202
 
1203
- # PDF๋ฅผ ๋ฉ”์ธ ๋””๋ ‰ํ† ๋ฆฌ์— ํ‘œ์‹œ ์„ค์ •
 
1204
  @app.post("/api/admin/feature-pdf")
1205
  async def feature_pdf(path: str):
1206
  try:
1207
  pdf_file = pathlib.Path(path)
1208
  if not pdf_file.exists():
1209
- return {"success": False, "message": "ํŒŒ์ผ์„ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค"}
1210
 
1211
- # ๋ฉ”์ธ ๋””๋ ‰ํ† ๋ฆฌ์— ๋ณต์‚ฌ
1212
  target_path = PDF_DIR / pdf_file.name
1213
  shutil.copy2(pdf_file, target_path)
1214
-
1215
  return {"success": True}
1216
  except Exception as e:
1217
- logger.error(f"PDF ํ‘œ์‹œ ์„ค์ • ์˜ค๋ฅ˜: {str(e)}")
1218
  return {"success": False, "message": str(e)}
1219
 
1220
- # PDF๋ฅผ ๋ฉ”์ธ ๋””๋ ‰ํ† ๋ฆฌ์—์„œ ์ œ๊ฑฐ (์˜๊ตฌ ์ €์žฅ์†Œ์—์„œ๋Š” ์œ ์ง€)
 
1221
  @app.delete("/api/admin/unfeature-pdf")
1222
  async def unfeature_pdf(path: str):
1223
  try:
@@ -1226,25 +1175,24 @@ async def unfeature_pdf(path: str):
1226
 
1227
  if target_path.exists():
1228
  target_path.unlink()
1229
-
1230
  return {"success": True}
1231
  except Exception as e:
1232
- logger.error(f"PDF ํ‘œ์‹œ ํ•ด์ œ ์˜ค๋ฅ˜: {str(e)}")
1233
  return {"success": False, "message": str(e)}
1234
 
1235
- # ์ง์ ‘ PDF ๋ทฐ์–ด URL ์ ‘๊ทผ์šฉ ๋ผ์šฐํŠธ
1236
  @app.get("/view/{pdf_id}")
1237
  async def view_pdf_by_id(pdf_id: str):
1238
- # PDF ID ์œ ํšจํ•œ์ง€ ํ™•์ธ
1239
  pdf_path = get_pdf_path_by_id(pdf_id)
1240
 
1241
  if not pdf_path:
1242
- # ์ผ๋‹จ ๋ชจ๋“  PDF ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ๋ฅผ ๋‹ค์‹œ ๋กœ๋“œํ•˜๊ณ  ์žฌ์‹œ๋„
1243
  load_pdf_metadata()
1244
  pdf_path = get_pdf_path_by_id(pdf_id)
1245
 
1246
  if not pdf_path:
1247
- # ๋ชจ๋“  PDF ํŒŒ์ผ์„ ์ง์ ‘ ์Šค์บ”ํ•˜์—ฌ ์œ ์‚ฌํ•œ ์ด๋ฆ„ ์ฐพ๊ธฐ
1248
  for file_path in get_pdf_files() + get_permanent_pdf_files():
1249
  name_part = pdf_id.split('_')[0] if '_' in pdf_id else pdf_id
1250
  if file_path.stem.startswith(name_part):
@@ -1255,14 +1203,17 @@ async def view_pdf_by_id(pdf_id: str):
1255
 
1256
  if not pdf_path:
1257
  return HTMLResponse(
1258
- content=f"<html><body><h1>PDF๋ฅผ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค</h1><p>ID: {pdf_id}</p><a href='/'>ํ™ˆ์œผ๋กœ ๋Œ์•„๊ฐ€๊ธฐ</a></body></html>",
 
 
 
1259
  status_code=404
1260
  )
1261
 
1262
- # ๋ฉ”์ธ ํŽ˜์ด์ง€๋กœ ๋ฆฌ๋‹ค์ด๋ ‰ํŠธํ•˜๋˜, PDF ID ํŒŒ๋ผ๋ฏธํ„ฐ ์ถ”๊ฐ€
1263
  return get_html_content(pdf_id=pdf_id)
1264
 
1265
- # HTML ํŒŒ์ผ ์ฝ๊ธฐ ํ•จ์ˆ˜
1266
  def get_html_content(pdf_id: str = None):
1267
  html_path = BASE / "flipbook_template.html"
1268
  content = ""
@@ -1270,59 +1221,49 @@ def get_html_content(pdf_id: str = None):
1270
  with open(html_path, "r", encoding="utf-8") as f:
1271
  content = f.read()
1272
  else:
1273
- content = HTML # ๊ธฐ๋ณธ HTML ์‚ฌ์šฉ
1274
 
1275
- # PDF ID๊ฐ€ ์ œ๊ณต๋œ ๊ฒฝ์šฐ, ์ž๋™ ๋กœ๋“œ ์Šคํฌ๋ฆฝํŠธ ์ถ”๊ฐ€
1276
  if pdf_id:
1277
  auto_load_script = f"""
1278
  <script>
1279
- // ํŽ˜์ด์ง€ ๋กœ๋“œ ์‹œ ์ž๋™์œผ๋กœ ํ•ด๋‹น PDF ์—ด๊ธฐ
1280
  document.addEventListener('DOMContentLoaded', async function() {{
1281
  try {{
1282
- // PDF ์ •๋ณด ๊ฐ€์ ธ์˜ค๊ธฐ
1283
  const response = await fetch('/api/pdf-info-by-id/{pdf_id}');
1284
  const pdfInfo = await response.json();
1285
 
1286
  if (pdfInfo.exists && pdfInfo.path) {{
1287
- // ์•ฝ๊ฐ„์˜ ์ง€์—ฐ ํ›„ PDF ๋ทฐ์–ด ์—ด๊ธฐ (UI๊ฐ€ ์ค€๋น„๋œ ํ›„)
1288
  setTimeout(() => {{
1289
  openPdfById('{pdf_id}', pdfInfo.path, pdfInfo.cached);
1290
  }}, 500);
1291
  }} else {{
1292
- showError("์š”์ฒญํ•œ PDF๋ฅผ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค.");
1293
  }}
1294
  }} catch (e) {{
1295
- console.error("์ž๋™ PDF ๋กœ๋“œ ์˜ค๋ฅ˜:", e);
1296
  }}
1297
  }});
1298
  </script>
1299
  """
1300
 
1301
- # body ์ข…๋ฃŒ ํƒœ๊ทธ ์ „์— ์Šคํฌ๋ฆฝํŠธ ์‚ฝ์ž…
1302
  content = content.replace("</body>", auto_load_script + "</body>")
1303
 
1304
  return HTMLResponse(content=content)
1305
 
 
1306
  @app.get("/", response_class=HTMLResponse)
1307
  async def root(request: Request, pdf_id: Optional[str] = Query(None)):
1308
- # PDF ID๊ฐ€ ์ฟผ๋ฆฌ ํŒŒ๋ผ๋ฏธํ„ฐ๋กœ ์ œ๊ณต๋œ ๊ฒฝ์šฐ /view/{pdf_id}๋กœ ๋ฆฌ๋‹ค์ด๋ ‰ํŠธ
1309
  if pdf_id:
1310
  return RedirectResponse(url=f"/view/{pdf_id}")
1311
  return get_html_content()
1312
 
1313
- # HTML ๋ฌธ์ž์—ด (AI ๋ฒ„ํŠผ ๋ฐ ์ฑ—๋ด‡ UI ์ถ”๊ฐ€)
1314
- # HTML ๋ฌธ์ž์—ด (AI ๋ฒ„ํŠผ ๋ฐ ์ฑ—๋ด‡ UI ์ถ”๊ฐ€)
1315
- import os
1316
 
1317
- # Hugging Face Space์˜ secret์—์„œ HTML ํ…œํ”Œ๋ฆฟ ๋กœ๋“œ
1318
  HTML = os.getenv("HTML_TEMPLATE", "")
1319
-
1320
- # HTML์ด ๋น„์–ด์žˆ์„ ๊ฒฝ์šฐ ๊ธฐ๋ณธ HTML ์‚ฌ์šฉ (fallback)
1321
  if not HTML:
1322
- logger.warning("HTML_TEMPLATE secret์ด ์„ค์ •๋˜์–ด ์žˆ์ง€ ์•Š์Šต๋‹ˆ๋‹ค. ๊ธฐ๋ณธ HTML์„ ์‚ฌ์šฉํ•ฉ๋‹ˆ๋‹ค.")
1323
  HTML = """
1324
  <!doctype html>
1325
- <html lang="ko">
1326
  <head>
1327
  <meta charset="utf-8">
1328
  <title>FlipBook Space</title>
@@ -1332,12 +1273,12 @@ if not HTML:
1332
  </style>
1333
  </head>
1334
  <body>
1335
- <h1>HTML ํ…œํ”Œ๋ฆฟ์„ ๋ถˆ๋Ÿฌ์˜ฌ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค</h1>
1336
- <p class="error">HTML_TEMPLATE secret์ด ์„ค์ •๋˜์–ด ์žˆ์ง€ ์•Š์Šต๋‹ˆ๋‹ค.</p>
1337
- <p>Hugging Face Space์˜ secret ์˜์—ญ์— HTML_TEMPLATE์„ ์„ค์ •ํ•ด์ฃผ์„ธ์š”.</p>
1338
  </body>
1339
  </html>
1340
  """
1341
 
1342
  if __name__ == "__main__":
1343
- uvicorn.run("app:app", host="0.0.0.0", port=int(os.getenv("PORT", 7860)))
 
17
  import io
18
  import docx2txt
19
 
20
+ # Logging configuration
21
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
22
  logger = logging.getLogger(__name__)
23
 
 
25
  app = FastAPI()
26
  app.mount("/static", StaticFiles(directory=BASE), name="static")
27
 
28
+ # PDF directory (main directory)
29
  PDF_DIR = BASE / "pdf"
30
  if not PDF_DIR.exists():
31
  PDF_DIR.mkdir(parents=True)
32
 
33
+ # Permanent PDF directory (Hugging Face persistent disk)
34
  PERMANENT_PDF_DIR = pathlib.Path("/data/pdfs") if os.path.exists("/data") else BASE / "permanent_pdfs"
35
  if not PERMANENT_PDF_DIR.exists():
36
  PERMANENT_PDF_DIR.mkdir(parents=True)
37
 
38
+ # Cache directory
39
  CACHE_DIR = BASE / "cache"
40
  if not CACHE_DIR.exists():
41
  CACHE_DIR.mkdir(parents=True)
42
 
43
+ # PDF metadata directory and file
44
  METADATA_DIR = pathlib.Path("/data/metadata") if os.path.exists("/data") else BASE / "metadata"
45
  if not METADATA_DIR.exists():
46
  METADATA_DIR.mkdir(parents=True)
47
  PDF_METADATA_FILE = METADATA_DIR / "pdf_metadata.json"
48
 
49
+ # Embedding cache directory
50
  EMBEDDING_DIR = pathlib.Path("/data/embeddings") if os.path.exists("/data") else BASE / "embeddings"
51
  if not EMBEDDING_DIR.exists():
52
  EMBEDDING_DIR.mkdir(parents=True)
53
 
54
+ # Admin password
55
+ ADMIN_PASSWORD = os.getenv("PASSWORD", "admin") # Retrieved from environment variable; default is for testing
56
 
57
+ # OpenAI API key
58
  OPENAI_API_KEY = os.getenv("LLM_API", "")
59
+ # Flag indicating if we have a valid API key
60
  HAS_VALID_API_KEY = bool(OPENAI_API_KEY and OPENAI_API_KEY.strip())
61
 
62
  if HAS_VALID_API_KEY:
63
  try:
64
  openai_client = OpenAI(api_key=OPENAI_API_KEY, timeout=30.0)
65
+ logger.info("OpenAI client initialized successfully.")
66
  except Exception as e:
67
+ logger.error(f"Failed to initialize OpenAI client: {e}")
68
  HAS_VALID_API_KEY = False
69
  else:
70
+ logger.warning("No valid OpenAI API key found. AI features will be limited.")
71
  openai_client = None
72
 
73
+ # Global cache object
74
  pdf_cache: Dict[str, Dict[str, Any]] = {}
75
+ # Cache locks
76
  cache_locks = {}
77
+ # PDF metadata (ID -> path)
78
  pdf_metadata: Dict[str, str] = {}
79
+ # PDF embedding cache
80
  pdf_embeddings: Dict[str, Dict[str, Any]] = {}
81
 
82
+
83
+ # Load PDF metadata from file
84
  def load_pdf_metadata():
85
  global pdf_metadata
86
  if PDF_METADATA_FILE.exists():
87
  try:
88
  with open(PDF_METADATA_FILE, "r") as f:
89
  pdf_metadata = json.load(f)
90
+ logger.info(f"PDF metadata loaded successfully: {len(pdf_metadata)} entries")
91
  except Exception as e:
92
+ logger.error(f"Error loading metadata: {e}")
93
  pdf_metadata = {}
94
  else:
95
  pdf_metadata = {}
96
 
97
+
98
+ # Save PDF metadata to file
99
  def save_pdf_metadata():
100
  try:
101
  with open(PDF_METADATA_FILE, "w") as f:
102
  json.dump(pdf_metadata, f)
103
  except Exception as e:
104
+ logger.error(f"Error saving metadata: {e}")
105
+
106
 
107
+ # Generate a PDF ID (based on filename + timestamp)
108
  def generate_pdf_id(filename: str) -> str:
 
 
 
109
  import re
110
+ base_name = os.path.splitext(filename)[0]
111
  safe_name = re.sub(r'[^\w\-_]', '_', base_name.replace(" ", "_"))
 
112
  timestamp = int(time.time())
 
113
  random_suffix = uuid.uuid4().hex[:6]
114
  return f"{safe_name}_{timestamp}_{random_suffix}"
115
 
116
+
117
+ # Retrieve list of PDF files in main directory
118
  def get_pdf_files():
119
  pdf_files = []
120
  if PDF_DIR.exists():
121
  pdf_files = [f for f in PDF_DIR.glob("*.pdf")]
122
  return pdf_files
123
 
124
+
125
+ # Retrieve list of PDF files in permanent directory
126
  def get_permanent_pdf_files():
127
  pdf_files = []
128
  if PERMANENT_PDF_DIR.exists():
129
  pdf_files = [f for f in PERMANENT_PDF_DIR.glob("*.pdf")]
130
  return pdf_files
131
 
132
+
133
+ # Generate PDF project data (thumbnails, etc.)
134
  def generate_pdf_projects():
135
  projects_data = []
136
 
137
+ # Get files from both main and permanent directories
138
  pdf_files = get_pdf_files()
139
  permanent_pdf_files = get_permanent_pdf_files()
140
 
141
+ # Combine both sets of files (remove duplicates by filename)
142
  unique_files = {}
143
 
144
+ # Add from main directory first
145
  for file in pdf_files:
146
  unique_files[file.name] = file
147
 
148
+ # Then add from permanent directory (overwrite if same filename)
149
  for file in permanent_pdf_files:
150
  unique_files[file.name] = file
151
 
 
152
  for pdf_file in unique_files.values():
153
+ # Find the PDF ID for this file
154
  pdf_id = None
155
  for pid, path in pdf_metadata.items():
156
  if os.path.basename(path) == pdf_file.name:
157
  pdf_id = pid
158
  break
159
 
160
+ # If the file has no ID, generate one and add it to metadata
161
  if not pdf_id:
162
  pdf_id = generate_pdf_id(pdf_file.name)
163
  pdf_metadata[pdf_id] = str(pdf_file)
 
172
 
173
  return projects_data
174
 
175
+
176
+ # Get path for cache file
177
  def get_cache_path(pdf_name: str):
178
  return CACHE_DIR / f"{pdf_name}_cache.json"
179
 
180
+
181
+ # Get path for embedding cache file
182
  def get_embedding_path(pdf_id: str):
183
  return EMBEDDING_DIR / f"{pdf_id}_embedding.json"
184
 
185
+
186
+ # Extract text from a PDF
187
  def extract_pdf_text(pdf_path: str) -> List[Dict[str, Any]]:
188
  try:
189
  doc = fitz.open(pdf_path)
 
193
  page = doc[page_num]
194
  text = page.get_text()
195
 
196
+ # Only add if the text is non-empty
197
  if text.strip():
198
  chunks.append({
199
  "page": page_num + 1,
 
203
 
204
  return chunks
205
  except Exception as e:
206
+ logger.error(f"Error extracting text from PDF: {e}")
207
  return []
208
 
209
+
210
+ # Get or create PDF embedding by PDF ID
211
  async def get_pdf_embedding(pdf_id: str) -> Dict[str, Any]:
212
  try:
213
+ # Check embedding cache file
214
  embedding_path = get_embedding_path(pdf_id)
215
  if embedding_path.exists():
216
  try:
217
  with open(embedding_path, "r", encoding="utf-8") as f:
218
  return json.load(f)
219
  except Exception as e:
220
+ logger.error(f"Error loading embedding cache: {e}")
221
 
222
+ # Find the actual PDF path
223
  pdf_path = get_pdf_path_by_id(pdf_id)
224
  if not pdf_path:
225
+ raise ValueError(f"Could not find a file corresponding to PDF ID {pdf_id}")
226
 
227
+ # Extract text
228
  chunks = extract_pdf_text(pdf_path)
229
  if not chunks:
230
+ raise ValueError(f"No text could be extracted from PDF: {pdf_path}")
231
 
232
+ # Here, you'd normally create or fetch embeddings. For now, we just store chunks.
233
  embedding_data = {
234
  "pdf_id": pdf_id,
235
  "pdf_path": pdf_path,
 
237
  "created_at": time.time()
238
  }
239
 
240
+ # Save embedding data to cache
241
  with open(embedding_path, "w", encoding="utf-8") as f:
242
  json.dump(embedding_data, f, ensure_ascii=False)
243
 
244
  return embedding_data
245
 
246
  except Exception as e:
247
+ logger.error(f"Error creating PDF embedding: {e}")
248
  return {"error": str(e), "pdf_id": pdf_id}
249
 
250
+
251
+ # Query a PDF using its content (simple approach)
252
  async def query_pdf(pdf_id: str, query: str) -> Dict[str, Any]:
253
  try:
254
+ # If there's no valid API key
255
  if not HAS_VALID_API_KEY or not openai_client:
256
  return {
257
+ "error": "OpenAI API key not set.",
258
+ "answer": "Sorry, the AI feature is currently disabled. Please contact the system administrator."
259
  }
260
 
261
+ # Get embedding data
262
  embedding_data = await get_pdf_embedding(pdf_id)
263
  if "error" in embedding_data:
264
  return {"error": embedding_data["error"]}
265
 
266
+ # For simplicity, gather all text from the PDF
267
  all_text = "\n\n".join([f"Page {chunk['page']}: {chunk['text']}" for chunk in embedding_data["chunks"]])
268
 
269
+ # Truncate context if too long
270
+ max_context_length = 60000 # roughly by characters
271
  if len(all_text) > max_context_length:
272
+ all_text = all_text[:max_context_length] + "...(truncated)"
273
 
274
+ # System prompt
275
  system_prompt = """
276
+ The default language is English. However, please respond in the language used in the user's prompt (e.g., English, Korean, Japanese, Chinese, etc.).
277
+ You are an assistant that answers questions based solely on the provided PDF content. Use only the information from the PDF content to respond. If the relevant information is not available in the PDF, respond with: "The requested information could not be found in the provided PDF."
278
+ Provide clear, concise answers and cite relevant page numbers. Always remain polite and courteous.
279
  """
280
 
281
+ # Attempting to call the openai_client
282
  try:
283
+ # Retry logic
284
+ for attempt in range(3):
285
  try:
286
  response = openai_client.chat.completions.create(
287
  model="gpt-4.1-mini",
288
  messages=[
289
  {"role": "system", "content": system_prompt},
290
+ {
291
+ "role": "user",
292
+ "content": (
293
+ f"The default language is English."
294
+ f"Please answer the following question using the PDF content below.\n\n"
295
+ f"PDF Content:\n{all_text}\n\n"
296
+ f"Question: {query}"
297
+ ),
298
+ },
299
  ],
300
  temperature=0.7,
301
  max_tokens=2048,
302
+ timeout=30.0
303
  )
304
 
305
  answer = response.choices[0].message.content
 
309
  "query": query
310
  }
311
  except Exception as api_error:
312
+ logger.error(f"OpenAI API call error (attempt {attempt+1}/3): {api_error}")
313
+ if attempt == 2:
314
  raise api_error
315
+ await asyncio.sleep(1 * (attempt + 1))
316
 
317
+ raise Exception("All retry attempts for API call failed.")
 
318
  except Exception as api_error:
319
+ logger.error(f"Final OpenAI API call error: {api_error}")
 
320
  error_message = str(api_error)
321
  if "Connection" in error_message:
322
+ return {"error": "Could not connect to the OpenAI server. Please check your internet connection."}
323
  elif "Unauthorized" in error_message or "Authentication" in error_message:
324
+ return {"error": "Invalid API key."}
325
  elif "Rate limit" in error_message:
326
+ return {"error": "API rate limit exceeded. Please try again later."}
327
  else:
328
+ return {"error": f"An error occurred while generating the AI response: {error_message}"}
329
 
330
  except Exception as e:
331
+ logger.error(f"Error in query_pdf: {e}")
332
  return {"error": str(e)}
333
 
334
+
335
+ # Summarize PDF
336
  async def summarize_pdf(pdf_id: str) -> Dict[str, Any]:
337
  try:
338
+ # If there's no valid API key
339
  if not HAS_VALID_API_KEY or not openai_client:
340
  return {
341
+ "error": "OpenAI API key not set. Check 'LLM_API' environment variable.",
342
+ "summary": "Cannot generate summary without an API key. Please contact the system administrator."
343
  }
344
 
345
+ # Get embedding data
346
  embedding_data = await get_pdf_embedding(pdf_id)
347
  if "error" in embedding_data:
348
+ return {"error": embedding_data["error"], "summary": "Cannot extract text from the PDF."}
349
 
 
350
  all_text = "\n\n".join([f"Page {chunk['page']}: {chunk['text']}" for chunk in embedding_data["chunks"]])
351
 
352
+ # Truncate if too long
353
+ max_context_length = 60000
354
  if len(all_text) > max_context_length:
355
+ all_text = all_text[:max_context_length] + "...(truncated)"
356
 
 
357
  try:
358
+ # Retry logic
359
+ for attempt in range(3):
360
  try:
361
  response = openai_client.chat.completions.create(
362
  model="gpt-4.1-mini",
363
  messages=[
364
+ {
365
+ "role": "system",
366
+ "content": (
367
+ "The default language is English. Please summarize the following PDF content "
368
+ "concisely, including key topics and main points, in less than 500 characters."
369
+ ),
370
+ },
371
+ {"role": "user", "content": f"PDF Content:\n{all_text}"}
372
  ],
373
  temperature=0.7,
374
  max_tokens=1024,
375
+ timeout=30.0
376
  )
377
 
378
  summary = response.choices[0].message.content
 
381
  "pdf_id": pdf_id
382
  }
383
  except Exception as api_error:
384
+ logger.error(f"OpenAI API call error (attempt {attempt+1}/3): {api_error}")
385
+ if attempt == 2:
386
  raise api_error
387
+ await asyncio.sleep(1 * (attempt + 1))
388
 
389
+ raise Exception("All retry attempts for API call failed.")
 
390
  except Exception as api_error:
391
+ logger.error(f"Final OpenAI API error: {api_error}")
 
392
  error_message = str(api_error)
393
  if "Connection" in error_message:
394
+ return {"error": "Could not connect to the OpenAI server. Check your internet connection.", "pdf_id": pdf_id}
395
  elif "Unauthorized" in error_message or "Authentication" in error_message:
396
+ return {"error": "Invalid API key.", "pdf_id": pdf_id}
397
  elif "Rate limit" in error_message:
398
+ return {"error": "API rate limit exceeded. Please try again later.", "pdf_id": pdf_id}
399
  else:
400
+ return {"error": f"An error occurred while generating the summary: {error_message}", "pdf_id": pdf_id}
401
 
402
  except Exception as e:
403
+ logger.error(f"Error summarizing PDF: {e}")
404
  return {
405
  "error": str(e),
406
+ "summary": "An error occurred while summarizing the PDF. The PDF may be too large or in an unsupported format."
407
  }
408
 
409
+
410
+ # Optimized PDF page caching
411
  async def cache_pdf(pdf_path: str):
412
  try:
413
+ import fitz
414
 
415
  pdf_file = pathlib.Path(pdf_path)
416
  pdf_name = pdf_file.stem
417
 
418
+ # Create a lock for this PDF (avoid concurrent caching)
419
  if pdf_name not in cache_locks:
420
  cache_locks[pdf_name] = threading.Lock()
421
 
422
+ # If it's already being cached or completed, skip
423
  if pdf_name in pdf_cache and pdf_cache[pdf_name].get("status") in ["processing", "completed"]:
424
+ logger.info(f"PDF {pdf_name} is already cached or in progress.")
425
  return
426
 
427
  with cache_locks[pdf_name]:
428
+ # Double check after lock acquisition
429
  if pdf_name in pdf_cache and pdf_cache[pdf_name].get("status") in ["processing", "completed"]:
430
  return
431
 
 
432
  pdf_cache[pdf_name] = {"status": "processing", "progress": 0, "pages": []}
433
 
434
+ # Check if there's an existing cache file
435
  cache_path = get_cache_path(pdf_name)
436
  if cache_path.exists():
437
  try:
 
440
  if cached_data.get("status") == "completed" and cached_data.get("pages"):
441
  pdf_cache[pdf_name] = cached_data
442
  pdf_cache[pdf_name]["status"] = "completed"
443
+ logger.info(f"Loaded {pdf_name} from cache file.")
444
  return
445
  except Exception as e:
446
+ logger.error(f"Failed to load cache file: {e}")
447
 
448
+ # Open the PDF
449
  doc = fitz.open(pdf_path)
450
  total_pages = doc.page_count
451
 
452
+ # Generate a small thumbnail for the first page in advance (fast UI loading)
453
  if total_pages > 0:
 
454
  page = doc[0]
455
+ pix_thumb = page.get_pixmap(matrix=fitz.Matrix(0.2, 0.2))
456
  thumb_data = pix_thumb.tobytes("png")
457
  b64_thumb = base64.b64encode(thumb_data).decode('utf-8')
458
  thumb_src = f"data:image/png;base64,{b64_thumb}"
459
 
 
460
  pdf_cache[pdf_name]["pages"] = [{"thumb": thumb_src, "src": ""}]
461
  pdf_cache[pdf_name]["progress"] = 1
462
  pdf_cache[pdf_name]["total_pages"] = total_pages
463
 
464
+ # Adjust resolution and quality to optimize performance
465
+ scale_factor = 1.0
466
+ jpeg_quality = 80
467
 
468
+ # Worker function for parallel page processing
469
  def process_page(page_num):
470
  try:
471
  page = doc[page_num]
 
 
472
  pix = page.get_pixmap(matrix=fitz.Matrix(scale_factor, scale_factor))
 
 
473
  img_data = pix.tobytes("jpeg", jpeg_quality)
474
  b64_img = base64.b64encode(img_data).decode('utf-8')
475
  img_src = f"data:image/jpeg;base64,{b64_img}"
476
 
477
+ # First page gets the thumbnail, others empty
478
  thumb_src = "" if page_num > 0 else pdf_cache[pdf_name]["pages"][0]["thumb"]
479
 
480
  return {
 
483
  "thumb": thumb_src
484
  }
485
  except Exception as e:
486
+ logger.error(f"Error processing page {page_num}: {e}")
487
  return {
488
  "page_num": page_num,
489
  "src": "",
 
491
  "error": str(e)
492
  }
493
 
 
494
  pages = [None] * total_pages
495
  processed_count = 0
496
 
497
+ # Batch processing
498
+ batch_size = 5
499
 
500
  for batch_start in range(0, total_pages, batch_size):
501
  batch_end = min(batch_start + batch_size, total_pages)
502
  current_batch = list(range(batch_start, batch_end))
503
 
 
504
  with concurrent.futures.ThreadPoolExecutor(max_workers=min(5, batch_size)) as executor:
505
  batch_results = list(executor.map(process_page, current_batch))
506
 
 
507
  for result in batch_results:
508
  page_num = result["page_num"]
509
  pages[page_num] = {
 
515
  progress = round(processed_count / total_pages * 100)
516
  pdf_cache[pdf_name]["progress"] = progress
517
 
 
518
  pdf_cache[pdf_name]["pages"] = pages
519
  try:
520
  with open(cache_path, "w") as cache_file:
 
525
  "total_pages": total_pages
526
  }, cache_file)
527
  except Exception as e:
528
+ logger.error(f"Failed to save intermediate cache: {e}")
529
 
 
530
  pdf_cache[pdf_name] = {
531
  "status": "completed",
532
  "progress": 100,
 
534
  "total_pages": total_pages
535
  }
536
 
537
+ # Final save
538
  try:
539
  with open(cache_path, "w") as cache_file:
540
  json.dump(pdf_cache[pdf_name], cache_file)
541
+ logger.info(f"PDF {pdf_name} cached successfully with {total_pages} pages.")
542
  except Exception as e:
543
+ logger.error(f"Failed to save final cache: {e}")
544
 
545
  except Exception as e:
546
  import traceback
547
+ logger.error(f"Error caching PDF: {str(e)}\n{traceback.format_exc()}")
548
  if pdf_name in pdf_cache:
549
  pdf_cache[pdf_name]["status"] = "error"
550
  pdf_cache[pdf_name]["error"] = str(e)
551
 
552
+
553
+ # Retrieve PDF path by PDF ID
554
  def get_pdf_path_by_id(pdf_id: str) -> str:
555
+ logger.info(f"Searching for PDF by ID: {pdf_id}")
556
 
557
+ # 1. Directly check in metadata
558
  if pdf_id in pdf_metadata:
559
  path = pdf_metadata[pdf_id]
 
560
  if os.path.exists(path):
561
  return path
562
 
563
+ # If file was moved, try searching by filename
564
  filename = os.path.basename(path)
565
 
566
+ # Check permanent directory
567
  perm_path = PERMANENT_PDF_DIR / filename
568
  if perm_path.exists():
 
569
  pdf_metadata[pdf_id] = str(perm_path)
570
  save_pdf_metadata()
571
  return str(perm_path)
572
+
573
+ # Check main directory
574
  main_path = PDF_DIR / filename
575
  if main_path.exists():
 
576
  pdf_metadata[pdf_id] = str(main_path)
577
  save_pdf_metadata()
578
  return str(main_path)
579
 
580
+ # 2. Fallback: search by partial filename
581
  try:
 
 
582
  name_part = pdf_id.split('_')[0] if '_' in pdf_id else pdf_id
583
 
 
584
  for file_path in get_pdf_files() + get_permanent_pdf_files():
 
585
  file_basename = os.path.basename(file_path)
586
  if file_basename.startswith(name_part) or file_path.stem.startswith(name_part):
 
587
  pdf_metadata[pdf_id] = str(file_path)
588
  save_pdf_metadata()
589
  return str(file_path)
590
  except Exception as e:
591
+ logger.error(f"Error searching by filename: {e}")
592
 
593
+ # 3. As a last resort, compare with existing metadata
594
  for pid, path in pdf_metadata.items():
595
  if os.path.exists(path):
596
  file_basename = os.path.basename(path)
 
597
  if pdf_id in pid or pid in pdf_id:
598
  pdf_metadata[pdf_id] = path
599
  save_pdf_metadata()
 
601
 
602
  return None
603
 
604
+
605
+ # Initialize caching for all PDFs on startup
606
  async def init_cache_all_pdfs():
607
+ logger.info("Starting PDF caching process.")
 
 
608
  load_pdf_metadata()
609
 
 
610
  pdf_files = get_pdf_files() + get_permanent_pdf_files()
 
 
611
  unique_pdf_paths = set(str(p) for p in pdf_files)
612
  pdf_files = [pathlib.Path(p) for p in unique_pdf_paths]
613
 
614
+ # Update metadata for all files
615
  for pdf_file in pdf_files:
 
616
  found = False
617
  for pid, path in pdf_metadata.items():
618
  if os.path.basename(path) == pdf_file.name:
619
  found = True
 
620
  if not os.path.exists(path):
621
  pdf_metadata[pid] = str(pdf_file)
622
  break
 
625
  pdf_id = generate_pdf_id(pdf_file.name)
626
  pdf_metadata[pdf_id] = str(pdf_file)
627
 
 
628
  save_pdf_metadata()
629
 
630
+ # Load existing cache for a quick start
631
  for cache_file in CACHE_DIR.glob("*_cache.json"):
632
  try:
633
  pdf_name = cache_file.stem.replace("_cache", "")
 
636
  if cached_data.get("status") == "completed" and cached_data.get("pages"):
637
  pdf_cache[pdf_name] = cached_data
638
  pdf_cache[pdf_name]["status"] = "completed"
639
+ logger.info(f"Loaded existing cache: {pdf_name}")
640
  except Exception as e:
641
+ logger.error(f"Error loading cache file: {str(e)}")
642
 
643
+ # Cache non-cached files in parallel
644
+ await asyncio.gather(*[
645
+ asyncio.create_task(cache_pdf(str(pdf_file)))
646
+ for pdf_file in pdf_files
647
+ if pdf_file.stem not in pdf_cache or pdf_cache[pdf_file.stem].get("status") != "completed"
648
+ ])
649
+
650
 
 
651
@app.on_event("startup")
async def startup_event():
    """Prepare PDF metadata on boot and launch background caching.

    Ensures every PDF on disk has an ID in ``pdf_metadata`` (repairing stale
    paths along the way), persists the metadata, then schedules the
    cache-all task without blocking startup.
    """
    load_pdf_metadata()

    # Make sure each file on disk is represented in the metadata map.
    for pdf_file in get_pdf_files() + get_permanent_pdf_files():
        matching_id = next(
            (
                pid
                for pid, path in pdf_metadata.items()
                if os.path.basename(path) == pdf_file.name
            ),
            None,
        )
        if matching_id is None:
            # Unknown file: register it under a freshly generated ID.
            pdf_metadata[generate_pdf_id(pdf_file.name)] = str(pdf_file)
        elif not os.path.exists(pdf_metadata[matching_id]):
            # Known ID but stale path: point it at the file we just found.
            pdf_metadata[matching_id] = str(pdf_file)

    save_pdf_metadata()

    # Cache all PDFs in the background so requests are not delayed.
    asyncio.create_task(init_cache_all_pdfs())
674
 
675
+
676
+ # API endpoint: List PDF projects
677
@app.get("/api/pdf-projects")
async def get_pdf_projects_api():
    """Return the project list for PDFs in the main directory."""
    projects = generate_pdf_projects()
    return projects
680
 
681
+
682
+ # API endpoint: List permanently stored PDF projects
683
  @app.get("/api/permanent-pdf-projects")
684
  async def get_permanent_pdf_projects_api():
685
  pdf_files = get_permanent_pdf_files()
686
  projects_data = []
687
 
688
  for pdf_file in pdf_files:
 
689
  pdf_id = None
690
  for pid, path in pdf_metadata.items():
691
  if os.path.basename(path) == pdf_file.name:
692
  pdf_id = pid
693
  break
694
 
 
695
  if not pdf_id:
696
  pdf_id = generate_pdf_id(pdf_file.name)
697
  pdf_metadata[pdf_id] = str(pdf_file)
698
  save_pdf_metadata()
699
+
700
  projects_data.append({
701
  "path": str(pdf_file),
702
  "name": pdf_file.stem,
 
706
 
707
  return projects_data
708
 
709
+
710
+ # API endpoint: Get PDF info by ID
711
  @app.get("/api/pdf-info-by-id/{pdf_id}")
712
  async def get_pdf_info_by_id(pdf_id: str):
713
  pdf_path = get_pdf_path_by_id(pdf_id)
 
720
  "exists": True,
721
  "cached": pdf_file.stem in pdf_cache and pdf_cache[pdf_file.stem].get("status") == "completed"
722
  }
723
+ return {"exists": False, "error": "Could not find the specified PDF."}
724
+
725
 
726
+ # API endpoint: Get PDF thumbnail (optimized)
727
@app.get("/api/pdf-thumbnail")
async def get_pdf_thumbnail(path: str):
    """Return a small base64 thumbnail for the first page of a PDF.

    Serves the thumbnail from the in-memory cache when available; otherwise
    renders a low-resolution preview on the fly and kicks off background
    caching of the whole document.

    Fix: the PyMuPDF document opened here was never closed, leaking a file
    handle on every uncached request — it is now released in a finally block.
    """
    try:
        pdf_file = pathlib.Path(path)
        pdf_name = pdf_file.stem

        # If cached, return the thumbnail from cache
        if pdf_name in pdf_cache and pdf_cache[pdf_name].get("pages"):
            if pdf_cache[pdf_name]["pages"][0].get("thumb"):
                return {"thumbnail": pdf_cache[pdf_name]["pages"][0]["thumb"]}

        # If not cached, generate a quick thumbnail (smaller resolution)
        import fitz
        doc = fitz.open(path)
        try:
            if doc.page_count > 0:
                page = doc[0]
                pix = page.get_pixmap(matrix=fitz.Matrix(0.2, 0.2))
                img_data = pix.tobytes("jpeg", 70)
                b64_img = base64.b64encode(img_data).decode('utf-8')

                # Start background caching
                asyncio.create_task(cache_pdf(path))

                return {"thumbnail": f"data:image/jpeg;base64,{b64_img}"}
        finally:
            # Release the document handle (previously leaked).
            doc.close()

        return {"thumbnail": None}
    except Exception as e:
        logger.error(f"Error generating thumbnail: {str(e)}")
        return {"error": str(e), "thumbnail": None}
756
 
757
+
758
+ # API endpoint: Cache status
759
  @app.get("/api/cache-status")
760
  async def get_cache_status(path: str = None):
761
  if path:
 
765
  return pdf_cache[pdf_name]
766
  return {"status": "not_cached"}
767
  else:
768
+ return {
769
+ name: {"status": info["status"], "progress": info.get("progress", 0)}
770
+ for name, info in pdf_cache.items()
771
+ }
772
+
773
 
774
+ # API endpoint: Query PDF content with AI
775
  @app.post("/api/ai/query-pdf/{pdf_id}")
776
  async def api_query_pdf(pdf_id: str, query: Dict[str, str]):
777
  try:
778
  user_query = query.get("query", "")
779
  if not user_query:
780
+ return JSONResponse(content={"error": "No question provided."}, status_code=400)
781
 
 
782
  pdf_path = get_pdf_path_by_id(pdf_id)
783
  if not pdf_path:
784
+ return JSONResponse(content={"error": f"No file found for PDF ID {pdf_id}"}, status_code=404)
785
 
 
786
  result = await query_pdf(pdf_id, user_query)
787
 
788
  if "error" in result:
 
790
 
791
  return result
792
  except Exception as e:
793
+ logger.error(f"Error in AI query endpoint: {e}")
794
  return JSONResponse(content={"error": str(e)}, status_code=500)
795
 
796
+
797
+ # API endpoint: Summarize PDF
798
  @app.get("/api/ai/summarize-pdf/{pdf_id}")
799
  async def api_summarize_pdf(pdf_id: str):
800
  try:
 
801
  pdf_path = get_pdf_path_by_id(pdf_id)
802
  if not pdf_path:
803
+ return JSONResponse(content={"error": f"No file found for PDF ID {pdf_id}"}, status_code=404)
804
 
 
805
  result = await summarize_pdf(pdf_id)
806
 
807
  if "error" in result:
 
809
 
810
  return result
811
  except Exception as e:
812
+ logger.error(f"Error in PDF summary endpoint: {e}")
813
  return JSONResponse(content={"error": str(e)}, status_code=500)
814
 
815
+
816
+ # API endpoint: Provide cached PDF content (progressive loading)
817
@app.get("/api/cached-pdf")
async def get_cached_pdf(path: str, background_tasks: BackgroundTasks):
    """Serve cached page images for a PDF (progressive loading).

    Returns the full cache entry when caching is complete, a partial
    snapshot while it is still processing, and otherwise starts caching
    in the background and reports that it has begun.
    """
    try:
        pdf_name = pathlib.Path(path).stem
        entry = pdf_cache.get(pdf_name)

        if entry is not None:
            status = entry.get("status", "")
            if status == "completed":
                return entry
            if status == "processing":
                pages = entry.get("pages", [])
                return {
                    "status": "processing",
                    "progress": entry.get("progress", 0),
                    "pages": pages,
                    "total_pages": entry.get("total_pages", 0),
                    "available_pages": sum(1 for p in pages if p and p.get("src")),
                }

        # No cache yet: kick off caching in the background.
        background_tasks.add_task(cache_pdf, path)
        return {"status": "started", "progress": 0}

    except Exception as e:
        logger.error(f"Error providing cached PDF: {str(e)}")
        return {"error": str(e), "status": "error"}
848
 
849
+
850
+ # API endpoint: Provide original PDF content (if not cached)
851
@app.get("/api/pdf-content")
async def get_pdf_content(path: str, background_tasks: BackgroundTasks):
    """Serve the raw PDF bytes, redirecting to the cache when it is usable.

    A completed cache — or one that is past 10% progress — is preferred
    over re-reading the file; otherwise the original bytes are returned
    inline and caching is scheduled in the background.
    """
    try:
        pdf_file = pathlib.Path(path)
        if not pdf_file.exists():
            return JSONResponse(content={"error": f"File not found: {path}"}, status_code=404)

        pdf_name = pdf_file.stem

        # If already cached or partially cached, redirect to the cache API.
        cache_entry = pdf_cache.get(pdf_name)
        if cache_entry is not None:
            status = cache_entry.get("status")
            cache_usable = status == "completed" or (
                status == "processing" and cache_entry.get("progress", 0) > 10
            )
            if cache_usable:
                return JSONResponse(content={"redirect": f"/api/cached-pdf?path={path}"})

        content = pdf_file.read_bytes()

        # RFC 5987 encoding so non-ASCII filenames survive the header.
        import urllib.parse
        encoded_filename = urllib.parse.quote(pdf_file.name)

        # Start caching in the background for subsequent requests.
        background_tasks.add_task(cache_pdf, path)

        headers = {
            "Content-Type": "application/pdf",
            "Content-Disposition": f'inline; filename="{encoded_filename}"; filename*=UTF-8\'\'{encoded_filename}'
        }

        return Response(content=content, media_type="application/pdf", headers=headers)
    except Exception as e:
        import traceback
        error_details = traceback.format_exc()
        logger.error(f"Error loading PDF content: {str(e)}\n{error_details}")
        return JSONResponse(content={"error": str(e)}, status_code=500)
891
 
892
+
893
+ # API endpoint: Upload PDF to permanent storage
894
  @app.post("/api/upload-pdf")
895
  async def upload_pdf(file: UploadFile = File(...)):
896
  try:
 
897
  if not file.filename.lower().endswith('.pdf'):
898
+ return JSONResponse(content={"success": False, "message": "Only PDF files are allowed."}, status_code=400)
 
 
 
899
 
 
900
  file_path = PERMANENT_PDF_DIR / file.filename
901
 
 
902
  content = await file.read()
903
  with open(file_path, "wb") as buffer:
904
  buffer.write(content)
905
 
906
+ # Also copy to main directory to be automatically displayed
907
  with open(PDF_DIR / file.filename, "wb") as buffer:
908
  buffer.write(content)
909
 
 
910
  pdf_id = generate_pdf_id(file.filename)
911
  pdf_metadata[pdf_id] = str(file_path)
912
  save_pdf_metadata()
913
 
 
914
  asyncio.create_task(cache_pdf(str(file_path)))
915
 
916
  return JSONResponse(
917
  content={
918
+ "success": True,
919
+ "path": str(file_path),
920
  "name": file_path.stem,
921
  "id": pdf_id,
922
  "viewUrl": f"/view/{pdf_id}"
 
926
  except Exception as e:
927
  import traceback
928
  error_details = traceback.format_exc()
929
+ logger.error(f"Error uploading PDF: {str(e)}\n{error_details}")
930
+ return JSONResponse(content={"success": False, "message": str(e)}, status_code=500)
 
 
 
931
 
932
+
933
+ # Convert text file to PDF
934
  async def convert_text_to_pdf(text_content: str, title: str) -> str:
935
  try:
 
936
  import re
937
  safe_title = re.sub(r'[^\w\-_\. ]', '_', title)
938
  if not safe_title:
939
  safe_title = "aibook"
940
 
 
941
  timestamp = int(time.time())
942
  filename = f"{safe_title}_{timestamp}.pdf"
943
 
 
944
  file_path = PERMANENT_PDF_DIR / filename
945
 
946
+ # Registering a Korean font. If not found, fallback to Helvetica.
947
  from reportlab.pdfbase import pdfmetrics
948
  from reportlab.pdfbase.ttfonts import TTFont
949
 
 
950
  font_path = BASE / "MaruBuri-SemiBold.ttf"
951
 
 
952
  font_name = "MaruBuri"
953
  if font_path.exists():
954
  pdfmetrics.registerFont(TTFont(font_name, str(font_path)))
955
+ logger.info(f"Successfully registered the Korean font: {font_path}")
956
  else:
957
  font_name = "Helvetica"
958
+ logger.warning(f"Could not find the Korean font file: {font_path}. Using a default font.")
959
 
 
960
  pdf_buffer = io.BytesIO()
961
 
 
962
  from reportlab.lib.pagesizes import letter
963
  from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
964
  from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
 
966
 
967
  doc = SimpleDocTemplate(pdf_buffer, pagesize=letter, encoding='utf-8')
968
 
 
969
  title_style = ParagraphStyle(
970
  name='CustomTitle',
971
  fontName=font_name,
 
985
  spaceAfter=6
986
  )
987
 
 
988
  content = []
989
 
990
+ # Add title
991
  content.append(Paragraph(title, title_style))
992
  content.append(Spacer(1, 20))
993
 
 
994
  paragraphs = text_content.split('\n\n')
995
  for para in paragraphs:
996
  if para.strip():
 
997
  from xml.sax.saxutils import escape
998
  safe_para = escape(para.replace('\n', '<br/>'))
999
  p = Paragraph(safe_para, normal_style)
1000
  content.append(p)
1001
  content.append(Spacer(1, 10))
1002
 
 
1003
  doc.build(content)
1004
 
 
1005
  with open(file_path, 'wb') as f:
1006
  f.write(pdf_buffer.getvalue())
1007
 
1008
+ # Copy to main directory
1009
  with open(PDF_DIR / filename, 'wb') as f:
1010
  f.write(pdf_buffer.getvalue())
1011
 
 
1012
  pdf_id = generate_pdf_id(filename)
1013
  pdf_metadata[pdf_id] = str(file_path)
1014
  save_pdf_metadata()
1015
 
 
1016
  asyncio.create_task(cache_pdf(str(file_path)))
1017
 
1018
  return {
 
1022
  }
1023
 
1024
  except Exception as e:
1025
+ logger.error(f"Error converting text to PDF: {e}")
1026
  raise e
1027
 
1028
 
1029
+ # AI-based text enhancement stub (placeholder)
1030
async def enhance_text_with_ai(text_content: str, title: str) -> str:
    """AI-based text enhancement placeholder.

    Enhancement is currently disabled, so the input text is returned
    unchanged; ``title`` is accepted for future use.
    """
    return text_content
 
1033
 
1034
 
1035
+ # API endpoint: Convert uploaded text file to PDF
1036
  @app.post("/api/text-to-pdf")
1037
  async def text_to_pdf(file: UploadFile = File(...)):
1038
  try:
 
1039
  filename = file.filename.lower()
1040
  if not (filename.endswith('.txt') or filename.endswith('.docx') or filename.endswith('.doc')):
1041
  return JSONResponse(
1042
+ content={"success": False, "message": "Supported file formats are .txt, .docx, and .doc only."},
1043
  status_code=400
1044
  )
1045
 
 
1046
  content = await file.read()
1047
 
1048
+ # Extract text depending on file type
1049
  if filename.endswith('.txt'):
 
1050
  encodings = ['utf-8', 'euc-kr', 'cp949', 'latin1']
1051
  text_content = None
1052
 
1053
  for encoding in encodings:
1054
  try:
1055
  text_content = content.decode(encoding, errors='strict')
1056
+ logger.info(f"Detected text file encoding: {encoding}")
1057
  break
1058
  except UnicodeDecodeError:
1059
  continue
1060
 
1061
  if text_content is None:
 
1062
  text_content = content.decode('utf-8', errors='replace')
1063
+ logger.warning("Could not detect text file encoding; defaulting to UTF-8.")
1064
 
1065
  elif filename.endswith('.docx') or filename.endswith('.doc'):
 
1066
  with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(filename)[1]) as temp_file:
1067
  temp_file.write(content)
1068
  temp_path = temp_file.name
1069
 
1070
  try:
 
1071
  text_content = docx2txt.process(temp_path)
1072
  finally:
 
1073
  os.unlink(temp_path)
1074
 
 
1075
  title = os.path.splitext(filename)[0]
1076
 
1077
+ # Optional AI enhancement
1078
  enhanced_text = await enhance_text_with_ai(text_content, title)
1079
 
1080
+ # Convert the final text to PDF
1081
  pdf_info = await convert_text_to_pdf(enhanced_text, title)
1082
 
1083
  return JSONResponse(
1084
  content={
1085
+ "success": True,
1086
+ "path": pdf_info["path"],
1087
  "name": os.path.splitext(pdf_info["filename"])[0],
1088
  "id": pdf_info["id"],
1089
  "viewUrl": f"/view/{pdf_info['id']}"
 
1093
  except Exception as e:
1094
  import traceback
1095
  error_details = traceback.format_exc()
1096
+ logger.error(f"Error converting text to PDF: {str(e)}\n{error_details}")
1097
+ return JSONResponse(content={"success": False, "message": str(e)}, status_code=500)
 
 
 
1098
 
1099
+
1100
+ # Admin authentication endpoint
1101
@app.post("/api/admin-login")
async def admin_login(password: str = Form(...)):
    """Authenticate an administrator by password.

    Returns ``{"success": True}`` on a match, otherwise a failure payload.

    Fix: the previous ``==`` comparison against the admin secret leaked
    timing information; a constant-time digest comparison is used instead.
    """
    import hmac  # stdlib; local import keeps the file header untouched

    if hmac.compare_digest(password.encode("utf-8"), ADMIN_PASSWORD.encode("utf-8")):
        return {"success": True}
    return {"success": False, "message": "Authentication failed."}
1106
+
1107
 
1108
+ # Admin: Delete PDF
1109
  @app.delete("/api/admin/delete-pdf")
1110
  async def delete_pdf(path: str):
1111
  try:
1112
  pdf_file = pathlib.Path(path)
1113
  if not pdf_file.exists():
1114
+ return {"success": False, "message": "File not found."}
1115
 
 
1116
  filename = pdf_file.name
1117
 
1118
+ # Delete from permanent storage
1119
  pdf_file.unlink()
1120
 
1121
+ # Also delete from main directory if exists
1122
  main_file_path = PDF_DIR / filename
1123
  if main_file_path.exists():
1124
  main_file_path.unlink()
1125
 
1126
+ # Delete related cache
1127
  pdf_name = pdf_file.stem
1128
  cache_path = get_cache_path(pdf_name)
1129
  if cache_path.exists():
1130
  cache_path.unlink()
1131
+
 
1132
  if pdf_name in pdf_cache:
1133
  del pdf_cache[pdf_name]
1134
+
1135
+ # Remove from metadata
1136
  to_remove = []
1137
  for pid, fpath in pdf_metadata.items():
1138
  if os.path.basename(fpath) == filename:
 
1142
  del pdf_metadata[pid]
1143
 
1144
  save_pdf_metadata()
1145
+
1146
  return {"success": True}
1147
  except Exception as e:
1148
+ logger.error(f"Error deleting PDF: {str(e)}")
1149
  return {"success": False, "message": str(e)}
1150
 
1151
+
1152
+ # Admin: Feature PDF (copy to main directory)
1153
@app.post("/api/admin/feature-pdf")
async def feature_pdf(path: str):
    """Admin action: copy a stored PDF into the main directory to feature it."""
    try:
        source = pathlib.Path(path)
        if not source.exists():
            return {"success": False, "message": "File not found."}

        # copy2 preserves timestamps/metadata alongside the content.
        shutil.copy2(source, PDF_DIR / source.name)
        return {"success": True}
    except Exception as e:
        logger.error(f"Error featuring PDF: {str(e)}")
        return {"success": False, "message": str(e)}
1167
 
1168
+
1169
+ # Admin: Unfeature PDF (remove from main directory only)
1170
  @app.delete("/api/admin/unfeature-pdf")
1171
  async def unfeature_pdf(path: str):
1172
  try:
 
1175
 
1176
  if target_path.exists():
1177
  target_path.unlink()
1178
+
1179
  return {"success": True}
1180
  except Exception as e:
1181
+ logger.error(f"Error unfeaturing PDF: {str(e)}")
1182
  return {"success": False, "message": str(e)}
1183
 
1184
+
1185
  @app.get("/view/{pdf_id}")
1186
  async def view_pdf_by_id(pdf_id: str):
 
1187
  pdf_path = get_pdf_path_by_id(pdf_id)
1188
 
1189
  if not pdf_path:
1190
+ # Reload metadata and retry
1191
  load_pdf_metadata()
1192
  pdf_path = get_pdf_path_by_id(pdf_id)
1193
 
1194
  if not pdf_path:
1195
+ # As a final fallback, try scanning all files for a match
1196
  for file_path in get_pdf_files() + get_permanent_pdf_files():
1197
  name_part = pdf_id.split('_')[0] if '_' in pdf_id else pdf_id
1198
  if file_path.stem.startswith(name_part):
 
1203
 
1204
  if not pdf_path:
1205
  return HTMLResponse(
1206
+ content=(
1207
+ f"<html><body><h1>Could not find the requested PDF</h1>"
1208
+ f"<p>ID: {pdf_id}</p><a href='/'>Go back to home</a></body></html>"
1209
+ ),
1210
  status_code=404
1211
  )
1212
 
1213
+ # Redirect to the main page with PDF ID parameter
1214
  return get_html_content(pdf_id=pdf_id)
1215
 
1216
+
1217
  def get_html_content(pdf_id: str = None):
1218
  html_path = BASE / "flipbook_template.html"
1219
  content = ""
 
1221
  with open(html_path, "r", encoding="utf-8") as f:
1222
  content = f.read()
1223
  else:
1224
+ content = HTML # fallback if no local template
1225
 
 
1226
  if pdf_id:
1227
  auto_load_script = f"""
1228
  <script>
 
1229
  document.addEventListener('DOMContentLoaded', async function() {{
1230
  try {{
 
1231
  const response = await fetch('/api/pdf-info-by-id/{pdf_id}');
1232
  const pdfInfo = await response.json();
1233
 
1234
  if (pdfInfo.exists && pdfInfo.path) {{
 
1235
  setTimeout(() => {{
1236
  openPdfById('{pdf_id}', pdfInfo.path, pdfInfo.cached);
1237
  }}, 500);
1238
  }} else {{
1239
+ showError("The requested PDF could not be found.");
1240
  }}
1241
  }} catch (e) {{
1242
+ console.error("Auto-load PDF error:", e);
1243
  }}
1244
  }});
1245
  </script>
1246
  """
1247
 
 
1248
  content = content.replace("</body>", auto_load_script + "</body>")
1249
 
1250
  return HTMLResponse(content=content)
1251
 
1252
+
1253
@app.get("/", response_class=HTMLResponse)
async def root(request: Request, pdf_id: Optional[str] = Query(None)):
    """Serve the flipbook home page.

    A truthy ``pdf_id`` query parameter redirects to that PDF's view URL.
    """
    if not pdf_id:
        return get_html_content()
    return RedirectResponse(url=f"/view/{pdf_id}")
1258
 
 
 
 
1259
 
1260
+ import os
1261
  HTML = os.getenv("HTML_TEMPLATE", "")
 
 
1262
  if not HTML:
1263
+ logger.warning("HTML_TEMPLATE secret is not set. Using default HTML.")
1264
  HTML = """
1265
  <!doctype html>
1266
+ <html lang="en">
1267
  <head>
1268
  <meta charset="utf-8">
1269
  <title>FlipBook Space</title>
 
1273
  </style>
1274
  </head>
1275
  <body>
1276
+ <h1>Could not load the HTML template</h1>
1277
+ <p class="error">HTML_TEMPLATE secret is not configured.</p>
1278
+ <p>Please set the HTML_TEMPLATE in your Hugging Face Space secrets.</p>
1279
  </body>
1280
  </html>
1281
  """
1282
 
1283
if __name__ == "__main__":
    # Run the ASGI app; the PORT environment variable overrides the default 7860.
    listen_port = int(os.getenv("PORT", 7860))
    uvicorn.run("app:app", host="0.0.0.0", port=listen_port)