Update app.py
app.py CHANGED
@@ -17,7 +17,7 @@ from reportlab.lib.styles import getSampleStyleSheet
import io
import docx2txt

# Logging configuration
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
@@ -25,138 +25,139 @@ BASE = pathlib.Path(__file__).parent
app = FastAPI()
app.mount("/static", StaticFiles(directory=BASE), name="static")

# PDF directory (main directory)
PDF_DIR = BASE / "pdf"
if not PDF_DIR.exists():
    PDF_DIR.mkdir(parents=True)

# Permanent PDF directory (Hugging Face persistent disk)
PERMANENT_PDF_DIR = pathlib.Path("/data/pdfs") if os.path.exists("/data") else BASE / "permanent_pdfs"
if not PERMANENT_PDF_DIR.exists():
    PERMANENT_PDF_DIR.mkdir(parents=True)

# Cache directory
CACHE_DIR = BASE / "cache"
if not CACHE_DIR.exists():
    CACHE_DIR.mkdir(parents=True)

# PDF metadata directory and file
METADATA_DIR = pathlib.Path("/data/metadata") if os.path.exists("/data") else BASE / "metadata"
if not METADATA_DIR.exists():
    METADATA_DIR.mkdir(parents=True)
PDF_METADATA_FILE = METADATA_DIR / "pdf_metadata.json"

# Embedding cache directory
EMBEDDING_DIR = pathlib.Path("/data/embeddings") if os.path.exists("/data") else BASE / "embeddings"
if not EMBEDDING_DIR.exists():
    EMBEDDING_DIR.mkdir(parents=True)

# Admin password
ADMIN_PASSWORD = os.getenv("PASSWORD", "admin")  # Retrieved from environment variable; default is for testing

# OpenAI API key
OPENAI_API_KEY = os.getenv("LLM_API", "")
# Flag indicating if we have a valid API key
HAS_VALID_API_KEY = bool(OPENAI_API_KEY and OPENAI_API_KEY.strip())

if HAS_VALID_API_KEY:
    try:
        openai_client = OpenAI(api_key=OPENAI_API_KEY, timeout=30.0)
        logger.info("OpenAI client initialized successfully.")
    except Exception as e:
        logger.error(f"Failed to initialize OpenAI client: {e}")
        HAS_VALID_API_KEY = False
else:
    logger.warning("No valid OpenAI API key found. AI features will be limited.")
    openai_client = None

# Global cache object
pdf_cache: Dict[str, Dict[str, Any]] = {}
# Cache locks
cache_locks = {}
# PDF metadata (ID -> path)
pdf_metadata: Dict[str, str] = {}
# PDF embedding cache
pdf_embeddings: Dict[str, Dict[str, Any]] = {}


# Load PDF metadata from file
def load_pdf_metadata():
    global pdf_metadata
    if PDF_METADATA_FILE.exists():
        try:
            with open(PDF_METADATA_FILE, "r") as f:
                pdf_metadata = json.load(f)
            logger.info(f"PDF metadata loaded successfully: {len(pdf_metadata)} entries")
        except Exception as e:
            logger.error(f"Error loading metadata: {e}")
            pdf_metadata = {}
    else:
        pdf_metadata = {}


# Save PDF metadata to file
def save_pdf_metadata():
    try:
        with open(PDF_METADATA_FILE, "w") as f:
            json.dump(pdf_metadata, f)
    except Exception as e:
        logger.error(f"Error saving metadata: {e}")


# Generate a PDF ID (based on filename + timestamp)
def generate_pdf_id(filename: str) -> str:
    import re
    base_name = os.path.splitext(filename)[0]
    safe_name = re.sub(r'[^\w\-_]', '_', base_name.replace(" ", "_"))
    timestamp = int(time.time())
    random_suffix = uuid.uuid4().hex[:6]
    return f"{safe_name}_{timestamp}_{random_suffix}"


# Retrieve list of PDF files in main directory
def get_pdf_files():
    pdf_files = []
    if PDF_DIR.exists():
        pdf_files = [f for f in PDF_DIR.glob("*.pdf")]
    return pdf_files


# Retrieve list of PDF files in permanent directory
def get_permanent_pdf_files():
    pdf_files = []
    if PERMANENT_PDF_DIR.exists():
        pdf_files = [f for f in PERMANENT_PDF_DIR.glob("*.pdf")]
    return pdf_files


# Generate PDF project data (thumbnails, etc.)
def generate_pdf_projects():
    projects_data = []

    # Get files from both main and permanent directories
    pdf_files = get_pdf_files()
    permanent_pdf_files = get_permanent_pdf_files()

    # Combine both sets of files (remove duplicates by filename)
    unique_files = {}

    # Add from main directory first
    for file in pdf_files:
        unique_files[file.name] = file

    # Then add from permanent directory (overwrite if same filename)
    for file in permanent_pdf_files:
        unique_files[file.name] = file

    for pdf_file in unique_files.values():
        # Find the PDF ID for this file
        pdf_id = None
        for pid, path in pdf_metadata.items():
            if os.path.basename(path) == pdf_file.name:
                pdf_id = pid
                break

        # If the file has no ID, generate one and add it to metadata
        if not pdf_id:
            pdf_id = generate_pdf_id(pdf_file.name)
            pdf_metadata[pdf_id] = str(pdf_file)
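A quick illustration of the ID scheme implemented by generate_pdf_id above; the concrete timestamp and suffix here are hypothetical:

    # generate_pdf_id("My Report (v2).pdf")
    # -> "My_Report__v2__1714023456_a1b2c3"
    # (sanitized base name + Unix timestamp + 6 hex characters from uuid4)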
@@ -171,15 +172,18 @@ def generate_pdf_projects():

    return projects_data


# Get path for cache file
def get_cache_path(pdf_name: str):
    return CACHE_DIR / f"{pdf_name}_cache.json"


# Get path for embedding cache file
def get_embedding_path(pdf_id: str):
    return EMBEDDING_DIR / f"{pdf_id}_embedding.json"


# Extract text from a PDF
def extract_pdf_text(pdf_path: str) -> List[Dict[str, Any]]:
    try:
        doc = fitz.open(pdf_path)
@@ -189,7 +193,7 @@ def extract_pdf_text(pdf_path: str) -> List[Dict[str, Any]]:
            page = doc[page_num]
            text = page.get_text()

            # Only add if the text is non-empty
            if text.strip():
                chunks.append({
                    "page": page_num + 1,
@@ -199,32 +203,33 @@ def extract_pdf_text(pdf_path: str) -> List[Dict[str, Any]]:
        return chunks
    except Exception as e:
        logger.error(f"Error extracting text from PDF: {e}")
        return []


# Get or create PDF embedding by PDF ID
async def get_pdf_embedding(pdf_id: str) -> Dict[str, Any]:
    try:
        # Check embedding cache file
        embedding_path = get_embedding_path(pdf_id)
        if embedding_path.exists():
            try:
                with open(embedding_path, "r", encoding="utf-8") as f:
                    return json.load(f)
            except Exception as e:
                logger.error(f"Error loading embedding cache: {e}")

        # Find the actual PDF path
        pdf_path = get_pdf_path_by_id(pdf_id)
        if not pdf_path:
            raise ValueError(f"Could not find a file corresponding to PDF ID {pdf_id}")

        # Extract text
        chunks = extract_pdf_text(pdf_path)
        if not chunks:
            raise ValueError(f"No text could be extracted from PDF: {pdf_path}")

        # Here, you'd normally create or fetch embeddings. For now, we just store chunks.
        embedding_data = {
            "pdf_id": pdf_id,
            "pdf_path": pdf_path,
@@ -232,61 +237,69 @@ async def get_pdf_embedding(pdf_id: str) -> Dict[str, Any]:
            "created_at": time.time()
        }

        # Save embedding data to cache
        with open(embedding_path, "w", encoding="utf-8") as f:
            json.dump(embedding_data, f, ensure_ascii=False)

        return embedding_data

    except Exception as e:
        logger.error(f"Error creating PDF embedding: {e}")
        return {"error": str(e), "pdf_id": pdf_id}


# Query a PDF using its content (simple approach)
async def query_pdf(pdf_id: str, query: str) -> Dict[str, Any]:
    try:
        # If there's no valid API key
        if not HAS_VALID_API_KEY or not openai_client:
            return {
                "error": "OpenAI API key not set.",
                "answer": "Sorry, the AI feature is currently disabled. Please contact the system administrator."
            }

        # Get embedding data
        embedding_data = await get_pdf_embedding(pdf_id)
        if "error" in embedding_data:
            return {"error": embedding_data["error"]}

        # For simplicity, gather all text from the PDF
        all_text = "\n\n".join([f"Page {chunk['page']}: {chunk['text']}" for chunk in embedding_data["chunks"]])

        # Truncate context if too long
        max_context_length = 60000  # roughly by characters
        if len(all_text) > max_context_length:
            all_text = all_text[:max_context_length] + "...(truncated)"

        # System prompt
        system_prompt = """
The default language is English. However, please respond in the language used in the user's prompt (e.g., English, Korean, Japanese, Chinese, etc.).
You are an assistant that answers questions based solely on the provided PDF content. Use only the information from the PDF content to respond. If the relevant information is not available in the PDF, respond with: "The requested information could not be found in the provided PDF."
Provide clear, concise answers and cite relevant page numbers. Always remain polite and courteous.
"""

        # Attempting to call the openai_client
        try:
            # Retry logic
            for attempt in range(3):
                try:
                    response = openai_client.chat.completions.create(
                        model="gpt-4.1-mini",
                        messages=[
                            {"role": "system", "content": system_prompt},
                            {
                                "role": "user",
                                "content": (
                                    f"The default language is English."
                                    f"Please answer the following question using the PDF content below.\n\n"
                                    f"PDF Content:\n{all_text}\n\n"
                                    f"Question: {query}"
                                ),
                            },
                        ],
                        temperature=0.7,
                        max_tokens=2048,
                        timeout=30.0
                    )

                    answer = response.choices[0].message.content
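The retry loop above makes up to three attempts with a linearly growing pause before giving up; schematically:

    # attempt 0 fails -> sleep 1 s, retry
    # attempt 1 fails -> sleep 2 s, retry
    # attempt 2 fails -> raise api_error (caught below and mapped to an error message)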
@@ -296,68 +309,70 @@ Please ensure your responses are clear and concise, citing relevant page numbers
                        "query": query
                    }
                except Exception as api_error:
                    logger.error(f"OpenAI API call error (attempt {attempt+1}/3): {api_error}")
                    if attempt == 2:
                        raise api_error
                    await asyncio.sleep(1 * (attempt + 1))

            raise Exception("All retry attempts for API call failed.")
        except Exception as api_error:
            logger.error(f"Final OpenAI API call error: {api_error}")
            error_message = str(api_error)
            if "Connection" in error_message:
                return {"error": "Could not connect to the OpenAI server. Please check your internet connection."}
            elif "Unauthorized" in error_message or "Authentication" in error_message:
                return {"error": "Invalid API key."}
            elif "Rate limit" in error_message:
                return {"error": "API rate limit exceeded. Please try again later."}
            else:
                return {"error": f"An error occurred while generating the AI response: {error_message}"}

    except Exception as e:
        logger.error(f"Error in query_pdf: {e}")
        return {"error": str(e)}


# Summarize PDF
async def summarize_pdf(pdf_id: str) -> Dict[str, Any]:
    try:
        # If there's no valid API key
        if not HAS_VALID_API_KEY or not openai_client:
            return {
                "error": "OpenAI API key not set. Check 'LLM_API' environment variable.",
                "summary": "Cannot generate summary without an API key. Please contact the system administrator."
            }

        # Get embedding data
        embedding_data = await get_pdf_embedding(pdf_id)
        if "error" in embedding_data:
            return {"error": embedding_data["error"], "summary": "Cannot extract text from the PDF."}

        all_text = "\n\n".join([f"Page {chunk['page']}: {chunk['text']}" for chunk in embedding_data["chunks"]])

        # Truncate if too long
        max_context_length = 60000
        if len(all_text) > max_context_length:
            all_text = all_text[:max_context_length] + "...(truncated)"

        try:
            # Retry logic
            for attempt in range(3):
                try:
                    response = openai_client.chat.completions.create(
                        model="gpt-4.1-mini",
                        messages=[
                            {
                                "role": "system",
                                "content": (
                                    "The default language is English. Please summarize the following PDF content "
                                    "concisely, including key topics and main points, in less than 500 characters."
                                ),
                            },
                            {"role": "user", "content": f"PDF Content:\n{all_text}"}
                        ],
                        temperature=0.7,
                        max_tokens=1024,
                        timeout=30.0
                    )

                    summary = response.choices[0].message.content
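Both async helpers (query_pdf above, and summarize_pdf whose body continues below) can also be driven directly outside a FastAPI request with an event loop; a sketch with a hypothetical registered ID:

    import asyncio

    result = asyncio.run(query_pdf("mydoc_1714023456_a1b2c3", "What does page 2 cover?"))
    print(result.get("answer") or result.get("error"))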
@@ -366,60 +381,57 @@ async def summarize_pdf(pdf_id: str) -> Dict[str, Any]:
                        "pdf_id": pdf_id
                    }
                except Exception as api_error:
                    logger.error(f"OpenAI API call error (attempt {attempt+1}/3): {api_error}")
                    if attempt == 2:
                        raise api_error
                    await asyncio.sleep(1 * (attempt + 1))

            raise Exception("All retry attempts for API call failed.")
        except Exception as api_error:
            logger.error(f"Final OpenAI API error: {api_error}")
            error_message = str(api_error)
            if "Connection" in error_message:
                return {"error": "Could not connect to the OpenAI server. Check your internet connection.", "pdf_id": pdf_id}
            elif "Unauthorized" in error_message or "Authentication" in error_message:
                return {"error": "Invalid API key.", "pdf_id": pdf_id}
            elif "Rate limit" in error_message:
                return {"error": "API rate limit exceeded. Please try again later.", "pdf_id": pdf_id}
            else:
                return {"error": f"An error occurred while generating the summary: {error_message}", "pdf_id": pdf_id}

    except Exception as e:
        logger.error(f"Error summarizing PDF: {e}")
        return {
            "error": str(e),
            "summary": "An error occurred while summarizing the PDF. The PDF may be too large or in an unsupported format."
        }


# Optimized PDF page caching
async def cache_pdf(pdf_path: str):
    try:
        import fitz

        pdf_file = pathlib.Path(pdf_path)
        pdf_name = pdf_file.stem

        # Create a lock for this PDF (avoid concurrent caching)
        if pdf_name not in cache_locks:
            cache_locks[pdf_name] = threading.Lock()

        # If it's already being cached or completed, skip
        if pdf_name in pdf_cache and pdf_cache[pdf_name].get("status") in ["processing", "completed"]:
            logger.info(f"PDF {pdf_name} is already cached or in progress.")
            return

        with cache_locks[pdf_name]:
            # Double check after lock acquisition
            if pdf_name in pdf_cache and pdf_cache[pdf_name].get("status") in ["processing", "completed"]:
                return

            pdf_cache[pdf_name] = {"status": "processing", "progress": 0, "pages": []}

            # Check if there's an existing cache file
            cache_path = get_cache_path(pdf_name)
            if cache_path.exists():
                try:
@@ -428,47 +440,41 @@ async def cache_pdf(pdf_path: str):
                    if cached_data.get("status") == "completed" and cached_data.get("pages"):
                        pdf_cache[pdf_name] = cached_data
                        pdf_cache[pdf_name]["status"] = "completed"
                        logger.info(f"Loaded {pdf_name} from cache file.")
                        return
                except Exception as e:
                    logger.error(f"Failed to load cache file: {e}")

            # Open the PDF
            doc = fitz.open(pdf_path)
            total_pages = doc.page_count

            # Generate a small thumbnail for the first page in advance (fast UI loading)
            if total_pages > 0:
                page = doc[0]
                pix_thumb = page.get_pixmap(matrix=fitz.Matrix(0.2, 0.2))
                thumb_data = pix_thumb.tobytes("png")
                b64_thumb = base64.b64encode(thumb_data).decode('utf-8')
                thumb_src = f"data:image/png;base64,{b64_thumb}"

                pdf_cache[pdf_name]["pages"] = [{"thumb": thumb_src, "src": ""}]
                pdf_cache[pdf_name]["progress"] = 1
                pdf_cache[pdf_name]["total_pages"] = total_pages

            # Adjust resolution and quality to optimize performance
            scale_factor = 1.0
            jpeg_quality = 80

            # Worker function for parallel page processing
            def process_page(page_num):
                try:
                    page = doc[page_num]
                    pix = page.get_pixmap(matrix=fitz.Matrix(scale_factor, scale_factor))
                    img_data = pix.tobytes("jpeg", jpeg_quality)
                    b64_img = base64.b64encode(img_data).decode('utf-8')
                    img_src = f"data:image/jpeg;base64,{b64_img}"

                    # First page gets the thumbnail, others empty
                    thumb_src = "" if page_num > 0 else pdf_cache[pdf_name]["pages"][0]["thumb"]

                    return {
@@ -477,7 +483,7 @@ async def cache_pdf(pdf_path: str):
                        "thumb": thumb_src
                    }
                except Exception as e:
                    logger.error(f"Error processing page {page_num}: {e}")
                    return {
                        "page_num": page_num,
                        "src": "",
@@ -485,22 +491,19 @@ async def cache_pdf(pdf_path: str):
                        "error": str(e)
                    }

            pages = [None] * total_pages
            processed_count = 0

            # Batch processing
            batch_size = 5

            for batch_start in range(0, total_pages, batch_size):
                batch_end = min(batch_start + batch_size, total_pages)
                current_batch = list(range(batch_start, batch_end))

                with concurrent.futures.ThreadPoolExecutor(max_workers=min(5, batch_size)) as executor:
                    batch_results = list(executor.map(process_page, current_batch))

                for result in batch_results:
                    page_num = result["page_num"]
                    pages[page_num] = {
@@ -512,7 +515,6 @@ async def cache_pdf(pdf_path: str):
                progress = round(processed_count / total_pages * 100)
                pdf_cache[pdf_name]["progress"] = progress

                pdf_cache[pdf_name]["pages"] = pages
                try:
                    with open(cache_path, "w") as cache_file:
@@ -523,9 +525,8 @@ async def cache_pdf(pdf_path: str):
                            "total_pages": total_pages
                        }, cache_file)
                except Exception as e:
                    logger.error(f"Failed to save intermediate cache: {e}")

            pdf_cache[pdf_name] = {
                "status": "completed",
                "progress": 100,
|
|
533 |
"total_pages": total_pages
|
534 |
}
|
535 |
|
536 |
-
#
|
537 |
try:
|
538 |
with open(cache_path, "w") as cache_file:
|
539 |
json.dump(pdf_cache[pdf_name], cache_file)
|
540 |
-
logger.info(f"PDF {pdf_name}
|
541 |
except Exception as e:
|
542 |
-
logger.error(f"
|
543 |
|
544 |
except Exception as e:
|
545 |
import traceback
|
546 |
-
logger.error(f"
|
547 |
if pdf_name in pdf_cache:
|
548 |
pdf_cache[pdf_name]["status"] = "error"
|
549 |
pdf_cache[pdf_name]["error"] = str(e)
|
550 |
|
551 |
-
|
|
|
552 |
def get_pdf_path_by_id(pdf_id: str) -> str:
|
553 |
-
logger.info(f"PDF ID
|
554 |
|
555 |
-
# 1.
|
556 |
if pdf_id in pdf_metadata:
|
557 |
path = pdf_metadata[pdf_id]
|
558 |
-
# ํ์ผ ์กด์ฌ ํ์ธ
|
559 |
if os.path.exists(path):
|
560 |
return path
|
561 |
|
562 |
-
#
|
563 |
filename = os.path.basename(path)
|
564 |
|
565 |
-
#
|
566 |
perm_path = PERMANENT_PDF_DIR / filename
|
567 |
if perm_path.exists():
|
568 |
-
# ๋ฉํ๋ฐ์ดํฐ ์
๋ฐ์ดํธ
|
569 |
pdf_metadata[pdf_id] = str(perm_path)
|
570 |
save_pdf_metadata()
|
571 |
return str(perm_path)
|
572 |
-
|
573 |
-
#
|
574 |
main_path = PDF_DIR / filename
|
575 |
if main_path.exists():
|
576 |
-
# ๋ฉํ๋ฐ์ดํฐ ์
๋ฐ์ดํธ
|
577 |
pdf_metadata[pdf_id] = str(main_path)
|
578 |
save_pdf_metadata()
|
579 |
return str(main_path)
|
580 |
|
581 |
-
# 2.
|
582 |
try:
|
583 |
-
# ID ํ์: filename_timestamp_random
|
584 |
-
# ํ์ผ๋ช
๋ถ๋ถ๋ง ์ถ์ถ
|
585 |
name_part = pdf_id.split('_')[0] if '_' in pdf_id else pdf_id
|
586 |
|
587 |
-
# ๋ชจ๋ PDF ํ์ผ ๊ฒ์
|
588 |
for file_path in get_pdf_files() + get_permanent_pdf_files():
|
589 |
-
# ํ์ผ๋ช
์ด ID์ ์์ ๋ถ๋ถ๊ณผ ์ผ์นํ๋ฉด
|
590 |
file_basename = os.path.basename(file_path)
|
591 |
if file_basename.startswith(name_part) or file_path.stem.startswith(name_part):
|
592 |
-
# ID ๋งคํ ์
๋ฐ์ดํธ
|
593 |
pdf_metadata[pdf_id] = str(file_path)
|
594 |
save_pdf_metadata()
|
595 |
return str(file_path)
|
596 |
except Exception as e:
|
597 |
-
logger.error(f"
|
598 |
|
599 |
-
# 3.
|
600 |
for pid, path in pdf_metadata.items():
|
601 |
if os.path.exists(path):
|
602 |
file_basename = os.path.basename(path)
|
603 |
-
# ์ ์ฌํ ํ์ผ๋ช
์ ๊ฐ์ง ๊ฒฝ์ฐ
|
604 |
if pdf_id in pid or pid in pdf_id:
|
605 |
pdf_metadata[pdf_id] = path
|
606 |
save_pdf_metadata()
|
@@ -608,28 +601,22 @@ def get_pdf_path_by_id(pdf_id: str) -> str:
|
|
608 |
|
609 |
return None
|
610 |
|
611 |
-
|
|
|
612 |
async def init_cache_all_pdfs():
|
613 |
-
logger.info("PDF
|
614 |
-
|
615 |
-
# PDF ๋ฉํ๋ฐ์ดํฐ ๋ก๋
|
616 |
load_pdf_metadata()
|
617 |
|
618 |
-
# ๋ฉ์ธ ๋ฐ ์๊ตฌ ๋๋ ํ ๋ฆฌ์์ PDF ํ์ผ ๋ชจ๋ ๊ฐ์ ธ์ค๊ธฐ
|
619 |
pdf_files = get_pdf_files() + get_permanent_pdf_files()
|
620 |
-
|
621 |
-
# ์ค๋ณต ์ ๊ฑฐ
|
622 |
unique_pdf_paths = set(str(p) for p in pdf_files)
|
623 |
pdf_files = [pathlib.Path(p) for p in unique_pdf_paths]
|
624 |
|
625 |
-
#
|
626 |
for pdf_file in pdf_files:
|
627 |
-
# ID๊ฐ ์๋ ํ์ผ์ ๋ํด ID ์์ฑ
|
628 |
found = False
|
629 |
for pid, path in pdf_metadata.items():
|
630 |
if os.path.basename(path) == pdf_file.name:
|
631 |
found = True
|
632 |
-
# ๊ฒฝ๋ก ์
๋ฐ์ดํธ ํ์ํ ๊ฒฝ์ฐ
|
633 |
if not os.path.exists(path):
|
634 |
pdf_metadata[pid] = str(pdf_file)
|
635 |
break
|
@@ -638,10 +625,9 @@ async def init_cache_all_pdfs():
|
|
638 |
pdf_id = generate_pdf_id(pdf_file.name)
|
639 |
pdf_metadata[pdf_id] = str(pdf_file)
|
640 |
|
641 |
-
# ๋ฉํ๋ฐ์ดํฐ ์ ์ฅ
|
642 |
save_pdf_metadata()
|
643 |
|
644 |
-
#
|
645 |
for cache_file in CACHE_DIR.glob("*_cache.json"):
|
646 |
try:
|
647 |
pdf_name = cache_file.stem.replace("_cache", "")
|
@@ -650,69 +636,67 @@ async def init_cache_all_pdfs():
|
|
650 |
if cached_data.get("status") == "completed" and cached_data.get("pages"):
|
651 |
pdf_cache[pdf_name] = cached_data
|
652 |
pdf_cache[pdf_name]["status"] = "completed"
|
653 |
-
logger.info(f"
|
654 |
except Exception as e:
|
655 |
-
logger.error(f"
|
656 |
|
657 |
-
#
|
658 |
-
await asyncio.gather(*[
|
659 |
-
|
660 |
-
|
661 |
-
|
|
|
|
|
662 |
|
663 |
-
# ๋ฐฑ๊ทธ๋ผ์ด๋ ์์
์์ ํจ์
|
664 |
@app.on_event("startup")
|
665 |
async def startup_event():
|
666 |
-
# PDF
|
667 |
load_pdf_metadata()
|
668 |
|
669 |
-
#
|
670 |
for pdf_file in get_pdf_files() + get_permanent_pdf_files():
|
671 |
found = False
|
672 |
for pid, path in pdf_metadata.items():
|
673 |
if os.path.basename(path) == pdf_file.name:
|
674 |
found = True
|
675 |
-
# ๊ฒฝ๋ก ์
๋ฐ์ดํธ
|
676 |
if not os.path.exists(path):
|
677 |
pdf_metadata[pid] = str(pdf_file)
|
678 |
break
|
679 |
|
680 |
if not found:
|
681 |
-
# ์ ID ์์ฑ ๋ฐ ๋ฉํ๋ฐ์ดํฐ์ ์ถ๊ฐ
|
682 |
pdf_id = generate_pdf_id(pdf_file.name)
|
683 |
pdf_metadata[pdf_id] = str(pdf_file)
|
684 |
|
685 |
-
# ๋ณ๊ฒฝ์ฌํญ ์ ์ฅ
|
686 |
save_pdf_metadata()
|
687 |
|
688 |
-
#
|
689 |
asyncio.create_task(init_cache_all_pdfs())
|
690 |
|
691 |
-
|
|
|
692 |
@app.get("/api/pdf-projects")
|
693 |
async def get_pdf_projects_api():
|
694 |
return generate_pdf_projects()
|
695 |
|
696 |
-
|
|
|
697 |
@app.get("/api/permanent-pdf-projects")
|
698 |
async def get_permanent_pdf_projects_api():
|
699 |
pdf_files = get_permanent_pdf_files()
|
700 |
projects_data = []
|
701 |
|
702 |
for pdf_file in pdf_files:
|
703 |
-
# PDF ID ์ฐพ๊ธฐ
|
704 |
pdf_id = None
|
705 |
for pid, path in pdf_metadata.items():
|
706 |
if os.path.basename(path) == pdf_file.name:
|
707 |
pdf_id = pid
|
708 |
break
|
709 |
|
710 |
-
# ID๊ฐ ์์ผ๋ฉด ์์ฑ
|
711 |
if not pdf_id:
|
712 |
pdf_id = generate_pdf_id(pdf_file.name)
|
713 |
pdf_metadata[pdf_id] = str(pdf_file)
|
714 |
save_pdf_metadata()
|
715 |
-
|
716 |
projects_data.append({
|
717 |
"path": str(pdf_file),
|
718 |
"name": pdf_file.stem,
|
@@ -722,7 +706,8 @@ async def get_permanent_pdf_projects_api():
|
|
722 |
|
723 |
return projects_data
|
724 |
|
725 |
-
|
|
|
726 |
@app.get("/api/pdf-info-by-id/{pdf_id}")
|
727 |
async def get_pdf_info_by_id(pdf_id: str):
|
728 |
pdf_path = get_pdf_path_by_id(pdf_id)
|
@@ -735,40 +720,42 @@ async def get_pdf_info_by_id(pdf_id: str):
|
|
735 |
"exists": True,
|
736 |
"cached": pdf_file.stem in pdf_cache and pdf_cache[pdf_file.stem].get("status") == "completed"
|
737 |
}
|
738 |
-
return {"exists": False, "error": "
|
|
|
739 |
|
740 |
-
# API
|
741 |
@app.get("/api/pdf-thumbnail")
|
742 |
async def get_pdf_thumbnail(path: str):
|
743 |
try:
|
744 |
pdf_file = pathlib.Path(path)
|
745 |
pdf_name = pdf_file.stem
|
746 |
|
747 |
-
#
|
748 |
if pdf_name in pdf_cache and pdf_cache[pdf_name].get("pages"):
|
749 |
if pdf_cache[pdf_name]["pages"][0].get("thumb"):
|
750 |
return {"thumbnail": pdf_cache[pdf_name]["pages"][0]["thumb"]}
|
751 |
-
|
752 |
-
#
|
753 |
import fitz
|
754 |
doc = fitz.open(path)
|
755 |
if doc.page_count > 0:
|
756 |
page = doc[0]
|
757 |
-
pix = page.get_pixmap(matrix=fitz.Matrix(0.2, 0.2))
|
758 |
-
img_data = pix.tobytes("jpeg", 70)
|
759 |
b64_img = base64.b64encode(img_data).decode('utf-8')
|
760 |
|
761 |
-
#
|
762 |
asyncio.create_task(cache_pdf(path))
|
763 |
|
764 |
return {"thumbnail": f"data:image/jpeg;base64,{b64_img}"}
|
765 |
|
766 |
return {"thumbnail": None}
|
767 |
except Exception as e:
|
768 |
-
logger.error(f"
|
769 |
return {"error": str(e), "thumbnail": None}
|
770 |
|
771 |
-
|
|
|
772 |
@app.get("/api/cache-status")
|
773 |
async def get_cache_status(path: str = None):
|
774 |
if path:
|
@@ -778,23 +765,24 @@ async def get_cache_status(path: str = None):
|
|
778 |
return pdf_cache[pdf_name]
|
779 |
return {"status": "not_cached"}
|
780 |
else:
|
781 |
-
return {
|
782 |
-
|
|
|
|
|
|
|
783 |
|
784 |
-
# API
|
785 |
@app.post("/api/ai/query-pdf/{pdf_id}")
|
786 |
async def api_query_pdf(pdf_id: str, query: Dict[str, str]):
|
787 |
try:
|
788 |
user_query = query.get("query", "")
|
789 |
if not user_query:
|
790 |
-
return JSONResponse(content={"error": "
|
791 |
|
792 |
-
# PDF ๊ฒฝ๋ก ํ์ธ
|
793 |
pdf_path = get_pdf_path_by_id(pdf_id)
|
794 |
if not pdf_path:
|
795 |
-
return JSONResponse(content={"error": f"PDF ID {pdf_id}
|
796 |
|
797 |
-
# ์ง์์๋ต ์ฒ๋ฆฌ
|
798 |
result = await query_pdf(pdf_id, user_query)
|
799 |
|
800 |
if "error" in result:
|
@@ -802,19 +790,18 @@ async def api_query_pdf(pdf_id: str, query: Dict[str, str]):
|
|
802 |
|
803 |
return result
|
804 |
except Exception as e:
|
805 |
-
logger.error(f"
|
806 |
return JSONResponse(content={"error": str(e)}, status_code=500)
|
807 |
|
808 |
-
|
|
|
809 |
@app.get("/api/ai/summarize-pdf/{pdf_id}")
|
810 |
async def api_summarize_pdf(pdf_id: str):
|
811 |
try:
|
812 |
-
# PDF ๊ฒฝ๋ก ํ์ธ
|
813 |
pdf_path = get_pdf_path_by_id(pdf_id)
|
814 |
if not pdf_path:
|
815 |
-
return JSONResponse(content={"error": f"PDF ID {pdf_id}
|
816 |
|
817 |
-
# ์์ฝ ์ฒ๋ฆฌ
|
818 |
result = await summarize_pdf(pdf_id)
|
819 |
|
820 |
if "error" in result:
|
@@ -822,124 +809,114 @@ async def api_summarize_pdf(pdf_id: str):
|
|
822 |
|
823 |
return result
|
824 |
except Exception as e:
|
825 |
-
logger.error(f"PDF
|
826 |
return JSONResponse(content={"error": str(e)}, status_code=500)
|
827 |
|
828 |
-
|
|
|
829 |
@app.get("/api/cached-pdf")
|
830 |
async def get_cached_pdf(path: str, background_tasks: BackgroundTasks):
|
831 |
try:
|
832 |
pdf_file = pathlib.Path(path)
|
833 |
pdf_name = pdf_file.stem
|
834 |
|
835 |
-
# ์บ์ ํ์ธ
|
836 |
if pdf_name in pdf_cache:
|
837 |
status = pdf_cache[pdf_name].get("status", "")
|
838 |
|
839 |
-
# ์๋ฃ๋ ๊ฒฝ์ฐ ์ ์ฒด ๋ฐ์ดํฐ ๋ฐํ
|
840 |
if status == "completed":
|
841 |
return pdf_cache[pdf_name]
|
842 |
-
|
843 |
-
# ์ฒ๋ฆฌ ์ค์ธ ๊ฒฝ์ฐ ํ์ฌ๊น์ง์ ํ์ด์ง ๋ฐ์ดํฐ ํฌํจ (์ ์ง์ ๋ก๋ฉ)
|
844 |
elif status == "processing":
|
845 |
progress = pdf_cache[pdf_name].get("progress", 0)
|
846 |
pages = pdf_cache[pdf_name].get("pages", [])
|
847 |
total_pages = pdf_cache[pdf_name].get("total_pages", 0)
|
848 |
|
849 |
-
# ์ผ๋ถ๋ง ์ฒ๋ฆฌ๋ ๊ฒฝ์ฐ์๋ ์ฌ์ฉ ๊ฐ๋ฅํ ํ์ด์ง ์ ๊ณต
|
850 |
return {
|
851 |
-
"status": "processing",
|
852 |
"progress": progress,
|
853 |
"pages": pages,
|
854 |
"total_pages": total_pages,
|
855 |
"available_pages": len([p for p in pages if p and p.get("src")])
|
856 |
}
|
857 |
|
858 |
-
#
|
859 |
background_tasks.add_task(cache_pdf, path)
|
860 |
return {"status": "started", "progress": 0}
|
861 |
|
862 |
except Exception as e:
|
863 |
-
logger.error(f"
|
864 |
return {"error": str(e), "status": "error"}
|
865 |
|
866 |
-
|
|
|
867 |
@app.get("/api/pdf-content")
|
868 |
async def get_pdf_content(path: str, background_tasks: BackgroundTasks):
|
869 |
try:
|
870 |
-
# ์บ์ฑ ์ํ ํ์ธ
|
871 |
pdf_file = pathlib.Path(path)
|
872 |
if not pdf_file.exists():
|
873 |
-
return JSONResponse(content={"error": f"
|
874 |
|
875 |
pdf_name = pdf_file.stem
|
876 |
|
877 |
-
#
|
878 |
-
if pdf_name in pdf_cache and (
|
879 |
-
|
880 |
-
|
|
|
|
|
|
|
|
|
881 |
return JSONResponse(content={"redirect": f"/api/cached-pdf?path={path}"})
|
882 |
|
883 |
-
|
884 |
-
|
885 |
-
|
886 |
-
|
887 |
-
# ํ์ผ๋ช
์ฒ๋ฆฌ
|
888 |
import urllib.parse
|
889 |
filename = pdf_file.name
|
890 |
encoded_filename = urllib.parse.quote(filename)
|
891 |
|
892 |
-
#
|
893 |
background_tasks.add_task(cache_pdf, path)
|
894 |
|
895 |
-
# ์๋ต ํค๋ ์ค์
|
896 |
headers = {
|
897 |
"Content-Type": "application/pdf",
|
898 |
-
"Content-Disposition": f
|
899 |
}
|
900 |
|
901 |
return Response(content=content, media_type="application/pdf", headers=headers)
|
902 |
except Exception as e:
|
903 |
import traceback
|
904 |
error_details = traceback.format_exc()
|
905 |
-
logger.error(f"
|
906 |
return JSONResponse(content={"error": str(e)}, status_code=500)
|
907 |
|
908 |
-
|
|
|
909 |
@app.post("/api/upload-pdf")
|
910 |
async def upload_pdf(file: UploadFile = File(...)):
|
911 |
try:
|
912 |
-
# ํ์ผ ์ด๋ฆ ํ์ธ
|
913 |
if not file.filename.lower().endswith('.pdf'):
|
914 |
-
return JSONResponse(
|
915 |
-
content={"success": False, "message": "PDF ํ์ผ๋ง ์
๋ก๋ ๊ฐ๋ฅํฉ๋๋ค"},
|
916 |
-
status_code=400
|
917 |
-
)
|
918 |
|
919 |
-
# ์๊ตฌ ์ ์ฅ์์ ํ์ผ ์ ์ฅ
|
920 |
file_path = PERMANENT_PDF_DIR / file.filename
|
921 |
|
922 |
-
# ํ์ผ ์ฝ๊ธฐ ๋ฐ ์ ์ฅ
|
923 |
content = await file.read()
|
924 |
with open(file_path, "wb") as buffer:
|
925 |
buffer.write(content)
|
926 |
|
927 |
-
#
|
928 |
with open(PDF_DIR / file.filename, "wb") as buffer:
|
929 |
buffer.write(content)
|
930 |
|
931 |
-
# PDF ID ์์ฑ ๋ฐ ๋ฉํ๋ฐ์ดํฐ ์ ์ฅ
|
932 |
pdf_id = generate_pdf_id(file.filename)
|
933 |
pdf_metadata[pdf_id] = str(file_path)
|
934 |
save_pdf_metadata()
|
935 |
|
936 |
-
# ๋ฐฑ๊ทธ๋ผ์ด๋์์ ์บ์ฑ ์์
|
937 |
asyncio.create_task(cache_pdf(str(file_path)))
|
938 |
|
939 |
return JSONResponse(
|
940 |
content={
|
941 |
-
"success": True,
|
942 |
-
"path": str(file_path),
|
943 |
"name": file_path.stem,
|
944 |
"id": pdf_id,
|
945 |
"viewUrl": f"/view/{pdf_id}"
|
@@ -949,48 +926,39 @@ async def upload_pdf(file: UploadFile = File(...)):
|
|
949 |
except Exception as e:
|
950 |
import traceback
|
951 |
error_details = traceback.format_exc()
|
952 |
-
logger.error(f"
|
953 |
-
return JSONResponse(
|
954 |
-
content={"success": False, "message": str(e)},
|
955 |
-
status_code=500
|
956 |
-
)
|
957 |
|
958 |
-
|
|
|
959 |
async def convert_text_to_pdf(text_content: str, title: str) -> str:
|
960 |
try:
|
961 |
-
# ์ ๋ชฉ์์ ์ ํจํ ํ์ผ๋ช
์์ฑ
|
962 |
import re
|
963 |
safe_title = re.sub(r'[^\w\-_\. ]', '_', title)
|
964 |
if not safe_title:
|
965 |
safe_title = "aibook"
|
966 |
|
967 |
-
# ํ์์คํฌํ ์ถ๊ฐ๋ก ๊ณ ์ ํ ํ์ผ๋ช
์์ฑ
|
968 |
timestamp = int(time.time())
|
969 |
filename = f"{safe_title}_{timestamp}.pdf"
|
970 |
|
971 |
-
# ์๊ตฌ ์ ์ฅ์์ ํ์ผ ๊ฒฝ๋ก
|
972 |
file_path = PERMANENT_PDF_DIR / filename
|
973 |
|
974 |
-
#
|
975 |
from reportlab.pdfbase import pdfmetrics
|
976 |
from reportlab.pdfbase.ttfonts import TTFont
|
977 |
|
978 |
-
# ํฐํธ ๊ฒฝ๋ก ์ค์ (app.py์ ๊ฐ์ ๋๋ ํ ๋ฆฌ์ ์๋ ํฐํธ ์ฌ์ฉ)
|
979 |
font_path = BASE / "MaruBuri-SemiBold.ttf"
|
980 |
|
981 |
-
# ํฐํธ ๋ฑ๋ก
|
982 |
font_name = "MaruBuri"
|
983 |
if font_path.exists():
|
984 |
pdfmetrics.registerFont(TTFont(font_name, str(font_path)))
|
985 |
-
logger.info(f"
|
986 |
else:
|
987 |
font_name = "Helvetica"
|
988 |
-
logger.warning(f"
|
989 |
|
990 |
-
# ์์ PDF ํ์ผ ์์ฑ
|
991 |
pdf_buffer = io.BytesIO()
|
992 |
|
993 |
-
# ํ๊ธ ์ง์์ ์ํ ์คํ์ผ ์ค์
|
994 |
from reportlab.lib.pagesizes import letter
|
995 |
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
|
996 |
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
|
@@ -998,7 +966,6 @@ async def convert_text_to_pdf(text_content: str, title: str) -> str:
|
|
998 |
|
999 |
doc = SimpleDocTemplate(pdf_buffer, pagesize=letter, encoding='utf-8')
|
1000 |
|
1001 |
-
# ์ฌ์ฉ์ ์ ์ ์คํ์ผ ์์ฑ
|
1002 |
title_style = ParagraphStyle(
|
1003 |
name='CustomTitle',
|
1004 |
fontName=font_name,
|
@@ -1018,41 +985,34 @@ async def convert_text_to_pdf(text_content: str, title: str) -> str:
|
|
1018 |
spaceAfter=6
|
1019 |
)
|
1020 |
|
1021 |
-
# ๋ด์ฉ์ ๋ฌธ๋จ์ผ๋ก ๋ถํ
|
1022 |
content = []
|
1023 |
|
1024 |
-
#
|
1025 |
content.append(Paragraph(title, title_style))
|
1026 |
content.append(Spacer(1, 20))
|
1027 |
|
1028 |
-
# ํ
์คํธ๋ฅผ ๋จ๋ฝ์ผ๋ก ๋ถ๋ฆฌํ์ฌ ์ถ๊ฐ
|
1029 |
paragraphs = text_content.split('\n\n')
|
1030 |
for para in paragraphs:
|
1031 |
if para.strip():
|
1032 |
-
# XML ํน์๋ฌธ์ ์ด์ค์ผ์ดํ ์ฒ๋ฆฌ
|
1033 |
from xml.sax.saxutils import escape
|
1034 |
safe_para = escape(para.replace('\n', '<br/>'))
|
1035 |
p = Paragraph(safe_para, normal_style)
|
1036 |
content.append(p)
|
1037 |
content.append(Spacer(1, 10))
|
1038 |
|
1039 |
-
# PDF ์์ฑ
|
1040 |
doc.build(content)
|
1041 |
|
1042 |
-
# ํ์ผ๋ก ์ ์ฅ
|
1043 |
with open(file_path, 'wb') as f:
|
1044 |
f.write(pdf_buffer.getvalue())
|
1045 |
|
1046 |
-
#
|
1047 |
with open(PDF_DIR / filename, 'wb') as f:
|
1048 |
f.write(pdf_buffer.getvalue())
|
1049 |
|
1050 |
-
# PDF ID ์์ฑ ๋ฐ ๋ฉํ๋ฐ์ดํฐ ์ ์ฅ
|
1051 |
pdf_id = generate_pdf_id(filename)
|
1052 |
pdf_metadata[pdf_id] = str(file_path)
|
1053 |
save_pdf_metadata()
|
1054 |
|
1055 |
-
# ๋ฐฑ๊ทธ๋ผ์ด๋์์ ์บ์ฑ ์์
|
1056 |
asyncio.create_task(cache_pdf(str(file_path)))
|
1057 |
|
1058 |
return {
|
@@ -1062,77 +1022,68 @@ async def convert_text_to_pdf(text_content: str, title: str) -> str:
|
|
1062 |
}
|
1063 |
|
1064 |
except Exception as e:
|
1065 |
-
logger.error(f"
|
1066 |
raise e
|
1067 |
|
1068 |
|
1069 |
-
# AI
|
1070 |
async def enhance_text_with_ai(text_content: str, title: str) -> str:
|
1071 |
-
#
|
1072 |
-
return text_content
|
1073 |
-
|
1074 |
|
1075 |
|
1076 |
-
#
|
1077 |
@app.post("/api/text-to-pdf")
|
1078 |
async def text_to_pdf(file: UploadFile = File(...)):
|
1079 |
try:
|
1080 |
-
# ์ง์ํ๋ ํ์ผ ํ์ ํ์ธ
|
1081 |
filename = file.filename.lower()
|
1082 |
if not (filename.endswith('.txt') or filename.endswith('.docx') or filename.endswith('.doc')):
|
1083 |
return JSONResponse(
|
1084 |
-
content={"success": False, "message": "
|
1085 |
status_code=400
|
1086 |
)
|
1087 |
|
1088 |
-
# ํ์ผ ๋ด์ฉ ์ฝ๊ธฐ
|
1089 |
content = await file.read()
|
1090 |
|
1091 |
-
#
|
1092 |
if filename.endswith('.txt'):
|
1093 |
-
# ์ธ์ฝ๋ฉ ์๋ ๊ฐ์ง ์๋
|
1094 |
encodings = ['utf-8', 'euc-kr', 'cp949', 'latin1']
|
1095 |
text_content = None
|
1096 |
|
1097 |
for encoding in encodings:
|
1098 |
try:
|
1099 |
text_content = content.decode(encoding, errors='strict')
|
1100 |
-
logger.info(f"
|
1101 |
break
|
1102 |
except UnicodeDecodeError:
|
1103 |
continue
|
1104 |
|
1105 |
if text_content is None:
|
1106 |
-
# ๋ชจ๋ ์ธ์ฝ๋ฉ ์๋ ์คํจ ์ ๊ธฐ๋ณธ์ ์ผ๋ก UTF-8๋ก ์๋ํ๊ณ ์ค๋ฅ๋ ๋์ฒด ๋ฌธ์๋ก ์ฒ๋ฆฌ
|
1107 |
text_content = content.decode('utf-8', errors='replace')
|
1108 |
-
logger.warning("
|
1109 |
|
1110 |
elif filename.endswith('.docx') or filename.endswith('.doc'):
|
1111 |
-
# ์์ ํ์ผ๋ก ์ ์ฅ
|
1112 |
with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(filename)[1]) as temp_file:
|
1113 |
temp_file.write(content)
|
1114 |
temp_path = temp_file.name
|
1115 |
|
1116 |
try:
|
1117 |
-
# docx2txt๋ก ํ
์คํธ ์ถ์ถ
|
1118 |
text_content = docx2txt.process(temp_path)
|
1119 |
finally:
|
1120 |
-
# ์์ ํ์ผ ์ญ์
|
1121 |
os.unlink(temp_path)
|
1122 |
|
1123 |
-
# ํ์ผ๋ช
์์ ์ ๋ชฉ ์ถ์ถ (ํ์ฅ์ ์ ์ธ)
|
1124 |
title = os.path.splitext(filename)[0]
|
1125 |
|
1126 |
-
# AI
|
1127 |
enhanced_text = await enhance_text_with_ai(text_content, title)
|
1128 |
|
1129 |
-
#
|
1130 |
pdf_info = await convert_text_to_pdf(enhanced_text, title)
|
1131 |
|
1132 |
return JSONResponse(
|
1133 |
content={
|
1134 |
-
"success": True,
|
1135 |
-
"path": pdf_info["path"],
|
1136 |
"name": os.path.splitext(pdf_info["filename"])[0],
|
1137 |
"id": pdf_info["id"],
|
1138 |
"viewUrl": f"/view/{pdf_info['id']}"
|
@@ -1142,49 +1093,46 @@ async def text_to_pdf(file: UploadFile = File(...)):
|
|
1142 |
except Exception as e:
|
1143 |
import traceback
|
1144 |
error_details = traceback.format_exc()
|
1145 |
-
logger.error(f"
|
1146 |
-
return JSONResponse(
|
1147 |
-
content={"success": False, "message": str(e)},
|
1148 |
-
status_code=500
|
1149 |
-
)
|
1150 |
|
1151 |
-
|
|
|
1152 |
@app.post("/api/admin-login")
|
1153 |
async def admin_login(password: str = Form(...)):
|
1154 |
if password == ADMIN_PASSWORD:
|
1155 |
return {"success": True}
|
1156 |
-
return {"success": False, "message": "
|
|
|
1157 |
|
1158 |
-
#
|
1159 |
@app.delete("/api/admin/delete-pdf")
|
1160 |
async def delete_pdf(path: str):
|
1161 |
try:
|
1162 |
pdf_file = pathlib.Path(path)
|
1163 |
if not pdf_file.exists():
|
1164 |
-
return {"success": False, "message": "
|
1165 |
|
1166 |
-
# PDF ํ์ผ๋ช
๊ฐ์ ธ์ค๊ธฐ
|
1167 |
filename = pdf_file.name
|
1168 |
|
1169 |
-
#
|
1170 |
pdf_file.unlink()
|
1171 |
|
1172 |
-
#
|
1173 |
main_file_path = PDF_DIR / filename
|
1174 |
if main_file_path.exists():
|
1175 |
main_file_path.unlink()
|
1176 |
|
1177 |
-
#
|
1178 |
pdf_name = pdf_file.stem
|
1179 |
cache_path = get_cache_path(pdf_name)
|
1180 |
if cache_path.exists():
|
1181 |
cache_path.unlink()
|
1182 |
-
|
1183 |
-
# ์บ์ ๋ฉ๋ชจ๋ฆฌ์์๋ ์ ๊ฑฐ
|
1184 |
if pdf_name in pdf_cache:
|
1185 |
del pdf_cache[pdf_name]
|
1186 |
-
|
1187 |
-
#
|
1188 |
to_remove = []
|
1189 |
for pid, fpath in pdf_metadata.items():
|
1190 |
if os.path.basename(fpath) == filename:
|
@@ -1194,30 +1142,31 @@ async def delete_pdf(path: str):
|
|
1194 |
del pdf_metadata[pid]
|
1195 |
|
1196 |
save_pdf_metadata()
|
1197 |
-
|
1198 |
return {"success": True}
|
1199 |
except Exception as e:
|
1200 |
-
logger.error(f"
|
1201 |
return {"success": False, "message": str(e)}
|
1202 |
|
1203 |
-
|
|
|
1204 |
@app.post("/api/admin/feature-pdf")
|
1205 |
async def feature_pdf(path: str):
|
1206 |
try:
|
1207 |
pdf_file = pathlib.Path(path)
|
1208 |
if not pdf_file.exists():
|
1209 |
-
return {"success": False, "message": "
|
1210 |
|
1211 |
-
# ๋ฉ์ธ ๋๋ ํ ๋ฆฌ์ ๋ณต์ฌ
|
1212 |
target_path = PDF_DIR / pdf_file.name
|
1213 |
shutil.copy2(pdf_file, target_path)
|
1214 |
-
|
1215 |
return {"success": True}
|
1216 |
except Exception as e:
|
1217 |
-
logger.error(f"
|
1218 |
return {"success": False, "message": str(e)}
|
1219 |
|
1220 |
-
|
|
|
1221 |
@app.delete("/api/admin/unfeature-pdf")
|
1222 |
async def unfeature_pdf(path: str):
|
1223 |
try:
|
@@ -1226,25 +1175,24 @@ async def unfeature_pdf(path: str):
|
|
1226 |
|
1227 |
if target_path.exists():
|
1228 |
target_path.unlink()
|
1229 |
-
|
1230 |
return {"success": True}
|
1231 |
except Exception as e:
|
1232 |
-
logger.error(f"
|
1233 |
return {"success": False, "message": str(e)}
|
1234 |
|
1235 |
-
|
1236 |
@app.get("/view/{pdf_id}")
|
1237 |
async def view_pdf_by_id(pdf_id: str):
|
1238 |
-
# PDF ID ์ ํจํ์ง ํ์ธ
|
1239 |
pdf_path = get_pdf_path_by_id(pdf_id)
|
1240 |
|
1241 |
if not pdf_path:
|
1242 |
-
#
|
1243 |
load_pdf_metadata()
|
1244 |
pdf_path = get_pdf_path_by_id(pdf_id)
|
1245 |
|
1246 |
if not pdf_path:
|
1247 |
-
#
|
1248 |
for file_path in get_pdf_files() + get_permanent_pdf_files():
|
1249 |
name_part = pdf_id.split('_')[0] if '_' in pdf_id else pdf_id
|
1250 |
if file_path.stem.startswith(name_part):
|
@@ -1255,14 +1203,17 @@ async def view_pdf_by_id(pdf_id: str):
|
|
1255 |
|
1256 |
if not pdf_path:
|
1257 |
return HTMLResponse(
|
1258 |
-
content=
|
|
|
|
|
|
|
1259 |
status_code=404
|
1260 |
)
|
1261 |
|
1262 |
-
#
|
1263 |
return get_html_content(pdf_id=pdf_id)
|
1264 |
|
1265 |
-
|
1266 |
def get_html_content(pdf_id: str = None):
|
1267 |
html_path = BASE / "flipbook_template.html"
|
1268 |
content = ""
|
@@ -1270,59 +1221,49 @@ def get_html_content(pdf_id: str = None):
|
|
1270 |
with open(html_path, "r", encoding="utf-8") as f:
|
1271 |
content = f.read()
|
1272 |
else:
|
1273 |
-
content = HTML #
|
1274 |
|
1275 |
-
# PDF ID๊ฐ ์ ๊ณต๋ ๊ฒฝ์ฐ, ์๋ ๋ก๋ ์คํฌ๋ฆฝํธ ์ถ๊ฐ
|
1276 |
if pdf_id:
|
1277 |
auto_load_script = f"""
|
1278 |
<script>
|
1279 |
-
// ํ์ด์ง ๋ก๋ ์ ์๋์ผ๋ก ํด๋น PDF ์ด๊ธฐ
|
1280 |
document.addEventListener('DOMContentLoaded', async function() {{
|
1281 |
try {{
|
1282 |
-
// PDF ์ ๋ณด ๊ฐ์ ธ์ค๊ธฐ
|
1283 |
const response = await fetch('/api/pdf-info-by-id/{pdf_id}');
|
1284 |
const pdfInfo = await response.json();
|
1285 |
|
1286 |
if (pdfInfo.exists && pdfInfo.path) {{
|
1287 |
-
// ์ฝ๊ฐ์ ์ง์ฐ ํ PDF ๋ทฐ์ด ์ด๊ธฐ (UI๊ฐ ์ค๋น๋ ํ)
|
1288 |
setTimeout(() => {{
|
1289 |
openPdfById('{pdf_id}', pdfInfo.path, pdfInfo.cached);
|
1290 |
}}, 500);
|
1291 |
}} else {{
|
1292 |
-
showError("
|
1293 |
}}
|
1294 |
}} catch (e) {{
|
1295 |
-
console.error("
|
1296 |
}}
|
1297 |
}});
|
1298 |
</script>
|
1299 |
"""
|
1300 |
|
1301 |
-
# body ์ข
๋ฃ ํ๊ทธ ์ ์ ์คํฌ๋ฆฝํธ ์ฝ์
|
1302 |
content = content.replace("</body>", auto_load_script + "</body>")
|
1303 |
|
1304 |
return HTMLResponse(content=content)
|
1305 |
|
|
|
1306 |
@app.get("/", response_class=HTMLResponse)
|
1307 |
async def root(request: Request, pdf_id: Optional[str] = Query(None)):
|
1308 |
-
# PDF ID๊ฐ ์ฟผ๋ฆฌ ํ๋ผ๋ฏธํฐ๋ก ์ ๊ณต๋ ๊ฒฝ์ฐ /view/{pdf_id}๋ก ๋ฆฌ๋ค์ด๋ ํธ
|
1309 |
if pdf_id:
|
1310 |
return RedirectResponse(url=f"/view/{pdf_id}")
|
1311 |
return get_html_content()
|
1312 |
|
1313 |
-
# HTML ๋ฌธ์์ด (AI ๋ฒํผ ๋ฐ ์ฑ๋ด UI ์ถ๊ฐ)
|
1314 |
-
# HTML ๋ฌธ์์ด (AI ๋ฒํผ ๋ฐ ์ฑ๋ด UI ์ถ๊ฐ)
|
1315 |
-
import os
|
1316 |
|
1317 |
-
|
1318 |
HTML = os.getenv("HTML_TEMPLATE", "")
|
1319 |
-
|
1320 |
-
# HTML์ด ๋น์ด์์ ๊ฒฝ์ฐ ๊ธฐ๋ณธ HTML ์ฌ์ฉ (fallback)
|
1321 |
if not HTML:
|
1322 |
-
logger.warning("HTML_TEMPLATE secret
|
1323 |
HTML = """
|
1324 |
<!doctype html>
|
1325 |
-
<html lang="
|
1326 |
<head>
|
1327 |
<meta charset="utf-8">
|
1328 |
<title>FlipBook Space</title>
|
@@ -1332,12 +1273,12 @@ if not HTML:
|
|
1332 |
</style>
|
1333 |
</head>
|
1334 |
<body>
|
1335 |
-
<h1>
|
1336 |
-
<p class="error">HTML_TEMPLATE secret
|
1337 |
-
<p>Hugging Face Space
|
1338 |
</body>
|
1339 |
</html>
|
1340 |
"""
|
1341 |
|
1342 |
if __name__ == "__main__":
|
1343 |
-
uvicorn.run("app:app", host="0.0.0.0", port=int(os.getenv("PORT", 7860)))
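For local testing, a hedged end-to-end sketch; the port and environment variables mirror the defaults read at the top of the file, and the file names and ID handling below are assumptions:

    # Start the server:
    #   python app.py
    #   or: uvicorn app:app --host 0.0.0.0 --port 7860
    # Optional env vars: PASSWORD, LLM_API, HTML_TEMPLATE.

    import requests

    BASE_URL = "http://localhost:7860"

    # 1. Upload a PDF; the response carries the generated ID and view URL.
    with open("sample.pdf", "rb") as f:
        info = requests.post(f"{BASE_URL}/api/upload-pdf",
                             files={"file": ("sample.pdf", f, "application/pdf")}).json()
    pdf_id = info["id"]

    # 2. Ask a question about it (requires a valid LLM_API key on the server).
    print(requests.post(f"{BASE_URL}/api/ai/query-pdf/{pdf_id}",
                        json={"query": "What is this document about?"}).json())

    # 3. Or request a summary.
    print(requests.get(f"{BASE_URL}/api/ai/summarize-pdf/{pdf_id}").json())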
|
|
|
17 |
import io
|
18 |
import docx2txt
|
19 |
|
20 |
+
# Logging configuration
|
21 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
22 |
logger = logging.getLogger(__name__)
|
23 |
|
|
|
25 |
app = FastAPI()
|
26 |
app.mount("/static", StaticFiles(directory=BASE), name="static")
|
27 |
|
28 |
+
# PDF directory (main directory)
|
29 |
PDF_DIR = BASE / "pdf"
|
30 |
if not PDF_DIR.exists():
|
31 |
PDF_DIR.mkdir(parents=True)
|
32 |
|
33 |
+
# Permanent PDF directory (Hugging Face persistent disk)
|
34 |
PERMANENT_PDF_DIR = pathlib.Path("/data/pdfs") if os.path.exists("/data") else BASE / "permanent_pdfs"
|
35 |
if not PERMANENT_PDF_DIR.exists():
|
36 |
PERMANENT_PDF_DIR.mkdir(parents=True)
|
37 |
|
38 |
+
# Cache directory
|
39 |
CACHE_DIR = BASE / "cache"
|
40 |
if not CACHE_DIR.exists():
|
41 |
CACHE_DIR.mkdir(parents=True)
|
42 |
|
43 |
+
# PDF metadata directory and file
|
44 |
METADATA_DIR = pathlib.Path("/data/metadata") if os.path.exists("/data") else BASE / "metadata"
|
45 |
if not METADATA_DIR.exists():
|
46 |
METADATA_DIR.mkdir(parents=True)
|
47 |
PDF_METADATA_FILE = METADATA_DIR / "pdf_metadata.json"
|
48 |
|
49 |
+
# Embedding cache directory
|
50 |
EMBEDDING_DIR = pathlib.Path("/data/embeddings") if os.path.exists("/data") else BASE / "embeddings"
|
51 |
if not EMBEDDING_DIR.exists():
|
52 |
EMBEDDING_DIR.mkdir(parents=True)
|
53 |
|
54 |
+
# Admin password
|
55 |
+
ADMIN_PASSWORD = os.getenv("PASSWORD", "admin") # Retrieved from environment variable; default is for testing
|
56 |
|
57 |
+
# OpenAI API key
|
58 |
OPENAI_API_KEY = os.getenv("LLM_API", "")
|
59 |
+
# Flag indicating if we have a valid API key
|
60 |
HAS_VALID_API_KEY = bool(OPENAI_API_KEY and OPENAI_API_KEY.strip())
|
61 |
|
62 |
if HAS_VALID_API_KEY:
|
63 |
try:
|
64 |
openai_client = OpenAI(api_key=OPENAI_API_KEY, timeout=30.0)
|
65 |
+
logger.info("OpenAI client initialized successfully.")
|
66 |
except Exception as e:
|
67 |
+
logger.error(f"Failed to initialize OpenAI client: {e}")
|
68 |
HAS_VALID_API_KEY = False
|
69 |
else:
|
70 |
+
logger.warning("No valid OpenAI API key found. AI features will be limited.")
|
71 |
openai_client = None
|
72 |
|
73 |
+
# Global cache object
|
74 |
pdf_cache: Dict[str, Dict[str, Any]] = {}
|
75 |
+
# Cache locks
|
76 |
cache_locks = {}
|
77 |
+
# PDF metadata (ID -> path)
|
78 |
pdf_metadata: Dict[str, str] = {}
|
79 |
+
# PDF embedding cache
|
80 |
pdf_embeddings: Dict[str, Dict[str, Any]] = {}
|
81 |
|
82 |
+
|
83 |
+
# Load PDF metadata from file
|
84 |
def load_pdf_metadata():
|
85 |
global pdf_metadata
|
86 |
if PDF_METADATA_FILE.exists():
|
87 |
try:
|
88 |
with open(PDF_METADATA_FILE, "r") as f:
|
89 |
pdf_metadata = json.load(f)
|
90 |
+
logger.info(f"PDF metadata loaded successfully: {len(pdf_metadata)} entries")
|
91 |
except Exception as e:
|
92 |
+
logger.error(f"Error loading metadata: {e}")
|
93 |
pdf_metadata = {}
|
94 |
else:
|
95 |
pdf_metadata = {}
|
96 |
|
97 |
+
|
98 |
+
# Save PDF metadata to file
|
99 |
def save_pdf_metadata():
|
100 |
try:
|
101 |
with open(PDF_METADATA_FILE, "w") as f:
|
102 |
json.dump(pdf_metadata, f)
|
103 |
except Exception as e:
|
104 |
+
logger.error(f"Error saving metadata: {e}")
|
105 |
+
|
106 |
|
107 |
+
# Generate a PDF ID (based on filename + timestamp)
|
108 |
def generate_pdf_id(filename: str) -> str:
|
|
|
|
|
|
|
109 |
import re
|
110 |
+
base_name = os.path.splitext(filename)[0]
|
111 |
safe_name = re.sub(r'[^\w\-_]', '_', base_name.replace(" ", "_"))
|
|
|
112 |
timestamp = int(time.time())
|
|
|
113 |
random_suffix = uuid.uuid4().hex[:6]
|
114 |
return f"{safe_name}_{timestamp}_{random_suffix}"
|
115 |
|
116 |
+
|
117 |
+
# Retrieve list of PDF files in main directory
|
118 |
def get_pdf_files():
|
119 |
pdf_files = []
|
120 |
if PDF_DIR.exists():
|
121 |
pdf_files = [f for f in PDF_DIR.glob("*.pdf")]
|
122 |
return pdf_files
|
123 |
|
124 |
+
|
125 |
+
# Retrieve list of PDF files in permanent directory
|
126 |
def get_permanent_pdf_files():
|
127 |
pdf_files = []
|
128 |
if PERMANENT_PDF_DIR.exists():
|
129 |
pdf_files = [f for f in PERMANENT_PDF_DIR.glob("*.pdf")]
|
130 |
return pdf_files
|
131 |
|
132 |
+
|
133 |
+
# Generate PDF project data (thumbnails, etc.)
|
134 |
def generate_pdf_projects():
|
135 |
projects_data = []
|
136 |
|
137 |
+
# Get files from both main and permanent directories
|
138 |
pdf_files = get_pdf_files()
|
139 |
permanent_pdf_files = get_permanent_pdf_files()
|
140 |
|
141 |
+
# Combine both sets of files (remove duplicates by filename)
|
142 |
unique_files = {}
|
143 |
|
144 |
+
# Add from main directory first
|
145 |
for file in pdf_files:
|
146 |
unique_files[file.name] = file
|
147 |
|
148 |
+
# Then add from permanent directory (overwrite if same filename)
|
149 |
for file in permanent_pdf_files:
|
150 |
unique_files[file.name] = file
|
151 |
|
|
|
152 |
for pdf_file in unique_files.values():
|
153 |
+
# Find the PDF ID for this file
|
154 |
pdf_id = None
|
155 |
for pid, path in pdf_metadata.items():
|
156 |
if os.path.basename(path) == pdf_file.name:
|
157 |
pdf_id = pid
|
158 |
break
|
159 |
|
160 |
+
# If the file has no ID, generate one and add it to metadata
|
161 |
if not pdf_id:
|
162 |
pdf_id = generate_pdf_id(pdf_file.name)
|
163 |
pdf_metadata[pdf_id] = str(pdf_file)
|
|
|
172 |
|
173 |
return projects_data
|
174 |
|
175 |
+
|
176 |
+
# Get path for cache file
|
177 |
def get_cache_path(pdf_name: str):
|
178 |
return CACHE_DIR / f"{pdf_name}_cache.json"
|
179 |
|
180 |
+
|
181 |
+
# Get path for embedding cache file
|
182 |
def get_embedding_path(pdf_id: str):
|
183 |
return EMBEDDING_DIR / f"{pdf_id}_embedding.json"
|
184 |
|
185 |
+
|
186 |
+
# Extract text from a PDF
|
187 |
def extract_pdf_text(pdf_path: str) -> List[Dict[str, Any]]:
|
188 |
try:
|
189 |
doc = fitz.open(pdf_path)
|
|
|
193 |
page = doc[page_num]
|
194 |
text = page.get_text()
|
195 |
|
196 |
+
# Only add if the text is non-empty
|
197 |
if text.strip():
|
198 |
chunks.append({
|
199 |
"page": page_num + 1,
|
|
|
203 |
|
204 |
return chunks
|
205 |
except Exception as e:
|
206 |
+
logger.error(f"Error extracting text from PDF: {e}")
|
207 |
return []
|
208 |
|
209 |
+
|
210 |
+
# Get or create PDF embedding by PDF ID
|
211 |
async def get_pdf_embedding(pdf_id: str) -> Dict[str, Any]:
|
212 |
try:
|
213 |
+
# Check embedding cache file
|
214 |
embedding_path = get_embedding_path(pdf_id)
|
215 |
if embedding_path.exists():
|
216 |
try:
|
217 |
with open(embedding_path, "r", encoding="utf-8") as f:
|
218 |
return json.load(f)
|
219 |
except Exception as e:
|
220 |
+
logger.error(f"Error loading embedding cache: {e}")
|
221 |
|
222 |
+
# Find the actual PDF path
|
223 |
pdf_path = get_pdf_path_by_id(pdf_id)
|
224 |
if not pdf_path:
|
225 |
+
raise ValueError(f"Could not find a file corresponding to PDF ID {pdf_id}")
|
226 |
|
227 |
+
# Extract text
|
228 |
chunks = extract_pdf_text(pdf_path)
|
229 |
if not chunks:
|
230 |
+
raise ValueError(f"No text could be extracted from PDF: {pdf_path}")
|
231 |
|
232 |
+
# Here, you'd normally create or fetch embeddings. For now, we just store chunks.
|
233 |
embedding_data = {
|
234 |
"pdf_id": pdf_id,
|
235 |
"pdf_path": pdf_path,
|
|
|
237 |
"created_at": time.time()
|
238 |
}
|
239 |
|
240 |
+
# Save embedding data to cache
|
241 |
with open(embedding_path, "w", encoding="utf-8") as f:
|
242 |
json.dump(embedding_data, f, ensure_ascii=False)
|
243 |
|
244 |
return embedding_data
|
245 |
|
246 |
except Exception as e:
|
247 |
+
logger.error(f"Error creating PDF embedding: {e}")
|
248 |
return {"error": str(e), "pdf_id": pdf_id}
|
249 |
|
250 |
+
|
251 |
+
# Query a PDF using its content (simple approach)
|
252 |
async def query_pdf(pdf_id: str, query: str) -> Dict[str, Any]:
|
253 |
try:
|
254 |
+
# If there's no valid API key
|
255 |
if not HAS_VALID_API_KEY or not openai_client:
|
256 |
return {
|
257 |
+
"error": "OpenAI API key not set.",
|
258 |
+
"answer": "Sorry, the AI feature is currently disabled. Please contact the system administrator."
|
259 |
}
|
260 |
|
261 |
+
# Get embedding data
|
262 |
embedding_data = await get_pdf_embedding(pdf_id)
|
263 |
if "error" in embedding_data:
|
264 |
return {"error": embedding_data["error"]}
|
265 |
|
266 |
+
# For simplicity, gather all text from the PDF
|
267 |
all_text = "\n\n".join([f"Page {chunk['page']}: {chunk['text']}" for chunk in embedding_data["chunks"]])
|
268 |
|
269 |
+
# Truncate context if too long
|
270 |
+
max_context_length = 60000 # roughly by characters
|
271 |
if len(all_text) > max_context_length:
|
272 |
+
all_text = all_text[:max_context_length] + "...(truncated)"
|
273 |
|
274 |
+
# System prompt
|
275 |
system_prompt = """
|
276 |
+
The default language is English. However, please respond in the language used in the user's prompt (e.g., English, Korean, Japanese, Chinese, etc.).
|
277 |
+
You are an assistant that answers questions based solely on the provided PDF content. Use only the information from the PDF content to respond. If the relevant information is not available in the PDF, respond with: "The requested information could not be found in the provided PDF."
|
278 |
+
Provide clear, concise answers and cite relevant page numbers. Always remain polite and courteous.
|
279 |
"""
|
280 |
|
281 |
+
# Attempting to call the openai_client
|
282 |
try:
|
283 |
+
# Retry logic
|
284 |
+
for attempt in range(3):
|
285 |
try:
|
286 |
response = openai_client.chat.completions.create(
|
287 |
model="gpt-4.1-mini",
|
288 |
messages=[
|
289 |
{"role": "system", "content": system_prompt},
|
290 |
+
{
|
291 |
+
"role": "user",
|
292 |
+
"content": (
|
293 |
+
f"The default language is English."
|
294 |
+
f"Please answer the following question using the PDF content below.\n\n"
|
295 |
+
f"PDF Content:\n{all_text}\n\n"
|
296 |
+
f"Question: {query}"
|
297 |
+
),
|
298 |
+
},
|
299 |
],
|
300 |
temperature=0.7,
|
301 |
max_tokens=2048,
|
302 |
+
timeout=30.0
|
303 |
)
|
304 |
|
305 |
answer = response.choices[0].message.content
|
|
|
309 |
"query": query
|
310 |
}
|
311 |
except Exception as api_error:
|
312 |
+
logger.error(f"OpenAI API call error (attempt {attempt+1}/3): {api_error}")
|
313 |
+
if attempt == 2:
|
314 |
raise api_error
|
315 |
+
await asyncio.sleep(1 * (attempt + 1))
|
316 |
|
317 |
+
raise Exception("All retry attempts for API call failed.")
|
|
|
318 |
except Exception as api_error:
|
319 |
+
logger.error(f"Final OpenAI API call error: {api_error}")
|
|
|
320 |
error_message = str(api_error)
|
321 |
if "Connection" in error_message:
|
322 |
+
return {"error": "Could not connect to the OpenAI server. Please check your internet connection."}
|
323 |
elif "Unauthorized" in error_message or "Authentication" in error_message:
|
324 |
+
return {"error": "Invalid API key."}
|
325 |
elif "Rate limit" in error_message:
|
326 |
+
return {"error": "API rate limit exceeded. Please try again later."}
|
327 |
else:
|
328 |
+
return {"error": f"An error occurred while generating the AI response: {error_message}"}
|
329 |
|
330 |
except Exception as e:
|
331 |
+
logger.error(f"Error in query_pdf: {e}")
|
332 |
return {"error": str(e)}
|
333 |
|
334 |
+
|
335 |
+
# Summarize PDF
|
336 |
async def summarize_pdf(pdf_id: str) -> Dict[str, Any]:
|
337 |
try:
|
338 |
+
# If there's no valid API key
|
339 |
if not HAS_VALID_API_KEY or not openai_client:
|
340 |
return {
|
341 |
+
"error": "OpenAI API key not set. Check 'LLM_API' environment variable.",
|
342 |
+
"summary": "Cannot generate summary without an API key. Please contact the system administrator."
|
343 |
}
|
344 |
|
345 |
+
# Get embedding data
|
346 |
embedding_data = await get_pdf_embedding(pdf_id)
|
347 |
if "error" in embedding_data:
|
348 |
+
return {"error": embedding_data["error"], "summary": "Cannot extract text from the PDF."}
|
349 |
|
|
|
350 |
all_text = "\n\n".join([f"Page {chunk['page']}: {chunk['text']}" for chunk in embedding_data["chunks"]])
|
351 |
|
352 |
+
# Truncate if too long
|
353 |
+
max_context_length = 60000
|
354 |
if len(all_text) > max_context_length:
|
355 |
+
all_text = all_text[:max_context_length] + "...(truncated)"
|
356 |
|
|
|
357 |
try:
|
358 |
+
# Retry logic
|
359 |
+
for attempt in range(3):
|
360 |
try:
|
361 |
response = openai_client.chat.completions.create(
|
362 |
model="gpt-4.1-mini",
|
363 |
messages=[
|
364 |
+
{
|
365 |
+
"role": "system",
|
366 |
+
"content": (
|
367 |
+
"The default language is English. Please summarize the following PDF content "
|
368 |
+
"concisely, including key topics and main points, in less than 500 characters."
|
369 |
+
),
|
370 |
+
},
|
371 |
+
{"role": "user", "content": f"PDF Content:\n{all_text}"}
|
372 |
],
|
373 |
temperature=0.7,
|
374 |
max_tokens=1024,
|
375 |
+
timeout=30.0
|
376 |
)
|
377 |
|
378 |
summary = response.choices[0].message.content
|
|
|
381 |
"pdf_id": pdf_id
|
382 |
}
|
383 |
except Exception as api_error:
|
384 |
+
logger.error(f"OpenAI API call error (attempt {attempt+1}/3): {api_error}")
|
385 |
+
if attempt == 2:
|
386 |
raise api_error
|
387 |
+
await asyncio.sleep(1 * (attempt + 1))
|
388 |
|
389 |
+
raise Exception("All retry attempts for API call failed.")
|
|
|
390 |
except Exception as api_error:
|
391 |
+
logger.error(f"Final OpenAI API error: {api_error}")
|
|
|
392 |
error_message = str(api_error)
|
393 |
if "Connection" in error_message:
|
394 |
+
return {"error": "Could not connect to the OpenAI server. Check your internet connection.", "pdf_id": pdf_id}
|
395 |
elif "Unauthorized" in error_message or "Authentication" in error_message:
|
396 |
+
return {"error": "Invalid API key.", "pdf_id": pdf_id}
|
397 |
elif "Rate limit" in error_message:
|
398 |
+
return {"error": "API rate limit exceeded. Please try again later.", "pdf_id": pdf_id}
|
399 |
else:
|
400 |
+
return {"error": f"An error occurred while generating the summary: {error_message}", "pdf_id": pdf_id}
|
401 |
|
402 |
except Exception as e:
|
403 |
+
logger.error(f"Error summarizing PDF: {e}")
|
404 |
return {
|
405 |
"error": str(e),
|
406 |
+
"summary": "An error occurred while summarizing the PDF. The PDF may be too large or in an unsupported format."
|
407 |
}


# Optimized PDF page caching
async def cache_pdf(pdf_path: str):
    try:
        import fitz  # PyMuPDF

        pdf_file = pathlib.Path(pdf_path)
        pdf_name = pdf_file.stem

        # Create a lock for this PDF (avoid concurrent caching)
        if pdf_name not in cache_locks:
            cache_locks[pdf_name] = threading.Lock()

        # If it's already being cached or completed, skip
        if pdf_name in pdf_cache and pdf_cache[pdf_name].get("status") in ["processing", "completed"]:
            logger.info(f"PDF {pdf_name} is already cached or in progress.")
            return

        with cache_locks[pdf_name]:
            # Double-check after lock acquisition
            if pdf_name in pdf_cache and pdf_cache[pdf_name].get("status") in ["processing", "completed"]:
                return

            pdf_cache[pdf_name] = {"status": "processing", "progress": 0, "pages": []}

            # Check if there's an existing cache file
            cache_path = get_cache_path(pdf_name)
            if cache_path.exists():
                try:
                    # (Reconstructed from context: load the cached JSON before validating it.)
                    with open(cache_path, "r") as cache_file:
                        cached_data = json.load(cache_file)
                    if cached_data.get("status") == "completed" and cached_data.get("pages"):
                        pdf_cache[pdf_name] = cached_data
                        pdf_cache[pdf_name]["status"] = "completed"
                        logger.info(f"Loaded {pdf_name} from cache file.")
                        return
                except Exception as e:
                    logger.error(f"Failed to load cache file: {e}")

            # Open the PDF
            doc = fitz.open(pdf_path)
            total_pages = doc.page_count

            # Generate a small thumbnail for the first page in advance (fast UI loading)
            if total_pages > 0:
                page = doc[0]
                pix_thumb = page.get_pixmap(matrix=fitz.Matrix(0.2, 0.2))  # 20% scale
                thumb_data = pix_thumb.tobytes("png")
                b64_thumb = base64.b64encode(thumb_data).decode('utf-8')
                thumb_src = f"data:image/png;base64,{b64_thumb}"

                pdf_cache[pdf_name]["pages"] = [{"thumb": thumb_src, "src": ""}]
                pdf_cache[pdf_name]["progress"] = 1
                pdf_cache[pdf_name]["total_pages"] = total_pages

            # Adjust resolution and quality to balance size and legibility
            scale_factor = 1.0
            jpeg_quality = 80

            # Worker function for parallel page processing
            def process_page(page_num):
                try:
                    page = doc[page_num]
                    pix = page.get_pixmap(matrix=fitz.Matrix(scale_factor, scale_factor))
                    img_data = pix.tobytes("jpeg", jpeg_quality)
                    b64_img = base64.b64encode(img_data).decode('utf-8')
                    img_src = f"data:image/jpeg;base64,{b64_img}"

                    # First page gets the thumbnail, others stay empty
                    thumb_src = "" if page_num > 0 else pdf_cache[pdf_name]["pages"][0]["thumb"]

                    return {
                        "page_num": page_num,
                        "src": img_src,
                        "thumb": thumb_src
                    }
                except Exception as e:
                    logger.error(f"Error processing page {page_num}: {e}")
                    return {
                        "page_num": page_num,
                        "src": "",
                        "thumb": "",
                        "error": str(e)
                    }

            pages = [None] * total_pages
            processed_count = 0

            # Batch processing
            batch_size = 5

            for batch_start in range(0, total_pages, batch_size):
                batch_end = min(batch_start + batch_size, total_pages)
                current_batch = list(range(batch_start, batch_end))

                with concurrent.futures.ThreadPoolExecutor(max_workers=min(5, batch_size)) as executor:
                    batch_results = list(executor.map(process_page, current_batch))

                for result in batch_results:
                    page_num = result["page_num"]
                    pages[page_num] = {
                        # (Reconstructed from context: store the rendered image and thumbnail.)
                        "src": result["src"],
                        "thumb": result["thumb"]
                    }
                    processed_count += 1
                    progress = round(processed_count / total_pages * 100)
                    pdf_cache[pdf_name]["progress"] = progress

                pdf_cache[pdf_name]["pages"] = pages
                try:
                    with open(cache_path, "w") as cache_file:
                        # (Reconstructed from context: persist intermediate progress.)
                        json.dump({
                            "status": "processing",
                            "progress": pdf_cache[pdf_name]["progress"],
                            "pages": pages,
                            "total_pages": total_pages
                        }, cache_file)
                except Exception as e:
                    logger.error(f"Failed to save intermediate cache: {e}")

            pdf_cache[pdf_name] = {
                "status": "completed",
                "progress": 100,
                "pages": pages,
                "total_pages": total_pages
            }

            # Final save
            try:
                with open(cache_path, "w") as cache_file:
                    json.dump(pdf_cache[pdf_name], cache_file)
                logger.info(f"PDF {pdf_name} cached successfully with {total_pages} pages.")
            except Exception as e:
                logger.error(f"Failed to save final cache: {e}")

    except Exception as e:
        import traceback
        logger.error(f"Error caching PDF: {str(e)}\n{traceback.format_exc()}")
        if pdf_name in pdf_cache:
            pdf_cache[pdf_name]["status"] = "error"
            pdf_cache[pdf_name]["error"] = str(e)
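
# Illustrative note (not part of the original source): once caching completes,
# a pdf_cache entry (and its *_cache.json file) has roughly this shape:
#
#   {
#       "status": "completed",
#       "progress": 100,
#       "total_pages": 12,
#       "pages": [
#           {"src": "data:image/jpeg;base64,...", "thumb": "data:image/png;base64,..."},
#           {"src": "data:image/jpeg;base64,...", "thumb": ""},
#           ...
#       ]
#   }
#
# Only the first page carries a PNG thumbnail for fast UI loading; every page
# stores its full render as a base64 JPEG data URI.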


# Retrieve a PDF path by PDF ID
def get_pdf_path_by_id(pdf_id: str) -> Optional[str]:
    logger.info(f"Searching for PDF by ID: {pdf_id}")

    # 1. Check the metadata directly
    if pdf_id in pdf_metadata:
        path = pdf_metadata[pdf_id]
        if os.path.exists(path):
            return path

        # If the file was moved, try searching by filename
        filename = os.path.basename(path)

        # Check the permanent directory
        perm_path = PERMANENT_PDF_DIR / filename
        if perm_path.exists():
            pdf_metadata[pdf_id] = str(perm_path)
            save_pdf_metadata()
            return str(perm_path)

        # Check the main directory
        main_path = PDF_DIR / filename
        if main_path.exists():
            pdf_metadata[pdf_id] = str(main_path)
            save_pdf_metadata()
            return str(main_path)

    # 2. Fallback: search by partial filename
    try:
        name_part = pdf_id.split('_')[0] if '_' in pdf_id else pdf_id

        for file_path in get_pdf_files() + get_permanent_pdf_files():
            file_basename = os.path.basename(file_path)
            if file_basename.startswith(name_part) or file_path.stem.startswith(name_part):
                pdf_metadata[pdf_id] = str(file_path)
                save_pdf_metadata()
                return str(file_path)
    except Exception as e:
        logger.error(f"Error searching by filename: {e}")

    # 3. As a last resort, compare against existing metadata entries
    for pid, path in pdf_metadata.items():
        if os.path.exists(path):
            file_basename = os.path.basename(path)
            if pdf_id in pid or pid in pdf_id:
                pdf_metadata[pdf_id] = path
                save_pdf_metadata()
                # (Reconstructed from context: return the matched path.)
                return path

    return None
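
# Illustrative example (not part of the original source): generate_pdf_id()
# produces IDs like "My_Book_1715000000_a1b2c3" (sanitized name, timestamp,
# random suffix), so name_part above becomes "My" and can still prefix-match
# "My_Book.pdf" when the stored path has gone stale. Resolution order:
#   1. exact metadata hit
#   2. prefix match against files on disk
#   3. substring match against other metadata IDs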


# Initialize caching for all PDFs on startup
async def init_cache_all_pdfs():
    logger.info("Starting PDF caching process.")
    load_pdf_metadata()

    pdf_files = get_pdf_files() + get_permanent_pdf_files()
    unique_pdf_paths = set(str(p) for p in pdf_files)
    pdf_files = [pathlib.Path(p) for p in unique_pdf_paths]

    # Update metadata for all files
    for pdf_file in pdf_files:
        found = False
        for pid, path in pdf_metadata.items():
            if os.path.basename(path) == pdf_file.name:
                found = True
                if not os.path.exists(path):
                    pdf_metadata[pid] = str(pdf_file)
                break

        # (Reconstructed from context: register files that have no ID yet.)
        if not found:
            pdf_id = generate_pdf_id(pdf_file.name)
            pdf_metadata[pdf_id] = str(pdf_file)

    save_pdf_metadata()

    # Load existing cache files for a quick start
    for cache_file in CACHE_DIR.glob("*_cache.json"):
        try:
            pdf_name = cache_file.stem.replace("_cache", "")
            # (Reconstructed from context: read the cached JSON.)
            with open(cache_file, "r") as f:
                cached_data = json.load(f)
            if cached_data.get("status") == "completed" and cached_data.get("pages"):
                pdf_cache[pdf_name] = cached_data
                pdf_cache[pdf_name]["status"] = "completed"
                logger.info(f"Loaded existing cache: {pdf_name}")
        except Exception as e:
            logger.error(f"Error loading cache file: {str(e)}")

    # Cache the remaining files in parallel
    await asyncio.gather(*[
        asyncio.create_task(cache_pdf(str(pdf_file)))
        for pdf_file in pdf_files
        if pdf_file.stem not in pdf_cache or pdf_cache[pdf_file.stem].get("status") != "completed"
    ])


@app.on_event("startup")
async def startup_event():
    # Load PDF metadata
    load_pdf_metadata()

    # Create IDs for files that are missing one (mirrors init_cache_all_pdfs)
    for pdf_file in get_pdf_files() + get_permanent_pdf_files():
        found = False
        for pid, path in pdf_metadata.items():
            if os.path.basename(path) == pdf_file.name:
                found = True
                if not os.path.exists(path):
                    pdf_metadata[pid] = str(pdf_file)
                break

        if not found:
            pdf_id = generate_pdf_id(pdf_file.name)
            pdf_metadata[pdf_id] = str(pdf_file)

    save_pdf_metadata()

    # Start the background caching task
    asyncio.create_task(init_cache_all_pdfs())


# API endpoint: List PDF projects
@app.get("/api/pdf-projects")
async def get_pdf_projects_api():
    return generate_pdf_projects()


# API endpoint: List permanently stored PDF projects
@app.get("/api/permanent-pdf-projects")
async def get_permanent_pdf_projects_api():
    pdf_files = get_permanent_pdf_files()
    projects_data = []

    for pdf_file in pdf_files:
        pdf_id = None
        for pid, path in pdf_metadata.items():
            if os.path.basename(path) == pdf_file.name:
                pdf_id = pid
                break

        if not pdf_id:
            pdf_id = generate_pdf_id(pdf_file.name)
            pdf_metadata[pdf_id] = str(pdf_file)
            save_pdf_metadata()

        projects_data.append({
            "path": str(pdf_file),
            "name": pdf_file.stem,
            # (Reconstructed from context, mirroring the upload response shape.)
            "id": pdf_id,
            "viewUrl": f"/view/{pdf_id}"
        })

    return projects_data


# API endpoint: Get PDF info by ID
@app.get("/api/pdf-info-by-id/{pdf_id}")
async def get_pdf_info_by_id(pdf_id: str):
    pdf_path = get_pdf_path_by_id(pdf_id)
    # (Reconstructed from context: report path, name, and cache state on a hit.)
    if pdf_path and os.path.exists(pdf_path):
        pdf_file = pathlib.Path(pdf_path)
        return {
            "path": pdf_path,
            "name": pdf_file.stem,
            "exists": True,
            "cached": pdf_file.stem in pdf_cache and pdf_cache[pdf_file.stem].get("status") == "completed"
        }
    return {"exists": False, "error": "Could not find the specified PDF."}
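
# Illustrative exchange (ID and path hypothetical):
#   GET /api/pdf-info-by-id/My_Book_1715000000_a1b2c3
#   -> {"path": "/data/pdfs/My_Book.pdf", "name": "My_Book", "exists": true, "cached": true}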


# API endpoint: Get PDF thumbnail (optimized)
@app.get("/api/pdf-thumbnail")
async def get_pdf_thumbnail(path: str):
    try:
        pdf_file = pathlib.Path(path)
        pdf_name = pdf_file.stem

        # If cached, return the thumbnail from the cache
        if pdf_name in pdf_cache and pdf_cache[pdf_name].get("pages"):
            if pdf_cache[pdf_name]["pages"][0].get("thumb"):
                return {"thumbnail": pdf_cache[pdf_name]["pages"][0]["thumb"]}

        # If not cached, generate a quick low-resolution thumbnail
        import fitz
        doc = fitz.open(path)
        if doc.page_count > 0:
            page = doc[0]
            pix = page.get_pixmap(matrix=fitz.Matrix(0.2, 0.2))
            img_data = pix.tobytes("jpeg", 70)
            b64_img = base64.b64encode(img_data).decode('utf-8')

            # Start background caching
            asyncio.create_task(cache_pdf(path))

            return {"thumbnail": f"data:image/jpeg;base64,{b64_img}"}

        return {"thumbnail": None}
    except Exception as e:
        logger.error(f"Error generating thumbnail: {str(e)}")
        return {"error": str(e), "thumbnail": None}


# API endpoint: Cache status
@app.get("/api/cache-status")
async def get_cache_status(path: str = None):
    if path:
        # (Reconstructed from context: report the status of a single PDF.)
        pdf_file = pathlib.Path(path)
        pdf_name = pdf_file.stem
        if pdf_name in pdf_cache:
            return pdf_cache[pdf_name]
        return {"status": "not_cached"}
    else:
        return {
            name: {"status": info["status"], "progress": info.get("progress", 0)}
            for name, info in pdf_cache.items()
        }
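
# Illustrative usage (path hypothetical):
#   GET /api/cache-status                       -> {"My_Book": {"status": "processing", "progress": 42}, ...}
#   GET /api/cache-status?path=pdf/My_Book.pdf  -> the full cache entry, or {"status": "not_cached"}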


# API endpoint: Query PDF content with AI
@app.post("/api/ai/query-pdf/{pdf_id}")
async def api_query_pdf(pdf_id: str, query: Dict[str, str]):
    try:
        user_query = query.get("query", "")
        if not user_query:
            return JSONResponse(content={"error": "No question provided."}, status_code=400)

        pdf_path = get_pdf_path_by_id(pdf_id)
        if not pdf_path:
            return JSONResponse(content={"error": f"No file found for PDF ID {pdf_id}"}, status_code=404)

        result = await query_pdf(pdf_id, user_query)

        if "error" in result:
            # (Reconstructed from context: surface the error with a 500 status.)
            return JSONResponse(content=result, status_code=500)

        return result
    except Exception as e:
        logger.error(f"Error in AI query endpoint: {e}")
        return JSONResponse(content={"error": str(e)}, status_code=500)
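
# Illustrative request (ID hypothetical):
#   curl -X POST http://localhost:7860/api/ai/query-pdf/My_Book_1715000000_a1b2c3 \
#        -H "Content-Type: application/json" \
#        -d '{"query": "What is chapter 2 about?"}'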


# API endpoint: Summarize PDF
@app.get("/api/ai/summarize-pdf/{pdf_id}")
async def api_summarize_pdf(pdf_id: str):
    try:
        pdf_path = get_pdf_path_by_id(pdf_id)
        if not pdf_path:
            return JSONResponse(content={"error": f"No file found for PDF ID {pdf_id}"}, status_code=404)

        result = await summarize_pdf(pdf_id)

        if "error" in result:
            # (Reconstructed from context: surface the error with a 500 status.)
            return JSONResponse(content=result, status_code=500)

        return result
    except Exception as e:
        logger.error(f"Error in PDF summary endpoint: {e}")
        return JSONResponse(content={"error": str(e)}, status_code=500)


# API endpoint: Serve cached PDF content (progressive loading)
@app.get("/api/cached-pdf")
async def get_cached_pdf(path: str, background_tasks: BackgroundTasks):
    try:
        pdf_file = pathlib.Path(path)
        pdf_name = pdf_file.stem

        if pdf_name in pdf_cache:
            status = pdf_cache[pdf_name].get("status", "")

            if status == "completed":
                return pdf_cache[pdf_name]

            elif status == "processing":
                progress = pdf_cache[pdf_name].get("progress", 0)
                pages = pdf_cache[pdf_name].get("pages", [])
                total_pages = pdf_cache[pdf_name].get("total_pages", 0)

                return {
                    "status": "processing",
                    "progress": progress,
                    "pages": pages,
                    "total_pages": total_pages,
                    "available_pages": len([p for p in pages if p and p.get("src")])
                }

        # If no cache exists, start caching in the background
        background_tasks.add_task(cache_pdf, path)
        return {"status": "started", "progress": 0}

    except Exception as e:
        logger.error(f"Error providing cached PDF: {str(e)}")
        return {"error": str(e), "status": "error"}
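
# Illustrative client pattern (not part of the original source): a viewer can
# poll this endpoint and render pages incrementally, re-fetching while
# "status" is "processing", drawing everything up to "available_pages", and
# stopping once "status" becomes "completed".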


# API endpoint: Serve the original PDF file (if not cached)
@app.get("/api/pdf-content")
async def get_pdf_content(path: str, background_tasks: BackgroundTasks):
    try:
        pdf_file = pathlib.Path(path)
        if not pdf_file.exists():
            return JSONResponse(content={"error": f"File not found: {path}"}, status_code=404)

        pdf_name = pdf_file.stem

        # If already cached (or far enough along), redirect to the cached endpoint
        if pdf_name in pdf_cache and (
            pdf_cache[pdf_name].get("status") == "completed"
            or (
                pdf_cache[pdf_name].get("status") == "processing"
                and pdf_cache[pdf_name].get("progress", 0) > 10
            )
        ):
            return JSONResponse(content={"redirect": f"/api/cached-pdf?path={path}"})

        with open(path, "rb") as pdf_file_handle:
            content = pdf_file_handle.read()

        import urllib.parse
        filename = pdf_file.name
        encoded_filename = urllib.parse.quote(filename)

        # Start caching in the background
        background_tasks.add_task(cache_pdf, path)

        # The RFC 5987-style filename* parameter keeps non-ASCII names intact
        headers = {
            "Content-Type": "application/pdf",
            "Content-Disposition": f'inline; filename="{encoded_filename}"; filename*=UTF-8\'\'{encoded_filename}'
        }

        return Response(content=content, media_type="application/pdf", headers=headers)
    except Exception as e:
        import traceback
        error_details = traceback.format_exc()
        logger.error(f"Error loading PDF content: {str(e)}\n{error_details}")
        return JSONResponse(content={"error": str(e)}, status_code=500)


# API endpoint: Upload a PDF to permanent storage
@app.post("/api/upload-pdf")
async def upload_pdf(file: UploadFile = File(...)):
    try:
        if not file.filename.lower().endswith('.pdf'):
            return JSONResponse(content={"success": False, "message": "Only PDF files are allowed."}, status_code=400)

        file_path = PERMANENT_PDF_DIR / file.filename

        content = await file.read()
        with open(file_path, "wb") as buffer:
            buffer.write(content)

        # Also copy to the main directory so it is displayed automatically
        with open(PDF_DIR / file.filename, "wb") as buffer:
            buffer.write(content)

        pdf_id = generate_pdf_id(file.filename)
        pdf_metadata[pdf_id] = str(file_path)
        save_pdf_metadata()

        asyncio.create_task(cache_pdf(str(file_path)))

        return JSONResponse(
            content={
                "success": True,
                "path": str(file_path),
                "name": file_path.stem,
                "id": pdf_id,
                "viewUrl": f"/view/{pdf_id}"
            }
        )
    except Exception as e:
        import traceback
        error_details = traceback.format_exc()
        logger.error(f"Error uploading PDF: {str(e)}\n{error_details}")
        return JSONResponse(content={"success": False, "message": str(e)}, status_code=500)
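
# Illustrative upload (filename hypothetical):
#   curl -X POST -F "file=@My_Book.pdf" http://localhost:7860/api/upload-pdf
#   -> {"success": true, "id": "My_Book_...", "viewUrl": "/view/My_Book_..."}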


# Convert a text file to PDF
async def convert_text_to_pdf(text_content: str, title: str) -> dict:
    try:
        import re
        safe_title = re.sub(r'[^\w\-_\. ]', '_', title)
        if not safe_title:
            safe_title = "aibook"

        timestamp = int(time.time())
        filename = f"{safe_title}_{timestamp}.pdf"

        file_path = PERMANENT_PDF_DIR / filename

        # Register a Korean font; fall back to Helvetica if it is missing
        from reportlab.pdfbase import pdfmetrics
        from reportlab.pdfbase.ttfonts import TTFont

        font_path = BASE / "MaruBuri-SemiBold.ttf"

        font_name = "MaruBuri"
        if font_path.exists():
            pdfmetrics.registerFont(TTFont(font_name, str(font_path)))
            logger.info(f"Successfully registered the Korean font: {font_path}")
        else:
            font_name = "Helvetica"
            logger.warning(f"Could not find the Korean font file: {font_path}. Using a default font.")

        pdf_buffer = io.BytesIO()

        from reportlab.lib.pagesizes import letter
        from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
        from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle

        doc = SimpleDocTemplate(pdf_buffer, pagesize=letter, encoding='utf-8')

        title_style = ParagraphStyle(
            name='CustomTitle',
            fontName=font_name,
            # (The remaining style fields were elided in the source; plausible
            # values are filled in so the code runs.)
            fontSize=18,
            leading=22,
            spaceAfter=12
        )

        normal_style = ParagraphStyle(
            name='CustomNormal',
            fontName=font_name,
            fontSize=10,
            leading=14,
            spaceAfter=6
        )

        content = []

        # Add the title
        content.append(Paragraph(title, title_style))
        content.append(Spacer(1, 20))

        paragraphs = text_content.split('\n\n')
        for para in paragraphs:
            if para.strip():
                from xml.sax.saxutils import escape
                # Escape XML special characters first, then insert <br/> breaks
                # (the original escaped after replacing, which mangled the tags)
                safe_para = escape(para).replace('\n', '<br/>')
                p = Paragraph(safe_para, normal_style)
                content.append(p)
                content.append(Spacer(1, 10))

        doc.build(content)

        with open(file_path, 'wb') as f:
            f.write(pdf_buffer.getvalue())

        # Copy to the main directory
        with open(PDF_DIR / filename, 'wb') as f:
            f.write(pdf_buffer.getvalue())

        pdf_id = generate_pdf_id(filename)
        pdf_metadata[pdf_id] = str(file_path)
        save_pdf_metadata()

        asyncio.create_task(cache_pdf(str(file_path)))

        return {
            # (Reconstructed from context: keys consumed by /api/text-to-pdf.)
            "path": str(file_path),
            "filename": filename,
            "id": pdf_id
        }

    except Exception as e:
        logger.error(f"Error converting text to PDF: {e}")
        raise e


# AI-based text enhancement stub (placeholder)
async def enhance_text_with_ai(text_content: str, title: str) -> str:
    # Currently returns the original text unchanged (AI enhancement disabled)
    return text_content
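
# Illustrative sketch (not part of the original source): if enhancement were
# enabled, it could route through the OpenAI client configured above, roughly:
#
#   if HAS_VALID_API_KEY and openai_client:
#       resp = openai_client.chat.completions.create(
#           model="gpt-4o-mini",  # hypothetical model choice
#           messages=[{
#               "role": "user",
#               "content": f"Improve this text titled '{title}':\n{text_content}",
#           }],
#       )
#       return resp.choices[0].message.content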


# API endpoint: Convert an uploaded text file to PDF
@app.post("/api/text-to-pdf")
async def text_to_pdf(file: UploadFile = File(...)):
    try:
        filename = file.filename.lower()
        if not (filename.endswith('.txt') or filename.endswith('.docx') or filename.endswith('.doc')):
            return JSONResponse(
                content={"success": False, "message": "Only .txt, .docx, and .doc files are supported."},
                status_code=400
            )

        content = await file.read()

        # Extract text depending on the file type
        if filename.endswith('.txt'):
            encodings = ['utf-8', 'euc-kr', 'cp949', 'latin1']
            text_content = None

            for encoding in encodings:
                try:
                    text_content = content.decode(encoding, errors='strict')
                    logger.info(f"Detected text file encoding: {encoding}")
                    break
                except UnicodeDecodeError:
                    continue

            if text_content is None:
                text_content = content.decode('utf-8', errors='replace')
                logger.warning("Could not detect text file encoding; defaulting to UTF-8.")

        elif filename.endswith('.docx') or filename.endswith('.doc'):
            with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(filename)[1]) as temp_file:
                temp_file.write(content)
                temp_path = temp_file.name

            try:
                text_content = docx2txt.process(temp_path)
            finally:
                os.unlink(temp_path)

        title = os.path.splitext(filename)[0]

        # Optional AI enhancement
        enhanced_text = await enhance_text_with_ai(text_content, title)

        # Convert the final text to PDF
        pdf_info = await convert_text_to_pdf(enhanced_text, title)

        return JSONResponse(
            content={
                "success": True,
                "path": pdf_info["path"],
                "name": os.path.splitext(pdf_info["filename"])[0],
                "id": pdf_info["id"],
                "viewUrl": f"/view/{pdf_info['id']}"
            }
        )
    except Exception as e:
        import traceback
        error_details = traceback.format_exc()
        logger.error(f"Error converting text to PDF: {str(e)}\n{error_details}")
        return JSONResponse(content={"success": False, "message": str(e)}, status_code=500)
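
# Illustrative conversion (filename hypothetical):
#   curl -X POST -F "file=@notes.txt" http://localhost:7860/api/text-to-pdf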


# Admin authentication endpoint
@app.post("/api/admin-login")
async def admin_login(password: str = Form(...)):
    if password == ADMIN_PASSWORD:
        return {"success": True}
    return {"success": False, "message": "Authentication failed."}
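
# Illustrative usage:
#   curl -X POST -d "password=<ADMIN_PASSWORD>" http://localhost:7860/api/admin-login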


# Admin: Delete a PDF
@app.delete("/api/admin/delete-pdf")
async def delete_pdf(path: str):
    try:
        pdf_file = pathlib.Path(path)
        if not pdf_file.exists():
            return {"success": False, "message": "File not found."}

        filename = pdf_file.name

        # Delete from permanent storage
        pdf_file.unlink()

        # Also delete from the main directory if present
        main_file_path = PDF_DIR / filename
        if main_file_path.exists():
            main_file_path.unlink()

        # Delete the related cache
        pdf_name = pdf_file.stem
        cache_path = get_cache_path(pdf_name)
        if cache_path.exists():
            cache_path.unlink()

        if pdf_name in pdf_cache:
            del pdf_cache[pdf_name]

        # Remove from metadata (collect first to avoid mutating while iterating)
        to_remove = []
        for pid, fpath in pdf_metadata.items():
            if os.path.basename(fpath) == filename:
                # (Reconstructed from context: queue matching IDs for removal.)
                to_remove.append(pid)

        for pid in to_remove:
            del pdf_metadata[pid]

        save_pdf_metadata()

        return {"success": True}
    except Exception as e:
        logger.error(f"Error deleting PDF: {str(e)}")
        return {"success": False, "message": str(e)}


# Admin: Feature a PDF (copy to the main directory)
@app.post("/api/admin/feature-pdf")
async def feature_pdf(path: str):
    try:
        pdf_file = pathlib.Path(path)
        if not pdf_file.exists():
            return {"success": False, "message": "File not found."}

        target_path = PDF_DIR / pdf_file.name
        shutil.copy2(pdf_file, target_path)

        return {"success": True}
    except Exception as e:
        logger.error(f"Error featuring PDF: {str(e)}")
        return {"success": False, "message": str(e)}


# Admin: Unfeature a PDF (remove from the main directory only)
@app.delete("/api/admin/unfeature-pdf")
async def unfeature_pdf(path: str):
    try:
        # (Reconstructed from context: resolve the main-directory copy.)
        pdf_file = pathlib.Path(path)
        target_path = PDF_DIR / pdf_file.name

        if target_path.exists():
            target_path.unlink()

        return {"success": True}
    except Exception as e:
        logger.error(f"Error unfeaturing PDF: {str(e)}")
        return {"success": False, "message": str(e)}


@app.get("/view/{pdf_id}")
async def view_pdf_by_id(pdf_id: str):
    pdf_path = get_pdf_path_by_id(pdf_id)

    if not pdf_path:
        # Reload metadata and retry
        load_pdf_metadata()
        pdf_path = get_pdf_path_by_id(pdf_id)

        if not pdf_path:
            # As a final fallback, scan all files for a name match
            for file_path in get_pdf_files() + get_permanent_pdf_files():
                name_part = pdf_id.split('_')[0] if '_' in pdf_id else pdf_id
                if file_path.stem.startswith(name_part):
                    # (Reconstructed from context: adopt the match and stop scanning.)
                    pdf_path = str(file_path)
                    pdf_metadata[pdf_id] = pdf_path
                    save_pdf_metadata()
                    break

    if not pdf_path:
        return HTMLResponse(
            content=(
                f"<html><body><h1>Could not find the requested PDF</h1>"
                f"<p>ID: {pdf_id}</p><a href='/'>Go back to home</a></body></html>"
            ),
            status_code=404
        )

    # Serve the main page with an auto-load script for this PDF ID
    return get_html_content(pdf_id=pdf_id)


def get_html_content(pdf_id: str = None):
    html_path = BASE / "flipbook_template.html"
    content = ""
    # (Reconstructed from context: prefer the local template when present.)
    if html_path.exists():
        with open(html_path, "r", encoding="utf-8") as f:
            content = f.read()
    else:
        content = HTML  # fallback when no local template exists

    if pdf_id:
        auto_load_script = f"""
        <script>
        document.addEventListener('DOMContentLoaded', async function() {{
            try {{
                const response = await fetch('/api/pdf-info-by-id/{pdf_id}');
                const pdfInfo = await response.json();

                if (pdfInfo.exists && pdfInfo.path) {{
                    setTimeout(() => {{
                        openPdfById('{pdf_id}', pdfInfo.path, pdfInfo.cached);
                    }}, 500);
                }} else {{
                    showError("The requested PDF could not be found.");
                }}
            }} catch (e) {{
                console.error("Auto-load PDF error:", e);
            }}
        }});
        </script>
        """

        content = content.replace("</body>", auto_load_script + "</body>")

    return HTMLResponse(content=content)


@app.get("/", response_class=HTMLResponse)
async def root(request: Request, pdf_id: Optional[str] = Query(None)):
    if pdf_id:
        return RedirectResponse(url=f"/view/{pdf_id}")
    return get_html_content()


# Fallback HTML template (referenced by get_html_content at request time)
HTML = os.getenv("HTML_TEMPLATE", "")
if not HTML:
    logger.warning("HTML_TEMPLATE secret is not set. Using default HTML.")
    HTML = """
    <!doctype html>
    <html lang="en">
    <head>
        <meta charset="utf-8">
        <title>FlipBook Space</title>
        <style>
            /* (Reconstructed from context: minimal styling for the fallback page.) */
            body { font-family: sans-serif; margin: 2rem; }
            .error { color: red; }
        </style>
    </head>
    <body>
        <h1>Could not load the HTML template</h1>
        <p class="error">HTML_TEMPLATE secret is not configured.</p>
        <p>Please set the HTML_TEMPLATE in your Hugging Face Space secrets.</p>
    </body>
    </html>
    """

if __name__ == "__main__":
    uvicorn.run("app:app", host="0.0.0.0", port=int(os.getenv("PORT", 7860)))