Marsouuu committed on
Commit
5e5fca2
·
1 Parent(s): 7c7c5da

Amélioration du support Poppler pour la conversion PDF et la prévisualisation

Browse files
Files changed (2) hide show
  1. Dockerfile +15 -3
  2. app.py +79 -4
Dockerfile CHANGED
@@ -2,12 +2,24 @@ FROM python:3.9
2
 
3
  WORKDIR /code
4
 
5
- # Installer poppler-utils pour pdf2image
6
- RUN apt-get update && apt-get install -y poppler-utils && rm -rf /var/lib/apt/lists/*
 
 
 
 
 
 
 
7
 
8
  COPY requirements.txt /code/requirements.txt
9
  RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
10
 
 
 
 
 
11
  COPY . /code
12
 
13
- CMD ["python", "app.py"]
 
 
2
 
3
  WORKDIR /code
4
 
5
# Install poppler-utils (plus development headers) for pdf2image.
# The trailing `|| { ...; exit 1; }` makes the layer FAIL when any step of the
# chain fails; a plain `|| echo` would end the chain with a successful command
# and let the build continue with a broken image.
RUN apt-get update && apt-get install -y \
    poppler-utils \
    libpoppler-cpp-dev \
    libpoppler-private-dev \
    pkg-config \
    && rm -rf /var/lib/apt/lists/* \
    && which pdftoppm \
    && echo "Poppler correctement installé!" \
    || { echo "ERREUR: Poppler non installé correctement!"; exit 1; }
14
 
15
  COPY requirements.txt /code/requirements.txt
16
  RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
17
 
18
# Verify that pdf2image can be imported. Abort the build on failure instead of
# only printing a message: app.py imports pdf2image at start-up, so an image
# without it could never run.
RUN python -c "from pdf2image import convert_from_bytes; print('pdf2image est correctement installé!')" \
    || { echo "ERREUR: pdf2image n'est pas installé correctement!"; exit 1; }
21
+
22
  COPY . /code
23
 
24
+ # Commande de démarrage de l'application Gradio
25
+ CMD ["python", "app.py"]
app.py CHANGED
@@ -4,9 +4,48 @@ import os
4
  import json
5
  import time
6
  import base64
 
7
  from PIL import Image
8
  from pdf2image import convert_from_path
9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  # Configuration
11
  GOOGLE_API_KEY = "AIzaSyA4ma5pE1pPCzHHn-i9tDWuKqQEgSltMtI"
12
  genai.configure(api_key=GOOGLE_API_KEY)
@@ -919,7 +958,21 @@ def process_document(file, progress=gr.Progress()):
919
 
920
  try:
921
  if file.name.lower().endswith('.pdf'):
922
- images = convert_from_path(file.name)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
923
 
924
  if len(images) > 10:
925
  return {"error": TEXT["error"]["too_many_pages"]}
@@ -954,8 +1007,27 @@ def update_preview(file):
954
 
955
  if file.name.lower().endswith('.pdf'):
956
  try:
957
- # Just show first 3 pages
958
- images = convert_from_path(file.name, first_page=1, last_page=3)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
959
  image_paths = []
960
 
961
  for i, img in enumerate(images):
@@ -963,8 +1035,11 @@ def update_preview(file):
963
  img.save(temp_filename)
964
  image_paths.append(temp_filename)
965
 
 
966
  return image_paths
967
- except:
 
 
968
  return []
969
  elif file.name.lower().endswith(('.png', '.jpg', '.jpeg')):
970
  return [file.name]
 
4
  import json
5
  import time
6
  import base64
7
+ import subprocess
8
  from PIL import Image
9
  from pdf2image import convert_from_path
10
 
11
# Check whether Poppler is available
def check_poppler():
    """Locate the directory that contains Poppler's ``pdftoppm`` executable.

    A short list of conventional install directories is probed first. If none
    of them holds an executable ``pdftoppm``, the directories on ``PATH`` are
    searched. When nothing is found, a warning with installation hints is
    printed.

    Returns:
        The directory containing ``pdftoppm`` as a string, or ``None`` when
        the executable could not be found.
    """
    # Local import so the block does not depend on a new file-level import.
    import shutil

    potential_paths = [
        '/usr/bin',
        '/usr/local/bin',
        '/opt/homebrew/bin',
        '/app/bin',
    ]

    for path in potential_paths:
        candidate = os.path.join(path, 'pdftoppm')
        # Require a real, executable file: a directory or a non-executable
        # file with that name cannot be used for conversion.
        if os.path.isfile(candidate) and os.access(candidate, os.X_OK):
            print(f"✅ Poppler trouvé dans: {path}")
            return path

    # Fall back to a PATH search. shutil.which needs no external `which`
    # binary (absent on Windows) and returns None instead of raising when the
    # program is missing, so no blanket exception handler is required.
    poppler_bin = shutil.which('pdftoppm')
    if poppler_bin:
        poppler_path = os.path.dirname(poppler_bin)
        print(f"✅ Poppler trouvé via 'which' dans: {poppler_path}")
        return poppler_path

    print("⚠️ AVERTISSEMENT: Poppler non trouvé dans les chemins standards!")
    print("⚠️ Les fonctionnalités de prévisualisation et de traitement PDF pourraient ne pas fonctionner.")
    print("⚠️ Veuillez installer Poppler:")
    print(" - Linux: apt-get install poppler-utils")
    print(" - macOS: brew install poppler")
    print(" - Windows: Téléchargez depuis https://github.com/oschwartz10612/poppler-windows/")
    return None
45
+
46
# Check for Poppler once at start-up; the result is reused by the PDF helpers.
POPPLER_PATH = check_poppler()

# Configuration
# SECURITY(review): an API key must not live in source control. The
# GOOGLE_API_KEY environment variable now takes precedence; the literal below
# is kept only so existing deployments keep working and should be revoked and
# removed once the variable is set everywhere.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY") or "AIzaSyA4ma5pE1pPCzHHn-i9tDWuKqQEgSltMtI"
genai.configure(api_key=GOOGLE_API_KEY)
 
958
 
959
  try:
960
  if file.name.lower().endswith('.pdf'):
961
+ # Utiliser le chemin Poppler détecté au démarrage
962
+ if POPPLER_PATH:
963
+ images = convert_from_path(
964
+ file.name,
965
+ poppler_path=POPPLER_PATH,
966
+ use_pdftocairo=True,
967
+ dpi=150
968
+ )
969
+ else:
970
+ print("Trying without poppler_path")
971
+ images = convert_from_path(
972
+ file.name,
973
+ use_pdftocairo=True,
974
+ dpi=150
975
+ )
976
 
977
  if len(images) > 10:
978
  return {"error": TEXT["error"]["too_many_pages"]}
 
1007
 
1008
  if file.name.lower().endswith('.pdf'):
1009
  try:
1010
+ # Utiliser le chemin Poppler détecté au démarrage
1011
+ if POPPLER_PATH:
1012
+ images = convert_from_path(
1013
+ file.name,
1014
+ first_page=1,
1015
+ last_page=3,
1016
+ poppler_path=POPPLER_PATH,
1017
+ use_pdftocairo=True,
1018
+ dpi=150
1019
+ )
1020
+ else:
1021
+ # Essayer sans spécifier le chemin, en utilisant des options simplifiées
1022
+ print("Trying without poppler_path")
1023
+ images = convert_from_path(
1024
+ file.name,
1025
+ first_page=1,
1026
+ last_page=3,
1027
+ use_pdftocairo=True,
1028
+ dpi=150
1029
+ )
1030
+
1031
  image_paths = []
1032
 
1033
  for i, img in enumerate(images):
 
1035
  img.save(temp_filename)
1036
  image_paths.append(temp_filename)
1037
 
1038
+ print(f"Successfully created {len(image_paths)} preview images")
1039
  return image_paths
1040
+ except Exception as e:
1041
+ print(f"Error converting PDF to images: {str(e)}")
1042
+ # En cas d'erreur, retourner une image d'erreur qui sera affichée
1043
  return []
1044
  elif file.name.lower().endswith(('.png', '.jpg', '.jpeg')):
1045
  return [file.name]