ludigija committed on
Commit
e4fcaef
·
verified ·
1 Parent(s): 0e9d4fd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -21
app.py CHANGED
@@ -3,12 +3,19 @@ import fitz # PyMuPDF
3
  import cv2
4
  import numpy as np
5
  from PIL import Image
6
- from transformers import pipeline
7
  import os
8
  import tempfile
 
9
 
10
- # Initialize Mistral model (free)
11
- analyzer = pipeline("text-generation", model="mistralai/Mistral-7B-Instruct-v0.1")
 
 
 
 
 
 
12
 
13
  def pdf_to_images(pdf_path):
14
  """Convert PDF to high-res images using PyMuPDF"""
@@ -28,11 +35,9 @@ def highlight_differences(img1, img2):
28
  gray1 = cv2.cvtColor(img1_np, cv2.COLOR_RGB2GRAY)
29
  gray2 = cv2.cvtColor(img2_np, cv2.COLOR_RGB2GRAY)
30
 
31
- # Compute absolute difference
32
  diff = cv2.absdiff(gray1, gray2)
33
  _, thresh = cv2.threshold(diff, 25, 255, cv2.THRESH_BINARY)
34
 
35
- # Highlight differences
36
  highlighted = img2_np.copy()
37
  highlighted[thresh == 255] = [255, 0, 0] # Red highlights
38
 
@@ -44,16 +49,16 @@ def extract_text_with_layout(img):
44
  custom_config = r'--oem 3 --psm 6 -c preserve_interword_spaces=1'
45
  return pytesseract.image_to_string(img, config=custom_config)
46
 
47
- def generate_free_ai_report(before_text, after_text, visual_desc):
48
- """Generate report using free Mistral model"""
49
  prompt = f"""
50
- [INST] Compare these document versions and provide a professional difference report:
51
 
52
  BEFORE VERSION:
53
- {before_text[:3000]}... [truncated]
54
 
55
  AFTER VERSION:
56
- {after_text[:3000]}... [truncated]
57
 
58
  VISUAL ANALYSIS NOTES:
59
  {visual_desc}
@@ -62,21 +67,20 @@ def generate_free_ai_report(before_text, after_text, visual_desc):
62
  1. SUMMARY: 2-3 sentence overview
63
  2. KEY CHANGES: Bullet points of specific changes
64
  3. ANALYSIS: Potential implications
65
-
66
- Use clear, concise language. [/INST]
67
  """
68
 
69
- result = analyzer(
70
- prompt,
71
- max_length=1024,
 
72
  temperature=0.7,
73
  do_sample=True
74
  )
75
 
76
- return result[0]['generated_text'].split('[/INST]')[-1].strip()
77
 
78
  def main():
79
- st.title("Free AI PDF Comparator")
80
 
81
  col1, col2 = st.columns(2)
82
  with col1:
@@ -108,15 +112,15 @@ def main():
108
  # Visual diff
109
  highlighted, diff_score = highlight_differences(img1, img2)
110
 
111
- # Only analyze if significant differences
112
  if diff_score > 5: # Threshold for meaningful changes
113
  # Text extraction
114
  before_text = extract_text_with_layout(img1)
115
  after_text = extract_text_with_layout(img2)
116
 
117
  # Generate report
118
- visual_desc = f"Page {i+1} shows visual changes (difference score: {diff_score:.1f})"
119
- report = generate_free_ai_report(before_text, after_text, visual_desc)
 
120
 
121
  reports.append((i+1, report, highlighted))
122
 
@@ -132,7 +136,7 @@ def main():
132
  st.image(img, use_column_width=True)
133
  with col2:
134
  st.markdown(f"**Page {page_num} Report**")
135
- st.write(report)
136
  else:
137
  st.warning("Please upload both PDF files")
138
 
 
3
  import cv2
4
  import numpy as np
5
  from PIL import Image
6
+ from transformers import AutoTokenizer, AutoModelForCausalLM
7
  import os
8
  import tempfile
9
+ import torch
10
 
11
+ # Initialize free OpenLLaMA model (no auth needed)
12
+ model_name = "openlm-research/open_llama_7b_v2"
13
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
14
+ model = AutoModelForCausalLM.from_pretrained(
15
+ model_name,
16
+ torch_dtype=torch.float16,
17
+ device_map="auto"
18
+ )
19
 
20
  def pdf_to_images(pdf_path):
21
  """Convert PDF to high-res images using PyMuPDF"""
 
35
  gray1 = cv2.cvtColor(img1_np, cv2.COLOR_RGB2GRAY)
36
  gray2 = cv2.cvtColor(img2_np, cv2.COLOR_RGB2GRAY)
37
 
 
38
  diff = cv2.absdiff(gray1, gray2)
39
  _, thresh = cv2.threshold(diff, 25, 255, cv2.THRESH_BINARY)
40
 
 
41
  highlighted = img2_np.copy()
42
  highlighted[thresh == 255] = [255, 0, 0] # Red highlights
43
 
 
49
  custom_config = r'--oem 3 --psm 6 -c preserve_interword_spaces=1'
50
  return pytesseract.image_to_string(img, config=custom_config)
51
 
52
+ def generate_free_report(before_text, after_text, visual_desc):
53
+ """Generate report using free OpenLLaMA model"""
54
  prompt = f"""
55
+ Compare these document versions and provide a professional difference report:
56
 
57
  BEFORE VERSION:
58
+ {before_text[:1500]}... [truncated]
59
 
60
  AFTER VERSION:
61
+ {after_text[:1500]}... [truncated]
62
 
63
  VISUAL ANALYSIS NOTES:
64
  {visual_desc}
 
67
  1. SUMMARY: 2-3 sentence overview
68
  2. KEY CHANGES: Bullet points of specific changes
69
  3. ANALYSIS: Potential implications
 
 
70
  """
71
 
72
+ inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
73
+ outputs = model.generate(
74
+ **inputs,
75
+ max_new_tokens=512,
76
  temperature=0.7,
77
  do_sample=True
78
  )
79
 
80
+ return tokenizer.decode(outputs[0], skip_special_tokens=True)
81
 
82
  def main():
83
+ st.title("Free PDF Comparator")
84
 
85
  col1, col2 = st.columns(2)
86
  with col1:
 
112
  # Visual diff
113
  highlighted, diff_score = highlight_differences(img1, img2)
114
 
 
115
  if diff_score > 5: # Threshold for meaningful changes
116
  # Text extraction
117
  before_text = extract_text_with_layout(img1)
118
  after_text = extract_text_with_layout(img2)
119
 
120
  # Generate report
121
+ visual_desc = f"Page {i+1} changes detected (score: {diff_score:.1f})"
122
+ with st.spinner(f"Analyzing page {i+1}..."):
123
+ report = generate_free_report(before_text, after_text, visual_desc)
124
 
125
  reports.append((i+1, report, highlighted))
126
 
 
136
  st.image(img, use_column_width=True)
137
  with col2:
138
  st.markdown(f"**Page {page_num} Report**")
139
+ st.write(report.split("ANALYSIS:")[-1]) # Show just the analysis part
140
  else:
141
  st.warning("Please upload both PDF files")
142