Spaces:

ludigija
/

CUAD_contract

Running

App Files Community

ludigija committed on Apr 1

Commit

e4fcaef

verified ·

1 Parent(s): 0e9d4fd

Update app.py

Browse files

Files changed (1) hide show

app.py +25 -21

app.py CHANGED Viewed

@@ -3,12 +3,19 @@ import fitz  # PyMuPDF
 import cv2
 import numpy as np
 from PIL import Image
-from transformers import pipeline
 import os
 import tempfile
-# Initialize Mistral model (free)
-analyzer = pipeline("text-generation", model="mistralai/Mistral-7B-Instruct-v0.1")
 def pdf_to_images(pdf_path):
     """Convert PDF to high-res images using PyMuPDF"""
@@ -28,11 +35,9 @@ def highlight_differences(img1, img2):
     gray1 = cv2.cvtColor(img1_np, cv2.COLOR_RGB2GRAY)
     gray2 = cv2.cvtColor(img2_np, cv2.COLOR_RGB2GRAY)
-    # Compute absolute difference
     diff = cv2.absdiff(gray1, gray2)
     _, thresh = cv2.threshold(diff, 25, 255, cv2.THRESH_BINARY)
-    # Highlight differences
     highlighted = img2_np.copy()
     highlighted[thresh == 255] = [255, 0, 0]  # Red highlights
@@ -44,16 +49,16 @@ def extract_text_with_layout(img):
     custom_config = r'--oem 3 --psm 6 -c preserve_interword_spaces=1'
     return pytesseract.image_to_string(img, config=custom_config)
-def generate_free_ai_report(before_text, after_text, visual_desc):
-    """Generate report using free Mistral model"""
     prompt = f"""
-    [INST] Compare these document versions and provide a professional difference report:
     BEFORE VERSION:
-    {before_text[:3000]}... [truncated]
     AFTER VERSION:
-    {after_text[:3000]}... [truncated]
     VISUAL ANALYSIS NOTES:
     {visual_desc}
@@ -62,21 +67,20 @@ def generate_free_ai_report(before_text, after_text, visual_desc):
     1. SUMMARY: 2-3 sentence overview
     2. KEY CHANGES: Bullet points of specific changes
     3. ANALYSIS: Potential implications
-    Use clear, concise language. [/INST]
     """
-    result = analyzer(
-        prompt,
-        max_length=1024,
         temperature=0.7,
         do_sample=True
     )
-    return result[0]['generated_text'].split('[/INST]')[-1].strip()
 def main():
-    st.title("Free AI PDF Comparator")
     col1, col2 = st.columns(2)
     with col1:
@@ -108,15 +112,15 @@ def main():
                         # Visual diff
                         highlighted, diff_score = highlight_differences(img1, img2)
-                        # Only analyze if significant differences
                         if diff_score > 5:  # Threshold for meaningful changes
                             # Text extraction
                             before_text = extract_text_with_layout(img1)
                             after_text = extract_text_with_layout(img2)
                             # Generate report
-                            visual_desc = f"Page {i+1} shows visual changes (difference score: {diff_score:.1f})"
-                            report = generate_free_ai_report(before_text, after_text, visual_desc)
                             reports.append((i+1, report, highlighted))
@@ -132,7 +136,7 @@ def main():
                                     st.image(img, use_column_width=True)
                                 with col2:
                                     st.markdown(f"**Page {page_num} Report**")
-                                    st.write(report)
         else:
             st.warning("Please upload both PDF files")

 import cv2
 import numpy as np
 from PIL import Image
+from transformers import AutoTokenizer, AutoModelForCausalLM
 import os
 import tempfile
+import torch
+# Initialize free OpenLLaMA model (no auth needed)
+model_name = "openlm-research/open_llama_7b_v2"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    torch_dtype=torch.float16,
+    device_map="auto"
+)
 def pdf_to_images(pdf_path):
     """Convert PDF to high-res images using PyMuPDF"""
     gray1 = cv2.cvtColor(img1_np, cv2.COLOR_RGB2GRAY)
     gray2 = cv2.cvtColor(img2_np, cv2.COLOR_RGB2GRAY)
     diff = cv2.absdiff(gray1, gray2)
     _, thresh = cv2.threshold(diff, 25, 255, cv2.THRESH_BINARY)
     highlighted = img2_np.copy()
     highlighted[thresh == 255] = [255, 0, 0]  # Red highlights
     custom_config = r'--oem 3 --psm 6 -c preserve_interword_spaces=1'
     return pytesseract.image_to_string(img, config=custom_config)
+def generate_free_report(before_text, after_text, visual_desc):
+    """Generate report using free OpenLLaMA model"""
     prompt = f"""
+    Compare these document versions and provide a professional difference report:
     BEFORE VERSION:
+    {before_text[:1500]}... [truncated]
     AFTER VERSION:
+    {after_text[:1500]}... [truncated]
     VISUAL ANALYSIS NOTES:
     {visual_desc}
     1. SUMMARY: 2-3 sentence overview
     2. KEY CHANGES: Bullet points of specific changes
     3. ANALYSIS: Potential implications
     """
+    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
+    outputs = model.generate(
+        **inputs,
+        max_new_tokens=512,
         temperature=0.7,
         do_sample=True
     )
+    return tokenizer.decode(outputs[0], skip_special_tokens=True)
 def main():
+    st.title("Free PDF Comparator")
     col1, col2 = st.columns(2)
     with col1:
                         # Visual diff
                         highlighted, diff_score = highlight_differences(img1, img2)
                         if diff_score > 5:  # Threshold for meaningful changes
                             # Text extraction
                             before_text = extract_text_with_layout(img1)
                             after_text = extract_text_with_layout(img2)
                             # Generate report
+                            visual_desc = f"Page {i+1} changes detected (score: {diff_score:.1f})"
+                            with st.spinner(f"Analyzing page {i+1}..."):
+                                report = generate_free_report(before_text, after_text, visual_desc)
                             reports.append((i+1, report, highlighted))
                                     st.image(img, use_column_width=True)
                                 with col2:
                                     st.markdown(f"**Page {page_num} Report**")
+                                    st.write(report.split("ANALYSIS:")[-1])  # Show just the analysis part
         else:
             st.warning("Please upload both PDF files")