Spaces:

SlouchyBuffalo
/

pages-converter-pro

Running on Zero

App Files Files Community

SlouchyBuffalo committed on May 12

Commit

999355e

verified ·

1 Parent(s): 8c7667b

Create app.py

Browse files

Files changed (1) hide show

app.py +510 -0

app.py ADDED Viewed

	@@ -0,0 +1,510 @@

+# app.py - Mobile-Optimized PWA Pages Converter
+import gradio as gr
+import os
+import spaces
+import tempfile
+import zipfile
+import json
+import re
+from pathlib import Path
+from huggingface_hub import InferenceClient
+import time
# Report whether an access token is configured (and its length) without
# printing the secret itself.
token = os.environ.get("HF_TOKEN")
print(f"Debug: Token exists = {token is not None}")
print(f"Debug: Token length = {len(token or '')}")

# Shared inference client: Llama 3.3 70B Instruct served through Cerebras.
client = InferenceClient(
    "meta-llama/Llama-3.3-70B-Instruct",
    token=token,
    provider="cerebras",
)
def _is_index_iwa_member(member_name):
    """Return True for ``*.iwa`` entries stored directly inside ``Index/``."""
    folder, _, base_name = member_name.rpartition('/')
    return folder == 'Index' and base_name.endswith('.iwa')


def extract_pages_content(file_path):
    """Extract readable text from an Apple Pages (.pages) archive.

    The file is opened as a ZIP archive and every ``*.iwa`` member located
    directly inside its ``Index/`` folder is read from the archive in memory.
    Each member is decoded as UTF-8 (undecodable bytes are ignored), split
    into runs of printable ASCII, passed through ``filter_metadata``, and the
    surviving runs are de-duplicated in first-seen order and joined with
    blank lines.

    Only the needed members are read; the uploaded archive is never unpacked
    to disk. The function is intentionally not wrapped in ``@spaces.GPU``:
    everything it does is CPU-only ZIP and regex work.

    Args:
        file_path: Path of the .pages file to read.

    Returns:
        The extracted text. Failures are reported as text rather than raised:
        ``"Could not extract readable content from .pages file"`` when nothing
        usable was found, or ``"Error extracting content: <details>"`` when
        the archive could not be processed.
    """
    print(f"DEBUG: Processing file: {file_path}")
    print(f"DEBUG: File exists: {os.path.exists(file_path)}")
    try:
        content_parts = []
        with zipfile.ZipFile(file_path, 'r') as zip_ref:
            member_names = zip_ref.namelist()
            print(f"DEBUG: Archive members: {member_names}")
            iwa_members = [name for name in member_names if _is_index_iwa_member(name)]
            print(f"DEBUG: Index .iwa members: {iwa_members if iwa_members else 'No Index folder'}")
            for member_name in iwa_members:
                try:
                    # iwa files are binary archives; scan the raw bytes for text.
                    binary_content = zip_ref.read(member_name)
                    text_content = binary_content.decode('utf-8', errors='ignore')
                    # Keep runs of printable ASCII, then drop metadata strings.
                    readable_text = re.findall(r'[\x20-\x7E]+', text_content)
                    content_parts.extend(filter_metadata(readable_text))
                except Exception as e:
                    # Best effort: one unreadable member must not abort the rest.
                    print(f"DEBUG: Error processing {member_name}: {e}")
                    continue
        if not content_parts:
            return "Could not extract readable content from .pages file"
        # dict.fromkeys de-duplicates while preserving first-seen order.
        unique_content = list(dict.fromkeys(content_parts))
        final_content = "\n\n".join(unique_content)
        print(f"DEBUG: Extracted content length: {len(final_content)}")
        return final_content
    except Exception as e:
        print(f"DEBUG: Exception in extract_pages_content: {e}")
        return f"Error extracting content: {str(e)}"
# Locale, calendar, date-format, browser and vendor strings that show up in
# the raw archive data but are not document text.
_METADATA_TOKENS = (
    'en_us', 'en_usp', 'gregorian', 'january', 'february', 'march', 'april', 'may', 'june',
    'july', 'august', 'september', 'october', 'november', 'december',
    'sunday', 'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday',
    '1st quarter', '2nd quarter', '3rd quarter', '4th quarter',
    'before christ', 'anno domini', 'bc', 'ad',
    'm/d/yy', 'mmm d', 'eeee', 'yyyy',
    'webkit', 'safari', 'chrome', 'mozilla',
    'apple', 'iwork', 'pages',
)
# A token only counts when it is not embedded in a longer alphanumeric word,
# so 'ad' no longer rejects "read" and 'may' no longer rejects "maybe".
# ``[^\W_]`` means "letter or digit" (underscore is treated as a separator).
_METADATA_TOKEN_RE = re.compile(
    r'(?<![^\W_])(?:'
    + '|'.join(re.escape(token) for token in _METADATA_TOKENS)
    + r')(?![^\W_])'
)
_NUMERIC_RES = (
    re.compile(r'^\d+\.\d+$'),                 # Decimal numbers
    re.compile(r'^\d{4}$'),                    # Years
    re.compile(r'^\d{1,2}/\d{1,2}/\d{2,4}$'),  # Dates
)
_DIGITS_AND_SEPARATORS_RE = re.compile(r'^[0-9\s\-\.\/]+$')
_HAS_LETTER_RE = re.compile(r'[a-zA-Z]')


def filter_metadata(text_list):
    """Keep only the strings that look like real document text.

    Each candidate is stripped and then dropped when it is shorter than three
    characters, contains one of the known metadata tokens as a whole token
    (case-insensitive), looks like a bare number or date, or consists only of
    digits and separators. What remains is kept only when it contains at
    least one ASCII letter and is longer than five characters.

    Args:
        text_list: Iterable of candidate strings.

    Returns:
        A list of the stripped strings that passed every check, in input
        order.
    """
    filtered_parts = []
    for text in text_list:
        text_clean = text.strip()
        # Skip if empty or too short
        if len(text_clean) < 3:
            continue
        # Skip metadata, matching whole tokens rather than raw substrings
        if _METADATA_TOKEN_RE.search(text_clean.lower()):
            continue
        # Skip plain numbers, years and dates
        if any(pattern.match(text_clean) for pattern in _NUMERIC_RES):
            continue
        # Skip strings made only of digits and separators
        if _DIGITS_AND_SEPARATORS_RE.match(text_clean):
            continue
        # Keep text that seems meaningful (has letters and reasonable length)
        if _HAS_LETTER_RE.search(text_clean) and len(text_clean) > 5:
            filtered_parts.append(text_clean)
    return filtered_parts
# Plain-text failure results that extract_pages_content returns instead of
# raising; they must never be forwarded to the model as document content.
_EXTRACTION_FAILED_MESSAGE = "Could not extract readable content from .pages file"
_EXTRACTION_ERROR_PREFIX = "Error extracting content:"


@spaces.GPU
def convert_pages_document(file, output_format, progress=gr.Progress()):
    """Convert an uploaded Pages document to the requested output format.

    The text is extracted with ``extract_pages_content``, wrapped in a prompt
    by ``create_conversion_prompt``, sent to the module-level chat client, and
    the reply is written to disk by ``create_output_file``.

    Args:
        file: The uploaded file, either a path string or an object whose
            ``name`` attribute holds the path. Falsy means nothing uploaded.
        output_format: Target format label, e.g. ``"PDF"`` or ``"Markdown"``.
        progress: Progress callback used to report the conversion stages.

    Returns:
        ``(output_path, status_message)``. ``output_path`` is ``None`` whenever
        the conversion failed; the status message then explains why.
    """
    # Function-scope import, like the other lazy imports in this file.
    import html

    if not file:
        return None, "❌ Please upload a .pages file"
    try:
        progress(0.1, desc="📖 Extracting content from .pages file...")
        # Accept both a plain path and an upload object exposing ``.name``.
        file_path = getattr(file, "name", file)
        content = extract_pages_content(file_path)
        if not content or len(content.strip()) < 10:
            return None, "❌ Could not extract sufficient content from .pages file"
        # Extraction failures come back as ordinary strings that are long
        # enough to pass the length check above, so catch them explicitly.
        if content == _EXTRACTION_FAILED_MESSAGE or content.startswith(_EXTRACTION_ERROR_PREFIX):
            return None, f"❌ {html.escape(content)}"
        # Log extracted content for debugging
        print(f"DEBUG: Final extracted content preview: {content[:200]}...")
        progress(0.4, desc="🤖 Preparing conversion with Cerebras...")
        prompt = create_conversion_prompt(content, output_format)
        progress(0.6, desc="⚡ Converting with Cerebras Lightning Speed...")
        try:
            completion = client.chat_completion(
                messages=[{"role": "user", "content": prompt}],
                max_tokens=4096,
                temperature=0.1
            )
            converted_text = completion.choices[0].message.content
        except Exception as e:
            # The status is rendered as HTML, so escape the exception text.
            return None, f"❌ Conversion error: {html.escape(str(e))}"
        if not converted_text or not converted_text.strip():
            return None, "❌ Conversion error: the model returned an empty response"
        progress(0.9, desc="💫 Creating output file...")
        output_path = create_output_file(converted_text, output_format)
        progress(1.0, desc="✅ Conversion complete!")
        return output_path, f"✅ Successfully converted to {output_format} using ZeroGPU!"
    except Exception as e:
        return None, f"❌ Error: {html.escape(str(e))}"
def create_conversion_prompt(content, output_format):
    """Build the conversion prompt sent to the model.

    Args:
        content: Text extracted from the Pages document.
        output_format: Target format label; ``"PDF"``, ``"DOCX"``, ``"TXT"``,
            ``"HTML"`` and ``"Markdown"`` get a format-specific instruction,
            anything else gets a generic one.

    Returns:
        The prompt as a single newline-separated string.
    """
    per_format_hint = {
        "PDF": "Create content suitable for PDF format with proper structure and formatting",
        "DOCX": "Format as Microsoft Word document with headers, paragraphs, and proper styling",
        "TXT": "Convert to clean, readable plain text preserving structure",
        "HTML": "Create well-structured HTML with semantic markup",
        "Markdown": "Convert to properly formatted Markdown with headers and structure"
    }
    if output_format in per_format_hint:
        hint = per_format_hint[output_format]
    else:
        hint = "Format appropriately for the requested output type"
    prompt_lines = [
        f"You are an expert document converter. Convert the following Apple Pages document content to {output_format} format.",
        "INSTRUCTIONS:",
        "1. Preserve the original structure, formatting, and content organization",
        "2. Maintain headings, paragraphs, lists, and any tables if present",
        "3. Ensure the output is clean, professional, and well-formatted",
        f"4. {hint}",
        "5. Return ONLY the converted content without explanations or meta-commentary",
        "ORIGINAL CONTENT:",
        f"{content}",
        f"CONVERTED {output_format.upper()} OUTPUT:",
    ]
    return "\n".join(prompt_lines)
def _reserve_temp_path(suffix):
    """Create an empty temporary file ending in *suffix*, close it, return its path."""
    descriptor, path = tempfile.mkstemp(suffix=suffix)
    # Close our handle so a library can reopen the path by name.
    os.close(descriptor)
    return path


def _strip_code_fence(text):
    """Unwrap *text* when the whole of it is one Markdown code fence.

    The text is returned unchanged unless its first line starts with three
    backticks, its last line is exactly three backticks, and no line in
    between starts a fence. A document that merely begins and ends with
    separate code blocks is therefore left alone.
    """
    fence = '```'
    lines = text.split('\n')
    if len(lines) < 2:
        return text
    if not lines[0].startswith(fence) or lines[-1].strip() != fence:
        return text
    inner_lines = lines[1:-1]
    if any(line.lstrip().startswith(fence) for line in inner_lines):
        return text
    return '\n'.join(inner_lines).strip()


def create_output_file(content, output_format):
    """Write converted content to a temporary file of the requested format.

    Surrounding whitespace and a single enclosing Markdown code fence are
    removed first. ``"PDF"`` is rendered with reportlab (lines wrapped at 80
    characters, a new page when the cursor passes the bottom margin),
    ``"DOCX"`` with python-docx (one paragraph per blank-line-separated
    chunk); every other format is written as UTF-8 text.

    Args:
        content: The converted document text.
        output_format: ``"PDF"``, ``"DOCX"``, ``"TXT"``, ``"HTML"`` or
            ``"Markdown"``. Unknown values are written as ``.txt``.

    Returns:
        Path of the created file. The file is not deleted automatically.
    """
    # Clean the content (remove potential prompt artifacts)
    content = _strip_code_fence(content.strip())
    extensions = {
        "PDF": ".pdf",
        "DOCX": ".docx",
        "TXT": ".txt",
        "HTML": ".html",
        "Markdown": ".md"
    }
    if output_format == "PDF":
        # Import before reserving the file so a missing dependency cannot
        # leave an orphan temp file behind.
        import textwrap
        from reportlab.lib.pagesizes import letter
        from reportlab.pdfgen import canvas

        output_path = _reserve_temp_path(extensions["PDF"])
        pdf = canvas.Canvas(output_path, pagesize=letter)
        _, page_height = letter
        y_position = page_height - 50
        # Split content into lines and wrap long lines at 80 characters
        lines = []
        for paragraph in content.split('\n'):
            if paragraph.strip():
                lines.extend(textwrap.wrap(paragraph, width=80) or [''])
            else:
                lines.append('')  # Preserve empty lines
        for line in lines:
            if y_position < 50:  # Start new page
                pdf.showPage()
                y_position = page_height - 50
            pdf.drawString(50, y_position, line)
            y_position -= 20
        pdf.save()
        return output_path
    if output_format == "DOCX":
        from docx import Document

        output_path = _reserve_temp_path(extensions["DOCX"])
        doc = Document()
        for para in content.split('\n\n'):
            if para.strip():
                doc.add_paragraph(para.strip())
        doc.save(output_path)
        return output_path
    # For TXT, HTML, Markdown (and any unknown format, stored as .txt)
    ext = extensions.get(output_format, ".txt")
    with tempfile.NamedTemporaryFile(mode='w', suffix=ext, delete=False, encoding='utf-8') as f:
        f.write(content)
        return f.name
# Mobile-optimized CSS, kept as three sections that are concatenated into the
# single stylesheet string handed to the interface.

# Base layout and component styling.
_BASE_CSS = """
/* Mobile-first responsive design */
@viewport { width: device-width; zoom: 1.0; }
.gradio-container {
    background: linear-gradient(135deg, #1e3c72 0%, #2a5298 100%);
    min-height: 100vh;
    width: 100%;
    overflow-x: hidden;
}
.main-content {
    max-width: 1000px;
    margin: 0 auto;
    padding: 1rem;
    width: 100%;
    box-sizing: border-box;
}
.hero-section {
    background: white;
    border-radius: 1rem;
    padding: 1.5rem;
    text-align: center;
    box-shadow: 0 10px 30px rgba(0,0,0,0.1);
    margin-bottom: 1.5rem;
}
.upload-section {
    background: white;
    border-radius: 1rem;
    padding: 1.5rem;
    box-shadow: 0 5px 15px rgba(0,0,0,0.1);
    margin-bottom: 1rem;
}
.format-selector {
    background: #f8f9fa;
    border-radius: 0.5rem;
    padding: 1rem;
    margin: 1rem 0;
}
.convert-button {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
    border: none;
    padding: 1rem 2rem;
    border-radius: 0.5rem;
    font-size: 1.1rem;
    font-weight: bold;
    width: 100%;
    cursor: pointer;
    transition: all 0.3s ease;
    min-height: 44px; /* Better touch target */
}
.convert-button:hover {
    transform: translateY(-2px);
    box-shadow: 0 5px 15px rgba(102, 126, 234, 0.3);
}
.zerogpu-badge {
    display: inline-block;
    background: linear-gradient(45deg, #ff6b6b, #feca57);
    color: white;
    padding: 0.5rem 1rem;
    border-radius: 2rem;
    font-weight: bold;
    font-size: 0.9rem;
}
.pro-features {
    background: #e8f5e9;
    border-radius: 0.5rem;
    padding: 1rem;
    margin-top: 1rem;
}
"""

# Overrides applied on viewports up to 768px wide.
_MOBILE_CSS = """/* Mobile responsiveness */
@media (max-width: 768px) {
    .main-content {
        padding: 0.5rem;
    }
    .hero-section {
        padding: 1rem;
        margin-bottom: 1rem;
    }
    .hero-section h1 {
        font-size: 1.5rem;
    }
    .upload-section {
        padding: 1rem;
    }
    .pro-features {
        padding: 0.75rem;
    }
    .pro-features div {
        display: grid;
        grid-template-columns: 1fr;
        gap: 0.5rem;
    }
    .format-selector {
        padding: 0.75rem;
    }
    /* Make radio buttons more touch-friendly */
    .gradio-radio {
        gap: 1rem;
    }
    .gradio-radio label {
        padding: 0.75rem;
        border-radius: 0.5rem;
        border: 2px solid #e0e0e0;
        cursor: pointer;
        transition: all 0.2s ease;
    }
    .gradio-radio input[type=radio]:checked + label {
        background-color: #f0f9ff;
        border-color: #667eea;
    }
}
"""

# Styling used when the app runs in standalone display mode.
_PWA_CSS = """/* PWA styling */
@media (display-mode: standalone) {
    body {
        background: #1e3c72;
    }
}
"""

css = _BASE_CSS + _MOBILE_CSS + _PWA_CSS
# Static HTML fragments shown by the interface, kept apart from the layout
# code so the component tree below is easy to scan.
_HERO_HTML = """
        <div class="hero-section">
            <h1>📄 Pages Converter Pro</h1>
            <span class="zerogpu-badge">⚡ ZeroGPU Accelerated</span>
            <p style="margin-top: 1rem; color: #666;">
                Convert Apple Pages documents with lightning-fast Cerebras Llama-3.3-70B
            </p>
        </div>
        """
_PRO_FEATURES_HTML = """
        <div class="pro-features">
            <h3>🚀 HuggingFace Pro Benefits Active</h3>
            <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 1rem; margin-top: 1rem;">
                <div>✅ 5x Usage Quota</div>
                <div>🔥 Priority Queue Access</div>
                <div>💎 H200 GPU Hardware</div>
                <div>⚡ Zero-GPU Acceleration</div>
            </div>
        </div>
        """
_INFO_HTML = """
            <div style="background: white; padding: 1.5rem; border-radius: 1rem; box-shadow: 0 5px 15px rgba(0,0,0,0.1); margin-top: 1rem;">
                <h3>⚡ ZeroGPU Features</h3>
                <ul style="color: #666;">
                    <li>Lightning-fast processing</li>
                    <li>H200 hardware acceleration</li>
                    <li>Priority queue access</li>
                    <li>Optimized for mobile</li>
                </ul>
                <h3>📋 Supported Formats</h3>
                <ul style="color: #666;">
                    <li>📄 PDF (best quality)</li>
                    <li>📝 Microsoft Word (DOCX)</li>
                    <li>📋 Plain Text (TXT)</li>
                    <li>🌐 Web Page (HTML)</li>
                    <li>✏️ Markdown (MD)</li>
                </ul>
            </div>
            """
_FOOTER_HTML = """
        <div style="text-align: center; margin-top: 2rem; padding: 1.5rem; color: white;">
            <p>💎 Built exclusively for HuggingFace Pro users</p>
            <p><small>Powered by Cerebras • Accelerated by ZeroGPU • Made with ❤️</small></p>
        </div>
        """
_READY_STATUS_HTML = "<div style='text-align: center; padding: 1rem; color: #666;'>Ready to convert your Pages document</div>"

# Build the interface: hero, benefits, upload form, info panel, download
# slot, status line and footer, all inside one centred column.
with gr.Blocks(css=css, title="Pages Converter Pro - ZeroGPU", theme=gr.themes.Soft()) as app:
    with gr.Column(elem_classes=["main-content"]):
        # Hero section followed by the benefits showcase
        gr.HTML(_HERO_HTML)
        gr.HTML(_PRO_FEATURES_HTML)
        # Main conversion interface
        with gr.Column():
            with gr.Column(elem_classes=["upload-section"]):
                gr.HTML("<h3>📎 Upload Your Document</h3>")
                file_input = gr.File(
                    label="Select .pages file",
                    file_types=[".pages"],
                    elem_id="file-upload",
                )
                output_format = gr.Radio(
                    choices=["PDF", "DOCX", "TXT", "HTML", "Markdown"],
                    value="PDF",
                    label="🎯 Output Format",
                    elem_classes=["format-selector"],
                )
                convert_btn = gr.Button(
                    "⚡ Convert with ZeroGPU",
                    variant="primary",
                    elem_classes=["convert-button"],
                )
        # Info section (responsive)
        with gr.Column():
            gr.HTML(_INFO_HTML)
        # Output section: download slot, then the status line
        with gr.Row():
            output_file = gr.File(
                label="📁 Download Your Converted File",
                elem_id="output-download",
            )
        with gr.Row():
            status_html = gr.HTML(
                value=_READY_STATUS_HTML,
                elem_id="status-display",
            )
        # Wire the button to the converter
        convert_btn.click(
            fn=convert_pages_document,
            inputs=[file_input, output_format],
            outputs=[output_file, status_html],
            show_progress=True,
        )
        # Footer
        gr.HTML(_FOOTER_HTML)

# Launch with PWA enabled (automatic on Spaces)
if __name__ == "__main__":
    app.launch()