SlouchyBuffalo committed on
Commit
999355e
·
verified ·
1 Parent(s): 8c7667b

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +510 -0
app.py ADDED
@@ -0,0 +1,510 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# app.py - Mobile-Optimized PWA Pages Converter
#
# Module setup: imports, a startup check that the Hugging Face token is
# present, and the shared inference client used for every conversion.
import json
import os
import re
import tempfile
import time
import zipfile
from pathlib import Path

import gradio as gr
import spaces
from huggingface_hub import InferenceClient

# Debug token: only presence and length are printed, never the value itself.
token = os.getenv("HF_TOKEN")
print(f"Debug: Token exists = {token is not None}")
print(f"Debug: Token length = {len(token) if token else 0}")

# Initialize the client with Cerebras. The client is created once at import
# time and reused by convert_pages_document for each request.
client = InferenceClient(
    "meta-llama/Llama-3.3-70B-Instruct",
    provider="cerebras",
    token=token,
)
24
+
25
def _is_index_iwa_member(member_name):
    """Return True for archive members that sit directly in Index/ and end in .iwa."""
    prefix = "Index/"
    if not (member_name.startswith(prefix) and member_name.endswith(".iwa")):
        return False
    # Only direct children of Index/ qualify; nested folders are skipped.
    return "/" not in member_name[len(prefix):]


@spaces.GPU
def extract_pages_content(file_path):
    """Extract readable text from an Apple Pages (.pages) archive.

    The file is opened as a zip archive. Each ``Index/*.iwa`` member is read
    as raw bytes, decoded as UTF-8 with undecodable bytes dropped, and scanned
    for runs of printable ASCII. Those runs are passed through
    ``filter_metadata``, de-duplicated in first-seen order, and joined with
    blank lines.

    Members are read straight from the archive instead of extracting the
    whole upload to a temporary directory, so nothing from an untrusted
    archive is written to disk.

    Args:
        file_path: Path of the uploaded .pages file.

    Returns:
        The joined text on success. On failure a human-readable message is
        returned instead of raising: "Could not extract readable content from
        .pages file" when no text survived filtering, or "Error extracting
        content: ..." when the archive could not be processed.
    """
    print(f"DEBUG: Processing file: {file_path}")
    print(f"DEBUG: File exists: {os.path.exists(file_path)}")

    try:
        content_parts = []

        with zipfile.ZipFile(file_path, 'r') as zip_ref:
            member_names = zip_ref.namelist()
            print(f"DEBUG: Archive members: {member_names}")

            # Strategy 1: Look for iwa files in Index folder
            iwa_members = [name for name in member_names if _is_index_iwa_member(name)]
            if not iwa_members:
                print("DEBUG: No Index/*.iwa members found")

            for member in iwa_members:
                try:
                    # iwa files are protobuf archives, try reading as binary
                    binary_content = zip_ref.read(member)
                    # Try to find text content in the binary
                    text_content = binary_content.decode('utf-8', errors='ignore')

                    # Extract readable text with better filtering
                    readable_text = re.findall(r'[\x20-\x7E]+', text_content)

                    # Filter out metadata and system strings
                    content_parts.extend(filter_metadata(readable_text))
                except Exception as e:
                    # Best effort: one unreadable member must not abort the rest.
                    print(f"DEBUG: Error processing {member}: {e}")
                    continue

        if not content_parts:
            return "Could not extract readable content from .pages file"

        # Clean and deduplicate (dict keys preserve first-seen order)
        unique_content = list(dict.fromkeys(content_parts))
        # Join with proper spacing
        final_content = "\n\n".join(unique_content)
        print(f"DEBUG: Extracted content length: {len(final_content)}")
        return final_content

    except Exception as e:
        print(f"DEBUG: Exception in extract_pages_content: {e}")
        return f"Error extracting content: {str(e)}"
75
+
76
# Lower-case markers of locale, calendar, date-format, browser and vendor
# strings that should not be treated as document text.
_METADATA_MARKERS = (
    'en_us', 'en_usp', 'gregorian', 'january', 'february', 'march', 'april', 'may', 'june',
    'july', 'august', 'september', 'october', 'november', 'december',
    'sunday', 'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday',
    '1st quarter', '2nd quarter', '3rd quarter', '4th quarter',
    'before christ', 'anno domini', 'bc', 'ad',
    'm/d/yy', 'mmm d', 'eeee', 'yyyy',
    'webkit', 'safari', 'chrome', 'mozilla',
    'apple', 'iwork', 'pages',
)

# Purely alphabetic markers must match as whole words. Matching them as raw
# substrings made 'ad' reject "read" and 'may' reject "maybe", which threw
# away most ordinary sentences.
_METADATA_WORD_RE = re.compile(
    r'(?<![a-z])(?:'
    + '|'.join(re.escape(marker) for marker in _METADATA_MARKERS if marker.isalpha())
    + r')(?![a-z])'
)

# Markers containing digits, spaces or punctuation (e.g. 'en_us', 'm/d/yy')
# keep substring semantics so variants such as "en_US_POSIX" are still caught.
_METADATA_FRAGMENTS = tuple(
    marker for marker in _METADATA_MARKERS if not marker.isalpha()
)

_HAS_ASCII_LETTER_RE = re.compile(r'[a-zA-Z]')

# Shortest stripped string that is kept.
_MIN_TEXT_LENGTH = 6


def filter_metadata(text_list):
    """Drop metadata-looking strings and keep text that looks like content.

    A string is kept when, after stripping surrounding whitespace, it is at
    least six characters long, contains at least one ASCII letter, and does
    not mention a metadata marker. Alphabetic markers (month and day names,
    'apple', 'pages', 'ad', ...) count only as whole words; markers with
    punctuation or digits count anywhere in the string. Matching is
    case-insensitive.

    Purely numeric strings (years, decimals, dates) contain no letter and are
    therefore rejected by the letter requirement; no separate numeric check
    is needed.

    Known limitation: a sentence that uses a marker as a real word (for
    example "may" or "pages") is still discarded.

    Args:
        text_list: Iterable of candidate strings.

    Returns:
        A list of the stripped strings that passed, in input order.
    """
    filtered_parts = []
    for text in text_list:
        text_clean = text.strip()

        # Skip if empty or too short to be meaningful
        if len(text_clean) < _MIN_TEXT_LENGTH:
            continue

        # Skip numbers, dates and other letter-free strings
        if not _HAS_ASCII_LETTER_RE.search(text_clean):
            continue

        text_lower = text_clean.lower()

        # Skip if it mentions a metadata marker
        if any(fragment in text_lower for fragment in _METADATA_FRAGMENTS):
            continue
        if _METADATA_WORD_RE.search(text_lower):
            continue

        filtered_parts.append(text_clean)

    return filtered_parts
122
+
123
@spaces.GPU
def convert_pages_document(file, output_format, progress=gr.Progress()):
    """Convert an uploaded .pages file to ``output_format`` via the chat model.

    Steps: extract text with ``extract_pages_content``, build a prompt with
    ``create_conversion_prompt``, send it to the module-level ``client``, and
    write the reply to a file with ``create_output_file``. Progress is
    reported through ``progress`` at each stage.

    Args:
        file: The uploaded file. Either an object exposing the path as
            ``.name`` or a plain path string is accepted.
        output_format: Target format label, e.g. "PDF" or "Markdown".
        progress: Progress callback, called as ``progress(fraction, desc=...)``.

    Returns:
        A ``(output_path, status_message)`` tuple. ``output_path`` is None
        whenever the conversion did not produce a file. Errors are reported
        through the status message rather than raised.

    NOTE(review): the emoji prefixes in the status strings appear mis-encoded
    in this copy of the file; they are kept byte-for-byte. Confirm against
    the original source.
    """
    if not file:
        return None, "❌ Please upload a .pages file"

    try:
        progress(0.1, desc="πŸ“– Extracting content from .pages file...")

        # Extract content; tolerate both file-like wrappers and plain paths.
        file_path = getattr(file, "name", file)
        content = extract_pages_content(file_path)

        # extract_pages_content reports failure by returning a message, not
        # by raising. Those messages are longer than 10 characters, so they
        # must be recognised explicitly or they would be sent to the model
        # as if they were the document.
        extraction_failed = (
            not content
            or len(content.strip()) < 10
            or content.startswith((
                "Could not extract readable content",
                "Error extracting content:",
            ))
        )
        if extraction_failed:
            return None, "❌ Could not extract sufficient content from .pages file"

        # Log extracted content for debugging
        print(f"DEBUG: Final extracted content preview: {content[:200]}...")

        progress(0.4, desc="πŸ€– Preparing conversion with Cerebras...")

        # Create format-specific prompt
        prompt = create_conversion_prompt(content, output_format)

        progress(0.6, desc="⚑ Converting with Cerebras Lightning Speed...")

        # Convert using Cerebras
        try:
            # Use chat completion instead
            messages = [{"role": "user", "content": prompt}]
            completion = client.chat_completion(
                messages=messages,
                max_tokens=4096,
                temperature=0.1
            )
            # Extract the response text
            converted_text = completion.choices[0].message.content
        except Exception as e:
            return None, f"❌ Conversion error: {str(e)}"

        # An empty reply would otherwise produce an empty output file.
        if not converted_text or not converted_text.strip():
            return None, "❌ Conversion error: empty response from model"

        progress(0.9, desc="πŸ’« Creating output file...")

        # Create output file
        output_path = create_output_file(converted_text, output_format)

        progress(1.0, desc="βœ… Conversion complete!")

        return output_path, f"βœ… Successfully converted to {output_format} using ZeroGPU!"

    except Exception as e:
        # Top-level boundary for the UI callback: report, do not crash.
        return None, f"❌ Error: {str(e)}"
173
+
174
def create_conversion_prompt(content, output_format):
    """Build the conversion prompt sent to the chat model.

    Args:
        content: Text extracted from the Pages document; inserted verbatim.
        output_format: Target format label. "PDF", "DOCX", "TXT", "HTML" and
            "Markdown" have a dedicated instruction line; any other value
            gets a generic instruction.

    Returns:
        The prompt string. It ends with a "CONVERTED <FORMAT> OUTPUT:" header
        for the model to continue from.
    """
    format_instructions = {
        "PDF": "Create content suitable for PDF format with proper structure and formatting",
        "DOCX": "Format as Microsoft Word document with headers, paragraphs, and proper styling",
        "TXT": "Convert to clean, readable plain text preserving structure",
        "HTML": "Create well-structured HTML with semantic markup",
        "Markdown": "Convert to properly formatted Markdown with headers and structure"
    }
    format_hint = format_instructions.get(
        output_format, "Format appropriately for the requested output type"
    )

    return f"""You are an expert document converter. Convert the following Apple Pages document content to {output_format} format.

INSTRUCTIONS:
1. Preserve the original structure, formatting, and content organization
2. Maintain headings, paragraphs, lists, and any tables if present
3. Ensure the output is clean, professional, and well-formatted
4. {format_hint}
5. Return ONLY the converted content without explanations or meta-commentary

ORIGINAL CONTENT:
{content}

CONVERTED {output_format.upper()} OUTPUT:"""
197
+
198
def _reserve_temp_path(suffix):
    """Create an empty temporary file with ``suffix`` and return its path."""
    handle = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
    # Close our handle before another library reopens the path; holding it
    # open while a second writer opens the same file fails on some platforms.
    handle.close()
    return handle.name


def _write_pdf(content, path):
    """Render ``content`` as left-aligned text lines into a PDF at ``path``."""
    from reportlab.pdfgen import canvas
    from reportlab.lib.pagesizes import letter
    import textwrap

    pdf = canvas.Canvas(path, pagesize=letter)
    _, height = letter
    y_position = height - 50

    # Split content into lines and wrap long lines
    lines = []
    for paragraph in content.split('\n'):
        if paragraph.strip():
            # Wrap long lines at 80 characters
            lines.extend(textwrap.wrap(paragraph, width=80) or [''])
        else:
            lines.append('')  # Preserve empty lines

    for line in lines:
        if y_position < 50:  # Start new page
            pdf.showPage()
            y_position = height - 50
        pdf.drawString(50, y_position, line)
        y_position -= 20

    pdf.save()


def _write_docx(content, path):
    """Write ``content`` to a .docx at ``path``, one paragraph per blank-line block."""
    from docx import Document

    doc = Document()
    for para in content.split('\n\n'):
        if para.strip():
            doc.add_paragraph(para.strip())
    doc.save(path)


def create_output_file(content, output_format):
    """Write converted content to a temporary file and return its path.

    "PDF" output is drawn with reportlab and "DOCX" output is built with
    python-docx; both libraries are imported only when that format is
    requested. Every other format is written as UTF-8 text, with the file
    extension taken from the format label (".txt" for "TXT" and for any
    unrecognised label).

    Args:
        content: Converted text. Surrounding whitespace is stripped; None is
            treated as empty text.
        output_format: Target format label.

    Returns:
        Path of the created file. The file is not deleted automatically.
    """
    # Clean the content (remove potential prompt artifacts)
    content = (content or "").strip()

    # Extensions for the formats that are written as plain text
    text_extensions = {
        "TXT": ".txt",
        "HTML": ".html",
        "Markdown": ".md"
    }

    if output_format == "PDF":
        path = _reserve_temp_path('.pdf')
        _write_pdf(content, path)
        return path

    if output_format == "DOCX":
        path = _reserve_temp_path('.docx')
        _write_docx(content, path)
        return path

    # For TXT, HTML, Markdown (and any unknown label, which falls back to .txt)
    ext = text_extensions.get(output_format, ".txt")
    with tempfile.NamedTemporaryFile(mode='w', suffix=ext, delete=False, encoding='utf-8') as f:
        f.write(content)
        return f.name
264
+
265
+ # Mobile-optimized CSS
266
+ css = """
267
+ /* Mobile-first responsive design */
268
+ @viewport { width: device-width; zoom: 1.0; }
269
+
270
+ .gradio-container {
271
+ background: linear-gradient(135deg, #1e3c72 0%, #2a5298 100%);
272
+ min-height: 100vh;
273
+ width: 100%;
274
+ overflow-x: hidden;
275
+ }
276
+
277
+ .main-content {
278
+ max-width: 1000px;
279
+ margin: 0 auto;
280
+ padding: 1rem;
281
+ width: 100%;
282
+ box-sizing: border-box;
283
+ }
284
+
285
+ .hero-section {
286
+ background: white;
287
+ border-radius: 1rem;
288
+ padding: 1.5rem;
289
+ text-align: center;
290
+ box-shadow: 0 10px 30px rgba(0,0,0,0.1);
291
+ margin-bottom: 1.5rem;
292
+ }
293
+
294
+ .upload-section {
295
+ background: white;
296
+ border-radius: 1rem;
297
+ padding: 1.5rem;
298
+ box-shadow: 0 5px 15px rgba(0,0,0,0.1);
299
+ margin-bottom: 1rem;
300
+ }
301
+
302
+ .format-selector {
303
+ background: #f8f9fa;
304
+ border-radius: 0.5rem;
305
+ padding: 1rem;
306
+ margin: 1rem 0;
307
+ }
308
+
309
+ .convert-button {
310
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
311
+ color: white;
312
+ border: none;
313
+ padding: 1rem 2rem;
314
+ border-radius: 0.5rem;
315
+ font-size: 1.1rem;
316
+ font-weight: bold;
317
+ width: 100%;
318
+ cursor: pointer;
319
+ transition: all 0.3s ease;
320
+ min-height: 44px; /* Better touch target */
321
+ }
322
+
323
+ .convert-button:hover {
324
+ transform: translateY(-2px);
325
+ box-shadow: 0 5px 15px rgba(102, 126, 234, 0.3);
326
+ }
327
+
328
+ .zerogpu-badge {
329
+ display: inline-block;
330
+ background: linear-gradient(45deg, #ff6b6b, #feca57);
331
+ color: white;
332
+ padding: 0.5rem 1rem;
333
+ border-radius: 2rem;
334
+ font-weight: bold;
335
+ font-size: 0.9rem;
336
+ }
337
+
338
+ .pro-features {
339
+ background: #e8f5e9;
340
+ border-radius: 0.5rem;
341
+ padding: 1rem;
342
+ margin-top: 1rem;
343
+ }
344
+
345
+ /* Mobile responsiveness */
346
+ @media (max-width: 768px) {
347
+ .main-content {
348
+ padding: 0.5rem;
349
+ }
350
+
351
+ .hero-section {
352
+ padding: 1rem;
353
+ margin-bottom: 1rem;
354
+ }
355
+
356
+ .hero-section h1 {
357
+ font-size: 1.5rem;
358
+ }
359
+
360
+ .upload-section {
361
+ padding: 1rem;
362
+ }
363
+
364
+ .pro-features {
365
+ padding: 0.75rem;
366
+ }
367
+
368
+ .pro-features div {
369
+ display: grid;
370
+ grid-template-columns: 1fr;
371
+ gap: 0.5rem;
372
+ }
373
+
374
+ .format-selector {
375
+ padding: 0.75rem;
376
+ }
377
+
378
+ /* Make radio buttons more touch-friendly */
379
+ .gradio-radio {
380
+ gap: 1rem;
381
+ }
382
+
383
+ .gradio-radio label {
384
+ padding: 0.75rem;
385
+ border-radius: 0.5rem;
386
+ border: 2px solid #e0e0e0;
387
+ cursor: pointer;
388
+ transition: all 0.2s ease;
389
+ }
390
+
391
+ .gradio-radio input[type=radio]:checked + label {
392
+ background-color: #f0f9ff;
393
+ border-color: #667eea;
394
+ }
395
+ }
396
+
397
+ /* PWA styling */
398
+ @media (display-mode: standalone) {
399
+ body {
400
+ background: #1e3c72;
401
+ }
402
+ }
403
+ """
404
+
405
# Create the Gradio interface with PWA enabled
#
# Layout, top to bottom: hero banner, benefits panel, upload/format/convert
# controls, an info panel, the download slot, a status line, and a footer.
# The convert button is wired to convert_pages_document.
#
# NOTE(review): this copy of the file lost its indentation, so the container
# nesting below is reconstructed from the section comments and statement
# order. Confirm it against the original layout.
# NOTE(review): the emoji inside the labels and HTML appear mis-encoded in
# this copy; they are kept byte-for-byte. Confirm against the original.
with gr.Blocks(css=css, title="Pages Converter Pro - ZeroGPU", theme=gr.themes.Soft()) as app:
    with gr.Column(elem_classes=["main-content"]):
        # Hero section
        gr.HTML("""
        <div class="hero-section">
        <h1>πŸ“„ Pages Converter Pro</h1>
        <span class="zerogpu-badge">⚑ ZeroGPU Accelerated</span>
        <p style="margin-top: 1rem; color: #666;">
        Convert Apple Pages documents with lightning-fast Cerebras Llama-3.3-70B
        </p>
        </div>
        """)

        # Pro benefits showcase
        gr.HTML("""
        <div class="pro-features">
        <h3>πŸš€ HuggingFace Pro Benefits Active</h3>
        <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 1rem; margin-top: 1rem;">
        <div>βœ… 5x Usage Quota</div>
        <div>πŸ”₯ Priority Queue Access</div>
        <div>πŸ’Ž H200 GPU Hardware</div>
        <div>⚑ Zero-GPU Acceleration</div>
        </div>
        </div>
        """)

        # Main conversion interface
        with gr.Column():
            with gr.Column(elem_classes=["upload-section"]):
                gr.HTML("<h3>πŸ“Ž Upload Your Document</h3>")

                file_input = gr.File(
                    label="Select .pages file",
                    file_types=[".pages"],
                    elem_id="file-upload"
                )

                output_format = gr.Radio(
                    choices=["PDF", "DOCX", "TXT", "HTML", "Markdown"],
                    value="PDF",
                    label="🎯 Output Format",
                    elem_classes=["format-selector"]
                )

                convert_btn = gr.Button(
                    "⚑ Convert with ZeroGPU",
                    variant="primary",
                    elem_classes=["convert-button"]
                )

            # Info section (responsive)
            with gr.Column():
                gr.HTML("""
                <div style="background: white; padding: 1.5rem; border-radius: 1rem; box-shadow: 0 5px 15px rgba(0,0,0,0.1); margin-top: 1rem;">
                <h3>⚑ ZeroGPU Features</h3>
                <ul style="color: #666;">
                <li>Lightning-fast processing</li>
                <li>H200 hardware acceleration</li>
                <li>Priority queue access</li>
                <li>Optimized for mobile</li>
                </ul>

                <h3>πŸ“‹ Supported Formats</h3>
                <ul style="color: #666;">
                <li>πŸ“„ PDF (best quality)</li>
                <li>πŸ“ Microsoft Word (DOCX)</li>
                <li>πŸ“‹ Plain Text (TXT)</li>
                <li>🌐 Web Page (HTML)</li>
                <li>✏️ Markdown (MD)</li>
                </ul>
                </div>
                """)

        # Output section
        with gr.Row():
            output_file = gr.File(
                label="πŸ“ Download Your Converted File",
                elem_id="output-download"
            )

        with gr.Row():
            status_html = gr.HTML(
                value="<div style='text-align: center; padding: 1rem; color: #666;'>Ready to convert your Pages document</div>",
                elem_id="status-display"
            )

        # Connect the interface: the callback returns (file path, status text)
        convert_btn.click(
            fn=convert_pages_document,
            inputs=[file_input, output_format],
            outputs=[output_file, status_html],
            show_progress=True
        )

        # Footer
        gr.HTML("""
        <div style="text-align: center; margin-top: 2rem; padding: 1.5rem; color: white;">
        <p>πŸ’Ž Built exclusively for HuggingFace Pro users</p>
        <p><small>Powered by Cerebras β€’ Accelerated by ZeroGPU β€’ Made with ❀️</small></p>
        </div>
        """)
507
+
508
# Launch with PWA enabled (automatic on Spaces)
# Only start the server when the file is run as a script, not when imported.
if __name__ == "__main__":
    app.launch()