awacke1 committed on
Commit
55c99d6
·
verified ·
1 Parent(s): e293bcf

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +176 -30
app.py CHANGED
@@ -3,48 +3,99 @@ import gradio as gr
3
  import pandas as pd
4
  import requests
5
  import io
6
- import dask.dataframe as dd
7
- from datasets import load_dataset, Image
8
- from mlcroissant import Dataset as CroissantDataset
9
- from huggingface_hub import get_token
10
- import polars as pl
11
  import warnings
12
  import traceback
13
  import json
14
- import tempfile # Added for creating temporary files
 
 
15
 
16
- # 🀫 Let's ignore those pesky warnings, shall we?
17
  warnings.filterwarnings("ignore")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
  # --- βš™οΈ Configuration & Constants ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  DATASET_CONFIG = {
21
  "caselaw": {
22
  "name": "common-pile/caselaw_access_project", "emoji": "βš–οΈ",
23
- "methods": ["πŸ’¨ API (requests)", "🧊 Dask", "πŸ₯ Croissant"], "is_public": True,
24
  },
25
  "prompts": {
26
  "name": "fka/awesome-chatgpt-prompts", "emoji": "πŸ€–",
27
- "methods": ["🐼 Pandas", "πŸ’¨ API (requests)", "πŸ₯ Croissant"], "is_public": True,
28
  },
29
  "finance": {
30
  "name": "snorkelai/agent-finance-reasoning", "emoji": "πŸ’°",
31
- "methods": ["🐼 Pandas", "🧊 Polars", "πŸ’¨ API (requests)", "πŸ₯ Croissant"], "is_public": False,
32
  },
33
  "medical": {
34
  "name": "FreedomIntelligence/medical-o1-reasoning-SFT", "emoji": "🩺",
35
- "methods": ["🐼 Pandas", "🧊 Polars", "πŸ’¨ API (requests)", "πŸ₯ Croissant"], "is_public": False,
36
  },
37
  "inscene": {
38
  "name": "peteromallet/InScene-Dataset", "emoji": "πŸ–ΌοΈ",
39
- "methods": ["πŸ€— Datasets", "🐼 Pandas", "🧊 Polars", "πŸ’¨ API (requests)", "πŸ₯ Croissant"], "is_public": False,
40
  },
41
  }
42
 
43
  # --- πŸ”§ Helpers & Utility Functions ---
44
 
45
  def get_auth_headers():
46
- token = get_token()
47
- return {"Authorization": f"Bearer {token}"} if token else {}
 
 
 
 
 
 
48
 
49
  # --- ✨ FIXED: dataframe_to_outputs to use temporary files ---
50
  def dataframe_to_outputs(df: pd.DataFrame):
@@ -261,8 +312,24 @@ def fetch_data(dataset_key: str, access_method: str, query: str):
261
  outputs[2] = f"βœ… Found **{len(all_results_df)}** results so far..."
262
 
263
  if dataset_key == 'inscene':
264
- gallery_data = [(row['image'], row.get('text', '')) for _, row in all_results_df.iterrows() if 'image' in row and isinstance(row['image'], Image.Image)]
265
- outputs[1] = gr.Gallery(gallery_data, label="πŸ–ΌοΈ Image Results", height=400)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
266
  yield tuple(outputs)
267
 
268
  outputs[2] = f"🏁 Search complete. Found a total of **{len(all_results_df)}** results."
@@ -289,10 +356,14 @@ def fetch_data(dataset_key: str, access_method: str, query: str):
289
  df = pd.read_json(f"{file_path}medical_o1_sft.json")
290
 
291
  elif "Datasets" in access_method:
 
 
292
  ds = load_dataset(repo_id, split='train', streaming=True).take(1000)
293
  df = pd.DataFrame(ds)
294
 
295
  elif "Polars" in access_method:
 
 
296
  outputs[2] = "⏳ Loading with Polars..."
297
  yield tuple(outputs)
298
  if repo_id == "fka/awesome-chatgpt-prompts":
@@ -302,22 +373,50 @@ def fetch_data(dataset_key: str, access_method: str, query: str):
302
  df = pl_df.to_pandas()
303
 
304
  elif "Dask" in access_method:
 
 
305
  outputs[2] = "⏳ Loading with Dask..."
306
  yield tuple(outputs)
307
  dask_df = dd.read_json(f"hf://datasets/{repo_id}/**/*.jsonl.gz")
308
  df = dask_df.head(1000) # Convert to pandas for processing
309
 
310
  elif "Croissant" in access_method:
 
 
311
  outputs[2] = "⏳ Loading with Croissant..."
312
  yield tuple(outputs)
313
- headers = get_auth_headers() if not config["is_public"] else {}
314
- croissant_url = f"https://huggingface.co/api/datasets/{repo_id}/croissant"
315
- response = requests.get(croissant_url, headers=headers)
316
- response.raise_for_status()
317
- jsonld = response.json()
318
- ds = CroissantDataset(jsonld=jsonld)
319
- records = list(ds.records("default"))[:1000] # Take first 1000
320
- df = pd.DataFrame(records)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
321
 
322
  outputs[2] = "πŸ” Searching loaded data..."
323
  yield tuple(outputs)
@@ -329,8 +428,24 @@ def fetch_data(dataset_key: str, access_method: str, query: str):
329
  outputs[2] = f"🏁 Search complete. Found **{len(final_df)}** results."
330
 
331
  if dataset_key == 'inscene' and not final_df.empty:
332
- gallery_data = [(row['image'], row.get('text', '')) for _, row in final_df.iterrows() if 'image' in row and isinstance(row.get('image'), Image.Image)]
333
- outputs[1] = gr.Gallery(gallery_data, label="πŸ–ΌοΈ Image Results", height=400)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
334
 
335
  yield tuple(outputs)
336
 
@@ -347,9 +462,21 @@ def create_dataset_tab(dataset_key: str):
347
  if not config['is_public']:
348
  gr.Markdown("**Note:** This is a gated dataset. Please log in via `huggingface-cli login` in your terminal first.")
349
 
 
 
 
 
 
350
  with gr.Row():
351
- access_method = gr.Radio(config['methods'], label="πŸ”‘ Access Method", value=config['methods'][0])
352
- query = gr.Textbox(label="πŸ” Search Query", placeholder="Enter any text to search, or leave blank for samples...")
 
 
 
 
 
 
 
353
 
354
  fetch_button = gr.Button("πŸš€ Go Fetch!")
355
  status_output = gr.Markdown("🏁 Ready to search.")
@@ -385,7 +512,20 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Hugging Face Dataset Explorer") as
385
  "If an error occurs, a detailed debug log will appear to help troubleshoot the issue."
386
  )
387
 
388
- with gr.Accordion("πŸ”§ Quick Start Guide", open=False):
 
 
 
 
 
 
 
 
 
 
 
 
 
389
  gr.Markdown("""
390
  ### πŸš€ Quick Start:
391
  1. **πŸ€– Prompts Tab**: Try API method, search for "translator" or "linux"
@@ -402,7 +542,13 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Hugging Face Dataset Explorer") as
402
  - **🐼 Pandas**: Full dataset access, requires login for gated datasets
403
  - **πŸ€— Datasets**: Good for streaming large datasets
404
  - **🧊 Polars/Dask**: Alternative fast data processing
405
- - **πŸ₯ Croissant**: Metadata-aware loading
 
 
 
 
 
 
406
  """)
407
 
408
  with gr.Tabs():
 
3
  import pandas as pd
4
  import requests
5
  import io
 
 
 
 
 
6
  import warnings
7
  import traceback
8
  import json
9
+ import tempfile
10
+ import os
11
+ import logging
12
 
13
+ # 🀫 Suppress warnings and set logging levels
14
  warnings.filterwarnings("ignore")
15
+ logging.getLogger("absl").setLevel(logging.ERROR) # Suppress MLCroissant warnings
16
+ os.environ["ABSL_LOG_LEVEL"] = "2" # Only show errors
17
+
18
+ # Import optional dependencies with fallbacks
19
+ try:
20
+ import dask.dataframe as dd
21
+ DASK_AVAILABLE = True
22
+ except ImportError:
23
+ DASK_AVAILABLE = False
24
+
25
+ try:
26
+ from datasets import load_dataset, Image
27
+ DATASETS_AVAILABLE = True
28
+ except ImportError:
29
+ DATASETS_AVAILABLE = False
30
+
31
+ try:
32
+ from mlcroissant import Dataset as CroissantDataset
33
+ CROISSANT_AVAILABLE = True
34
+ except ImportError:
35
+ CROISSANT_AVAILABLE = False
36
+
37
+ try:
38
+ from huggingface_hub import get_token
39
+ HF_HUB_AVAILABLE = True
40
+ except ImportError:
41
+ HF_HUB_AVAILABLE = False
42
+
43
+ try:
44
+ import polars as pl
45
+ POLARS_AVAILABLE = True
46
+ except ImportError:
47
+ POLARS_AVAILABLE = False
48
 
49
  # --- βš™οΈ Configuration & Constants ---
50
def get_available_methods():
    """Return the access-method labels that can be offered in this environment.

    The requests-based API method and the pandas method are always listed.
    Every other label is included only when the module-level flag recorded by
    the corresponding optional import (``DATASETS_AVAILABLE``,
    ``POLARS_AVAILABLE``, ``DASK_AVAILABLE``, ``CROISSANT_AVAILABLE``) is true.

    Returns:
        list[str]: A new list on every call, ordered API, Pandas, Datasets,
        Polars, Dask, Croissant (unavailable entries omitted).
    """
    methods = ["💨 API (requests)", "🐼 Pandas"]

    # (availability flag, label) pairs, kept in the order the UI shows them.
    optional_methods = (
        (DATASETS_AVAILABLE, "🤗 Datasets"),
        (POLARS_AVAILABLE, "🧊 Polars"),
        (DASK_AVAILABLE, "🧊 Dask"),
        (CROISSANT_AVAILABLE, "🥐 Croissant"),
    )
    methods.extend(label for available, label in optional_methods if available)

    return methods
64
+
65
# Registry of the datasets the explorer can query, keyed by the dataset key
# that the rest of the app uses to look a dataset up.
#   name      - Hugging Face repository id of the dataset.
#   emoji     - icon associated with the dataset.
#   methods   - access-method labels offered for the dataset; each entry gets
#               its own list because get_available_methods() builds a new one
#               per call.
#   is_public - False marks a dataset the UI describes as gated (login needed).
DATASET_CONFIG = {
    "caselaw": {
        "name": "common-pile/caselaw_access_project", "emoji": "⚖️",
        "methods": get_available_methods(), "is_public": True,
    },
    "prompts": {
        "name": "fka/awesome-chatgpt-prompts", "emoji": "🤖",
        "methods": get_available_methods(), "is_public": True,
    },
    "finance": {
        "name": "snorkelai/agent-finance-reasoning", "emoji": "💰",
        "methods": get_available_methods(), "is_public": False,
    },
    "medical": {
        "name": "FreedomIntelligence/medical-o1-reasoning-SFT", "emoji": "🩺",
        "methods": get_available_methods(), "is_public": False,
    },
    "inscene": {
        "name": "peteromallet/InScene-Dataset", "emoji": "🖼️",
        "methods": get_available_methods(), "is_public": False,
    },
}
87
 
88
  # --- πŸ”§ Helpers & Utility Functions ---
89
 
90
def get_auth_headers():
    """Build the HTTP authorization headers for Hugging Face requests.

    Returns:
        dict: ``{"Authorization": "Bearer <token>"}`` when ``huggingface_hub``
        was imported and ``get_token()`` yields a non-empty token; an empty
        dict in every other case, including when ``get_token()`` raises.
    """
    headers = {}
    if HF_HUB_AVAILABLE:
        try:
            hub_token = get_token()
            if hub_token:
                headers["Authorization"] = f"Bearer {hub_token}"
        except Exception:
            # Best effort by design: a failed token lookup degrades to an
            # unauthenticated request rather than aborting the caller.
            headers = {}
    return headers
99
 
100
  # --- ✨ FIXED: dataframe_to_outputs to use temporary files ---
101
  def dataframe_to_outputs(df: pd.DataFrame):
 
312
  outputs[2] = f"βœ… Found **{len(all_results_df)}** results so far..."
313
 
314
  if dataset_key == 'inscene':
315
+ try:
316
+ gallery_data = []
317
+ for _, row in all_results_df.iterrows():
318
+ if 'image' in row:
319
+ image_data = row.get('image')
320
+ text_data = row.get('text', '')
321
+
322
+ # Handle different image formats safely
323
+ if hasattr(image_data, 'save'): # PIL Image
324
+ gallery_data.append((image_data, text_data))
325
+ elif isinstance(image_data, str): # Image path or URL
326
+ gallery_data.append((image_data, text_data))
327
+
328
+ if gallery_data:
329
+ outputs[1] = gr.Gallery(gallery_data, label="πŸ–ΌοΈ Image Results", height=400)
330
+ except Exception as img_error:
331
+ # Don't break the flow for image errors
332
+ pass
333
  yield tuple(outputs)
334
 
335
  outputs[2] = f"🏁 Search complete. Found a total of **{len(all_results_df)}** results."
 
356
  df = pd.read_json(f"{file_path}medical_o1_sft.json")
357
 
358
  elif "Datasets" in access_method:
359
+ if not DATASETS_AVAILABLE:
360
+ raise ImportError("datasets library not available. Install with: pip install datasets")
361
  ds = load_dataset(repo_id, split='train', streaming=True).take(1000)
362
  df = pd.DataFrame(ds)
363
 
364
  elif "Polars" in access_method:
365
+ if not POLARS_AVAILABLE:
366
+ raise ImportError("polars library not available. Install with: pip install polars")
367
  outputs[2] = "⏳ Loading with Polars..."
368
  yield tuple(outputs)
369
  if repo_id == "fka/awesome-chatgpt-prompts":
 
373
  df = pl_df.to_pandas()
374
 
375
  elif "Dask" in access_method:
376
+ if not DASK_AVAILABLE:
377
+ raise ImportError("dask library not available. Install with: pip install dask")
378
  outputs[2] = "⏳ Loading with Dask..."
379
  yield tuple(outputs)
380
  dask_df = dd.read_json(f"hf://datasets/{repo_id}/**/*.jsonl.gz")
381
  df = dask_df.head(1000) # Convert to pandas for processing
382
 
383
  elif "Croissant" in access_method:
384
+ if not CROISSANT_AVAILABLE:
385
+ raise ImportError("mlcroissant library not available. Install with: pip install mlcroissant")
386
  outputs[2] = "⏳ Loading with Croissant..."
387
  yield tuple(outputs)
388
+
389
+ try:
390
+ headers = get_auth_headers() if not config["is_public"] else {}
391
+ croissant_url = f"https://huggingface.co/api/datasets/{repo_id}/croissant"
392
+ response = requests.get(croissant_url, headers=headers)
393
+ response.raise_for_status()
394
+ jsonld = response.json()
395
+
396
+ # Suppress MLCroissant warnings during dataset creation
397
+ with warnings.catch_warnings():
398
+ warnings.simplefilter("ignore")
399
+ ds = CroissantDataset(jsonld=jsonld)
400
+ records = list(ds.records("default"))[:1000] # Take first 1000
401
+ df = pd.DataFrame(records)
402
+
403
+ except Exception as croissant_error:
404
+ # If Croissant fails, fall back to API method
405
+ outputs[2] = f"⚠️ Croissant method failed, falling back to API method..."
406
+ yield tuple(outputs)
407
+
408
+ # Retry with API method
409
+ url = f"https://datasets-server.huggingface.co/rows?dataset={repo_id}&config=default&split=train&offset=0&length=100"
410
+ headers = get_auth_headers() if not config["is_public"] else {}
411
+ response = requests.get(url, headers=headers)
412
+ response.raise_for_status()
413
+ data = response.json()
414
+
415
+ if data.get('rows'):
416
+ rows_data = [item['row'] for item in data['rows']]
417
+ df = pd.json_normalize(rows_data)
418
+ else:
419
+ raise Exception("No data available from fallback API method")
420
 
421
  outputs[2] = "πŸ” Searching loaded data..."
422
  yield tuple(outputs)
 
428
  outputs[2] = f"🏁 Search complete. Found **{len(final_df)}** results."
429
 
430
  if dataset_key == 'inscene' and not final_df.empty:
431
+ # Handle image data more safely
432
+ try:
433
+ gallery_data = []
434
+ for _, row in final_df.iterrows():
435
+ if 'image' in row:
436
+ image_data = row.get('image')
437
+ text_data = row.get('text', '')
438
+
439
+ # Handle different image formats
440
+ if hasattr(image_data, 'save'): # PIL Image
441
+ gallery_data.append((image_data, text_data))
442
+ elif isinstance(image_data, str): # Image path or URL
443
+ gallery_data.append((image_data, text_data))
444
+
445
+ if gallery_data:
446
+ outputs[1] = gr.Gallery(gallery_data, label="πŸ–ΌοΈ Image Results", height=400)
447
+ except Exception as img_error:
448
+ outputs[2] += f"\n⚠️ Image display error: {str(img_error)}"
449
 
450
  yield tuple(outputs)
451
 
 
462
  if not config['is_public']:
463
  gr.Markdown("**Note:** This is a gated dataset. Please log in via `huggingface-cli login` in your terminal first.")
464
 
465
+ # Show available methods for this dataset
466
+ available_methods = config['methods']
467
+ if len(available_methods) < 5: # Some methods missing
468
+ gr.Markdown(f"**Available methods:** {len(available_methods)} of 6 possible methods")
469
+
470
  with gr.Row():
471
+ access_method = gr.Radio(
472
+ available_methods,
473
+ label="πŸ”‘ Access Method",
474
+ value=available_methods[0] if available_methods else "πŸ’¨ API (requests)"
475
+ )
476
+ query = gr.Textbox(
477
+ label="πŸ” Search Query",
478
+ placeholder="Enter any text to search, or leave blank for samples..."
479
+ )
480
 
481
  fetch_button = gr.Button("πŸš€ Go Fetch!")
482
  status_output = gr.Markdown("🏁 Ready to search.")
 
512
  "If an error occurs, a detailed debug log will appear to help troubleshoot the issue."
513
  )
514
 
515
+ # Show dependency status
516
def get_dependency_status():
    """Render a Markdown checklist of which data-access libraries are usable.

    The API and Pandas entries are reported as always available; the others
    reflect the module-level ``*_AVAILABLE`` flags set by the optional imports.

    Returns:
        str: Markdown text with a heading followed by one bullet per library,
        every line (including the last) terminated by a newline.
    """
    def _mark(is_available):
        # Shared wording for every optional dependency.
        return "✅ Available" if is_available else "❌ Not installed"

    lines = [
        "### 🔧 Available Libraries:",
        "- **💨 API**: ✅ Always available",
        "- **🐼 Pandas**: ✅ Available",
        f"- **🤗 Datasets**: {_mark(DATASETS_AVAILABLE)}",
        f"- **🧊 Polars**: {_mark(POLARS_AVAILABLE)}",
        f"- **🧊 Dask**: {_mark(DASK_AVAILABLE)}",
        f"- **🥐 Croissant**: {_mark(CROISSANT_AVAILABLE)}",
        f"- **🔑 HF Authentication**: {_mark(HF_HUB_AVAILABLE)}",
    ]
    # Trailing newline kept so the text matches the line-by-line original.
    return "\n".join(lines) + "\n"
526
+
527
+ with gr.Accordion("πŸ”§ Library Status & Quick Start Guide", open=False):
528
+ gr.Markdown(get_dependency_status())
529
  gr.Markdown("""
530
  ### πŸš€ Quick Start:
531
  1. **πŸ€– Prompts Tab**: Try API method, search for "translator" or "linux"
 
542
  - **🐼 Pandas**: Full dataset access, requires login for gated datasets
543
  - **πŸ€— Datasets**: Good for streaming large datasets
544
  - **🧊 Polars/Dask**: Alternative fast data processing
545
+ - **πŸ₯ Croissant**: Metadata-aware loading (has fallback to API)
546
+
547
+ ### πŸ“¦ Missing Libraries:
548
+ If methods are missing, install with:
549
+ ```bash
550
+ pip install datasets polars dask mlcroissant GitPython
551
+ ```
552
  """)
553
 
554
  with gr.Tabs():