awacke1 committed on
Commit e293bcf · verified · 1 Parent(s): 8e8efbf

Create app.py.v1

Files changed (1)
  1. app.py.v1 +413 -0
app.py.v1 ADDED
@@ -0,0 +1,413 @@
+ # app.py
+ import gradio as gr
+ import pandas as pd
+ import requests
+ import dask.dataframe as dd
+ from datasets import load_dataset
+ from mlcroissant import Dataset as CroissantDataset
+ from huggingface_hub import get_token
+ from itertools import islice
+ from PIL import Image  # decoded image rows are PIL images, so isinstance checks need PIL's Image class
+ import polars as pl
+ import warnings
+ import traceback
+ import json
+ import tempfile  # added for creating temporary files
+
+ # 🤫 Let's ignore those pesky warnings, shall we?
+ warnings.filterwarnings("ignore")
+
+ # --- ⚙️ Configuration & Constants ---
+ DATASET_CONFIG = {
+     "caselaw": {
+         "name": "common-pile/caselaw_access_project", "emoji": "⚖️",
+         "methods": ["💨 API (requests)", "🧊 Dask", "🥐 Croissant"], "is_public": True,
+     },
+     "prompts": {
+         "name": "fka/awesome-chatgpt-prompts", "emoji": "🤖",
+         "methods": ["🐼 Pandas", "💨 API (requests)", "🥐 Croissant"], "is_public": True,
+     },
+     "finance": {
+         "name": "snorkelai/agent-finance-reasoning", "emoji": "💰",
+         "methods": ["🐼 Pandas", "🧊 Polars", "💨 API (requests)", "🥐 Croissant"], "is_public": False,
+     },
+     "medical": {
+         "name": "FreedomIntelligence/medical-o1-reasoning-SFT", "emoji": "🩺",
+         "methods": ["🐼 Pandas", "🧊 Polars", "💨 API (requests)", "🥐 Croissant"], "is_public": False,
+     },
+     "inscene": {
+         "name": "peteromallet/InScene-Dataset", "emoji": "🖼️",
+         "methods": ["🤗 Datasets", "🐼 Pandas", "🧊 Polars", "💨 API (requests)", "🥐 Croissant"], "is_public": False,
+     },
+ }
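+ # Each entry records the Hub repo id, a display emoji, the access methods offered
+ # in that tab's UI, and whether the dataset can be fetched without a token.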
+
+ # --- 🔧 Helpers & Utility Functions ---
+
+ def get_auth_headers():
+     token = get_token()
+     return {"Authorization": f"Bearer {token}"} if token else {}
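+ # get_token() resolves the token cached by `huggingface-cli login` (or the
+ # HF_TOKEN environment variable), so nothing is hard-coded here.
+ # Hypothetical session:
+ #   >>> get_auth_headers()
+ #   {'Authorization': 'Bearer hf_...'}  # logged in
+ #   >>> get_auth_headers()
+ #   {}                                  # anonymous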
+
+ # --- ✨ FIXED: dataframe_to_outputs now uses temporary files ---
+ def dataframe_to_outputs(df: pd.DataFrame):
+     """
+     📜 Takes a DataFrame and transforms it into various formats.
+     Uses temporary files for maximum Gradio compatibility.
+     """
+     if df.empty:
+         return "No results found. 🤷", None, None, "No results to copy."
+
+     df_str = df.astype(str)
+     markdown_output = df_str.to_markdown(index=False)
+
+     # Write a temporary CSV file for the download component
+     with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.csv', encoding='utf-8') as tmp_csv:
+         df.to_csv(tmp_csv.name, index=False)
+         csv_path = tmp_csv.name
+
+     # Write a temporary XLSX file for the download component
+     with tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx') as tmp_xlsx:
+         df.to_excel(tmp_xlsx.name, index=False, engine='openpyxl')
+         xlsx_path = tmp_xlsx.name
+
+     tab_delimited_output = df.to_csv(sep='\t', index=False)
+
+     return (
+         markdown_output,
+         csv_path,
+         xlsx_path,
+         tab_delimited_output,
+     )
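+ # Minimal usage sketch (illustrative; the temp-file paths differ per run):
+ #   md, csv_path, xlsx_path, tsv = dataframe_to_outputs(pd.DataFrame({"a": [1]}))
+ # md is a markdown table; csv_path/xlsx_path point at NamedTemporaryFile copies
+ # that Gradio File components can serve for download.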
+
+ def handle_error(e: Exception, request=None, response=None):
+     """
+     😱 Oh no! An error! Build a detailed debug log and a safe set of UI outputs.
+     """
+     error_message = f"🚨 An error occurred: {str(e)}\n"
+     auth_tip = "🔑 For gated datasets, did you log in? Try `huggingface-cli login` in your terminal."
+     full_trace = traceback.format_exc()
+     print(full_trace)
+     if "401" in str(e) or "Gated" in str(e):
+         error_message += auth_tip
+
+     debug_log = f"""--- 🐞 DEBUG LOG ---\nTraceback:\n{full_trace}\n\nException Type: {type(e).__name__}\nException Details: {e}\n"""
+     if request:
+         debug_log += f"""\n--- REQUEST ---\nMethod: {request.method}\nURL: {request.url}\nHeaders: {json.dumps(dict(request.headers), indent=2)}\n"""
+     if response is not None:
+         try:
+             response_text = json.dumps(response.json(), indent=2)
+         except json.JSONDecodeError:
+             response_text = response.text
+         debug_log += f"""\n--- RESPONSE ---\nStatus Code: {response.status_code}\nHeaders: {json.dumps(dict(response.headers), indent=2)}\nContent:\n{response_text}\n"""
+
+     return (
+         pd.DataFrame(), gr.Gallery(None), f"### 🚨 Error\n{error_message}See the debug log below for details.",
+         "", None, None, "", f"```python\n# 🚨 Error during execution:\n# {e}\n```",
+         gr.Code(value=debug_log, visible=True)
+     )
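+ # The 9-tuple mirrors the output component list wired up in create_dataset_tab(),
+ # so a failed fetch can be yielded directly in place of a normal update.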
+
+ def search_dataframe(df: pd.DataFrame, query: str):
+     if not query:
+         return df.head(100)
+     string_cols = df.select_dtypes(include=['object', 'string']).columns
+     if string_cols.empty:
+         return pd.DataFrame()
+     # Align the mask with df's index (it may not be a default RangeIndex) and
+     # treat the query as a literal substring rather than a regex.
+     mask = pd.Series([False] * len(df), index=df.index)
+     for col in string_cols:
+         mask |= df[col].astype(str).str.contains(query, case=False, na=False, regex=False)
+     return df[mask]
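+ # Example: case-insensitive literal matching across every string column.
+ #   df = pd.DataFrame({"text": ["Linux tips", "Cooking"], "n": [1, 2]})
+ #   search_dataframe(df, "linux")  # -> only the "Linux tips" row
+ #   search_dataframe(df, "")       # -> first 100 rows, unfiltered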
+
+ def generate_code_snippet(dataset_key: str, access_method: str, query: str):
+     """
+     💻 Generate a Python code snippet for the current operation.
+     """
+     config = DATASET_CONFIG[dataset_key]
+     repo_id = config["name"]
+
+     if "API" in access_method:
+         return f'''# 🌐 API Access for {repo_id}
+ import requests
+ import pandas as pd
+
+ url = "https://datasets-server.huggingface.co/rows"
+ params = {{
+     "dataset": "{repo_id}",
+     "config": "default",
+     "split": "train",
+     "offset": 0,
+     "length": 100
+ }}
+
+ headers = {{}}  # for gated datasets use: {{"Authorization": "Bearer YOUR_HF_TOKEN"}}
+ response = requests.get(url, params=params, headers=headers)
+
+ if response.status_code == 200:
+     data = response.json()
+     rows_data = [item['row'] for item in data['rows']]
+     df = pd.json_normalize(rows_data)
+
+     # Search for: "{query}"
+     if "{query}":
+         string_cols = df.select_dtypes(include=['object', 'string']).columns
+         mask = pd.Series([False] * len(df), index=df.index)
+         for col in string_cols:
+             mask |= df[col].astype(str).str.contains("{query}", case=False, na=False, regex=False)
+         df = df[mask]
+
+     print(f"Found {{len(df)}} results")
+     print(df.head())
+ else:
+     print(f"Error: {{response.status_code}} - {{response.text}}")
+ '''
+
+     elif "Pandas" in access_method:
+         file_path = "prompts.csv" if repo_id == "fka/awesome-chatgpt-prompts" else "train.parquet"
+         return f'''# 🐼 Pandas Access for {repo_id}
+ import pandas as pd
+
+ # You may need: huggingface-cli login
+ df = pd.read_{"csv" if "csv" in file_path else "parquet"}("hf://datasets/{repo_id}/{file_path}")
+
+ # Search for: "{query}"
+ if "{query}":
+     string_cols = df.select_dtypes(include=['object', 'string']).columns
+     mask = pd.Series([False] * len(df), index=df.index)
+     for col in string_cols:
+         mask |= df[col].astype(str).str.contains("{query}", case=False, na=False, regex=False)
+     df = df[mask]
+
+ print(f"Found {{len(df)}} results")
+ print(df.head())
+ '''
+
+     elif "Datasets" in access_method:
+         return f'''# 🤗 Datasets Library Access for {repo_id}
+ from datasets import load_dataset
+ import pandas as pd
+
+ # You may need: huggingface-cli login
+ ds = load_dataset("{repo_id}", split="train", streaming=True)
+ data = list(ds.take(1000))
+ df = pd.DataFrame(data)
+
+ # Search for: "{query}"
+ if "{query}":
+     string_cols = df.select_dtypes(include=['object', 'string']).columns
+     mask = pd.Series([False] * len(df), index=df.index)
+     for col in string_cols:
+         mask |= df[col].astype(str).str.contains("{query}", case=False, na=False, regex=False)
+     df = df[mask]
+
+ print(f"Found {{len(df)}} results")
+ print(df.head())
+ '''
+
+     else:
+         return f"# Code generation for {access_method} not implemented yet"
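+ # Note: the doubled braces in the f-strings above emit literal braces in the
+ # generated snippets, e.g. '{{len(df)}}' renders as '{len(df)}' in the output.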
+
+ # --- 🎣 Data Fetching & Processing Functions ---
+ def fetch_data(dataset_key: str, access_method: str, query: str):
+     """
+     🚀 Main mission control. Always yields a tuple of 9 values to match the UI components.
+     """
+     outputs = [pd.DataFrame(), None, "🏁 Ready.", "", None, None, "", "", gr.Code(visible=False)]
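+     # Slot map: 0=df, 1=gallery, 2=status, 3=markdown, 4=csv file, 5=xlsx file,
+     # 6=tab-delimited text, 7=code snippet, 8=debug log (the click() output order).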
+     req, res = None, None
+     try:
+         config = DATASET_CONFIG[dataset_key]
+         repo_id = config["name"]
+
+         # Generate the code snippet for this operation
+         code_snippet = generate_code_snippet(dataset_key, access_method, query)
+         outputs[7] = code_snippet
+
+         if "API" in access_method:
+             all_results_df = pd.DataFrame()
+             MAX_PAGES = 5
+             PAGE_SIZE = 100
+
+             if not query:
+                 MAX_PAGES = 1
+                 outputs[2] = "⏳ No search term. Fetching first 100 records as a sample..."
+                 yield tuple(outputs)
+
+             for page in range(MAX_PAGES):
+                 if query:
+                     outputs[2] = f"⏳ Searching page {page + 1}..."
+                     yield tuple(outputs)
+
+                 offset = page * PAGE_SIZE
+                 url = f"https://datasets-server.huggingface.co/rows?dataset={repo_id}&config=default&split=train&offset={offset}&length={PAGE_SIZE}"
+                 headers = get_auth_headers() if not config["is_public"] else {}
+
+                 res = requests.get(url, headers=headers)
+                 req = res.request
+                 res.raise_for_status()
+                 data = res.json()
+
+                 if not data.get('rows'):
+                     outputs[2] = "🏁 No more data to search."
+                     yield tuple(outputs)
+                     break
+
+                 # --- ✨ FIXED: JSON processing logic ---
+                 # Extract the actual data from the 'row' key of each item in the list
+                 rows_data = [item['row'] for item in data['rows']]
+                 page_df = pd.json_normalize(rows_data)
+
+                 found_in_page = search_dataframe(page_df, query)
+
+                 if not found_in_page.empty:
+                     all_results_df = pd.concat([all_results_df, found_in_page]).reset_index(drop=True)
+                     outputs[0] = all_results_df
+                     outputs[3], outputs[4], outputs[5], outputs[6] = dataframe_to_outputs(all_results_df)
+                     outputs[2] = f"✅ Found **{len(all_results_df)}** results so far..."
+
+                     if dataset_key == 'inscene':
+                         gallery_data = [(row['image'], row.get('text', '')) for _, row in all_results_df.iterrows() if 'image' in row and isinstance(row['image'], Image.Image)]
+                         outputs[1] = gr.Gallery(gallery_data, label="🖼️ Image Results", height=400)
+                     yield tuple(outputs)
+
+             outputs[2] = f"🏁 Search complete. Found a total of **{len(all_results_df)}** results."
+             yield tuple(outputs)
+             return
+
+         outputs[2] = f"⏳ Loading data via `{access_method}`..."
+         yield tuple(outputs)
+
+         df = pd.DataFrame()
+
+         if "Pandas" in access_method:
+             file_path = f"hf://datasets/{repo_id}/"
+             if repo_id == "fka/awesome-chatgpt-prompts":
+                 file_path += "prompts.csv"
+                 df = pd.read_csv(file_path)
+             else:
+                 try:
+                     df = pd.read_parquet(f"{file_path}data/train-00000-of-00001.parquet")
+                 except Exception:
+                     try:
+                         df = pd.read_parquet(f"{file_path}train.parquet")
+                     except Exception:
+                         df = pd.read_json(f"{file_path}medical_o1_sft.json")
+
+         elif "Datasets" in access_method:
+             ds = load_dataset(repo_id, split='train', streaming=True).take(1000)
+             df = pd.DataFrame(ds)
+
+         elif "Polars" in access_method:
+             outputs[2] = "⏳ Loading with Polars..."
+             yield tuple(outputs)
+             if repo_id == "fka/awesome-chatgpt-prompts":
+                 pl_df = pl.read_csv(f"hf://datasets/{repo_id}/prompts.csv")
+             else:
+                 pl_df = pl.read_parquet(f"hf://datasets/{repo_id}/train.parquet")
+             df = pl_df.to_pandas()
+
+         elif "Dask" in access_method:
+             outputs[2] = "⏳ Loading with Dask..."
+             yield tuple(outputs)
+             dask_df = dd.read_json(f"hf://datasets/{repo_id}/**/*.jsonl.gz")
+             df = dask_df.head(1000)  # convert to pandas for processing
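+             # Dask scans the .jsonl.gz shards lazily; .head(1000) is what actually
+             # triggers the read and returns a plain pandas DataFrame.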
+
+         elif "Croissant" in access_method:
+             outputs[2] = "⏳ Loading with Croissant..."
+             yield tuple(outputs)
+             headers = get_auth_headers() if not config["is_public"] else {}
+             croissant_url = f"https://huggingface.co/api/datasets/{repo_id}/croissant"
+             response = requests.get(croissant_url, headers=headers)
+             response.raise_for_status()
+             jsonld = response.json()
+             ds = CroissantDataset(jsonld=jsonld)
+             records = list(islice(ds.records("default"), 1000))  # take the first 1000 without materializing everything
+             df = pd.DataFrame(records)
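+             # The Hub's /croissant endpoint serves the dataset's Croissant JSON-LD
+             # metadata; "default" is the record-set name this app assumes, which may
+             # need adjusting for datasets that name their record sets differently.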
+
+         outputs[2] = "🔍 Searching loaded data..."
+         yield tuple(outputs)
+
+         final_df = search_dataframe(df, query)
+
+         outputs[0] = final_df
+         outputs[3], outputs[4], outputs[5], outputs[6] = dataframe_to_outputs(final_df)
+         outputs[2] = f"🏁 Search complete. Found **{len(final_df)}** results."
+
+         if dataset_key == 'inscene' and not final_df.empty:
+             gallery_data = [(row['image'], row.get('text', '')) for _, row in final_df.iterrows() if 'image' in row and isinstance(row.get('image'), Image.Image)]
+             outputs[1] = gr.Gallery(gallery_data, label="🖼️ Image Results", height=400)
+
+         yield tuple(outputs)
+
+     except Exception as e:
+         yield handle_error(e, req, res)
+
+
+ # --- 🖼️ UI Generation ---
+ def create_dataset_tab(dataset_key: str):
+     config = DATASET_CONFIG[dataset_key]
+
+     with gr.Tab(f"{config['emoji']} {dataset_key.capitalize()}"):
+         gr.Markdown(f"## {config['emoji']} Query the `{config['name']}` Dataset")
+         if not config['is_public']:
+             gr.Markdown("**Note:** This is a gated dataset. Please log in via `huggingface-cli login` in your terminal first.")
+
+         with gr.Row():
+             access_method = gr.Radio(config['methods'], label="🔑 Access Method", value=config['methods'][0])
+             query = gr.Textbox(label="🔍 Search Query", placeholder="Enter any text to search, or leave blank for samples...")
+
+         fetch_button = gr.Button("🚀 Go Fetch!")
+         status_output = gr.Markdown("🏁 Ready to search.")
+         df_output = gr.DataFrame(label="📊 Results DataFrame", interactive=False, wrap=True)
+         gallery_output = gr.Gallery(visible=(dataset_key == 'inscene'), label="🖼️ Image Results")
+
+         with gr.Accordion("📂 View/Export Full Results", open=False):
+             markdown_output = gr.Markdown(label="📝 Markdown View")
+             with gr.Row():
+                 csv_output = gr.File(label="⬇️ Download CSV")
+                 xlsx_output = gr.File(label="⬇️ Download XLSX")
+             copy_output = gr.Code(label="📋 Copy-Paste (Tab-Delimited)")
+
+         code_output = gr.Code(label="💻 Python Code Snippet", language="python")
+
+         debug_log_output = gr.Code(label="🐞 Debug Log", visible=False)
+
+         fetch_button.click(
+             fn=fetch_data,
+             inputs=[gr.State(dataset_key), access_method, query],
+             outputs=[
+                 df_output, gallery_output, status_output, markdown_output,
+                 csv_output, xlsx_output, copy_output, code_output,
+                 debug_log_output
+             ]
+         )
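+         # gr.State(dataset_key) pins this tab's key into the callback inputs, so a
+         # single fetch_data generator can serve all five tabs.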
+
+ # --- 🚀 Main App ---
+ with gr.Blocks(theme=gr.themes.Soft(), title="Hugging Face Dataset Explorer") as demo:
+     gr.Markdown("# 🤗 Hugging Face Dataset Explorer")
+     gr.Markdown(
+         "Select a dataset, choose an access method, and type a query. "
+         "If an error occurs, a detailed debug log will appear to help troubleshoot the issue."
+     )
+
+     with gr.Accordion("🔧 Quick Start Guide", open=False):
+         gr.Markdown("""
+         ### 🚀 Quick Start:
+         1. **🤖 Prompts Tab**: Try the API method; search for "translator" or "linux"
+         2. **⚖️ Caselaw Tab**: Try the API method; search for "contract" or "court"
+         3. **💰 Finance Tab**: Requires login; try the API method first
+         4. **🩺 Medical Tab**: Requires login; try the API method first
+         5. **🖼️ InScene Tab**: Requires login; try the Datasets method for images
+
+         ### 🔑 Authentication:
+         For gated datasets, run in a terminal: `huggingface-cli login`
+
+         ### 🛠️ Methods:
+         - **💨 API**: Fast and reliable; works without login for public sets (pages of 100 rows)
+         - **🐼 Pandas**: Full dataset access; requires login for gated datasets
+         - **🤗 Datasets**: Good for streaming large datasets
+         - **🧊 Polars/Dask**: Alternative fast data processing
+         - **🥐 Croissant**: Metadata-aware loading
+         """)
+
+     with gr.Tabs():
+         for key in DATASET_CONFIG:
+             create_dataset_tab(key)
+
+ if __name__ == "__main__":
+     demo.launch(debug=True)
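+ # demo.launch(debug=True) surfaces tracebacks in the console; adding share=True
+ # (a standard Gradio launch option) would also expose a temporary public URL.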