asoria (HF staff) committed
Commit: a093cd2 · Parent(s): 9f3ff28

Adding outlines for prompts

app.py CHANGED
@@ -9,7 +9,7 @@ import json
 import re
 import pandas as pd
 from gradio.data_classes import FileData
-
+from utils.prompts import generate_mapping_prompt, generate_eda_prompt
 
 """
 TODOs:
@@ -48,62 +48,6 @@ def get_compatible_libraries(dataset: str):
     return resp.json()
 
 
-def generate_mapping_prompt(code):
-    logging.info("Generating mapping prompt")
-    logging.info(code)
-    format_instructions = "Format the following python code to a list of cells to be used in a jupyter notebook:\n"
-    format_instructions += code
-    format_instructions += """
-    The output should be a markdown code snippet formatted in the
-    following schema, including the leading and trailing "```json" and "```":
-
-    ```json
-    [
-        {
-            "cell_type": string // This refers either is a markdown or code cell type.
-            "source": list of string separated by comma // This is the list of text or python code.
-        }
-    ]
-    ```
-    """
-
-    return format_instructions
-
-
-def generate_eda_prompt(columns_info, df, first_code):
-    sample_data = df.head(5).to_dict(orient="records")
-
-    prompt = """
-    You are an expert data analyst tasked with generating an exploratory data analysis (EDA) Jupyter notebook. The data is provided as a pandas DataFrame with the following structure:
-
-    Columns and Data Types:
-    {columns_info}
-
-    Sample Data:
-    {sample_data}
-
-    Please create a pandas EDA notebook that includes the following:
-
-    1. Summary statistics for numerical columns.
-    2. Distribution plots for numerical columns.
-    3. Bar plots or count plots for categorical columns.
-    4. Correlation matrix and heatmap for numerical columns.
-    5. Any additional relevant visualizations or analyses you deem appropriate.
-
-    Ensure the notebook is well-organized, with explanations for each step.
-
-    It is mandatory that you use the following code to load the dataset, DO NOT try to load the dataset in any other way:
-
-    {first_code}
-
-    """
-    return prompt.format(
-        columns_info=columns_info,
-        sample_data=sample_data,
-        first_code=first_code,
-    )
-
-
 def create_notebook_file(cell_commands, notebook_name):
     nb = nbf.v4.new_notebook()
     nb["cells"] = [
@@ -205,7 +149,8 @@ def generate_cells(dataset_id):
     first_file = f"hf://datasets/{dataset_id}/{first_config_loading_code['arguments']['splits'][first_split]}"
     logging.info(f"First split file: {first_file}")
     features, df = get_first_rows_as_df(dataset_id, first_config, first_split, 3)
-    prompt = generate_eda_prompt(features, df, first_code)
+    sample_data = df.head(5).to_dict(orient="records")
+    prompt = generate_eda_prompt(features, sample_data, first_code)
     messages = [gr.ChatMessage(role="user", content=prompt)]
     yield messages + [gr.ChatMessage(role="assistant", content="⏳ _Starting task..._")]
 
@@ -226,7 +171,7 @@ def generate_cells(dataset_id):
         yield messages
     yield messages
 
-    logging.info("---> FOrmated prompt")
+    logging.info("---> Formated prompt")
     formatted_prompt = generate_mapping_prompt(GENERATED_TEXT)
     logging.info(formatted_prompt)
     prompt_messages = [{"role": "user", "content": formatted_prompt}]
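
For context on the flow that remains in app.py: generate_mapping_prompt asks the model for a fenced JSON list of {"cell_type", "source"} objects, which has to be parsed before create_notebook_file can turn it into nbformat cells. The sketch below shows one way that parsing step could look; the parse_cells helper, the regex, and the notebook filename are illustrative assumptions, not code from this commit.

```python
# Illustrative sketch (not part of this commit): parse the model's JSON
# cell list produced by generate_mapping_prompt and build a notebook with
# nbformat, mirroring the structure create_notebook_file expects.
import json
import re

import nbformat as nbf


def parse_cells(generated_text):
    # Assumes the model followed the requested schema: a ```json ... ```
    # block containing a list of {"cell_type": ..., "source": [...]} objects.
    match = re.search(r"```json\s*(.*?)```", generated_text, re.DOTALL)
    if match is None:
        raise ValueError("No ```json block found in the model output")
    return json.loads(match.group(1))


def cells_to_notebook(cell_commands, notebook_name="eda_notebook.ipynb"):
    # Markdown entries become markdown cells, everything else becomes code cells.
    nb = nbf.v4.new_notebook()
    nb["cells"] = [
        nbf.v4.new_markdown_cell("".join(cell["source"]))
        if cell["cell_type"] == "markdown"
        else nbf.v4.new_code_cell("".join(cell["source"]))
        for cell in cell_commands
    ]
    with open(notebook_name, "w") as f:
        nbf.write(nb, f)
```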
requirements.txt CHANGED
@@ -1,4 +1,5 @@
 gradio_huggingfacehub_search==0.0.7
 huggingface_hub
 nbformat
-httpx
+httpx
+outlines
utils/__init__.py ADDED
File without changes
utils/__pycache__/prompts.cpython-310.pyc ADDED
Binary file (1.86 kB)
 
utils/prompts.py ADDED
@@ -0,0 +1,47 @@
+import outlines
+
+
+@outlines.prompt
+def generate_mapping_prompt(code):
+    """Format the following python code to a list of cells to be used in a jupyter notebook:
+    {{ code }}
+
+    The output should be a markdown code snippet formatted in the
+    following schema, including the leading and trailing "```json" and "```":
+
+    ```json
+    [
+        {
+            "cell_type": string // This refers either is a markdown or code cell type.
+            "source": list of string separated by comma // This is the list of text or python code.
+        }
+    ]
+    ```
+    """
+
+
+@outlines.prompt
+def generate_eda_prompt(columns_info, sample_data, first_code):
+    """You are an expert data analyst tasked with generating an exploratory data analysis (EDA) Jupyter notebook. The data is provided as a pandas DataFrame with the following structure:
+
+    Columns and Data Types:
+    {{ columns_info }}
+
+    Sample Data:
+    {{ sample_data }}
+
+    Please create a pandas EDA notebook that includes the following:
+
+    1. Summary statistics for numerical columns.
+    2. Distribution plots for numerical columns.
+    3. Bar plots or count plots for categorical columns.
+    4. Correlation matrix and heatmap for numerical columns.
+    5. Any additional relevant visualizations or analyses you deem appropriate.
+
+    Ensure the notebook is well-organized, with explanations for each step.
+
+    It is mandatory that you use the following code to load the dataset, DO NOT try to load the dataset in any other way:
+
+    {{ first_code }}
+
+    """