pvaluedotone committed on
Commit
dcfdbaf
·
verified ·
1 Parent(s): dc99c13

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +70 -31
app.py CHANGED
@@ -6,6 +6,9 @@ import matplotlib.pyplot as plt
6
  import seaborn as sns
7
  from transformers import pipeline
8
 
 
 
 
9
  # Load sentiment pipeline
10
  sentiment_pipeline = pipeline(
11
  "text-classification",
@@ -21,7 +24,7 @@ def clean_text(text):
21
  return text.lower().strip()
22
 
23
  def predict_sentiment(texts):
24
- results = sentiment_pipeline(texts, truncation=True, batch_size=32)
25
  sentiments = []
26
  confidences = []
27
  for r in results:
@@ -41,8 +44,9 @@ def recategorize(labels, mode, pos_threshold, neg_threshold):
41
  "Negative" if lbl <= neg_threshold else
42
  "Neutral" for lbl in labels
43
  ]
 
 
44
 
45
- def analyze_sentiment(file, text_column, mode, pos_thresh, neg_thresh):
46
  try:
47
  df = pd.read_csv(file.name)
48
  except Exception as e:
@@ -51,24 +55,54 @@ def analyze_sentiment(file, text_column, mode, pos_thresh, neg_thresh):
51
  if text_column not in df.columns:
52
  return "Selected column not found.", None, None, None, None, None
53
 
54
- df["clean_text"] = df[text_column].apply(clean_text)
55
- predictions, confidences = predict_sentiment(df["clean_text"].tolist())
56
- df["sentiment_1to10"] = predictions
57
- df["confidence"] = confidences
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  df["sentiment_recategorised"] = recategorize(df["sentiment_1to10"], mode, pos_thresh, neg_thresh)
59
 
60
  # Save results
61
  output_file = "bigbird_sentiment_results.csv"
62
  df.to_csv(output_file, index=False)
63
 
64
- # Plot 1: Original 10-class sentiment distribution
65
- plt.figure(figsize=(6, 4))
66
- sns.countplot(x=df["sentiment_1to10"], palette="Blues")
67
- plt.title("Original 10-Class Sentiment Distribution")
68
- plt.tight_layout()
69
- plot1_path = "original_dist.png"
70
- plt.savefig(plot1_path)
71
- plt.close()
 
 
72
 
73
  # Plot 2: Recategorized sentiment distribution
74
  plt.figure(figsize=(6, 4))
@@ -79,19 +113,22 @@ def analyze_sentiment(file, text_column, mode, pos_thresh, neg_thresh):
79
  plt.savefig(plot2_path)
80
  plt.close()
81
 
82
- # Plot 3: Confidence score distribution
83
- plt.figure(figsize=(6, 4))
84
- sns.histplot(df["confidence"], bins=20, color="orange", kde=True)
85
- plt.title("Confidence Score Distribution")
86
- plt.xlabel("Confidence")
87
- plt.tight_layout()
88
- plot3_path = "confidence_dist.png"
89
- plt.savefig(plot3_path)
90
- plt.close()
 
 
91
 
92
  # Sample preview
93
  preview = df[[text_column, "sentiment_1to10", "confidence", "sentiment_recategorised"]].head(10)
94
- return f"Sentiment analysis complete. Processed {len(df)} rows.", preview, output_file, plot1_path, plot2_path, plot3_path
 
95
 
96
  def get_text_columns(file):
97
  try:
@@ -106,7 +143,7 @@ def get_text_columns(file):
106
  with gr.Blocks() as app:
107
  gr.Markdown("## ✈️ Sentiment analysis with `pvaluedotone/bigbird-flight`")
108
  gr.Markdown("**Citation:** Mat Roni, S. (2025). *Sentiment analysis with Big Bird Flight on Gradio* (version 1.0) [software]. https://huggingface.co/spaces/pvaluedotone/bigbird-flight")
109
- gr.Markdown("Upload a CSV, choose a text column, select output style (10-class, binary, or ternary), and analyze.")
110
 
111
  with gr.Row():
112
  file_input = gr.File(label="Upload CSV", file_types=[".csv"])
@@ -121,8 +158,10 @@ with gr.Blocks() as app:
121
  interactive=True
122
  )
123
 
124
- pos_thresh_slider = gr.Slider(5, 10, value=7, step=1, label="Positive Threshold", visible=False)
125
- neg_thresh_slider = gr.Slider(1, 5, value=4, step=1, label="Negative Threshold", visible=False)
 
 
126
 
127
  def toggle_thresholds(mode):
128
  show_pos = mode != "Original (1–10)"
@@ -134,19 +173,19 @@ with gr.Blocks() as app:
134
 
135
  output_mode.change(toggle_thresholds, inputs=output_mode, outputs=[pos_thresh_slider, neg_thresh_slider])
136
 
137
- run_button = gr.Button("Run Sentiment Analysis")
138
 
139
  status = gr.Textbox(label="Status")
140
  df_output = gr.Dataframe(label="Sample Output (Top 10)")
141
  file_result = gr.File(label="Download Full Results")
142
  plot_orig = gr.Image(label="Original Sentiment Distribution")
143
- plot_recat = gr.Image(label="Recategorized Sentiment Distribution")
144
  plot_conf = gr.Image(label="Confidence Score Distribution")
145
 
146
  run_button.click(
147
  analyze_sentiment,
148
- inputs=[file_input, column_dropdown, output_mode, pos_thresh_slider, neg_thresh_slider],
149
  outputs=[status, df_output, file_result, plot_orig, plot_recat, plot_conf]
150
  )
151
 
152
- app.launch(debug=True)
 
6
  import seaborn as sns
7
  from transformers import pipeline
8
 
9
+ cached_df = None
10
+ cached_file_name = None
11
+
12
  # Load sentiment pipeline
13
  sentiment_pipeline = pipeline(
14
  "text-classification",
 
24
  return text.lower().strip()
25
 
26
  def predict_sentiment(texts):
27
+ results = sentiment_pipeline(texts, truncation=False, batch_size=32)
28
  sentiments = []
29
  confidences = []
30
  for r in results:
 
44
  "Negative" if lbl <= neg_threshold else
45
  "Neutral" for lbl in labels
46
  ]
47
+ def analyze_sentiment(file, text_column, mode, pos_thresh, neg_thresh, auto_fix):
48
+ global cached_df, cached_file_name
49
 
 
50
  try:
51
  df = pd.read_csv(file.name)
52
  except Exception as e:
 
55
  if text_column not in df.columns:
56
  return "Selected column not found.", None, None, None, None, None
57
 
58
+ # Check if sentiment analysis already done and file is unchanged
59
+ if (
60
+ cached_df is not None and
61
+ cached_file_name == file.name and
62
+ "sentiment_1to10" in cached_df.columns and
63
+ "confidence" in cached_df.columns
64
+ ):
65
+ df = cached_df.copy()
66
+ else:
67
+ # Clean and predict
68
+ df["clean_text"] = df[text_column].apply(clean_text)
69
+ predictions, confidences = predict_sentiment(df["clean_text"].tolist())
70
+ df["sentiment_1to10"] = predictions
71
+ df["confidence"] = confidences
72
+ # Cache result
73
+ cached_df = df.copy()
74
+ cached_file_name = file.name
75
+
76
+ # 🛑 Check thresholds
77
+ if mode == "Ternary (Pos/Neu/Neg)":
78
+ if pos_thresh <= neg_thresh:
79
+ if auto_fix:
80
+ neg_thresh = pos_thresh - 1
81
+ if neg_thresh < 1:
82
+ return "⚠️ Cannot auto-correct: thresholds out of valid range (1–10).", None, None, None, None, None
83
+ else:
84
+ return (
85
+ f"⚠️ Invalid thresholds: Positive min ({pos_thresh}) must be greater than Negative max ({neg_thresh}).",
86
+ None, None, None, None, None
87
+ )
88
+
89
+ # Apply recategorization
90
  df["sentiment_recategorised"] = recategorize(df["sentiment_1to10"], mode, pos_thresh, neg_thresh)
91
 
92
  # Save results
93
  output_file = "bigbird_sentiment_results.csv"
94
  df.to_csv(output_file, index=False)
95
 
96
+ # Plot 1: Original 10-class sentiment distribution (only if new analysis)
97
+ if "plot1_path" not in globals():
98
+ plt.figure(figsize=(6, 4))
99
+ sns.countplot(x=df["sentiment_1to10"], palette="Blues")
100
+ plt.title("Original 10-Class Sentiment Distribution")
101
+ plt.tight_layout()
102
+ global plot1_path
103
+ plot1_path = "original_dist.png"
104
+ plt.savefig(plot1_path)
105
+ plt.close()
106
 
107
  # Plot 2: Recategorized sentiment distribution
108
  plt.figure(figsize=(6, 4))
 
113
  plt.savefig(plot2_path)
114
  plt.close()
115
 
116
+ # Plot 3: Confidence score distribution (only if new analysis)
117
+ if "plot3_path" not in globals():
118
+ plt.figure(figsize=(6, 4))
119
+ sns.histplot(df["confidence"], bins=20, color="skyblue", kde=True)
120
+ plt.title("Confidence Score Distribution")
121
+ plt.xlabel("Confidence")
122
+ plt.tight_layout()
123
+ global plot3_path
124
+ plot3_path = "confidence_dist.png"
125
+ plt.savefig(plot3_path)
126
+ plt.close()
127
 
128
  # Sample preview
129
  preview = df[[text_column, "sentiment_1to10", "confidence", "sentiment_recategorised"]].head(10)
130
+ return f"✅ Sentiment analysis complete. Used cache: {cached_file_name == file.name}", preview, output_file, plot1_path, plot2_path, plot3_path
131
+
132
 
133
  def get_text_columns(file):
134
  try:
 
143
  with gr.Blocks() as app:
144
  gr.Markdown("## ✈️ Sentiment analysis with `pvaluedotone/bigbird-flight`")
145
  gr.Markdown("**Citation:** Mat Roni, S. (2025). *Sentiment analysis with Big Bird Flight on Gradio* (version 1.0) [software]. https://huggingface.co/spaces/pvaluedotone/bigbird-flight")
146
+ gr.Markdown("Upload a CSV, choose a text column to analyse, select output style (10-class, binary, or ternary), and analyse.")
147
 
148
  with gr.Row():
149
  file_input = gr.File(label="Upload CSV", file_types=[".csv"])
 
158
  interactive=True
159
  )
160
 
161
+ pos_thresh_slider = gr.Slider(3, 10, value=7, step=1, label="Positive min", visible=False)
162
+ neg_thresh_slider = gr.Slider(1, 7, value=4, step=1, label="Negative max", visible=False)
163
+ auto_fix_checkbox = gr.Checkbox(label="Auto-correct thresholds if overlapping?", value=True)
164
+
165
 
166
  def toggle_thresholds(mode):
167
  show_pos = mode != "Original (1–10)"
 
173
 
174
  output_mode.change(toggle_thresholds, inputs=output_mode, outputs=[pos_thresh_slider, neg_thresh_slider])
175
 
176
+ run_button = gr.Button("Process sentiment")
177
 
178
  status = gr.Textbox(label="Status")
179
  df_output = gr.Dataframe(label="Sample Output (Top 10)")
180
  file_result = gr.File(label="Download Full Results")
181
  plot_orig = gr.Image(label="Original Sentiment Distribution")
182
+ plot_recat = gr.Image(label="Recategorised Sentiment Distribution")
183
  plot_conf = gr.Image(label="Confidence Score Distribution")
184
 
185
  run_button.click(
186
  analyze_sentiment,
187
+ inputs=[file_input, column_dropdown, output_mode, pos_thresh_slider, neg_thresh_slider, auto_fix_checkbox],
188
  outputs=[status, df_output, file_result, plot_orig, plot_recat, plot_conf]
189
  )
190
 
191
+ app.launch(share=True, debug=True)