Emily Witko committed · ec15d48
Parent(s): 335a565
Initial commit

Files changed:
- app.py +167 -0
- requirements.txt +6 -0
app.py
ADDED
@@ -0,0 +1,167 @@
import gradio as gr
import pandas as pd
from transformers import pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from collections import Counter


def analyze_demographics(file):
    df = pd.read_excel(file.name)

    results = {
        "Overall Metrics": {},
        "Underrepresented Group Metrics": {},
        "Tenure Metrics": {},
        "Team Metrics": {},
        "Nationality Metrics": {},
        "Legal Entity Metrics": {},
        "Work Location Metrics": {}
    }

    tenure_order = ["< 1 year", "1 year - 2 years", "2 years - 3 years", "3 years - 4 years", "> 4 years"]

    # Overall NPS: promoters score >= 9, detractors <= 6; blank answers count
    # toward neither bucket and are excluded from the respondent total.
    recommend_col = "On a scale of 0 to 10, how likely are you to recommend working at Hugging Face to a friend or colleague?"
    if recommend_col in df.columns:
        promoters = df[recommend_col].apply(lambda x: x >= 9).sum()
        detractors = df[recommend_col].apply(lambda x: x <= 6).sum()
        total_respondents = df[recommend_col].notna().sum()
        recommend_nps = ((promoters - detractors) / total_respondents) * 100 if total_respondents > 0 else None
        recommend_avg = df[recommend_col].mean()
        results["Overall Metrics"]["HF NPS"] = round(recommend_nps, 2) if recommend_nps is not None else None
        results["Overall Metrics"]["HF NPS (Average)"] = round(recommend_avg, 2)

    support_col = "On a scale of 0 to 10, how likely are you to recommend the support functions at HF (diversity, finance, hr, legal, security, talent) to a friend or colleague?"
    if support_col in df.columns:
        promoters = df[support_col].apply(lambda x: x >= 9).sum()
        detractors = df[support_col].apply(lambda x: x <= 6).sum()
        total_respondents = df[support_col].notna().sum()
        support_nps = ((promoters - detractors) / total_respondents) * 100 if total_respondents > 0 else None
        support_avg = df[support_col].mean()
        results["Overall Metrics"]["Support NPS"] = round(support_nps, 2) if support_nps is not None else None
        results["Overall Metrics"]["Support NPS (Average)"] = round(support_avg, 2)

    demographic_columns = [
        ("I identify as a member of an underrepresented group in tech. (e.g. including but not limited to gender, age, disability, sexuality, etc.)", "Underrepresented Group Metrics"),
        ("How long have you been at Hugging Face? (optional)", "Tenure Metrics"),
        ("Which team are you on here at Hugging Face? (optional)", "Team Metrics"),
        ("What is your primary nationality? (optional -- we only listed the largest groups to ensure anonymity.)", "Nationality Metrics"),
        ("Which legal entity are you employed by at HF? (optional)", "Legal Entity Metrics"),
        ("Are you fully remote or work mostly from a Hugging Face office? (optional)", "Work Location Metrics")
    ]

    # Per-demographic breakdowns: NPS and mean score for each group in each
    # optional demographic column.
    for demo_col, demo_category in demographic_columns:
        if demo_col in df.columns:
            for col, prefix in [(recommend_col, "HF NPS"), (support_col, "Support NPS")]:
                if col in df.columns:
                    grouped_demo = df.groupby(demo_col)[col]
                    nps_by_demo = {}
                    for group, scores in grouped_demo:
                        promoters = scores.apply(lambda x: x >= 9).sum()
                        detractors = scores.apply(lambda x: x <= 6).sum()
                        total = scores.notna().sum()
                        nps_by_demo[group] = ((promoters - detractors) / total) * 100 if total > 0 else None
                    # Tenure buckets are ordinal, so report them in tenure_order
                    # rather than pandas' default alphabetical group order.
                    if demo_category == "Tenure Metrics":
                        sorted_nps_by_demo = {k: nps_by_demo[k] for k in tenure_order if k in nps_by_demo}
                        results[demo_category][prefix] = {k: round(v, 2) if v is not None else None for k, v in sorted_nps_by_demo.items()}
                    else:
                        results[demo_category][prefix] = {k: round(v, 2) if v is not None else None for k, v in nps_by_demo.items()}
                    averages_demo = grouped_demo.mean()
                    if demo_category == "Tenure Metrics":
                        sorted_averages_demo = {k: averages_demo[k] for k in tenure_order if k in averages_demo}
                        results[demo_category][f"{prefix} (Average)"] = {k: round(v, 2) for k, v in sorted_averages_demo.items()}
                    else:
                        results[demo_category][f"{prefix} (Average)"] = averages_demo.round(2).to_dict()

    return results
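
(Aside, not part of the commit: a minimal sketch of the NPS arithmetic used in analyze_demographics, on made-up scores. Promoters answer 9-10, detractors 0-6, and blank answers fall out of both buckets and out of the respondent total.)

import pandas as pd

# Hypothetical scores; None becomes NaN, and NaN comparisons are False,
# so the blank answer is neither promoter nor detractor.
scores = pd.Series([10, 9, 8, 7, 6, 3, None])
promoters = scores.apply(lambda x: x >= 9).sum()   # 2 (the 10 and the 9)
detractors = scores.apply(lambda x: x <= 6).sum()  # 2 (the 6 and the 3)
total = scores.notna().sum()                       # 6 respondents
nps = (promoters - detractors) / total * 100
print(round(nps, 2))                               # 0.0
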

def analyze_why_columns(file):
    df = pd.read_excel(file.name)
    why_columns = [col for col in df.columns if col.startswith("Why")]

    results = {}

    sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

    for col in why_columns:
        column_data = df[col].dropna().tolist()
        if not column_data:
            continue  # skip free-text columns with no answers

        # Sentiment analysis with confidence scores. truncation=True keeps long
        # answers within the model's input limit. The SST-2 model only emits
        # POSITIVE/NEGATIVE labels, so the NEUTRAL bucket stays empty.
        sentiments = sentiment_analyzer(column_data, truncation=True)
        sentiment_summary = {"POSITIVE": 0, "NEGATIVE": 0, "NEUTRAL": 0}
        detailed_sentiments = {"POSITIVE": [], "NEGATIVE": [], "NEUTRAL": []}

        for response, sentiment in zip(column_data, sentiments):
            label = sentiment["label"]
            score = sentiment["score"]
            sentiment_summary[label] += 1
            detailed_sentiments[label].append({"response": response, "score": round(score, 2)})

        # Topic modeling: LDA over a bag-of-words matrix, three topics,
        # each reported as its five highest-weight words.
        vectorizer = CountVectorizer(stop_words="english")
        X = vectorizer.fit_transform(column_data)
        lda = LatentDirichletAllocation(n_components=3, random_state=0)
        lda.fit(X)
        feature_names = vectorizer.get_feature_names_out()
        topics = []
        for idx, topic in enumerate(lda.components_):
            top_words = [feature_names[i] for i in topic.argsort()[-5:]]
            topics.append(f"Topic {idx + 1}: " + ", ".join(top_words))

        # Keyword extraction: the ten most common 2-3 word phrases across all answers.
        combined_text = " ".join(column_data)
        bigram_vectorizer = CountVectorizer(ngram_range=(2, 3), stop_words="english")
        bigram_counts = bigram_vectorizer.fit_transform([combined_text])
        bigram_features = bigram_vectorizer.get_feature_names_out()
        bigram_counts_sum = bigram_counts.toarray().sum(axis=0)
        bigram_frequency = Counter(dict(zip(bigram_features, bigram_counts_sum))).most_common(10)
        keywords = [f"{phrase} ({count} mentions)" for phrase, count in bigram_frequency]

        # Summarization: BART can't take arbitrarily long inputs, so the
        # combined text is summarized in word-count chunks and re-joined.
        def split_text(text, max_length=1000):
            words = text.split()
            for i in range(0, len(words), max_length):
                yield " ".join(words[i:i + max_length])

        summaries = []
        for chunk in split_text(combined_text, max_length=500):
            summary = summarizer(chunk, max_length=100, min_length=30, do_sample=False)[0]["summary_text"]
            summaries.append(summary)

        final_summary = " ".join(summaries)

        # Store results
        results[col] = {
            "Sentiment Analysis Summary": sentiment_summary,
            "Detailed Sentiments": detailed_sentiments,
            "Topics": topics,
            "Keywords": keywords,
            "Summary": final_summary
        }

    return results

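(Aside, not part of the commit: split_text above chunks by word count, not characters, so BART's token limit is only approximately respected. A standalone copy of the helper to illustrate:)

def split_text(text, max_length=1000):
    # Same helper as nested in analyze_why_columns: yields chunks of at
    # most max_length *words*.
    words = text.split()
    for i in range(0, len(words), max_length):
        yield " ".join(words[i:i + max_length])

text = " ".join(str(i) for i in range(1200))       # 1200 "words"
chunks = list(split_text(text, max_length=500))
print(len(chunks), len(chunks[0].split()))         # 3 500
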

def process_file(file):
    quantitative_results = analyze_demographics(file)
    qualitative_results = analyze_why_columns(file)

    return quantitative_results, qualitative_results


def app():
    file_input = gr.File(label="Upload Survey Data (Excel format)")
    text_output = gr.JSON(label="Quantitative Analysis Results")
    qualitative_output = gr.JSON(label="Qualitative Analysis Results")

    iface = gr.Interface(
        fn=process_file,
        inputs=file_input,
        outputs=[text_output, qualitative_output],
        title="Survey Data Analyzer",
        description="Analyze both quantitative and qualitative survey data. Upload an Excel file to generate insights."
    )
    return iface


if __name__ == "__main__":
    app().launch()
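
(Aside, not part of the commit: outside the Gradio UI, process_file only needs an object exposing a .name attribute that points at an Excel file, which is what Gradio's File component provides. The filename below is hypothetical.)

from types import SimpleNamespace

# Hypothetical local smoke test; "survey_results.xlsx" is an assumed file.
upload = SimpleNamespace(name="survey_results.xlsx")
quantitative, qualitative = process_file(upload)
print(quantitative["Overall Metrics"])
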
requirements.txt
ADDED
@@ -0,0 +1,6 @@
gradio==3.40.0
pandas==1.5.3
openpyxl==3.1.2
scikit-learn==1.2.2
transformers==4.34.0
torch==2.0.1