Emily Witko committed
Commit ec15d48 · 1 Parent(s): 335a565

Initial commit

Files changed (2)
  1. app.py +172 -0
  2. requirements.txt +6 -0
app.py ADDED
@@ -0,0 +1,172 @@
+ import gradio as gr
+ import pandas as pd
+ from transformers import pipeline
+ from sklearn.feature_extraction.text import CountVectorizer
+ from sklearn.decomposition import LatentDirichletAllocation
+ from collections import Counter
+
+ def analyze_demographics(file):
+     df = pd.read_excel(file.name)
+
+     results = {
+         "Overall Metrics": {},
+         "Underrepresented Group Metrics": {},
+         "Tenure Metrics": {},
+         "Team Metrics": {},
+         "Nationality Metrics": {},
+         "Legal Entity Metrics": {},
+         "Work Location Metrics": {}
+     }
+
+     tenure_order = ["< 1 year", "1 year - 2 years", "2 years - 3 years", "3 years - 4 years", "> 4 years"]
+
+     # NPS = share of promoters (scores 9-10) minus share of detractors (scores 0-6), as a percentage.
+     recommend_col = "On a scale of 0 to 10, how likely are you to recommend working at Hugging Face to a friend or colleague?"
+     if recommend_col in df.columns:
+         promoters = (df[recommend_col] >= 9).sum()
+         detractors = (df[recommend_col] <= 6).sum()
+         total_respondents = df[recommend_col].notna().sum()
+         recommend_nps = ((promoters - detractors) / total_respondents) * 100 if total_respondents > 0 else None
+         recommend_avg = df[recommend_col].mean()
+         results["Overall Metrics"]["HF NPS"] = round(recommend_nps, 2) if recommend_nps is not None else None
+         results["Overall Metrics"]["HF NPS (Average)"] = round(recommend_avg, 2)
+
+     support_col = "On a scale of 0 to 10, how likely are you to recommend the support functions at HF (diversity, finance, hr, legal, security, talent) to a friend or colleague?"
+     if support_col in df.columns:
+         promoters = (df[support_col] >= 9).sum()
+         detractors = (df[support_col] <= 6).sum()
+         total_respondents = df[support_col].notna().sum()
+         support_nps = ((promoters - detractors) / total_respondents) * 100 if total_respondents > 0 else None
+         support_avg = df[support_col].mean()
+         results["Overall Metrics"]["Support NPS"] = round(support_nps, 2) if support_nps is not None else None
+         results["Overall Metrics"]["Support NPS (Average)"] = round(support_avg, 2)
+
+     demographic_columns = [
+         ("I identify as a member of an underrepresented group in tech. (e.g. including but not limited to gender, age, disability, sexuality, etc.)", "Underrepresented Group Metrics"),
+         ("How long have you been at Hugging Face? (optional)", "Tenure Metrics"),
+         ("Which team are you on here at Hugging Face? (optional)", "Team Metrics"),
+         ("What is your primary nationality? (optional -- we only listed the largest groups to ensure anonymity.)", "Nationality Metrics"),
+         ("Which legal entity are you employed by at HF? (optional)", "Legal Entity Metrics"),
+         ("Are you fully remote or work mostly from a Hugging Face office? (optional)", "Work Location Metrics")
+     ]
+
+     # Break both NPS questions down by each demographic column.
+     for demo_col, demo_category in demographic_columns:
+         if demo_col in df.columns:
+             for col, prefix in [(recommend_col, "HF NPS"), (support_col, "Support NPS")]:
+                 if col in df.columns:
+                     grouped_demo = df.groupby(demo_col)[col]
+                     nps_by_demo = {}
+                     for group, scores in grouped_demo:
+                         promoters = (scores >= 9).sum()
+                         detractors = (scores <= 6).sum()
+                         total = scores.notna().sum()
+                         nps_by_demo[group] = ((promoters - detractors) / total) * 100 if total > 0 else None
+                     # Tenure buckets are ordinal, so report them in tenure_order rather than alphabetically.
+                     if demo_category == "Tenure Metrics":
+                         sorted_nps_by_demo = {k: nps_by_demo[k] for k in tenure_order if k in nps_by_demo}
+                         results[demo_category][prefix] = {k: round(v, 2) if v is not None else None for k, v in sorted_nps_by_demo.items()}
+                     else:
+                         results[demo_category][prefix] = {k: round(v, 2) if v is not None else None for k, v in nps_by_demo.items()}
+                     averages_demo = grouped_demo.mean()
+                     if demo_category == "Tenure Metrics":
+                         sorted_averages_demo = {k: averages_demo.get(k) for k in tenure_order if k in averages_demo}
+                         results[demo_category][f"{prefix} (Average)"] = {k: round(v, 2) if v is not None else None for k, v in sorted_averages_demo.items()}
+                     else:
+                         results[demo_category][f"{prefix} (Average)"] = averages_demo.round(2).to_dict()
+
+     return results
+
+ def analyze_why_columns(file):
+     df = pd.read_excel(file.name)
+     why_columns = [col for col in df.columns if col.startswith("Why")]
+
+     results = {}
+
+     sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
+     summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
+
+     for col in why_columns:
+         column_data = df[col].dropna().astype(str).tolist()
+         if not column_data:
+             continue
+
+         # Sentiment analysis with confidence scores. The SST-2 model only emits
+         # POSITIVE/NEGATIVE; NEUTRAL is kept as a placeholder bucket.
+         sentiments = sentiment_analyzer(column_data, truncation=True)
+         sentiment_summary = {"POSITIVE": 0, "NEGATIVE": 0, "NEUTRAL": 0}
+         detailed_sentiments = {"POSITIVE": [], "NEGATIVE": [], "NEUTRAL": []}
+
+         for response, sentiment in zip(column_data, sentiments):
+             label = sentiment["label"]
+             score = sentiment["score"]
+             sentiment_summary[label] += 1
+             detailed_sentiments[label].append({"response": response, "score": round(score, 2)})
+
+         # Topic modeling: LDA over bag-of-words counts of the responses.
+         vectorizer = CountVectorizer(stop_words="english")
+         X = vectorizer.fit_transform(column_data)
+         lda = LatentDirichletAllocation(n_components=3, random_state=0)
+         lda.fit(X)
+         feature_names = vectorizer.get_feature_names_out()
+         topics = []
+         for idx, topic in enumerate(lda.components_):
+             top_words = [feature_names[i] for i in topic.argsort()[-5:]]
+             topics.append(f"Topic {idx + 1}: " + ", ".join(top_words))
+
+         # Keyword extraction: the ten most frequent 2- and 3-word phrases.
+         combined_text = " ".join(column_data)
+         bigram_vectorizer = CountVectorizer(ngram_range=(2, 3), stop_words="english")
+         bigram_counts = bigram_vectorizer.fit_transform([combined_text])
+         bigram_features = bigram_vectorizer.get_feature_names_out()
+         bigram_counts_sum = bigram_counts.toarray().sum(axis=0)
+         bigram_frequency = Counter(dict(zip(bigram_features, bigram_counts_sum))).most_common(10)
+         keywords = [f"{phrase} ({count} mentions)" for phrase, count in bigram_frequency]
+
+         # Summarization: split the combined text into chunks that fit the model's
+         # input limit, summarize each chunk, and join the partial summaries.
+         def split_text(text, max_length=1000):
+             words = text.split()
+             for i in range(0, len(words), max_length):
+                 yield " ".join(words[i:i + max_length])
+
+         summaries = []
+         for chunk in split_text(combined_text, max_length=500):
+             summary = summarizer(chunk, max_length=100, min_length=30, do_sample=False)[0]["summary_text"]
+             summaries.append(summary)
+
+         final_summary = " ".join(summaries)
+
+         # Store results
+         results[col] = {
+             "Sentiment Analysis Summary": sentiment_summary,
+             "Detailed Sentiments": detailed_sentiments,
+             "Topics": topics,
+             "Keywords": keywords,
+             "Summary": final_summary
+         }
+
+     return results
+
+ def process_file(file):
+     quantitative_results = analyze_demographics(file)
+     qualitative_results = analyze_why_columns(file)
+
+     return quantitative_results, qualitative_results
+
+ def app():
+     file_input = gr.File(label="Upload Survey Data (Excel format)")
+     text_output = gr.JSON(label="Quantitative Analysis Results")
+     qualitative_output = gr.JSON(label="Qualitative Analysis Results")
+
+     iface = gr.Interface(
+         fn=process_file,
+         inputs=file_input,
+         outputs=[text_output, qualitative_output],
+         title="Survey Data Analyzer",
+         description="Analyze both quantitative and qualitative survey data. Upload an Excel file to generate insights."
+     )
+     return iface
+
+ if __name__ == "__main__":
+     app().launch()
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ gradio==3.40.0
+ pandas==1.5.3
+ openpyxl==3.1.2
+ scikit-learn==1.2.2
+ transformers==4.34.0
+ torch==2.0.1