mariagrandury committed
Commit 30918aa · Parent: 6c8936b

implement script and add languages from Spain

datasets_cache.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a139fa06dfe21c909136c004dc91e0dc0a92e81ffb7ca68fc6c1353d8717851c
+ size 33831411
hub_datasets_by_language.ipynb → explore.ipynb RENAMED
File without changes
hub_datasets_by_language.py ADDED
@@ -0,0 +1,368 @@
+ import os
+ import pickle
+ from collections import Counter
+ from datetime import datetime
+
+ import matplotlib.pyplot as plt
+ import numpy as np
+ import pandas as pd
+ from huggingface_hub import HfApi
+
+ # Define colors for each language
+ LANGUAGE_COLORS = {
+     "english": "orange",
+     "spanish": "blue",
+     "catalan": "red",
+     "galician": "green",
+     "basque": "purple",
+ }
+
+ GRID = False
+
+
+ def fetch_datasets(cache_file="datasets_cache.pkl"):
+     """Fetch and filter datasets from HuggingFace Hub with caching"""
+     # Check if cached data exists and is less than 24 hours old
+     if os.path.exists(cache_file):
+         cache_age = datetime.now().timestamp() - os.path.getmtime(cache_file)
+         if cache_age < 24 * 3600:  # 24 hours in seconds
+             print("Loading datasets from cache...")
+             with open(cache_file, "rb") as f:
+                 return pickle.load(f)
+         else:
+             print("Cache is older than 24 hours, fetching fresh data...")
+     else:
+         print("No cache found, fetching datasets from Hugging Face Hub...")
+
+     hf_api = HfApi()
+     all_datasets = list(hf_api.list_datasets(full=True))
+
+     # Filter datasets by language
+     english_filter = filter(
+         lambda d: "language:en" in d.tags
+         and not any(
+             tag.startswith("language:") and tag != "language:en" for tag in d.tags
+         ),
+         all_datasets,
+     )
+     spanish_filter = filter(
+         lambda d: "language:es" in d.tags
+         and not any(
+             tag.startswith("language:") and tag != "language:es" for tag in d.tags
+         ),
+         all_datasets,
+     )
+     catalan_filter = filter(
+         lambda d: "language:ca" in d.tags
+         and not any(
+             tag.startswith("language:") and tag != "language:ca" for tag in d.tags
+         ),
+         all_datasets,
+     )
+     galician_filter = filter(
+         lambda d: "language:gl" in d.tags
+         and not any(
+             tag.startswith("language:") and tag != "language:gl" for tag in d.tags
+         ),
+         all_datasets,
+     )
+     basque_filter = filter(
+         lambda d: "language:eu" in d.tags
+         and not any(
+             tag.startswith("language:") and tag != "language:eu" for tag in d.tags
+         ),
+         all_datasets,
+     )
+     filtered_datasets = {
+         "english": list(english_filter),
+         "spanish": list(spanish_filter),
+         "catalan": list(catalan_filter),
+         "galician": list(galician_filter),
+         "basque": list(basque_filter),
+     }
+
+     # Cache the filtered datasets
+     print("Saving datasets to cache...")
+     with open(cache_file, "wb") as f:
+         pickle.dump(filtered_datasets, f)
+
+     return filtered_datasets
+
+
+ def create_bar_plots(datasets, output_dir):
+     """Create horizontal and vertical bar plots"""
+     # Extract creation dates and counts
+     years = sorted(
+         set(
+             date.year
+             for date in [
+                 d.created_at.date() for d in datasets["english"] + datasets["spanish"]
+             ]
+         )
+     )
+     english_counts = Counter(
+         date.year for date in [d.created_at.date() for d in datasets["english"]]
+     )
+     spanish_counts = Counter(
+         date.year for date in [d.created_at.date() for d in datasets["spanish"]]
+     )
+
+     # Horizontal bar plot
+     plt.figure(figsize=(8, 5))
+     bar_width = 0.4
+     years_index = np.arange(len(years))
+
+     plt.bar(
+         years_index - bar_width / 2,
+         [english_counts[year] for year in years],
+         width=bar_width,
+         label="English",
+         color=LANGUAGE_COLORS["english"],
+     )
+     plt.bar(
+         years_index + bar_width / 2,
+         [spanish_counts[year] for year in years],
+         width=bar_width,
+         label="Spanish",
+         color=LANGUAGE_COLORS["spanish"],
+     )
+
+     plt.xlabel("Year", fontsize=10)
+     plt.ylabel("Number of Datasets", fontsize=10)
+     plt.xticks(years_index, years, fontsize=10)
+     plt.legend()
+     plt.grid(GRID)
+     plt.tight_layout()
+     plt.savefig(f"{output_dir}/bar_plot_horizontal.png")
+     plt.close()
+
+     # Vertical bar plot
+     plt.figure(figsize=(8, 5))
+     plt.bar(
+         years,
+         [english_counts[year] for year in years],
+         width=0.4,
+         label="English",
+         color=LANGUAGE_COLORS["english"],
+     )
+     plt.bar(
+         years,
+         [spanish_counts[year] for year in years],
+         width=0.4,
+         label="Spanish",
+         color=LANGUAGE_COLORS["spanish"],
+         bottom=[english_counts[year] for year in years],
+     )
+
+     plt.xlabel("Year", fontsize=10)
+     plt.ylabel("Number of Datasets", fontsize=10)
+     plt.xticks(years, fontsize=10)
+     plt.legend()
+     plt.tight_layout()
+     plt.grid(GRID)
+     plt.savefig(f"{output_dir}/bar_plot_vertical.png")
+     plt.close()
+
+
+ def create_pie_chart(datasets, output_dir):
+     """Create pie chart showing distribution of datasets by language"""
+     # Calculate counts
+     counts = {
+         lang.capitalize(): len(datasets[lang])
+         for lang in ["english", "spanish", "catalan", "galician", "basque"]
+     }
+
+     plt.figure(figsize=(8, 8))
+     plt.pie(
+         counts.values(),
+         labels=counts.keys(),
+         autopct="%1.1f%%",
+         startangle=180,
+         colors=[
+             LANGUAGE_COLORS[lang]
+             for lang in ["english", "spanish", "catalan", "galician", "basque"]
+         ],
+     )
+     plt.axis("equal")
+     plt.savefig(f"{output_dir}/pie_chart.png")
+     plt.close()
+
+
+ def create_time_series(datasets, output_dir):
+     """Create time series plots"""
+     # Prepare data
+     creation_dates_english = [d.created_at.date() for d in datasets["english"]]
+     creation_dates_spanish = [d.created_at.date() for d in datasets["spanish"]]
+
+     df_english = pd.DataFrame(creation_dates_english, columns=["Date"])
+     df_spanish = pd.DataFrame(creation_dates_spanish, columns=["Date"])
+
+     df_english["Count"] = 1
+     df_spanish["Count"] = 1
+
+     df_english["Date"] = pd.to_datetime(df_english["Date"])
+     df_spanish["Date"] = pd.to_datetime(df_spanish["Date"])
+
+     # Cumulative plots
+     df_english_cum = (
+         df_english.groupby(pd.Grouper(key="Date", freq="MS")).sum().cumsum()
+     )
+     df_spanish_cum = (
+         df_spanish.groupby(pd.Grouper(key="Date", freq="MS")).sum().cumsum()
+     )
+
+     plt.figure(figsize=(10, 6))
+     plt.plot(
+         df_english_cum.index,
+         df_english_cum["Count"],
+         label="English",
+         color=LANGUAGE_COLORS["english"],
+     )
+     plt.plot(
+         df_spanish_cum.index,
+         df_spanish_cum["Count"],
+         label="Spanish",
+         color=LANGUAGE_COLORS["spanish"],
+     )
+
+     plt.xlabel("Date", fontsize=10)
+     plt.ylabel("Cumulative Number of Datasets", fontsize=10)
+     plt.xticks(rotation=45, fontsize=10)
+     plt.legend(loc="upper left")
+     plt.tight_layout()
+     plt.grid(GRID)
+     plt.savefig(f"{output_dir}/time_series.png")
+     plt.close()
+
+
+ def create_stack_area_plots(datasets, output_dir):
+     """Create stacked area plots"""
+     # Prepare data for all languages
+     all_dates = []
+     languages = ["english", "spanish", "catalan", "galician", "basque"]
+     for lang in languages:
+         all_dates.extend([d.created_at.date() for d in datasets[lang]])
+
+     # Create a common date range for all languages
+     min_date = min(all_dates)
+     max_date = max(all_dates)
+     date_range = pd.date_range(start=min_date, end=max_date, freq="MS")
+
+     # Create separate DataFrames for each language
+     dfs = {}
+     for lang in languages:
+         dates = [d.created_at.date() for d in datasets[lang]]
+         df = pd.DataFrame({"Date": dates})
+         df["Count"] = 1
+         df["Date"] = pd.to_datetime(df["Date"])
+         # Reindex to common date range and fill missing values with 0
+         df_grouped = df.groupby(pd.Grouper(key="Date", freq="MS")).sum()
+         df_grouped = df_grouped.reindex(date_range, fill_value=0)
+         dfs[lang] = df_grouped.cumsum()
+
+     # Plot stacked area for all languages
+     plt.figure(figsize=(10, 6))
+     plt.stackplot(
+         date_range,
+         [dfs[lang]["Count"].values for lang in languages],
+         labels=[lang.capitalize() for lang in languages],
+         colors=[LANGUAGE_COLORS[lang] for lang in languages],
+     )
+
+     plt.xlabel("Date", fontsize=10)
+     plt.ylabel("Cumulative Number of Datasets", fontsize=10)
+     plt.xticks(rotation=45, fontsize=10)
+     plt.legend(loc="upper left")
+     plt.tight_layout()
+     plt.grid(GRID)
+     plt.savefig(f"{output_dir}/stack_area.png")
+     plt.close()
+
+     # Plot stacked area for all except English
+     plt.figure(figsize=(10, 6))
+     plt.stackplot(
+         date_range,
+         [
+             dfs[lang]["Count"].values
+             for lang in ["spanish", "catalan", "galician", "basque"]
+         ],
+         labels=["Spanish", "Catalan", "Galician", "Basque"],
+         colors=[
+             LANGUAGE_COLORS[lang]
+             for lang in ["spanish", "catalan", "galician", "basque"]
+         ],
+     )
+
+     plt.xlabel("Date", fontsize=10)
+     plt.ylabel("Cumulative Number of Datasets", fontsize=10)
+     plt.xticks(rotation=45, fontsize=10)
+     plt.legend(loc="upper left")
+     plt.tight_layout()
+     plt.grid(GRID)
+     plt.savefig(f"{output_dir}/stack_area_es_ca_gl_eu.png")
+     plt.close()
+
+     # Plot stacked area for English and Spanish
+     plt.figure(figsize=(10, 6))
+     plt.stackplot(
+         date_range,
+         [dfs[lang]["Count"].values for lang in ["english", "spanish"]],
+         labels=["English", "Spanish"],
+         colors=[LANGUAGE_COLORS[lang] for lang in ["english", "spanish"]],
+     )
+
+     plt.xlabel("Date", fontsize=10)
+     plt.ylabel("Cumulative Number of Datasets", fontsize=10)
+     plt.xticks(rotation=45, fontsize=10)
+     plt.legend(loc="upper left")
+     plt.tight_layout()
+     plt.grid(GRID)
+     plt.savefig(f"{output_dir}/stack_area_en_es.png")
+     plt.close()
+
+     # Plot stacked area for Spanish only
+     plt.figure(figsize=(10, 6))
+     plt.stackplot(
+         date_range,
+         [dfs["spanish"]["Count"].values],
+         labels=["Spanish"],
+         colors=[LANGUAGE_COLORS["spanish"]],
+     )
+
+     plt.xlabel("Date", fontsize=10)
+     plt.ylabel("Cumulative Number of Datasets", fontsize=10)
+     plt.xticks(rotation=45, fontsize=10)
+     plt.legend(loc="upper left")
+     plt.tight_layout()
+     plt.grid(GRID)
+     plt.savefig(f"{output_dir}/stack_area_es.png")
+     plt.close()
+
+
+ def main():
+     # Create output directory if it doesn't exist
+     output_dir = "plots"
+     os.makedirs(output_dir, exist_ok=True)
+
+     # Fetch datasets
+     print("Fetching datasets from Hugging Face Hub...")
+     datasets = fetch_datasets()
+
+     # Create visualizations
+     print("Creating bar plots...")
+     create_bar_plots(datasets, output_dir)
+
+     print("Creating pie chart...")
+     create_pie_chart(datasets, output_dir)
+
+     print("Creating time series plots...")
+     create_time_series(datasets, output_dir)
+
+     print("Creating stack area plots...")
+     create_stack_area_plots(datasets, output_dir)
+
+     print(f"All visualizations have been saved to the '{output_dir}' directory")
+
+
+ if __name__ == "__main__":
+     main()
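
The five per-language filters in fetch_datasets repeat the same monolingual-tag check with a different ISO 639-1 code. A minimal sketch of how that check could be factored into a single helper; the names LANGUAGE_CODES, is_monolingual, and fetch_datasets_by_language are illustrative and not part of this commit:

from huggingface_hub import HfApi

# Hypothetical refactor sketch: one predicate instead of five near-identical lambdas.
LANGUAGE_CODES = {
    "english": "en",
    "spanish": "es",
    "catalan": "ca",
    "galician": "gl",
    "basque": "eu",
}

def is_monolingual(dataset, code):
    """True if the dataset carries language:<code> and no other language tag."""
    tags = dataset.tags
    return f"language:{code}" in tags and not any(
        t.startswith("language:") and t != f"language:{code}" for t in tags
    )

def fetch_datasets_by_language():
    # Same Hub query as the committed script, grouped by the mapping above.
    all_datasets = list(HfApi().list_datasets(full=True))
    return {
        lang: [d for d in all_datasets if is_monolingual(d, code)]
        for lang, code in LANGUAGE_CODES.items()
    }

Keeping the language codes in one mapping would also make it easier to keep LANGUAGE_COLORS and the filter list in sync if another language is added later.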
plots/bar_plot_horizontal.png CHANGED
plots/bar_plot_vertical.png CHANGED
plots/pie_chart.png ADDED
plots/stack_area.png CHANGED
plots/stack_area_en_es.png ADDED
plots/stack_area_es.png CHANGED
plots/stack_area_es_ca_gl_eu.png ADDED
plots/time_series.png CHANGED
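
For reference, a minimal sketch of how the committed script could be driven from another Python session; it assumes the module is importable from the working directory and that huggingface_hub, pandas, numpy, and matplotlib are installed:

import os
from hub_datasets_by_language import fetch_datasets, create_pie_chart

os.makedirs("plots", exist_ok=True)   # same output directory main() uses
datasets = fetch_datasets()           # reuses datasets_cache.pkl when it is under 24 hours old
create_pie_chart(datasets, "plots")   # writes plots/pie_chart.png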