mariagrandury committed
Commit 30918aa · Parent: 6c8936b

implement script and add languages from Spain

datasets_cache.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a139fa06dfe21c909136c004dc91e0dc0a92e81ffb7ca68fc6c1353d8717851c
+ size 33831411
hub_datasets_by_language.ipynb → explore.ipynb RENAMED
File without changes
hub_datasets_by_language.py ADDED
@@ -0,0 +1,368 @@
+ import os
+ import pickle
+ from collections import Counter
+ from datetime import datetime
+
+ import matplotlib.pyplot as plt
+ import numpy as np
+ import pandas as pd
+ from huggingface_hub import HfApi
+
+ # Define colors for each language
+ LANGUAGE_COLORS = {
+     "english": "orange",
+     "spanish": "blue",
+     "catalan": "red",
+     "galician": "green",
+     "basque": "purple",
+ }
+
+ GRID = False
+
+
+ def fetch_datasets(cache_file="datasets_cache.pkl"):
+     """Fetch and filter datasets from HuggingFace Hub with caching"""
+     # Check if cached data exists and is less than 24 hours old
+     if os.path.exists(cache_file):
+         cache_age = datetime.now().timestamp() - os.path.getmtime(cache_file)
+         if cache_age < 24 * 3600:  # 24 hours in seconds
+             print("Loading datasets from cache...")
+             with open(cache_file, "rb") as f:
+                 return pickle.load(f)
+         else:
+             print("Cache is older than 24 hours, fetching fresh data...")
+     else:
+         print("No cache found, fetching datasets from Hugging Face Hub...")
+
+     hf_api = HfApi()
+     all_datasets = list(hf_api.list_datasets(full=True))
+
+     # Filter datasets by language
+     english_filter = filter(
+         lambda d: "language:en" in d.tags
+         and not any(
+             tag.startswith("language:") and tag != "language:en" for tag in d.tags
+         ),
+         all_datasets,
+     )
+     spanish_filter = filter(
+         lambda d: "language:es" in d.tags
+         and not any(
+             tag.startswith("language:") and tag != "language:es" for tag in d.tags
+         ),
+         all_datasets,
+     )
+     catalan_filter = filter(
+         lambda d: "language:ca" in d.tags
+         and not any(
+             tag.startswith("language:") and tag != "language:ca" for tag in d.tags
+         ),
+         all_datasets,
+     )
+     galician_filter = filter(
+         lambda d: "language:gl" in d.tags
+         and not any(
+             tag.startswith("language:") and tag != "language:gl" for tag in d.tags
+         ),
+         all_datasets,
+     )
+     basque_filter = filter(
+         lambda d: "language:eu" in d.tags
+         and not any(
+             tag.startswith("language:") and tag != "language:eu" for tag in d.tags
+         ),
+         all_datasets,
+     )
+     filtered_datasets = {
+         "english": list(english_filter),
+         "spanish": list(spanish_filter),
+         "catalan": list(catalan_filter),
+         "galician": list(galician_filter),
+         "basque": list(basque_filter),
+     }
+
+     # Cache the filtered datasets
+     print("Saving datasets to cache...")
+     with open(cache_file, "wb") as f:
+         pickle.dump(filtered_datasets, f)
+
+     return filtered_datasets
+
+
+ def create_bar_plots(datasets, output_dir):
+     """Create horizontal and vertical bar plots"""
+     # Extract creation dates and counts
+     years = sorted(
+         set(
+             date.year
+             for date in [
+                 d.created_at.date() for d in datasets["english"] + datasets["spanish"]
+             ]
+         )
+     )
+     english_counts = Counter(
+         date.year for date in [d.created_at.date() for d in datasets["english"]]
+     )
+     spanish_counts = Counter(
+         date.year for date in [d.created_at.date() for d in datasets["spanish"]]
+     )
+
+     # Horizontal bar plot
+     plt.figure(figsize=(8, 5))
+     bar_width = 0.4
+     years_index = np.arange(len(years))
+
+     plt.bar(
+         years_index - bar_width / 2,
+         [english_counts[year] for year in years],
+         width=bar_width,
+         label="English",
+         color=LANGUAGE_COLORS["english"],
+     )
+     plt.bar(
+         years_index + bar_width / 2,
+         [spanish_counts[year] for year in years],
+         width=bar_width,
+         label="Spanish",
+         color=LANGUAGE_COLORS["spanish"],
+     )
+
+     plt.xlabel("Year", fontsize=10)
+     plt.ylabel("Number of Datasets", fontsize=10)
+     plt.xticks(years_index, years, fontsize=10)
+     plt.legend()
+     plt.grid(GRID)
+     plt.tight_layout()
+     plt.savefig(f"{output_dir}/bar_plot_horizontal.png")
+     plt.close()
+
+     # Vertical bar plot
+     plt.figure(figsize=(8, 5))
+     plt.bar(
+         years,
+         [english_counts[year] for year in years],
+         width=0.4,
+         label="English",
+         color=LANGUAGE_COLORS["english"],
+     )
+     plt.bar(
+         years,
+         [spanish_counts[year] for year in years],
+         width=0.4,
+         label="Spanish",
+         color=LANGUAGE_COLORS["spanish"],
+         bottom=[english_counts[year] for year in years],
+     )
+
+     plt.xlabel("Year", fontsize=10)
+     plt.ylabel("Number of Datasets", fontsize=10)
+     plt.xticks(years, fontsize=10)
+     plt.legend()
+     plt.tight_layout()
+     plt.grid(GRID)
+     plt.savefig(f"{output_dir}/bar_plot_vertical.png")
+     plt.close()
+
+
+ def create_pie_chart(datasets, output_dir):
+     """Create pie chart showing distribution of datasets by language"""
+     # Calculate counts
+     counts = {
+         lang.capitalize(): len(datasets[lang])
+         for lang in ["english", "spanish", "catalan", "galician", "basque"]
+     }
+
+     plt.figure(figsize=(8, 8))
+     plt.pie(
+         counts.values(),
+         labels=counts.keys(),
+         autopct="%1.1f%%",
+         startangle=180,
+         colors=[
+             LANGUAGE_COLORS[lang]
+             for lang in ["english", "spanish", "catalan", "galician", "basque"]
+         ],
+     )
+     plt.axis("equal")
+     plt.savefig(f"{output_dir}/pie_chart.png")
+     plt.close()
+
+
+ def create_time_series(datasets, output_dir):
+     """Create time series plots"""
+     # Prepare data
+     creation_dates_english = [d.created_at.date() for d in datasets["english"]]
+     creation_dates_spanish = [d.created_at.date() for d in datasets["spanish"]]
+
+     df_english = pd.DataFrame(creation_dates_english, columns=["Date"])
+     df_spanish = pd.DataFrame(creation_dates_spanish, columns=["Date"])
+
+     df_english["Count"] = 1
+     df_spanish["Count"] = 1
+
+     df_english["Date"] = pd.to_datetime(df_english["Date"])
+     df_spanish["Date"] = pd.to_datetime(df_spanish["Date"])
+
+     # Cumulative plots
+     df_english_cum = (
+         df_english.groupby(pd.Grouper(key="Date", freq="MS")).sum().cumsum()
+     )
+     df_spanish_cum = (
+         df_spanish.groupby(pd.Grouper(key="Date", freq="MS")).sum().cumsum()
+     )
+
+     plt.figure(figsize=(10, 6))
+     plt.plot(
+         df_english_cum.index,
+         df_english_cum["Count"],
+         label="English",
+         color=LANGUAGE_COLORS["english"],
+     )
+     plt.plot(
+         df_spanish_cum.index,
+         df_spanish_cum["Count"],
+         label="Spanish",
+         color=LANGUAGE_COLORS["spanish"],
+     )
+
+     plt.xlabel("Date", fontsize=10)
+     plt.ylabel("Cumulative Number of Datasets", fontsize=10)
+     plt.xticks(rotation=45, fontsize=10)
+     plt.legend(loc="upper left")
+     plt.tight_layout()
+     plt.grid(GRID)
+     plt.savefig(f"{output_dir}/time_series.png")
+     plt.close()
+
+
+ def create_stack_area_plots(datasets, output_dir):
+     """Create stacked area plots"""
+     # Prepare data for all languages
+     all_dates = []
+     languages = ["english", "spanish", "catalan", "galician", "basque"]
+     for lang in languages:
+         all_dates.extend([d.created_at.date() for d in datasets[lang]])
+
+     # Create a common date range for all languages
+     min_date = min(all_dates)
+     max_date = max(all_dates)
+     date_range = pd.date_range(start=min_date, end=max_date, freq="MS")
+
+     # Create separate DataFrames for each language
+     dfs = {}
+     for lang in languages:
+         dates = [d.created_at.date() for d in datasets[lang]]
+         df = pd.DataFrame({"Date": dates})
+         df["Count"] = 1
+         df["Date"] = pd.to_datetime(df["Date"])
+         # Reindex to common date range and fill missing values with 0
+         df_grouped = df.groupby(pd.Grouper(key="Date", freq="MS")).sum()
+         df_grouped = df_grouped.reindex(date_range, fill_value=0)
+         dfs[lang] = df_grouped.cumsum()
+
+     # Plot stacked area for all languages
+     plt.figure(figsize=(10, 6))
+     plt.stackplot(
+         date_range,
+         [dfs[lang]["Count"].values for lang in languages],
+         labels=[lang.capitalize() for lang in languages],
+         colors=[LANGUAGE_COLORS[lang] for lang in languages],
+     )
+
+     plt.xlabel("Date", fontsize=10)
+     plt.ylabel("Cumulative Number of Datasets", fontsize=10)
+     plt.xticks(rotation=45, fontsize=10)
+     plt.legend(loc="upper left")
+     plt.tight_layout()
+     plt.grid(GRID)
+     plt.savefig(f"{output_dir}/stack_area.png")
+     plt.close()
+
+     # Plot stacked area for all except English
+     plt.figure(figsize=(10, 6))
+     plt.stackplot(
+         date_range,
+         [
+             dfs[lang]["Count"].values
+             for lang in ["spanish", "catalan", "galician", "basque"]
+         ],
+         labels=["Spanish", "Catalan", "Galician", "Basque"],
+         colors=[
+             LANGUAGE_COLORS[lang]
+             for lang in ["spanish", "catalan", "galician", "basque"]
+         ],
+     )
+
+     plt.xlabel("Date", fontsize=10)
+     plt.ylabel("Cumulative Number of Datasets", fontsize=10)
+     plt.xticks(rotation=45, fontsize=10)
+     plt.legend(loc="upper left")
+     plt.tight_layout()
+     plt.grid(GRID)
+     plt.savefig(f"{output_dir}/stack_area_es_ca_gl_eu.png")
+     plt.close()
+
+     # Plot stacked area for English and Spanish
+     plt.figure(figsize=(10, 6))
+     plt.stackplot(
+         date_range,
+         [dfs[lang]["Count"].values for lang in ["english", "spanish"]],
+         labels=["English", "Spanish"],
+         colors=[LANGUAGE_COLORS[lang] for lang in ["english", "spanish"]],
+     )
+
+     plt.xlabel("Date", fontsize=10)
+     plt.ylabel("Cumulative Number of Datasets", fontsize=10)
+     plt.xticks(rotation=45, fontsize=10)
+     plt.legend(loc="upper left")
+     plt.tight_layout()
+     plt.grid(GRID)
+     plt.savefig(f"{output_dir}/stack_area_en_es.png")
+     plt.close()
+
+     # Plot stacked area for Spanish only
+     plt.figure(figsize=(10, 6))
+     plt.stackplot(
+         date_range,
+         [dfs["spanish"]["Count"].values],
+         labels=["Spanish"],
+         colors=[LANGUAGE_COLORS["spanish"]],
+     )
+
+     plt.xlabel("Date", fontsize=10)
+     plt.ylabel("Cumulative Number of Datasets", fontsize=10)
+     plt.xticks(rotation=45, fontsize=10)
+     plt.legend(loc="upper left")
+     plt.tight_layout()
+     plt.grid(GRID)
+     plt.savefig(f"{output_dir}/stack_area_es.png")
+     plt.close()
+
+
+ def main():
+     # Create output directory if it doesn't exist
+     output_dir = "plots"
+     os.makedirs(output_dir, exist_ok=True)
+
+     # Fetch datasets
+     print("Fetching datasets from Hugging Face Hub...")
+     datasets = fetch_datasets()
+
+     # Create visualizations
+     print("Creating bar plots...")
+     create_bar_plots(datasets, output_dir)
+
+     print("Creating pie chart...")
+     create_pie_chart(datasets, output_dir)
+
+     print("Creating time series plots...")
+     create_time_series(datasets, output_dir)
+
+     print("Creating stack area plots...")
+     create_stack_area_plots(datasets, output_dir)
+
+     print(f"All visualizations have been saved to the '{output_dir}' directory")
+
+
+ if __name__ == "__main__":
+     main()
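
The five per-language filters in fetch_datasets repeat the same monolingual-tag check with a different ISO 639-1 code. A minimal sketch of how that check could be factored into a single helper; the names LANGUAGE_CODES, is_monolingual, and fetch_datasets_by_language are illustrative and not part of this commit:

from huggingface_hub import HfApi

# Hypothetical refactor sketch: one predicate instead of five near-identical lambdas.
LANGUAGE_CODES = {
    "english": "en",
    "spanish": "es",
    "catalan": "ca",
    "galician": "gl",
    "basque": "eu",
}

def is_monolingual(dataset, code):
    """True if the dataset carries language:<code> and no other language tag."""
    tags = dataset.tags
    return f"language:{code}" in tags and not any(
        t.startswith("language:") and t != f"language:{code}" for t in tags
    )

def fetch_datasets_by_language():
    # Same Hub query as the committed script, grouped by the mapping above.
    all_datasets = list(HfApi().list_datasets(full=True))
    return {
        lang: [d for d in all_datasets if is_monolingual(d, code)]
        for lang, code in LANGUAGE_CODES.items()
    }

Keeping the language codes in one mapping would also make it easier to keep LANGUAGE_COLORS and the filter list in sync if another language is added later.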
plots/bar_plot_horizontal.png CHANGED
plots/bar_plot_vertical.png CHANGED
plots/pie_chart.png ADDED
plots/stack_area.png CHANGED
plots/stack_area_en_es.png ADDED
plots/stack_area_es.png CHANGED
plots/stack_area_es_ca_gl_eu.png ADDED
plots/time_series.png CHANGED
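
For reference, a minimal sketch of how the committed script could be driven from another Python session; it assumes the module is importable from the working directory and that huggingface_hub, pandas, numpy, and matplotlib are installed:

import os
from hub_datasets_by_language import fetch_datasets, create_pie_chart

os.makedirs("plots", exist_ok=True)   # same output directory main() uses
datasets = fetch_datasets()           # reuses datasets_cache.pkl when it is under 24 hours old
create_pie_chart(datasets, "plots")   # writes plots/pie_chart.png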