{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "intro0001",
   "metadata": {},
   "source": [
    "# Monolingual datasets on the Hugging Face Hub\n",
    "\n",
    "Counts the datasets on the Hub that declare exactly one language, compares English, Spanish, Chinese and French, and plots how the English and Spanish collections have grown over time.\n",
    "\n",
    "The numbers come from the live Hub, so they change between runs. Figures are written to `plots/`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9b494ecb",
   "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "bCPvBCk_VLoi", "outputId": "48174b27-072f-4cf9-bfcc-2a7cb12f60ba", "papermill": { "duration": null, "end_time": null, "exception": null, "start_time": null, "status": "completed" }, "tags": [] },
   "outputs": [],
   "source": [
    "%pip install -q huggingface_hub"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d736660e",
   "metadata": { "id": "NbQeXxudVJW9", "papermill": { "duration": null, "end_time": null, "exception": null, "start_time": null, "status": "completed" }, "tags": [] },
   "outputs": [],
   "source": [
    "from collections import Counter\n",
    "from datetime import datetime\n",
    "from pathlib import Path\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from huggingface_hub import HfApi\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "config0001",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Every figure is written here; create it up front so savefig cannot fail on a fresh checkout.\n",
    "PLOTS_DIR = Path('plots')\n",
    "PLOTS_DIR.mkdir(parents=True, exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "helpers001",
   "metadata": {},
   "outputs": [],
   "source": [
    "def distinct_language_tags(dataset):\n",
    "    \"\"\"Return the set of `language:*` tags of a dataset (empty when it has no tags).\"\"\"\n",
    "    return {tag for tag in (dataset.tags or []) if tag.startswith('language:')}\n",
    "\n",
    "\n",
    "def monolingual_datasets(datasets, language_code=None):\n",
    "    \"\"\"Select the datasets that declare exactly one language.\n",
    "\n",
    "    With `language_code` (e.g. 'es') only datasets whose single language is that\n",
    "    one are kept; with None every single-language dataset is kept.\n",
    "    \"\"\"\n",
    "    # Tags are compared as a set so a repeated tag cannot make the per-language\n",
    "    # counts and the overall monolingual count disagree.\n",
    "    if language_code is None:\n",
    "        return [d for d in datasets if len(distinct_language_tags(d)) == 1]\n",
    "    wanted = {f'language:{language_code}'}\n",
    "    return [d for d in datasets if distinct_language_tags(d) == wanted]\n",
    "\n",
    "\n",
    "def creation_dates(datasets):\n",
    "    \"\"\"Return the creation date of each dataset, skipping entries without `created_at`.\"\"\"\n",
    "    return [d.created_at.date() for d in datasets if d.created_at is not None]\n",
    "\n",
    "\n",
    "def counts_by(dates_by_label, period):\n",
    "    \"\"\"Count dates per period.\n",
    "\n",
    "    `dates_by_label` maps a column name to a list of dates and `period` maps a\n",
    "    date to its bucket (a year, the first day of its month, ...). Returns a\n",
    "    DataFrame indexed by sorted bucket with one column per label; buckets that\n",
    "    are missing for a label hold 0.\n",
    "    \"\"\"\n",
    "    columns = {\n",
    "        label: pd.Series(Counter(period(d) for d in dates), dtype='float64')\n",
    "        for label, dates in dates_by_label.items()\n",
    "    }\n",
    "    return pd.DataFrame(columns).fillna(0).sort_index()\n",
    "\n",
    "\n",
    "def save_and_show(filename):\n",
    "    \"\"\"Write the current figure to PLOTS_DIR / filename, then display it.\n",
    "\n",
    "    The order matters: once plt.show() has displayed the figure, a later\n",
    "    plt.savefig() has nothing left to draw and writes an empty image.\n",
    "    \"\"\"\n",
    "    plt.savefig(PLOTS_DIR / filename)\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "loadhdr001",
   "metadata": {},
   "source": [
    "## Load the dataset listing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8dc1a8d8",
   "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "ogyTHBYJVZ8I", "outputId": "f23a554a-7328-4e50-d87c-90368294467d", "papermill": { "duration": null, "end_time": null, "exception": null, "start_time": null, "status": "completed" }, "tags": [] },
   "outputs": [],
   "source": [
    "hf_api = HfApi()\n",
    "\n",
    "# list_datasets yields results lazily and can only be walked once, so keep a list:\n",
    "# every filter below then works on the same snapshot without querying the Hub again.\n",
    "all_datasets = list(hf_api.list_datasets(full=True))\n",
    "\n",
    "total_count = len(all_datasets)\n",
    "print(total_count)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "299e6d56",
   "metadata": { "id": "GXDMUU-4XmaI", "papermill": { "duration": null, "end_time": null, "exception": null, "start_time": null, "status": "completed" }, "tags": [] },
   "source": [
    "## Monolingual datasets per language\n",
    "\n",
    "A dataset counts as monolingual in a language when that language's tag is its only `language:*` tag. Matching `language:es` among any number of language tags is a much looser filter: the original exploration noted 882 datasets for it versus 317 for the Spanish-only filter."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "691d8f3a",
   "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "pjCvHVq_hChx", "outputId": "d279d0bc-a3c6-4994-f23c-a7274b1f4ee8", "papermill": { "duration": null, "end_time": null, "exception": null, "start_time": null, "status": "completed" }, "tags": [] },
   "outputs": [],
   "source": [
    "spanish_datasets = monolingual_datasets(all_datasets, 'es')\n",
    "spanish_count = len(spanish_datasets)\n",
    "print(spanish_count)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c9676c89",
   "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "WANGkTpGRw8t", "outputId": "da8931bf-7ae2-438d-8188-20190f568193", "papermill": { "duration": null, "end_time": null, "exception": null, "start_time": null, "status": "completed" }, "tags": [] },
   "outputs": [],
   "source": [
    "english_datasets = monolingual_datasets(all_datasets, 'en')\n",
    "english_count = len(english_datasets)\n",
    "print(english_count)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bf300ce6",
   "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "yPtF0G7SWS53", "outputId": "a2a51160-c803-4e7f-a6dc-8879eea1dd69", "papermill": { "duration": null, "end_time": null, "exception": null, "start_time": null, "status": "completed" }, "tags": [] },
   "outputs": [],
   "source": [
    "chinese_datasets = monolingual_datasets(all_datasets, 'zh')\n",
    "chinese_count = len(chinese_datasets)\n",
    "print(chinese_count)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "407c46fc",
   "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "RlxAlOOsW7p9", "outputId": "f1c12edd-5502-4018-b9a7-149f9fc29322", "papermill": { "duration": null, "end_time": null, "exception": null, "start_time": null, "status": "completed" }, "tags": [] },
   "outputs": [],
   "source": [
    "french_datasets = monolingual_datasets(all_datasets, 'fr')\n",
    "french_count = len(french_datasets)\n",
    "print(french_count)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a7d82d5d",
   "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "OMQfBXjUYBPz", "outputId": "8cd3fdb9-0bc8-4d82-d25b-fb9eef7118ed", "papermill": { "duration": null, "end_time": null, "exception": null, "start_time": null, "status": "completed" }, "tags": [] },
   "outputs": [],
   "source": [
    "mono_datasets = monolingual_datasets(all_datasets)\n",
    "mono_count = len(mono_datasets)\n",
    "print(mono_count)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6dc0ac68",
   "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 180 }, "id": "sTPechkdWmYS", "outputId": "bb49f9f4-150b-4a29-d58e-faff4f88cce3", "papermill": { "duration": null, "end_time": null, "exception": null, "start_time": null, "status": "completed" }, "tags": [] },
   "outputs": [],
   "source": [
    "# Extract creation date\n",
    "\n",
    "creation_dates_spanish = creation_dates(spanish_datasets)\n",
    "creation_dates_english = creation_dates(english_datasets)\n",
    "\n",
    "# The Hub keeps growing, so check invariants instead of the totals of one past run.\n",
    "assert creation_dates_spanish, 'no monolingual Spanish dataset with a creation date'\n",
    "assert creation_dates_english, 'no monolingual English dataset with a creation date'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "57d206ec",
   "metadata": { "id": "hefZVynDSjjE", "papermill": { "duration": null, "end_time": null, "exception": null, "start_time": null, "status": "completed" }, "tags": [] },
   "outputs": [],
   "source": [
    "print(creation_dates_spanish[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "aggcounts1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Shared inputs of every chart below: datasets created per year and per month.\n",
    "creation_dates_by_language = {'English': creation_dates_english, 'Spanish': creation_dates_spanish}\n",
    "\n",
    "yearly = counts_by(creation_dates_by_language, lambda d: d.year)\n",
    "monthly = counts_by(creation_dates_by_language, lambda d: pd.Timestamp(year=d.year, month=d.month, day=1))\n",
    "\n",
    "yearly"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b80e411d",
   "metadata": { "id": "aFaEBlkkSbrs", "papermill": { "duration": null, "end_time": null, "exception": null, "start_time": null, "status": "completed" }, "tags": [] },
   "source": [
    "## Bar Chart\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "96652421",
   "metadata": { "id": "dYJ2zd4dShYh", "papermill": { "duration": null, "end_time": null, "exception": null, "start_time": null, "status": "completed" }, "tags": [] },
   "outputs": [],
   "source": [
    "# Stacked bars: Spanish sits on top of English for each year\n",
    "plt.figure(figsize=(10, 6))\n",
    "plt.bar(yearly.index, yearly['English'], width=0.4, label='English Datasets', color='blue')\n",
    "plt.bar(yearly.index, yearly['Spanish'], width=0.4, label='Spanish Datasets', color='orange', bottom=yearly['English'])\n",
    "\n",
    "plt.xlabel('Year')\n",
    "plt.ylabel('Number of Datasets')\n",
    "plt.title('Distribution of Monolingual English and Spanish Datasets by Year')\n",
    "plt.xticks(yearly.index)\n",
    "plt.legend()\n",
    "\n",
    "plt.grid(True)\n",
    "plt.tight_layout()\n",
    "save_and_show('bar_stack.png')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2d1ae015",
   "metadata": { "id": "wViEE4wCUVgs", "papermill": { "duration": null, "end_time": null, "exception": null, "start_time": null, "status": "completed" }, "tags": [] },
   "outputs": [],
   "source": [
    "# Side-by-side bars: one slot per year, the two languages offset by half a bar width\n",
    "bar_width = 0.4\n",
    "years_index = np.arange(len(yearly))\n",
    "\n",
    "plt.figure(figsize=(10, 6))\n",
    "plt.bar(years_index - bar_width / 2, yearly['English'], width=bar_width, label='English Datasets', color='blue')\n",
    "plt.bar(years_index + bar_width / 2, yearly['Spanish'], width=bar_width, label='Spanish Datasets', color='orange')\n",
    "\n",
    "plt.xlabel('Year')\n",
    "plt.ylabel('Number of Datasets')\n",
    "plt.title('Distribution of Monolingual English and Spanish Datasets by Year')\n",
    "plt.xticks(years_index, yearly.index)\n",
    "plt.legend()\n",
    "\n",
    "plt.grid(True)\n",
    "plt.tight_layout()\n",
    "save_and_show('bar_width.png')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cddf7237",
   "metadata": { "id": "Hp8vNA6LUA1E", "papermill": { "duration": null, "end_time": null, "exception": null, "start_time": null, "status": "completed" }, "tags": [] },
   "source": [
    "## Stacked Area Chart\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "68255399",
   "metadata": { "id": "CWgCunzGUCot", "papermill": { "duration": null, "end_time": null, "exception": null, "start_time": null, "status": "completed" }, "tags": [] },
   "outputs": [],
   "source": [
    "# Running total per year\n",
    "yearly_cumulative = yearly.cumsum()\n",
    "\n",
    "plt.figure(figsize=(10, 6))\n",
    "plt.stackplot(yearly_cumulative.index, yearly_cumulative['English'], yearly_cumulative['Spanish'], labels=['English Datasets', 'Spanish Datasets'], colors=['blue', 'orange'])\n",
    "\n",
    "plt.xlabel('Year')\n",
    "plt.ylabel('Cumulative Number of Datasets')\n",
    "plt.title('Cumulative Growth of Monolingual English and Spanish Datasets Over Time')\n",
    "plt.xticks(yearly_cumulative.index)\n",
    "plt.legend(loc='upper left')\n",
    "\n",
    "plt.grid(True)\n",
    "plt.tight_layout()\n",
    "save_and_show('stack_area_1.png')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4ba74cf5",
   "metadata": { "id": "GwRpZwYWhau3", "papermill": { "duration": null, "end_time": null, "exception": null, "start_time": null, "status": "completed" }, "tags": [] },
   "outputs": [],
   "source": [
    "# Running total per month\n",
    "monthly_cumulative = monthly.cumsum()\n",
    "\n",
    "plt.figure(figsize=(8, 5))\n",
    "plt.stackplot(monthly_cumulative.index, monthly_cumulative['English'], monthly_cumulative['Spanish'], labels=['English', 'Spanish'], colors=['orange', 'blue'])\n",
    "\n",
    "plt.xlabel('Creation date')\n",
    "plt.ylabel('Cumulative number of monolingual datasets')\n",
    "plt.title('Cumulative growth of monolingual English and Spanish datasets in the Hugging Face Hub over time')\n",
    "\n",
    "plt.xticks(rotation=45)\n",
    "plt.legend(loc='upper left')\n",
    "plt.grid(False)\n",
    "plt.tight_layout()\n",
    "save_and_show('stack_area_2.png')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d96225ce",
   "metadata": { "id": "kJQ0OgRtglOQ", "papermill": { "duration": null, "end_time": null, "exception": null, "start_time": null, "status": "completed" }, "tags": [] },
   "outputs": [],
   "source": [
    "# Datasets created in each month (not accumulated), so the labels say so\n",
    "plt.figure(figsize=(10, 6))\n",
    "plt.stackplot(monthly.index, monthly['English'], monthly['Spanish'], labels=['English Datasets', 'Spanish Datasets'], colors=['blue', 'orange'])\n",
    "\n",
    "plt.xlabel('Date')\n",
    "plt.ylabel('Number of Datasets Created per Month')\n",
    "plt.title('Monthly Creation of Monolingual English and Spanish Datasets Over Time')\n",
    "\n",
    "plt.xticks(rotation=45)\n",
    "plt.legend(loc='upper left')\n",
    "plt.grid(True)\n",
    "plt.tight_layout()\n",
    "save_and_show('stack_area_3.png')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7bbec0ac",
   "metadata": { "id": "IAnFHiPlgnRE", "papermill": { "duration": null, "end_time": null, "exception": null, "start_time": null, "status": "completed" }, "tags": [] },
   "source": [
    "## Pie Chart"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7c3dd684",
   "metadata": { "id": "8tKR1x-kVeZT", "papermill": { "duration": null, "end_time": null, "exception": null, "start_time": null, "status": "completed" }, "tags": [] },
   "outputs": [],
   "source": [
    "# Monolingual datasets in any language other than the four counted above\n",
    "other_count = mono_count - (english_count + spanish_count + chinese_count + french_count)\n",
    "assert other_count >= 0, 'per-language counts exceed the monolingual total'\n",
    "\n",
    "labels = ['English', 'Chinese', 'French', 'Spanish', 'Other']\n",
    "sizes = [english_count, chinese_count, french_count, spanish_count, other_count]\n",
    "\n",
    "plt.figure(figsize=(8, 8))\n",
    "plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=180, colors=['blue', 'red', 'green', 'orange', 'purple'])\n",
    "plt.title('Distribution of Monolingual Datasets by Language')\n",
    "plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.\n",
    "\n",
    "save_and_show('pie_chart.png')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "11c1c9c8",
   "metadata": { "id": "z2xf8FrHROxy", "papermill": { "duration": null, "end_time": null, "exception": null, "start_time": null, "status": "completed" }, "tags": [] },
   "source": [
    "## Time series plot"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1bb6a676",
   "metadata": { "id": "DuPFSZKUhyQj", "papermill": { "duration": null, "end_time": null, "exception": null, "start_time": null, "status": "completed" }, "tags": [] },
   "outputs": [],
   "source": [
    "# One row per Spanish dataset, binned by month start and accumulated over time\n",
    "spanish_timeline = (\n",
    "    pd.DataFrame({'Date': pd.to_datetime(creation_dates_spanish), 'Count': 1})\n",
    "    .groupby(pd.Grouper(key='Date', freq='MS'))\n",
    "    .sum()\n",
    "    .cumsum()\n",
    ")\n",
    "\n",
    "plt.figure(figsize=(10, 6))\n",
    "plt.plot(spanish_timeline.index, spanish_timeline['Count'], color='g')\n",
    "plt.title('Evolución de bases de datos monolingües en español')\n",
    "plt.xlabel('Fecha')\n",
    "plt.ylabel('Número de bases de datos')\n",
    "plt.grid(True)\n",
    "plt.xticks(rotation=45)\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2fc77d7f",
   "metadata": { "id": "-Vu3PIe2hITq", "papermill": { "duration": null, "end_time": null, "exception": null, "start_time": null, "status": "completed" }, "tags": [] },
   "outputs": [],
   "source": [
    "# Cumulative monthly totals of both languages as lines\n",
    "monthly_totals = monthly.cumsum()\n",
    "\n",
    "plt.figure(figsize=(10, 6))\n",
    "plt.plot(monthly_totals.index, monthly_totals['English'], label='English', color='blue')\n",
    "plt.plot(monthly_totals.index, monthly_totals['Spanish'], label='Spanish', color='orange')\n",
    "\n",
    "plt.xlabel('Date')\n",
    "plt.ylabel('Cumulative Number of Datasets')\n",
    "plt.title('Cumulative Growth of Monolingual English and Spanish Datasets Over Time')\n",
    "\n",
    "plt.xticks(rotation=45)\n",
    "plt.legend(loc='upper left')\n",
    "plt.grid(True)\n",
    "plt.tight_layout()\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6c0d23ac",
   "metadata": { "id": "KG__of2IfdHu", "papermill": { "duration": null, "end_time": null, "exception": null, "start_time": null, "status": "completed" }, "tags": [] },
   "outputs": [],
   "source": [
    "# Datasets created per year as lines\n",
    "plt.figure(figsize=(10, 6))\n",
    "plt.plot(yearly.index, yearly['English'], label='English', color='blue')\n",
    "plt.plot(yearly.index, yearly['Spanish'], label='Spanish', color='orange')\n",
    "\n",
    "plt.title('Evolution of English and Spanish Datasets Over Time')\n",
    "plt.xlabel('Year')\n",
    "plt.ylabel('Number of Datasets')\n",
    "plt.legend()\n",
    "plt.grid(True)\n",
    "plt.xticks(rotation=45)\n",
    "plt.tight_layout()\n",
    "plt.show()\n"
   ]
  }
 ],
 "metadata": { "accelerator": "GPU", "colab": { "gpuType": "T4", "provenance": [] }, "kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "name": "python", "version": "3.11.6" }, "papermill": { "default_parameters": {}, "duration": 0.047858, "end_time": "2024-05-15T09:04:29.634379", "environment_variables": {}, "exception": null, "input_path": "numero_datasets_hub.ipynb", "output_path": "numero_datasets_hub_output.ipynb", "parameters": {}, "start_time": "2024-05-15T09:04:29.586521", "version": "2.6.0" } },
 "nbformat": 4,
 "nbformat_minor": 5
}