{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9b494ecb",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "bCPvBCk_VLoi",
    "outputId": "48174b27-072f-4cf9-bfcc-2a7cb12f60ba",
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Install the Hugging Face Hub client used to list datasets.\n",
    "# %pip (instead of !pip) installs into the environment of the running kernel,\n",
    "# so the package is importable from the next cell.\n",
    "%pip install huggingface_hub"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d736660e",
   "metadata": {
    "id": "NbQeXxudVJW9",
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "from datetime import datetime\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "from huggingface_hub import HfApi\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8dc1a8d8",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "ogyTHBYJVZ8I",
    "outputId": "f23a554a-7328-4e50-d87c-90368294467d",
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Count every dataset returned by the Hub listing.\n",
    "hf_api = HfApi()\n",
    "all_datasets = hf_api.list_datasets(full=True)\n",
    "\n",
    "# Consuming the listing here exhausts it; later cells request a fresh one.\n",
    "total_count = sum(1 for _ in all_datasets)\n",
    "print(total_count)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "299e6d56",
   "metadata": {
    "id": "GXDMUU-4XmaI",
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# language_filter = filter(lambda dataset: 'language:es' in dataset.tags, all_datasets) # 882\n",
    "\n",
    "# spanish_filter = filter(lambda d: \"language:es\" in d.tags and not any(tag.startswith(\"language:\") and tag != \"language:es\" for tag in d.tags), all_datasets) # 317\n",
    "\n",
    "#filtered_datasets_2 = filter(lambda dataset: \"es\" in dataset.card_data.language, all_datasets) # 882\n",
    "\n",
    "#filtered_datasets_3 = filter(lambda dataset: dataset.card_data.language == [\"es\"], all_datasets) #\n",
    "\n",
    "#for dataset in spanish_only_datasets:\n",
    "#    print(dataset)\n",
    "#    break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "691d8f3a",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "pjCvHVq_hChx",
    "outputId": "d279d0bc-a3c6-4994-f23c-a7274b1f4ee8",
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Monolingual Spanish datasets: tagged language:es and carrying no other language:* tag.\n",
    "hf_api = HfApi()\n",
    "all_datasets = hf_api.list_datasets(full=True)\n",
    "\n",
    "spanish_filter = (\n",
    "    d\n",
    "    for d in all_datasets\n",
    "    if \"language:es\" in d.tags\n",
    "    and all(tag == \"language:es\" or not tag.startswith(\"language:\") for tag in d.tags)\n",
    ")  # 317\n",
    "spanish_datasets = list(spanish_filter)\n",
    "spanish_count = len(spanish_datasets)\n",
    "print(spanish_count)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c9676c89",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "WANGkTpGRw8t",
    "outputId": "da8931bf-7ae2-438d-8188-20190f568193",
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Monolingual English datasets: tagged language:en and carrying no other language:* tag.\n",
    "hf_api = HfApi()\n",
    "all_datasets = hf_api.list_datasets(full=True)\n",
    "\n",
    "english_filter = (\n",
    "    d\n",
    "    for d in all_datasets\n",
    "    if \"language:en\" in d.tags\n",
    "    and all(tag == \"language:en\" or not tag.startswith(\"language:\") for tag in d.tags)\n",
    ")\n",
    "english_datasets = list(english_filter)\n",
    "english_count = len(english_datasets)\n",
    "print(english_count)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bf300ce6",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "yPtF0G7SWS53",
    "outputId": "a2a51160-c803-4e7f-a6dc-8879eea1dd69",
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Monolingual Chinese datasets: tagged language:zh and carrying no other language:* tag.\n",
    "hf_api = HfApi()\n",
    "all_datasets = hf_api.list_datasets(full=True)\n",
    "\n",
    "chinese_filter = (\n",
    "    d\n",
    "    for d in all_datasets\n",
    "    if \"language:zh\" in d.tags\n",
    "    and all(tag == \"language:zh\" or not tag.startswith(\"language:\") for tag in d.tags)\n",
    ")\n",
    "chinese_datasets = list(chinese_filter)\n",
    "chinese_count = len(chinese_datasets)\n",
    "print(chinese_count)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "407c46fc",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "RlxAlOOsW7p9",
    "outputId": "f1c12edd-5502-4018-b9a7-149f9fc29322",
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Monolingual French datasets: tagged language:fr and carrying no other language:* tag.\n",
    "hf_api = HfApi()\n",
    "all_datasets = hf_api.list_datasets(full=True)\n",
    "\n",
    "french_filter = (\n",
    "    d\n",
    "    for d in all_datasets\n",
    "    if \"language:fr\" in d.tags\n",
    "    and all(tag == \"language:fr\" or not tag.startswith(\"language:\") for tag in d.tags)\n",
    ")\n",
    "french_datasets = list(french_filter)\n",
    "french_count = len(french_datasets)\n",
    "print(french_count)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a7d82d5d",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "OMQfBXjUYBPz",
    "outputId": "8cd3fdb9-0bc8-4d82-d25b-fb9eef7118ed",
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Monolingual datasets of any language: exactly one language:* tag.\n",
    "hf_api = HfApi()\n",
    "all_datasets = hf_api.list_datasets(full=True)\n",
    "\n",
    "mono_filter = (\n",
    "    dataset\n",
    "    for dataset in all_datasets\n",
    "    if len([tag for tag in dataset.tags if tag.startswith('language:')]) == 1\n",
    ")\n",
    "mono_datasets = list(mono_filter)\n",
    "mono_count = len(mono_datasets)\n",
    "print(mono_count)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6dc0ac68",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 180
    },
    "id": "sTPechkdWmYS",
    "outputId": "bb49f9f4-150b-4a29-d58e-faff4f88cce3",
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Extract the creation date of every monolingual Spanish and English dataset.\n",
    "\n",
    "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
    "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
    "\n",
    "# The Hub listing is live data, so the totals change between runs. Check only\n",
    "# that both groups are non-empty instead of pinning counts from one snapshot.\n",
    "assert creation_dates_spanish, \"no monolingual Spanish datasets were found\"\n",
    "assert creation_dates_english, \"no monolingual English datasets were found\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "57d206ec",
   "metadata": {
    "id": "hefZVynDSjjE",
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Show the first extracted Spanish creation date as a quick look at the values.\n",
    "print(creation_dates_spanish[0])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b80e411d",
   "metadata": {
    "id": "aFaEBlkkSbrs",
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "## Bar Chart\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "96652421",
   "metadata": {
    "id": "dYJ2zd4dShYh",
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "from collections import Counter\n",
    "from pathlib import Path\n",
    "\n",
    "# Creation dates of the monolingual English and Spanish datasets\n",
    "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
    "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
    "\n",
    "# Count datasets per creation year (a Counter returns 0 for a year with no datasets)\n",
    "years = sorted(set(date.year for date in creation_dates_english + creation_dates_spanish))\n",
    "english_counts = Counter(date.year for date in creation_dates_english)\n",
    "spanish_counts = Counter(date.year for date in creation_dates_spanish)\n",
    "\n",
    "# Plotting the bar chart: Spanish bars are stacked on top of the English bars\n",
    "plt.figure(figsize=(10, 6))\n",
    "plt.bar(years, [english_counts[year] for year in years], width=0.4, label='English Datasets', color='blue')\n",
    "plt.bar(years, [spanish_counts[year] for year in years], width=0.4, label='Spanish Datasets', color='orange', bottom=[english_counts[year] for year in years])\n",
    "\n",
    "# Adding labels and title\n",
    "plt.xlabel('Year')\n",
    "plt.ylabel('Number of Datasets')\n",
    "plt.title('Distribution of Monolingual English and Spanish Datasets by Year')\n",
    "plt.xticks(years)\n",
    "plt.legend()\n",
    "plt.grid(True)\n",
    "plt.tight_layout()\n",
    "\n",
    "# Save before plt.show(): once the figure has been shown it is no longer the\n",
    "# current figure, so a later savefig() would write a blank image.\n",
    "# The output folder is created first so the save cannot fail on a fresh checkout.\n",
    "Path(\"plots\").mkdir(parents=True, exist_ok=True)\n",
    "plt.savefig(\"plots/bar_stack.png\")\n",
    "\n",
    "# Display the plot\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2d1ae015",
   "metadata": {
    "id": "wViEE4wCUVgs",
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "from collections import Counter\n",
    "from pathlib import Path\n",
    "\n",
    "# Creation dates of the monolingual English and Spanish datasets\n",
    "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
    "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
    "\n",
    "# Count datasets per creation year (a Counter returns 0 for a year with no datasets)\n",
    "years = sorted(set(date.year for date in creation_dates_english + creation_dates_spanish))\n",
    "english_counts = Counter(date.year for date in creation_dates_english)\n",
    "spanish_counts = Counter(date.year for date in creation_dates_spanish)\n",
    "\n",
    "# Define the width of each bar\n",
    "bar_width = 0.4\n",
    "\n",
    "# One integer slot per year; each language is shifted half a bar width to one side\n",
    "years_index = np.arange(len(years))\n",
    "\n",
    "# Plotting the side-by-side bar chart\n",
    "plt.figure(figsize=(10, 6))\n",
    "plt.bar(years_index - bar_width/2, [english_counts[year] for year in years], width=bar_width, label='English Datasets', color='blue')\n",
    "plt.bar(years_index + bar_width/2, [spanish_counts[year] for year in years], width=bar_width, label='Spanish Datasets', color='orange')\n",
    "\n",
    "# Adding labels and title\n",
    "plt.xlabel('Year')\n",
    "plt.ylabel('Number of Datasets')\n",
    "plt.title('Distribution of Monolingual English and Spanish Datasets by Year')\n",
    "plt.xticks(years_index, years)\n",
    "plt.legend()\n",
    "plt.grid(True)\n",
    "plt.tight_layout()\n",
    "\n",
    "# Save before plt.show(): once the figure has been shown it is no longer the\n",
    "# current figure, so a later savefig() would write a blank image.\n",
    "# The output folder is created first so the save cannot fail on a fresh checkout.\n",
    "Path(\"plots\").mkdir(parents=True, exist_ok=True)\n",
    "plt.savefig(\"plots/bar_width.png\")\n",
    "\n",
    "# Display the plot\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cddf7237",
   "metadata": {
    "id": "Hp8vNA6LUA1E",
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "## Stacked Area Chart\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "68255399",
   "metadata": {
    "id": "CWgCunzGUCot",
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "from collections import Counter\n",
    "from pathlib import Path\n",
    "\n",
    "# Creation dates of the monolingual English and Spanish datasets\n",
    "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
    "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
    "\n",
    "# Count datasets per creation year (a Counter returns 0 for a year with no datasets)\n",
    "years = sorted(set(date.year for date in creation_dates_english + creation_dates_spanish))\n",
    "english_counts = Counter(date.year for date in creation_dates_english)\n",
    "spanish_counts = Counter(date.year for date in creation_dates_spanish)\n",
    "\n",
    "# Calculate cumulative counts: start from the yearly totals, then add each\n",
    "# previous running total in place\n",
    "english_datasets_cumulative = [english_counts[year] for year in years]\n",
    "spanish_datasets_cumulative = [spanish_counts[year] for year in years]\n",
    "for i in range(1, len(years)):\n",
    "    english_datasets_cumulative[i] += english_datasets_cumulative[i-1]\n",
    "    spanish_datasets_cumulative[i] += spanish_datasets_cumulative[i-1]\n",
    "\n",
    "# Plotting the stacked area chart\n",
    "plt.figure(figsize=(10, 6))\n",
    "plt.stackplot(years, english_datasets_cumulative, spanish_datasets_cumulative, labels=['English Datasets', 'Spanish Datasets'], colors=['blue', 'orange'])\n",
    "\n",
    "# Adding labels and title\n",
    "plt.xlabel('Year')\n",
    "plt.ylabel('Cumulative Number of Datasets')\n",
    "plt.title('Cumulative Growth of Monolingual English and Spanish Datasets Over Time')\n",
    "plt.xticks(years)\n",
    "plt.legend(loc='upper left')\n",
    "plt.grid(True)\n",
    "plt.tight_layout()\n",
    "\n",
    "# Save before plt.show(): once the figure has been shown it is no longer the\n",
    "# current figure, so a later savefig() would write a blank image.\n",
    "# The output folder is created first so the save cannot fail on a fresh checkout.\n",
    "Path(\"plots\").mkdir(parents=True, exist_ok=True)\n",
    "plt.savefig(\"plots/stack_area_1.png\")\n",
    "\n",
    "# Display the plot\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4ba74cf5",
   "metadata": {
    "id": "GwRpZwYWhau3",
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "from collections import Counter\n",
    "from pathlib import Path\n",
    "\n",
    "# Creation dates of the monolingual English and Spanish datasets\n",
    "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
    "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
    "\n",
    "# Reduce each creation date to its (year, month) pair\n",
    "months_english = [(date.year, date.month) for date in creation_dates_english]\n",
    "months_spanish = [(date.year, date.month) for date in creation_dates_spanish]\n",
    "\n",
    "# Count the datasets created in each (year, month)\n",
    "english_counts = Counter(months_english)\n",
    "spanish_counts = Counter(months_spanish)\n",
    "\n",
    "# Create a DataFrame for English datasets, indexed and sorted by (Year, Month)\n",
    "df_english = pd.DataFrame.from_dict(english_counts, orient='index', columns=['English'])\n",
    "df_english.index = pd.MultiIndex.from_tuples(df_english.index, names=['Year', 'Month'])\n",
    "df_english = df_english.sort_index()\n",
    "\n",
    "# Create a DataFrame for Spanish datasets, indexed and sorted by (Year, Month)\n",
    "df_spanish = pd.DataFrame.from_dict(spanish_counts, orient='index', columns=['Spanish'])\n",
    "df_spanish.index = pd.MultiIndex.from_tuples(df_spanish.index, names=['Year', 'Month'])\n",
    "df_spanish = df_spanish.sort_index()\n",
    "\n",
    "# Merge the DataFrames; a month present for only one language gets 0 for the other\n",
    "df = pd.merge(df_english, df_spanish, how='outer', left_index=True, right_index=True).fillna(0)\n",
    "\n",
    "# Convert the (Year, Month) index to datetimes parsed from 'year-month' strings\n",
    "df.index = pd.to_datetime(df.index.map(lambda x: f'{x[0]}-{x[1]}'))\n",
    "\n",
    "# Calculate cumulative sum\n",
    "df_cumulative = df.cumsum()\n",
    "\n",
    "# Plotting the stacked area chart\n",
    "plt.figure(figsize=(8, 5))\n",
    "plt.stackplot(df_cumulative.index, df_cumulative['English'], df_cumulative['Spanish'], labels=['English', 'Spanish'], colors=['orange', 'blue'])\n",
    "\n",
    "# Adding labels and title\n",
    "plt.xlabel('Creation date')\n",
    "plt.ylabel('Cumulative number of monolingual datasets')\n",
    "plt.title('Cumulative growth of monolingual English and Spanish datasets in the Hugging Face Hub over time')\n",
    "\n",
    "plt.xticks(rotation=45)\n",
    "plt.legend(loc='upper left')\n",
    "plt.grid(False)\n",
    "plt.tight_layout()\n",
    "\n",
    "# Save before plt.show(): once the figure has been shown it is no longer the\n",
    "# current figure, so a later savefig() would write a blank image.\n",
    "# The output folder is created first so the save cannot fail on a fresh checkout.\n",
    "Path(\"plots\").mkdir(parents=True, exist_ok=True)\n",
    "plt.savefig(\"plots/stack_area_2.png\")\n",
    "\n",
    "# Display the plot\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d96225ce",
   "metadata": {
    "id": "kJQ0OgRtglOQ",
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "from collections import Counter\n",
    "from pathlib import Path\n",
    "\n",
    "# Creation dates of the monolingual English and Spanish datasets\n",
    "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
    "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
    "\n",
    "# Reduce each creation date to its (year, month) pair\n",
    "months_english = [(date.year, date.month) for date in creation_dates_english]\n",
    "months_spanish = [(date.year, date.month) for date in creation_dates_spanish]\n",
    "\n",
    "# Count the datasets created in each (year, month)\n",
    "english_counts = Counter(months_english)\n",
    "spanish_counts = Counter(months_spanish)\n",
    "\n",
    "# Create a DataFrame for English datasets, indexed and sorted by (Year, Month)\n",
    "df_english = pd.DataFrame.from_dict(english_counts, orient='index', columns=['English'])\n",
    "df_english.index = pd.MultiIndex.from_tuples(df_english.index, names=['Year', 'Month'])\n",
    "df_english = df_english.sort_index()\n",
    "\n",
    "# Create a DataFrame for Spanish datasets, indexed and sorted by (Year, Month)\n",
    "df_spanish = pd.DataFrame.from_dict(spanish_counts, orient='index', columns=['Spanish'])\n",
    "df_spanish.index = pd.MultiIndex.from_tuples(df_spanish.index, names=['Year', 'Month'])\n",
    "df_spanish = df_spanish.sort_index()\n",
    "\n",
    "# Merge the DataFrames; a month present for only one language gets 0 for the other\n",
    "df = pd.merge(df_english, df_spanish, how='outer', left_index=True, right_index=True).fillna(0)\n",
    "\n",
    "# Convert the (Year, Month) index to datetimes parsed from 'year-month' strings\n",
    "df.index = pd.to_datetime(df.index.map(lambda x: f'{x[0]}-{x[1]}'))\n",
    "\n",
    "# Plotting the stacked area chart of the per-month counts (no cumulative sum is applied here)\n",
    "plt.figure(figsize=(10, 6))\n",
    "plt.stackplot(df.index, df['English'], df['Spanish'], labels=['English Datasets', 'Spanish Datasets'], colors=['blue', 'orange'])\n",
    "\n",
    "# Adding labels and title; they describe monthly counts because that is what is plotted\n",
    "plt.xlabel('Date')\n",
    "plt.ylabel('Number of Datasets Created per Month')\n",
    "plt.title('Monthly Creation of Monolingual English and Spanish Datasets Over Time')\n",
    "\n",
    "plt.xticks(rotation=45)\n",
    "plt.legend(loc='upper left')\n",
    "plt.grid(True)\n",
    "plt.tight_layout()\n",
    "\n",
    "# Save before plt.show(): once the figure has been shown it is no longer the\n",
    "# current figure, so a later savefig() would write a blank image.\n",
    "# The output folder is created first so the save cannot fail on a fresh checkout.\n",
    "Path(\"plots\").mkdir(parents=True, exist_ok=True)\n",
    "plt.savefig(\"plots/stack_area_3.png\")\n",
    "\n",
    "# Display the plot\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7bbec0ac",
   "metadata": {
    "id": "IAnFHiPlgnRE",
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "## Pie Chart"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7c3dd684",
   "metadata": {
    "id": "8tKR1x-kVeZT",
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "from collections import Counter\n",
    "from pathlib import Path\n",
    "\n",
    "# Calculate the count of \"other\" datasets: monolingual datasets that are not\n",
    "# English, Spanish, Chinese or French\n",
    "other_count = mono_count - (english_count + spanish_count + chinese_count + french_count)\n",
    "\n",
    "# Pie chart data (labels and sizes share the same order)\n",
    "labels = ['English', 'Chinese', 'French', 'Spanish', 'Other']\n",
    "sizes = [english_count, chinese_count, french_count, spanish_count, other_count]\n",
    "\n",
    "# Plotting the pie chart\n",
    "plt.figure(figsize=(8, 8))\n",
    "plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=180, colors=['blue', 'red', 'green', 'orange', 'purple'])\n",
    "plt.title('Distribution of Monolingual Datasets by Language')\n",
    "plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.\n",
    "\n",
    "# Save before plt.show(): once the figure has been shown it is no longer the\n",
    "# current figure, so a later savefig() would write a blank image.\n",
    "# The output folder is created first so the save cannot fail on a fresh checkout.\n",
    "Path(\"plots\").mkdir(parents=True, exist_ok=True)\n",
    "plt.savefig(\"plots/pie_chart.png\")\n",
    "\n",
    "# Display the plot\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "11c1c9c8",
   "metadata": {
    "id": "z2xf8FrHROxy",
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "## Time Series Plot"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1bb6a676",
   "metadata": {
    "id": "DuPFSZKUhyQj",
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Running total of monolingual Spanish datasets, bucketed by month start.\n",
    "# One row per dataset with Count=1, dates converted to datetimes, then summed\n",
    "# per month and accumulated.\n",
    "df = (\n",
    "    pd.DataFrame(creation_dates_spanish, columns=[\"Date\"])\n",
    "    .assign(Count=1, Date=lambda frame: pd.to_datetime(frame[\"Date\"]))\n",
    "    .groupby(pd.Grouper(key=\"Date\", freq=\"MS\"))\n",
    "    .sum()\n",
    "    .cumsum()\n",
    ")\n",
    "\n",
    "# Draw the cumulative curve (point markers stay disabled: marker=\"o\")\n",
    "plt.figure(figsize=(10, 6))\n",
    "plt.plot(df.index, df[\"Count\"], color=\"g\")\n",
    "\n",
    "# Title and axis labels\n",
    "plt.title(\"Evolución de bases de datos monolingües en español\")\n",
    "plt.xlabel(\"Fecha\")\n",
    "plt.ylabel(\"Número de bases de datos\")\n",
    "\n",
    "# Layout and display\n",
    "plt.grid(True)\n",
    "plt.xticks(rotation=45)\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2fc77d7f",
   "metadata": {
    "id": "-Vu3PIe2hITq",
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "from collections import Counter\n",
    "\n",
    "\n",
    "def _monthly_frame(counts, column):\n",
    "    \"\"\"Turn a {(year, month): n} counter into a one-column frame sorted by (Year, Month).\"\"\"\n",
    "    frame = pd.DataFrame.from_dict(counts, orient='index', columns=[column])\n",
    "    frame.index = pd.MultiIndex.from_tuples(frame.index, names=['Year', 'Month'])\n",
    "    return frame.sort_index()\n",
    "\n",
    "\n",
    "# Creation dates of both monolingual dataset groups\n",
    "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
    "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
    "\n",
    "# Tally the datasets created in each (year, month)\n",
    "english_counts = Counter((day.year, day.month) for day in creation_dates_english)\n",
    "spanish_counts = Counter((day.year, day.month) for day in creation_dates_spanish)\n",
    "\n",
    "# One frame per language\n",
    "df_english = _monthly_frame(english_counts, 'English')\n",
    "df_spanish = _monthly_frame(spanish_counts, 'Spanish')\n",
    "\n",
    "# Outer join on (Year, Month); a month missing for one language counts as 0\n",
    "df = df_english.merge(df_spanish, how='outer', left_index=True, right_index=True).fillna(0)\n",
    "\n",
    "# Replace the (Year, Month) index with datetimes parsed from 'year-month' strings\n",
    "df.index = pd.to_datetime(df.index.map(lambda ym: f'{ym[0]}-{ym[1]}'))\n",
    "\n",
    "# Running totals per language\n",
    "df_cumulative = df.cumsum()\n",
    "\n",
    "# One cumulative line per language\n",
    "plt.figure(figsize=(10, 6))\n",
    "for column, colour in (('English', 'blue'), ('Spanish', 'orange')):\n",
    "    plt.plot(df_cumulative.index, df_cumulative[column], label=column, color=colour)\n",
    "\n",
    "# Axis labels and title\n",
    "plt.xlabel('Date')\n",
    "plt.ylabel('Cumulative Number of Datasets')\n",
    "plt.title('Cumulative Growth of Monolingual English and Spanish Datasets Over Time')\n",
    "\n",
    "# Layout and display\n",
    "plt.xticks(rotation=45)\n",
    "plt.legend(loc='upper left')\n",
    "plt.grid(True)\n",
    "plt.tight_layout()\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6c0d23ac",
   "metadata": {
    "id": "KG__of2IfdHu",
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "from collections import Counter\n",
    "\n",
    "# Creation dates of both monolingual dataset groups\n",
    "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
    "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
    "\n",
    "# Yearly tallies; the x axis covers every year seen in either language\n",
    "english_counts = Counter(day.year for day in creation_dates_english)\n",
    "spanish_counts = Counter(day.year for day in creation_dates_spanish)\n",
    "years = sorted(english_counts.keys() | spanish_counts.keys())\n",
    "\n",
    "# One series per language, aligned on the shared list of years\n",
    "# (a Counter returns 0 for a year with no datasets)\n",
    "english_series = pd.Series([english_counts[year] for year in years], index=years)\n",
    "spanish_series = pd.Series([spanish_counts[year] for year in years], index=years)\n",
    "\n",
    "# One line per language\n",
    "plt.figure(figsize=(10, 6))\n",
    "for series, label, colour in (\n",
    "    (english_series, 'English', 'blue'),\n",
    "    (spanish_series, 'Spanish', 'orange'),\n",
    "):\n",
    "    plt.plot(series.index, series.values, label=label, color=colour)\n",
    "\n",
    "# Title, axis labels and layout\n",
    "plt.title('Evolution of English and Spanish Datasets Over Time')\n",
    "plt.xlabel('Year')\n",
    "plt.ylabel('Number of Datasets')\n",
    "plt.legend()\n",
    "plt.grid(True)\n",
    "plt.xticks(rotation=45)\n",
    "plt.tight_layout()\n",
    "plt.show()\n"
   ]
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "gpuType": "T4",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.11.6"
  },
  "papermill": {
   "default_parameters": {},
   "duration": 0.047858,
   "end_time": "2024-05-15T09:04:29.634379",
   "environment_variables": {},
   "exception": null,
   "input_path": "numero_datasets_hub.ipynb",
   "output_path": "numero_datasets_hub_output.ipynb",
   "parameters": {},
   "start_time": "2024-05-15T09:04:29.586521",
   "version": "2.6.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}