"""Plot the cumulative number of English-only and Spanish-only Hub models.

The script lists models from the Hugging Face Hub (cached locally in a pickle
file for 24 hours), keeps the ones tagged with exactly one language, and
saves a stacked area chart of their cumulative monthly counts.
"""

import os
import pickle
from datetime import datetime

import matplotlib.pyplot as plt
import pandas as pd
from huggingface_hub import HfApi

# Define colors for each language
LANGUAGE_COLORS = {
    "english": "orange",
    "spanish": "blue",
}

# Hub tag that identifies each plotted language
LANGUAGE_TAGS = {
    "english": "language:en",
    "spanish": "language:es",
}

# A cache file younger than this many seconds (24 hours) is reused as-is
CACHE_MAX_AGE_SECONDS = 24 * 3600


def _is_monolingual(model, language_tag):
    """Return True if *language_tag* is the only ``language:`` tag on *model*."""
    # NOTE(review): `tags` is treated as possibly None so that a model without
    # tag data is simply excluded instead of raising -- confirm against the
    # huggingface_hub ModelInfo definition.
    language_tags = [
        tag for tag in (model.tags or []) if tag.startswith("language:")
    ]
    return bool(language_tags) and all(
        tag == language_tag for tag in language_tags
    )


def fetch_models(cache_file="models_cache.pkl"):
    """Fetch and filter models from the Hugging Face Hub, with caching.

    Args:
        cache_file: Path of the pickle file used as a cache. It is reused
            when it exists and was modified less than 24 hours ago;
            otherwise it is (re)written after a fresh fetch.

    Returns:
        A dict with the keys ``"english"`` and ``"spanish"``, each mapping to
        the list of models whose only ``language:`` tag is that language.
    """
    # Check if cached data exists and is less than 24 hours old
    if os.path.exists(cache_file):
        cache_age = datetime.now().timestamp() - os.path.getmtime(cache_file)
        if cache_age < CACHE_MAX_AGE_SECONDS:
            print("Loading models from cache...")
            # SECURITY: pickle.load executes arbitrary code embedded in the
            # file; only point cache_file at a file this script wrote itself.
            with open(cache_file, "rb") as f:
                return pickle.load(f)
        print("Cache is older than 24 hours, fetching fresh data...")
    else:
        print("No cache found, fetching models from Hugging Face Hub...")

    hf_api = HfApi()
    all_models = list(hf_api.list_models(full=True))

    # Filter models by language
    filtered_models = {
        lang: [m for m in all_models if _is_monolingual(m, tag)]
        for lang, tag in LANGUAGE_TAGS.items()
    }

    # Cache the filtered models
    print("Saving models to cache...")
    with open(cache_file, "wb") as f:
        pickle.dump(filtered_models, f)

    return filtered_models


def create_stack_area_plot(models, output_dir):
    """Create a stacked area plot for English and Spanish models.

    Models are counted per calendar month of their ``created_at`` date and
    the counts are accumulated over time. The figure is written to
    ``models_stack_area_en_es.png`` inside *output_dir*.

    Args:
        models: Dict with ``"english"`` and ``"spanish"`` keys, each a list
            of objects exposing a ``created_at`` datetime.
        output_dir: Existing directory the PNG is saved into.

    Returns:
        None. When no model has a creation date, a message is printed and no
        file is written.
    """
    languages = ["english", "spanish"]

    # Prepare data for all languages. Models without a creation date cannot
    # be placed on the time axis, so they are left out of the plot.
    dates_by_lang = {
        lang: [
            m.created_at.date()
            for m in models[lang]
            if m.created_at is not None
        ]
        for lang in languages
    }
    all_dates = [d for dates in dates_by_lang.values() for d in dates]

    if not all_dates:
        print("No models found for any language. Skipping plot creation.")
        return

    # Create a common month-start date range for all languages. The range
    # must begin on the first day of the earliest month: with freq="MS",
    # date_range only yields month starts >= start, so starting at the raw
    # minimum date would drop the whole first month's bin and its models
    # would be missing from every cumulative total.
    first_month = min(all_dates).replace(day=1)
    date_range = pd.date_range(start=first_month, end=max(all_dates), freq="MS")

    # Create separate DataFrames for each language
    dfs = {}
    for lang in languages:
        dates = dates_by_lang[lang]
        if dates:
            df = pd.DataFrame({"Date": pd.to_datetime(dates)})
            df["Count"] = 1
            # Reindex to common date range and fill missing values with 0
            df_grouped = df.groupby(pd.Grouper(key="Date", freq="MS")).sum()
            df_grouped = df_grouped.reindex(date_range, fill_value=0)
        else:
            # No dated models for this language: contribute a flat zero band
            df_grouped = pd.DataFrame({"Count": 0}, index=date_range)
        dfs[lang] = df_grouped.cumsum()

    # Plot stacked area for English and Spanish
    plt.figure(figsize=(10, 6))
    plt.stackplot(
        date_range,
        [dfs[lang]["Count"].values for lang in languages],
        labels=["English", "Spanish"],
        colors=[LANGUAGE_COLORS[lang] for lang in languages],
    )
    plt.xlabel("Date", fontsize=10)
    plt.ylabel("Cumulative Number of Models", fontsize=10)
    plt.xticks(rotation=45, fontsize=10)
    plt.legend(loc="upper left")
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, "models_stack_area_en_es.png"))
    plt.close()


def main():
    """Fetch the models, print per-language counts, and save the plot."""
    # Create output directory if it doesn't exist
    output_dir = "plots"
    os.makedirs(output_dir, exist_ok=True)

    # Fetch models
    print("Fetching models from Hugging Face Hub...")
    models = fetch_models()

    # Print model counts
    print("\nModel counts:")
    for lang, models_list in models.items():
        print(f"{lang.capitalize()}: {len(models_list)}")

    # Create visualization
    print("\nCreating stack area plot...")
    create_stack_area_plot(models, output_dir)
    print(f"Plot has been saved to the '{output_dir}' directory")


if __name__ == "__main__":
    main()