#!/usr/bin/env python3
"""
AI News Summarizer

A script to fetch, summarize, and create reports on recent AI news articles
based on a specified topic.
"""

import argparse
import os
from datetime import date, timedelta

import pandas as pd
import requests
from huggingface_hub import HfApi, InferenceClient
from newspaper import Article
from tqdm.auto import tqdm


def parse_arguments():
    """Parse command line arguments."""
    parser = argparse.ArgumentParser(description='AI News Summarizer')
    parser.add_argument('--topic', type=str, default="Language Models",
                        help='Topic to search for news articles (default: "Language Models")')
    parser.add_argument('--num-articles', type=int, default=50,
                        help='Number of articles to fetch (default: 50)')
    parser.add_argument('--provider', type=str, default="fireworks-ai",
                        help='Inference provider for Hugging Face (default: "fireworks-ai")')
    parser.add_argument('--repo-id', type=str, default="lvwerra/news-reports",
                        help='Hugging Face repo ID to upload the report (default: "lvwerra/news-reports")')
    return parser.parse_args()


def main():
    # Parse arguments
    args = parse_arguments()

    # Environment variables and model selection
    HF_API_KEY = os.getenv("HF_API_KEY")
    MODEL = "Qwen/Qwen3-30B-A3B"

    # Initialize the inference client
    client = InferenceClient(provider=args.provider, api_key=HF_API_KEY)

    # Set topic and number of articles
    topic = args.topic
    num = args.num_articles

    # Configure tqdm for pandas
    tqdm.pandas(desc="")

    print(f"Fetching the top {num} articles on '{topic}' from the last day...")
    articles = fetch_news_articles(topic, num)
    df = pd.DataFrame.from_records(articles)

    print(f"Downloading and parsing {len(df)} articles...")
    df["content_full"] = df["url"].progress_apply(fetch_full_article)

    # Fall back to the short NewsAPI snippet where the full article could not be fetched
    mask = df['content_full'].str.contains("Failed to fetch article.", regex=False)
    df.loc[mask, 'content_full'] = df.loc[mask, 'content']

    print(f"Summarizing each article (total={len(df)})...")
    df["summary_raw"] = df["content_full"].progress_apply(lambda x: summarize(x, client, MODEL))
    # Drop the model's <think>...</think> block and keep only the answer
    df["summary_clean"] = df["summary_raw"].apply(
        lambda x: x.split("</think>")[1].strip() if "</think>" in x else x.strip()
    )

    print("Creating report...")
    df["article_summary"] = df.apply(format_summary, axis=1)

    sep = "\n\n" + "-" * 80 + "\n\n"
    overview = sep.join([f"Article: {i+1}\n{article}" for i, article in enumerate(df["article_summary"])])

    report = create_report(overview, client, MODEL, topic)

    # Extract the report content and assemble the final markdown document
    final_report = postprocess_report(report, overview, topic, num, MODEL)

    file_path = f"reports/{'-'.join(topic.lower().split())}/{date.today().strftime('%Y-%m-%d')}.md"
    print(f"Uploading to {args.repo_id} under {file_path}...")

    # Upload to the Hugging Face Hub
    hf_api = HfApi()
    hf_api.upload_file(
        path_or_fileobj=final_report.encode("utf-8"),
        path_in_repo=file_path,
        repo_id=args.repo_id,
        repo_type="space",
        token=HF_API_KEY,
    )
    print("Job finished!")


def fetch_news_articles(topic, num_articles=10):
    """Fetch news articles on the given topic from the NewsAPI 'everything' endpoint."""
    NEWS_API_KEY = os.getenv("NEWS_API_KEY")
    NEWS_ENDPOINT = 'https://newsapi.org/v2/everything'

    today = date.today().strftime('%Y-%m-%d')
    yesterday = (date.today() - timedelta(days=1)).strftime('%Y-%m-%d')

    params = {
        'q': topic,
        'from': yesterday,
        'to': today,
        'sortBy': 'popularity',
        'language': 'en',
        'pageSize': num_articles,
        'apiKey': NEWS_API_KEY
    }

    response = requests.get(NEWS_ENDPOINT, params=params)
    if response.status_code == 200:
        data = response.json()
        return data['articles']
    else:
        print(f"Error: {response.status_code}")
        print(response.text)
        return []


def fetch_full_article(url):
    """Fetch and parse the full content of an article with newspaper."""
    try:
        a = Article(url)
        a.download()
        a.parse()
        return a.text
    except Exception:
        return "Failed to fetch article."


def summarize(article, client, model):
    """Summarize an article using the Hugging Face Inference API."""
    user_msg = f"""\
Summarize the following news article in a few bullet points. \
Note that the reader is an expert in the field and wants only the most relevant and novel information and likes to know the specific details. \
So keep the summary brief but don't omit technical terms or specific information.

Article:
{article}

/no_think"""

    messages = [
        {
            "role": "user",
            "content": user_msg,
        }
    ]

    response = client.chat_completion(
        model=model,
        messages=messages,
        temperature=0.8,
        max_tokens=512,
    )
    return response.choices[0].message.content


def format_summary(row):
    """Format a single article's metadata and summary."""
    summary = f"""\
Title: **{row['title']}**
Published: {row['publishedAt']}
Description: {row['description']}
URL: {row['url']}
Summary:\n{row['summary_clean']}"""
    return summary


def create_report(articles_overview, client, model, topic):
    """Create a comprehensive report from all article summaries."""
    user_msg = f"""\
News articles:
{articles_overview}

===========================

Create a summary report of the newspaper articles above. Ignore everything that's not related to the topic '{topic}'.

Separate the report into categories, for example for AI it could be categories like:
- Breaking news: anything that can also appear below but is the most important news of the day
- Model news (e.g. new model releases, or insights about existing models etc.)
- Startups (e.g. new startups, fundraising etc.)
- Big Tech news (e.g. news from Google/Meta/OpenAI etc.)
- Policy (e.g. US administration or EU policy)
- Products (e.g. news of products that are powered by AI in some way)
- Miscellaneous (whatever doesn't fit into the others but is still relevant to the topic)

For other topics come up with other categories that make sense.

Style: The reader is an expert in the field and wants only the most relevant and novel information. \
Omit articles that are irrelevant to the field of AI and feel free to aggregate several articles about the same topic into one point.

Format: Use markdown formatting and add links at the end of each section linking to the original articles.

Example snippet:

# NEWS_SUMMARY

---

## **Breaking News**

- **Google and Apple in talks to integrate Gemini AI into Apple Intelligence by mid-2025**
  _[Apple Insider](https://appleinsider.com/articles/25/04/30/google-wants-gemini-ai-deal-with-apple-by-mid-2025), [The Verge](https://www.theverge.com/news/658770/google-gemini-apple-iphone-deal-ai)_
  - Google’s Gemini AI could enhance Siri with advanced reasoning and contextual capabilities, though Apple’s strict privacy controls may limit deep system access.
  - A potential deal could accelerate Apple’s AI development and expand Google’s AI reach.
- **Apple Vision Pro launch delayed**
  _[Six Colors](https://sixcolors.com/post/2025/04/apple-in-the-enterprise-the-complete-2025-commentary/)_
  - Apple’s mixed-reality headset, featuring advanced AI integration, is expected to arrive in 2025, though specifics remain unclear.

---

... followed by the other sections.
""" messages=[ { "role": "user", "content": user_msg, } ] response = client.chat_completion( model=model, messages=messages, temperature=0.8, max_tokens=32000, ) return response.choices[0].message.content def postprocess_report(report, summaries, topic, num_articles, model): report_summary = f"""\ # News Summary: {topic} **Period:** {(date.today() - timedelta(days=1)).strftime('%Y-%m-%d')} to {date.today().strftime('%Y-%m-%d')} | \ **Processed articles:** {num_articles} | **Model**: {model} """ report_content = report.split("")[1].strip() if "" in report else report.strip() report_thoughts = report.split("")[0].strip() if "" in report else "No thoughts." report_thoughts.replace("", "") final_report = f"""\ {report_content.replace('# NEWS_SUMMARY', report_summary)} ## Details ## Sources {summaries} ## Model reasoning {report_thoughts} """ return final_report if __name__ == "__main__": main()