|
|
|
""" |
|
AI News Summarizer |
|
|
|
A script to fetch, summarize, and create reports on recent AI news articles based on a specified topic. |
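
Example invocation (assumes NEWS_API_KEY and HF_API_KEY are set in the environment;

the script file name used below is just a placeholder for wherever this file is saved):

    python ai_news_summarizer.py --topic "Language Models" --num-articles 25 --provider fireworks-ai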
|
""" |
|
|
|
import argparse |
|
from huggingface_hub import HfApi, InferenceClient |
|
from newspaper import Article |
|
import pandas as pd |
|
import requests |
|
from datetime import date, timedelta |
|
import json |
|
import os |
|
from tqdm.auto import tqdm |
|
|
|
def parse_arguments(): |
|
"""Parse command line arguments""" |
|
parser = argparse.ArgumentParser(description='AI News Summarizer') |
|
parser.add_argument('--topic', type=str, default="Language Models", |
|
help='Topic to search for news articles (default: "Language Models")') |
|
parser.add_argument('--num-articles', type=int, default=50, |
|
help='Number of articles to fetch (default: 50)') |
|
parser.add_argument('--provider', type=str, default="fireworks-ai", |
|
help='Inference provider for HuggingFace (default: "fireworks-ai")') |
|
parser.add_argument('--repo-id', type=str, default="lvwerra/news-reports", |
|
help='HuggingFace repo ID to upload the report (default: "lvwerra/news-reports")') |
|
|
|
args = parser.parse_args() |
|
return args |
|
|
|
def main(): |
|
|
|
args = parse_arguments() |
|
|
|
|
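    # Credentials are read from the environment: NEWS_API_KEY for newsapi.org and
    # HF_API_KEY for the Hugging Face Inference API and Hub upload.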
|
NEWS_API_KEY = os.getenv("NEWS_API_KEY") |
|
HF_API_KEY = os.getenv("HF_API_KEY") |
|
NEWS_ENDPOINT = 'https://newsapi.org/v2/everything' |
|
MODEL = "Qwen/Qwen3-30B-A3B" |
|
|
|
|
|
client = InferenceClient(provider=args.provider, api_key=HF_API_KEY) |
|
|
|
|
|
topic = args.topic |
|
num = args.num_articles |
|
|
|
|
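    # Register tqdm with pandas so the .progress_apply calls below show progress bars.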
|
tqdm.pandas(desc="") |
|
|
|
print(f"Fetching top {num} articles on '{topic}' of today...") |
|
articles = fetch_news_articles(topic, num) |
|
df = pd.DataFrame.from_records(articles) |
|
|
|
print(f"Downloading and parsing {len(df)} articles...") |
|
df["content_full"] = df["url"].progress_apply(fetch_full_article) |
|
    # Fall back to the NewsAPI snippet wherever the full-article download failed.
    mask = df['content_full'] == "Failed to fetch article."
|
df.loc[mask, 'content_full'] = df.loc[mask, 'content'] |
|
|
|
print(f"Summarizing each article (total={len(df)})...") |
|
df["summary_raw"] = df["content_full"].progress_apply(lambda x: summarize(x, client, MODEL)) |
|
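    # The model may wrap its reasoning in <think>...</think>; keep only the text after the closing tag.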
df["summary_clean"] = df["summary_raw"].apply(lambda x: x.split("</think>")[1].strip() if "</think>" in x else x.strip()) |
|
|
|
print(f"Create report...") |
|
df["article_summary"] = df.apply(format_summary, axis=1) |
|
|
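    # Stitch the per-article summaries into one overview string; it feeds both the
    # report prompt and the "Sources" section of the final report.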
|
sep = "\n\n" + "-"*80 + "\n\n" |
|
    overview = sep.join([f"Article {i+1}:\n{article}" for i, article in enumerate(df["article_summary"])])
|
report = create_report(overview, client, MODEL, topic) |
|
|
|
|
|
final_report = postprocess_report(report, overview, topic, num, MODEL) |
|
|
|
file_path = f"reports/{'-'.join(topic.lower().split())}/{date.today().strftime('%Y-%m-%d')}.md" |
|
print(f"Uploading to {args.repo_id} under {file_path}...") |
|
|
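    # Push the rendered markdown into the target repo (a Hugging Face Space), one file per topic and day.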
|
hf_api = HfApi() |
|
hf_api.upload_file( |
|
path_or_fileobj=final_report.encode("utf-8"), |
|
path_in_repo=file_path, |
|
repo_id=args.repo_id, |
|
repo_type="space", |
|
token=HF_API_KEY, |
|
) |
|
|
|
print("Job finished!") |
|
|
|
def fetch_news_articles(topic, num_articles=10): |
|
"""Fetch news articles on the given topic""" |
|
NEWS_API_KEY = os.getenv("NEWS_API_KEY") |
|
NEWS_ENDPOINT = 'https://newsapi.org/v2/everything' |
|
|
|
today = date.today().strftime('%Y-%m-%d') |
|
yesterday = (date.today() - timedelta(days=1)).strftime('%Y-%m-%d') |
|
|
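    # Query NewsAPI's /v2/everything endpoint for English articles from the last day,
    # sorted by popularity and capped at num_articles results.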
|
params = { |
|
'q': topic, |
|
'from': yesterday, |
|
'to': today, |
|
'sortBy': 'popularity', |
|
'language': 'en', |
|
'pageSize': num_articles, |
|
'apiKey': NEWS_API_KEY |
|
} |
|
|
|
response = requests.get(NEWS_ENDPOINT, params=params) |
|
|
|
if response.status_code == 200: |
|
data = response.json() |
|
return data['articles'] |
|
else: |
|
print(f"Error: {response.status_code}") |
|
print(response.text) |
|
return [] |
|
|
|
def fetch_full_article(url): |
|
"""Fetch and parse the full content of an article""" |
|
try: |
|
a = Article(url) |
|
a.download() |
|
a.parse() |
|
return a.text |
|
    except Exception:

        return "Failed to fetch article."
|
|
|
def summarize(article, client, model): |
|
"""Summarize an article using the HuggingFace inference API""" |
|
user_msg = f"""\ |
|
Summarize the following news article in a few bullet points. \ |
|
Note that the reader is an expert in the field who wants only the most relevant and novel information, including specific details. \

So keep the summary brief, but don't omit technical terms or specific information.
|
|
|
Article: |
|
{article} |
|
|
|
/no_think""" |
|
|
|
messages=[ |
|
{ |
|
"role": "user", |
|
"content": user_msg, |
|
} |
|
] |
|
|
|
response = client.chat_completion( |
|
model=model, |
|
messages=messages, |
|
temperature=0.8, |
|
max_tokens=512, |
|
) |
|
|
|
return response.choices[0].message.content |
|
|
|
def format_summary(row): |
|
"""Format article summary""" |
|
summary = f"""\ |
|
Title: **{row['title']}** |
|
|
|
Published: {row['publishedAt']} |
|
|
|
Description: {row['description']} |
|
|
|
URL: {row['url']} |
|
|
|
Summary:\n{row['summary_clean']}""" |
|
return summary |
|
|
|
def create_report(articles_overview, client, model, topic): |
|
"""Create a comprehensive report from all article summaries""" |
|
user_msg = f"""\ |
|
News articles:
|
{articles_overview} |
|
|
|
=========================== |
|
|
|
Create a summary report of the news articles above. Ignore everything that's not related to the topic '{topic}'.
|
|
|
Separate the report into categories; for example, for AI the categories could be:
|
- Breaking news: anything that can also appear below but is the most important news of the day |
|
- Model news (e.g. new model releases, or insights about existing models etc.) |
|
- Startups (e.g. new startups, fundraising etc.) |
|
- Big Tech news (e.g. news from Google/Meta/OpenAI etc.) |
|
- Policy (e.g. US administration or EU policy) |
|
- Products (e.g. news of products that are powered by AI in some way) |
|
- Miscellaneous (whatever doesn't fit into the others but still relevant to the topic) |
|
|
|
For other topics come up with other categories that make sense. |
|
|
|
Style: The reader is an expert in the field and wants only the most relevant and novel information. \ |
|
Omit articles that are irrelevant to the topic and feel free to aggregate several articles about the same subject into one point. \
|
|
|
Format: Use markdown formatting and add links at the end of each section linking to the original articles. |
|
|
|
Example snippet: |
|
|
|
# NEWS_SUMMARY |
|
|
|
--- |
|
|
|
## **Breaking News** |
|
- **Google and Apple in talks to integrate Gemini AI into Apple Intelligence by mid-2025** _[Apple Insider](https://appleinsider.com/articles/25/04/30/google-wants-gemini-ai-deal-with-apple-by-mid-2025), [The Verge](https://www.theverge.com/news/658770/google-gemini-apple-iphone-deal-ai)_ |
|
- Google’s Gemini AI could enhance Siri with advanced reasoning and contextual capabilities, though Apple’s strict privacy controls may limit deep system access. |
|
- A potential deal could accelerate Apple’s AI development and expand Google’s AI reach. |
|
- **Apple Vision Pro launch delayed** _[Six Colors](https://sixcolors.com/post/2025/04/apple-in-the-enterprise-the-complete-2025-commentary/)_ |
|
- Apple’s mixed-reality headset, featuring advanced AI integration, is expected to arrive in 2025, though specifics remain unclear. |
|
|
|
--- |
|
|
|
... followed by the other sections. |
|
|
|
""" |
|
|
|
messages=[ |
|
{ |
|
"role": "user", |
|
"content": user_msg, |
|
} |
|
] |
|
|
|
response = client.chat_completion( |
|
model=model, |
|
messages=messages, |
|
temperature=0.8, |
|
max_tokens=32000, |
|
) |
|
|
|
return response.choices[0].message.content |
|
|
|
def postprocess_report(report, summaries, topic, num_articles, model):

    """Assemble the final markdown report from the generated report body, the per-article summaries, and run metadata"""

    report_summary = f"""\
|
# News Summary: {topic} |
|
|
|
**Period:** {(date.today() - timedelta(days=1)).strftime('%Y-%m-%d')} to {date.today().strftime('%Y-%m-%d')} | \ |
|
**Processed articles:** {num_articles} | **Model:** {model}
|
""" |
|
|
|
report_content = report.split("</think>")[1].strip() if "</think>" in report else report.strip() |
|
report_thoughts = report.split("</think>")[0].strip() if "</think>" in report else "No thoughts." |
|
    report_thoughts = report_thoughts.replace("<think>", "")
|
|
|
final_report = f"""\ |
|
{report_content.replace('# NEWS_SUMMARY', report_summary)} |
|
|
|
## Details |
|
|
|
### Sources
|
|
|
{summaries} |
|
|
|
### Model reasoning
|
|
|
{report_thoughts} |
|
|
|
""" |
|
|
|
return final_report |
|
|
|
|
|
if __name__ == "__main__": |
|
main() |