# ai-news / run.py
# (Hugging Face Space file; web-UI residue removed so the module parses.)
#!/usr/bin/env python3
"""
AI News Summarizer
A script to fetch, summarize, and create reports on recent AI news articles based on a specified topic.
"""
import argparse
from huggingface_hub import HfApi, InferenceClient
from newspaper import Article
import pandas as pd
import requests
from datetime import date, timedelta
import json
import os
from tqdm.auto import tqdm
def parse_arguments():
"""Parse command line arguments"""
parser = argparse.ArgumentParser(description='AI News Summarizer')
parser.add_argument('--topic', type=str, default="Language Models",
help='Topic to search for news articles (default: "Language Models")')
parser.add_argument('--num-articles', type=int, default=50,
help='Number of articles to fetch (default: 50)')
parser.add_argument('--provider', type=str, default="fireworks-ai",
help='Inference provider for HuggingFace (default: "fireworks-ai")')
parser.add_argument('--repo-id', type=str, default="lvwerra/news-reports",
help='HuggingFace repo ID to upload the report (default: "lvwerra/news-reports")')
args = parser.parse_args()
return args
def main():
    """Run the full pipeline: fetch -> parse -> summarize -> report -> upload."""
    args = parse_arguments()

    # Credentials come from the environment so they never land in the repo.
    # (NEWS_API_KEY is read inside fetch_news_articles, not here.)
    HF_API_KEY = os.getenv("HF_API_KEY")
    MODEL = "Qwen/Qwen3-30B-A3B"

    client = InferenceClient(provider=args.provider, api_key=HF_API_KEY)

    topic = args.topic
    num = args.num_articles

    # Configure tqdm so .progress_apply is available on pandas objects.
    tqdm.pandas(desc="")

    print(f"Fetching top {num} articles on '{topic}' of today...")
    articles = fetch_news_articles(topic, num)
    if not articles:
        # API error or no coverage: bail out early instead of raising a
        # KeyError on the empty DataFrame below.
        print("No articles found, aborting.")
        return

    df = pd.DataFrame.from_records(articles)

    print(f"Downloading and parsing {len(df)} articles...")
    df["content_full"] = df["url"].progress_apply(fetch_full_article)
    # Fall back to the short API-provided content where the full download
    # failed. NOTE: the sentinel (typo included) must match the literal
    # returned by fetch_full_article; regex=False makes the match literal.
    mask = df['content_full'].str.contains("Failed to fetch artcile.", regex=False)
    df.loc[mask, 'content_full'] = df.loc[mask, 'content']

    print(f"Summarizing each article (total={len(df)})...")
    df["summary_raw"] = df["content_full"].progress_apply(lambda x: summarize(x, client, MODEL))
    # Strip any <think>...</think> reasoning prefix emitted by the model.
    df["summary_clean"] = df["summary_raw"].apply(
        lambda x: x.split("</think>")[1].strip() if "</think>" in x else x.strip()
    )

    print("Create report...")
    df["article_summary"] = df.apply(format_summary, axis=1)
    sep = "\n\n" + "-" * 80 + "\n\n"
    overview = sep.join(f"Article: {i+1}\n{article}" for i, article in enumerate(df["article_summary"]))
    report = create_report(overview, client, MODEL, topic)

    # Split reasoning from content and assemble the final markdown document.
    final_report = postprocess_report(report, overview, topic, num, MODEL)

    file_path = f"reports/{'-'.join(topic.lower().split())}/{date.today().strftime('%Y-%m-%d')}.md"
    print(f"Uploading to {args.repo_id} under {file_path}...")
    hf_api = HfApi()
    hf_api.upload_file(
        path_or_fileobj=final_report.encode("utf-8"),
        path_in_repo=file_path,
        repo_id=args.repo_id,
        repo_type="space",
        token=HF_API_KEY,
    )
    print("Job finished!")
def fetch_news_articles(topic, num_articles=10):
    """Fetch up to ``num_articles`` English articles about ``topic``.

    Queries NewsAPI's /v2/everything endpoint for roughly the last 24
    hours, sorted by popularity.

    Args:
        topic: Free-text query string.
        num_articles: Page size requested from the API (default 10).

    Returns:
        The raw list of article dicts from the API, or an empty list on
        any HTTP or network error (errors are printed, never raised).
    """
    news_api_key = os.getenv("NEWS_API_KEY")
    endpoint = 'https://newsapi.org/v2/everything'
    today = date.today().strftime('%Y-%m-%d')
    yesterday = (date.today() - timedelta(days=1)).strftime('%Y-%m-%d')
    params = {
        'q': topic,
        'from': yesterday,
        'to': today,
        'sortBy': 'popularity',
        'language': 'en',
        'pageSize': num_articles,
        'apiKey': news_api_key
    }
    try:
        # Explicit timeout: requests has none by default and would hang forever.
        response = requests.get(endpoint, params=params, timeout=30)
    except requests.RequestException as err:
        # DNS/connection/timeout failures keep the best-effort contract.
        print(f"Error: {err}")
        return []
    if response.status_code == 200:
        return response.json()['articles']
    print(f"Error: {response.status_code}")
    print(response.text)
    return []
def fetch_full_article(url):
    """Download and parse ``url`` with newspaper, returning the article body text.

    Returns the fixed sentinel string on any failure; main() string-matches
    this exact literal to fall back to the short API content, so the string
    (typo included) must not change here without changing it there too.
    """
    try:
        a = Article(url)
        a.download()
        a.parse()
        return a.text
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # are no longer swallowed.
        return "Failed to fetch artcile."
def summarize(article, client, model):
    """Ask ``model`` (through ``client``) for a short expert-level bullet summary.

    Args:
        article: Full article text to condense.
        client: InferenceClient-like object exposing ``chat_completion``.
        model: Model identifier forwarded to the API.

    Returns:
        The raw assistant message content (may contain <think> tags).
    """
    prompt = f"""\
Summarize the following news article in a few bullet points. \
Note that the reader is an expert in the field and wants only the most relevant and novel information and likes to know the specific details. \
So keep the summary brief but don't omit technical terms or specific information.
Article:
{article}
/no_think"""
    completion = client.chat_completion(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.8,
        max_tokens=512,
    )
    return completion.choices[0].message.content
def format_summary(row):
    """Render one article row (dict-like) as a small markdown block.

    Reads ``title``, ``publishedAt``, ``description``, ``url`` and
    ``summary_clean`` from *row* and joins them into a fixed layout.
    """
    parts = [
        f"Title: **{row['title']}**",
        f"Published: {row['publishedAt']}",
        f"Description: {row['description']}",
        f"URL: {row['url']}",
        f"Summary:\n{row['summary_clean']}",
    ]
    return "\n".join(parts)
def create_report(articles_overview, client, model, topic):
    """Turn the joined per-article summaries into one categorized markdown report.

    Args:
        articles_overview: All formatted article summaries, separator-joined.
        client: InferenceClient-like object exposing ``chat_completion``.
        model: Model identifier forwarded to the API.
        topic: Topic string the report should focus on.

    Returns:
        The raw assistant message content (may contain <think> tags).
    """
    # NOTE: the prompt below is reproduced verbatim (typos and all) since it
    # is runtime text sent to the model.
    prompt = f"""\
News articles:\
{articles_overview}
===========================
Create a summary report of the newspaper articles above. Ignore everything that's not releated to the topic '{topic}'
Separete the report into categories, for example for AI it could be catogories like:
- Breaking news: anything that can also appear below but is the most important news of the day
- Model news (e.g. new model releases, or insights about existing models etc.)
- Startups (e.g. new startups, fundraising etc.)
- Big Tech news (e.g. news from Google/Meta/OpenAI etc.)
- Policy (e.g. US administration or EU policy)
- Products (e.g. news of products that are powered by AI in some way)
- Miscellaneous (whatever doesn't fit into the others but still relevant to the topic)
For other topics come up with other categories that make sense.
Style: The reader is an expert in the field and wants only the most relevant and novel information. \
Omit articles that are irrelevant to the field of AI and feel free to aggregate several articles about the same topic into one point. \
Format: Use markdown formatting and add links at the end of each section linking to the original articles.
Example snippet:
# NEWS_SUMMARY
---
## **Breaking News**
- **Google and Apple in talks to integrate Gemini AI into Apple Intelligence by mid-2025** _[Apple Insider](https://appleinsider.com/articles/25/04/30/google-wants-gemini-ai-deal-with-apple-by-mid-2025), [The Verge](https://www.theverge.com/news/658770/google-gemini-apple-iphone-deal-ai)_
- Google’s Gemini AI could enhance Siri with advanced reasoning and contextual capabilities, though Apple’s strict privacy controls may limit deep system access.
- A potential deal could accelerate Apple’s AI development and expand Google’s AI reach.
- **Apple Vision Pro launch delayed** _[Six Colors](https://sixcolors.com/post/2025/04/apple-in-the-enterprise-the-complete-2025-commentary/)_
- Apple’s mixed-reality headset, featuring advanced AI integration, is expected to arrive in 2025, though specifics remain unclear.
---
... followed by the other sections.
"""
    completion = client.chat_completion(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.8,
        max_tokens=32000,
    )
    return completion.choices[0].message.content
def postprocess_report(report, summaries, topic, num_articles, model):
    """Assemble the final markdown document from the model's raw report.

    Splits the model output into reasoning (<think>...</think>) and the
    report proper, substitutes the '# NEWS_SUMMARY' placeholder with a
    header block, and appends the per-article sources and the model's
    reasoning at the end.

    Args:
        report: Raw model output, possibly prefixed with <think>...</think>.
        summaries: Separator-joined per-article summaries (Sources section).
        topic: Topic string shown in the header.
        num_articles: Number of processed articles shown in the header.
        model: Model identifier shown in the header.

    Returns:
        The complete markdown report as a string.
    """
    report_summary = f"""\
# News Summary: {topic}
**Period:** {(date.today() - timedelta(days=1)).strftime('%Y-%m-%d')} to {date.today().strftime('%Y-%m-%d')} | \
**Processed articles:** {num_articles} | **Model**: {model}
"""
    if "</think>" in report:
        parts = report.split("</think>")
        # Bug fix: str.replace returns a new string; the original code
        # discarded the result, leaving the literal "<think>" in the output.
        report_thoughts = parts[0].replace("<think>", "").strip()
        report_content = parts[1].strip()
    else:
        report_thoughts = "No thoughts."
        report_content = report.strip()
    final_report = f"""\
{report_content.replace('# NEWS_SUMMARY', report_summary)}
## Details
## Sources
{summaries}
## Model reasoning
{report_thoughts}
"""
    return final_report
# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()