File size: 8,752 Bytes
b6cc122 e62a82f b6cc122 ccac437 b6cc122 c945162 b6cc122 fa9df97 b6cc122 a4321ae b6cc122 ccac437 b6cc122 e62a82f b6cc122 e62a82f b6cc122 e62a82f b6cc122 e62a82f b6cc122 ccac437 b6cc122 ccac437 b6cc122 af29208 b6cc122 ccac437 b6cc122 af29208 b6cc122 ccac437 23109fa b6cc122 ccac437 b6cc122 ccac437 6876d23 ccac437 23109fa ccac437 6876d23 ccac437 6876d23 ccac437 b6cc122 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 |
#!/usr/bin/env python3
"""
AI News Summarizer
A script to fetch, summarize, and create reports on recent AI news articles based on a specified topic.
"""
import argparse
from huggingface_hub import HfApi, InferenceClient
from newspaper import Article
import pandas as pd
import requests
from datetime import date, timedelta
import json
import os
from tqdm.auto import tqdm
def parse_arguments():
    """Build the command-line interface and return the parsed arguments."""
    cli = argparse.ArgumentParser(description='AI News Summarizer')
    cli.add_argument('--topic', type=str, default="Language Models",
                     help='Topic to search for news articles (default: "Language Models")')
    cli.add_argument('--num-articles', type=int, default=50,
                     help='Number of articles to fetch (default: 50)')
    cli.add_argument('--provider', type=str, default="fireworks-ai",
                     help='Inference provider for HuggingFace (default: "fireworks-ai")')
    cli.add_argument('--repo-id', type=str, default="lvwerra/news-reports",
                     help='HuggingFace repo ID to upload the report (default: "lvwerra/news-reports")')
    return cli.parse_args()
def main():
    """Run the full pipeline: fetch news, summarize each article, build the report, upload it.

    Reads HF_API_KEY from the environment (the NewsAPI key is read inside
    fetch_news_articles(), where it is actually used).
    """
    args = parse_arguments()

    HF_API_KEY = os.getenv("HF_API_KEY")
    MODEL = "Qwen/Qwen3-30B-A3B"

    client = InferenceClient(provider=args.provider, api_key=HF_API_KEY)
    topic = args.topic
    num = args.num_articles

    # Enable .progress_apply on pandas objects.
    tqdm.pandas(desc="")

    print(f"Fetching top {num} articles on '{topic}' of today...")
    articles = fetch_news_articles(topic, num)
    if not articles:
        # API error (already logged downstream) or genuinely no results:
        # a DataFrame built from [] would have no 'url' column and crash below.
        print("No articles to process, exiting.")
        return
    df = pd.DataFrame.from_records(articles)

    print(f"Downloading and parsing {len(df)} articles...")
    df["content_full"] = df["url"].progress_apply(fetch_full_article)
    # Where the full download failed, fall back to the NewsAPI snippet.
    # regex=False: match the sentinel literally (otherwise '.' is a wildcard);
    # na=False: missing content counts as "did not fail".
    # NOTE: the misspelling "artcile" is load-bearing -- it must match the
    # sentinel returned by fetch_full_article().
    mask = df['content_full'].str.contains("Failed to fetch artcile.", regex=False, na=False)
    df.loc[mask, 'content_full'] = df.loc[mask, 'content']

    print(f"Summarizing each article (total={len(df)})...")
    df["summary_raw"] = df["content_full"].progress_apply(lambda x: summarize(x, client, MODEL))
    # Keep only the text after the model's </think> reasoning block, if any.
    df["summary_clean"] = df["summary_raw"].apply(lambda x: x.split("</think>")[1].strip() if "</think>" in x else x.strip())

    print("Create report...")
    df["article_summary"] = df.apply(format_summary, axis=1)
    sep = "\n\n" + "-"*80 + "\n\n"
    overview = sep.join([f"Article: {i+1}\n{article}" for i, article in enumerate(df["article_summary"])])
    report = create_report(overview, client, MODEL, topic)

    # Extract report content and wrap it with metadata/sources/reasoning.
    final_report = postprocess_report(report, overview, topic, num, MODEL)

    # One report per topic per day, e.g. reports/language-models/2025-01-01.md
    file_path = f"reports/{'-'.join(topic.lower().split())}/{date.today().strftime('%Y-%m-%d')}.md"
    print(f"Uploading to {args.repo_id} under {file_path}...")
    hf_api = HfApi()
    hf_api.upload_file(
        path_or_fileobj=final_report.encode("utf-8"),
        path_in_repo=file_path,
        repo_id=args.repo_id,
        repo_type="space",
        token=HF_API_KEY,
    )
    print("Job finished!")
def fetch_news_articles(topic, num_articles=10):
    """Fetch recent news articles on *topic* from the NewsAPI /v2/everything endpoint.

    Queries the window from yesterday to today, sorted by popularity,
    English only. Returns the list of article dicts on success, or an
    empty list (after printing the error) on a non-200 response.

    Requires NEWS_API_KEY in the environment.
    """
    NEWS_API_KEY = os.getenv("NEWS_API_KEY")
    NEWS_ENDPOINT = 'https://newsapi.org/v2/everything'
    today = date.today().strftime('%Y-%m-%d')
    yesterday = (date.today() - timedelta(days=1)).strftime('%Y-%m-%d')
    params = {
        'q': topic,
        'from': yesterday,
        'to': today,
        'sortBy': 'popularity',
        'language': 'en',
        'pageSize': num_articles,
        'apiKey': NEWS_API_KEY
    }
    # timeout prevents the whole job from hanging forever on a stalled connection
    response = requests.get(NEWS_ENDPOINT, params=params, timeout=30)
    if response.status_code == 200:
        data = response.json()
        return data['articles']
    else:
        print(f"Error: {response.status_code}")
        print(response.text)
        return []
def fetch_full_article(url):
    """Download and parse an article's full text with newspaper.

    Returns the article body on success, or the sentinel string
    "Failed to fetch artcile." on any error so the caller can fall back
    to the API-provided snippet. NOTE: the misspelling is load-bearing --
    main() matches this exact string.
    """
    try:
        a = Article(url)
        a.download()
        a.parse()
        return a.text
    except Exception:
        # Narrowed from a bare except, which would also swallow
        # KeyboardInterrupt and SystemExit.
        return "Failed to fetch artcile."
def summarize(article, client, model):
    """Return an expert-oriented bullet-point summary of *article* via chat completion."""
    prompt = f"""\
Summarize the following news article in a few bullet points. \
Note that the reader is an expert in the field and wants only the most relevant and novel information and likes to know the specific details. \
So keep the summary brief but don't omit technical terms or specific information.
Article:
{article}
/no_think"""
    reply = client.chat_completion(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.8,
        max_tokens=512,
    )
    return reply.choices[0].message.content
def format_summary(row):
    """Render one article row (title, date, description, URL, summary) as a text block."""
    parts = [
        f"Title: **{row['title']}**",
        f"Published: {row['publishedAt']}",
        f"Description: {row['description']}",
        f"URL: {row['url']}",
        f"Summary:\n{row['summary_clean']}",
    ]
    return "\n".join(parts)
def create_report(articles_overview, client, model, topic):
    """Ask the model to merge all per-article summaries into one categorized markdown report."""
    prompt = f"""\
News articles:\
{articles_overview}
===========================
Create a summary report of the newspaper articles above. Ignore everything that's not releated to the topic '{topic}'
Separete the report into categories, for example for AI it could be catogories like:
- Breaking news: anything that can also appear below but is the most important news of the day
- Model news (e.g. new model releases, or insights about existing models etc.)
- Startups (e.g. new startups, fundraising etc.)
- Big Tech news (e.g. news from Google/Meta/OpenAI etc.)
- Policy (e.g. US administration or EU policy)
- Products (e.g. news of products that are powered by AI in some way)
- Miscellaneous (whatever doesn't fit into the others but still relevant to the topic)
For other topics come up with other categories that make sense.
Style: The reader is an expert in the field and wants only the most relevant and novel information. \
Omit articles that are irrelevant to the field of AI and feel free to aggregate several articles about the same topic into one point. \
Format: Use markdown formatting and add links at the end of each section linking to the original articles.
Example snippet:
# NEWS_SUMMARY
---
## **Breaking News**
- **Google and Apple in talks to integrate Gemini AI into Apple Intelligence by mid-2025** _[Apple Insider](https://appleinsider.com/articles/25/04/30/google-wants-gemini-ai-deal-with-apple-by-mid-2025), [The Verge](https://www.theverge.com/news/658770/google-gemini-apple-iphone-deal-ai)_
- Google’s Gemini AI could enhance Siri with advanced reasoning and contextual capabilities, though Apple’s strict privacy controls may limit deep system access.
- A potential deal could accelerate Apple’s AI development and expand Google’s AI reach.
- **Apple Vision Pro launch delayed** _[Six Colors](https://sixcolors.com/post/2025/04/apple-in-the-enterprise-the-complete-2025-commentary/)_
- Apple’s mixed-reality headset, featuring advanced AI integration, is expected to arrive in 2025, though specifics remain unclear.
---
... followed by the other sections.
"""
    reply = client.chat_completion(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.8,
        max_tokens=32000,
    )
    return reply.choices[0].message.content
def postprocess_report(report, summaries, topic, num_articles, model):
    """Assemble the final markdown report string.

    Splits the raw model output into reasoning (before ``</think>``) and
    content (after it), replaces the ``# NEWS_SUMMARY`` placeholder with a
    metadata header (topic, date range, article count, model), and appends
    the per-article summaries and the model's reasoning as extra sections.
    """
    report_summary = f"""\
# News Summary: {topic}
**Period:** {(date.today() - timedelta(days=1)).strftime('%Y-%m-%d')} to {date.today().strftime('%Y-%m-%d')} | \
**Processed articles:** {num_articles} | **Model**: {model}
"""
    report_content = report.split("</think>")[1].strip() if "</think>" in report else report.strip()
    report_thoughts = report.split("</think>")[0].strip() if "</think>" in report else "No thoughts."
    # Bug fix: str.replace returns a new string; the original discarded the
    # result, leaving the literal "<think>" tag in the published report.
    report_thoughts = report_thoughts.replace("<think>", "")
    final_report = f"""\
{report_content.replace('# NEWS_SUMMARY', report_summary)}
## Details
## Sources
{summaries}
## Model reasoning
{report_thoughts}
"""
    return final_report
if __name__ == "__main__":
main() |